diff --git a/config b/config index 746172b..32d8691 100644 --- a/config +++ b/config @@ -2,7 +2,7 @@ # Automatically generated file; DO NOT EDIT. # Linux/x86 6.2.1 Kernel Configuration # -CONFIG_CC_VERSION_TEXT="gcc (GCC) 12.2.1 20230216" +CONFIG_CC_VERSION_TEXT="gcc (GCC) 12.2.1 20230311" CONFIG_CC_IS_GCC=y CONFIG_GCC_VERSION=120201 CONFIG_CLANG_VERSION=0 @@ -534,7 +534,7 @@ CONFIG_X86_PAT=y CONFIG_ARCH_USES_PG_UNCACHED=y CONFIG_X86_UMIP=y CONFIG_CC_HAS_IBT=y -# CONFIG_X86_KERNEL_IBT is not set +CONFIG_X86_KERNEL_IBT=y CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y # CONFIG_X86_INTEL_TSX_MODE_OFF is not set # CONFIG_X86_INTEL_TSX_MODE_ON is not set @@ -6938,7 +6938,7 @@ CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER=y # CONFIG_LOGO is not set # end of Graphics support -# CONFIG_DRM_ACCEL is not set +CONFIG_DRM_ACCEL=y CONFIG_SOUND=m CONFIG_SOUND_OSS_CORE=y # CONFIG_SOUND_OSS_CORE_PRECLAIM is not set diff --git a/patches/0001-cachy-all.patch b/patches/0001-cachy-all.patch index c1c40d6..ffe879d 100644 --- a/patches/0001-cachy-all.patch +++ b/patches/0001-cachy-all.patch @@ -1,7 +1,7 @@ -From d7322fe0d4d120555d7dd3c2a6167f7f726b8738 Mon Sep 17 00:00:00 2001 +From c2fc7486fbb316ab576a741ae264255a4cc4de44 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 10 Mar 2023 17:59:47 +0100 -Subject: [PATCH 01/16] bbr2 +Date: Mon, 6 Mar 2023 18:43:03 +0100 +Subject: [PATCH 01/10] bbr2 Signed-off-by: Peter Jung --- @@ -256,10 +256,10 @@ index 2dfb12230f08..b6bec331a82e 100644 config TCP_MD5SIG diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile -index af7d2cf490fb..e7a86a50838a 100644 +index 880277c9fd07..ef1da49d20a6 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile -@@ -46,6 +46,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o +@@ -47,6 +47,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o @@ -268,10 +268,10 @@ index af7d2cf490fb..e7a86a50838a 100644 obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index 33f559f491c8..e9e8040d6491 100644 +index 288693981b00..1d530667b172 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c -@@ -3191,6 +3191,7 @@ int tcp_disconnect(struct sock *sk, int flags) +@@ -3192,6 +3192,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; @@ -280,7 +280,7 @@ index 33f559f491c8..e9e8040d6491 100644 /* Clean up fastopen related fields */ diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c -index d2c470524e58..af08fb3cb139 100644 +index 146792cd26fe..16038f6ee52a 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -294,26 +294,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) @@ -288,7 +288,7 @@ index d2c470524e58..af08fb3cb139 100644 } -/* override sysctl_tcp_min_tso_segs */ - static u32 bbr_min_tso_segs(struct sock *sk) + __bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) { return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 
1 : 2; } @@ -3025,7 +3025,7 @@ index 000000000000..85f8052144d1 +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c -index d3cae40749e8..0f268f2ff2e9 100644 +index db8b4b488c31..0d6d1a949e11 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -189,6 +189,7 @@ void tcp_init_congestion_control(struct sock *sk) @@ -3123,7 +3123,7 @@ index cc072d2cfcd8..754e0212c951 100644 tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 71d01cf3c13e..0da3da9e56db 100644 +index ba839e441450..5ffec885e66f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -375,7 +375,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, @@ -3281,1451 +3281,31 @@ index cb79127f45c3..70e4de876a7f 100644 event = icsk->icsk_pending; -- -2.40.0.rc2 +2.40.0 -From 87439b08ac56036539528efb6da691914f41ca76 Mon Sep 17 00:00:00 2001 +From 8f60626e149f4437570d56968a2cab10a822fcd4 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 10 Mar 2023 18:00:04 +0100 -Subject: [PATCH 02/16] bfq +Date: Sun, 9 Apr 2023 21:21:39 +0200 +Subject: [PATCH 02/10] bfq Signed-off-by: Peter Jung --- - block/bfq-cgroup.c | 101 ++++--- - block/bfq-iosched.c | 637 ++++++++++++++++++++++++++++-------------- - block/bfq-iosched.h | 144 ++++++++-- - block/bfq-wf2q.c | 2 +- - block/blk-cgroup.c | 122 ++++---- - block/blk-cgroup.h | 10 +- - block/blk-iocost.c | 58 ++-- - block/blk-iolatency.c | 39 ++- - block/blk-rq-qos.h | 2 +- - block/blk-throttle.c | 16 +- - block/blk.h | 6 - - 11 files changed, 747 insertions(+), 390 deletions(-) + block/bfq-iosched.c | 6 ++++++ + 1 file changed, 6 insertions(+) -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c -index 0fbde0fc0628..59929dfd559b 100644 ---- a/block/bfq-cgroup.c -+++ b/block/bfq-cgroup.c -@@ -706,12 +706,52 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_activate_bfqq(bfqd, bfqq); - } - -- if (!bfqd->in_service_queue && !bfqd->rq_in_driver) -+ if (!bfqd->in_service_queue && !bfqd->tot_rq_in_driver) - bfq_schedule_dispatch(bfqd); - /* release extra ref taken above, bfqq may happen to be freed now */ - bfq_put_queue(bfqq); - } - -+static void bfq_sync_bfqq_move(struct bfq_data *bfqd, -+ struct bfq_queue *sync_bfqq, -+ struct bfq_io_cq *bic, -+ struct bfq_group *bfqg, -+ unsigned int act_idx) -+{ -+ struct bfq_queue *bfqq; -+ -+ if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { -+ /* We are the only user of this bfqq, just move it */ -+ if (sync_bfqq->entity.sched_data != &bfqg->sched_data) -+ bfq_bfqq_move(bfqd, sync_bfqq, bfqg); -+ return; -+ } -+ -+ /* -+ * The queue was merged to a different queue. Check -+ * that the merge chain still belongs to the same -+ * cgroup. -+ */ -+ for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) -+ if (bfqq->entity.sched_data != &bfqg->sched_data) -+ break; -+ if (bfqq) { -+ /* -+ * Some queue changed cgroup so the merge is not valid -+ * anymore. We cannot easily just cancel the merge (by -+ * clearing new_bfqq) as there may be other processes -+ * using this queue and holding refs to all queues -+ * below sync_bfqq->new_bfqq. Similarly if the merge -+ * already happened, we need to detach from bfqq now -+ * so that we cannot merge bio to a request from the -+ * old cgroup. 
-+ */ -+ bfq_put_cooperator(sync_bfqq); -+ bic_set_bfqq(bic, NULL, true, act_idx); -+ bfq_release_process_ref(bfqd, sync_bfqq); -+ } -+} -+ - /** - * __bfq_bic_change_cgroup - move @bic to @bfqg. - * @bfqd: the queue descriptor. -@@ -726,53 +766,20 @@ static void __bfq_bic_change_cgroup(struct bfq_data *bfqd, - struct bfq_io_cq *bic, - struct bfq_group *bfqg) - { -- struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false); -- struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true); -- struct bfq_entity *entity; -+ unsigned int act_idx; - -- if (async_bfqq) { -- entity = &async_bfqq->entity; -+ for (act_idx = 0; act_idx < bfqd->num_actuators; act_idx++) { -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false, act_idx); -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true, act_idx); - -- if (entity->sched_data != &bfqg->sched_data) { -- bic_set_bfqq(bic, NULL, false); -+ if (async_bfqq && -+ async_bfqq->entity.sched_data != &bfqg->sched_data) { -+ bic_set_bfqq(bic, NULL, false, act_idx); - bfq_release_process_ref(bfqd, async_bfqq); - } -- } - -- if (sync_bfqq) { -- if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { -- /* We are the only user of this bfqq, just move it */ -- if (sync_bfqq->entity.sched_data != &bfqg->sched_data) -- bfq_bfqq_move(bfqd, sync_bfqq, bfqg); -- } else { -- struct bfq_queue *bfqq; -- -- /* -- * The queue was merged to a different queue. Check -- * that the merge chain still belongs to the same -- * cgroup. -- */ -- for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) -- if (bfqq->entity.sched_data != -- &bfqg->sched_data) -- break; -- if (bfqq) { -- /* -- * Some queue changed cgroup so the merge is -- * not valid anymore. We cannot easily just -- * cancel the merge (by clearing new_bfqq) as -- * there may be other processes using this -- * queue and holding refs to all queues below -- * sync_bfqq->new_bfqq. Similarly if the merge -- * already happened, we need to detach from -- * bfqq now so that we cannot merge bio to a -- * request from the old cgroup. 
-- */ -- bfq_put_cooperator(sync_bfqq); -- bic_set_bfqq(bic, NULL, true); -- bfq_release_process_ref(bfqd, sync_bfqq); -- } -- } -+ if (sync_bfqq) -+ bfq_sync_bfqq_move(bfqd, sync_bfqq, bic, bfqg, act_idx); - } - } - -@@ -1106,9 +1113,11 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, - struct bfq_group *bfqg; - u64 v; - -- ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, buf, &ctx); -+ blkg_conf_init(&ctx, buf); -+ -+ ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, &ctx); - if (ret) -- return ret; -+ goto out; - - if (sscanf(ctx.body, "%llu", &v) == 1) { - /* require "default" on dfl */ -@@ -1130,7 +1139,7 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, - ret = 0; - } - out: -- blkg_conf_finish(&ctx); -+ blkg_conf_exit(&ctx); - return ret ?: nbytes; - } - diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -index 380e9bda2e57..aa644973d260 100644 +index d9ed3108c17a..f32b177a36e5 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c -@@ -377,20 +377,23 @@ static const unsigned long bfq_late_stable_merging = 600; - #define RQ_BIC(rq) ((struct bfq_io_cq *)((rq)->elv.priv[0])) - #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) - --struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) -+struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, -+ unsigned int actuator_idx) - { -- return bic->bfqq[is_sync]; -+ if (is_sync) -+ return bic->bfqq[1][actuator_idx]; -+ -+ return bic->bfqq[0][actuator_idx]; - } - - static void bfq_put_stable_ref(struct bfq_queue *bfqq); - --void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) -+void bic_set_bfqq(struct bfq_io_cq *bic, -+ struct bfq_queue *bfqq, -+ bool is_sync, -+ unsigned int actuator_idx) - { -- struct bfq_queue *old_bfqq = bic->bfqq[is_sync]; -- -- /* Clear bic pointer if bfqq is detached from this bic */ -- if (old_bfqq && old_bfqq->bic == bic) -- old_bfqq->bic = NULL; -+ struct bfq_queue *old_bfqq = bic->bfqq[is_sync][actuator_idx]; - - /* - * If bfqq != NULL, then a non-stable queue merge between -@@ -405,9 +408,18 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) - * we cancel the stable merge if - * bic->stable_merge_bfqq == bfqq. - */ -- bic->bfqq[is_sync] = bfqq; -+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[actuator_idx]; -+ -+ /* Clear bic pointer if bfqq is detached from this bic */ -+ if (old_bfqq && old_bfqq->bic == bic) -+ old_bfqq->bic = NULL; - -- if (bfqq && bic->stable_merge_bfqq == bfqq) { -+ if (is_sync) -+ bic->bfqq[1][actuator_idx] = bfqq; -+ else -+ bic->bfqq[0][actuator_idx] = bfqq; -+ -+ if (bfqq && bfqq_data->stable_merge_bfqq == bfqq) { - /* - * Actually, these same instructions are executed also - * in bfq_setup_cooperator, in case of abort or actual -@@ -416,9 +428,9 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) - * did so, we would nest even more complexity in this - * function. - */ -- bfq_put_stable_ref(bic->stable_merge_bfqq); -+ bfq_put_stable_ref(bfqq_data->stable_merge_bfqq); - -- bic->stable_merge_bfqq = NULL; -+ bfqq_data->stable_merge_bfqq = NULL; - } - } - -@@ -678,9 +690,9 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) - { - struct bfq_data *bfqd = data->q->elevator->elevator_data; - struct bfq_io_cq *bic = bfq_bic_lookup(data->q); -- struct bfq_queue *bfqq = bic ? 
bic_to_bfqq(bic, op_is_sync(opf)) : NULL; - int depth; - unsigned limit = data->q->nr_requests; -+ unsigned int act_idx; - - /* Sync reads have full depth available */ - if (op_is_sync(opf) && !op_is_write(opf)) { -@@ -690,14 +702,21 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) - limit = (limit * depth) >> bfqd->full_depth_shift; - } - -- /* -- * Does queue (or any parent entity) exceed number of requests that -- * should be available to it? Heavily limit depth so that it cannot -- * consume more available requests and thus starve other entities. -- */ -- if (bfqq && bfqq_request_over_limit(bfqq, limit)) -- depth = 1; -+ for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { -+ struct bfq_queue *bfqq = -+ bic_to_bfqq(bic, op_is_sync(opf), act_idx); - -+ /* -+ * Does queue (or any parent entity) exceed number of -+ * requests that should be available to it? Heavily -+ * limit depth so that it cannot consume more -+ * available requests and thus starve other entities. -+ */ -+ if (bfqq && bfqq_request_over_limit(bfqq, limit)) { -+ depth = 1; -+ break; -+ } -+ } - bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", - __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth); - if (depth) -@@ -1074,9 +1093,6 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) - { - u64 dur; - -- if (bfqd->bfq_wr_max_time > 0) -- return bfqd->bfq_wr_max_time; -- - dur = bfqd->rate_dur_prod; - do_div(dur, bfqd->peak_rate); - -@@ -1118,36 +1134,39 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - { - unsigned int old_wr_coeff = 1; - bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); -+ unsigned int a_idx = bfqq->actuator_idx; -+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; - -- if (bic->saved_has_short_ttime) -+ if (bfqq_data->saved_has_short_ttime) - bfq_mark_bfqq_has_short_ttime(bfqq); - else - bfq_clear_bfqq_has_short_ttime(bfqq); - -- if (bic->saved_IO_bound) -+ if (bfqq_data->saved_IO_bound) - bfq_mark_bfqq_IO_bound(bfqq); - else - bfq_clear_bfqq_IO_bound(bfqq); - -- bfqq->last_serv_time_ns = bic->saved_last_serv_time_ns; -- bfqq->inject_limit = bic->saved_inject_limit; -- bfqq->decrease_time_jif = bic->saved_decrease_time_jif; -+ bfqq->last_serv_time_ns = bfqq_data->saved_last_serv_time_ns; -+ bfqq->inject_limit = bfqq_data->saved_inject_limit; -+ bfqq->decrease_time_jif = bfqq_data->saved_decrease_time_jif; - -- bfqq->entity.new_weight = bic->saved_weight; -- bfqq->ttime = bic->saved_ttime; -- bfqq->io_start_time = bic->saved_io_start_time; -- bfqq->tot_idle_time = bic->saved_tot_idle_time; -+ bfqq->entity.new_weight = bfqq_data->saved_weight; -+ bfqq->ttime = bfqq_data->saved_ttime; -+ bfqq->io_start_time = bfqq_data->saved_io_start_time; -+ bfqq->tot_idle_time = bfqq_data->saved_tot_idle_time; - /* - * Restore weight coefficient only if low_latency is on - */ - if (bfqd->low_latency) { - old_wr_coeff = bfqq->wr_coeff; -- bfqq->wr_coeff = bic->saved_wr_coeff; -+ bfqq->wr_coeff = bfqq_data->saved_wr_coeff; - } -- bfqq->service_from_wr = bic->saved_service_from_wr; -- bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; -- bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; -- bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; -+ bfqq->service_from_wr = bfqq_data->saved_service_from_wr; -+ bfqq->wr_start_at_switch_to_srt = -+ bfqq_data->saved_wr_start_at_switch_to_srt; -+ bfqq->last_wr_start_finish = bfqq_data->saved_last_wr_start_finish; -+ bfqq->wr_cur_max_time = 
bfqq_data->saved_wr_cur_max_time; - - if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || - time_is_before_jiffies(bfqq->last_wr_start_finish + -@@ -1766,6 +1785,33 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, - return bfqq_weight > in_serv_weight; - } - -+/* -+ * Get the index of the actuator that will serve bio. -+ */ -+static unsigned int bfq_actuator_index(struct bfq_data *bfqd, struct bio *bio) -+{ -+ unsigned int i; -+ sector_t end; -+ -+ /* no search needed if one or zero ranges present */ -+ if (bfqd->num_actuators == 1) -+ return 0; -+ -+ /* bio_end_sector(bio) gives the sector after the last one */ -+ end = bio_end_sector(bio) - 1; -+ -+ for (i = 0; i < bfqd->num_actuators; i++) { -+ if (end >= bfqd->sector[i] && -+ end < bfqd->sector[i] + bfqd->nr_sectors[i]) -+ return i; -+ } -+ -+ WARN_ONCE(true, -+ "bfq_actuator_index: bio sector out of ranges: end=%llu\n", -+ end); -+ return 0; -+} -+ - static bool bfq_better_to_idle(struct bfq_queue *bfqq); - - static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, -@@ -1785,7 +1831,9 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, - arrived_in_time = ktime_get_ns() <= - bfqq->ttime.last_end_request + - bfqd->bfq_slice_idle * 3; -- -+ unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio); -+ bool bfqq_non_merged_or_stably_merged = -+ bfqq->bic || RQ_BIC(rq)->bfqq_data[act_idx].stably_merged; - - /* - * bfqq deserves to be weight-raised if: -@@ -1819,9 +1867,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, - */ - wr_or_deserves_wr = bfqd->low_latency && - (bfqq->wr_coeff > 1 || -- (bfq_bfqq_sync(bfqq) && -- (bfqq->bic || RQ_BIC(rq)->stably_merged) && -- (*interactive || soft_rt))); -+ (bfq_bfqq_sync(bfqq) && bfqq_non_merged_or_stably_merged && -+ (*interactive || soft_rt))); - - /* - * Using the last flag, update budget and check whether bfqq -@@ -2098,7 +2145,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * We reset waker detection logic also if too much time has passed - * since the first detection. If wakeups are rare, pointless idling - * doesn't hurt throughput that much. The condition below makes sure -- * we do not uselessly idle blocking waker in more than 1/64 cases. -+ * we do not uselessly idle blocking waker in more than 1/64 cases. - */ - if (bfqd->last_completed_rq_bfqq != - bfqq->tentative_waker_bfqq || -@@ -2209,9 +2256,9 @@ static void bfq_add_request(struct request *rq) - * elapsed. - */ - if (bfqq == bfqd->in_service_queue && -- (bfqd->rq_in_driver == 0 || -+ (bfqd->tot_rq_in_driver == 0 || - (bfqq->last_serv_time_ns > 0 && -- bfqd->rqs_injected && bfqd->rq_in_driver > 0)) && -+ bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) && - time_is_before_eq_jiffies(bfqq->decrease_time_jif + - msecs_to_jiffies(10))) { - bfqd->last_empty_occupied_ns = ktime_get_ns(); -@@ -2235,7 +2282,7 @@ static void bfq_add_request(struct request *rq) - * will be set in case injection is performed - * on bfqq before rq is completed). 
- */ -- if (bfqd->rq_in_driver == 0) -+ if (bfqd->tot_rq_in_driver == 0) - bfqd->rqs_injected = false; - } - } -@@ -2418,7 +2465,8 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, - */ - bfq_bic_update_cgroup(bic, bio); - -- bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); -+ bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf), -+ bfq_actuator_index(bfqd, bio)); - } else { - bfqd->bio_bfqq = NULL; - } -@@ -2584,24 +2632,29 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) - void bfq_end_wr_async_queues(struct bfq_data *bfqd, - struct bfq_group *bfqg) - { -- int i, j; -+ int i, j, k; - -- for (i = 0; i < 2; i++) -- for (j = 0; j < IOPRIO_NR_LEVELS; j++) -- if (bfqg->async_bfqq[i][j]) -- bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); -- if (bfqg->async_idle_bfqq) -- bfq_bfqq_end_wr(bfqg->async_idle_bfqq); -+ for (k = 0; k < bfqd->num_actuators; k++) { -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_NR_LEVELS; j++) -+ if (bfqg->async_bfqq[i][j][k]) -+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j][k]); -+ if (bfqg->async_idle_bfqq[k]) -+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq[k]); -+ } - } - - static void bfq_end_wr(struct bfq_data *bfqd) - { - struct bfq_queue *bfqq; -+ int i; - - spin_lock_irq(&bfqd->lock); - -- list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -- bfq_bfqq_end_wr(bfqq); -+ for (i = 0; i < bfqd->num_actuators; i++) { -+ list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ } - list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) - bfq_bfqq_end_wr(bfqq); - bfq_end_wr_async(bfqd); -@@ -2794,6 +2847,40 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, - struct bfq_queue *bfqq); - -+static struct bfq_queue * -+bfq_setup_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_queue *stable_merge_bfqq, -+ struct bfq_iocq_bfqq_data *bfqq_data) -+{ -+ int proc_ref = min(bfqq_process_refs(bfqq), -+ bfqq_process_refs(stable_merge_bfqq)); -+ struct bfq_queue *new_bfqq = NULL; -+ -+ bfqq_data->stable_merge_bfqq = NULL; -+ if (idling_boosts_thr_without_issues(bfqd, bfqq) || proc_ref == 0) -+ goto out; -+ -+ /* next function will take at least one ref */ -+ new_bfqq = bfq_setup_merge(bfqq, stable_merge_bfqq); -+ -+ if (new_bfqq) { -+ bfqq_data->stably_merged = true; -+ if (new_bfqq->bic) { -+ unsigned int new_a_idx = new_bfqq->actuator_idx; -+ struct bfq_iocq_bfqq_data *new_bfqq_data = -+ &new_bfqq->bic->bfqq_data[new_a_idx]; -+ -+ new_bfqq_data->stably_merged = true; -+ } -+ } -+ -+out: -+ /* deschedule stable merge, because done or aborted here */ -+ bfq_put_stable_ref(stable_merge_bfqq); -+ -+ return new_bfqq; -+} -+ - /* - * Attempt to schedule a merge of bfqq with the currently in-service - * queue or with a close queue among the scheduled queues. 
Return -@@ -2819,6 +2906,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - void *io_struct, bool request, struct bfq_io_cq *bic) - { - struct bfq_queue *in_service_bfqq, *new_bfqq; -+ unsigned int a_idx = bfqq->actuator_idx; -+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; - - /* if a merge has already been setup, then proceed with that first */ - if (bfqq->new_bfqq) -@@ -2840,37 +2929,18 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * stable merging) also if bic is associated with a - * sync queue, but this bfqq is async - */ -- if (bfq_bfqq_sync(bfqq) && bic->stable_merge_bfqq && -+ if (bfq_bfqq_sync(bfqq) && bfqq_data->stable_merge_bfqq && - !bfq_bfqq_just_created(bfqq) && - time_is_before_jiffies(bfqq->split_time + - msecs_to_jiffies(bfq_late_stable_merging)) && - time_is_before_jiffies(bfqq->creation_time + - msecs_to_jiffies(bfq_late_stable_merging))) { - struct bfq_queue *stable_merge_bfqq = -- bic->stable_merge_bfqq; -- int proc_ref = min(bfqq_process_refs(bfqq), -- bfqq_process_refs(stable_merge_bfqq)); -- -- /* deschedule stable merge, because done or aborted here */ -- bfq_put_stable_ref(stable_merge_bfqq); -- -- bic->stable_merge_bfqq = NULL; -- -- if (!idling_boosts_thr_without_issues(bfqd, bfqq) && -- proc_ref > 0) { -- /* next function will take at least one ref */ -- struct bfq_queue *new_bfqq = -- bfq_setup_merge(bfqq, stable_merge_bfqq); -- -- if (new_bfqq) { -- bic->stably_merged = true; -- if (new_bfqq->bic) -- new_bfqq->bic->stably_merged = -- true; -- } -- return new_bfqq; -- } else -- return NULL; -+ bfqq_data->stable_merge_bfqq; -+ -+ return bfq_setup_stable_merge(bfqd, bfqq, -+ stable_merge_bfqq, -+ bfqq_data); - } - } - -@@ -2965,6 +3035,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - { - struct bfq_io_cq *bic = bfqq->bic; -+ unsigned int a_idx = bfqq->actuator_idx; -+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; - - /* - * If !bfqq->bic, the queue is already shared or its requests -@@ -2974,18 +3046,21 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - if (!bic) - return; - -- bic->saved_last_serv_time_ns = bfqq->last_serv_time_ns; -- bic->saved_inject_limit = bfqq->inject_limit; -- bic->saved_decrease_time_jif = bfqq->decrease_time_jif; -- -- bic->saved_weight = bfqq->entity.orig_weight; -- bic->saved_ttime = bfqq->ttime; -- bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); -- bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -- bic->saved_io_start_time = bfqq->io_start_time; -- bic->saved_tot_idle_time = bfqq->tot_idle_time; -- bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -- bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -+ bfqq_data->saved_last_serv_time_ns = bfqq->last_serv_time_ns; -+ bfqq_data->saved_inject_limit = bfqq->inject_limit; -+ bfqq_data->saved_decrease_time_jif = bfqq->decrease_time_jif; -+ -+ bfqq_data->saved_weight = bfqq->entity.orig_weight; -+ bfqq_data->saved_ttime = bfqq->ttime; -+ bfqq_data->saved_has_short_ttime = -+ bfq_bfqq_has_short_ttime(bfqq); -+ bfqq_data->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -+ bfqq_data->saved_io_start_time = bfqq->io_start_time; -+ bfqq_data->saved_tot_idle_time = bfqq->tot_idle_time; -+ bfqq_data->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -+ bfqq_data->was_in_burst_list = -+ !hlist_unhashed(&bfqq->burst_list_node); -+ - if (unlikely(bfq_bfqq_just_created(bfqq) && - 
!bfq_bfqq_in_large_burst(bfqq) && - bfqq->bfqd->low_latency)) { -@@ -2998,17 +3073,21 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - * to bfqq, so that to avoid that bfqq unjustly fails - * to enjoy weight raising if split soon. - */ -- bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; -- bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now(); -- bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); -- bic->saved_last_wr_start_finish = jiffies; -+ bfqq_data->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; -+ bfqq_data->saved_wr_start_at_switch_to_srt = -+ bfq_smallest_from_now(); -+ bfqq_data->saved_wr_cur_max_time = -+ bfq_wr_duration(bfqq->bfqd); -+ bfqq_data->saved_last_wr_start_finish = jiffies; - } else { -- bic->saved_wr_coeff = bfqq->wr_coeff; -- bic->saved_wr_start_at_switch_to_srt = -+ bfqq_data->saved_wr_coeff = bfqq->wr_coeff; -+ bfqq_data->saved_wr_start_at_switch_to_srt = - bfqq->wr_start_at_switch_to_srt; -- bic->saved_service_from_wr = bfqq->service_from_wr; -- bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -- bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ bfqq_data->saved_service_from_wr = -+ bfqq->service_from_wr; -+ bfqq_data->saved_last_wr_start_finish = -+ bfqq->last_wr_start_finish; -+ bfqq_data->saved_wr_cur_max_time = bfqq->wr_cur_max_time; - } - } - -@@ -3114,7 +3193,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - /* - * Merge queues (that is, let bic redirect its requests to new_bfqq) - */ -- bic_set_bfqq(bic, new_bfqq, true); -+ bic_set_bfqq(bic, new_bfqq, true, bfqq->actuator_idx); - bfq_mark_bfqq_coop(new_bfqq); - /* - * new_bfqq now belongs to at least two bics (it is a shared queue): -@@ -3532,13 +3611,13 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - * - start a new observation interval with this dispatch - */ - if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && -- bfqd->rq_in_driver == 0) -+ bfqd->tot_rq_in_driver == 0) - goto update_rate_and_reset; - - /* Update sampling information */ - bfqd->peak_rate_samples++; - -- if ((bfqd->rq_in_driver > 0 || -+ if ((bfqd->tot_rq_in_driver > 0 || - now_ns - bfqd->last_completion < BFQ_MIN_TT) - && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) - bfqd->sequential_samples++; -@@ -3803,10 +3882,8 @@ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, - return false; - - return (bfqq->wr_coeff > 1 && -- (bfqd->wr_busy_queues < -- tot_busy_queues || -- bfqd->rq_in_driver >= -- bfqq->dispatched + 4)) || -+ (bfqd->wr_busy_queues < tot_busy_queues || -+ bfqd->tot_rq_in_driver >= bfqq->dispatched + 4)) || - bfq_asymmetric_scenario(bfqd, bfqq) || - tot_busy_queues == 1; - } -@@ -4072,8 +4149,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - * function to evaluate the I/O speed of a process. - */ - static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- bool compensate, enum bfqq_expiration reason, -- unsigned long *delta_ms) -+ bool compensate, unsigned long *delta_ms) - { - ktime_t delta_ktime; - u32 delta_usecs; -@@ -4269,7 +4345,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, - /* - * Check whether the process is slow (see bfq_bfqq_is_slow). 
- */ -- slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); -+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, &delta); - - /* - * As above explained, charge slow (typically seeky) and -@@ -4577,6 +4653,8 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) - { - struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue; - unsigned int limit = in_serv_bfqq->inject_limit; -+ int i; -+ - /* - * If - * - bfqq is not weight-raised and therefore does not carry -@@ -4608,7 +4686,7 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) - ) - limit = 1; - -- if (bfqd->rq_in_driver >= limit) -+ if (bfqd->tot_rq_in_driver >= limit) - return NULL; - - /* -@@ -4623,11 +4701,12 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) - * (and re-added only if it gets new requests, but then it - * is assigned again enough budget for its new backlog). - */ -- list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -- if (!RB_EMPTY_ROOT(&bfqq->sort_list) && -- (in_serv_always_inject || bfqq->wr_coeff > 1) && -- bfq_serv_to_charge(bfqq->next_rq, bfqq) <= -- bfq_bfqq_budget_left(bfqq)) { -+ for (i = 0; i < bfqd->num_actuators; i++) { -+ list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list) -+ if (!RB_EMPTY_ROOT(&bfqq->sort_list) && -+ (in_serv_always_inject || bfqq->wr_coeff > 1) && -+ bfq_serv_to_charge(bfqq->next_rq, bfqq) <= -+ bfq_bfqq_budget_left(bfqq)) { - /* - * Allow for only one large in-flight request - * on non-rotational devices, for the -@@ -4647,27 +4726,80 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) - */ - if (blk_queue_nonrot(bfqd->queue) && - blk_rq_sectors(bfqq->next_rq) >= -- BFQQ_SECT_THR_NONROT) -- limit = min_t(unsigned int, 1, limit); -- else -- limit = in_serv_bfqq->inject_limit; -- -- if (bfqd->rq_in_driver < limit) { -+ BFQQ_SECT_THR_NONROT && -+ bfqd->tot_rq_in_driver >= 1) -+ continue; -+ else { - bfqd->rqs_injected = true; - return bfqq; - } - } -+ } - - return NULL; - } - -+static struct bfq_queue * -+bfq_find_active_bfqq_for_actuator(struct bfq_data *bfqd, int idx) -+{ -+ struct bfq_queue *bfqq; -+ -+ if (bfqd->in_service_queue && -+ bfqd->in_service_queue->actuator_idx == idx) -+ return bfqd->in_service_queue; -+ -+ list_for_each_entry(bfqq, &bfqd->active_list[idx], bfqq_list) { -+ if (!RB_EMPTY_ROOT(&bfqq->sort_list) && -+ bfq_serv_to_charge(bfqq->next_rq, bfqq) <= -+ bfq_bfqq_budget_left(bfqq)) { -+ return bfqq; -+ } -+ } -+ -+ return NULL; -+} -+ -+/* -+ * Perform a linear scan of each actuator, until an actuator is found -+ * for which the following three conditions hold: the load of the -+ * actuator is below the threshold (see comments on -+ * actuator_load_threshold for details) and lower than that of the -+ * next actuator (comments on this extra condition below), and there -+ * is a queue that contains I/O for that actuator. On success, return -+ * that queue. -+ * -+ * Performing a plain linear scan entails a prioritization among -+ * actuators. The extra condition above breaks this prioritization and -+ * tends to distribute injection uniformly across actuators. 
-+ */ -+static struct bfq_queue * -+bfq_find_bfqq_for_underused_actuator(struct bfq_data *bfqd) -+{ -+ int i; -+ -+ for (i = 0 ; i < bfqd->num_actuators; i++) { -+ if (bfqd->rq_in_driver[i] < bfqd->actuator_load_threshold && -+ (i == bfqd->num_actuators - 1 || -+ bfqd->rq_in_driver[i] < bfqd->rq_in_driver[i+1])) { -+ struct bfq_queue *bfqq = -+ bfq_find_active_bfqq_for_actuator(bfqd, i); -+ -+ if (bfqq) -+ return bfqq; -+ } -+ } -+ -+ return NULL; -+} -+ -+ - /* - * Select a queue for service. If we have a current queue in service, - * check whether to continue servicing it, or retrieve and set a new one. - */ - static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - { -- struct bfq_queue *bfqq; -+ struct bfq_queue *bfqq, *inject_bfqq; - struct request *next_rq; - enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT; - -@@ -4689,6 +4821,15 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - goto expire; - - check_queue: -+ /* -+ * If some actuator is underutilized, but the in-service -+ * queue does not contain I/O for that actuator, then try to -+ * inject I/O for that actuator. -+ */ -+ inject_bfqq = bfq_find_bfqq_for_underused_actuator(bfqd); -+ if (inject_bfqq && inject_bfqq != bfqq) -+ return inject_bfqq; -+ - /* - * This loop is rarely executed more than once. Even when it - * happens, it is much more convenient to re-execute this loop -@@ -4748,11 +4889,8 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - */ - if (bfq_bfqq_wait_request(bfqq) || - (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { -- struct bfq_queue *async_bfqq = -- bfqq->bic && bfqq->bic->bfqq[0] && -- bfq_bfqq_busy(bfqq->bic->bfqq[0]) && -- bfqq->bic->bfqq[0]->next_rq ? -- bfqq->bic->bfqq[0] : NULL; -+ unsigned int act_idx = bfqq->actuator_idx; -+ struct bfq_queue *async_bfqq = NULL; - struct bfq_queue *blocked_bfqq = - !hlist_empty(&bfqq->woken_list) ? - container_of(bfqq->woken_list.first, -@@ -4760,6 +4898,10 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - woken_list_node) - : NULL; - -+ if (bfqq->bic && bfqq->bic->bfqq[0][act_idx] && -+ bfq_bfqq_busy(bfqq->bic->bfqq[0][act_idx]) && -+ bfqq->bic->bfqq[0][act_idx]->next_rq) -+ async_bfqq = bfqq->bic->bfqq[0][act_idx]; - /* - * The next four mutually-exclusive ifs decide - * whether to try injection, and choose the queue to -@@ -4844,7 +4986,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic && - bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= - bfq_bfqq_budget_left(async_bfqq)) -- bfqq = bfqq->bic->bfqq[0]; -+ bfqq = async_bfqq; - else if (bfqq->waker_bfqq && - bfq_bfqq_busy(bfqq->waker_bfqq) && - bfqq->waker_bfqq->next_rq && -@@ -4975,7 +5117,7 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, - bfq_dispatch_remove(bfqd->queue, rq); - - if (bfqq != bfqd->in_service_queue) -- goto return_rq; -+ return rq; - - /* - * If weight raising has to terminate for bfqq, then next -@@ -4995,12 +5137,9 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, - * belongs to CLASS_IDLE and other queues are waiting for - * service. 
- */ -- if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq))) -- goto return_rq; -- -- bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); -+ if (bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)) -+ bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); - --return_rq: - return rq; - } - -@@ -5043,11 +5182,11 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - - /* - * We exploit the bfq_finish_requeue_request hook to -- * decrement rq_in_driver, but -+ * decrement tot_rq_in_driver, but - * bfq_finish_requeue_request will not be invoked on - * this request. So, to avoid unbalance, just start -- * this request, without incrementing rq_in_driver. As -- * a negative consequence, rq_in_driver is deceptively -+ * this request, without incrementing tot_rq_in_driver. As -+ * a negative consequence, tot_rq_in_driver is deceptively - * lower than it should be while this request is in - * service. This may cause bfq_schedule_dispatch to be - * invoked uselessly. -@@ -5056,7 +5195,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * bfq_finish_requeue_request hook, if defined, is - * probably invoked also on this request. So, by - * exploiting this hook, we could 1) increment -- * rq_in_driver here, and 2) decrement it in -+ * tot_rq_in_driver here, and 2) decrement it in - * bfq_finish_requeue_request. Such a solution would - * let the value of the counter be always accurate, - * but it would entail using an extra interface -@@ -5085,7 +5224,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * Of course, serving one request at a time may cause loss of - * throughput. - */ -- if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) -+ if (bfqd->strict_guarantees && bfqd->tot_rq_in_driver > 0) - goto exit; - - bfqq = bfq_select_queue(bfqd); -@@ -5096,7 +5235,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - - if (rq) { - inc_in_driver_start_rq: -- bfqd->rq_in_driver++; -+ bfqd->rq_in_driver[bfqq->actuator_idx]++; -+ bfqd->tot_rq_in_driver++; - start_rq: - rq->rq_flags |= RQF_STARTED; - } -@@ -5283,8 +5423,6 @@ void bfq_put_cooperator(struct bfq_queue *bfqq) - */ - __bfqq = bfqq->new_bfqq; - while (__bfqq) { -- if (__bfqq == bfqq) -- break; - next = __bfqq->new_bfqq; - bfq_put_queue(__bfqq); - __bfqq = next; -@@ -5305,48 +5443,55 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_release_process_ref(bfqd, bfqq); - } - --static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) -+static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, -+ unsigned int actuator_idx) - { -- struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); -+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, actuator_idx); - struct bfq_data *bfqd; - - if (bfqq) - bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ - - if (bfqq && bfqd) { -- unsigned long flags; -- -- spin_lock_irqsave(&bfqd->lock, flags); -- bic_set_bfqq(bic, NULL, is_sync); -+ bic_set_bfqq(bic, NULL, is_sync, actuator_idx); - bfq_exit_bfqq(bfqd, bfqq); -- spin_unlock_irqrestore(&bfqd->lock, flags); - } - } - - static void bfq_exit_icq(struct io_cq *icq) - { - struct bfq_io_cq *bic = icq_to_bic(icq); -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ unsigned long flags; -+ unsigned int act_idx; -+ /* -+ * If bfqd and thus bfqd->num_actuators is not available any -+ * longer, then cycle over all possible per-actuator bfqqs in -+ * next loop. 
We rely on bic being zeroed on creation, and -+ * therefore on its unused per-actuator fields being NULL. -+ */ -+ unsigned int num_actuators = BFQ_MAX_ACTUATORS; -+ struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data; - -- if (bic->stable_merge_bfqq) { -- struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd; -+ /* -+ * bfqd is NULL if scheduler already exited, and in that case -+ * this is the last time these queues are accessed. -+ */ -+ if (bfqd) { -+ spin_lock_irqsave(&bfqd->lock, flags); -+ num_actuators = bfqd->num_actuators; -+ } - -- /* -- * bfqd is NULL if scheduler already exited, and in -- * that case this is the last time bfqq is accessed. -- */ -- if (bfqd) { -- unsigned long flags; -+ for (act_idx = 0; act_idx < num_actuators; act_idx++) { -+ if (bfqq_data[act_idx].stable_merge_bfqq) -+ bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq); - -- spin_lock_irqsave(&bfqd->lock, flags); -- bfq_put_stable_ref(bic->stable_merge_bfqq); -- spin_unlock_irqrestore(&bfqd->lock, flags); -- } else { -- bfq_put_stable_ref(bic->stable_merge_bfqq); -- } -+ bfq_exit_icq_bfqq(bic, true, act_idx); -+ bfq_exit_icq_bfqq(bic, false, act_idx); - } - -- bfq_exit_icq_bfqq(bic, true); -- bfq_exit_icq_bfqq(bic, false); -+ if (bfqd) -+ spin_unlock_irqrestore(&bfqd->lock, flags); - } - - /* -@@ -5423,25 +5568,27 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) - - bic->ioprio = ioprio; - -- bfqq = bic_to_bfqq(bic, false); -+ bfqq = bic_to_bfqq(bic, false, bfq_actuator_index(bfqd, bio)); - if (bfqq) { - struct bfq_queue *old_bfqq = bfqq; - - bfqq = bfq_get_queue(bfqd, bio, false, bic, true); -- bic_set_bfqq(bic, bfqq, false); -+ bic_set_bfqq(bic, bfqq, false, bfq_actuator_index(bfqd, bio)); - bfq_release_process_ref(bfqd, old_bfqq); - } - -- bfqq = bic_to_bfqq(bic, true); -+ bfqq = bic_to_bfqq(bic, true, bfq_actuator_index(bfqd, bio)); - if (bfqq) - bfq_set_next_ioprio_data(bfqq, bic); - } - - static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- struct bfq_io_cq *bic, pid_t pid, int is_sync) -+ struct bfq_io_cq *bic, pid_t pid, int is_sync, -+ unsigned int act_idx) - { - u64 now_ns = ktime_get_ns(); - -+ bfqq->actuator_idx = act_idx; - RB_CLEAR_NODE(&bfqq->entity.rb_node); - INIT_LIST_HEAD(&bfqq->fifo); - INIT_HLIST_NODE(&bfqq->burst_list_node); -@@ -5501,22 +5648,24 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - - /* first request is almost certainly seeky */ - bfqq->seek_history = 1; -+ -+ bfqq->decrease_time_jif = jiffies; - } - - static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, - struct bfq_group *bfqg, -- int ioprio_class, int ioprio) -+ int ioprio_class, int ioprio, int act_idx) - { - switch (ioprio_class) { - case IOPRIO_CLASS_RT: -- return &bfqg->async_bfqq[0][ioprio]; -+ return &bfqg->async_bfqq[0][ioprio][act_idx]; - case IOPRIO_CLASS_NONE: - ioprio = IOPRIO_BE_NORM; - fallthrough; - case IOPRIO_CLASS_BE: -- return &bfqg->async_bfqq[1][ioprio]; -+ return &bfqg->async_bfqq[1][ioprio][act_idx]; - case IOPRIO_CLASS_IDLE: -- return &bfqg->async_idle_bfqq; -+ return &bfqg->async_idle_bfqq[act_idx]; - default: - return NULL; - } -@@ -5527,6 +5676,7 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_io_cq *bic, - struct bfq_queue *last_bfqq_created) - { -+ unsigned int a_idx = last_bfqq_created->actuator_idx; - struct bfq_queue *new_bfqq = - bfq_setup_merge(bfqq, last_bfqq_created); - -@@ -5534,8 +5684,8 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, 
struct bfq_queue *bfqq, - return bfqq; - - if (new_bfqq->bic) -- new_bfqq->bic->stably_merged = true; -- bic->stably_merged = true; -+ new_bfqq->bic->bfqq_data[a_idx].stably_merged = true; -+ bic->bfqq_data[a_idx].stably_merged = true; - - /* - * Reusing merge functions. This implies that -@@ -5610,9 +5760,13 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, - * it has been set already, but too long ago, then move it - * forward to bfqq. Finally, move also if bfqq belongs to a - * different group than last_bfqq_created, or if bfqq has a -- * different ioprio or ioprio_class. If none of these -- * conditions holds true, then try an early stable merge or -- * schedule a delayed stable merge. -+ * different ioprio, ioprio_class or actuator_idx. If none of -+ * these conditions holds true, then try an early stable merge -+ * or schedule a delayed stable merge. As for the condition on -+ * actuator_idx, the reason is that, if queues associated with -+ * different actuators are merged, then control is lost on -+ * each actuator. Therefore some actuator may be -+ * underutilized, and throughput may decrease. - * - * A delayed merge is scheduled (instead of performing an - * early merge), in case bfqq might soon prove to be more -@@ -5630,7 +5784,8 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, - bfqq->creation_time) || - bfqq->entity.parent != last_bfqq_created->entity.parent || - bfqq->ioprio != last_bfqq_created->ioprio || -- bfqq->ioprio_class != last_bfqq_created->ioprio_class) -+ bfqq->ioprio_class != last_bfqq_created->ioprio_class || -+ bfqq->actuator_idx != last_bfqq_created->actuator_idx) - *source_bfqq = bfqq; - else if (time_after_eq(last_bfqq_created->creation_time + - bfqd->bfq_burst_interval, -@@ -5660,7 +5815,8 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, - /* - * Record the bfqq to merge to. - */ -- bic->stable_merge_bfqq = last_bfqq_created; -+ bic->bfqq_data[last_bfqq_created->actuator_idx].stable_merge_bfqq = -+ last_bfqq_created; - } - } - -@@ -5682,7 +5838,8 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - bfqg = bfq_bio_bfqg(bfqd, bio); - if (!is_sync) { - async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, -- ioprio); -+ ioprio, -+ bfq_actuator_index(bfqd, bio)); - bfqq = *async_bfqq; - if (bfqq) - goto out; -@@ -5694,7 +5851,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - - if (bfqq) { - bfq_init_bfqq(bfqd, bfqq, bic, current->pid, -- is_sync); -+ is_sync, bfq_actuator_index(bfqd, bio)); - bfq_init_entity(&bfqq->entity, bfqg); - bfq_log_bfqq(bfqd, bfqq, "allocated"); - } else { -@@ -6009,7 +6166,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - * then complete the merge and redirect it to - * new_bfqq. - */ -- if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -+ if (bic_to_bfqq(RQ_BIC(rq), true, -+ bfq_actuator_index(bfqd, rq->bio)) == bfqq) - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), - bfqq, new_bfqq); - -@@ -6147,7 +6305,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) - struct bfq_queue *bfqq = bfqd->in_service_queue; - - bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, -- bfqd->rq_in_driver); -+ bfqd->tot_rq_in_driver); - - if (bfqd->hw_tag == 1) - return; -@@ -6158,7 +6316,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) - * sum is not exact, as it's not taking into account deactivated - * requests. 
- */ -- if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) -+ if (bfqd->tot_rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) - return; - - /* -@@ -6169,7 +6327,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) - if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && - bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < - BFQ_HW_QUEUE_THRESHOLD && -- bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) -+ bfqd->tot_rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) - return; - - if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) -@@ -6190,7 +6348,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - - bfq_update_hw_tag(bfqd); - -- bfqd->rq_in_driver--; -+ bfqd->rq_in_driver[bfqq->actuator_idx]--; -+ bfqd->tot_rq_in_driver--; - bfqq->dispatched--; - - if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { -@@ -6310,7 +6469,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - BFQQE_NO_MORE_REQUESTS); - } - -- if (!bfqd->rq_in_driver) -+ if (!bfqd->tot_rq_in_driver) - bfq_schedule_dispatch(bfqd); - } - -@@ -6441,13 +6600,13 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, - * conditions to do it, or we can lower the last base value - * computed. - * -- * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O -+ * NOTE: (bfqd->tot_rq_in_driver == 1) means that there is no I/O - * request in flight, because this function is in the code - * path that handles the completion of a request of bfqq, and, - * in particular, this function is executed before -- * bfqd->rq_in_driver is decremented in such a code path. -+ * bfqd->tot_rq_in_driver is decremented in such a code path. - */ -- if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) || -+ if ((bfqq->last_serv_time_ns == 0 && bfqd->tot_rq_in_driver == 1) || - tot_time_ns < bfqq->last_serv_time_ns) { - if (bfqq->last_serv_time_ns == 0) { - /* -@@ -6457,7 +6616,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, - bfqq->inject_limit = max_t(unsigned int, 1, old_limit); - } - bfqq->last_serv_time_ns = tot_time_ns; -- } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1) -+ } else if (!bfqd->rqs_injected && bfqd->tot_rq_in_driver == 1) - /* - * No I/O injected and no request still in service in - * the drive: these are the exact conditions for -@@ -6564,7 +6723,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) - return bfqq; - } - -- bic_set_bfqq(bic, NULL, true); -+ bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx); - - bfq_put_cooperator(bfqq); - -@@ -6578,7 +6737,9 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, - bool split, bool is_sync, - bool *new_queue) - { -- struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); -+ unsigned int act_idx = bfq_actuator_index(bfqd, bio); -+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx); -+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[act_idx]; - - if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) - return bfqq; -@@ -6590,14 +6751,14 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, - bfq_put_queue(bfqq); - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split); - -- bic_set_bfqq(bic, bfqq, is_sync); -+ bic_set_bfqq(bic, bfqq, is_sync, act_idx); - if (split && is_sync) { -- if ((bic->was_in_burst_list && bfqd->large_burst) || -- bic->saved_in_large_burst) -+ if ((bfqq_data->was_in_burst_list && bfqd->large_burst) || -+ bfqq_data->saved_in_large_burst) - bfq_mark_bfqq_in_large_burst(bfqq); - else { - 
bfq_clear_bfqq_in_large_burst(bfqq); -- if (bic->was_in_burst_list) -+ if (bfqq_data->was_in_burst_list) - /* - * If bfqq was in the current - * burst list before being -@@ -6686,19 +6847,20 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) - struct bfq_queue *bfqq; - bool new_queue = false; - bool bfqq_already_existing = false, split = false; -+ unsigned int a_idx = bfq_actuator_index(bfqd, bio); - - if (unlikely(!rq->elv.icq)) - return NULL; - - /* -- * Assuming that elv.priv[1] is set only if everything is set -+ * Assuming that RQ_BFQQ(rq) is set only if everything is set - * for this rq. This holds true, because this function is - * invoked only for insertion or merging, and, after such - * events, a request cannot be manipulated any longer before - * being removed from bfq. - */ -- if (rq->elv.priv[1]) -- return rq->elv.priv[1]; -+ if (RQ_BFQQ(rq)) -+ return RQ_BFQQ(rq); - - bic = icq_to_bic(rq->elv.icq); - -@@ -6712,12 +6874,13 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) - if (likely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) && -- !bic->stably_merged) { -+ !bic->bfqq_data[a_idx].stably_merged) { - struct bfq_queue *old_bfqq = bfqq; - - /* Update bic before losing reference to bfqq */ - if (bfq_bfqq_in_large_burst(bfqq)) -- bic->saved_in_large_burst = true; -+ bic->bfqq_data[a_idx].saved_in_large_burst = -+ true; - - bfqq = bfq_split_bfqq(bic, bfqq); - split = true; -@@ -6900,13 +7063,15 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, - */ - void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) - { -- int i, j; -+ int i, j, k; - -- for (i = 0; i < 2; i++) -- for (j = 0; j < IOPRIO_NR_LEVELS; j++) -- __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); -+ for (k = 0; k < bfqd->num_actuators; k++) { -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_NR_LEVELS; j++) -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j][k]); - -- __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq[k]); -+ } - } - - /* -@@ -7018,6 +7183,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - { - struct bfq_data *bfqd; - struct elevator_queue *eq; -+ unsigned int i; -+ struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges; - - eq = elevator_alloc(q, e); - if (!eq) -@@ -7038,8 +7205,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. - * Grab a permanent reference to it, so that the normal code flow - * will not attempt to free it. -+ * Set zero as actuator index: we will pretend that -+ * all I/O requests are for the same actuator. - */ -- bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0, 0); - bfqd->oom_bfqq.ref++; - bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; - bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; -@@ -7058,6 +7227,39 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - - bfqd->queue = q; - -+ bfqd->num_actuators = 1; -+ /* -+ * If the disk supports multiple actuators, copy independent -+ * access ranges from the request queue structure. -+ */ -+ spin_lock_irq(&q->queue_lock); -+ if (ia_ranges) { -+ /* -+ * Check if the disk ia_ranges size exceeds the current bfq -+ * actuator limit. 
-+ */ -+ if (ia_ranges->nr_ia_ranges > BFQ_MAX_ACTUATORS) { -+ pr_crit("nr_ia_ranges higher than act limit: iars=%d, max=%d.\n", -+ ia_ranges->nr_ia_ranges, BFQ_MAX_ACTUATORS); -+ pr_crit("Falling back to single actuator mode.\n"); -+ } else { -+ bfqd->num_actuators = ia_ranges->nr_ia_ranges; -+ -+ for (i = 0; i < bfqd->num_actuators; i++) { -+ bfqd->sector[i] = ia_ranges->ia_range[i].sector; -+ bfqd->nr_sectors[i] = -+ ia_ranges->ia_range[i].nr_sectors; -+ } -+ } -+ } -+ -+ /* Otherwise use single-actuator dev info */ -+ if (bfqd->num_actuators == 1) { -+ bfqd->sector[0] = 0; -+ bfqd->nr_sectors[0] = get_capacity(q->disk); -+ } -+ spin_unlock_irq(&q->queue_lock); -+ - INIT_LIST_HEAD(&bfqd->dispatch); - - hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, -@@ -7069,7 +7271,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->num_groups_with_pending_reqs = 0; - #endif - -- INIT_LIST_HEAD(&bfqd->active_list); -+ INIT_LIST_HEAD(&bfqd->active_list[0]); -+ INIT_LIST_HEAD(&bfqd->active_list[1]); - INIT_LIST_HEAD(&bfqd->idle_list); - INIT_HLIST_HEAD(&bfqd->burst_list); - -@@ -7095,7 +7298,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - */ - bfqd->bfq_wr_coeff = 30; - bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); -- bfqd->bfq_wr_max_time = 0; - bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); - bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); - bfqd->bfq_wr_max_softrt_rate = 7000; /* -@@ -7114,6 +7316,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; - bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; - -+ /* see comments on the definition of next field inside bfq_data */ -+ bfqd->actuator_load_threshold = 4; -+ - spin_lock_init(&bfqd->lock); - - /* -@@ -7412,6 +7617,7 @@ MODULE_ALIAS("bfq-iosched"); +@@ -7617,6 +7617,7 @@ MODULE_ALIAS("bfq-iosched"); static int __init bfq_init(void) { int ret; -+ char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.2"; ++ char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.3"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); -@@ -7443,6 +7649,11 @@ static int __init bfq_init(void) +@@ -7648,6 +7649,11 @@ static int __init bfq_init(void) if (ret) goto slab_kill; @@ -4737,2196 +3317,18 @@ index 380e9bda2e57..aa644973d260 100644 return 0; slab_kill: -diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h -index 466e4865ace6..75cc6a324267 100644 ---- a/block/bfq-iosched.h -+++ b/block/bfq-iosched.h -@@ -33,6 +33,14 @@ - */ - #define BFQ_SOFTRT_WEIGHT_FACTOR 100 - -+/* -+ * Maximum number of actuators supported. This constant is used simply -+ * to define the size of the static array that will contain -+ * per-actuator data. The current value is hopefully a good upper -+ * bound to the possible number of actuators of any actual drive. -+ */ -+#define BFQ_MAX_ACTUATORS 8 -+ - struct bfq_entity; - - /** -@@ -227,12 +235,14 @@ struct bfq_ttime { - * struct bfq_queue - leaf schedulable entity. - * - * A bfq_queue is a leaf request queue; it can be associated with an -- * io_context or more, if it is async or shared between cooperating -- * processes. @cgroup holds a reference to the cgroup, to be sure that it -- * does not disappear while a bfqq still references it (mostly to avoid -- * races between request issuing and task migration followed by cgroup -- * destruction). -- * All the fields are protected by the queue lock of the containing bfqd. 
-+ * io_context or more, if it is async or shared between cooperating -+ * processes. Besides, it contains I/O requests for only one actuator -+ * (an io_context is associated with a different bfq_queue for each -+ * actuator it generates I/O for). @cgroup holds a reference to the -+ * cgroup, to be sure that it does not disappear while a bfqq still -+ * references it (mostly to avoid races between request issuing and -+ * task migration followed by cgroup destruction). All the fields are -+ * protected by the queue lock of the containing bfqd. - */ - struct bfq_queue { - /* reference counter */ -@@ -397,24 +407,18 @@ struct bfq_queue { - * the woken queues when this queue exits. - */ - struct hlist_head woken_list; -+ -+ /* index of the actuator this queue is associated with */ -+ unsigned int actuator_idx; - }; - - /** -- * struct bfq_io_cq - per (request_queue, io_context) structure. -- */ --struct bfq_io_cq { -- /* associated io_cq structure */ -- struct io_cq icq; /* must be the first member */ -- /* array of two process queues, the sync and the async */ -- struct bfq_queue *bfqq[2]; -- /* per (request_queue, blkcg) ioprio */ -- int ioprio; --#ifdef CONFIG_BFQ_GROUP_IOSCHED -- uint64_t blkcg_serial_nr; /* the current blkcg serial */ --#endif -+* struct bfq_data - bfqq data unique and persistent for associated bfq_io_cq -+*/ -+struct bfq_iocq_bfqq_data { - /* - * Snapshot of the has_short_time flag before merging; taken -- * to remember its value while the queue is merged, so as to -+ * to remember its values while the queue is merged, so as to - * be able to restore it in case of split. - */ - bool saved_has_short_ttime; -@@ -428,7 +432,7 @@ struct bfq_io_cq { - u64 saved_tot_idle_time; - - /* -- * Same purpose as the previous fields for the value of the -+ * Same purpose as the previous fields for the values of the - * field keeping the queue's belonging to a large burst - */ - bool saved_in_large_burst; -@@ -466,6 +470,38 @@ struct bfq_io_cq { - struct bfq_queue *stable_merge_bfqq; - - bool stably_merged; /* non splittable if true */ -+}; -+ -+/** -+ * struct bfq_io_cq - per (request_queue, io_context) structure. -+ */ -+struct bfq_io_cq { -+ /* associated io_cq structure */ -+ struct io_cq icq; /* must be the first member */ -+ /* -+ * Matrix of associated process queues: first row for async -+ * queues, second row sync queues. Each row contains one -+ * column for each actuator. An I/O request generated by the -+ * process is inserted into the queue pointed by bfqq[i][j] if -+ * the request is to be served by the j-th actuator of the -+ * drive, where i==0 or i==1, depending on whether the request -+ * is async or sync. So there is a distinct queue for each -+ * actuator. -+ */ -+ struct bfq_queue *bfqq[2][BFQ_MAX_ACTUATORS]; -+ /* per (request_queue, blkcg) ioprio */ -+ int ioprio; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ uint64_t blkcg_serial_nr; /* the current blkcg serial */ -+#endif -+ -+ /* -+ * Persistent data for associated synchronous process queues -+ * (one queue per actuator, see field bfqq above). In -+ * particular, each of these queues may undergo a merge. 
-+ */ -+ struct bfq_iocq_bfqq_data bfqq_data[BFQ_MAX_ACTUATORS]; -+ - unsigned int requests; /* Number of requests this process has in flight */ - }; - -@@ -554,7 +590,12 @@ struct bfq_data { - /* number of queued requests */ - int queued; - /* number of requests dispatched and waiting for completion */ -- int rq_in_driver; -+ int tot_rq_in_driver; -+ /* -+ * number of requests dispatched and waiting for completion -+ * for each actuator -+ */ -+ int rq_in_driver[BFQ_MAX_ACTUATORS]; - - /* true if the device is non rotational and performs queueing */ - bool nonrot_with_queueing; -@@ -648,8 +689,13 @@ struct bfq_data { - /* maximum budget allotted to a bfq_queue before rescheduling */ - int bfq_max_budget; - -- /* list of all the bfq_queues active on the device */ -- struct list_head active_list; -+ /* -+ * List of all the bfq_queues active for a specific actuator -+ * on the device. Keeping active queues separate on a -+ * per-actuator basis helps implementing per-actuator -+ * injection more efficiently. -+ */ -+ struct list_head active_list[BFQ_MAX_ACTUATORS]; - /* list of all the bfq_queues idle on the device */ - struct list_head idle_list; - -@@ -723,8 +769,6 @@ struct bfq_data { - * is multiplied. - */ - unsigned int bfq_wr_coeff; -- /* maximum duration of a weight-raising period (jiffies) */ -- unsigned int bfq_wr_max_time; - - /* Maximum weight-raising duration for soft real-time processes */ - unsigned int bfq_wr_rt_max_time; -@@ -772,6 +816,42 @@ struct bfq_data { - */ - unsigned int word_depths[2][2]; - unsigned int full_depth_shift; -+ -+ /* -+ * Number of independent actuators. This is equal to 1 in -+ * case of single-actuator drives. -+ */ -+ unsigned int num_actuators; -+ /* -+ * Disk independent access ranges for each actuator -+ * in this device. -+ */ -+ sector_t sector[BFQ_MAX_ACTUATORS]; -+ sector_t nr_sectors[BFQ_MAX_ACTUATORS]; -+ struct blk_independent_access_range ia_ranges[BFQ_MAX_ACTUATORS]; -+ -+ /* -+ * If the number of I/O requests queued in the device for a -+ * given actuator is below next threshold, then the actuator -+ * is deemed as underutilized. If this condition is found to -+ * hold for some actuator upon a dispatch, but (i) the -+ * in-service queue does not contain I/O for that actuator, -+ * while (ii) some other queue does contain I/O for that -+ * actuator, then the head I/O request of the latter queue is -+ * returned (injected), instead of the head request of the -+ * currently in-service queue. -+ * -+ * We set the threshold, empirically, to the minimum possible -+ * value for which an actuator is fully utilized, or close to -+ * be fully utilized. By doing so, injected I/O 'steals' as -+ * few drive-queue slots as possibile to the in-service -+ * queue. This reduces as much as possible the probability -+ * that the service of I/O from the in-service bfq_queue gets -+ * delayed because of slot exhaustion, i.e., because all the -+ * slots of the drive queue are filled with I/O injected from -+ * other queues (NCQ provides for 32 slots). 
-+ */ -+ unsigned int actuator_load_threshold; - }; - - enum bfqq_state_flags { -@@ -937,8 +1017,8 @@ struct bfq_group { - - struct bfq_data *bfqd; - -- struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; -- struct bfq_queue *async_idle_bfqq; -+ struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; -+ struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; - - struct bfq_entity *my_entity; - -@@ -955,8 +1035,8 @@ struct bfq_group { - struct bfq_entity entity; - struct bfq_sched_data sched_data; - -- struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; -- struct bfq_queue *async_idle_bfqq; -+ struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; -+ struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; - - struct rb_root rq_pos_tree; - }; -@@ -969,8 +1049,10 @@ struct bfq_group { - - extern const int bfq_timeout; - --struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync); --void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync); -+struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, -+ unsigned int actuator_idx); -+void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync, -+ unsigned int actuator_idx); - struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); - void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); - void bfq_weights_tree_add(struct bfq_queue *bfqq); -diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c -index ea4c3d757fdd..7941b6f07391 100644 ---- a/block/bfq-wf2q.c -+++ b/block/bfq-wf2q.c -@@ -493,7 +493,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, - bfq_update_active_tree(node); - - if (bfqq) -- list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list[bfqq->actuator_idx]); - - bfq_inc_active_entities(entity); - } -diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c -index 9ac1efb053e0..4272599a3f08 100644 ---- a/block/blk-cgroup.c -+++ b/block/blk-cgroup.c -@@ -33,7 +33,6 @@ - #include "blk-cgroup.h" - #include "blk-ioprio.h" - #include "blk-throttle.h" --#include "blk-rq-qos.h" - - /* - * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. -@@ -626,69 +625,93 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) - EXPORT_SYMBOL_GPL(__blkg_prfill_u64); - - /** -- * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update -- * @inputp: input string pointer -+ * blkg_conf_init - initialize a blkg_conf_ctx -+ * @ctx: blkg_conf_ctx to initialize -+ * @input: input string -+ * -+ * Initialize @ctx which can be used to parse blkg config input string @input. -+ * Once initialized, @ctx can be used with blkg_conf_open_bdev() and -+ * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit(). -+ */ -+void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input) -+{ -+ *ctx = (struct blkg_conf_ctx){ .input = input }; -+} -+EXPORT_SYMBOL_GPL(blkg_conf_init); -+ -+/** -+ * blkg_conf_open_bdev - parse and open bdev for per-blkg config update -+ * @ctx: blkg_conf_ctx initialized with blkg_conf_init() - * -- * Parse the device node prefix part, MAJ:MIN, of per-blkg config update -- * from @input and get and return the matching bdev. *@inputp is -- * updated to point past the device node prefix. Returns an ERR_PTR() -- * value on error. -+ * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from -+ * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is -+ * set to point past the device node prefix. 
- * -- * Use this function iff blkg_conf_prep() can't be used for some reason. -+ * This function may be called multiple times on @ctx and the extra calls become -+ * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function -+ * explicitly if bdev access is needed without resolving the blkcg / policy part -+ * of @ctx->input. Returns -errno on error. - */ --struct block_device *blkcg_conf_open_bdev(char **inputp) -+int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) - { -- char *input = *inputp; -+ char *input = ctx->input; - unsigned int major, minor; - struct block_device *bdev; - int key_len; - -+ if (ctx->bdev) -+ return 0; -+ - if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) -- return ERR_PTR(-EINVAL); -+ return -EINVAL; - - input += key_len; - if (!isspace(*input)) -- return ERR_PTR(-EINVAL); -+ return -EINVAL; - input = skip_spaces(input); - - bdev = blkdev_get_no_open(MKDEV(major, minor)); - if (!bdev) -- return ERR_PTR(-ENODEV); -+ return -ENODEV; - if (bdev_is_partition(bdev)) { - blkdev_put_no_open(bdev); -- return ERR_PTR(-ENODEV); -+ return -ENODEV; - } - -- *inputp = input; -- return bdev; -+ ctx->body = input; -+ ctx->bdev = bdev; -+ return 0; - } - - /** - * blkg_conf_prep - parse and prepare for per-blkg config update - * @blkcg: target block cgroup - * @pol: target policy -- * @input: input string -- * @ctx: blkg_conf_ctx to be filled -+ * @ctx: blkg_conf_ctx initialized with blkg_conf_init() -+ * -+ * Parse per-blkg config update from @ctx->input and initialize @ctx -+ * accordingly. On success, @ctx->body points to the part of @ctx->input -+ * following MAJ:MIN, @ctx->bdev points to the target block device and -+ * @ctx->blkg to the blkg being configured. - * -- * Parse per-blkg config update from @input and initialize @ctx with the -- * result. @ctx->blkg points to the blkg to be updated and @ctx->body the -- * part of @input following MAJ:MIN. This function returns with RCU read -- * lock and queue lock held and must be paired with blkg_conf_finish(). -+ * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this -+ * function returns with queue lock held and must be followed by -+ * blkg_conf_exit(). - */ - int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, -- char *input, struct blkg_conf_ctx *ctx) -- __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock) -+ struct blkg_conf_ctx *ctx) -+ __acquires(&bdev->bd_queue->queue_lock) - { -- struct block_device *bdev; - struct gendisk *disk; - struct request_queue *q; - struct blkcg_gq *blkg; - int ret; - -- bdev = blkcg_conf_open_bdev(&input); -- if (IS_ERR(bdev)) -- return PTR_ERR(bdev); -- disk = bdev->bd_disk; -+ ret = blkg_conf_open_bdev(ctx); -+ if (ret) -+ return ret; -+ -+ disk = ctx->bdev->bd_disk; - q = disk->queue; - - /* -@@ -699,7 +722,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - if (ret) - goto fail; - -- rcu_read_lock(); - spin_lock_irq(&q->queue_lock); - - if (!blkcg_policy_enabled(q, pol)) { -@@ -728,7 +750,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - - /* Drop locks to do new blkg allocation with GFP_KERNEL. 
*/ - spin_unlock_irq(&q->queue_lock); -- rcu_read_unlock(); - - new_blkg = blkg_alloc(pos, disk, GFP_KERNEL); - if (unlikely(!new_blkg)) { -@@ -742,7 +763,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - goto fail_exit_queue; - } - -- rcu_read_lock(); - spin_lock_irq(&q->queue_lock); - - if (!blkcg_policy_enabled(q, pol)) { -@@ -769,20 +789,16 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - } - success: - blk_queue_exit(q); -- ctx->bdev = bdev; - ctx->blkg = blkg; -- ctx->body = input; - return 0; - - fail_preloaded: - radix_tree_preload_end(); - fail_unlock: - spin_unlock_irq(&q->queue_lock); -- rcu_read_unlock(); - fail_exit_queue: - blk_queue_exit(q); - fail: -- blkdev_put_no_open(bdev); - /* - * If queue was bypassing, we should retry. Do so after a - * short msleep(). It isn't strictly necessary but queue -@@ -798,20 +814,27 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - EXPORT_SYMBOL_GPL(blkg_conf_prep); - - /** -- * blkg_conf_finish - finish up per-blkg config update -- * @ctx: blkg_conf_ctx initialized by blkg_conf_prep() -+ * blkg_conf_exit - clean up per-blkg config update -+ * @ctx: blkg_conf_ctx initialized with blkg_conf_init() - * -- * Finish up after per-blkg config update. This function must be paired -- * with blkg_conf_prep(). -+ * Clean up after per-blkg config update. This function must be called on all -+ * blkg_conf_ctx's initialized with blkg_conf_init(). - */ --void blkg_conf_finish(struct blkg_conf_ctx *ctx) -- __releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu) -+void blkg_conf_exit(struct blkg_conf_ctx *ctx) -+ __releases(&ctx->bdev->bd_queue->queue_lock) - { -- spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); -- rcu_read_unlock(); -- blkdev_put_no_open(ctx->bdev); -+ if (ctx->blkg) { -+ spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); -+ ctx->blkg = NULL; -+ } -+ -+ if (ctx->bdev) { -+ blkdev_put_no_open(ctx->bdev); -+ ctx->body = NULL; -+ ctx->bdev = NULL; -+ } - } --EXPORT_SYMBOL_GPL(blkg_conf_finish); -+EXPORT_SYMBOL_GPL(blkg_conf_exit); - - static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) - { -@@ -1300,14 +1323,8 @@ int blkcg_init_disk(struct gendisk *disk) - if (ret) - goto err_ioprio_exit; - -- ret = blk_iolatency_init(disk); -- if (ret) -- goto err_throtl_exit; -- - return 0; - --err_throtl_exit: -- blk_throtl_exit(disk); - err_ioprio_exit: - blk_ioprio_exit(disk); - err_destroy_all: -@@ -1323,7 +1340,6 @@ int blkcg_init_disk(struct gendisk *disk) - void blkcg_exit_disk(struct gendisk *disk) - { - blkg_destroy_all(disk); -- rq_qos_exit(disk->queue); - blk_throtl_exit(disk); - } - -diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h -index 1e94e404eaa8..fe09e8b4c2a8 100644 ---- a/block/blk-cgroup.h -+++ b/block/blk-cgroup.h -@@ -208,15 +208,17 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, - u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); - - struct blkg_conf_ctx { -+ char *input; -+ char *body; - struct block_device *bdev; - struct blkcg_gq *blkg; -- char *body; - }; - --struct block_device *blkcg_conf_open_bdev(char **inputp); -+void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); -+int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); - int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, -- char *input, struct blkg_conf_ctx *ctx); --void blkg_conf_finish(struct blkg_conf_ctx *ctx); -+ struct blkg_conf_ctx *ctx); -+void 
blkg_conf_exit(struct blkg_conf_ctx *ctx); - - /** - * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg -diff --git a/block/blk-iocost.c b/block/blk-iocost.c -index ec7219caea16..c31d57e29bf8 100644 ---- a/block/blk-iocost.c -+++ b/block/blk-iocost.c -@@ -3096,9 +3096,11 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, - return nbytes; - } - -- ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx); -+ blkg_conf_init(&ctx, buf); -+ -+ ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx); - if (ret) -- return ret; -+ goto err; - - iocg = blkg_to_iocg(ctx.blkg); - -@@ -3117,12 +3119,14 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, - weight_updated(iocg, &now); - spin_unlock(&iocg->ioc->lock); - -- blkg_conf_finish(&ctx); -+ blkg_conf_exit(&ctx); - return nbytes; - - einval: -- blkg_conf_finish(&ctx); -- return -EINVAL; -+ ret = -EINVAL; -+err: -+ blkg_conf_exit(&ctx); -+ return ret; - } - - static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, -@@ -3177,19 +3181,22 @@ static const match_table_t qos_tokens = { - static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, - size_t nbytes, loff_t off) - { -- struct block_device *bdev; -+ struct blkg_conf_ctx ctx; - struct gendisk *disk; - struct ioc *ioc; - u32 qos[NR_QOS_PARAMS]; - bool enable, user; -- char *p; -+ char *body, *p; - int ret; - -- bdev = blkcg_conf_open_bdev(&input); -- if (IS_ERR(bdev)) -- return PTR_ERR(bdev); -+ blkg_conf_init(&ctx, input); - -- disk = bdev->bd_disk; -+ ret = blkg_conf_open_bdev(&ctx); -+ if (ret) -+ goto err; -+ -+ body = ctx.body; -+ disk = ctx.bdev->bd_disk; - ioc = q_to_ioc(disk->queue); - if (!ioc) { - ret = blk_iocost_init(disk); -@@ -3206,7 +3213,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, - enable = ioc->enabled; - user = ioc->user_qos_params; - -- while ((p = strsep(&input, " \t\n"))) { -+ while ((p = strsep(&body, " \t\n"))) { - substring_t args[MAX_OPT_ARGS]; - char buf[32]; - int tok; -@@ -3295,7 +3302,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, - blk_mq_unquiesce_queue(disk->queue); - blk_mq_unfreeze_queue(disk->queue); - -- blkdev_put_no_open(bdev); -+ blkg_conf_exit(&ctx); - return nbytes; - einval: - spin_unlock_irq(&ioc->lock); -@@ -3305,7 +3312,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, - - ret = -EINVAL; - err: -- blkdev_put_no_open(bdev); -+ blkg_conf_exit(&ctx); - return ret; - } - -@@ -3356,22 +3363,25 @@ static const match_table_t i_lcoef_tokens = { - static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, - size_t nbytes, loff_t off) - { -- struct block_device *bdev; -+ struct blkg_conf_ctx ctx; - struct request_queue *q; - struct ioc *ioc; - u64 u[NR_I_LCOEFS]; - bool user; -- char *p; -+ char *body, *p; - int ret; - -- bdev = blkcg_conf_open_bdev(&input); -- if (IS_ERR(bdev)) -- return PTR_ERR(bdev); -+ blkg_conf_init(&ctx, input); -+ -+ ret = blkg_conf_open_bdev(&ctx); -+ if (ret) -+ goto err; - -- q = bdev_get_queue(bdev); -+ body = ctx.body; -+ q = bdev_get_queue(ctx.bdev); - ioc = q_to_ioc(q); - if (!ioc) { -- ret = blk_iocost_init(bdev->bd_disk); -+ ret = blk_iocost_init(ctx.bdev->bd_disk); - if (ret) - goto err; - ioc = q_to_ioc(q); -@@ -3384,7 +3394,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, - memcpy(u, ioc->params.i_lcoefs, sizeof(u)); - user = ioc->user_cost_model; - -- while ((p = strsep(&input, " 
\t\n"))) { -+ while ((p = strsep(&body, " \t\n"))) { - substring_t args[MAX_OPT_ARGS]; - char buf[32]; - int tok; -@@ -3431,7 +3441,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, - blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); - -- blkdev_put_no_open(bdev); -+ blkg_conf_exit(&ctx); - return nbytes; - - einval: -@@ -3442,7 +3452,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, - - ret = -EINVAL; - err: -- blkdev_put_no_open(bdev); -+ blkg_conf_exit(&ctx); - return ret; - } - -diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c -index ecdc10741836..3484393dbc4a 100644 ---- a/block/blk-iolatency.c -+++ b/block/blk-iolatency.c -@@ -755,7 +755,7 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) - } - } - --int blk_iolatency_init(struct gendisk *disk) -+static int blk_iolatency_init(struct gendisk *disk) - { - struct request_queue *q = disk->queue; - struct blk_iolatency *blkiolat; -@@ -830,6 +830,29 @@ static void iolatency_clear_scaling(struct blkcg_gq *blkg) - } - } - -+static int blk_iolatency_try_init(struct blkg_conf_ctx *ctx) -+{ -+ static DEFINE_MUTEX(init_mutex); -+ int ret; -+ -+ ret = blkg_conf_open_bdev(ctx); -+ if (ret) -+ return ret; -+ -+ /* -+ * blk_iolatency_init() may fail after rq_qos_add() succeeds which can -+ * confuse iolat_rq_qos() test. Make the test and init atomic. -+ */ -+ mutex_lock(&init_mutex); -+ -+ if (!iolat_rq_qos(ctx->bdev->bd_queue)) -+ ret = blk_iolatency_init(ctx->bdev->bd_disk); -+ -+ mutex_unlock(&init_mutex); -+ -+ return ret; -+} -+ - static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) - { -@@ -842,9 +865,15 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, - u64 oldval; - int ret; - -- ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx); -+ blkg_conf_init(&ctx, buf); -+ -+ ret = blk_iolatency_try_init(&ctx); - if (ret) -- return ret; -+ goto out; -+ -+ ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, &ctx); -+ if (ret) -+ goto out; - - iolat = blkg_to_lat(ctx.blkg); - p = ctx.body; -@@ -880,7 +909,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, - iolatency_clear_scaling(blkg); - ret = 0; - out: -- blkg_conf_finish(&ctx); -+ blkg_conf_exit(&ctx); - return ret ?: nbytes; - } - -@@ -974,7 +1003,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) - { - struct iolatency_grp *iolat = pd_to_lat(pd); - struct blkcg_gq *blkg = lat_to_blkg(iolat); -- struct rq_qos *rqos = blkcg_rq_qos(blkg->q); -+ struct rq_qos *rqos = iolat_rq_qos(blkg->q); - struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); - u64 now = ktime_to_ns(ktime_get()); - int cpu; -diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h -index 1ef1f7d4bc3c..27f004fae66b 100644 ---- a/block/blk-rq-qos.h -+++ b/block/blk-rq-qos.h -@@ -74,7 +74,7 @@ static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) - return rq_qos_id(q, RQ_QOS_WBT); - } - --static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) -+static inline struct rq_qos *iolat_rq_qos(struct request_queue *q) - { - return rq_qos_id(q, RQ_QOS_LATENCY); - } -diff --git a/block/blk-throttle.c b/block/blk-throttle.c -index 6fb5a2f9e1ee..75841d1d9bf4 100644 ---- a/block/blk-throttle.c -+++ b/block/blk-throttle.c -@@ -1369,9 +1369,11 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, - int ret; - u64 v; - -- ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); -+ 
blkg_conf_init(&ctx, buf); -+ -+ ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); - if (ret) -- return ret; -+ goto out_finish; - - ret = -EINVAL; - if (sscanf(ctx.body, "%llu", &v) != 1) -@@ -1390,7 +1392,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, - tg_conf_updated(tg, false); - ret = 0; - out_finish: -- blkg_conf_finish(&ctx); -+ blkg_conf_exit(&ctx); - return ret ?: nbytes; - } - -@@ -1562,9 +1564,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, - int ret; - int index = of_cft(of)->private; - -- ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); -+ blkg_conf_init(&ctx, buf); -+ -+ ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); - if (ret) -- return ret; -+ goto out_finish; - - tg = blkg_to_tg(ctx.blkg); - tg_update_carryover(tg); -@@ -1663,7 +1667,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, - tg->td->limit_valid[LIMIT_LOW]); - ret = 0; - out_finish: -- blkg_conf_finish(&ctx); -+ blkg_conf_exit(&ctx); - return ret ?: nbytes; - } - -diff --git a/block/blk.h b/block/blk.h -index 4c3b3325219a..78f1706cddca 100644 ---- a/block/blk.h -+++ b/block/blk.h -@@ -392,12 +392,6 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, - return bio; - } - --#ifdef CONFIG_BLK_CGROUP_IOLATENCY --int blk_iolatency_init(struct gendisk *disk); --#else --static inline int blk_iolatency_init(struct gendisk *disk) { return 0; }; --#endif -- - #ifdef CONFIG_BLK_DEV_ZONED - void disk_free_zone_bitmaps(struct gendisk *disk); - void disk_clear_zone_settings(struct gendisk *disk); -- -2.40.0.rc2 +2.40.0 -From e44295cea72d5cefc97900011495f89f000873ac Mon Sep 17 00:00:00 2001 +From 7a2801ac4761f911a6b2e7a8532b9fedc5382bc5 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 13 Feb 2023 11:26:20 +0100 -Subject: [PATCH 03/16] bitmap - -Signed-off-by: Peter Jung ---- - include/linux/bitmap.h | 46 ++++++------- - include/linux/cpumask.h | 144 +++++++++++++++++++-------------------- - include/linux/find.h | 40 +++++------ - include/linux/nodemask.h | 86 +++++++++++------------ - 4 files changed, 158 insertions(+), 158 deletions(-) - -diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h -index 7d6d73b78147..40e53a2ecc0d 100644 ---- a/include/linux/bitmap.h -+++ b/include/linux/bitmap.h -@@ -189,7 +189,7 @@ unsigned long bitmap_find_next_zero_area_off(unsigned long *map, - * the bit offset of all zero areas this function finds is multiples of that - * power of 2. A @align_mask of 0 means no alignment is required. 
- */ --static inline unsigned long -+static __always_inline unsigned long - bitmap_find_next_zero_area(unsigned long *map, - unsigned long size, - unsigned long start, -@@ -237,7 +237,7 @@ extern int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp, - #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) - #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) - --static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) -+static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits) - { - unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); - -@@ -247,7 +247,7 @@ static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) - memset(dst, 0, len); - } - --static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) -+static __always_inline void bitmap_fill(unsigned long *dst, unsigned int nbits) - { - unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); - -@@ -257,7 +257,7 @@ static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) - memset(dst, 0xff, len); - } - --static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, -+static __always_inline void bitmap_copy(unsigned long *dst, const unsigned long *src, - unsigned int nbits) - { - unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); -@@ -271,7 +271,7 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, - /* - * Copy bitmap and clear tail bits in last word. - */ --static inline void bitmap_copy_clear_tail(unsigned long *dst, -+static __always_inline void bitmap_copy_clear_tail(unsigned long *dst, - const unsigned long *src, unsigned int nbits) - { - bitmap_copy(dst, src, nbits); -@@ -317,7 +317,7 @@ void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits); - bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits)) - #endif - --static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, -+static __always_inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -325,7 +325,7 @@ static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, - return __bitmap_and(dst, src1, src2, nbits); - } - --static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, -+static __always_inline void bitmap_or(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -334,7 +334,7 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, - __bitmap_or(dst, src1, src2, nbits); - } - --static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, -+static __always_inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -343,7 +343,7 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, - __bitmap_xor(dst, src1, src2, nbits); - } - --static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, -+static __always_inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -351,7 +351,7 @@ static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, - return __bitmap_andnot(dst, 
src1, src2, nbits); - } - --static inline void bitmap_complement(unsigned long *dst, const unsigned long *src, -+static __always_inline void bitmap_complement(unsigned long *dst, const unsigned long *src, - unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -367,7 +367,7 @@ static inline void bitmap_complement(unsigned long *dst, const unsigned long *sr - #endif - #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1) - --static inline bool bitmap_equal(const unsigned long *src1, -+static __always_inline bool bitmap_equal(const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -387,7 +387,7 @@ static inline bool bitmap_equal(const unsigned long *src1, - * - * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise - */ --static inline bool bitmap_or_equal(const unsigned long *src1, -+static __always_inline bool bitmap_or_equal(const unsigned long *src1, - const unsigned long *src2, - const unsigned long *src3, - unsigned int nbits) -@@ -398,7 +398,7 @@ static inline bool bitmap_or_equal(const unsigned long *src1, - return !(((*src1 | *src2) ^ *src3) & BITMAP_LAST_WORD_MASK(nbits)); - } - --static inline bool bitmap_intersects(const unsigned long *src1, -+static __always_inline bool bitmap_intersects(const unsigned long *src1, - const unsigned long *src2, - unsigned int nbits) - { -@@ -408,7 +408,7 @@ static inline bool bitmap_intersects(const unsigned long *src1, - return __bitmap_intersects(src1, src2, nbits); - } - --static inline bool bitmap_subset(const unsigned long *src1, -+static __always_inline bool bitmap_subset(const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -417,7 +417,7 @@ static inline bool bitmap_subset(const unsigned long *src1, - return __bitmap_subset(src1, src2, nbits); - } - --static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) -+static __always_inline bool bitmap_empty(const unsigned long *src, unsigned int nbits) - { - if (small_const_nbits(nbits)) - return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); -@@ -425,7 +425,7 @@ static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) - return find_first_bit(src, nbits) == nbits; - } - --static inline bool bitmap_full(const unsigned long *src, unsigned int nbits) -+static __always_inline bool bitmap_full(const unsigned long *src, unsigned int nbits) - { - if (small_const_nbits(nbits)) - return ! 
(~(*src) & BITMAP_LAST_WORD_MASK(nbits)); -@@ -482,7 +482,7 @@ static __always_inline void bitmap_clear(unsigned long *map, unsigned int start, - __bitmap_clear(map, start, nbits); - } - --static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, -+static __always_inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, - unsigned int shift, unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -491,7 +491,7 @@ static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *s - __bitmap_shift_right(dst, src, shift, nbits); - } - --static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, -+static __always_inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, - unsigned int shift, unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -500,7 +500,7 @@ static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *sr - __bitmap_shift_left(dst, src, shift, nbits); - } - --static inline void bitmap_replace(unsigned long *dst, -+static __always_inline void bitmap_replace(unsigned long *dst, - const unsigned long *old, - const unsigned long *new, - const unsigned long *mask, -@@ -512,7 +512,7 @@ static inline void bitmap_replace(unsigned long *dst, - __bitmap_replace(dst, old, new, mask, nbits); - } - --static inline void bitmap_next_set_region(unsigned long *bitmap, -+static __always_inline void bitmap_next_set_region(unsigned long *bitmap, - unsigned int *rs, unsigned int *re, - unsigned int end) - { -@@ -563,7 +563,7 @@ static inline void bitmap_next_set_region(unsigned long *bitmap, - * That is ``(u32 *)(&val)[0]`` gets the upper 32 bits, - * but we expect the lower 32-bits of u64. - */ --static inline void bitmap_from_u64(unsigned long *dst, u64 mask) -+static __always_inline void bitmap_from_u64(unsigned long *dst, u64 mask) - { - bitmap_from_arr64(dst, &mask, 64); - } -@@ -576,7 +576,7 @@ static inline void bitmap_from_u64(unsigned long *dst, u64 mask) - * Returns the 8-bit value located at the @start bit offset within the @src - * memory region. - */ --static inline unsigned long bitmap_get_value8(const unsigned long *map, -+static __always_inline unsigned long bitmap_get_value8(const unsigned long *map, - unsigned long start) - { - const size_t index = BIT_WORD(start); -@@ -591,7 +591,7 @@ static inline unsigned long bitmap_get_value8(const unsigned long *map, - * @value: the 8-bit value; values wider than 8 bits may clobber bitmap - * @start: bit offset of the 8-bit value; must be a multiple of 8 - */ --static inline void bitmap_set_value8(unsigned long *map, unsigned long value, -+static __always_inline void bitmap_set_value8(unsigned long *map, unsigned long value, - unsigned long start) - { - const size_t index = BIT_WORD(start); -diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h -index c2aa0aa26b45..9543b22d6dc2 100644 ---- a/include/linux/cpumask.h -+++ b/include/linux/cpumask.h -@@ -41,7 +41,7 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; - extern unsigned int nr_cpu_ids; - #endif - --static inline void set_nr_cpu_ids(unsigned int nr) -+static __always_inline void set_nr_cpu_ids(unsigned int nr) - { - #if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) - WARN_ON(nr != nr_cpu_ids); -@@ -124,7 +124,7 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu) - * - * Returns >= nr_cpu_ids if no cpus set. 
- */ --static inline unsigned int cpumask_first(const struct cpumask *srcp) -+static __always_inline unsigned int cpumask_first(const struct cpumask *srcp) - { - return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); - } -@@ -135,7 +135,7 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) - * - * Returns >= nr_cpu_ids if all cpus are set. - */ --static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) -+static __always_inline unsigned int cpumask_first_zero(const struct cpumask *srcp) - { - return find_first_zero_bit(cpumask_bits(srcp), nr_cpumask_bits); - } -@@ -147,7 +147,7 @@ static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) - * - * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). - */ --static inline -+static __always_inline - unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) - { - return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); -@@ -159,7 +159,7 @@ unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask - * - * Returns >= nr_cpumask_bits if no CPUs set. - */ --static inline unsigned int cpumask_last(const struct cpumask *srcp) -+static __always_inline unsigned int cpumask_last(const struct cpumask *srcp) - { - return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits); - } -@@ -171,7 +171,7 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp) - * - * Returns >= nr_cpu_ids if no further cpus set. - */ --static inline -+static __always_inline - unsigned int cpumask_next(int n, const struct cpumask *srcp) - { - /* -1 is a legal arg here. */ -@@ -187,7 +187,7 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp) - * - * Returns >= nr_cpu_ids if no further cpus unset. - */ --static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) -+static __always_inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) - { - /* -1 is a legal arg here. */ - if (n != -1) -@@ -197,18 +197,18 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) - - #if NR_CPUS == 1 - /* Uniprocessor: there is only one valid CPU */ --static inline unsigned int cpumask_local_spread(unsigned int i, int node) -+static __always_inline unsigned int cpumask_local_spread(unsigned int i, int node) - { - return 0; - } - --static inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, -+static __always_inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, - const struct cpumask *src2p) - { - return cpumask_first_and(src1p, src2p); - } - --static inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) -+static __always_inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) - { - return cpumask_first(srcp); - } -@@ -227,7 +227,7 @@ unsigned int cpumask_any_distribute(const struct cpumask *srcp); - * - * Returns >= nr_cpu_ids if no further cpus set in both. 
- */ --static inline -+static __always_inline - unsigned int cpumask_next_and(int n, const struct cpumask *src1p, - const struct cpumask *src2p) - { -@@ -259,7 +259,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, - for_each_clear_bit(cpu, cpumask_bits(mask), nr_cpumask_bits) - - #if NR_CPUS == 1 --static inline -+static __always_inline - unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) - { - cpumask_check(start); -@@ -335,7 +335,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta - * Often used to find any cpu but smp_processor_id() in a mask. - * Returns >= nr_cpu_ids if no cpus set. - */ --static inline -+static __always_inline - unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) - { - unsigned int i; -@@ -354,7 +354,7 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) - * - * Returns >= nr_cpu_ids if such cpu doesn't exist. - */ --static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) -+static __always_inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) - { - return find_nth_bit(cpumask_bits(srcp), nr_cpumask_bits, cpumask_check(cpu)); - } -@@ -367,7 +367,7 @@ static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *s - * - * Returns >= nr_cpu_ids if such cpu doesn't exist. - */ --static inline -+static __always_inline - unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, - const struct cpumask *srcp2) - { -@@ -383,7 +383,7 @@ unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, - * - * Returns >= nr_cpu_ids if such cpu doesn't exist. - */ --static inline -+static __always_inline - unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1, - const struct cpumask *srcp2) - { -@@ -476,7 +476,7 @@ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask * - * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask - * @dstp: the cpumask pointer - */ --static inline void cpumask_setall(struct cpumask *dstp) -+static __always_inline void cpumask_setall(struct cpumask *dstp) - { - bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); - } -@@ -485,7 +485,7 @@ static inline void cpumask_setall(struct cpumask *dstp) - * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask - * @dstp: the cpumask pointer - */ --static inline void cpumask_clear(struct cpumask *dstp) -+static __always_inline void cpumask_clear(struct cpumask *dstp) - { - bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits); - } -@@ -498,7 +498,7 @@ static inline void cpumask_clear(struct cpumask *dstp) - * - * If *@dstp is empty, returns false, else returns true - */ --static inline bool cpumask_and(struct cpumask *dstp, -+static __always_inline bool cpumask_and(struct cpumask *dstp, - const struct cpumask *src1p, - const struct cpumask *src2p) - { -@@ -512,7 +512,7 @@ static inline bool cpumask_and(struct cpumask *dstp, - * @src1p: the first input - * @src2p: the second input - */ --static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, -+static __always_inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, - const struct cpumask *src2p) - { - bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), -@@ -525,7 +525,7 @@ static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, - * @src1p: the first input - * @src2p: the second input - */ --static inline void 
cpumask_xor(struct cpumask *dstp, -+static __always_inline void cpumask_xor(struct cpumask *dstp, - const struct cpumask *src1p, - const struct cpumask *src2p) - { -@@ -541,7 +541,7 @@ static inline void cpumask_xor(struct cpumask *dstp, - * - * If *@dstp is empty, returns false, else returns true - */ --static inline bool cpumask_andnot(struct cpumask *dstp, -+static __always_inline bool cpumask_andnot(struct cpumask *dstp, - const struct cpumask *src1p, - const struct cpumask *src2p) - { -@@ -554,7 +554,7 @@ static inline bool cpumask_andnot(struct cpumask *dstp, - * @dstp: the cpumask result - * @srcp: the input to invert - */ --static inline void cpumask_complement(struct cpumask *dstp, -+static __always_inline void cpumask_complement(struct cpumask *dstp, - const struct cpumask *srcp) - { - bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp), -@@ -566,7 +566,7 @@ static inline void cpumask_complement(struct cpumask *dstp, - * @src1p: the first input - * @src2p: the second input - */ --static inline bool cpumask_equal(const struct cpumask *src1p, -+static __always_inline bool cpumask_equal(const struct cpumask *src1p, - const struct cpumask *src2p) - { - return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), -@@ -579,7 +579,7 @@ static inline bool cpumask_equal(const struct cpumask *src1p, - * @src2p: the second input - * @src3p: the third input - */ --static inline bool cpumask_or_equal(const struct cpumask *src1p, -+static __always_inline bool cpumask_or_equal(const struct cpumask *src1p, - const struct cpumask *src2p, - const struct cpumask *src3p) - { -@@ -592,7 +592,7 @@ static inline bool cpumask_or_equal(const struct cpumask *src1p, - * @src1p: the first input - * @src2p: the second input - */ --static inline bool cpumask_intersects(const struct cpumask *src1p, -+static __always_inline bool cpumask_intersects(const struct cpumask *src1p, - const struct cpumask *src2p) - { - return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p), -@@ -606,7 +606,7 @@ static inline bool cpumask_intersects(const struct cpumask *src1p, - * - * Returns true if *@src1p is a subset of *@src2p, else returns false - */ --static inline bool cpumask_subset(const struct cpumask *src1p, -+static __always_inline bool cpumask_subset(const struct cpumask *src1p, - const struct cpumask *src2p) - { - return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), -@@ -617,7 +617,7 @@ static inline bool cpumask_subset(const struct cpumask *src1p, - * cpumask_empty - *srcp == 0 - * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear. - */ --static inline bool cpumask_empty(const struct cpumask *srcp) -+static __always_inline bool cpumask_empty(const struct cpumask *srcp) - { - return bitmap_empty(cpumask_bits(srcp), nr_cpumask_bits); - } -@@ -626,7 +626,7 @@ static inline bool cpumask_empty(const struct cpumask *srcp) - * cpumask_full - *srcp == 0xFFFFFFFF... - * @srcp: the cpumask to that all cpus < nr_cpu_ids are set. - */ --static inline bool cpumask_full(const struct cpumask *srcp) -+static __always_inline bool cpumask_full(const struct cpumask *srcp) - { - return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits); - } -@@ -635,7 +635,7 @@ static inline bool cpumask_full(const struct cpumask *srcp) - * cpumask_weight - Count of bits in *srcp - * @srcp: the cpumask to count bits (< nr_cpu_ids) in. 
- */ --static inline unsigned int cpumask_weight(const struct cpumask *srcp) -+static __always_inline unsigned int cpumask_weight(const struct cpumask *srcp) - { - return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits); - } -@@ -645,7 +645,7 @@ static inline unsigned int cpumask_weight(const struct cpumask *srcp) - * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. - * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. - */ --static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, -+static __always_inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, - const struct cpumask *srcp2) - { - return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); -@@ -657,7 +657,7 @@ static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, - * @srcp: the input to shift - * @n: the number of bits to shift by - */ --static inline void cpumask_shift_right(struct cpumask *dstp, -+static __always_inline void cpumask_shift_right(struct cpumask *dstp, - const struct cpumask *srcp, int n) - { - bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, -@@ -670,7 +670,7 @@ static inline void cpumask_shift_right(struct cpumask *dstp, - * @srcp: the input to shift - * @n: the number of bits to shift by - */ --static inline void cpumask_shift_left(struct cpumask *dstp, -+static __always_inline void cpumask_shift_left(struct cpumask *dstp, - const struct cpumask *srcp, int n) - { - bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, -@@ -682,7 +682,7 @@ static inline void cpumask_shift_left(struct cpumask *dstp, - * @dstp: the result - * @srcp: the input cpumask - */ --static inline void cpumask_copy(struct cpumask *dstp, -+static __always_inline void cpumask_copy(struct cpumask *dstp, - const struct cpumask *srcp) - { - bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits); -@@ -719,7 +719,7 @@ static inline void cpumask_copy(struct cpumask *dstp, - * - * Returns -errno, or 0 for success. - */ --static inline int cpumask_parse_user(const char __user *buf, int len, -+static __always_inline int cpumask_parse_user(const char __user *buf, int len, - struct cpumask *dstp) - { - return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); -@@ -733,7 +733,7 @@ static inline int cpumask_parse_user(const char __user *buf, int len, - * - * Returns -errno, or 0 for success. - */ --static inline int cpumask_parselist_user(const char __user *buf, int len, -+static __always_inline int cpumask_parselist_user(const char __user *buf, int len, - struct cpumask *dstp) - { - return bitmap_parselist_user(buf, len, cpumask_bits(dstp), -@@ -747,7 +747,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, - * - * Returns -errno, or 0 for success. - */ --static inline int cpumask_parse(const char *buf, struct cpumask *dstp) -+static __always_inline int cpumask_parse(const char *buf, struct cpumask *dstp) - { - return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits); - } -@@ -759,7 +759,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) - * - * Returns -errno, or 0 for success. 
- */ --static inline int cpulist_parse(const char *buf, struct cpumask *dstp) -+static __always_inline int cpulist_parse(const char *buf, struct cpumask *dstp) - { - return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); - } -@@ -767,7 +767,7 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) - /** - * cpumask_size - size to allocate for a 'struct cpumask' in bytes - */ --static inline unsigned int cpumask_size(void) -+static __always_inline unsigned int cpumask_size(void) - { - return BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long); - } -@@ -820,7 +820,7 @@ typedef struct cpumask *cpumask_var_t; - - bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); - --static inline -+static __always_inline - bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) - { - return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node); -@@ -836,13 +836,13 @@ bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) - * - * See alloc_cpumask_var_node. - */ --static inline -+static __always_inline - bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) - { - return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE); - } - --static inline -+static __always_inline - bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) - { - return alloc_cpumask_var(mask, flags | __GFP_ZERO); -@@ -852,7 +852,7 @@ void alloc_bootmem_cpumask_var(cpumask_var_t *mask); - void free_cpumask_var(cpumask_var_t mask); - void free_bootmem_cpumask_var(cpumask_var_t mask); - --static inline bool cpumask_available(cpumask_var_t mask) -+static __always_inline bool cpumask_available(cpumask_var_t mask) - { - return mask != NULL; - } -@@ -863,43 +863,43 @@ typedef struct cpumask cpumask_var_t[1]; - #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) - #define __cpumask_var_read_mostly - --static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) -+static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) - { - return true; - } - --static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, -+static __always_inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, - int node) - { - return true; - } - --static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) -+static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) - { - cpumask_clear(*mask); - return true; - } - --static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, -+static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, - int node) - { - cpumask_clear(*mask); - return true; - } - --static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) -+static __always_inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) - { - } - --static inline void free_cpumask_var(cpumask_var_t mask) -+static __always_inline void free_cpumask_var(cpumask_var_t mask) - { - } - --static inline void free_bootmem_cpumask_var(cpumask_var_t mask) -+static __always_inline void free_bootmem_cpumask_var(cpumask_var_t mask) - { - } - --static inline bool cpumask_available(cpumask_var_t mask) -+static __always_inline bool cpumask_available(cpumask_var_t mask) - { - return true; - } -@@ -929,12 +929,12 @@ void init_cpu_present(const struct cpumask *src); - void init_cpu_possible(const struct cpumask *src); - void init_cpu_online(const struct cpumask *src); - --static inline void reset_cpu_possible_mask(void) -+static __always_inline void reset_cpu_possible_mask(void) - 
{ - bitmap_zero(cpumask_bits(&__cpu_possible_mask), NR_CPUS); - } - --static inline void -+static __always_inline void - set_cpu_possible(unsigned int cpu, bool possible) - { - if (possible) -@@ -943,7 +943,7 @@ set_cpu_possible(unsigned int cpu, bool possible) - cpumask_clear_cpu(cpu, &__cpu_possible_mask); - } - --static inline void -+static __always_inline void - set_cpu_present(unsigned int cpu, bool present) - { - if (present) -@@ -954,7 +954,7 @@ set_cpu_present(unsigned int cpu, bool present) - - void set_cpu_online(unsigned int cpu, bool online); - --static inline void -+static __always_inline void - set_cpu_active(unsigned int cpu, bool active) - { - if (active) -@@ -963,7 +963,7 @@ set_cpu_active(unsigned int cpu, bool active) - cpumask_clear_cpu(cpu, &__cpu_active_mask); - } - --static inline void -+static __always_inline void - set_cpu_dying(unsigned int cpu, bool dying) - { - if (dying) -@@ -986,7 +986,7 @@ set_cpu_dying(unsigned int cpu, bool dying) - ((struct cpumask *)(1 ? (bitmap) \ - : (void *)sizeof(__check_is_bitmap(bitmap)))) - --static inline int __check_is_bitmap(const unsigned long *bitmap) -+static __always_inline int __check_is_bitmap(const unsigned long *bitmap) - { - return 1; - } -@@ -1001,7 +1001,7 @@ static inline int __check_is_bitmap(const unsigned long *bitmap) - extern const unsigned long - cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; - --static inline const struct cpumask *get_cpu_mask(unsigned int cpu) -+static __always_inline const struct cpumask *get_cpu_mask(unsigned int cpu) - { - const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; - p -= cpu / BITS_PER_LONG; -@@ -1017,7 +1017,7 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu) - * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held - * region. 
- */ --static inline unsigned int num_online_cpus(void) -+static __always_inline unsigned int num_online_cpus(void) - { - return atomic_read(&__num_online_cpus); - } -@@ -1025,27 +1025,27 @@ static inline unsigned int num_online_cpus(void) - #define num_present_cpus() cpumask_weight(cpu_present_mask) - #define num_active_cpus() cpumask_weight(cpu_active_mask) - --static inline bool cpu_online(unsigned int cpu) -+static __always_inline bool cpu_online(unsigned int cpu) - { - return cpumask_test_cpu(cpu, cpu_online_mask); - } - --static inline bool cpu_possible(unsigned int cpu) -+static __always_inline bool cpu_possible(unsigned int cpu) - { - return cpumask_test_cpu(cpu, cpu_possible_mask); - } - --static inline bool cpu_present(unsigned int cpu) -+static __always_inline bool cpu_present(unsigned int cpu) - { - return cpumask_test_cpu(cpu, cpu_present_mask); - } - --static inline bool cpu_active(unsigned int cpu) -+static __always_inline bool cpu_active(unsigned int cpu) - { - return cpumask_test_cpu(cpu, cpu_active_mask); - } - --static inline bool cpu_dying(unsigned int cpu) -+static __always_inline bool cpu_dying(unsigned int cpu) - { - return cpumask_test_cpu(cpu, cpu_dying_mask); - } -@@ -1057,27 +1057,27 @@ static inline bool cpu_dying(unsigned int cpu) - #define num_present_cpus() 1U - #define num_active_cpus() 1U - --static inline bool cpu_online(unsigned int cpu) -+static __always_inline bool cpu_online(unsigned int cpu) - { - return cpu == 0; - } - --static inline bool cpu_possible(unsigned int cpu) -+static __always_inline bool cpu_possible(unsigned int cpu) - { - return cpu == 0; - } - --static inline bool cpu_present(unsigned int cpu) -+static __always_inline bool cpu_present(unsigned int cpu) - { - return cpu == 0; - } - --static inline bool cpu_active(unsigned int cpu) -+static __always_inline bool cpu_active(unsigned int cpu) - { - return cpu == 0; - } - --static inline bool cpu_dying(unsigned int cpu) -+static __always_inline bool cpu_dying(unsigned int cpu) - { - return false; - } -@@ -1111,7 +1111,7 @@ static inline bool cpu_dying(unsigned int cpu) - * Returns the length of the (null-terminated) @buf string, zero if - * nothing is copied. - */ --static inline ssize_t -+static __always_inline ssize_t - cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) - { - return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask), -@@ -1134,7 +1134,7 @@ cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) - * Returns the length of how many bytes have been copied, excluding - * terminating '\0'. - */ --static inline ssize_t -+static __always_inline ssize_t - cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, - loff_t off, size_t count) - { -@@ -1149,7 +1149,7 @@ cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, - * Everything is same with the above cpumap_print_bitmask_to_buf() - * except the print format. - */ --static inline ssize_t -+static __always_inline ssize_t - cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, - loff_t off, size_t count) - { -diff --git a/include/linux/find.h b/include/linux/find.h -index ccaf61a0f5fd..db2f2851601d 100644 ---- a/include/linux/find.h -+++ b/include/linux/find.h -@@ -45,7 +45,7 @@ unsigned long _find_next_bit_le(const unsigned long *addr, unsigned - * Returns the bit number for the next set bit - * If no bits are set, returns @size. 
- */ --static inline -+static __always_inline - unsigned long find_next_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) - { -@@ -74,7 +74,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size, - * Returns the bit number for the next set bit - * If no bits are set, returns @size. - */ --static inline -+static __always_inline - unsigned long find_next_and_bit(const unsigned long *addr1, - const unsigned long *addr2, unsigned long size, - unsigned long offset) -@@ -105,7 +105,7 @@ unsigned long find_next_and_bit(const unsigned long *addr1, - * Returns the bit number for the next set bit - * If no bits are set, returns @size. - */ --static inline -+static __always_inline - unsigned long find_next_andnot_bit(const unsigned long *addr1, - const unsigned long *addr2, unsigned long size, - unsigned long offset) -@@ -134,7 +134,7 @@ unsigned long find_next_andnot_bit(const unsigned long *addr1, - * Returns the bit number of the next zero bit - * If no bits are zero, returns @size. - */ --static inline -+static __always_inline - unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) - { -@@ -161,7 +161,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, - * Returns the bit number of the first set bit. - * If no bits are set, returns @size. - */ --static inline -+static __always_inline - unsigned long find_first_bit(const unsigned long *addr, unsigned long size) - { - if (small_const_nbits(size)) { -@@ -187,7 +187,7 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) - * Returns the bit number of the N'th set bit. - * If no such, returns @size. - */ --static inline -+static __always_inline - unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) - { - if (n >= size) -@@ -212,7 +212,7 @@ unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsign - * Returns the bit number of the N'th set bit. - * If no such, returns @size. - */ --static inline -+static __always_inline - unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, - unsigned long size, unsigned long n) - { -@@ -239,7 +239,7 @@ unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long * - * Returns the bit number of the N'th set bit. - * If no such, returns @size. - */ --static inline -+static __always_inline - unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, - unsigned long size, unsigned long n) - { -@@ -265,7 +265,7 @@ unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned lon - * Returns the bit number for the next set bit - * If no bits are set, returns @size. - */ --static inline -+static __always_inline - unsigned long find_first_and_bit(const unsigned long *addr1, - const unsigned long *addr2, - unsigned long size) -@@ -289,7 +289,7 @@ unsigned long find_first_and_bit(const unsigned long *addr1, - * Returns the bit number of the first cleared bit. - * If no bits are zero, returns @size. - */ --static inline -+static __always_inline - unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) - { - if (small_const_nbits(size)) { -@@ -310,7 +310,7 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) - * - * Returns the bit number of the last set bit, or size. 
- */ --static inline -+static __always_inline - unsigned long find_last_bit(const unsigned long *addr, unsigned long size) - { - if (small_const_nbits(size)) { -@@ -333,7 +333,7 @@ unsigned long find_last_bit(const unsigned long *addr, unsigned long size) - * Returns the bit number for the next set bit, or first set bit up to @offset - * If no bits are set, returns @size. - */ --static inline -+static __always_inline - unsigned long find_next_and_bit_wrap(const unsigned long *addr1, - const unsigned long *addr2, - unsigned long size, unsigned long offset) -@@ -356,7 +356,7 @@ unsigned long find_next_and_bit_wrap(const unsigned long *addr1, - * Returns the bit number for the next set bit, or first set bit up to @offset - * If no bits are set, returns @size. - */ --static inline -+static __always_inline - unsigned long find_next_bit_wrap(const unsigned long *addr, - unsigned long size, unsigned long offset) - { -@@ -373,7 +373,7 @@ unsigned long find_next_bit_wrap(const unsigned long *addr, - * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing - * before using it alone. - */ --static inline -+static __always_inline - unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, - unsigned long start, unsigned long n) - { -@@ -414,19 +414,19 @@ extern unsigned long find_next_clump8(unsigned long *clump, - - #if defined(__LITTLE_ENDIAN) - --static inline unsigned long find_next_zero_bit_le(const void *addr, -+static __always_inline unsigned long find_next_zero_bit_le(const void *addr, - unsigned long size, unsigned long offset) - { - return find_next_zero_bit(addr, size, offset); - } - --static inline unsigned long find_next_bit_le(const void *addr, -+static __always_inline unsigned long find_next_bit_le(const void *addr, - unsigned long size, unsigned long offset) - { - return find_next_bit(addr, size, offset); - } - --static inline unsigned long find_first_zero_bit_le(const void *addr, -+static __always_inline unsigned long find_first_zero_bit_le(const void *addr, - unsigned long size) - { - return find_first_zero_bit(addr, size); -@@ -435,7 +435,7 @@ static inline unsigned long find_first_zero_bit_le(const void *addr, - #elif defined(__BIG_ENDIAN) - - #ifndef find_next_zero_bit_le --static inline -+static __always_inline - unsigned long find_next_zero_bit_le(const void *addr, unsigned - long size, unsigned long offset) - { -@@ -454,7 +454,7 @@ unsigned long find_next_zero_bit_le(const void *addr, unsigned - #endif - - #ifndef find_first_zero_bit_le --static inline -+static __always_inline - unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) - { - if (small_const_nbits(size)) { -@@ -468,7 +468,7 @@ unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) - #endif - - #ifndef find_next_bit_le --static inline -+static __always_inline - unsigned long find_next_bit_le(const void *addr, unsigned - long size, unsigned long offset) - { -diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h -index bb0ee80526b2..8c04254c5284 100644 ---- a/include/linux/nodemask.h -+++ b/include/linux/nodemask.h -@@ -107,11 +107,11 @@ extern nodemask_t _unused_nodemask_arg_; - */ - #define nodemask_pr_args(maskp) __nodemask_pr_numnodes(maskp), \ - __nodemask_pr_bits(maskp) --static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) -+static __always_inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) - { - return m ? 
MAX_NUMNODES : 0; - } --static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) -+static __always_inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) - { - return m ? m->bits : NULL; - } -@@ -132,19 +132,19 @@ static __always_inline void __node_set(int node, volatile nodemask_t *dstp) - } - - #define node_clear(node, dst) __node_clear((node), &(dst)) --static inline void __node_clear(int node, volatile nodemask_t *dstp) -+static __always_inline void __node_clear(int node, volatile nodemask_t *dstp) - { - clear_bit(node, dstp->bits); - } - - #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) --static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) -+static __always_inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) - { - bitmap_fill(dstp->bits, nbits); - } - - #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) --static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) -+static __always_inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) - { - bitmap_zero(dstp->bits, nbits); - } -@@ -154,14 +154,14 @@ static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) - - #define node_test_and_set(node, nodemask) \ - __node_test_and_set((node), &(nodemask)) --static inline bool __node_test_and_set(int node, nodemask_t *addr) -+static __always_inline bool __node_test_and_set(int node, nodemask_t *addr) - { - return test_and_set_bit(node, addr->bits); - } - - #define nodes_and(dst, src1, src2) \ - __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) --static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, -+static __always_inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, unsigned int nbits) - { - bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); -@@ -169,7 +169,7 @@ static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, - - #define nodes_or(dst, src1, src2) \ - __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) --static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, -+static __always_inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, unsigned int nbits) - { - bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); -@@ -177,7 +177,7 @@ static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, - - #define nodes_xor(dst, src1, src2) \ - __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) --static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, -+static __always_inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, unsigned int nbits) - { - bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); -@@ -185,7 +185,7 @@ static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, - - #define nodes_andnot(dst, src1, src2) \ - __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) --static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, -+static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, unsigned int nbits) - { - bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); -@@ -193,7 +193,7 @@ static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, - - #define nodes_complement(dst, src) \ - __nodes_complement(&(dst), &(src), MAX_NUMNODES) --static inline void __nodes_complement(nodemask_t *dstp, -+static __always_inline void 
__nodes_complement(nodemask_t *dstp, - const nodemask_t *srcp, unsigned int nbits) - { - bitmap_complement(dstp->bits, srcp->bits, nbits); -@@ -201,7 +201,7 @@ static inline void __nodes_complement(nodemask_t *dstp, - - #define nodes_equal(src1, src2) \ - __nodes_equal(&(src1), &(src2), MAX_NUMNODES) --static inline bool __nodes_equal(const nodemask_t *src1p, -+static __always_inline bool __nodes_equal(const nodemask_t *src1p, - const nodemask_t *src2p, unsigned int nbits) - { - return bitmap_equal(src1p->bits, src2p->bits, nbits); -@@ -209,7 +209,7 @@ static inline bool __nodes_equal(const nodemask_t *src1p, - - #define nodes_intersects(src1, src2) \ - __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) --static inline bool __nodes_intersects(const nodemask_t *src1p, -+static __always_inline bool __nodes_intersects(const nodemask_t *src1p, - const nodemask_t *src2p, unsigned int nbits) - { - return bitmap_intersects(src1p->bits, src2p->bits, nbits); -@@ -217,33 +217,33 @@ static inline bool __nodes_intersects(const nodemask_t *src1p, - - #define nodes_subset(src1, src2) \ - __nodes_subset(&(src1), &(src2), MAX_NUMNODES) --static inline bool __nodes_subset(const nodemask_t *src1p, -+static __always_inline bool __nodes_subset(const nodemask_t *src1p, - const nodemask_t *src2p, unsigned int nbits) - { - return bitmap_subset(src1p->bits, src2p->bits, nbits); - } - - #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) --static inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) -+static __always_inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) - { - return bitmap_empty(srcp->bits, nbits); - } - - #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) --static inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) -+static __always_inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) - { - return bitmap_full(srcp->bits, nbits); - } - - #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) --static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) -+static __always_inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) - { - return bitmap_weight(srcp->bits, nbits); - } - - #define nodes_shift_right(dst, src, n) \ - __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES) --static inline void __nodes_shift_right(nodemask_t *dstp, -+static __always_inline void __nodes_shift_right(nodemask_t *dstp, - const nodemask_t *srcp, int n, int nbits) - { - bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); -@@ -251,7 +251,7 @@ static inline void __nodes_shift_right(nodemask_t *dstp, - - #define nodes_shift_left(dst, src, n) \ - __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES) --static inline void __nodes_shift_left(nodemask_t *dstp, -+static __always_inline void __nodes_shift_left(nodemask_t *dstp, - const nodemask_t *srcp, int n, int nbits) - { - bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); -@@ -261,13 +261,13 @@ static inline void __nodes_shift_left(nodemask_t *dstp, - > MAX_NUMNODES, then the silly min_ts could be dropped. 
*/ - - #define first_node(src) __first_node(&(src)) --static inline unsigned int __first_node(const nodemask_t *srcp) -+static __always_inline unsigned int __first_node(const nodemask_t *srcp) - { - return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); - } - - #define next_node(n, src) __next_node((n), &(src)) --static inline unsigned int __next_node(int n, const nodemask_t *srcp) -+static __always_inline unsigned int __next_node(int n, const nodemask_t *srcp) - { - return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); - } -@@ -277,7 +277,7 @@ static inline unsigned int __next_node(int n, const nodemask_t *srcp) - * the first node in src if needed. Returns MAX_NUMNODES if src is empty. - */ - #define next_node_in(n, src) __next_node_in((n), &(src)) --static inline unsigned int __next_node_in(int node, const nodemask_t *srcp) -+static __always_inline unsigned int __next_node_in(int node, const nodemask_t *srcp) - { - unsigned int ret = __next_node(node, srcp); - -@@ -286,7 +286,7 @@ static inline unsigned int __next_node_in(int node, const nodemask_t *srcp) - return ret; - } - --static inline void init_nodemask_of_node(nodemask_t *mask, int node) -+static __always_inline void init_nodemask_of_node(nodemask_t *mask, int node) - { - nodes_clear(*mask); - node_set(node, *mask); -@@ -304,7 +304,7 @@ static inline void init_nodemask_of_node(nodemask_t *mask, int node) - }) - - #define first_unset_node(mask) __first_unset_node(&(mask)) --static inline unsigned int __first_unset_node(const nodemask_t *maskp) -+static __always_inline unsigned int __first_unset_node(const nodemask_t *maskp) - { - return min_t(unsigned int, MAX_NUMNODES, - find_first_zero_bit(maskp->bits, MAX_NUMNODES)); -@@ -338,21 +338,21 @@ static inline unsigned int __first_unset_node(const nodemask_t *maskp) - - #define nodemask_parse_user(ubuf, ulen, dst) \ - __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES) --static inline int __nodemask_parse_user(const char __user *buf, int len, -+static __always_inline int __nodemask_parse_user(const char __user *buf, int len, - nodemask_t *dstp, int nbits) - { - return bitmap_parse_user(buf, len, dstp->bits, nbits); - } - - #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES) --static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) -+static __always_inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) - { - return bitmap_parselist(buf, dstp->bits, nbits); - } - - #define node_remap(oldbit, old, new) \ - __node_remap((oldbit), &(old), &(new), MAX_NUMNODES) --static inline int __node_remap(int oldbit, -+static __always_inline int __node_remap(int oldbit, - const nodemask_t *oldp, const nodemask_t *newp, int nbits) - { - return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits); -@@ -360,7 +360,7 @@ static inline int __node_remap(int oldbit, - - #define nodes_remap(dst, src, old, new) \ - __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES) --static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, -+static __always_inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, - const nodemask_t *oldp, const nodemask_t *newp, int nbits) - { - bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits); -@@ -368,7 +368,7 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, - - #define nodes_onto(dst, orig, relmap) \ - __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES) --static 
inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, -+static __always_inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, - const nodemask_t *relmapp, int nbits) - { - bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits); -@@ -376,7 +376,7 @@ static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, - - #define nodes_fold(dst, orig, sz) \ - __nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES) --static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, -+static __always_inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, - int sz, int nbits) - { - bitmap_fold(dstp->bits, origp->bits, sz, nbits); -@@ -418,22 +418,22 @@ enum node_states { - extern nodemask_t node_states[NR_NODE_STATES]; - - #if MAX_NUMNODES > 1 --static inline int node_state(int node, enum node_states state) -+static __always_inline int node_state(int node, enum node_states state) - { - return node_isset(node, node_states[state]); - } - --static inline void node_set_state(int node, enum node_states state) -+static __always_inline void node_set_state(int node, enum node_states state) - { - __node_set(node, &node_states[state]); - } - --static inline void node_clear_state(int node, enum node_states state) -+static __always_inline void node_clear_state(int node, enum node_states state) - { - __node_clear(node, &node_states[state]); - } - --static inline int num_node_state(enum node_states state) -+static __always_inline int num_node_state(enum node_states state) - { - return nodes_weight(node_states[state]); - } -@@ -443,11 +443,11 @@ static inline int num_node_state(enum node_states state) - - #define first_online_node first_node(node_states[N_ONLINE]) - #define first_memory_node first_node(node_states[N_MEMORY]) --static inline unsigned int next_online_node(int nid) -+static __always_inline unsigned int next_online_node(int nid) - { - return next_node(nid, node_states[N_ONLINE]); - } --static inline unsigned int next_memory_node(int nid) -+static __always_inline unsigned int next_memory_node(int nid) - { - return next_node(nid, node_states[N_MEMORY]); - } -@@ -455,13 +455,13 @@ static inline unsigned int next_memory_node(int nid) - extern unsigned int nr_node_ids; - extern unsigned int nr_online_nodes; - --static inline void node_set_online(int nid) -+static __always_inline void node_set_online(int nid) - { - node_set_state(nid, N_ONLINE); - nr_online_nodes = num_node_state(N_ONLINE); - } - --static inline void node_set_offline(int nid) -+static __always_inline void node_set_offline(int nid) - { - node_clear_state(nid, N_ONLINE); - nr_online_nodes = num_node_state(N_ONLINE); -@@ -469,20 +469,20 @@ static inline void node_set_offline(int nid) - - #else - --static inline int node_state(int node, enum node_states state) -+static __always_inline int node_state(int node, enum node_states state) - { - return node == 0; - } - --static inline void node_set_state(int node, enum node_states state) -+static __always_inline void node_set_state(int node, enum node_states state) - { - } - --static inline void node_clear_state(int node, enum node_states state) -+static __always_inline void node_clear_state(int node, enum node_states state) - { - } - --static inline int num_node_state(enum node_states state) -+static __always_inline int num_node_state(enum node_states state) - { - return 1; - } -@@ -502,7 +502,7 @@ static inline int num_node_state(enum node_states state) - - #endif - --static inline int node_random(const nodemask_t *maskp) -+static 
__always_inline int node_random(const nodemask_t *maskp) - { - #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) - int w, bit; --- -2.40.0.rc2 - -From 5d1ae6ec70d7e64ac75501503e3dcf229e0942fb Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Sat, 11 Mar 2023 14:42:34 +0100 -Subject: [PATCH 04/16] cachy +Date: Sun, 9 Apr 2023 21:21:58 +0200 +Subject: [PATCH 03/10] cachy Signed-off-by: Peter Jung --- .gitignore | 1 + - .../admin-guide/kernel-parameters.txt | 11 +- + .../admin-guide/kernel-parameters.txt | 9 + Documentation/dontdiff | 1 + Makefile | 8 +- arch/arc/configs/axs101_defconfig | 1 + @@ -6954,7 +3356,7 @@ Signed-off-by: Peter Jung drivers/i2c/busses/i2c-nct6775.c | 647 ++++++++++++++++++ drivers/i2c/busses/i2c-piix4.c | 4 +- drivers/md/dm-crypt.c | 5 + - drivers/pci/quirks.c | 103 ++- + drivers/pci/quirks.c | 101 +++ drivers/platform/x86/Kconfig | 14 + drivers/platform/x86/Makefile | 3 + drivers/platform/x86/steamdeck.c | 523 ++++++++++++++ @@ -6966,15 +3368,9 @@ Signed-off-by: Peter Jung kernel/Kconfig.hz | 24 + kernel/fork.c | 14 + kernel/module/Kconfig | 25 + - kernel/rcu/Kconfig | 4 +- - kernel/rcu/rcutorture.c | 2 +- - kernel/rcu/tree.c | 6 +- - kernel/rcu/tree_nocb.h | 4 +- - kernel/rcu/tree_plugin.h | 4 +- kernel/sched/fair.c | 20 +- kernel/sysctl.c | 12 + kernel/user_namespace.c | 7 + - lib/string.c | 62 +- mm/Kconfig | 2 +- mm/compaction.c | 4 + mm/page-writeback.c | 8 + @@ -6986,16 +3382,16 @@ Signed-off-by: Peter Jung net/ipv4/tcp_ipv4.c | 2 + scripts/Makefile.lib | 13 +- scripts/Makefile.modinst | 7 +- - 61 files changed, 2200 insertions(+), 76 deletions(-) + 55 files changed, 2144 insertions(+), 46 deletions(-) create mode 100644 arch/x86/Makefile.postlink create mode 100644 drivers/i2c/busses/i2c-nct6775.c create mode 100644 drivers/platform/x86/steamdeck.c diff --git a/.gitignore b/.gitignore -index 20dce5c3b9e0..466c23de56ce 100644 +index 70ec6037fa7a..9bafd3c6bb5f 100644 --- a/.gitignore +++ b/.gitignore -@@ -63,6 +63,7 @@ modules.order +@@ -65,6 +65,7 @@ modules.order /vmlinux /vmlinux.32 /vmlinux.map @@ -7004,10 +3400,10 @@ index 20dce5c3b9e0..466c23de56ce 100644 /vmlinux-gdb.py /vmlinuz diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 6cfa6e3996cf..9595abf34974 100644 +index 6221a1d057dd..4f6761a93715 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -4178,6 +4178,15 @@ +@@ -4190,6 +4190,15 @@ nomsi [MSI] If the PCI_MSI kernel config parameter is enabled, this kernel boot option can be used to disable the use of MSI interrupts system-wide. @@ -7023,20 +3419,11 @@ index 6cfa6e3996cf..9595abf34974 100644 noioapicquirk [APIC] Disable all boot interrupt quirks. Safety option to keep boot IRQs enabled. This should never be necessary. -@@ -4751,7 +4760,7 @@ - overwritten. - - rcutree.kthread_prio= [KNL,BOOT] -- Set the SCHED_FIFO priority of the RCU per-CPU -+ Set the SCHED_RR priority of the RCU per-CPU - kthreads (rcuc/N). 
This value is also used for - the priority of the RCU boost threads (rcub/N) - and for the RCU grace-period kthreads (rcu_bh, diff --git a/Documentation/dontdiff b/Documentation/dontdiff -index 352ff53a2306..7c210744d84c 100644 +index 3c399f132e2d..a62ad01e6d11 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff -@@ -255,6 +255,7 @@ vmlinux.aout +@@ -254,6 +254,7 @@ vmlinux.aout vmlinux.bin.all vmlinux.lds vmlinux.map @@ -7045,10 +3432,10 @@ index 352ff53a2306..7c210744d84c 100644 vmlinuz voffset.h diff --git a/Makefile b/Makefile -index 1a1d63f2a9ed..9caed88238ab 100644 +index 5aeea3d98fc0..c6249845f6a1 100644 --- a/Makefile +++ b/Makefile -@@ -834,6 +834,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) +@@ -818,6 +818,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE KBUILD_CFLAGS += -O2 KBUILD_RUSTFLAGS += -Copt-level=2 @@ -7058,7 +3445,7 @@ index 1a1d63f2a9ed..9caed88238ab 100644 else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os KBUILD_RUSTFLAGS += -Copt-level=s -@@ -1075,11 +1078,6 @@ KBUILD_CFLAGS += -fno-strict-overflow +@@ -1060,11 +1063,6 @@ KBUILD_CFLAGS += -fno-strict-overflow # Make sure -fstack-check isn't enabled (like gentoo apparently did) KBUILD_CFLAGS += -fno-stack-check @@ -7749,7 +4136,7 @@ index 542377cd419d..08d887d1220d 100644 config IA32_FEAT_CTL def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile -index 73ed982d4100..cb4c6620b34a 100644 +index b39975977c03..00d94852490b 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -67,7 +67,7 @@ export BITS @@ -7813,7 +4200,7 @@ index 73ed982d4100..cb4c6620b34a 100644 diff --git a/arch/x86/Makefile.postlink b/arch/x86/Makefile.postlink new file mode 100644 -index 000000000000..b38ffa4defb3 +index 000000000000..195af937aa4d --- /dev/null +++ b/arch/x86/Makefile.postlink @@ -0,0 +1,41 @@ @@ -7829,7 +4216,7 @@ index 000000000000..b38ffa4defb3 +__archpost: + +-include include/config/auto.conf -+include scripts/Kbuild.include ++include $(srctree)/scripts/Kbuild.include + +CMD_RELOCS = arch/x86/tools/relocs +quiet_cmd_relocs = RELOCS $@.relocs @@ -7871,7 +4258,7 @@ index 25805199a506..b2968175fc27 100644 mkpiggy piggy.S diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile -index 1acff356d97a..d995595394bb 100644 +index 6b6cfe607bdb..19d1fb601796 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -121,14 +121,12 @@ $(obj)/vmlinux.bin: vmlinux FORCE @@ -7984,10 +4371,10 @@ index 75884d2cdec3..18021e8c0c28 100644 #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE diff --git a/drivers/Makefile b/drivers/Makefile -index bdf1c66141c9..1e1a0832fb48 100644 +index 20b118dca999..c19dee206e53 100644 --- a/drivers/Makefile +++ b/drivers/Makefile -@@ -59,15 +59,8 @@ obj-y += char/ +@@ -64,15 +64,8 @@ obj-y += char/ # iommu/ comes before gpu as gpu are using iommu controllers obj-y += iommu/ @@ -8003,7 +4390,7 @@ index bdf1c66141c9..1e1a0832fb48 100644 obj-$(CONFIG_PARPORT) += parport/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ -@@ -79,6 +72,14 @@ obj-y += macintosh/ +@@ -84,6 +77,14 @@ obj-y += macintosh/ obj-y += scsi/ obj-y += nvme/ obj-$(CONFIG_ATA) += ata/ @@ -8019,7 +4406,7 @@ index bdf1c66141c9..1e1a0832fb48 100644 obj-$(CONFIG_MTD) += mtd/ obj-$(CONFIG_SPI) += spi/ diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig -index a7bfddf08fa7..c9a5fa597950 100644 +index 
25eb4e8fd22f..2f95d74ad0b4 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -229,6 +229,15 @@ config I2C_CHT_WC @@ -8039,7 +4426,7 @@ index a7bfddf08fa7..c9a5fa597950 100644 tristate "Nvidia nForce2, nForce3 and nForce4" depends on PCI diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile -index e73cdb1d2b5a..052ccd05c13c 100644 +index af56fe2c75c0..76be74584719 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_I2C_CHT_WC) += i2c-cht-wc.o @@ -8722,10 +5109,10 @@ index 809fbd014cd6..d54b35b147ee 100644 /* If the SMBus is still busy, we give up */ if (timeout == MAX_TIMEOUT) { diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c -index 2653516bcdef..973fe8f80051 100644 +index 3ba53dc3cc3f..0fde1b3ced78 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c -@@ -3207,6 +3207,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) +@@ -3213,6 +3213,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } @@ -8738,7 +5125,7 @@ index 2653516bcdef..973fe8f80051 100644 if (ret < 0) goto bad; diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c -index 494fa46f5767..bcdfc072cbfb 100644 +index 44cab813bf95..25edf55de985 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3612,6 +3612,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) @@ -8848,18 +5235,16 @@ index 494fa46f5767..bcdfc072cbfb 100644 /* * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be * prevented for those affected devices. -@@ -5000,8 +5100,7 @@ static const struct pci_dev_acs_enabled { - { PCI_VENDOR_ID_NXP, 0x8d9b, pci_quirk_nxp_rp_acs }, - /* Zhaoxin Root/Downstream Ports */ +@@ -5002,6 +5102,7 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, -- /* Wangxun nics */ -- { PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs }, + /* Wangxun nics */ + { PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs }, + { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, { 0 } }; diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig -index 5692385e2d26..badc4f642ad2 100644 +index 4a01b315e0a9..e9ddf76b8b57 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1099,6 +1099,20 @@ config WINMATE_FM07_KEYS @@ -9424,10 +5809,10 @@ index 000000000000..77a6677ec19e +MODULE_DESCRIPTION("Steam Deck ACPI platform driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h -index 29e1f9e76eb6..a7852e22101f 100644 +index 0acb8e1fb7af..b0b49c8653b0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h -@@ -1178,7 +1178,7 @@ struct readahead_control { +@@ -1182,7 +1182,7 @@ struct readahead_control { ._index = i, \ } @@ -9489,7 +5874,7 @@ index 901b440238d5..7026df84a0f6 100644 TP_PROTO(const struct sock *sk, const struct request_sock *req), diff --git a/init/Kconfig b/init/Kconfig -index 44e90b28a30f..748a9491ca12 100644 +index 1fb5f313d18f..9b298860cfed 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -123,6 +123,10 @@ config THREAD_INFO_IN_TASK @@ -9546,7 +5931,7 @@ index 44e90b28a30f..748a9491ca12 100644 config PID_NS bool "PID Namespaces" default y -@@ -1420,6 +1453,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE +@@ -1433,6 +1466,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE with the "-O2" compiler flag for best performance and most helpful compile-time warnings. 
@@ -9602,7 +5987,7 @@ index 38ef6d06888e..0f78364efd4f 100644 config SCHED_HRTICK diff --git a/kernel/fork.c b/kernel/fork.c -index 9f7fe3541897..068062cdf5a3 100644 +index 0c92f224c68c..49c173e367d2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -98,6 +98,10 @@ @@ -9616,7 +6001,7 @@ index 9f7fe3541897..068062cdf5a3 100644 #include #include #include -@@ -2030,6 +2034,10 @@ static __latent_entropy struct task_struct *copy_process( +@@ -2031,6 +2035,10 @@ static __latent_entropy struct task_struct *copy_process( if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -9627,7 +6012,7 @@ index 9f7fe3541897..068062cdf5a3 100644 /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. -@@ -3180,6 +3188,12 @@ int ksys_unshare(unsigned long unshare_flags) +@@ -3181,6 +3189,12 @@ int ksys_unshare(unsigned long unshare_flags) if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; @@ -9676,105 +6061,8 @@ index 424b3bc58f3f..ecf2798c5ccf 100644 config MODULE_DECOMPRESS bool "Support in-kernel module decompression" depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD -diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig -index ab62074174c3..f1f909bdc30d 100644 ---- a/kernel/rcu/Kconfig -+++ b/kernel/rcu/Kconfig -@@ -280,9 +280,9 @@ config RCU_NOCB_CPU_CB_BOOST - depends on RCU_NOCB_CPU && RCU_BOOST - default y if PREEMPT_RT - help -- Use this option to invoke offloaded callbacks as SCHED_FIFO -+ Use this option to invoke offloaded callbacks as SCHED_RR - to avoid starvation by heavy SCHED_OTHER background load. -- Of course, running as SCHED_FIFO during callback floods will -+ Of course, running as SCHED_RR during callback floods will - cause the rcuo[ps] kthreads to monopolize the CPU for hundreds - of milliseconds or more. Therefore, when enabling this option, - it is your responsibility to ensure that latency-sensitive -diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c -index 634df26a2c27..8c54871cc0a0 100644 ---- a/kernel/rcu/rcutorture.c -+++ b/kernel/rcu/rcutorture.c -@@ -2406,7 +2406,7 @@ static int rcutorture_booster_init(unsigned int cpu) - t = per_cpu(ksoftirqd, cpu); - WARN_ON_ONCE(!t); - sp.sched_priority = 2; -- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); -+ sched_setscheduler_nocheck(t, SCHED_RR, &sp); - } - - /* Don't allow time recalculation while creating a new task. 
*/ -diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c -index cf34a961821a..80cf9824d461 100644 ---- a/kernel/rcu/tree.c -+++ b/kernel/rcu/tree.c -@@ -4443,8 +4443,8 @@ static void __init rcu_start_exp_gp_kworkers(void) - return; - } - -- sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); -- sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, -+ sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_RR, ¶m); -+ sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_RR, - ¶m); - } - -@@ -4482,7 +4482,7 @@ static int __init rcu_spawn_gp_kthread(void) - return 0; - if (kthread_prio) { - sp.sched_priority = kthread_prio; -- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); -+ sched_setscheduler_nocheck(t, SCHED_RR, &sp); - } - rnp = rcu_get_root(); - raw_spin_lock_irqsave_rcu_node(rnp, flags); -diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h -index 9e1c8caec5ce..dd39c50ae099 100644 ---- a/kernel/rcu/tree_nocb.h -+++ b/kernel/rcu/tree_nocb.h -@@ -1465,7 +1465,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) - } - WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); - if (kthread_prio) -- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); -+ sched_setscheduler_nocheck(t, SCHED_RR, &sp); - } - mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); - -@@ -1476,7 +1476,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) - goto end; - - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio) -- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); -+ sched_setscheduler_nocheck(t, SCHED_RR, &sp); - - WRITE_ONCE(rdp->nocb_cb_kthread, t); - WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); -diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h -index 7b0fe741a088..77ad9e033358 100644 ---- a/kernel/rcu/tree_plugin.h -+++ b/kernel/rcu/tree_plugin.h -@@ -1007,7 +1007,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) - struct sched_param sp; - - sp.sched_priority = kthread_prio; -- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); -+ sched_setscheduler_nocheck(current, SCHED_RR, &sp); - #endif /* #ifdef CONFIG_RCU_BOOST */ - - WRITE_ONCE(rdp->rcuc_activity, jiffies); -@@ -1206,7 +1206,7 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) - rnp->boost_kthread_task = t; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - sp.sched_priority = kthread_prio; -- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); -+ sched_setscheduler_nocheck(t, SCHED_RR, &sp); - wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. 
*/ - - out: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 0f8736991427..86a988c830ef 100644 +index 6986ea31c984..dcdd8422de72 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -69,9 +69,13 @@ @@ -9864,7 +6152,7 @@ index 1c240d2c99bc..98e1a7472fd2 100644 { .procname = "tainted", diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index 54211dbd516c..16ca0c151629 100644 +index 1d8e47bed3f1..fec01d016a35 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -22,6 +22,13 @@ @@ -9881,102 +6169,11 @@ index 54211dbd516c..16ca0c151629 100644 static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); -diff --git a/lib/string.c b/lib/string.c -index 4fb566ea610f..4746a98b153e 100644 ---- a/lib/string.c -+++ b/lib/string.c -@@ -792,24 +792,61 @@ char *strnstr(const char *s1, const char *s2, size_t len) - EXPORT_SYMBOL(strnstr); - #endif - -+#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 -+ -+#define MEMCHR_MASK_GEN(mask) (mask *= 0x0101010101010101ULL) -+ -+#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) -+ -+#define MEMCHR_MASK_GEN(mask) \ -+ do { \ -+ mask *= 0x01010101; \ -+ mask |= mask << 32; \ -+ } while (0) -+ -+#else -+ -+#define MEMCHR_MASK_GEN(mask) \ -+ do { \ -+ mask |= mask << 8; \ -+ mask |= mask << 16; \ -+ mask |= mask << 32; \ -+ } while (0) -+ -+#endif -+ - #ifndef __HAVE_ARCH_MEMCHR - /** - * memchr - Find a character in an area of memory. -- * @s: The memory area -+ * @p: The memory area - * @c: The byte to search for -- * @n: The size of the area. -+ * @length: The size of the area. - * - * returns the address of the first occurrence of @c, or %NULL - * if @c is not found - */ --void *memchr(const void *s, int c, size_t n) -+void *memchr(const void *p, int c, unsigned long length) - { -- const unsigned char *p = s; -- while (n-- != 0) { -- if ((unsigned char)c == *p++) { -- return (void *)(p - 1); -+ u64 mask, val; -+ const void *end = p + length; -+ -+ c &= 0xff; -+ if (p <= end - 8) { -+ mask = c; -+ MEMCHR_MASK_GEN(mask); -+ -+ for (; p <= end - 8; p += 8) { -+ val = *(u64 *)p ^ mask; -+ if ((val + 0xfefefefefefefeffu) & -+ (~val & 0x8080808080808080u)) -+ break; - } - } -+ -+ for (; p < end; p++) -+ if (*(unsigned char *)p == c) -+ return (void *)p; -+ - return NULL; - } - EXPORT_SYMBOL(memchr); -@@ -845,16 +882,7 @@ void *memchr_inv(const void *start, int c, size_t bytes) - return check_bytes8(start, value, bytes); - - value64 = value; --#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 -- value64 *= 0x0101010101010101ULL; --#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) -- value64 *= 0x01010101; -- value64 |= value64 << 32; --#else -- value64 |= value64 << 8; -- value64 |= value64 << 16; -- value64 |= value64 << 32; --#endif -+ MEMCHR_MASK_GEN(value64); - - prefix = (unsigned long)start % 8; - if (prefix) { diff --git a/mm/Kconfig b/mm/Kconfig -index ff7b209dec05..bf317c39ed2d 100644 +index 4751031f3f05..cf2e47030fe8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig -@@ -602,7 +602,7 @@ config COMPACTION +@@ -621,7 +621,7 @@ config COMPACTION config COMPACT_UNEVICTABLE_DEFAULT int depends on COMPACTION @@ -9986,10 +6183,10 @@ index ff7b209dec05..bf317c39ed2d 100644 # diff --git a/mm/compaction.c b/mm/compaction.c -index 8238e83385a7..d0b16a5b30f7 100644 +index 5a9501e0ae01..4d8c63b9cdca 100644 --- a/mm/compaction.c +++ b/mm/compaction.c -@@ -2717,7 +2717,11 @@ static void compact_nodes(void) +@@ -2735,7 +2735,11 @@ static void compact_nodes(void) * 
aggressively the kernel should compact memory in the * background. It takes values in the range [0, 100]. */ @@ -10002,7 +6199,7 @@ index 8238e83385a7..d0b16a5b30f7 100644 int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) diff --git a/mm/page-writeback.c b/mm/page-writeback.c -index ad608ef2a243..178cfd5490b1 100644 +index 516b1aa247e8..78fb31d27ed7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -71,7 +71,11 @@ static long ratelimit_pages = 32; @@ -10030,10 +6227,10 @@ index ad608ef2a243..178cfd5490b1 100644 EXPORT_SYMBOL_GPL(dirty_writeback_interval); diff --git a/mm/swap.c b/mm/swap.c -index 70e2063ef43a..79ab9b1c3910 100644 +index 57cb01b042f6..3a7bec75480f 100644 --- a/mm/swap.c +++ b/mm/swap.c -@@ -1134,6 +1134,10 @@ EXPORT_SYMBOL(pagevec_lookup_range_tag); +@@ -1090,6 +1090,10 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) */ void __init swap_setup(void) { @@ -10044,7 +6241,7 @@ index 70e2063ef43a..79ab9b1c3910 100644 unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ -@@ -1145,4 +1149,5 @@ void __init swap_setup(void) +@@ -1101,4 +1105,5 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ @@ -10067,10 +6264,10 @@ index b52644771cc4..11a4b0e3b583 100644 /* diff --git a/mm/vmscan.c b/mm/vmscan.c -index 5b7b8d4f5297..160acbbdf111 100644 +index 9c1c5e8b24b8..71a7f4517e5a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c -@@ -190,7 +190,11 @@ struct scan_control { +@@ -186,7 +186,11 @@ struct scan_control { /* * From 0 .. 200. Higher means more swappy. */ @@ -10082,7 +6279,7 @@ index 5b7b8d4f5297..160acbbdf111 100644 static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) -@@ -4559,7 +4563,11 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned +@@ -4536,7 +4540,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc } /* to protect the working set of the last N jiffies */ @@ -10174,10 +6371,10 @@ index 754e0212c951..b6d7faeb737a 100644 * drop receive data on the floor. It will get retransmitted * and hopefully then we'll have sufficient space. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c -index 8320d0ecb13a..37a09cd767a1 100644 +index ea370afa70ed..b869b6c1b226 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c -@@ -3274,6 +3274,8 @@ static int __net_init tcp_sk_init(struct net *net) +@@ -3275,6 +3275,8 @@ static int __net_init tcp_sk_init(struct net *net) else net->ipv4.tcp_congestion_control = &tcp_reno; @@ -10187,10 +6384,10 @@ index 8320d0ecb13a..37a09cd767a1 100644 } diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib -index 4a4a5f67c1a6..993e4578c0f2 100644 +index 100a386fcd71..a3ec7265fb57 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib -@@ -557,14 +557,21 @@ quiet_cmd_xzmisc = XZMISC $@ +@@ -542,14 +542,21 @@ quiet_cmd_xzmisc = XZMISC $@ # decompression is used, like initramfs decompression, zstd22 should likely not # be used because it would require zstd to allocate a 128 MB buffer. 
@@ -10216,7 +6413,7 @@ index 4a4a5f67c1a6..993e4578c0f2 100644 # ASM offsets # --------------------------------------------------------------------------- diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst -index 4815a8e32227..6a3c36713045 100644 +index ab0c5bd1a60f..f4989f706d7f 100644 --- a/scripts/Makefile.modinst +++ b/scripts/Makefile.modinst @@ -100,8 +100,13 @@ quiet_cmd_gzip = GZIP $@ @@ -10235,1350 +6432,12 @@ index 4815a8e32227..6a3c36713045 100644 $(dst)/%.ko.gz: $(dst)/%.ko FORCE $(call cmd,gzip) -- -2.40.0.rc2 +2.40.0 -From 0e45a02aaaa398cc0465a407331459f28cdb1ae9 Mon Sep 17 00:00:00 2001 +From 9b77615274a43646ad38d250d0c63be888c15bda Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 10 Mar 2023 18:00:48 +0100 -Subject: [PATCH 05/16] clr - -Signed-off-by: Peter Jung ---- - arch/x86/kernel/tsc.c | 3 + - arch/x86/mm/fault.c | 4 +- - drivers/cpufreq/intel_pstate.c | 7 + - drivers/idle/intel_idle.c | 50 ++-- - drivers/input/serio/i8042.c | 10 +- - drivers/net/dummy.c | 2 +- - drivers/pci/pci.c | 2 +- - drivers/powercap/intel_rapl_common.c | 2 +- - drivers/thermal/intel/intel_powerclamp.c | 10 + - fs/xattr.c | 15 +- - include/linux/jbd2.h | 2 +- - include/linux/rcuref.h | 89 +++++++ - include/linux/types.h | 6 + - include/linux/wait.h | 2 + - include/net/dst.h | 21 +- - include/net/sock.h | 2 +- - include/uapi/linux/if_bonding.h | 2 +- - init/do_mounts.c | 16 +- - kernel/locking/rwsem.c | 4 +- - kernel/sched/wait.c | 24 ++ - kernel/watchdog.c | 2 +- - lib/Makefile | 2 +- - lib/raid6/algos.c | 4 +- - lib/rcuref.c | 311 +++++++++++++++++++++++ - mm/ksm.c | 11 +- - net/bridge/br_nf_core.c | 2 +- - net/core/dst.c | 26 +- - net/core/rtnetlink.c | 2 +- - net/ipv4/inet_connection_sock.c | 2 +- - net/ipv4/tcp.c | 4 +- - net/ipv6/route.c | 6 +- - net/netfilter/ipvs/ip_vs_xmit.c | 4 +- - 32 files changed, 559 insertions(+), 90 deletions(-) - create mode 100644 include/linux/rcuref.h - create mode 100644 lib/rcuref.c - -diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c -index a78e73da4a74..bab8a98080cf 100644 ---- a/arch/x86/kernel/tsc.c -+++ b/arch/x86/kernel/tsc.c -@@ -1569,6 +1569,9 @@ unsigned long calibrate_delay_is_known(void) - if (!constant_tsc || !mask) - return 0; - -+ if (cpu != 0) -+ return cpu_data(0).loops_per_jiffy; -+ - sibling = cpumask_any_but(mask, cpu); - if (sibling < nr_cpu_ids) - return cpu_data(sibling).loops_per_jiffy; -diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c -index 7b0d4ab894c8..1a14f52added 100644 ---- a/arch/x86/mm/fault.c -+++ b/arch/x86/mm/fault.c -@@ -799,9 +799,9 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, - if (!printk_ratelimit()) - return; - -- printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", -+ printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx cpu %i", - loglvl, tsk->comm, task_pid_nr(tsk), address, -- (void *)regs->ip, (void *)regs->sp, error_code); -+ (void *)regs->ip, (void *)regs->sp, error_code, raw_smp_processor_id()); - - print_vma_addr(KERN_CONT " in ", regs->ip); - -diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c -index fd73d6d2b808..0c0071ab3966 100644 ---- a/drivers/cpufreq/intel_pstate.c -+++ b/drivers/cpufreq/intel_pstate.c -@@ -366,6 +366,13 @@ static void intel_pstate_set_itmt_prio(int cpu) - * update them at any time after it has been called. - */ - sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu); -+ /* -+ * On some systems with overclocking enabled, CPPC.highest_perf is hardcoded to 0xff. 
-+ * In this case we can't use CPPC.highest_perf to enable ITMT. -+ * In this case we can look at MSR_HWP_CAPABILITIES bits [8:0] to decide. -+ */ -+ if (cppc_perf.highest_perf == 0xff) -+ cppc_perf.highest_perf = HWP_HIGHEST_PERF(READ_ONCE(all_cpu_data[cpu]->hwp_cap_cached)); - - if (max_highest_perf <= min_highest_perf) { - if (cppc_perf.highest_perf > max_highest_perf) -diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c -index f060ac7376e6..1cd277c8f77f 100644 ---- a/drivers/idle/intel_idle.c -+++ b/drivers/idle/intel_idle.c -@@ -572,7 +572,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -580,7 +580,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 33, -- .target_residency = 100, -+ .target_residency = 900, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -588,7 +588,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 133, -- .target_residency = 400, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -596,7 +596,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x32", - .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 166, -- .target_residency = 500, -+ .target_residency = 1500, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -604,7 +604,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 300, -- .target_residency = 900, -+ .target_residency = 2000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -612,7 +612,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 600, -- .target_residency = 1800, -+ .target_residency = 5000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -620,7 +620,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 2600, -- .target_residency = 7700, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -640,7 +640,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -648,7 +648,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 40, -- .target_residency = 100, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -656,7 +656,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 133, -- .target_residency = 400, -+ .target_residency = 1000, - .enter = &intel_idle, 
- .enter_s2idle = intel_idle_s2idle, }, - { -@@ -664,7 +664,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x32", - .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 166, -- .target_residency = 500, -+ .target_residency = 2000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -672,7 +672,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 300, -- .target_residency = 900, -+ .target_residency = 4000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -680,7 +680,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 600, -- .target_residency = 1800, -+ .target_residency = 7000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -688,7 +688,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 2600, -- .target_residency = 7700, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -709,7 +709,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -717,7 +717,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 70, -- .target_residency = 100, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -725,7 +725,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, - .exit_latency = 85, -- .target_residency = 200, -+ .target_residency = 600, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -733,7 +733,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x33", - .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, - .exit_latency = 124, -- .target_residency = 800, -+ .target_residency = 3000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -741,7 +741,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, - .exit_latency = 200, -- .target_residency = 800, -+ .target_residency = 3200, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -749,7 +749,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, - .exit_latency = 480, -- .target_residency = 5000, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -757,7 +757,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, - .exit_latency = 890, -- .target_residency = 5000, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -778,7 +778,7 @@ static struct cpuidle_state 
skx_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 300, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -807,7 +807,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 4, -- .target_residency = 4, -+ .target_residency = 40, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -815,7 +815,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 170, -- .target_residency = 600, -+ .target_residency = 900, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -981,7 +981,7 @@ static struct cpuidle_state spr_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 2, -- .target_residency = 4, -+ .target_residency = 40, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c -index 6dac7c1853a5..fab04cd8a7a0 100644 ---- a/drivers/input/serio/i8042.c -+++ b/drivers/input/serio/i8042.c -@@ -621,7 +621,7 @@ static int i8042_enable_kbd_port(void) - if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { - i8042_ctr &= ~I8042_CTR_KBDINT; - i8042_ctr |= I8042_CTR_KBDDIS; -- pr_err("Failed to enable KBD port\n"); -+ pr_info("Failed to enable KBD port\n"); - return -EIO; - } - -@@ -640,7 +640,7 @@ static int i8042_enable_aux_port(void) - if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { - i8042_ctr &= ~I8042_CTR_AUXINT; - i8042_ctr |= I8042_CTR_AUXDIS; -- pr_err("Failed to enable AUX port\n"); -+ pr_info("Failed to enable AUX port\n"); - return -EIO; - } - -@@ -732,7 +732,7 @@ static int i8042_check_mux(void) - i8042_ctr &= ~I8042_CTR_AUXINT; - - if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { -- pr_err("Failed to disable AUX port, can't use MUX\n"); -+ pr_info("Failed to disable AUX port, can't use MUX\n"); - return -EIO; - } - -@@ -955,7 +955,7 @@ static int i8042_controller_selftest(void) - do { - - if (i8042_command(¶m, I8042_CMD_CTL_TEST)) { -- pr_err("i8042 controller selftest timeout\n"); -+ pr_info("i8042 controller selftest timeout\n"); - return -ENODEV; - } - -@@ -977,7 +977,7 @@ static int i8042_controller_selftest(void) - pr_info("giving up on controller selftest, continuing anyway...\n"); - return 0; - #else -- pr_err("i8042 controller selftest failed\n"); -+ pr_info("i8042 controller selftest failed\n"); - return -EIO; - #endif - } -diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c -index c4b1b0aa438a..06b00f7a8eab 100644 ---- a/drivers/net/dummy.c -+++ b/drivers/net/dummy.c -@@ -43,7 +43,7 @@ - - #define DRV_NAME "dummy" - --static int numdummies = 1; -+static int numdummies = 0; - - /* fake multicast ability */ - static void set_multicast_list(struct net_device *dev) -diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c -index 7a67611dc5f4..48b350fe09d8 100644 ---- a/drivers/pci/pci.c -+++ b/drivers/pci/pci.c -@@ -62,7 +62,7 @@ struct pci_pme_device { - struct pci_dev *dev; - }; - --#define PME_TIMEOUT 1000 /* How long between PME checks */ -+#define PME_TIMEOUT 4000 /* How long between PME checks */ - - static void pci_dev_d3_sleep(struct pci_dev *dev) - { -diff --git a/drivers/powercap/intel_rapl_common.c 
b/drivers/powercap/intel_rapl_common.c -index 26d00b1853b4..3e239d6548b5 100644 ---- a/drivers/powercap/intel_rapl_common.c -+++ b/drivers/powercap/intel_rapl_common.c -@@ -1518,7 +1518,7 @@ static int __init rapl_init(void) - - id = x86_match_cpu(rapl_ids); - if (!id) { -- pr_err("driver does not support CPU family %d model %d\n", -+ pr_info("driver does not support CPU family %d model %d\n", - boot_cpu_data.x86, boot_cpu_data.x86_model); - - return -ENODEV; -diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c -index 2f4cbfdf26a0..2d297a1cfa34 100644 ---- a/drivers/thermal/intel/intel_powerclamp.c -+++ b/drivers/thermal/intel/intel_powerclamp.c -@@ -636,6 +636,11 @@ static const struct thermal_cooling_device_ops powerclamp_cooling_ops = { - .set_cur_state = powerclamp_set_cur_state, - }; - -+static const struct x86_cpu_id amd_cpu[] = { -+ { X86_VENDOR_AMD }, -+ {}, -+}; -+ - static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = { - X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL), - {} -@@ -645,6 +650,11 @@ MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids); - static int __init powerclamp_probe(void) - { - -+ if (x86_match_cpu(amd_cpu)){ -+ pr_info("Intel PowerClamp does not support AMD CPUs\n"); -+ return -ENODEV; -+ } -+ - if (!x86_match_cpu(intel_powerclamp_ids)) { - pr_err("CPU does not support MWAIT\n"); - return -ENODEV; -diff --git a/fs/xattr.c b/fs/xattr.c -index adab9a70b536..4ada829a3b1b 100644 ---- a/fs/xattr.c -+++ b/fs/xattr.c -@@ -139,16 +139,17 @@ xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, - } - - /* -- * In the user.* namespace, only regular files and directories can have -- * extended attributes. For sticky directories, only the owner and -- * privileged users can write attributes. -+ * In the user.* namespace, only regular files, symbolic links, and -+ * directories can have extended attributes. For symbolic links and -+ * sticky directories, only the owner and privileged users can write -+ * attributes. - */ - if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { -- if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) -+ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode)) - return (mask & MAY_WRITE) ? -EPERM : -ENODATA; -- if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && -- (mask & MAY_WRITE) && -- !inode_owner_or_capable(mnt_userns, inode)) -+ if (((S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX)) -+ || S_ISLNK(inode->i_mode)) && (mask & MAY_WRITE) -+ && !inode_owner_or_capable(mnt_userns, inode)) - return -EPERM; - } - -diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h -index 2170e0cc279d..e8fa79f5bb34 100644 ---- a/include/linux/jbd2.h -+++ b/include/linux/jbd2.h -@@ -45,7 +45,7 @@ - /* - * The default maximum commit age, in seconds. 
- */ --#define JBD2_DEFAULT_MAX_COMMIT_AGE 5 -+#define JBD2_DEFAULT_MAX_COMMIT_AGE 30 - - #ifdef CONFIG_JBD2_DEBUG - /* -diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h -new file mode 100644 -index 000000000000..57ffb3c02ace ---- /dev/null -+++ b/include/linux/rcuref.h -@@ -0,0 +1,89 @@ -+/* SPDX-License-Identifier: GPL-2.0-only */ -+#ifndef _LINUX_RCUREF_H -+#define _LINUX_RCUREF_H -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#define RCUREF_NOREF 0x00000000 -+#define RCUREF_ONEREF 0x00000001 -+#define RCUREF_MAXREF 0x7FFFFFFF -+#define RCUREF_SATURATED 0xA0000000 -+#define RCUREF_RELEASED 0xC0000000 -+#define RCUREF_DEAD 0xE0000000 -+ -+/** -+ * rcuref_init - Initialize a rcuref reference count with the given reference count -+ * @ref: Pointer to the reference count -+ * @cnt: The initial reference count typically '1' -+ */ -+static inline void rcuref_init(rcuref_t *ref, unsigned int cnt) -+{ -+ atomic_set(&ref->refcnt, cnt); -+} -+ -+/** -+ * rcuref_read - Read the number of held reference counts of a rcuref -+ * @ref: Pointer to the reference count -+ * -+ * Return: The number of held references (0 ... N) -+ */ -+static inline unsigned int rcuref_read(rcuref_t *ref) -+{ -+ unsigned int c = atomic_read(&ref->refcnt); -+ -+ /* Return 0 if within the DEAD zone. */ -+ return c >= RCUREF_RELEASED ? 0 : c; -+} -+ -+extern __must_check bool rcuref_get_slowpath(rcuref_t *ref, unsigned int new); -+ -+/** -+ * rcuref_get - Acquire one reference on a rcuref reference count -+ * @ref: Pointer to the reference count -+ * -+ * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF. -+ * -+ * Provides no memory ordering, it is assumed the caller has guaranteed the -+ * object memory to be stable (RCU, etc.). It does provide a control dependency -+ * and thereby orders future stores. See documentation in lib/rcuref.c -+ * -+ * Return: -+ * False if the attempt to acquire a reference failed. This happens -+ * when the last reference has been put already -+ * -+ * True if a reference was successfully acquired -+ */ -+static inline __must_check bool rcuref_get(rcuref_t *ref) -+{ -+ /* -+ * Unconditionally increase the reference count. The saturation and -+ * dead zones provide enough tolerance for this. -+ */ -+ unsigned int old = atomic_fetch_add_relaxed(1, &ref->refcnt); -+ -+ /* -+ * If the old value is less than RCUREF_MAXREF, this is a valid -+ * reference. -+ * -+ * In case the original value was RCUREF_NOREF the above -+ * unconditional increment raced with a concurrent put() operation -+ * dropping the last reference. That racing put() operation -+ * subsequently fails to mark the reference count dead because the -+ * count is now elevated again and the concurrent caller is -+ * therefore not allowed to deconstruct the object. 
-+ */ -+ if (likely(old < RCUREF_MAXREF)) -+ return true; -+ -+ /* Handle the cases inside the saturation and dead zones */ -+ return rcuref_get_slowpath(ref, old); -+} -+ -+extern __must_check bool rcuref_put(rcuref_t *ref); -+ -+#endif -diff --git a/include/linux/types.h b/include/linux/types.h -index ea8cf60a8a79..419baa980529 100644 ---- a/include/linux/types.h -+++ b/include/linux/types.h -@@ -175,6 +175,12 @@ typedef struct { - } atomic64_t; - #endif - -+typedef struct { -+ atomic_t refcnt; -+} rcuref_t; -+ -+#define RCUREF_INIT(i) { .refcnt = ATOMIC_INIT(i) } -+ - struct list_head { - struct list_head *next, *prev; - }; -diff --git a/include/linux/wait.h b/include/linux/wait.h -index a0307b516b09..edc21128f387 100644 ---- a/include/linux/wait.h -+++ b/include/linux/wait.h -@@ -165,6 +165,7 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) - - extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); - extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); -+extern void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); - extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); - extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); - -@@ -1192,6 +1193,7 @@ do { \ - */ - void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); - bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); -+void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); - long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); - void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); - long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); -diff --git a/include/net/dst.h b/include/net/dst.h -index d67fda89cd0f..0909a3306902 100644 ---- a/include/net/dst.h -+++ b/include/net/dst.h -@@ -16,6 +16,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -65,19 +66,29 @@ struct dst_entry { - * input/output/ops or performance tanks badly - */ - #ifdef CONFIG_64BIT -- atomic_t __refcnt; /* 64-bit offset 64 */ -+ rcuref_t __refcnt; /* 64-bit offset 64 */ - #endif - int __use; - unsigned long lastuse; -- struct lwtunnel_state *lwtstate; - struct rcu_head rcu_head; - short error; - short __pad; - __u32 tclassid; - #ifndef CONFIG_64BIT -- atomic_t __refcnt; /* 32-bit offset 64 */ -+ struct lwtunnel_state *lwtstate; -+ rcuref_t __refcnt; /* 32-bit offset 64 */ - #endif - netdevice_tracker dev_tracker; -+#ifdef CONFIG_64BIT -+ /* -+ * Ensure that lwtstate is not in the same cache line as __refcnt, -+ * because that would lead to false sharing under high contention -+ * of __refcnt. This also ensures that rtable::rt_genid is not -+ * sharing the same cache-line. 
-+ */ -+ int pad2[6]; -+ struct lwtunnel_state *lwtstate; -+#endif - }; - - struct dst_metrics { -@@ -228,7 +239,7 @@ static inline void dst_hold(struct dst_entry *dst) - * the placement of __refcnt in struct dst_entry - */ - BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); -- WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); -+ WARN_ON(!rcuref_get(&dst->__refcnt)); - } - - static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) -@@ -292,7 +303,7 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb - */ - static inline bool dst_hold_safe(struct dst_entry *dst) - { -- return atomic_inc_not_zero(&dst->__refcnt); -+ return rcuref_get(&dst->__refcnt); - } - - /** -diff --git a/include/net/sock.h b/include/net/sock.h -index c6584a352463..dbf85161c0c7 100644 ---- a/include/net/sock.h -+++ b/include/net/sock.h -@@ -2159,7 +2159,7 @@ sk_dst_get(struct sock *sk) - - rcu_read_lock(); - dst = rcu_dereference(sk->sk_dst_cache); -- if (dst && !atomic_inc_not_zero(&dst->__refcnt)) -+ if (dst && !rcuref_get(&dst->__refcnt)) - dst = NULL; - rcu_read_unlock(); - return dst; -diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h -index d174914a837d..bf8e2af101a3 100644 ---- a/include/uapi/linux/if_bonding.h -+++ b/include/uapi/linux/if_bonding.h -@@ -82,7 +82,7 @@ - #define BOND_STATE_ACTIVE 0 /* link is active */ - #define BOND_STATE_BACKUP 1 /* link is backup */ - --#define BOND_DEFAULT_MAX_BONDS 1 /* Default maximum number of devices to support */ -+#define BOND_DEFAULT_MAX_BONDS 0 /* Default maximum number of devices to support */ - - #define BOND_DEFAULT_TX_QUEUES 16 /* Default number of tx queues per device */ - -diff --git a/init/do_mounts.c b/init/do_mounts.c -index 811e94daf0a8..06fef7f97c02 100644 ---- a/init/do_mounts.c -+++ b/init/do_mounts.c -@@ -283,8 +283,18 @@ dev_t name_to_dev_t(const char *name) - if (strcmp(name, "/dev/ram") == 0) - return Root_RAM0; - #ifdef CONFIG_BLOCK -- if (strncmp(name, "PARTUUID=", 9) == 0) -- return devt_from_partuuid(name + 9); -+ if (strncmp(name, "PARTUUID=", 9) == 0) { -+ dev_t res; -+ int needtowait = 40<<1; -+ res = devt_from_partuuid(name + 9); -+ while (!res && needtowait) { -+ /* waiting 0.5 sec */ -+ msleep(500); -+ res = devt_from_partuuid(name + 9); -+ needtowait--; -+ } -+ return res; -+ } - if (strncmp(name, "PARTLABEL=", 10) == 0) - return devt_from_partlabel(name + 10); - if (strncmp(name, "/dev/", 5) == 0) -@@ -612,7 +622,9 @@ void __init prepare_namespace(void) - * For example, it is not atypical to wait 5 seconds here - * for the touchpad of a laptop to initialize. 
- */ -+ async_synchronize_full(); - wait_for_device_probe(); -+ async_synchronize_full(); - - md_run_setup(); - -diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c -index 84d5b649b95f..e341ca8731f7 100644 ---- a/kernel/locking/rwsem.c -+++ b/kernel/locking/rwsem.c -@@ -754,6 +754,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) - struct task_struct *new, *owner; - unsigned long flags, new_flags; - enum owner_state state; -+ int i = 0; - - lockdep_assert_preemption_disabled(); - -@@ -790,7 +791,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) - break; - } - -- cpu_relax(); -+ if (i++ > 1000) -+ cpu_relax(); - } - - return state; -diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c -index 133b74730738..1647fb8662eb 100644 ---- a/kernel/sched/wait.c -+++ b/kernel/sched/wait.c -@@ -47,6 +47,17 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_ - } - EXPORT_SYMBOL_GPL(add_wait_queue_priority); - -+void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) -+{ -+ unsigned long flags; -+ -+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE; -+ spin_lock_irqsave(&wq_head->lock, flags); -+ __add_wait_queue(wq_head, wq_entry); -+ spin_unlock_irqrestore(&wq_head->lock, flags); -+} -+EXPORT_SYMBOL(add_wait_queue_exclusive_lifo); -+ - void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) - { - unsigned long flags; -@@ -293,6 +304,19 @@ prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_ent - } - EXPORT_SYMBOL(prepare_to_wait_exclusive); - -+void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) -+{ -+ unsigned long flags; -+ -+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE; -+ spin_lock_irqsave(&wq_head->lock, flags); -+ if (list_empty(&wq_entry->entry)) -+ __add_wait_queue(wq_head, wq_entry); -+ set_current_state(state); -+ spin_unlock_irqrestore(&wq_head->lock, flags); -+} -+EXPORT_SYMBOL(prepare_to_wait_exclusive_lifo); -+ - void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) - { - wq_entry->flags = flags; -diff --git a/kernel/watchdog.c b/kernel/watchdog.c -index 8e61f21e7e33..be1439d38f26 100644 ---- a/kernel/watchdog.c -+++ b/kernel/watchdog.c -@@ -41,7 +41,7 @@ unsigned long __read_mostly watchdog_enabled; - int __read_mostly watchdog_user_enabled = 1; - int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; - int __read_mostly soft_watchdog_user_enabled = 1; --int __read_mostly watchdog_thresh = 10; -+int __read_mostly watchdog_thresh = 40; - static int __read_mostly nmi_watchdog_available; - - struct cpumask watchdog_cpumask __read_mostly; -diff --git a/lib/Makefile b/lib/Makefile -index 4d9461bfea42..71c9627153b8 100644 ---- a/lib/Makefile -+++ b/lib/Makefile -@@ -47,7 +47,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \ - list_sort.o uuid.o iov_iter.o clz_ctz.o \ - bsearch.o find_bit.o llist.o memweight.o kfifo.o \ - percpu-refcount.o rhashtable.o base64.o \ -- once.o refcount.o usercopy.o errseq.o bucket_locks.o \ -+ once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \ - generic-radix-tree.o - obj-$(CONFIG_STRING_SELFTEST) += test_string.o - obj-y += string_helpers.o -diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c -index a22a05c9af8a..a70bcbbd1673 100644 ---- a/lib/raid6/algos.c -+++ b/lib/raid6/algos.c -@@ -126,8 +126,10 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void) - - for (best = NULL, algo = raid6_recov_algos; 
*algo; algo++) - if (!best || (*algo)->priority > best->priority) -- if (!(*algo)->valid || (*algo)->valid()) -+ if (!(*algo)->valid || (*algo)->valid()) { - best = *algo; -+ break; -+ } - - if (best) { - raid6_2data_recov = best->data2; -diff --git a/lib/rcuref.c b/lib/rcuref.c -new file mode 100644 -index 000000000000..34fa40618fca ---- /dev/null -+++ b/lib/rcuref.c -@@ -0,0 +1,311 @@ -+// SPDX-License-Identifier: GPL-2.0-only -+ -+/* -+ * rcuref - A scalable reference count implementation for RCU managed objects -+ * -+ * rcuref is provided to replace open coded reference count implementations -+ * based on atomic_t. It protects explicitely RCU managed objects which can -+ * be visible even after the last reference has been dropped and the object -+ * is heading towards destruction. -+ * -+ * A common usage pattern is: -+ * -+ * get() -+ * rcu_read_lock(); -+ * p = get_ptr(); -+ * if (p && !atomic_inc_not_zero(&p->refcnt)) -+ * p = NULL; -+ * rcu_read_unlock(); -+ * return p; -+ * -+ * put() -+ * if (!atomic_dec_return(&->refcnt)) { -+ * remove_ptr(p); -+ * kfree_rcu((p, rcu); -+ * } -+ * -+ * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has -+ * O(N^2) behaviour under contention with N concurrent operations. -+ * -+ * rcuref uses atomic_fetch_add_relaxed() and atomic_fetch_sub_release() -+ * for the fast path, which scale better under contention. -+ * -+ * Why not refcount? -+ * ================= -+ * -+ * In principle it should be possible to make refcount use the rcuref -+ * scheme, but the destruction race described below cannot be prevented -+ * unless the protected object is RCU managed. -+ * -+ * Theory of operation -+ * =================== -+ * -+ * rcuref uses an unsigned integer reference counter. As long as the -+ * counter value is greater than or equal to RCUREF_ONEREF and not larger -+ * than RCUREF_MAXREF the reference is alive: -+ * -+ * NOREF ONEREF MAXREF SATURATED RELEASED DEAD -+ * 0 1 0x7FFFFFFF 0x8000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF -+ * <---valid ------------> <-------saturation zone-------> <-----------dead zone----------> -+ * -+ * The get() and put() operations do unconditional increments and -+ * decrements. The result is checked after the operation. This optimizes -+ * for the fast path. -+ * -+ * If the reference count is saturated or dead, then the increments and -+ * decrements are not harmful as the reference count still stays in the -+ * respective zones and is always set back to STATURATED resp. DEAD. The -+ * zones have room for 2^28 racing operations in each direction, which -+ * makes it practically impossible to escape the zones. -+ * -+ * Once the last reference is dropped the reference count becomes -+ * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The -+ * slowpath then tries to set the reference count from RCUREF_NOREF to -+ * RCUREF_DEAD via a cmpxchg(). This opens a small window where a -+ * concurrent rcuref_get() can acquire the reference count and bring it -+ * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD. -+ * -+ * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in -+ * DEAD + 1, which is inside the dead zone. If that happens the reference -+ * count is put back to DEAD. 
-+ * -+ * The actual race is possible due to the unconditional increment and -+ * decrements in rcuref_get() and rcuref_put(): -+ * -+ * T1 T2 -+ * get() put() -+ * if (atomic_fetch_sub(1, &ref->refcnt) >= 0) -+ * succeeds-> atomic_try_cmpxchg(&ref->refcnt, -1, DEAD); -+ * -+ * old = atomic_fetch_add(1, &ref->refcnt); <- Elevates refcount to DEAD + 1 -+ * -+ * As @old observed by T1 is within the dead zone the T1 get() fails. -+ * -+ * Possible critical states: -+ * -+ * Context Counter References Operation -+ * T1 1 1 init() -+ * T2 2 2 get() -+ * T1 1 1 put() -+ * T2 0 0 put() tries to mark dead -+ * T1 1 1 get() -+ * T2 1 1 put() mark dead fails -+ * T1 0 0 put() tries to mark dead -+ * T1 DEAD 0 put() mark dead succeeds -+ * T2 DEAD+1 0 get() fails and puts it back to DEAD -+ * -+ * Of course there are more complex scenarios, but the above illustrates -+ * the working principle. The rest is left to the imagination of the -+ * reader. -+ * -+ * Deconstruction race -+ * =================== -+ * -+ * The release operation must be protected by prohibiting a grace period in -+ * order to prevent a possible use after free: -+ * -+ * T1 T2 -+ * put() get() -+ * // ref->refcnt = ONEREF -+ * if (atomic_fetch_sub(1, &ref->cnt) > ONEREF) -+ * return false; <- Not taken -+ * -+ * // ref->refcnt == NOREF -+ * --> preemption -+ * // Elevates ref->c to ONEREF -+ * if (!atomic_fetch_add(1, &ref->refcnt) >= NOREF) -+ * return true; <- taken -+ * -+ * if (put(&p->ref)) { <-- Succeeds -+ * remove_pointer(p); -+ * kfree_rcu(p, rcu); -+ * } -+ * -+ * RCU grace period ends, object is freed -+ * -+ * atomic_cmpxchg(&ref->refcnt, NONE, DEAD); <- UAF -+ * -+ * This is prevented by disabling preemption around the put() operation as -+ * that's in most kernel configurations cheaper than a rcu_read_lock() / -+ * rcu_read_unlock() pair and in many cases even a NOOP. In any case it -+ * prevents the grace period which keeps the object alive until all put() -+ * operations complete. -+ * -+ * Saturation protection -+ * ===================== -+ * -+ * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX). -+ * Once this is exceedded the reference count becomes stale by setting it -+ * to RCUREF_SATURATED, which will cause a memory leak, but it prevents -+ * wrap arounds which obviously cause worse problems than a memory -+ * leak. When saturation is reached a warning is emitted. -+ * -+ * Race conditions -+ * =============== -+ * -+ * All reference count increment/decrement operations are unconditional and -+ * only verified after the fact. This optimizes for the good case and takes -+ * the occasional race vs. a dead or already saturated refcount into -+ * account. The saturation and dead zones are large enough to accomodate -+ * for that. -+ * -+ * Memory ordering -+ * =============== -+ * -+ * Memory ordering rules are slightly relaxed wrt regular atomic_t functions -+ * and provide only what is strictly required for refcounts. -+ * -+ * The increments are fully relaxed; these will not provide ordering. The -+ * rationale is that whatever is used to obtain the object to increase the -+ * reference count on will provide the ordering. For locked data -+ * structures, its the lock acquire, for RCU/lockless data structures its -+ * the dependent load. -+ * -+ * rcuref_get() provides a control dependency ordering future stores which -+ * ensures that the object is not modified when acquiring a reference -+ * fails. -+ * -+ * rcuref_put() provides release order, i.e. 
all prior loads and stores -+ * will be issued before. It also provides a control dependency ordering -+ * against the subsequent destruction of the object. -+ * -+ * If rcuref_put() successfully dropped the last reference and marked the -+ * object DEAD it also provides acquire ordering. -+ */ -+ -+#include -+#include -+ -+/** -+ * rcuref_get_slowpath - Slowpath of rcuref_get() -+ * @ref: Pointer to the reference count -+ * @old: The reference count before the unconditional increment -+ * operation in rcuref_get() -+ * -+ * Invoked when the reference count is outside of the valid zone. -+ * -+ * Return: -+ * False if the reference count was already marked dead -+ * -+ * True if the reference count is saturated, which prevents the -+ * object from being deconstructed ever. -+ */ -+bool rcuref_get_slowpath(rcuref_t *ref, unsigned int old) -+{ -+ /* -+ * If the reference count was already marked dead, undo the -+ * increment so it stays in the middle of the dead zone and return -+ * fail. -+ */ -+ if (old >= RCUREF_RELEASED) { -+ atomic_set(&ref->refcnt, RCUREF_DEAD); -+ return false; -+ } -+ -+ /* -+ * If it was saturated, warn and mark it so. In case the increment -+ * was already on a saturated value restore the saturation -+ * marker. This keeps it in the middle of the saturation zone and -+ * prevents the reference count from overflowing. This leaks the -+ * object memory, but prevents the obvious reference count overflow -+ * damage. -+ */ -+ WARN_ONCE(old >= RCUREF_MAXREF, "rcuref saturated - leaking memory"); -+ atomic_set(&ref->refcnt, RCUREF_SATURATED); -+ return true; -+} -+EXPORT_SYMBOL_GPL(rcuref_get_slowpath); -+ -+static __must_check bool __rcuref_put(rcuref_t *ref) -+{ -+ /* -+ * Unconditionally decrement the reference count. The saturation and -+ * dead zones provide enough tolerance for this. -+ */ -+ unsigned int old = atomic_fetch_sub_release(1, &ref->refcnt); -+ -+ /* -+ * If the old value is in the valid range and is greater than -+ * RCUREF_ONEREF, nothing to do. -+ */ -+ if (likely(old > RCUREF_ONEREF && old <= RCUREF_MAXREF)) -+ return false; -+ -+ /* Did this drop the last reference? */ -+ if (likely(old == RCUREF_ONEREF)) { -+ /* -+ * Carefully try to set the reference count to RCUREF_DEAD. -+ * -+ * This can fail if a concurrent get() operation has -+ * elevated it again or the corresponding put() even marked -+ * it dead already. Both are valid situations and do not -+ * require a retry. If this fails the caller is not -+ * allowed to deconstruct the object. -+ */ -+ if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF) -+ return false; -+ -+ /* -+ * The caller can safely schedule the object for -+ * deconstruction. Provide acquire ordering. -+ */ -+ smp_acquire__after_ctrl_dep(); -+ return true; -+ } -+ -+ /* -+ * If the reference count was already in the dead zone, then this -+ * put() operation is imbalanced. Warn, put the reference count back to -+ * DEAD and tell the caller to not deconstruct the object. -+ */ -+ if (WARN_ONCE(old >= RCUREF_RELEASED, "rcuref - imbalanced put()")) { -+ atomic_set(&ref->refcnt, RCUREF_DEAD); -+ return false; -+ } -+ -+ /* -+ * This is a put() operation on a saturated refcount. Restore the -+ * mean saturation value and tell the caller to not deconstruct the -+ * object. 
-+ */ -+ atomic_set(&ref->refcnt, RCUREF_SATURATED); -+ return false; -+} -+ -+/** -+ * rcuref_put -- Release one reference for a rcuref reference count -+ * @ref: Pointer to the reference count -+ * -+ * Can be invoked from any context. -+ * -+ * Provides release memory ordering, such that prior loads and stores are done -+ * before, and provides an acquire ordering on success such that free() -+ * must come after. -+ * -+ * Return: -+ * -+ * True if this was the last reference with no future references -+ * possible. This signals the caller that it can safely schedule the -+ * object, which is protected by the reference counter, for -+ * deconstruction. -+ * -+ * False if there are still active references or the put() raced -+ * with a concurrent get()/put() pair. Caller is not allowed to -+ * deconstruct the protected object. -+ */ -+bool rcuref_put(rcuref_t *ref) -+{ -+ bool released; -+ -+ /* -+ * Protect against a concurrent get()/put() pair which marks the -+ * reference count DEAD and schedules it for RCU free. This -+ * prevents a grace period and is cheaper than -+ * rcu_read_lock()/unlock(). -+ */ -+ preempt_disable(); -+ released = __rcuref_put(ref); -+ preempt_enable(); -+ return released; -+} -+EXPORT_SYMBOL_GPL(rcuref_put); -diff --git a/mm/ksm.c b/mm/ksm.c -index addf490da146..a92c9594a2d3 100644 ---- a/mm/ksm.c -+++ b/mm/ksm.c -@@ -2454,9 +2454,14 @@ static int ksm_scan_thread(void *nothing) - - if (ksmd_should_run()) { - sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); -- wait_event_interruptible_timeout(ksm_iter_wait, -- sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), -- msecs_to_jiffies(sleep_ms)); -+ if (sleep_ms >= 1000) -+ wait_event_interruptible_timeout(ksm_iter_wait, -+ sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), -+ msecs_to_jiffies(round_jiffies_relative(sleep_ms))); -+ else -+ wait_event_interruptible_timeout(ksm_iter_wait, -+ sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), -+ msecs_to_jiffies(sleep_ms)); - } else { - wait_event_freezable(ksm_thread_wait, - ksmd_should_run() || kthread_should_stop()); -diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c -index 8c69f0c95a8e..c2b628e3cc7f 100644 ---- a/net/bridge/br_nf_core.c -+++ b/net/bridge/br_nf_core.c -@@ -73,7 +73,7 @@ void br_netfilter_rtable_init(struct net_bridge *br) - { - struct rtable *rt = &br->fake_rtable; - -- atomic_set(&rt->dst.__refcnt, 1); -+ rcuref_init(&rt->dst.__refcnt, 1); - rt->dst.dev = br->dev; - dst_init_metrics(&rt->dst, br_dst_default_metrics, true); - rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; -diff --git a/net/core/dst.c b/net/core/dst.c -index 6d2dd03dafa8..750440803883 100644 ---- a/net/core/dst.c -+++ b/net/core/dst.c -@@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, - dst->tclassid = 0; - #endif - dst->lwtstate = NULL; -- atomic_set(&dst->__refcnt, initial_ref); -+ rcuref_init(&dst->__refcnt, initial_ref); - dst->__use = 0; - dst->lastuse = jiffies; - dst->flags = flags; -@@ -166,31 +166,15 @@ EXPORT_SYMBOL(dst_dev_put); - - void dst_release(struct dst_entry *dst) - { -- if (dst) { -- int newrefcnt; -- -- newrefcnt = atomic_dec_return(&dst->__refcnt); -- if (WARN_ONCE(newrefcnt < 0, "dst_release underflow")) -- net_warn_ratelimited("%s: dst:%p refcnt:%d\n", -- __func__, dst, newrefcnt); -- if (!newrefcnt) -- call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); -- } -+ if (dst && rcuref_put(&dst->__refcnt)) -+ call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); - } - EXPORT_SYMBOL(dst_release); - - void dst_release_immediate(struct 
dst_entry *dst) - { -- if (dst) { -- int newrefcnt; -- -- newrefcnt = atomic_dec_return(&dst->__refcnt); -- if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow")) -- net_warn_ratelimited("%s: dst:%p refcnt:%d\n", -- __func__, dst, newrefcnt); -- if (!newrefcnt) -- dst_destroy(dst); -- } -+ if (dst && rcuref_put(&dst->__refcnt)) -+ dst_destroy(dst); - } - EXPORT_SYMBOL(dst_release_immediate); - -diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c -index 64289bc98887..228c54bbdecc 100644 ---- a/net/core/rtnetlink.c -+++ b/net/core/rtnetlink.c -@@ -840,7 +840,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, - if (dst) { - ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); - ci.rta_used = dst->__use; -- ci.rta_clntref = atomic_read(&dst->__refcnt); -+ ci.rta_clntref = rcuref_read(&dst->__refcnt); - } - if (expires) { - unsigned long clock; -diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c -index f2c43f67187d..9885bfb429a2 100644 ---- a/net/ipv4/inet_connection_sock.c -+++ b/net/ipv4/inet_connection_sock.c -@@ -606,7 +606,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) - * having to remove and re-insert us on the wait queue. - */ - for (;;) { -- prepare_to_wait_exclusive(sk_sleep(sk), &wait, -+ prepare_to_wait_exclusive_lifo(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); - release_sock(sk); - if (reqsk_queue_empty(&icsk->icsk_accept_queue)) -diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index e9e8040d6491..f9b56123b3b8 100644 ---- a/net/ipv4/tcp.c -+++ b/net/ipv4/tcp.c -@@ -4815,8 +4815,8 @@ void __init tcp_init(void) - tcp_init_mem(); - /* Set per-socket limits to no more than 1/128 the pressure threshold */ - limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); -- max_wshare = min(4UL*1024*1024, limit); -- max_rshare = min(6UL*1024*1024, limit); -+ max_wshare = min(16UL*1024*1024, limit); -+ max_rshare = min(16UL*1024*1024, limit); - - init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; - init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; -diff --git a/net/ipv6/route.c b/net/ipv6/route.c -index a6983a13dd20..8b5e3d57b08d 100644 ---- a/net/ipv6/route.c -+++ b/net/ipv6/route.c -@@ -293,7 +293,7 @@ static const struct fib6_info fib6_null_entry_template = { - - static const struct rt6_info ip6_null_entry_template = { - .dst = { -- .__refcnt = ATOMIC_INIT(1), -+ .__refcnt = RCUREF_INIT(1), - .__use = 1, - .obsolete = DST_OBSOLETE_FORCE_CHK, - .error = -ENETUNREACH, -@@ -307,7 +307,7 @@ static const struct rt6_info ip6_null_entry_template = { - - static const struct rt6_info ip6_prohibit_entry_template = { - .dst = { -- .__refcnt = ATOMIC_INIT(1), -+ .__refcnt = RCUREF_INIT(1), - .__use = 1, - .obsolete = DST_OBSOLETE_FORCE_CHK, - .error = -EACCES, -@@ -319,7 +319,7 @@ static const struct rt6_info ip6_prohibit_entry_template = { - - static const struct rt6_info ip6_blk_hole_entry_template = { - .dst = { -- .__refcnt = ATOMIC_INIT(1), -+ .__refcnt = RCUREF_INIT(1), - .__use = 1, - .obsolete = DST_OBSOLETE_FORCE_CHK, - .error = -EINVAL, -diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c -index 029171379884..bc9dc51828f7 100644 ---- a/net/netfilter/ipvs/ip_vs_xmit.c -+++ b/net/netfilter/ipvs/ip_vs_xmit.c -@@ -339,7 +339,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, - spin_unlock_bh(&dest->dst_lock); - IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", - &dest->addr.ip, &dest_dst->dst_saddr.ip, -- atomic_read(&rt->dst.__refcnt)); -+ 
rcuref_read(&rt->dst.__refcnt)); - } - if (ret_saddr) - *ret_saddr = dest_dst->dst_saddr.ip; -@@ -507,7 +507,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, - spin_unlock_bh(&dest->dst_lock); - IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", - &dest->addr.in6, &dest_dst->dst_saddr.in6, -- atomic_read(&rt->dst.__refcnt)); -+ rcuref_read(&rt->dst.__refcnt)); - } - if (ret_saddr) - *ret_saddr = dest_dst->dst_saddr.in6; --- -2.40.0.rc2 - -From ed2979f1636e3197b42234c8acac4d20f4e2ed8e Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Fri, 10 Mar 2023 18:03:29 +0100 -Subject: [PATCH 06/16] fixes +Date: Sun, 9 Apr 2023 21:22:15 +0200 +Subject: [PATCH 04/10] fixes Signed-off-by: Peter Jung --- @@ -11587,42 +6446,27 @@ Signed-off-by: Peter Jung Documentation/admin-guide/mm/ksm.rst | 7 + Documentation/leds/index.rst | 1 + Documentation/leds/ledtrig-blkdev.rst | 158 +++ - arch/x86/boot/compressed/Makefile | 2 +- - arch/x86/events/rapl.c | 20 +- - arch/x86/kernel/cpu/amd.c | 9 + - arch/x86/mm/tlb.c | 2 +- - arch/x86/net/bpf_jit_comp.c | 5 +- drivers/bluetooth/btusb.c | 2 +- - drivers/char/tpm/tpm-chip.c | 60 +- - drivers/char/tpm/tpm.h | 73 + drivers/leds/trigger/Kconfig | 9 + drivers/leds/trigger/Makefile | 1 + drivers/leds/trigger/ledtrig-blkdev.c | 1221 +++++++++++++++++ - fs/eventpoll.c | 2 +- + fs/eventpoll.c | 188 ++- fs/proc/base.c | 1 + include/linux/mm_types.h | 7 +- include/linux/pageblock-flags.h | 2 +- kernel/kheaders.c | 10 +- - kernel/kthread.c | 5 + kernel/padata.c | 4 +- - lib/string.c | 10 +- - lib/zstd/decompress/huf_decompress.c | 2 +- - mm/compaction.c | 75 +- - mm/internal.h | 6 +- - mm/ksm.c | 196 ++- - mm/page_alloc.c | 22 +- - mm/z3fold.c | 2 - - mm/zsmalloc.c | 3 - + mm/ksm.c | 185 ++- scripts/Makefile.vmlinux_o | 2 +- sound/pci/hda/cs35l41_hda.c | 2 +- - .../selftests/vm/ksm_functional_tests.c | 96 +- - 34 files changed, 1995 insertions(+), 110 deletions(-) + .../selftests/mm/ksm_functional_tests.c | 96 +- + 19 files changed, 1862 insertions(+), 122 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-class-led-trigger-blkdev create mode 100644 Documentation/leds/ledtrig-blkdev.rst create mode 100644 drivers/leds/trigger/ledtrig-blkdev.c diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block -index cd14ecb3c9a5..ad47337ac75a 100644 +index 282de3680367..ac1dd2fbd855 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -101,6 +101,16 @@ Description: @@ -11727,10 +6571,10 @@ index 000000000000..28ce8c814fb7 + may not match the device special file paths written to + link_device and unlink_device.) 
diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst -index fb6ba2002a4b..f160f9487a90 100644 +index eed51a910c94..270560fef3b2 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst -@@ -173,6 +173,13 @@ stable_node_chains +@@ -171,6 +171,13 @@ stable_node_chains the number of KSM pages that hit the ``max_page_sharing`` limit stable_node_dups number of duplicated KSM pages @@ -11745,7 +6589,7 @@ index fb6ba2002a4b..f160f9487a90 100644 A high ratio of ``pages_sharing`` to ``pages_shared`` indicates good sharing, but a high ratio of ``pages_unshared`` to ``pages_sharing`` diff --git a/Documentation/leds/index.rst b/Documentation/leds/index.rst -index e5d63b940045..e3c24e468cbc 100644 +index b9ca081fac71..5e37d8e7bd28 100644 --- a/Documentation/leds/index.rst +++ b/Documentation/leds/index.rst @@ -10,6 +10,7 @@ LEDs @@ -11920,129 +6764,8 @@ index 000000000000..9ff5b99de451 +* The ``blkdev`` LED trigger supports many-to-many device/LED associations. + A device can be associated with multiple LEDs, and an LED can be associated + with multiple devices. -diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile -index d995595394bb..19d1fb601796 100644 ---- a/arch/x86/boot/compressed/Makefile -+++ b/arch/x86/boot/compressed/Makefile -@@ -50,7 +50,7 @@ KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) - KBUILD_CFLAGS += -fno-asynchronous-unwind-tables - KBUILD_CFLAGS += -D__DISABLE_EXPORTS - # Disable relocation relaxation in case the link is not PIE. --KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) -+KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no) - KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h - - # sev.c indirectly inludes inat-table.h which is generated during -diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c -index 52e6e7ed4f78..f000cc16d128 100644 ---- a/arch/x86/events/rapl.c -+++ b/arch/x86/events/rapl.c -@@ -343,14 +343,15 @@ static int rapl_pmu_event_init(struct perf_event *event) - if (event->cpu < 0) - return -EINVAL; - -- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; -- - if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) - return -EINVAL; - - cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); - bit = cfg - 1; - -+ if (bit != PERF_RAPL_PP0) -+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; -+ - /* check event supported */ - if (!(rapl_cntr_mask & (1 << bit))) - return -EINVAL; -@@ -363,7 +364,15 @@ static int rapl_pmu_event_init(struct perf_event *event) - pmu = cpu_to_rapl_pmu(event->cpu); - if (!pmu) - return -EINVAL; -- event->cpu = pmu->cpu; -+ -+ /* -+ * FIXME: RAPL PMU considers events are uncore and MSRs can be read from -+ * the first available CPU of the die. But this is not true for energy-cores -+ * event. Therefore as a workaround don't consider pmu->cpu here for PERF_RAPL_PP0. 
-+ */ -+ if (event->event_caps & PERF_EV_CAP_READ_ACTIVE_PKG) -+ event->cpu = pmu->cpu; -+ - event->pmu_private = pmu; - event->hw.event_base = rapl_msrs[bit].msr; - event->hw.config = cfg; -@@ -537,7 +546,7 @@ static struct perf_msr intel_rapl_spr_msrs[] = { - * - want to use same event codes across both architectures - */ - static struct perf_msr amd_rapl_msrs[] = { -- [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, 0, false, 0 }, -+ [PERF_RAPL_PP0] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK }, - [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, - [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, 0, false, 0 }, - [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, 0, false, 0 }, -@@ -764,7 +773,8 @@ static struct rapl_model model_spr = { - }; - - static struct rapl_model model_amd_hygon = { -- .events = BIT(PERF_RAPL_PKG), -+ .events = BIT(PERF_RAPL_PP0) | -+ BIT(PERF_RAPL_PKG), - .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, - .rapl_msrs = amd_rapl_msrs, - }; -diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c -index f769d6d08b43..06f2ede1544f 100644 ---- a/arch/x86/kernel/cpu/amd.c -+++ b/arch/x86/kernel/cpu/amd.c -@@ -880,6 +880,15 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) - } - } - #endif -+ /* -+ * Work around Erratum 1386. The XSAVES instruction malfunctions in -+ * certain circumstances on Zen1/2 uarch, and not all parts have had -+ * updated microcode at the time of writing (March 2023). -+ * -+ * Affected parts all have no supervisor XSAVE states, meaning that -+ * the XSAVEC instruction (which works fine) is equivalent. -+ */ -+ clear_cpu_cap(c, X86_FEATURE_XSAVES); - } - - static void init_amd_zn(struct cpuinfo_x86 *c) -diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c -index c1e31e9a85d7..92d73ccede70 100644 ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -1205,7 +1205,7 @@ void __flush_tlb_all(void) - */ - VM_WARN_ON_ONCE(preemptible()); - -- if (boot_cpu_has(X86_FEATURE_PGE)) { -+ if (cpu_feature_enabled(X86_FEATURE_PGE)) { - __flush_tlb_global(); - } else { - /* -diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c -index b808be77635e..6e696c6b7018 100644 ---- a/arch/x86/net/bpf_jit_comp.c -+++ b/arch/x86/net/bpf_jit_comp.c -@@ -343,9 +343,10 @@ static int emit_call(u8 **pprog, void *func, void *ip) - - static int emit_rsb_call(u8 **pprog, void *func, void *ip) - { -+ void *adjusted_ip; - OPTIMIZER_HIDE_VAR(func); -- x86_call_depth_emit_accounting(pprog, func); -- return emit_patch(pprog, func, ip, 0xE8); -+ adjusted_ip = (u8 *)ip + x86_call_depth_emit_accounting(pprog, func); -+ return emit_patch(pprog, func, adjusted_ip, 0xE8); - } - - static int emit_jump(u8 **pprog, void *func, void *ip) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c -index 18bc94718711..7b9ee86b4609 100644 +index 5c536151ef83..5a80379253a7 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -912,7 +912,7 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev) @@ -12054,168 +6777,6 @@ index 18bc94718711..7b9ee86b4609 100644 gpiod_set_value_cansleep(reset_gpio, 1); return; -diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c -index 741d8f3e8fb3..c467eeae9973 100644 ---- a/drivers/char/tpm/tpm-chip.c -+++ b/drivers/char/tpm/tpm-chip.c -@@ -512,6 +512,63 @@ static int tpm_add_legacy_sysfs(struct tpm_chip *chip) - return 0; - } - -+/* -+ * Some AMD fTPM versions may cause stutter -+ * 
https://www.amd.com/en/support/kb/faq/pa-410 -+ * -+ * Fixes are available in two series of fTPM firmware: -+ * 6.x.y.z series: 6.0.18.6 + -+ * 3.x.y.z series: 3.57.y.5 + -+ */ -+static bool tpm_amd_is_rng_defective(struct tpm_chip *chip) -+{ -+ u32 val1, val2; -+ u64 version; -+ int ret; -+ -+ if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) -+ return false; -+ -+ ret = tpm_request_locality(chip); -+ if (ret) -+ return false; -+ -+ ret = tpm2_get_tpm_pt(chip, TPM2_PT_MANUFACTURER, &val1, NULL); -+ if (ret) -+ goto release; -+ if (val1 != 0x414D4400U /* AMD */) { -+ ret = -ENODEV; -+ goto release; -+ } -+ ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_1, &val1, NULL); -+ if (ret) -+ goto release; -+ ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_2, &val2, NULL); -+ -+release: -+ tpm_relinquish_locality(chip); -+ -+ if (ret) -+ return false; -+ -+ version = ((u64)val1 << 32) | val2; -+ if ((version >> 48) == 6) { -+ if (version >= 0x0006000000180006ULL) -+ return false; -+ } else if ((version >> 48) == 3) { -+ if (version >= 0x0003005700000005ULL) -+ return false; -+ } else { -+ return false; -+ } -+ -+ dev_warn(&chip->dev, -+ "AMD fTPM version 0x%llx causes system stutter; hwrng disabled\n", -+ version); -+ -+ return true; -+} -+ - static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) - { - struct tpm_chip *chip = container_of(rng, struct tpm_chip, hwrng); -@@ -521,7 +578,8 @@ static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) - - static int tpm_add_hwrng(struct tpm_chip *chip) - { -- if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM) || tpm_is_firmware_upgrade(chip)) -+ if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM) || tpm_is_firmware_upgrade(chip) || -+ tpm_amd_is_rng_defective(chip)) - return 0; - - snprintf(chip->hwrng_name, sizeof(chip->hwrng_name), -diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h -index 24ee4e1cc452..830014a26609 100644 ---- a/drivers/char/tpm/tpm.h -+++ b/drivers/char/tpm/tpm.h -@@ -150,6 +150,79 @@ enum tpm_sub_capabilities { - TPM_CAP_PROP_TIS_DURATION = 0x120, - }; - -+enum tpm2_pt_props { -+ TPM2_PT_NONE = 0x00000000, -+ TPM2_PT_GROUP = 0x00000100, -+ TPM2_PT_FIXED = TPM2_PT_GROUP * 1, -+ TPM2_PT_FAMILY_INDICATOR = TPM2_PT_FIXED + 0, -+ TPM2_PT_LEVEL = TPM2_PT_FIXED + 1, -+ TPM2_PT_REVISION = TPM2_PT_FIXED + 2, -+ TPM2_PT_DAY_OF_YEAR = TPM2_PT_FIXED + 3, -+ TPM2_PT_YEAR = TPM2_PT_FIXED + 4, -+ TPM2_PT_MANUFACTURER = TPM2_PT_FIXED + 5, -+ TPM2_PT_VENDOR_STRING_1 = TPM2_PT_FIXED + 6, -+ TPM2_PT_VENDOR_STRING_2 = TPM2_PT_FIXED + 7, -+ TPM2_PT_VENDOR_STRING_3 = TPM2_PT_FIXED + 8, -+ TPM2_PT_VENDOR_STRING_4 = TPM2_PT_FIXED + 9, -+ TPM2_PT_VENDOR_TPM_TYPE = TPM2_PT_FIXED + 10, -+ TPM2_PT_FIRMWARE_VERSION_1 = TPM2_PT_FIXED + 11, -+ TPM2_PT_FIRMWARE_VERSION_2 = TPM2_PT_FIXED + 12, -+ TPM2_PT_INPUT_BUFFER = TPM2_PT_FIXED + 13, -+ TPM2_PT_HR_TRANSIENT_MIN = TPM2_PT_FIXED + 14, -+ TPM2_PT_HR_PERSISTENT_MIN = TPM2_PT_FIXED + 15, -+ TPM2_PT_HR_LOADED_MIN = TPM2_PT_FIXED + 16, -+ TPM2_PT_ACTIVE_SESSIONS_MAX = TPM2_PT_FIXED + 17, -+ TPM2_PT_PCR_COUNT = TPM2_PT_FIXED + 18, -+ TPM2_PT_PCR_SELECT_MIN = TPM2_PT_FIXED + 19, -+ TPM2_PT_CONTEXT_GAP_MAX = TPM2_PT_FIXED + 20, -+ TPM2_PT_NV_COUNTERS_MAX = TPM2_PT_FIXED + 22, -+ TPM2_PT_NV_INDEX_MAX = TPM2_PT_FIXED + 23, -+ TPM2_PT_MEMORY = TPM2_PT_FIXED + 24, -+ TPM2_PT_CLOCK_UPDATE = TPM2_PT_FIXED + 25, -+ TPM2_PT_CONTEXT_HASH = TPM2_PT_FIXED + 26, -+ TPM2_PT_CONTEXT_SYM = TPM2_PT_FIXED + 27, -+ TPM2_PT_CONTEXT_SYM_SIZE = TPM2_PT_FIXED + 28, -+ TPM2_PT_ORDERLY_COUNT = TPM2_PT_FIXED + 
29, -+ TPM2_PT_MAX_COMMAND_SIZE = TPM2_PT_FIXED + 30, -+ TPM2_PT_MAX_RESPONSE_SIZE = TPM2_PT_FIXED + 31, -+ TPM2_PT_MAX_DIGEST = TPM2_PT_FIXED + 32, -+ TPM2_PT_MAX_OBJECT_CONTEXT = TPM2_PT_FIXED + 33, -+ TPM2_PT_MAX_SESSION_CONTEXT = TPM2_PT_FIXED + 34, -+ TPM2_PT_PS_FAMILY_INDICATOR = TPM2_PT_FIXED + 35, -+ TPM2_PT_PS_LEVEL = TPM2_PT_FIXED + 36, -+ TPM2_PT_PS_REVISION = TPM2_PT_FIXED + 37, -+ TPM2_PT_PS_DAY_OF_YEAR = TPM2_PT_FIXED + 38, -+ TPM2_PT_PS_YEAR = TPM2_PT_FIXED + 39, -+ TPM2_PT_SPLIT_MAX = TPM2_PT_FIXED + 40, -+ TPM2_PT_TOTAL_COMMANDS = TPM2_PT_FIXED + 41, -+ TPM2_PT_LIBRARY_COMMANDS = TPM2_PT_FIXED + 42, -+ TPM2_PT_VENDOR_COMMANDS = TPM2_PT_FIXED + 43, -+ TPM2_PT_NV_BUFFER_MAX = TPM2_PT_FIXED + 44, -+ TPM2_PT_MODES = TPM2_PT_FIXED + 45, -+ TPM2_PT_MAX_CAP_BUFFER = TPM2_PT_FIXED + 46, -+ TPM2_PT_VAR = TPM2_PT_GROUP * 2, -+ TPM2_PT_PERMANENT = TPM2_PT_VAR + 0, -+ TPM2_PT_STARTUP_CLEAR = TPM2_PT_VAR + 1, -+ TPM2_PT_HR_NV_INDEX = TPM2_PT_VAR + 2, -+ TPM2_PT_HR_LOADED = TPM2_PT_VAR + 3, -+ TPM2_PT_HR_LOADED_AVAIL = TPM2_PT_VAR + 4, -+ TPM2_PT_HR_ACTIVE = TPM2_PT_VAR + 5, -+ TPM2_PT_HR_ACTIVE_AVAIL = TPM2_PT_VAR + 6, -+ TPM2_PT_HR_TRANSIENT_AVAIL = TPM2_PT_VAR + 7, -+ TPM2_PT_HR_PERSISTENT = TPM2_PT_VAR + 8, -+ TPM2_PT_HR_PERSISTENT_AVAIL = TPM2_PT_VAR + 9, -+ TPM2_PT_NV_COUNTERS = TPM2_PT_VAR + 10, -+ TPM2_PT_NV_COUNTERS_AVAIL = TPM2_PT_VAR + 11, -+ TPM2_PT_ALGORITHM_SET = TPM2_PT_VAR + 12, -+ TPM2_PT_LOADED_CURVES = TPM2_PT_VAR + 13, -+ TPM2_PT_LOCKOUT_COUNTER = TPM2_PT_VAR + 14, -+ TPM2_PT_MAX_AUTH_FAIL = TPM2_PT_VAR + 15, -+ TPM2_PT_LOCKOUT_INTERVAL = TPM2_PT_VAR + 16, -+ TPM2_PT_LOCKOUT_RECOVERY = TPM2_PT_VAR + 17, -+ TPM2_PT_NV_WRITE_RECOVERY = TPM2_PT_VAR + 18, -+ TPM2_PT_AUDIT_COUNTER_0 = TPM2_PT_VAR + 19, -+ TPM2_PT_AUDIT_COUNTER_1 = TPM2_PT_VAR + 20, -+}; - - /* 128 bytes is an arbitrary cap. This could be as large as TPM_BUFSIZE - 18 - * bytes, but 128 is still a relatively large number of random bytes and diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig index dc6816d36d06..bda249068182 100644 --- a/drivers/leds/trigger/Kconfig @@ -13471,10 +8032,323 @@ index 000000000000..067eedb003b5 +MODULE_AUTHOR("Ian Pilcher "); +MODULE_LICENSE("GPL v2"); diff --git a/fs/eventpoll.c b/fs/eventpoll.c -index 64659b110973..8b5ca9f8f4bb 100644 +index 64659b110973..4cad490028ab 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c -@@ -1760,7 +1760,7 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, +@@ -57,13 +57,7 @@ + * we need a lock that will allow us to sleep. This lock is a + * mutex (ep->mtx). It is acquired during the event transfer loop, + * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file(). +- * Then we also need a global mutex to serialize eventpoll_release_file() +- * and ep_free(). +- * This mutex is acquired by ep_free() during the epoll file +- * cleanup path and it is also acquired by eventpoll_release_file() +- * if a file has been pushed inside an epoll set and it is then +- * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL). +- * It is also acquired when inserting an epoll fd onto another epoll ++ * The epmutex is acquired when inserting an epoll fd onto another epoll + * fd. We do this so that we walk the epoll tree and ensure that this + * insertion does not create a cycle of epoll file descriptors, which + * could lead to deadlock. 
We need a global mutex to prevent two +@@ -153,6 +147,13 @@ struct epitem { + /* The file descriptor information this item refers to */ + struct epoll_filefd ffd; + ++ /* ++ * Protected by file->f_lock, true for to-be-released epitem already ++ * removed from the "struct file" items list; together with ++ * eventpoll->refcount orchestrates "struct eventpoll" disposal ++ */ ++ bool dying; ++ + /* List containing poll wait queues */ + struct eppoll_entry *pwqlist; + +@@ -217,6 +218,12 @@ struct eventpoll { + u64 gen; + struct hlist_head refs; + ++ /* ++ * usage count, used together with epitem->dying to ++ * orchestrate the disposal of this struct ++ */ ++ refcount_t refcount; ++ + #ifdef CONFIG_NET_RX_BUSY_POLL + /* used to track busy poll napi_id */ + unsigned int napi_id; +@@ -240,9 +247,7 @@ struct ep_pqueue { + /* Maximum number of epoll watched descriptors, per user */ + static long max_user_watches __read_mostly; + +-/* +- * This mutex is used to serialize ep_free() and eventpoll_release_file(). +- */ ++/* Used for cycles detection */ + static DEFINE_MUTEX(epmutex); + + static u64 loop_check_gen = 0; +@@ -557,8 +562,7 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq) + + /* + * This function unregisters poll callbacks from the associated file +- * descriptor. Must be called with "mtx" held (or "epmutex" if called from +- * ep_free). ++ * descriptor. Must be called with "mtx" held. + */ + static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) + { +@@ -681,11 +685,40 @@ static void epi_rcu_free(struct rcu_head *head) + kmem_cache_free(epi_cache, epi); + } + ++static void ep_get(struct eventpoll *ep) ++{ ++ refcount_inc(&ep->refcount); ++} ++ ++/* ++ * Returns true if the event poll can be disposed ++ */ ++static bool ep_refcount_dec_and_test(struct eventpoll *ep) ++{ ++ if (!refcount_dec_and_test(&ep->refcount)) ++ return false; ++ ++ WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root)); ++ return true; ++} ++ ++static void ep_free(struct eventpoll *ep) ++{ ++ mutex_destroy(&ep->mtx); ++ free_uid(ep->user); ++ wakeup_source_unregister(ep->ws); ++ kfree(ep); ++} ++ + /* + * Removes a "struct epitem" from the eventpoll RB tree and deallocates + * all the associated resources. Must be called with "mtx" held. ++ * If the dying flag is set, do the removal only if force is true. ++ * This prevents ep_clear_and_put() from dropping all the ep references ++ * while running concurrently with eventpoll_release_file(). ++ * Returns true if the eventpoll can be disposed. 
+ */ +-static int ep_remove(struct eventpoll *ep, struct epitem *epi) ++static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) + { + struct file *file = epi->ffd.file; + struct epitems_head *to_free; +@@ -700,6 +733,11 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) + + /* Remove the current item from the list of epoll hooks */ + spin_lock(&file->f_lock); ++ if (epi->dying && !force) { ++ spin_unlock(&file->f_lock); ++ return false; ++ } ++ + to_free = NULL; + head = file->f_ep; + if (head->first == &epi->fllink && !epi->fllink.next) { +@@ -733,28 +771,28 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) + call_rcu(&epi->rcu, epi_rcu_free); + + percpu_counter_dec(&ep->user->epoll_watches); ++ return ep_refcount_dec_and_test(ep); ++} + +- return 0; ++/* ++ * ep_remove variant for callers owing an additional reference to the ep ++ */ ++static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) ++{ ++ WARN_ON_ONCE(__ep_remove(ep, epi, false)); + } + +-static void ep_free(struct eventpoll *ep) ++static void ep_clear_and_put(struct eventpoll *ep) + { + struct rb_node *rbp; + struct epitem *epi; ++ bool dispose; + + /* We need to release all tasks waiting for these file */ + if (waitqueue_active(&ep->poll_wait)) + ep_poll_safewake(ep, NULL, 0); + +- /* +- * We need to lock this because we could be hit by +- * eventpoll_release_file() while we're freeing the "struct eventpoll". +- * We do not need to hold "ep->mtx" here because the epoll file +- * is on the way to be removed and no one has references to it +- * anymore. The only hit might come from eventpoll_release_file() but +- * holding "epmutex" is sufficient here. +- */ +- mutex_lock(&epmutex); ++ mutex_lock(&ep->mtx); + + /* + * Walks through the whole tree by unregistering poll callbacks. +@@ -768,25 +806,21 @@ static void ep_free(struct eventpoll *ep) + + /* + * Walks through the whole tree by freeing each "struct epitem". At this +- * point we are sure no poll callbacks will be lingering around, and also by +- * holding "epmutex" we can be sure that no file cleanup code will hit +- * us during this operation. So we can avoid the lock on "ep->lock". +- * We do not need to lock ep->mtx, either, we only do it to prevent +- * a lockdep warning. ++ * point we are sure no poll callbacks will be lingering around. ++ * Since we still own a reference to the eventpoll struct, the loop can't ++ * dispose it. + */ +- mutex_lock(&ep->mtx); + while ((rbp = rb_first_cached(&ep->rbr)) != NULL) { + epi = rb_entry(rbp, struct epitem, rbn); +- ep_remove(ep, epi); ++ ep_remove_safe(ep, epi); + cond_resched(); + } ++ ++ dispose = ep_refcount_dec_and_test(ep); + mutex_unlock(&ep->mtx); + +- mutex_unlock(&epmutex); +- mutex_destroy(&ep->mtx); +- free_uid(ep->user); +- wakeup_source_unregister(ep->ws); +- kfree(ep); ++ if (dispose) ++ ep_free(ep); + } + + static int ep_eventpoll_release(struct inode *inode, struct file *file) +@@ -794,7 +828,7 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) + struct eventpoll *ep = file->private_data; + + if (ep) +- ep_free(ep); ++ ep_clear_and_put(ep); + + return 0; + } +@@ -906,33 +940,34 @@ void eventpoll_release_file(struct file *file) + { + struct eventpoll *ep; + struct epitem *epi; +- struct hlist_node *next; ++ bool dispose; + + /* +- * We don't want to get "file->f_lock" because it is not +- * necessary. 
It is not necessary because we're in the "struct file" +- * cleanup path, and this means that no one is using this file anymore. +- * So, for example, epoll_ctl() cannot hit here since if we reach this +- * point, the file counter already went to zero and fget() would fail. +- * The only hit might come from ep_free() but by holding the mutex +- * will correctly serialize the operation. We do need to acquire +- * "ep->mtx" after "epmutex" because ep_remove() requires it when called +- * from anywhere but ep_free(). +- * +- * Besides, ep_remove() acquires the lock, so we can't hold it here. ++ * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from ++ * touching the epitems list before eventpoll_release_file() can access ++ * the ep->mtx. + */ +- mutex_lock(&epmutex); +- if (unlikely(!file->f_ep)) { +- mutex_unlock(&epmutex); +- return; +- } +- hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) { ++again: ++ spin_lock(&file->f_lock); ++ if (file->f_ep && file->f_ep->first) { ++ epi = hlist_entry(file->f_ep->first, struct epitem, fllink); ++ epi->dying = true; ++ spin_unlock(&file->f_lock); ++ ++ /* ++ * ep access is safe as we still own a reference to the ep ++ * struct ++ */ + ep = epi->ep; +- mutex_lock_nested(&ep->mtx, 0); +- ep_remove(ep, epi); ++ mutex_lock(&ep->mtx); ++ dispose = __ep_remove(ep, epi, true); + mutex_unlock(&ep->mtx); ++ ++ if (dispose) ++ ep_free(ep); ++ goto again; + } +- mutex_unlock(&epmutex); ++ spin_unlock(&file->f_lock); + } + + static int ep_alloc(struct eventpoll **pep) +@@ -955,6 +990,7 @@ static int ep_alloc(struct eventpoll **pep) + ep->rbr = RB_ROOT_CACHED; + ep->ovflist = EP_UNACTIVE_PTR; + ep->user = user; ++ refcount_set(&ep->refcount, 1); + + *pep = ep; + +@@ -1223,10 +1259,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v + */ + list_del_init(&wait->entry); + /* +- * ->whead != NULL protects us from the race with ep_free() +- * or ep_remove(), ep_remove_wait_queue() takes whead->lock +- * held by the caller. Once we nullify it, nothing protects +- * ep/epi or even wait. ++ * ->whead != NULL protects us from the race with ++ * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue() ++ * takes whead->lock held by the caller. Once we nullify it, ++ * nothing protects ep/epi or even wait. + */ + smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); + } +@@ -1496,16 +1532,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, + if (tep) + mutex_unlock(&tep->mtx); + ++ /* ++ * ep_remove_safe() calls in the later error paths can't lead to ++ * ep_free() as the ep file itself still holds an ep reference. ++ */ ++ ep_get(ep); ++ + /* now check if we've created too many backpaths */ + if (unlikely(full_check && reverse_path_check())) { +- ep_remove(ep, epi); ++ ep_remove_safe(ep, epi); + return -EINVAL; + } + + if (epi->event.events & EPOLLWAKEUP) { + error = ep_create_wakeup_source(epi); + if (error) { +- ep_remove(ep, epi); ++ ep_remove_safe(ep, epi); + return error; + } + } +@@ -1529,7 +1571,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, + * high memory pressure. 
+ */ + if (unlikely(!epq.epi)) { +- ep_remove(ep, epi); ++ ep_remove_safe(ep, epi); + return -ENOMEM; + } + +@@ -1760,7 +1802,7 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, { int ret = default_wake_function(wq_entry, mode, sync, key); @@ -13483,8 +8357,37 @@ index 64659b110973..8b5ca9f8f4bb 100644 return ret; } +@@ -2025,7 +2067,7 @@ static int do_epoll_create(int flags) + out_free_fd: + put_unused_fd(fd); + out_free_ep: +- ep_free(ep); ++ ep_clear_and_put(ep); + return error; + } + +@@ -2167,10 +2209,16 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, + error = -EEXIST; + break; + case EPOLL_CTL_DEL: +- if (epi) +- error = ep_remove(ep, epi); +- else ++ if (epi) { ++ /* ++ * The eventpoll itself is still alive: the refcount ++ * can't go to zero here. ++ */ ++ ep_remove_safe(ep, epi); ++ error = 0; ++ } else { + error = -ENOENT; ++ } + break; + case EPOLL_CTL_MOD: + if (epi) { diff --git a/fs/proc/base.c b/fs/proc/base.c -index 9e479d7d202b..ac9ebe972be0 100644 +index 5e0e0ccd47aa..07463ad4a70a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3207,6 +3207,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, @@ -13496,10 +8399,10 @@ index 9e479d7d202b..ac9ebe972be0 100644 } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index 9757067c3053..d853e1c8a581 100644 +index a57e6ae78e65..22b2ac82bffd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h -@@ -776,7 +776,7 @@ struct mm_struct { +@@ -740,7 +740,7 @@ struct mm_struct { #ifdef CONFIG_KSM /* * Represent how many pages of this process are involved in KSM @@ -13508,7 +8411,7 @@ index 9757067c3053..d853e1c8a581 100644 */ unsigned long ksm_merging_pages; /* -@@ -784,6 +784,11 @@ struct mm_struct { +@@ -748,6 +748,11 @@ struct mm_struct { * including merged and not merged. */ unsigned long ksm_rmap_items; @@ -13567,29 +8470,6 @@ index 8f69772af77b..42163c9e94e5 100644 return sysfs_create_bin_file(kernel_kobj, &kheaders_attr); } -diff --git a/kernel/kthread.c b/kernel/kthread.c -index f97fd01a2932..7e6751b29101 100644 ---- a/kernel/kthread.c -+++ b/kernel/kthread.c -@@ -1382,6 +1382,10 @@ EXPORT_SYMBOL_GPL(kthread_flush_worker); - * Flush and destroy @worker. The simple flush is enough because the kthread - * worker API is used only in trivial scenarios. There are no multi-step state - * machines needed. -+ * -+ * Note that this function is not responsible for handling delayed work, so -+ * caller should be responsible for queuing or canceling all delayed work items -+ * before invoke this function. 
- */ - void kthread_destroy_worker(struct kthread_worker *worker) - { -@@ -1393,6 +1397,7 @@ void kthread_destroy_worker(struct kthread_worker *worker) - - kthread_flush_worker(worker); - kthread_stop(task); -+ WARN_ON(!list_empty(&worker->delayed_work_list)); - WARN_ON(!list_empty(&worker->work_list)); - kfree(worker); - } diff --git a/kernel/padata.c b/kernel/padata.c index e007b8a4b738..7c80301ab084 100644 --- a/kernel/padata.c @@ -13612,214 +8492,8 @@ index e007b8a4b738..7c80301ab084 100644 { struct padata_work *pw = container_of(w, struct padata_work, pw_work); struct padata_mt_job_state *ps = pw->pw_data; -diff --git a/lib/string.c b/lib/string.c -index 4746a98b153e..6b7cf32b4e54 100644 ---- a/lib/string.c -+++ b/lib/string.c -@@ -480,13 +480,11 @@ EXPORT_SYMBOL(strcspn); - */ - char *strpbrk(const char *cs, const char *ct) - { -- const char *sc1, *sc2; -+ const char *sc; - -- for (sc1 = cs; *sc1 != '\0'; ++sc1) { -- for (sc2 = ct; *sc2 != '\0'; ++sc2) { -- if (*sc1 == *sc2) -- return (char *)sc1; -- } -+ for (sc = cs; *sc != '\0'; ++sc) { -+ if (strchr(ct, *sc)) -+ return (char *)sc; - } - return NULL; - } -diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c -index 89b269a641c7..60958afebc41 100644 ---- a/lib/zstd/decompress/huf_decompress.c -+++ b/lib/zstd/decompress/huf_decompress.c -@@ -985,7 +985,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 - - static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, - const sortedSymbol_t* sortedList, -- const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight, -+ const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, - const U32 nbBitsBaseline) - { - U32* const rankVal = rankValOrigin[0]; -diff --git a/mm/compaction.c b/mm/compaction.c -index d0b16a5b30f7..3613d7f174dc 100644 ---- a/mm/compaction.c -+++ b/mm/compaction.c -@@ -122,7 +122,6 @@ bool PageMovable(struct page *page) - - return false; - } --EXPORT_SYMBOL(PageMovable); - - void __SetPageMovable(struct page *page, const struct movable_operations *mops) - { -@@ -1102,12 +1101,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, - - /* - * Avoid isolating too much unless this block is being -- * rescanned (e.g. dirty/writeback pages, parallel allocation) -+ * fully scanned (e.g. dirty/writeback pages, parallel allocation) - * or a lock is contended. For contention, isolate quickly to - * potentially remove one source of contention. - */ - if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX && -- !cc->rescan && !cc->contended) { -+ !cc->finish_pageblock && !cc->contended) { - ++low_pfn; - break; - } -@@ -1172,14 +1171,14 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, - } - - /* -- * Updated the cached scanner pfn once the pageblock has been scanned -+ * Update the cached scanner pfn once the pageblock has been scanned. - * Pages will either be migrated in which case there is no point - * scanning in the near future or migration failed in which case the - * failure reason may persist. The block is marked for skipping if - * there were no pages isolated in the block or if the block is - * rescanned twice in a row. 
- */ -- if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) { -+ if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) { - if (valid_page && !skip_updated) - set_pageblock_skip(valid_page); - update_cached_migrate(cc, low_pfn); -@@ -1762,6 +1761,13 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) - if (cc->ignore_skip_hint) - return pfn; - -+ /* -+ * If the pageblock should be finished then do not select a different -+ * pageblock. -+ */ -+ if (cc->finish_pageblock) -+ return pfn; -+ - /* - * If the migrate_pfn is not at the start of a zone or the start - * of a pageblock then assume this is a continuation of a previous -@@ -1839,7 +1845,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) - pfn = cc->zone->zone_start_pfn; - cc->fast_search_fail = 0; - found_block = true; -- set_pageblock_skip(freepage); - break; - } - } -@@ -2375,19 +2380,20 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) - unsigned long iteration_start_pfn = cc->migrate_pfn; - - /* -- * Avoid multiple rescans which can happen if a page cannot be -- * isolated (dirty/writeback in async mode) or if the migrated -- * pages are being allocated before the pageblock is cleared. -- * The first rescan will capture the entire pageblock for -- * migration. If it fails, it'll be marked skip and scanning -- * will proceed as normal. -+ * Avoid multiple rescans of the same pageblock which can -+ * happen if a page cannot be isolated (dirty/writeback in -+ * async mode) or if the migrated pages are being allocated -+ * before the pageblock is cleared. The first rescan will -+ * capture the entire pageblock for migration. If it fails, -+ * it'll be marked skip and scanning will proceed as normal. - */ -- cc->rescan = false; -+ cc->finish_pageblock = false; - if (pageblock_start_pfn(last_migrated_pfn) == - pageblock_start_pfn(iteration_start_pfn)) { -- cc->rescan = true; -+ cc->finish_pageblock = true; - } - -+rescan: - switch (isolate_migratepages(cc)) { - case ISOLATE_ABORT: - ret = COMPACT_CONTENDED; -@@ -2430,18 +2436,37 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) - goto out; - } - /* -- * We failed to migrate at least one page in the current -- * order-aligned block, so skip the rest of it. -+ * If an ASYNC or SYNC_LIGHT fails to migrate a page -+ * within the current order-aligned block, scan the -+ * remainder of the pageblock. This will mark the -+ * pageblock "skip" to avoid rescanning in the near -+ * future. This will isolate more pages than necessary -+ * for the request but avoid loops due to -+ * fast_find_migrateblock revisiting blocks that were -+ * recently partially scanned. - */ -- if (cc->direct_compaction && -- (cc->mode == MIGRATE_ASYNC)) { -- cc->migrate_pfn = block_end_pfn( -- cc->migrate_pfn - 1, cc->order); -- /* Draining pcplists is useless in this case */ -- last_migrated_pfn = 0; -+ if (cc->direct_compaction && !cc->finish_pageblock && -+ (cc->mode < MIGRATE_SYNC)) { -+ cc->finish_pageblock = true; -+ -+ /* -+ * Draining pcplists does not help THP if -+ * any page failed to migrate. Even after -+ * drain, the pageblock will not be free. 
-+ */ -+ if (cc->order == COMPACTION_HPAGE_ORDER) -+ last_migrated_pfn = 0; -+ -+ goto rescan; - } - } - -+ /* Stop if a page has been captured */ -+ if (capc && capc->page) { -+ ret = COMPACT_SUCCESS; -+ break; -+ } -+ - check_drain: - /* - * Has the migration scanner moved away from the previous -@@ -2460,12 +2485,6 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) - last_migrated_pfn = 0; - } - } -- -- /* Stop if a page has been captured */ -- if (capc && capc->page) { -- ret = COMPACT_SUCCESS; -- break; -- } - } - - out: -diff --git a/mm/internal.h b/mm/internal.h -index bcf75a8b032d..21466d0ab22f 100644 ---- a/mm/internal.h -+++ b/mm/internal.h -@@ -422,7 +422,11 @@ struct compact_control { - bool proactive_compaction; /* kcompactd proactive compaction */ - bool whole_zone; /* Whole zone should/has been scanned */ - bool contended; /* Signal lock contention */ -- bool rescan; /* Rescanning the same pageblock */ -+ bool finish_pageblock; /* Scan the remainder of a pageblock. Used -+ * when there are potentially transient -+ * isolation or migration failures to -+ * ensure forward progress. -+ */ - bool alloc_contig; /* alloc_contig_range allocation */ - }; - diff --git a/mm/ksm.c b/mm/ksm.c -index a92c9594a2d3..ee60890cf9b1 100644 +index 2b8d30068cbb..82029f1d454b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -214,6 +214,7 @@ struct ksm_rmap_item { @@ -13974,33 +8648,7 @@ index a92c9594a2d3..ee60890cf9b1 100644 } return err; } -@@ -988,9 +1038,15 @@ static int unmerge_and_remove_all_rmap_items(void) - - mm = mm_slot->slot.mm; - mmap_read_lock(mm); -+ -+ /* -+ * Exit right away if mm is exiting to avoid lockdep issue in -+ * the maple tree -+ */ -+ if (ksm_test_exit(mm)) -+ goto mm_exiting; -+ - for_each_vma(vmi, vma) { -- if (ksm_test_exit(mm)) -- break; - if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) - continue; - err = unmerge_ksm_pages(vma, -@@ -999,6 +1055,7 @@ static int unmerge_and_remove_all_rmap_items(void) - goto error; - } - -+mm_exiting: - remove_trailing_rmap_items(&mm_slot->rmap_list); - mmap_read_unlock(mm); - -@@ -2044,6 +2101,42 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, +@@ -2050,6 +2100,42 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, rmap_item->mm->ksm_merging_pages++; } @@ -14043,7 +8691,7 @@ index a92c9594a2d3..ee60890cf9b1 100644 /* * cmp_and_merge_page - first see if page can be merged into the stable tree; * if not, compare checksum to previous and if it's the same, see if page can -@@ -2055,7 +2148,6 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, +@@ -2061,7 +2147,6 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, */ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { @@ -14051,7 +8699,7 @@ index a92c9594a2d3..ee60890cf9b1 100644 struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct ksm_stable_node *stable_node; -@@ -2092,6 +2184,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite +@@ -2098,6 +2183,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite } remove_rmap_item_from_tree(rmap_item); @@ -14059,7 +8707,7 @@ index a92c9594a2d3..ee60890cf9b1 100644 if (kpage) { if (PTR_ERR(kpage) == -EBUSY) -@@ -2128,29 +2221,16 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite +@@ -2134,29 +2220,16 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite * Same checksum as an empty 
page. We attempt to merge it with the * appropriate zero page if the user enabled this via sysfs. */ @@ -14096,7 +8744,7 @@ index a92c9594a2d3..ee60890cf9b1 100644 tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { -@@ -2214,23 +2294,39 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite +@@ -2220,23 +2293,39 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite } } @@ -14142,7 +8790,7 @@ index a92c9594a2d3..ee60890cf9b1 100644 rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ -@@ -2337,6 +2433,22 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) +@@ -2343,6 +2432,22 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) } if (is_zone_device_page(*page)) goto next_page; @@ -14165,7 +8813,7 @@ index a92c9594a2d3..ee60890cf9b1 100644 if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); -@@ -3138,6 +3250,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj, +@@ -3139,6 +3244,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); @@ -14179,7 +8827,7 @@ index a92c9594a2d3..ee60890cf9b1 100644 static ssize_t stable_node_dups_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { -@@ -3193,6 +3312,7 @@ static struct attribute *ksm_attrs[] = { +@@ -3194,6 +3306,7 @@ static struct attribute *ksm_attrs[] = { &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, @@ -14187,108 +8835,6 @@ index a92c9594a2d3..ee60890cf9b1 100644 &full_scans_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, -diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 3bb3484563ed..3aec9a6a9cb7 100644 ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -3119,6 +3119,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, - { - unsigned long flags; - int i, allocated = 0; -+ struct list_head *prev_tail = list->prev; -+ struct page *pos, *n; - - spin_lock_irqsave(&zone->lock, flags); - for (i = 0; i < count; ++i) { -@@ -3127,9 +3129,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, - if (unlikely(page == NULL)) - break; - -- if (unlikely(check_pcp_refill(page, order))) -- continue; -- - /* - * Split buddy pages returned by expand() are received here in - * physical page order. The page is added to the tail of -@@ -3141,7 +3140,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, - * pages are ordered properly. - */ - list_add_tail(&page->pcp_list, list); -- allocated++; - if (is_migrate_cma(get_pcppage_migratetype(page))) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, - -(1 << order)); -@@ -3155,6 +3153,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, - */ - __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); - spin_unlock_irqrestore(&zone->lock, flags); -+ -+ /* -+ * Pages are appended to the pcp list without checking to reduce the -+ * time holding the zone lock. Checking the appended pages happens right -+ * after the critical section while still holding the pcp lock. 
-+ */ -+ pos = list_first_entry(prev_tail, struct page, pcp_list); -+ list_for_each_entry_safe_from(pos, n, list, pcp_list) { -+ if (unlikely(check_pcp_refill(pos, order))) { -+ list_del(&pos->pcp_list); -+ continue; -+ } -+ -+ allocated++; -+ } -+ - return allocated; - } - -diff --git a/mm/z3fold.c b/mm/z3fold.c -index a4de0c317ac7..0cef845d397b 100644 ---- a/mm/z3fold.c -+++ b/mm/z3fold.c -@@ -1450,7 +1450,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) - struct z3fold_header *zhdr; - struct z3fold_pool *pool; - -- VM_BUG_ON_PAGE(!PageMovable(page), page); - VM_BUG_ON_PAGE(PageIsolated(page), page); - - if (test_bit(PAGE_HEADLESS, &page->private)) -@@ -1490,7 +1489,6 @@ static int z3fold_page_migrate(struct page *newpage, struct page *page, - struct z3fold_header *zhdr, *new_zhdr; - struct z3fold_pool *pool; - -- VM_BUG_ON_PAGE(!PageMovable(page), page); - VM_BUG_ON_PAGE(!PageIsolated(page), page); - VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page); - VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); -diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c -index 702bc3fd687a..9d27d9b00bce 100644 ---- a/mm/zsmalloc.c -+++ b/mm/zsmalloc.c -@@ -2056,7 +2056,6 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) - * Page is locked so zspage couldn't be destroyed. For detail, look at - * lock_zspage in free_zspage. - */ -- VM_BUG_ON_PAGE(!PageMovable(page), page); - VM_BUG_ON_PAGE(PageIsolated(page), page); - - zspage = get_zspage(page); -@@ -2088,7 +2087,6 @@ static int zs_page_migrate(struct page *newpage, struct page *page, - if (mode == MIGRATE_SYNC_NO_COPY) - return -EINVAL; - -- VM_BUG_ON_PAGE(!PageMovable(page), page); - VM_BUG_ON_PAGE(!PageIsolated(page), page); - - /* The page is locked, so this pointer must remain valid */ -@@ -2153,7 +2151,6 @@ static void zs_page_putback(struct page *page) - { - struct zspage *zspage; - -- VM_BUG_ON_PAGE(!PageMovable(page), page); - VM_BUG_ON_PAGE(!PageIsolated(page), page); - - zspage = get_zspage(page); diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o index 0edfdb40364b..ae52d3b3f063 100644 --- a/scripts/Makefile.vmlinux_o @@ -14303,10 +8849,10 @@ index 0edfdb40364b..ae52d3b3f063 100644 targets := .tmp_initcalls.lds diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c -index f7815ee24f83..e94b0a6b96df 100644 +index 75020edd39e7..e4455220e9fd 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c -@@ -1240,7 +1240,7 @@ static int cs35l41_no_acpi_dsd(struct cs35l41_hda *cs35l41, struct device *physd +@@ -1239,7 +1239,7 @@ static int cs35l41_no_acpi_dsd(struct cs35l41_hda *cs35l41, struct device *physd if (strncmp(hid, "CLSA0100", 8) == 0) { hw_cfg->bst_type = CS35L41_EXT_BOOST_NO_VSPK_SWITCH; @@ -14315,10 +8861,10 @@ index f7815ee24f83..e94b0a6b96df 100644 hw_cfg->bst_type = CS35L41_EXT_BOOST; hw_cfg->gpio1.func = CS35l41_VSPK_SWITCH; hw_cfg->gpio1.valid = true; -diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/vm/ksm_functional_tests.c -index b11b7e5115dc..3033cd6ed3b4 100644 ---- a/tools/testing/selftests/vm/ksm_functional_tests.c -+++ b/tools/testing/selftests/vm/ksm_functional_tests.c +diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c +index d8b5b4930412..05048ebc24d8 100644 +--- a/tools/testing/selftests/mm/ksm_functional_tests.c ++++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -24,9 +24,12 @@ #define KiB 1024u @@ -14468,12223 
+9014,25 @@ index b11b7e5115dc..3033cd6ed3b4 100644 #ifdef __NR_userfaultfd test_unmerge_uffd_wp(); -- -2.40.0.rc2 +2.40.0 -From 50de9c32a97f479390ff525d679f224e1ceb8e3b Mon Sep 17 00:00:00 2001 +From 57f8b594e6808d5ecc244928f704f66249dd9bba Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 3 Mar 2023 16:59:32 +0100 -Subject: [PATCH 07/16] fs-patches +Date: Mon, 6 Mar 2023 18:45:25 +0100 +Subject: [PATCH 05/10] Implement amd-pstate guided driver Signed-off-by: Peter Jung --- - Documentation/admin-guide/xfs.rst | 2 +- - block/blk-merge.c | 3 +- - fs/btrfs/Makefile | 6 +- - fs/btrfs/backref.c | 33 +- - fs/btrfs/bio.c | 557 ++++++++++++++++++++--- - fs/btrfs/bio.h | 67 +-- - fs/btrfs/block-group.c | 276 ++++++++++-- - fs/btrfs/block-group.h | 24 +- - fs/btrfs/btrfs_inode.h | 23 +- - fs/btrfs/compression.c | 276 ++---------- - fs/btrfs/compression.h | 3 - - fs/btrfs/ctree.c | 62 ++- - fs/btrfs/ctree.h | 15 + - fs/btrfs/defrag.c | 4 +- - fs/btrfs/delayed-ref.c | 24 +- - fs/btrfs/delayed-ref.h | 2 +- - fs/btrfs/disk-io.c | 222 +--------- - fs/btrfs/disk-io.h | 14 +- - fs/btrfs/extent-io-tree.c | 10 +- - fs/btrfs/extent-io-tree.h | 1 - - fs/btrfs/extent-tree.c | 181 +++----- - fs/btrfs/extent-tree.h | 81 ++++ - fs/btrfs/extent_io.c | 582 +++--------------------- - fs/btrfs/extent_io.h | 36 +- - fs/btrfs/file-item.c | 72 ++- - fs/btrfs/file-item.h | 8 +- - fs/btrfs/file.c | 13 +- - fs/btrfs/free-space-tree.c | 2 +- - fs/btrfs/fs.h | 5 +- - fs/btrfs/inode.c | 715 ++++++------------------------ - fs/btrfs/ioctl.c | 2 +- - fs/btrfs/lru_cache.c | 166 +++++++ - fs/btrfs/lru_cache.h | 80 ++++ - fs/btrfs/lzo.c | 2 +- - fs/btrfs/messages.c | 30 -- - fs/btrfs/messages.h | 34 -- - fs/btrfs/ordered-data.c | 71 ++- - fs/btrfs/ordered-data.h | 10 +- - fs/btrfs/qgroup.c | 2 +- - fs/btrfs/raid56.c | 334 +++++--------- - fs/btrfs/raid56.h | 4 +- - fs/btrfs/relocation.c | 2 +- - fs/btrfs/scrub.c | 2 +- - fs/btrfs/send.c | 684 ++++++++++++++-------------- - fs/btrfs/super.c | 3 +- - fs/btrfs/sysfs.c | 12 +- - fs/btrfs/tests/extent-map-tests.c | 2 +- - fs/btrfs/transaction.c | 29 ++ - fs/btrfs/transaction.h | 31 ++ - fs/btrfs/tree-log.c | 87 ++-- - fs/btrfs/tree-log.h | 9 +- - fs/btrfs/volumes.c | 116 ++--- - fs/btrfs/volumes.h | 18 - - fs/btrfs/zoned.c | 146 +++--- - fs/btrfs/zoned.h | 20 +- - fs/ext4/extents.c | 2 +- - fs/ext4/file.c | 34 +- - fs/ext4/inode.c | 429 ++++++------------ - fs/ext4/ioctl.c | 3 - - fs/ext4/namei.c | 11 +- - fs/ext4/page-io.c | 10 +- - fs/ext4/super.c | 26 +- - fs/ext4/xattr.c | 137 ++++-- - fs/gfs2/bmap.c | 38 +- - fs/iomap/buffered-io.c | 91 ++-- - fs/iomap/direct-io.c | 10 +- - fs/xfs/libxfs/xfs_alloc.c | 32 +- - fs/xfs/libxfs/xfs_bmap.c | 32 +- - fs/xfs/libxfs/xfs_bmap.h | 5 +- - fs/xfs/libxfs/xfs_btree.c | 18 +- - fs/xfs/libxfs/xfs_refcount.c | 96 ++-- - fs/xfs/libxfs/xfs_refcount.h | 4 +- - fs/xfs/libxfs/xfs_rmap.c | 50 +-- - fs/xfs/libxfs/xfs_rmap.h | 6 +- - fs/xfs/xfs_bmap_item.c | 137 +++--- - fs/xfs/xfs_error.c | 2 +- - fs/xfs/xfs_error.h | 12 +- - fs/xfs/xfs_extfree_item.c | 99 +++-- - fs/xfs/xfs_fsmap.c | 1 + - fs/xfs/xfs_globals.c | 3 +- - fs/xfs/xfs_iomap.c | 4 +- - fs/xfs/xfs_refcount_item.c | 110 +++-- - fs/xfs/xfs_rmap_item.c | 142 +++--- - fs/xfs/xfs_sysfs.c | 12 +- - fs/xfs/xfs_sysfs.h | 10 +- - fs/xfs/xfs_trace.h | 15 +- - include/linux/bio.h | 4 + - include/linux/iomap.h | 30 +- - include/trace/events/btrfs.h | 127 +++++- - include/trace/events/ext4.h | 7 - - 90 files changed, 3213 insertions(+), 3751 deletions(-) - create mode 100644 fs/btrfs/lru_cache.c - 
create mode 100644 fs/btrfs/lru_cache.h - -diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst -index 8de008c0c5ad..e2561416391c 100644 ---- a/Documentation/admin-guide/xfs.rst -+++ b/Documentation/admin-guide/xfs.rst -@@ -296,7 +296,7 @@ The following sysctls are available for the XFS filesystem: - XFS_ERRLEVEL_LOW: 1 - XFS_ERRLEVEL_HIGH: 5 - -- fs.xfs.panic_mask (Min: 0 Default: 0 Max: 256) -+ fs.xfs.panic_mask (Min: 0 Default: 0 Max: 511) - Causes certain error conditions to call BUG(). Value is a bitmask; - OR together the tags which represent errors which should cause panics: - -diff --git a/block/blk-merge.c b/block/blk-merge.c -index 808b58129d3e..1ac782fdc55c 100644 ---- a/block/blk-merge.c -+++ b/block/blk-merge.c -@@ -276,7 +276,7 @@ static bool bvec_split_segs(const struct queue_limits *lim, - * responsible for ensuring that @bs is only destroyed after processing of the - * split bio has finished. - */ --static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, -+struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, struct bio_set *bs, unsigned max_bytes) - { - struct bio_vec bv, bvprv, *bvprvp = NULL; -@@ -336,6 +336,7 @@ static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, - bio_clear_polled(bio); - return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs); - } -+EXPORT_SYMBOL_GPL(bio_split_rw); - - /** - * __bio_split_to_limits - split a bio to fit the queue limits -diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile -index 555c962fdad6..90d53209755b 100644 ---- a/fs/btrfs/Makefile -+++ b/fs/btrfs/Makefile -@@ -11,7 +11,8 @@ condflags := \ - $(call cc-option, -Wunused-but-set-variable) \ - $(call cc-option, -Wunused-const-variable) \ - $(call cc-option, -Wpacked-not-aligned) \ -- $(call cc-option, -Wstringop-truncation) -+ $(call cc-option, -Wstringop-truncation) \ -+ $(call cc-option, -Wmaybe-uninitialized) - subdir-ccflags-y += $(condflags) - # The following turn off the warnings enabled by -Wextra - subdir-ccflags-y += -Wno-missing-field-initializers -@@ -31,7 +32,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ - backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ - block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ -- subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o -+ subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \ -+ lru_cache.o - - btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o - btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o -diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c -index 46851511b661..90e40d5ceccd 100644 ---- a/fs/btrfs/backref.c -+++ b/fs/btrfs/backref.c -@@ -1252,8 +1252,12 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct - struct btrfs_root *root, - u64 bytenr, int level, bool *is_shared) - { -+ const struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_backref_shared_cache_entry *entry; - -+ if (!current->journal_info) -+ lockdep_assert_held(&fs_info->commit_root_sem); -+ - if (!ctx->use_path_cache) - return false; - -@@ -1288,7 +1292,7 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct - * could be a snapshot sharing this extent buffer. 
- */ - if (entry->is_shared && -- entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) -+ entry->gen != btrfs_get_last_root_drop_gen(fs_info)) - return false; - - *is_shared = entry->is_shared; -@@ -1318,9 +1322,13 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx - struct btrfs_root *root, - u64 bytenr, int level, bool is_shared) - { -+ const struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_backref_shared_cache_entry *entry; - u64 gen; - -+ if (!current->journal_info) -+ lockdep_assert_held(&fs_info->commit_root_sem); -+ - if (!ctx->use_path_cache) - return; - -@@ -1336,7 +1344,7 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx - ASSERT(level >= 0); - - if (is_shared) -- gen = btrfs_get_last_root_drop_gen(root->fs_info); -+ gen = btrfs_get_last_root_drop_gen(fs_info); - else - gen = btrfs_root_last_snapshot(&root->root_item); - -@@ -1864,6 +1872,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, - .have_delayed_delete_refs = false, - }; - int level; -+ bool leaf_cached; -+ bool leaf_is_shared; - - for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) { - if (ctx->prev_extents_cache[i].bytenr == bytenr) -@@ -1885,6 +1895,23 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, - walk_ctx.time_seq = elem.seq; - } - -+ ctx->use_path_cache = true; -+ -+ /* -+ * We may have previously determined that the current leaf is shared. -+ * If it is, then we have a data extent that is shared due to a shared -+ * subtree (caused by snapshotting) and we don't need to check for data -+ * backrefs. If the leaf is not shared, then we must do backref walking -+ * to determine if the data extent is shared through reflinks. -+ */ -+ leaf_cached = lookup_backref_shared_cache(ctx, root, -+ ctx->curr_leaf_bytenr, 0, -+ &leaf_is_shared); -+ if (leaf_cached && leaf_is_shared) { -+ ret = 1; -+ goto out_trans; -+ } -+ - walk_ctx.ignore_extent_item_pos = true; - walk_ctx.trans = trans; - walk_ctx.fs_info = fs_info; -@@ -1893,7 +1920,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, - /* -1 means we are in the bytenr of the data extent. */ - level = -1; - ULIST_ITER_INIT(&uiter); -- ctx->use_path_cache = true; - while (1) { - bool is_shared; - bool cached; -@@ -1964,6 +1990,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, - ctx->prev_extents_cache_slot = slot; - } - -+out_trans: - if (trans) { - btrfs_put_tree_mod_seq(fs_info, &elem); - btrfs_end_transaction(trans); -diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c -index 8affc88b0e0a..d8b90f95b157 100644 ---- a/fs/btrfs/bio.c -+++ b/fs/btrfs/bio.c -@@ -14,19 +14,31 @@ - #include "dev-replace.h" - #include "rcu-string.h" - #include "zoned.h" -+#include "file-item.h" - - static struct bio_set btrfs_bioset; -+static struct bio_set btrfs_clone_bioset; -+static struct bio_set btrfs_repair_bioset; -+static mempool_t btrfs_failed_bio_pool; -+ -+struct btrfs_failed_bio { -+ struct btrfs_bio *bbio; -+ int num_copies; -+ atomic_t repair_count; -+}; - - /* - * Initialize a btrfs_bio structure. This skips the embedded bio itself as it - * is already initialized by the block layer. 
- */ --static inline void btrfs_bio_init(struct btrfs_bio *bbio, -- btrfs_bio_end_io_t end_io, void *private) -+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, -+ btrfs_bio_end_io_t end_io, void *private) - { - memset(bbio, 0, offsetof(struct btrfs_bio, bio)); -+ bbio->inode = inode; - bbio->end_io = end_io; - bbio->private = private; -+ atomic_set(&bbio->pending_ios, 1); - } - - /* -@@ -37,32 +49,235 @@ static inline void btrfs_bio_init(struct btrfs_bio *bbio, - * a mempool. - */ - struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, -+ struct btrfs_inode *inode, - btrfs_bio_end_io_t end_io, void *private) - { - struct bio *bio; - - bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); -- btrfs_bio_init(btrfs_bio(bio), end_io, private); -+ btrfs_bio_init(btrfs_bio(bio), inode, end_io, private); - return bio; - } - --struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, -- btrfs_bio_end_io_t end_io, void *private) -+static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, -+ struct bio *orig, u64 map_length, -+ bool use_append) - { -+ struct btrfs_bio *orig_bbio = btrfs_bio(orig); - struct bio *bio; -- struct btrfs_bio *bbio; - -- ASSERT(offset <= UINT_MAX && size <= UINT_MAX); -+ if (use_append) { -+ unsigned int nr_segs; -+ -+ bio = bio_split_rw(orig, &fs_info->limits, &nr_segs, -+ &btrfs_clone_bioset, map_length); -+ } else { -+ bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS, -+ &btrfs_clone_bioset); -+ } -+ btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio); - -- bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); -- bbio = btrfs_bio(bio); -- btrfs_bio_init(bbio, end_io, private); -+ btrfs_bio(bio)->file_offset = orig_bbio->file_offset; -+ if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED)) -+ orig_bbio->file_offset += map_length; - -- bio_trim(bio, offset >> 9, size >> 9); -- bbio->iter = bio->bi_iter; -+ atomic_inc(&orig_bbio->pending_ios); - return bio; - } - -+static void btrfs_orig_write_end_io(struct bio *bio); -+ -+static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, -+ struct btrfs_bio *orig_bbio) -+{ -+ /* -+ * For writes we tolerate nr_mirrors - 1 write failures, so we can't -+ * just blindly propagate a write failure here. Instead increment the -+ * error count in the original I/O context so that it is guaranteed to -+ * be larger than the error tolerance. 
-+ */ -+ if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { -+ struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; -+ struct btrfs_io_context *orig_bioc = orig_stripe->bioc; -+ -+ atomic_add(orig_bioc->max_errors, &orig_bioc->error); -+ } else { -+ orig_bbio->bio.bi_status = bbio->bio.bi_status; -+ } -+} -+ -+static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) -+{ -+ if (bbio->bio.bi_pool == &btrfs_clone_bioset) { -+ struct btrfs_bio *orig_bbio = bbio->private; -+ -+ if (bbio->bio.bi_status) -+ btrfs_bbio_propagate_error(bbio, orig_bbio); -+ bio_put(&bbio->bio); -+ bbio = orig_bbio; -+ } -+ -+ if (atomic_dec_and_test(&bbio->pending_ios)) -+ bbio->end_io(bbio); -+} -+ -+static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) -+{ -+ if (cur_mirror == fbio->num_copies) -+ return cur_mirror + 1 - fbio->num_copies; -+ return cur_mirror + 1; -+} -+ -+static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) -+{ -+ if (cur_mirror == 1) -+ return fbio->num_copies; -+ return cur_mirror - 1; -+} -+ -+static void btrfs_repair_done(struct btrfs_failed_bio *fbio) -+{ -+ if (atomic_dec_and_test(&fbio->repair_count)) { -+ btrfs_orig_bbio_end_io(fbio->bbio); -+ mempool_free(fbio, &btrfs_failed_bio_pool); -+ } -+} -+ -+static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, -+ struct btrfs_device *dev) -+{ -+ struct btrfs_failed_bio *fbio = repair_bbio->private; -+ struct btrfs_inode *inode = repair_bbio->inode; -+ struct btrfs_fs_info *fs_info = inode->root->fs_info; -+ struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); -+ int mirror = repair_bbio->mirror_num; -+ -+ if (repair_bbio->bio.bi_status || -+ !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { -+ bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); -+ repair_bbio->bio.bi_iter = repair_bbio->saved_iter; -+ -+ mirror = next_repair_mirror(fbio, mirror); -+ if (mirror == fbio->bbio->mirror_num) { -+ btrfs_debug(fs_info, "no mirror left"); -+ fbio->bbio->bio.bi_status = BLK_STS_IOERR; -+ goto done; -+ } -+ -+ btrfs_submit_bio(&repair_bbio->bio, mirror); -+ return; -+ } -+ -+ do { -+ mirror = prev_repair_mirror(fbio, mirror); -+ btrfs_repair_io_failure(fs_info, btrfs_ino(inode), -+ repair_bbio->file_offset, fs_info->sectorsize, -+ repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, -+ bv->bv_page, bv->bv_offset, mirror); -+ } while (mirror != fbio->bbio->mirror_num); -+ -+done: -+ btrfs_repair_done(fbio); -+ bio_put(&repair_bbio->bio); -+} -+ -+/* -+ * Try to kick off a repair read to the next available mirror for a bad sector. -+ * -+ * This primarily tries to recover good data to serve the actual read request, -+ * but also tries to write the good data back to the bad mirror(s) when a -+ * read succeeded to restore the redundancy. 
-+ */ -+static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, -+ u32 bio_offset, -+ struct bio_vec *bv, -+ struct btrfs_failed_bio *fbio) -+{ -+ struct btrfs_inode *inode = failed_bbio->inode; -+ struct btrfs_fs_info *fs_info = inode->root->fs_info; -+ const u32 sectorsize = fs_info->sectorsize; -+ const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); -+ struct btrfs_bio *repair_bbio; -+ struct bio *repair_bio; -+ int num_copies; -+ int mirror; -+ -+ btrfs_debug(fs_info, "repair read error: read error at %llu", -+ failed_bbio->file_offset + bio_offset); -+ -+ num_copies = btrfs_num_copies(fs_info, logical, sectorsize); -+ if (num_copies == 1) { -+ btrfs_debug(fs_info, "no copy to repair from"); -+ failed_bbio->bio.bi_status = BLK_STS_IOERR; -+ return fbio; -+ } -+ -+ if (!fbio) { -+ fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS); -+ fbio->bbio = failed_bbio; -+ fbio->num_copies = num_copies; -+ atomic_set(&fbio->repair_count, 1); -+ } -+ -+ atomic_inc(&fbio->repair_count); -+ -+ repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, -+ &btrfs_repair_bioset); -+ repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; -+ bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); -+ -+ repair_bbio = btrfs_bio(repair_bio); -+ btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio); -+ repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; -+ -+ mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); -+ btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); -+ btrfs_submit_bio(repair_bio, mirror); -+ return fbio; -+} -+ -+static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev) -+{ -+ struct btrfs_inode *inode = bbio->inode; -+ struct btrfs_fs_info *fs_info = inode->root->fs_info; -+ u32 sectorsize = fs_info->sectorsize; -+ struct bvec_iter *iter = &bbio->saved_iter; -+ blk_status_t status = bbio->bio.bi_status; -+ struct btrfs_failed_bio *fbio = NULL; -+ u32 offset = 0; -+ -+ /* -+ * Hand off repair bios to the repair code as there is no upper level -+ * submitter for them. -+ */ -+ if (bbio->bio.bi_pool == &btrfs_repair_bioset) { -+ btrfs_end_repair_bio(bbio, dev); -+ return; -+ } -+ -+ /* Clear the I/O error. A failed repair will reset it. */ -+ bbio->bio.bi_status = BLK_STS_OK; -+ -+ while (iter->bi_size) { -+ struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter); -+ -+ bv.bv_len = min(bv.bv_len, sectorsize); -+ if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv)) -+ fbio = repair_one_sector(bbio, offset, &bv, fbio); -+ -+ bio_advance_iter_single(&bbio->bio, iter, sectorsize); -+ offset += sectorsize; -+ } -+ -+ if (bbio->csum != bbio->csum_inline) -+ kfree(bbio->csum); -+ -+ if (fbio) -+ btrfs_repair_done(fbio); -+ else -+ btrfs_orig_bbio_end_io(bbio); -+} -+ - static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) - { - if (!dev || !dev->bdev) -@@ -90,24 +305,31 @@ static void btrfs_end_bio_work(struct work_struct *work) - { - struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); - -- bbio->end_io(bbio); -+ /* Metadata reads are checked and repaired by the submitter. 
*/ -+ if (bbio->bio.bi_opf & REQ_META) -+ bbio->end_io(bbio); -+ else -+ btrfs_check_read_bio(bbio, bbio->bio.bi_private); - } - - static void btrfs_simple_end_io(struct bio *bio) - { -- struct btrfs_fs_info *fs_info = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); -+ struct btrfs_device *dev = bio->bi_private; -+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; - - btrfs_bio_counter_dec(fs_info); - - if (bio->bi_status) -- btrfs_log_dev_io_error(bio, bbio->device); -+ btrfs_log_dev_io_error(bio, dev); - - if (bio_op(bio) == REQ_OP_READ) { - INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); - queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); - } else { -- bbio->end_io(bbio); -+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) -+ btrfs_record_physical_zoned(bbio); -+ btrfs_orig_bbio_end_io(bbio); - } - } - -@@ -118,7 +340,10 @@ static void btrfs_raid56_end_io(struct bio *bio) - - btrfs_bio_counter_dec(bioc->fs_info); - bbio->mirror_num = bioc->mirror_num; -- bbio->end_io(bbio); -+ if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META)) -+ btrfs_check_read_bio(bbio, NULL); -+ else -+ btrfs_orig_bbio_end_io(bbio); - - btrfs_put_bioc(bioc); - } -@@ -145,7 +370,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) - else - bio->bi_status = BLK_STS_OK; - -- bbio->end_io(bbio); -+ btrfs_orig_bbio_end_io(bbio); - btrfs_put_bioc(bioc); - } - -@@ -181,16 +406,10 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) - */ - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; -+ u64 zone_start = round_down(physical, dev->fs_info->zone_size); - -- if (btrfs_dev_is_sequential(dev, physical)) { -- u64 zone_start = round_down(physical, -- dev->fs_info->zone_size); -- -- bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; -- } else { -- bio->bi_opf &= ~REQ_OP_ZONE_APPEND; -- bio->bi_opf |= REQ_OP_WRITE; -- } -+ ASSERT(btrfs_dev_is_sequential(dev, physical)); -+ bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; - } - btrfs_debug_in_rcu(dev->fs_info, - "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", -@@ -224,41 +443,21 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) - btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); - } - --void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) -+static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, -+ struct btrfs_io_stripe *smap, int mirror_num) - { -- u64 logical = bio->bi_iter.bi_sector << 9; -- u64 length = bio->bi_iter.bi_size; -- u64 map_length = length; -- struct btrfs_io_context *bioc = NULL; -- struct btrfs_io_stripe smap; -- int ret; -- -- btrfs_bio_counter_inc_blocked(fs_info); -- ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, -- &bioc, &smap, &mirror_num, 1); -- if (ret) { -- btrfs_bio_counter_dec(fs_info); -- btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); -- return; -- } -- -- if (map_length < length) { -- btrfs_crit(fs_info, -- "mapping failed logical %llu bio len %llu len %llu", -- logical, length, map_length); -- BUG(); -- } -+ /* Do not leak our private flag into the block layer. */ -+ bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED; - - if (!bioc) { -- /* Single mirror read/write fast path */ -+ /* Single mirror read/write fast path. 
*/ - btrfs_bio(bio)->mirror_num = mirror_num; -- btrfs_bio(bio)->device = smap.dev; -- bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; -- bio->bi_private = fs_info; -+ bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; -+ bio->bi_private = smap->dev; - bio->bi_end_io = btrfs_simple_end_io; -- btrfs_submit_dev_bio(smap.dev, bio); -+ btrfs_submit_dev_bio(smap->dev, bio); - } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { -- /* Parity RAID write or read recovery */ -+ /* Parity RAID write or read recovery. */ - bio->bi_private = bioc; - bio->bi_end_io = btrfs_raid56_end_io; - if (bio_op(bio) == REQ_OP_READ) -@@ -266,16 +465,233 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror - else - raid56_parity_write(bio, bioc); - } else { -- /* Write to multiple mirrors */ -+ /* Write to multiple mirrors. */ - int total_devs = bioc->num_stripes; -- int dev_nr; - - bioc->orig_bio = bio; -- for (dev_nr = 0; dev_nr < total_devs; dev_nr++) -+ for (int dev_nr = 0; dev_nr < total_devs; dev_nr++) - btrfs_submit_mirrored_bio(bioc, dev_nr); - } - } - -+static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) -+{ -+ if (bbio->bio.bi_opf & REQ_META) -+ return btree_csum_one_bio(bbio); -+ return btrfs_csum_one_bio(bbio); -+} -+ -+/* -+ * Async submit bios are used to offload expensive checksumming onto the worker -+ * threads. -+ */ -+struct async_submit_bio { -+ struct btrfs_bio *bbio; -+ struct btrfs_io_context *bioc; -+ struct btrfs_io_stripe smap; -+ int mirror_num; -+ struct btrfs_work work; -+}; -+ -+/* -+ * In order to insert checksums into the metadata in large chunks, we wait -+ * until bio submission time. All the pages in the bio are checksummed and -+ * sums are attached onto the ordered extent record. -+ * -+ * At IO completion time the csums attached on the ordered extent record are -+ * inserted into the btree. -+ */ -+static void run_one_async_start(struct btrfs_work *work) -+{ -+ struct async_submit_bio *async = -+ container_of(work, struct async_submit_bio, work); -+ blk_status_t ret; -+ -+ ret = btrfs_bio_csum(async->bbio); -+ if (ret) -+ async->bbio->bio.bi_status = ret; -+} -+ -+/* -+ * In order to insert checksums into the metadata in large chunks, we wait -+ * until bio submission time. All the pages in the bio are checksummed and -+ * sums are attached onto the ordered extent record. -+ * -+ * At IO completion time the csums attached on the ordered extent record are -+ * inserted into the tree. -+ */ -+static void run_one_async_done(struct btrfs_work *work) -+{ -+ struct async_submit_bio *async = -+ container_of(work, struct async_submit_bio, work); -+ struct bio *bio = &async->bbio->bio; -+ -+ /* If an error occurred we just want to clean up the bio and move on. */ -+ if (bio->bi_status) { -+ btrfs_orig_bbio_end_io(async->bbio); -+ return; -+ } -+ -+ /* -+ * All of the bios that pass through here are from async helpers. -+ * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. -+ * This changes nothing when cgroups aren't in use. -+ */ -+ bio->bi_opf |= REQ_CGROUP_PUNT; -+ __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); -+} -+ -+static void run_one_async_free(struct btrfs_work *work) -+{ -+ kfree(container_of(work, struct async_submit_bio, work)); -+} -+ -+static bool should_async_write(struct btrfs_bio *bbio) -+{ -+ /* -+ * If the I/O is not issued by fsync and friends, (->sync_writers != 0), -+ * then try to defer the submission to a workqueue to parallelize the -+ * checksum calculation. 
-+ */ -+ if (atomic_read(&bbio->inode->sync_writers)) -+ return false; -+ -+ /* -+ * Submit metadata writes synchronously if the checksum implementation -+ * is fast, or we are on a zoned device that wants I/O to be submitted -+ * in order. -+ */ -+ if (bbio->bio.bi_opf & REQ_META) { -+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; -+ -+ if (btrfs_is_zoned(fs_info)) -+ return false; -+ if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) -+ return false; -+ } -+ -+ return true; -+} -+ -+/* -+ * Submit bio to an async queue. -+ * -+ * Return true if the work has been succesfuly submitted, else false. -+ */ -+static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, -+ struct btrfs_io_context *bioc, -+ struct btrfs_io_stripe *smap, int mirror_num) -+{ -+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; -+ struct async_submit_bio *async; -+ -+ async = kmalloc(sizeof(*async), GFP_NOFS); -+ if (!async) -+ return false; -+ -+ async->bbio = bbio; -+ async->bioc = bioc; -+ async->smap = *smap; -+ async->mirror_num = mirror_num; -+ -+ btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, -+ run_one_async_free); -+ if (op_is_sync(bbio->bio.bi_opf)) -+ btrfs_queue_work(fs_info->hipri_workers, &async->work); -+ else -+ btrfs_queue_work(fs_info->workers, &async->work); -+ return true; -+} -+ -+static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) -+{ -+ struct btrfs_bio *bbio = btrfs_bio(bio); -+ struct btrfs_inode *inode = bbio->inode; -+ struct btrfs_fs_info *fs_info = inode->root->fs_info; -+ struct btrfs_bio *orig_bbio = bbio; -+ u64 logical = bio->bi_iter.bi_sector << 9; -+ u64 length = bio->bi_iter.bi_size; -+ u64 map_length = length; -+ bool use_append = btrfs_use_zone_append(bbio); -+ struct btrfs_io_context *bioc = NULL; -+ struct btrfs_io_stripe smap; -+ blk_status_t ret; -+ int error; -+ -+ btrfs_bio_counter_inc_blocked(fs_info); -+ error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, -+ &bioc, &smap, &mirror_num, 1); -+ if (error) { -+ ret = errno_to_blk_status(error); -+ goto fail; -+ } -+ -+ map_length = min(map_length, length); -+ if (use_append) -+ map_length = min(map_length, fs_info->max_zone_append_size); -+ -+ if (map_length < length) { -+ bio = btrfs_split_bio(fs_info, bio, map_length, use_append); -+ bbio = btrfs_bio(bio); -+ } -+ -+ /* -+ * Save the iter for the end_io handler and preload the checksums for -+ * data reads. -+ */ -+ if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) { -+ bbio->saved_iter = bio->bi_iter; -+ ret = btrfs_lookup_bio_sums(bbio); -+ if (ret) -+ goto fail_put_bio; -+ } -+ -+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) { -+ if (use_append) { -+ bio->bi_opf &= ~REQ_OP_WRITE; -+ bio->bi_opf |= REQ_OP_ZONE_APPEND; -+ ret = btrfs_extract_ordered_extent(btrfs_bio(bio)); -+ if (ret) -+ goto fail_put_bio; -+ } -+ -+ /* -+ * Csum items for reloc roots have already been cloned at this -+ * point, so they are handled as part of the no-checksum case. 
-+ */ -+ if (!(inode->flags & BTRFS_INODE_NODATASUM) && -+ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && -+ !btrfs_is_data_reloc_root(inode->root)) { -+ if (should_async_write(bbio) && -+ btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) -+ goto done; -+ -+ ret = btrfs_bio_csum(bbio); -+ if (ret) -+ goto fail_put_bio; -+ } -+ } -+ -+ __btrfs_submit_bio(bio, bioc, &smap, mirror_num); -+done: -+ return map_length == length; -+ -+fail_put_bio: -+ if (map_length < length) -+ bio_put(bio); -+fail: -+ btrfs_bio_counter_dec(fs_info); -+ btrfs_bio_end_io(orig_bbio, ret); -+ /* Do not submit another chunk */ -+ return true; -+} -+ -+void btrfs_submit_bio(struct bio *bio, int mirror_num) -+{ -+ while (!btrfs_submit_chunk(bio, mirror_num)) -+ ; -+} -+ - /* - * Submit a repair write. - * -@@ -283,7 +699,7 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror - * RAID setup. Here we only want to write the one bad copy, so we do the - * mapping ourselves and submit the bio directly. - * -- * The I/O is issued sychronously to block the repair read completion from -+ * The I/O is issued synchronously to block the repair read completion from - * freeing the bio. - */ - int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, -@@ -381,10 +797,31 @@ int __init btrfs_bioset_init(void) - offsetof(struct btrfs_bio, bio), - BIOSET_NEED_BVECS)) - return -ENOMEM; -+ if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, -+ offsetof(struct btrfs_bio, bio), 0)) -+ goto out_free_bioset; -+ if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, -+ offsetof(struct btrfs_bio, bio), -+ BIOSET_NEED_BVECS)) -+ goto out_free_clone_bioset; -+ if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, -+ sizeof(struct btrfs_failed_bio))) -+ goto out_free_repair_bioset; - return 0; -+ -+out_free_repair_bioset: -+ bioset_exit(&btrfs_repair_bioset); -+out_free_clone_bioset: -+ bioset_exit(&btrfs_clone_bioset); -+out_free_bioset: -+ bioset_exit(&btrfs_bioset); -+ return -ENOMEM; - } - - void __cold btrfs_bioset_exit(void) - { -+ mempool_exit(&btrfs_failed_bio_pool); -+ bioset_exit(&btrfs_repair_bioset); -+ bioset_exit(&btrfs_clone_bioset); - bioset_exit(&btrfs_bioset); - } -diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h -index b12f84b3b341..873ff85817f0 100644 ---- a/fs/btrfs/bio.h -+++ b/fs/btrfs/bio.h -@@ -26,32 +26,23 @@ struct btrfs_fs_info; - typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); - - /* -- * Additional info to pass along bio. -- * -- * Mostly for btrfs specific features like csum and mirror_num. -+ * Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and -+ * passed to btrfs_submit_bio for mapping to the physical devices. - */ - struct btrfs_bio { -- unsigned int mirror_num:7; -- -- /* -- * Extra indicator for metadata bios. -- * For some btrfs bios they use pages without a mapping, thus -- * we can not rely on page->mapping->host to determine if -- * it's a metadata bio. -- */ -- unsigned int is_metadata:1; -- struct bvec_iter iter; -- -- /* for direct I/O */ -+ /* Inode and offset into it that this I/O operates on. */ -+ struct btrfs_inode *inode; - u64 file_offset; - -- /* @device is for stripe IO submission. */ -- struct btrfs_device *device; - union { -- /* For data checksum verification. */ -+ /* -+ * Data checksumming and original I/O information for internal -+ * use in the btrfs_submit_bio machinery. 
-+ */ - struct { - u8 *csum; - u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; -+ struct bvec_iter saved_iter; - }; - - /* For metadata parentness verification. */ -@@ -62,7 +53,9 @@ struct btrfs_bio { - btrfs_bio_end_io_t end_io; - void *private; - -- /* For read end I/O handling */ -+ /* For internal use in read end I/O handling */ -+ unsigned int mirror_num; -+ atomic_t pending_ios; - struct work_struct end_io_work; - - /* -@@ -80,11 +73,11 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) - int __init btrfs_bioset_init(void); - void __cold btrfs_bioset_exit(void); - -+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, -+ btrfs_bio_end_io_t end_io, void *private); - struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, -+ struct btrfs_inode *inode, - btrfs_bio_end_io_t end_io, void *private); --struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, -- btrfs_bio_end_io_t end_io, void *private); -- - - static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) - { -@@ -92,34 +85,10 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) - bbio->end_io(bbio); - } - --static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) --{ -- if (bbio->is_metadata) -- return; -- if (bbio->csum != bbio->csum_inline) { -- kfree(bbio->csum); -- bbio->csum = NULL; -- } --} -+/* Bio only refers to one ordered extent. */ -+#define REQ_BTRFS_ONE_ORDERED REQ_DRV - --/* -- * Iterate through a btrfs_bio (@bbio) on a per-sector basis. -- * -- * bvl - struct bio_vec -- * bbio - struct btrfs_bio -- * iters - struct bvec_iter -- * bio_offset - unsigned int -- */ --#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ -- for ((iter) = (bbio)->iter, (bio_offset) = 0; \ -- (iter).bi_size && \ -- (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ -- (bio_offset) += fs_info->sectorsize, \ -- bio_advance_iter_single(&(bbio)->bio, &(iter), \ -- (fs_info)->sectorsize)) -- --void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, -- int mirror_num); -+void btrfs_submit_bio(struct bio *bio, int mirror_num); - int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num); -diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c -index 708d843daa72..80c73137e322 100644 ---- a/fs/btrfs/block-group.c -+++ b/fs/btrfs/block-group.c -@@ -1,5 +1,6 @@ - // SPDX-License-Identifier: GPL-2.0 - -+#include - #include - #include "misc.h" - #include "ctree.h" -@@ -539,6 +540,153 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end - return total_added; - } - -+/* -+ * Get an arbitrary extent item index / max_index through the block group -+ * -+ * @block_group the block group to sample from -+ * @index: the integral step through the block group to grab from -+ * @max_index: the granularity of the sampling -+ * @key: return value parameter for the item we find -+ * -+ * Pre-conditions on indices: -+ * 0 <= index <= max_index -+ * 0 < max_index -+ * -+ * Returns: 0 on success, 1 if the search didn't yield a useful item, negative -+ * error code on error. 
-+ */ -+static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, -+ struct btrfs_block_group *block_group, -+ int index, int max_index, -+ struct btrfs_key *key) -+{ -+ struct btrfs_fs_info *fs_info = block_group->fs_info; -+ struct btrfs_root *extent_root; -+ int ret = 0; -+ u64 search_offset; -+ u64 search_end = block_group->start + block_group->length; -+ struct btrfs_path *path; -+ -+ ASSERT(index >= 0); -+ ASSERT(index <= max_index); -+ ASSERT(max_index > 0); -+ lockdep_assert_held(&caching_ctl->mutex); -+ lockdep_assert_held_read(&fs_info->commit_root_sem); -+ -+ path = btrfs_alloc_path(); -+ if (!path) -+ return -ENOMEM; -+ -+ extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, -+ BTRFS_SUPER_INFO_OFFSET)); -+ -+ path->skip_locking = 1; -+ path->search_commit_root = 1; -+ path->reada = READA_FORWARD; -+ -+ search_offset = index * div_u64(block_group->length, max_index); -+ key->objectid = block_group->start + search_offset; -+ key->type = BTRFS_EXTENT_ITEM_KEY; -+ key->offset = 0; -+ -+ while (1) { -+ ret = btrfs_search_forward(extent_root, key, path, 0); -+ if (ret != 0) -+ goto out; -+ /* Success; sampled an extent item in the block group */ -+ if (key->type == BTRFS_EXTENT_ITEM_KEY && -+ key->objectid >= block_group->start && -+ key->objectid + key->offset <= search_end) -+ goto out; -+ -+ /* We can't possibly find a valid extent item anymore */ -+ if (key->objectid >= search_end) { -+ ret = 1; -+ break; -+ } -+ if (key->type < BTRFS_EXTENT_ITEM_KEY) -+ key->type = BTRFS_EXTENT_ITEM_KEY; -+ else -+ key->objectid++; -+ btrfs_release_path(path); -+ up_read(&fs_info->commit_root_sem); -+ mutex_unlock(&caching_ctl->mutex); -+ cond_resched(); -+ mutex_lock(&caching_ctl->mutex); -+ down_read(&fs_info->commit_root_sem); -+ } -+out: -+ lockdep_assert_held(&caching_ctl->mutex); -+ lockdep_assert_held_read(&fs_info->commit_root_sem); -+ btrfs_free_path(path); -+ return ret; -+} -+ -+/* -+ * Best effort attempt to compute a block group's size class while caching it. -+ * -+ * @block_group: the block group we are caching -+ * -+ * We cannot infer the size class while adding free space extents, because that -+ * logic doesn't care about contiguous file extents (it doesn't differentiate -+ * between a 100M extent and 100 contiguous 1M extents). So we need to read the -+ * file extent items. Reading all of them is quite wasteful, because usually -+ * only a handful are enough to give a good answer. Therefore, we just grab 5 of -+ * them at even steps through the block group and pick the smallest size class -+ * we see. Since size class is best effort, and not guaranteed in general, -+ * inaccuracy is acceptable. -+ * -+ * To be more explicit about why this algorithm makes sense: -+ * -+ * If we are caching in a block group from disk, then there are three major cases -+ * to consider: -+ * 1. the block group is well behaved and all extents in it are the same size -+ * class. -+ * 2. the block group is mostly one size class with rare exceptions for last -+ * ditch allocations -+ * 3. the block group was populated before size classes and can have a totally -+ * arbitrary mix of size classes. -+ * -+ * In case 1, looking at any extent in the block group will yield the correct -+ * result. For the mixed cases, taking the minimum size class seems like a good -+ * approximation, since gaps from frees will be usable to the size class. For -+ * 2., a small handful of file extents is likely to yield the right answer. 
For -+ * 3, we can either read every file extent, or admit that this is best effort -+ * anyway and try to stay fast. -+ * -+ * Returns: 0 on success, negative error code on error. -+ */ -+static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl, -+ struct btrfs_block_group *block_group) -+{ -+ struct btrfs_key key; -+ int i; -+ u64 min_size = block_group->length; -+ enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; -+ int ret; -+ -+ if (!btrfs_block_group_should_use_size_class(block_group)) -+ return 0; -+ -+ for (i = 0; i < 5; ++i) { -+ ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); -+ if (ret < 0) -+ goto out; -+ if (ret > 0) -+ continue; -+ min_size = min_t(u64, min_size, key.offset); -+ size_class = btrfs_calc_block_group_size_class(min_size); -+ } -+ if (size_class != BTRFS_BG_SZ_NONE) { -+ spin_lock(&block_group->lock); -+ block_group->size_class = size_class; -+ spin_unlock(&block_group->lock); -+ } -+ -+out: -+ return ret; -+} -+ - static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) - { - struct btrfs_block_group *block_group = caching_ctl->block_group; -@@ -683,6 +831,7 @@ static noinline void caching_thread(struct btrfs_work *work) - mutex_lock(&caching_ctl->mutex); - down_read(&fs_info->commit_root_sem); - -+ load_block_group_size_class(caching_ctl, block_group); - if (btrfs_test_opt(fs_info, SPACE_CACHE)) { - ret = load_free_space_cache(block_group); - if (ret == 1) { -@@ -1687,7 +1836,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) - - btrfs_info(fs_info, - "reclaiming chunk %llu with %llu%% used %llu%% unusable", -- bg->start, div_u64(bg->used * 100, bg->length), -+ bg->start, -+ div64_u64(bg->used * 100, bg->length), - div64_u64(zone_unusable * 100, bg->length)); - trace_btrfs_reclaim_block_group(bg); - ret = btrfs_relocate_chunk(fs_info, bg->start); -@@ -1816,7 +1966,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) - * - * @fs_info: the filesystem - * @chunk_start: logical address of block group -- * @bdev: physical device to resolve, can be NULL to indicate any device - * @physical: physical address to map to logical addresses - * @logical: return array of logical addresses which map to @physical - * @naddrs: length of @logical -@@ -1827,8 +1976,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) - * block copies. 
- */ - int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, -- struct block_device *bdev, u64 physical, u64 **logical, -- int *naddrs, int *stripe_len) -+ u64 physical, u64 **logical, int *naddrs, int *stripe_len) - { - struct extent_map *em; - struct map_lookup *map; -@@ -1868,9 +2016,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - data_stripe_length)) - continue; - -- if (bdev && map->stripes[i].dev->bdev != bdev) -- continue; -- - stripe_nr = physical - map->stripes[i].physical; - stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); - -@@ -1927,7 +2072,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) - - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); -- ret = btrfs_rmap_block(fs_info, cache->start, NULL, -+ ret = btrfs_rmap_block(fs_info, cache->start, - bytenr, &logical, &nr, &stripe_len); - if (ret) - return ret; -@@ -3330,7 +3475,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, - spin_unlock(&info->delalloc_root_lock); - - while (total) { -- bool reclaim; -+ bool reclaim = false; - - cache = btrfs_lookup_block_group(info, bytenr); - if (!cache) { -@@ -3379,6 +3524,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, - cache->space_info->disk_used -= num_bytes * factor; - - reclaim = should_reclaim_block_group(cache, num_bytes); -+ - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - -@@ -3433,32 +3579,42 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, - * reservation and return -EAGAIN, otherwise this function always succeeds. - */ - int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, -- u64 ram_bytes, u64 num_bytes, int delalloc) -+ u64 ram_bytes, u64 num_bytes, int delalloc, -+ bool force_wrong_size_class) - { - struct btrfs_space_info *space_info = cache->space_info; -+ enum btrfs_block_group_size_class size_class; - int ret = 0; - - spin_lock(&space_info->lock); - spin_lock(&cache->lock); - if (cache->ro) { - ret = -EAGAIN; -- } else { -- cache->reserved += num_bytes; -- space_info->bytes_reserved += num_bytes; -- trace_btrfs_space_reservation(cache->fs_info, "space_info", -- space_info->flags, num_bytes, 1); -- btrfs_space_info_update_bytes_may_use(cache->fs_info, -- space_info, -ram_bytes); -- if (delalloc) -- cache->delalloc_bytes += num_bytes; -+ goto out; -+ } - -- /* -- * Compression can use less space than we reserved, so wake -- * tickets if that happens -- */ -- if (num_bytes < ram_bytes) -- btrfs_try_granting_tickets(cache->fs_info, space_info); -+ if (btrfs_block_group_should_use_size_class(cache)) { -+ size_class = btrfs_calc_block_group_size_class(num_bytes); -+ ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); -+ if (ret) -+ goto out; - } -+ cache->reserved += num_bytes; -+ space_info->bytes_reserved += num_bytes; -+ trace_btrfs_space_reservation(cache->fs_info, "space_info", -+ space_info->flags, num_bytes, 1); -+ btrfs_space_info_update_bytes_may_use(cache->fs_info, -+ space_info, -ram_bytes); -+ if (delalloc) -+ cache->delalloc_bytes += num_bytes; -+ -+ /* -+ * Compression can use less space than we reserved, so wake tickets if -+ * that happens. 
-+ */ -+ if (num_bytes < ram_bytes) -+ btrfs_try_granting_tickets(cache->fs_info, space_info); -+out: - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); - return ret; -@@ -4218,3 +4374,73 @@ void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount - bg->swap_extents -= amount; - spin_unlock(&bg->lock); - } -+ -+enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size) -+{ -+ if (size <= SZ_128K) -+ return BTRFS_BG_SZ_SMALL; -+ if (size <= SZ_8M) -+ return BTRFS_BG_SZ_MEDIUM; -+ return BTRFS_BG_SZ_LARGE; -+} -+ -+/* -+ * Handle a block group allocating an extent in a size class -+ * -+ * @bg: The block group we allocated in. -+ * @size_class: The size class of the allocation. -+ * @force_wrong_size_class: Whether we are desperate enough to allow -+ * mismatched size classes. -+ * -+ * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the -+ * case of a race that leads to the wrong size class without -+ * force_wrong_size_class set. -+ * -+ * find_free_extent will skip block groups with a mismatched size class until -+ * it really needs to avoid ENOSPC. In that case it will set -+ * force_wrong_size_class. However, if a block group is newly allocated and -+ * doesn't yet have a size class, then it is possible for two allocations of -+ * different sizes to race and both try to use it. The loser is caught here and -+ * has to retry. -+ */ -+int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, -+ enum btrfs_block_group_size_class size_class, -+ bool force_wrong_size_class) -+{ -+ ASSERT(size_class != BTRFS_BG_SZ_NONE); -+ -+ /* The new allocation is in the right size class, do nothing */ -+ if (bg->size_class == size_class) -+ return 0; -+ /* -+ * The new allocation is in a mismatched size class. -+ * This means one of two things: -+ * -+ * 1. Two tasks in find_free_extent for different size_classes raced -+ * and hit the same empty block_group. Make the loser try again. -+ * 2. A call to find_free_extent got desperate enough to set -+ * 'force_wrong_slab'. Don't change the size_class, but allow the -+ * allocation. -+ */ -+ if (bg->size_class != BTRFS_BG_SZ_NONE) { -+ if (force_wrong_size_class) -+ return 0; -+ return -EAGAIN; -+ } -+ /* -+ * The happy new block group case: the new allocation is the first -+ * one in the block_group so we set size_class. -+ */ -+ bg->size_class = size_class; -+ -+ return 0; -+} -+ -+bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) -+{ -+ if (btrfs_is_zoned(bg->fs_info)) -+ return false; -+ if (!btrfs_is_block_group_data_only(bg)) -+ return false; -+ return true; -+} -diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h -index a02ea76fd6cf..6e4a0b429ac3 100644 ---- a/fs/btrfs/block-group.h -+++ b/fs/btrfs/block-group.h -@@ -12,6 +12,17 @@ enum btrfs_disk_cache_state { - BTRFS_DC_SETUP, - }; - -+enum btrfs_block_group_size_class { -+ /* Unset */ -+ BTRFS_BG_SZ_NONE, -+ /* 0 < size <= 128K */ -+ BTRFS_BG_SZ_SMALL, -+ /* 128K < size <= 8M */ -+ BTRFS_BG_SZ_MEDIUM, -+ /* 8M < size < BG_LENGTH */ -+ BTRFS_BG_SZ_LARGE, -+}; -+ - /* - * This describes the state of the block_group for async discard. 
This is due - * to the two pass nature of it where extent discarding is prioritized over -@@ -233,6 +244,7 @@ struct btrfs_block_group { - struct list_head active_bg_list; - struct work_struct zone_finish_work; - struct extent_buffer *last_eb; -+ enum btrfs_block_group_size_class size_class; - }; - - static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) -@@ -302,7 +314,8 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); - int btrfs_update_block_group(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, bool alloc); - int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, -- u64 ram_bytes, u64 num_bytes, int delalloc); -+ u64 ram_bytes, u64 num_bytes, int delalloc, -+ bool force_wrong_size_class); - void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, - u64 num_bytes, int delalloc); - int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, -@@ -315,8 +328,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); - void btrfs_put_block_group_cache(struct btrfs_fs_info *info); - int btrfs_free_block_groups(struct btrfs_fs_info *info); - int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, -- struct block_device *bdev, u64 physical, u64 **logical, -- int *naddrs, int *stripe_len); -+ u64 physical, u64 **logical, int *naddrs, int *stripe_len); - - static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) - { -@@ -346,4 +358,10 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); - bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg); - void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount); - -+enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); -+int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, -+ enum btrfs_block_group_size_class size_class, -+ bool force_wrong_size_class); -+bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg); -+ - #endif /* BTRFS_BLOCK_GROUP_H */ -diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h -index 195c09e20609..87020aa58121 100644 ---- a/fs/btrfs/btrfs_inode.h -+++ b/fs/btrfs/btrfs_inode.h -@@ -93,12 +93,6 @@ struct btrfs_inode { - /* the io_tree does range state (DIRTY, LOCKED etc) */ - struct extent_io_tree io_tree; - -- /* special utility tree used to record which mirrors have already been -- * tried when checksums fail for a given block -- */ -- struct rb_root io_failure_tree; -- spinlock_t io_failure_lock; -- - /* - * Keep track of where the inode has extent items mapped in order to - * make sure the i_size adjustments are accurate -@@ -411,21 +405,11 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, - #define CSUM_FMT "0x%*phN" - #define CSUM_FMT_VALUE(size, bytes) size, bytes - --void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); --void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, -- int mirror_num, enum btrfs_compression_type compress_type); --void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); --blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); --blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, -- struct bio *bio, -- u64 dio_file_offset); - int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, - u32 pgoff, u8 *csum, const u8 * const csum_expected); --int btrfs_check_data_csum(struct 
btrfs_inode *inode, struct btrfs_bio *bbio, -- u32 bio_offset, struct page *page, u32 pgoff); --unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, -- u32 bio_offset, struct page *page, -- u64 start, u64 end); -+blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio); -+bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, -+ u32 bio_offset, struct bio_vec *bv); - noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, - u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool nowait, bool strict); -@@ -532,6 +516,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, - ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before); - struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, -+ struct btrfs_ordered_extent **ordered_extent, - size_t done_before); - - extern const struct dentry_operations btrfs_dentry_operations; -diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c -index 5122ca79f7ea..f42f31f22d13 100644 ---- a/fs/btrfs/compression.c -+++ b/fs/btrfs/compression.c -@@ -141,12 +141,15 @@ static int compression_decompress(int type, struct list_head *ws, - - static int btrfs_decompress_bio(struct compressed_bio *cb); - --static void finish_compressed_bio_read(struct compressed_bio *cb) -+static void end_compressed_bio_read(struct btrfs_bio *bbio) - { -+ struct compressed_bio *cb = bbio->private; - unsigned int index; - struct page *page; - -- if (cb->status == BLK_STS_OK) -+ if (bbio->bio.bi_status) -+ cb->status = bbio->bio.bi_status; -+ else - cb->status = errno_to_blk_status(btrfs_decompress_bio(cb)); - - /* Release the compressed pages */ -@@ -162,54 +165,6 @@ static void finish_compressed_bio_read(struct compressed_bio *cb) - /* Finally free the cb struct */ - kfree(cb->compressed_pages); - kfree(cb); --} -- --/* -- * Verify the checksums and kick off repair if needed on the uncompressed data -- * before decompressing it into the original bio and freeing the uncompressed -- * pages. 
-- */ --static void end_compressed_bio_read(struct btrfs_bio *bbio) --{ -- struct compressed_bio *cb = bbio->private; -- struct inode *inode = cb->inode; -- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -- struct btrfs_inode *bi = BTRFS_I(inode); -- bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) && -- !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); -- blk_status_t status = bbio->bio.bi_status; -- struct bvec_iter iter; -- struct bio_vec bv; -- u32 offset; -- -- btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { -- u64 start = bbio->file_offset + offset; -- -- if (!status && -- (!csum || !btrfs_check_data_csum(bi, bbio, offset, -- bv.bv_page, bv.bv_offset))) { -- btrfs_clean_io_failure(bi, start, bv.bv_page, -- bv.bv_offset); -- } else { -- int ret; -- -- refcount_inc(&cb->pending_ios); -- ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, -- bv.bv_page, bv.bv_offset, -- true); -- if (ret) { -- refcount_dec(&cb->pending_ios); -- status = errno_to_blk_status(ret); -- } -- } -- } -- -- if (status) -- cb->status = status; -- -- if (refcount_dec_and_test(&cb->pending_ios)) -- finish_compressed_bio_read(cb); -- btrfs_bio_free_csum(bbio); - bio_put(&bbio->bio); - } - -@@ -303,68 +258,12 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) - static void end_compressed_bio_write(struct btrfs_bio *bbio) - { - struct compressed_bio *cb = bbio->private; -- -- if (bbio->bio.bi_status) -- cb->status = bbio->bio.bi_status; -- -- if (refcount_dec_and_test(&cb->pending_ios)) { -- struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); -- -- btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio); -- queue_work(fs_info->compressed_write_workers, &cb->write_end_work); -- } -- bio_put(&bbio->bio); --} -- --/* -- * Allocate a compressed_bio, which will be used to read/write on-disk -- * (aka, compressed) * data. -- * -- * @cb: The compressed_bio structure, which records all the needed -- * information to bind the compressed data to the uncompressed -- * page cache. -- * @disk_byten: The logical bytenr where the compressed data will be read -- * from or written to. -- * @endio_func: The endio function to call after the IO for compressed data -- * is finished. -- * @next_stripe_start: Return value of logical bytenr of where next stripe starts. -- * Let the caller know to only fill the bio up to the stripe -- * boundary. 
-- */ -- -- --static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, -- blk_opf_t opf, -- btrfs_bio_end_io_t endio_func, -- u64 *next_stripe_start) --{ - struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); -- struct btrfs_io_geometry geom; -- struct extent_map *em; -- struct bio *bio; -- int ret; - -- bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb); -- bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; -+ cb->status = bbio->bio.bi_status; -+ queue_work(fs_info->compressed_write_workers, &cb->write_end_work); - -- em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize); -- if (IS_ERR(em)) { -- bio_put(bio); -- return ERR_CAST(em); -- } -- -- if (bio_op(bio) == REQ_OP_ZONE_APPEND) -- bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev); -- -- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom); -- free_extent_map(em); -- if (ret < 0) { -- bio_put(bio); -- return ERR_PTR(ret); -- } -- *next_stripe_start = disk_bytenr + geom.len; -- refcount_inc(&cb->pending_ios); -- return bio; -+ bio_put(&bbio->bio); - } - - /* -@@ -389,18 +288,13 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - struct bio *bio = NULL; - struct compressed_bio *cb; - u64 cur_disk_bytenr = disk_start; -- u64 next_stripe_start; - blk_status_t ret = BLK_STS_OK; -- int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; -- const bool use_append = btrfs_use_zone_append(inode, disk_start); -- const enum req_op bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE; - - ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && - IS_ALIGNED(len, fs_info->sectorsize)); - cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); - if (!cb) - return BLK_STS_RESOURCE; -- refcount_set(&cb->pending_ios, 1); - cb->status = BLK_STS_OK; - cb->inode = &inode->vfs_inode; - cb->start = start; -@@ -411,8 +305,16 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); - cb->nr_pages = nr_pages; - -- if (blkcg_css) -+ if (blkcg_css) { - kthread_associate_blkcg(blkcg_css); -+ write_flags |= REQ_CGROUP_PUNT; -+ } -+ -+ write_flags |= REQ_BTRFS_ONE_ORDERED; -+ bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags, -+ BTRFS_I(cb->inode), end_compressed_bio_write, cb); -+ bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT; -+ btrfs_bio(bio)->file_offset = start; - - while (cur_disk_bytenr < disk_start + compressed_len) { - u64 offset = cur_disk_bytenr - disk_start; -@@ -420,77 +322,30 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - unsigned int real_size; - unsigned int added; - struct page *page = compressed_pages[index]; -- bool submit = false; -- -- /* Allocate new bio if submitted or not yet allocated */ -- if (!bio) { -- bio = alloc_compressed_bio(cb, cur_disk_bytenr, -- bio_op | write_flags, end_compressed_bio_write, -- &next_stripe_start); -- if (IS_ERR(bio)) { -- ret = errno_to_blk_status(PTR_ERR(bio)); -- break; -- } -- if (blkcg_css) -- bio->bi_opf |= REQ_CGROUP_PUNT; -- } -- /* -- * We should never reach next_stripe_start start as we will -- * submit comp_bio when reach the boundary immediately. 
-- */ -- ASSERT(cur_disk_bytenr != next_stripe_start); - - /* - * We have various limits on the real read size: -- * - stripe boundary - * - page boundary - * - compressed length boundary - */ -- real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr); -- real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); -+ real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); - real_size = min_t(u64, real_size, compressed_len - offset); - ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); - -- if (use_append) -- added = bio_add_zone_append_page(bio, page, real_size, -- offset_in_page(offset)); -- else -- added = bio_add_page(bio, page, real_size, -- offset_in_page(offset)); -- /* Reached zoned boundary */ -- if (added == 0) -- submit = true; -- -+ added = bio_add_page(bio, page, real_size, offset_in_page(offset)); -+ /* -+ * Maximum compressed extent is smaller than bio size limit, -+ * thus bio_add_page() should always success. -+ */ -+ ASSERT(added == real_size); - cur_disk_bytenr += added; -- /* Reached stripe boundary */ -- if (cur_disk_bytenr == next_stripe_start) -- submit = true; -- -- /* Finished the range */ -- if (cur_disk_bytenr == disk_start + compressed_len) -- submit = true; -- -- if (submit) { -- if (!skip_sum) { -- ret = btrfs_csum_one_bio(inode, bio, start, true); -- if (ret) { -- btrfs_bio_end_io(btrfs_bio(bio), ret); -- break; -- } -- } -- -- ASSERT(bio->bi_iter.bi_size); -- btrfs_submit_bio(fs_info, bio, 0); -- bio = NULL; -- } -- cond_resched(); - } - -+ /* Finished the range. */ -+ ASSERT(bio->bi_iter.bi_size); -+ btrfs_submit_bio(bio, 0); - if (blkcg_css) - kthread_associate_blkcg(NULL); -- -- if (refcount_dec_and_test(&cb->pending_ios)) -- finish_compressed_bio_write(cb); - return ret; - } - -@@ -667,10 +522,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - struct extent_map_tree *em_tree; - struct compressed_bio *cb; - unsigned int compressed_len; -- struct bio *comp_bio = NULL; -+ struct bio *comp_bio; - const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; - u64 cur_disk_byte = disk_bytenr; -- u64 next_stripe_start; - u64 file_offset; - u64 em_len; - u64 em_start; -@@ -703,7 +557,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - goto out; - } - -- refcount_set(&cb->pending_ios, 1); - cb->status = BLK_STS_OK; - cb->inode = inode; - -@@ -737,37 +590,23 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - /* include any pages we added in add_ra-bio_pages */ - cb->len = bio->bi_iter.bi_size; - -+ comp_bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, BTRFS_I(cb->inode), -+ end_compressed_bio_read, cb); -+ comp_bio->bi_iter.bi_sector = (cur_disk_byte >> SECTOR_SHIFT); -+ - while (cur_disk_byte < disk_bytenr + compressed_len) { - u64 offset = cur_disk_byte - disk_bytenr; - unsigned int index = offset >> PAGE_SHIFT; - unsigned int real_size; - unsigned int added; - struct page *page = cb->compressed_pages[index]; -- bool submit = false; -- -- /* Allocate new bio if submitted or not yet allocated */ -- if (!comp_bio) { -- comp_bio = alloc_compressed_bio(cb, cur_disk_byte, -- REQ_OP_READ, end_compressed_bio_read, -- &next_stripe_start); -- if (IS_ERR(comp_bio)) { -- cb->status = errno_to_blk_status(PTR_ERR(comp_bio)); -- break; -- } -- } -- /* -- * We should never reach next_stripe_start start as we will -- * submit comp_bio when reach the boundary immediately. 
-- */ -- ASSERT(cur_disk_byte != next_stripe_start); -+ - /* - * We have various limit on the real read size: -- * - stripe boundary - * - page boundary - * - compressed length boundary - */ -- real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte); -- real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); -+ real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); - real_size = min_t(u64, real_size, compressed_len - offset); - ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); - -@@ -778,45 +617,20 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - */ - ASSERT(added == real_size); - cur_disk_byte += added; -- -- /* Reached stripe boundary, need to submit */ -- if (cur_disk_byte == next_stripe_start) -- submit = true; -- -- /* Has finished the range, need to submit */ -- if (cur_disk_byte == disk_bytenr + compressed_len) -- submit = true; -- -- if (submit) { -- /* Save the original iter for read repair */ -- if (bio_op(comp_bio) == REQ_OP_READ) -- btrfs_bio(comp_bio)->iter = comp_bio->bi_iter; -- -- /* -- * Save the initial offset of this chunk, as there -- * is no direct correlation between compressed pages and -- * the original file offset. The field is only used for -- * priting error messages. -- */ -- btrfs_bio(comp_bio)->file_offset = file_offset; -- -- ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL); -- if (ret) { -- btrfs_bio_end_io(btrfs_bio(comp_bio), ret); -- break; -- } -- -- ASSERT(comp_bio->bi_iter.bi_size); -- btrfs_submit_bio(fs_info, comp_bio, mirror_num); -- comp_bio = NULL; -- } - } - - if (memstall) - psi_memstall_leave(&pflags); - -- if (refcount_dec_and_test(&cb->pending_ios)) -- finish_compressed_bio_read(cb); -+ /* -+ * Stash the initial offset of this chunk, as there is no direct -+ * correlation between compressed pages and the original file offset. -+ * The field is only used for printing error messages anyway. -+ */ -+ btrfs_bio(comp_bio)->file_offset = file_offset; -+ -+ ASSERT(comp_bio->bi_iter.bi_size); -+ btrfs_submit_bio(comp_bio, mirror_num); - return; - - fail: -@@ -1609,7 +1423,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, - index_end = end >> PAGE_SHIFT; - - /* Don't miss unaligned end */ -- if (!IS_ALIGNED(end, PAGE_SIZE)) -+ if (!PAGE_ALIGNED(end)) - index_end++; - - curr_sample_pos = 0; -@@ -1642,7 +1456,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, - * - * For now is's a naive and optimistic 'return true', we'll extend the logic to - * quickly (compared to direct compression) detect data characteristics -- * (compressible/uncompressible) to avoid wasting CPU time on uncompressible -+ * (compressible/incompressible) to avoid wasting CPU time on incompressible - * data. 
- * - * The following types of analysis can be performed: -diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h -index 6209d40a1e08..a5e3377db9ad 100644 ---- a/fs/btrfs/compression.h -+++ b/fs/btrfs/compression.h -@@ -31,9 +31,6 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); - #define BTRFS_ZLIB_DEFAULT_LEVEL 3 - - struct compressed_bio { -- /* Number of outstanding bios */ -- refcount_t pending_ios; -- - /* Number of compressed pages in the array */ - unsigned int nr_pages; - -diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c -index 4754c9101a4c..a5b6bb54545f 100644 ---- a/fs/btrfs/ctree.c -+++ b/fs/btrfs/ctree.c -@@ -484,7 +484,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, - if (ret) - return ret; - } -- btrfs_clean_tree_block(buf); -+ btrfs_clear_buffer_dirty(trans, buf); - *last_ref = 1; - } - return 0; -@@ -853,8 +853,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, - /* - * Search for a key in the given extent_buffer. - * -- * The lower boundary for the search is specified by the slot number @low. Use a -- * value of 0 to search over the whole extent buffer. -+ * The lower boundary for the search is specified by the slot number @first_slot. -+ * Use a value of 0 to search over the whole extent buffer. - * - * The slot in the extent buffer is returned via @slot. If the key exists in the - * extent buffer, then @slot will point to the slot where the key is, otherwise -@@ -863,18 +863,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, - * Slot may point to the total number of items (i.e. one position beyond the last - * key) if the key is bigger than the last key in the extent buffer. - */ --static noinline int generic_bin_search(struct extent_buffer *eb, int low, -- const struct btrfs_key *key, int *slot) -+int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, -+ const struct btrfs_key *key, int *slot) - { - unsigned long p; - int item_size; -- int high = btrfs_header_nritems(eb); -+ /* -+ * Use unsigned types for the low and high slots, so that we get a more -+ * efficient division in the search loop below. -+ */ -+ u32 low = first_slot; -+ u32 high = btrfs_header_nritems(eb); - int ret; - const int key_size = sizeof(struct btrfs_disk_key); - -- if (low > high) { -+ if (unlikely(low > high)) { - btrfs_err(eb->fs_info, -- "%s: low (%d) > high (%d) eb %llu owner %llu level %d", -+ "%s: low (%u) > high (%u) eb %llu owner %llu level %d", - __func__, low, high, eb->start, - btrfs_header_owner(eb), btrfs_header_level(eb)); - return -EINVAL; -@@ -925,16 +930,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb, int low, - return 1; - } - --/* -- * Simple binary search on an extent buffer. Works for both leaves and nodes, and -- * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). 
-- */ --int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, -- int *slot) --{ -- return generic_bin_search(eb, 0, key, slot); --} -- - static void root_add_used(struct btrfs_root *root, u32 size) - { - spin_lock(&root->accounting_lock); -@@ -1054,7 +1049,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, - - path->locks[level] = 0; - path->nodes[level] = NULL; -- btrfs_clean_tree_block(mid); -+ btrfs_clear_buffer_dirty(trans, mid); - btrfs_tree_unlock(mid); - /* once for the path */ - free_extent_buffer(mid); -@@ -1115,7 +1110,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, - if (wret < 0 && wret != -ENOSPC) - ret = wret; - if (btrfs_header_nritems(right) == 0) { -- btrfs_clean_tree_block(right); -+ btrfs_clear_buffer_dirty(trans, right); - btrfs_tree_unlock(right); - del_ptr(root, path, level + 1, pslot + 1); - root_sub_used(root, right->len); -@@ -1161,7 +1156,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, - BUG_ON(wret == 1); - } - if (btrfs_header_nritems(mid) == 0) { -- btrfs_clean_tree_block(mid); -+ btrfs_clear_buffer_dirty(trans, mid); - btrfs_tree_unlock(mid); - del_ptr(root, path, level + 1, pslot); - root_sub_used(root, mid->len); -@@ -1869,7 +1864,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb, - return 0; - } - -- return generic_bin_search(eb, search_low_slot, key, slot); -+ return btrfs_generic_bin_search(eb, search_low_slot, key, slot); - } - - static int search_leaf(struct btrfs_trans_handle *trans, -@@ -3041,7 +3036,8 @@ noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) - * min slot controls the lowest index we're willing to push to the - * right. We'll push up to and including min_slot, but no lower - */ --static noinline int __push_leaf_right(struct btrfs_path *path, -+static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, -+ struct btrfs_path *path, - int data_size, int empty, - struct extent_buffer *right, - int free_space, u32 left_nritems, -@@ -3139,7 +3135,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, - if (left_nritems) - btrfs_mark_buffer_dirty(left); - else -- btrfs_clean_tree_block(left); -+ btrfs_clear_buffer_dirty(trans, left); - - btrfs_mark_buffer_dirty(right); - -@@ -3151,7 +3147,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, - if (path->slots[0] >= left_nritems) { - path->slots[0] -= left_nritems; - if (btrfs_header_nritems(path->nodes[0]) == 0) -- btrfs_clean_tree_block(path->nodes[0]); -+ btrfs_clear_buffer_dirty(trans, path->nodes[0]); - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; -@@ -3243,8 +3239,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root - return 0; - } - -- return __push_leaf_right(path, min_data_size, empty, -- right, free_space, left_nritems, min_slot); -+ return __push_leaf_right(trans, path, min_data_size, empty, right, -+ free_space, left_nritems, min_slot); - out_unlock: - btrfs_tree_unlock(right); - free_extent_buffer(right); -@@ -3259,7 +3255,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root - * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us do all the - * items - */ --static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, -+static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, -+ struct btrfs_path *path, int data_size, - int empty, struct extent_buffer *left, - int free_space, u32 right_nritems, - u32 max_slot) -@@ -3363,7 +3360,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, - if (right_nritems) - btrfs_mark_buffer_dirty(right); - else -- btrfs_clean_tree_block(right); -+ btrfs_clear_buffer_dirty(trans, right); - - btrfs_item_key(right, &disk_key, 0); - fixup_low_keys(path, &disk_key, 1); -@@ -3449,9 +3446,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root - ret = -EUCLEAN; - goto out; - } -- return __push_leaf_left(path, min_data_size, -- empty, left, free_space, right_nritems, -- max_slot); -+ return __push_leaf_left(trans, path, min_data_size, empty, left, -+ free_space, right_nritems, max_slot); - out: - btrfs_tree_unlock(left); - free_extent_buffer(left); -@@ -4400,7 +4396,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, - if (leaf == root->node) { - btrfs_set_header_level(leaf, 0); - } else { -- btrfs_clean_tree_block(leaf); -+ btrfs_clear_buffer_dirty(trans, leaf); - btrfs_del_leaf(trans, root, path, leaf); - } - } else { -diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h -index 6965703a81b6..97897107fab5 100644 ---- a/fs/btrfs/ctree.h -+++ b/fs/btrfs/ctree.h -@@ -507,6 +507,21 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); - /* ctree.c */ - int __init btrfs_ctree_init(void); - void __cold btrfs_ctree_exit(void); -+ -+int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, -+ const struct btrfs_key *key, int *slot); -+ -+/* -+ * Simple binary search on an extent buffer. Works for both leaves and nodes, and -+ * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). -+ */ -+static inline int btrfs_bin_search(struct extent_buffer *eb, -+ const struct btrfs_key *key, -+ int *slot) -+{ -+ return btrfs_generic_bin_search(eb, 0, key, slot); -+} -+ - int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, - int *slot); - int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); -diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c -index d81b764a7644..8065341d831a 100644 ---- a/fs/btrfs/defrag.c -+++ b/fs/btrfs/defrag.c -@@ -765,7 +765,7 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t i - break; - - unlock_page(page); -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - lock_page(page); - /* -@@ -999,7 +999,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, - } - - #define CLUSTER_SIZE (SZ_256K) --static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); -+static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); - - /* - * Defrag one contiguous target range. 
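As an illustrative aside to the ctree.c/ctree.h hunks above (not part of the patch): the refactor exposes btrfs_generic_bin_search() and keeps its low/high slots as u32 so that the midpoint division in the search loop stays a cheap unsigned shift. The sketch below shows the shape of that fixed-size-key binary search under assumed, hypothetical names (demo_key, demo_comp_keys, demo_bin_search); the real helper reads btrfs_disk_key values out of the extent buffer pages rather than a plain array.

#include <stdint.h>

/* Illustrative stand-in for a CPU-order btrfs key: (objectid, type, offset). */
struct demo_key {
	uint64_t objectid;
	uint8_t  type;
	uint64_t offset;
};

/* Compare two keys; returns <0, 0 or >0, like btrfs_comp_cpu_keys(). */
int demo_comp_keys(const struct demo_key *a, const struct demo_key *b)
{
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return 0;
}

/*
 * Search keys[first_slot..nritems) for 'key'.  Returns 0 and sets *slot on an
 * exact match, otherwise returns 1 and sets *slot to the insertion point
 * (which may equal nritems).  Using unsigned low/high means (low + high) / 2
 * compiles to a plain shift, which is the efficiency point made in the
 * btrfs_generic_bin_search() comment above.
 */
int demo_bin_search(const struct demo_key *keys, uint32_t nritems,
		    uint32_t first_slot, const struct demo_key *key, int *slot)
{
	uint32_t low = first_slot;
	uint32_t high = nritems;

	while (low < high) {
		uint32_t mid = (low + high) / 2;
		int ret = demo_comp_keys(&keys[mid], key);

		if (ret < 0) {
			low = mid + 1;
		} else if (ret > 0) {
			high = mid;
		} else {
			*slot = (int)mid;
			return 0;
		}
	}
	*slot = (int)low;
	return 1;
}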
-diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c -index 573ebab886e2..886ffb232eac 100644 ---- a/fs/btrfs/delayed-ref.c -+++ b/fs/btrfs/delayed-ref.c -@@ -437,8 +437,7 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, - return 0; - } - --static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, -- struct btrfs_delayed_ref_root *delayed_refs, -+static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head, - struct btrfs_delayed_ref_node *ref) - { -@@ -452,8 +451,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, - atomic_dec(&delayed_refs->num_entries); - } - --static bool merge_ref(struct btrfs_trans_handle *trans, -- struct btrfs_delayed_ref_root *delayed_refs, -+static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head, - struct btrfs_delayed_ref_node *ref, - u64 seq) -@@ -482,10 +480,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, - mod = -next->ref_mod; - } - -- drop_delayed_ref(trans, delayed_refs, head, next); -+ drop_delayed_ref(delayed_refs, head, next); - ref->ref_mod += mod; - if (ref->ref_mod == 0) { -- drop_delayed_ref(trans, delayed_refs, head, ref); -+ drop_delayed_ref(delayed_refs, head, ref); - done = true; - } else { - /* -@@ -499,11 +497,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, - return done; - } - --void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, -+void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) - { -- struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_ref_node *ref; - struct rb_node *node; - u64 seq = 0; -@@ -524,7 +521,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, - ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); - if (seq && ref->seq >= seq) - continue; -- if (merge_ref(trans, delayed_refs, head, ref, seq)) -+ if (merge_ref(delayed_refs, head, ref, seq)) - goto again; - } - } -@@ -601,8 +598,7 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, - * Return 0 for insert. - * Return >0 for merge. 
- */ --static int insert_delayed_ref(struct btrfs_trans_handle *trans, -- struct btrfs_delayed_ref_root *root, -+static int insert_delayed_ref(struct btrfs_delayed_ref_root *root, - struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *ref) - { -@@ -641,7 +637,7 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans, - - /* remove existing tail if its ref_mod is zero */ - if (exist->ref_mod == 0) -- drop_delayed_ref(trans, root, href, exist); -+ drop_delayed_ref(root, href, exist); - spin_unlock(&href->lock); - return ret; - inserted: -@@ -978,7 +974,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, - head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted); - -- ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); -+ ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); - spin_unlock(&delayed_refs->lock); - - /* -@@ -1070,7 +1066,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, - head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted); - -- ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); -+ ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); - spin_unlock(&delayed_refs->lock); - - /* -diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h -index d6304b690ec4..2eb34abf700f 100644 ---- a/fs/btrfs/delayed-ref.h -+++ b/fs/btrfs/delayed-ref.h -@@ -357,7 +357,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, - int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, - struct btrfs_delayed_extent_op *extent_op); --void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, -+void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head); - -diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c -index fde40112a259..b53f0e30ce2b 100644 ---- a/fs/btrfs/disk-io.c -+++ b/fs/btrfs/disk-io.c -@@ -78,23 +78,6 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) - crypto_free_shash(fs_info->csum_shash); - } - --/* -- * async submit bios are used to offload expensive checksumming -- * onto the worker threads. They checksum file and metadata bios -- * just before they are sent down the IO stack. -- */ --struct async_submit_bio { -- struct btrfs_inode *inode; -- struct bio *bio; -- enum btrfs_wq_submit_cmd submit_cmd; -- int mirror_num; -- -- /* Optional parameter for used by direct io */ -- u64 dio_file_offset; -- struct btrfs_work work; -- blk_status_t status; --}; -- - /* - * Compute the csum of a btree block and store the result to provided buffer. 
- */ -@@ -455,6 +438,22 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec - return csum_one_extent_buffer(eb); - } - -+blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) -+{ -+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; -+ struct bvec_iter iter; -+ struct bio_vec bv; -+ int ret = 0; -+ -+ bio_for_each_segment(bv, &bbio->bio, iter) { -+ ret = csum_dirty_buffer(fs_info, &bv); -+ if (ret) -+ break; -+ } -+ -+ return errno_to_blk_status(ret); -+} -+ - static int check_tree_block_fsid(struct extent_buffer *eb) - { - struct btrfs_fs_info *fs_info = eb->fs_info; -@@ -700,172 +699,6 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, - return ret; - } - --static void run_one_async_start(struct btrfs_work *work) --{ -- struct async_submit_bio *async; -- blk_status_t ret; -- -- async = container_of(work, struct async_submit_bio, work); -- switch (async->submit_cmd) { -- case WQ_SUBMIT_METADATA: -- ret = btree_submit_bio_start(async->bio); -- break; -- case WQ_SUBMIT_DATA: -- ret = btrfs_submit_bio_start(async->inode, async->bio); -- break; -- case WQ_SUBMIT_DATA_DIO: -- ret = btrfs_submit_bio_start_direct_io(async->inode, -- async->bio, async->dio_file_offset); -- break; -- } -- if (ret) -- async->status = ret; --} -- --/* -- * In order to insert checksums into the metadata in large chunks, we wait -- * until bio submission time. All the pages in the bio are checksummed and -- * sums are attached onto the ordered extent record. -- * -- * At IO completion time the csums attached on the ordered extent record are -- * inserted into the tree. -- */ --static void run_one_async_done(struct btrfs_work *work) --{ -- struct async_submit_bio *async = -- container_of(work, struct async_submit_bio, work); -- struct btrfs_inode *inode = async->inode; -- struct btrfs_bio *bbio = btrfs_bio(async->bio); -- -- /* If an error occurred we just want to clean up the bio and move on */ -- if (async->status) { -- btrfs_bio_end_io(bbio, async->status); -- return; -- } -- -- /* -- * All of the bios that pass through here are from async helpers. -- * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. -- * This changes nothing when cgroups aren't in use. -- */ -- async->bio->bi_opf |= REQ_CGROUP_PUNT; -- btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num); --} -- --static void run_one_async_free(struct btrfs_work *work) --{ -- struct async_submit_bio *async; -- -- async = container_of(work, struct async_submit_bio, work); -- kfree(async); --} -- --/* -- * Submit bio to an async queue. 
-- * -- * Retrun: -- * - true if the work has been succesfuly submitted -- * - false in case of error -- */ --bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, -- u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) --{ -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- struct async_submit_bio *async; -- -- async = kmalloc(sizeof(*async), GFP_NOFS); -- if (!async) -- return false; -- -- async->inode = inode; -- async->bio = bio; -- async->mirror_num = mirror_num; -- async->submit_cmd = cmd; -- -- btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, -- run_one_async_free); -- -- async->dio_file_offset = dio_file_offset; -- -- async->status = 0; -- -- if (op_is_sync(bio->bi_opf)) -- btrfs_queue_work(fs_info->hipri_workers, &async->work); -- else -- btrfs_queue_work(fs_info->workers, &async->work); -- return true; --} -- --static blk_status_t btree_csum_one_bio(struct bio *bio) --{ -- struct bio_vec *bvec; -- struct btrfs_root *root; -- int ret = 0; -- struct bvec_iter_all iter_all; -- -- ASSERT(!bio_flagged(bio, BIO_CLONED)); -- bio_for_each_segment_all(bvec, bio, iter_all) { -- root = BTRFS_I(bvec->bv_page->mapping->host)->root; -- ret = csum_dirty_buffer(root->fs_info, bvec); -- if (ret) -- break; -- } -- -- return errno_to_blk_status(ret); --} -- --blk_status_t btree_submit_bio_start(struct bio *bio) --{ -- /* -- * when we're called for a write, we're already in the async -- * submission context. Just jump into btrfs_submit_bio. -- */ -- return btree_csum_one_bio(bio); --} -- --static bool should_async_write(struct btrfs_fs_info *fs_info, -- struct btrfs_inode *bi) --{ -- if (btrfs_is_zoned(fs_info)) -- return false; -- if (atomic_read(&bi->sync_writers)) -- return false; -- if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) -- return false; -- return true; --} -- --void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) --{ -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- struct btrfs_bio *bbio = btrfs_bio(bio); -- blk_status_t ret; -- -- bio->bi_opf |= REQ_META; -- bbio->is_metadata = 1; -- -- if (btrfs_op(bio) != BTRFS_MAP_WRITE) { -- btrfs_submit_bio(fs_info, bio, mirror_num); -- return; -- } -- -- /* -- * Kthread helpers are used to submit writes so that checksumming can -- * happen in parallel across all CPUs. 
-- */ -- if (should_async_write(fs_info, inode) && -- btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) -- return; -- -- ret = btree_csum_one_bio(bio); -- if (ret) { -- btrfs_bio_end_io(bbio, ret); -- return; -- } -- -- btrfs_submit_bio(fs_info, bio, mirror_num); --} -- - #ifdef CONFIG_MIGRATION - static int btree_migrate_folio(struct address_space *mapping, - struct folio *dst, struct folio *src, enum migrate_mode mode) -@@ -1035,22 +868,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - - } - --void btrfs_clean_tree_block(struct extent_buffer *buf) --{ -- struct btrfs_fs_info *fs_info = buf->fs_info; -- if (btrfs_header_generation(buf) == -- fs_info->running_transaction->transid) { -- btrfs_assert_tree_write_locked(buf); -- -- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { -- percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -- -buf->len, -- fs_info->dirty_metadata_batch); -- clear_extent_buffer_dirty(buf); -- } -- } --} -- - static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, - u64 objectid) - { -@@ -5162,11 +4979,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, - start += fs_info->nodesize; - if (!eb) - continue; -+ -+ btrfs_tree_lock(eb); - wait_on_extent_buffer_writeback(eb); -+ btrfs_clear_buffer_dirty(NULL, eb); -+ btrfs_tree_unlock(eb); - -- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, -- &eb->bflags)) -- clear_extent_buffer_dirty(eb); - free_extent_buffer_stale(eb); - } - } -diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h -index f2f295eb6103..4d5772330110 100644 ---- a/fs/btrfs/disk-io.h -+++ b/fs/btrfs/disk-io.h -@@ -39,7 +39,8 @@ struct extent_buffer *btrfs_find_create_tree_block( - struct btrfs_fs_info *fs_info, - u64 bytenr, u64 owner_root, - int level); --void btrfs_clean_tree_block(struct extent_buffer *buf); -+void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, -+ struct extent_buffer *buf); - void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); - int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); - int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, -@@ -86,7 +87,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, - int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, - struct page *page, u64 start, u64 end, - int mirror); --void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); - #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); - #endif -@@ -114,15 +114,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, - int btrfs_read_extent_buffer(struct extent_buffer *buf, - struct btrfs_tree_parent_check *check); - --enum btrfs_wq_submit_cmd { -- WQ_SUBMIT_METADATA, -- WQ_SUBMIT_DATA, -- WQ_SUBMIT_DATA_DIO, --}; -- --bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, -- u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); --blk_status_t btree_submit_bio_start(struct bio *bio); -+blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); - int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, - struct btrfs_root *root); - int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, -diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c -index 3c7766dfaa69..29a225836e28 100644 ---- a/fs/btrfs/extent-io-tree.c -+++ b/fs/btrfs/extent-io-tree.c -@@ -972,8 +972,8 @@ static int __set_extent_bit(struct 
extent_io_tree *tree, u64 start, u64 end, - { - struct extent_state *state; - struct extent_state *prealloc = NULL; -- struct rb_node **p; -- struct rb_node *parent; -+ struct rb_node **p = NULL; -+ struct rb_node *parent = NULL; - int err = 0; - u64 last_start; - u64 last_end; -@@ -1218,8 +1218,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - { - struct extent_state *state; - struct extent_state *prealloc = NULL; -- struct rb_node **p; -- struct rb_node *parent; -+ struct rb_node **p = NULL; -+ struct rb_node *parent = NULL; - int err = 0; - u64 last_start; - u64 last_end; -@@ -1625,7 +1625,7 @@ u64 count_range_bits(struct extent_io_tree *tree, - } - - /* -- * Searche a range in the state tree for a given mask. If 'filled' == 1, this -+ * Search a range in the state tree for a given mask. If 'filled' == 1, this - * returns 1 only if every extent in the tree has the bits set. Otherwise, 1 - * is returned if any bit in the range is found set. - */ -diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h -index e3eeec380844..21766e49ec02 100644 ---- a/fs/btrfs/extent-io-tree.h -+++ b/fs/btrfs/extent-io-tree.h -@@ -6,7 +6,6 @@ - #include "misc.h" - - struct extent_changeset; --struct io_failure_record; - - /* Bits for the extent state */ - enum { -diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c -index 72ba13b027a9..824c657f59e8 100644 ---- a/fs/btrfs/extent-tree.c -+++ b/fs/btrfs/extent-tree.c -@@ -16,7 +16,8 @@ - #include - #include - #include --#include "misc.h" -+#include "ctree.h" -+#include "extent-tree.h" - #include "tree-log.h" - #include "disk-io.h" - #include "print-tree.h" -@@ -31,14 +32,12 @@ - #include "space-info.h" - #include "block-rsv.h" - #include "delalloc-space.h" --#include "block-group.h" - #include "discard.h" - #include "rcu-string.h" - #include "zoned.h" - #include "dev-replace.h" - #include "fs.h" - #include "accessors.h" --#include "extent-tree.h" - #include "root-tree.h" - #include "file-item.h" - #include "orphan.h" -@@ -1966,7 +1965,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, - cond_resched(); - - spin_lock(&locked_ref->lock); -- btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); -+ btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); - } - - return 0; -@@ -2013,7 +2012,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - * insert_inline_extent_backref()). - */ - spin_lock(&locked_ref->lock); -- btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); -+ btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); - - ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, - &actual_count); -@@ -3385,7 +3384,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) - enum btrfs_loop_type { - LOOP_CACHING_NOWAIT, - LOOP_CACHING_WAIT, -+ LOOP_UNSET_SIZE_CLASS, - LOOP_ALLOC_CHUNK, -+ LOOP_WRONG_SIZE_CLASS, - LOOP_NO_EMPTY_SIZE, - }; - -@@ -3453,81 +3454,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, - btrfs_put_block_group(cache); - } - --enum btrfs_extent_allocation_policy { -- BTRFS_EXTENT_ALLOC_CLUSTERED, -- BTRFS_EXTENT_ALLOC_ZONED, --}; -- --/* -- * Structure used internally for find_free_extent() function. Wraps needed -- * parameters. 
-- */ --struct find_free_extent_ctl { -- /* Basic allocation info */ -- u64 ram_bytes; -- u64 num_bytes; -- u64 min_alloc_size; -- u64 empty_size; -- u64 flags; -- int delalloc; -- -- /* Where to start the search inside the bg */ -- u64 search_start; -- -- /* For clustered allocation */ -- u64 empty_cluster; -- struct btrfs_free_cluster *last_ptr; -- bool use_cluster; -- -- bool have_caching_bg; -- bool orig_have_caching_bg; -- -- /* Allocation is called for tree-log */ -- bool for_treelog; -- -- /* Allocation is called for data relocation */ -- bool for_data_reloc; -- -- /* RAID index, converted from flags */ -- int index; -- -- /* -- * Current loop number, check find_free_extent_update_loop() for details -- */ -- int loop; -- -- /* -- * Whether we're refilling a cluster, if true we need to re-search -- * current block group but don't try to refill the cluster again. -- */ -- bool retry_clustered; -- -- /* -- * Whether we're updating free space cache, if true we need to re-search -- * current block group but don't try updating free space cache again. -- */ -- bool retry_unclustered; -- -- /* If current block group is cached */ -- int cached; -- -- /* Max contiguous hole found */ -- u64 max_extent_size; -- -- /* Total free space from free space cache, not always contiguous */ -- u64 total_free_space; -- -- /* Found result */ -- u64 found_offset; -- -- /* Hint where to start looking for an empty space */ -- u64 hint_byte; -- -- /* Allocation policy */ -- enum btrfs_extent_allocation_policy policy; --}; -- -- - /* - * Helper function for find_free_extent(). - * -@@ -3559,8 +3485,7 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, - if (offset) { - /* We have a block, we're done */ - spin_unlock(&last_ptr->refill_lock); -- trace_btrfs_reserve_extent_cluster(cluster_bg, -- ffe_ctl->search_start, ffe_ctl->num_bytes); -+ trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); - *cluster_bg_ret = cluster_bg; - ffe_ctl->found_offset = offset; - return 0; -@@ -3610,10 +3535,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, - if (offset) { - /* We found one, proceed */ - spin_unlock(&last_ptr->refill_lock); -- trace_btrfs_reserve_extent_cluster(bg, -- ffe_ctl->search_start, -- ffe_ctl->num_bytes); - ffe_ctl->found_offset = offset; -+ trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); - return 0; - } - } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && -@@ -4028,24 +3951,6 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, - } - } - --static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) --{ -- switch (ffe_ctl->policy) { -- case BTRFS_EXTENT_ALLOC_CLUSTERED: -- /* -- * If we can't allocate a new chunk we've already looped through -- * at least once, move on to the NO_EMPTY_SIZE case. -- */ -- ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; -- return 0; -- case BTRFS_EXTENT_ALLOC_ZONED: -- /* Give up here */ -- return -ENOSPC; -- default: -- BUG(); -- } --} -- - /* - * Return >0 means caller needs to re-search for free extent - * Return 0 means we have the needed free extent. 
-@@ -4079,31 +3984,28 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, - * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking - * caching kthreads as we move along - * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching -+ * LOOP_UNSET_SIZE_CLASS, allow unset size class - * LOOP_ALLOC_CHUNK, force a chunk allocation and try again - * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try - * again - */ - if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { - ffe_ctl->index = 0; -- if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { -- /* -- * We want to skip the LOOP_CACHING_WAIT step if we -- * don't have any uncached bgs and we've already done a -- * full search through. -- */ -- if (ffe_ctl->orig_have_caching_bg || !full_search) -- ffe_ctl->loop = LOOP_CACHING_WAIT; -- else -- ffe_ctl->loop = LOOP_ALLOC_CHUNK; -- } else { -+ /* -+ * We want to skip the LOOP_CACHING_WAIT step if we don't have -+ * any uncached bgs and we've already done a full search -+ * through. -+ */ -+ if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && -+ (!ffe_ctl->orig_have_caching_bg && full_search)) - ffe_ctl->loop++; -- } -+ ffe_ctl->loop++; - - if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { - struct btrfs_trans_handle *trans; - int exist = 0; - -- /*Check if allocation policy allows to create a new chunk */ -+ /* Check if allocation policy allows to create a new chunk */ - ret = can_allocate_chunk(fs_info, ffe_ctl); - if (ret) - return ret; -@@ -4123,8 +4025,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, - CHUNK_ALLOC_FORCE_FOR_EXTENT); - - /* Do not bail out on ENOSPC since we can do more. */ -- if (ret == -ENOSPC) -- ret = chunk_allocation_failed(ffe_ctl); -+ if (ret == -ENOSPC) { -+ ret = 0; -+ ffe_ctl->loop++; -+ } - else if (ret < 0) - btrfs_abort_transaction(trans, ret); - else -@@ -4154,6 +4058,21 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, - return -ENOSPC; - } - -+static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, -+ struct btrfs_block_group *bg) -+{ -+ if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) -+ return true; -+ if (!btrfs_block_group_should_use_size_class(bg)) -+ return true; -+ if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) -+ return true; -+ if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && -+ bg->size_class == BTRFS_BG_SZ_NONE) -+ return true; -+ return ffe_ctl->size_class == bg->size_class; -+} -+ - static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, - struct find_free_extent_ctl *ffe_ctl, - struct btrfs_space_info *space_info, -@@ -4288,6 +4207,7 @@ static noinline int find_free_extent(struct btrfs_root *root, - ffe_ctl->total_free_space = 0; - ffe_ctl->found_offset = 0; - ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; -+ ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); - - if (btrfs_is_zoned(fs_info)) - ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; -@@ -4296,8 +4216,7 @@ static noinline int find_free_extent(struct btrfs_root *root, - ins->objectid = 0; - ins->offset = 0; - -- trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size, -- ffe_ctl->flags); -+ trace_find_free_extent(root, ffe_ctl); - - space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); - if (!space_info) { -@@ -4340,6 +4259,7 @@ static noinline int find_free_extent(struct btrfs_root *root, - block_group->flags); - btrfs_lock_block_group(block_group, - ffe_ctl->delalloc); -+ ffe_ctl->hinted = true; - goto have_block_group; - } - } else if 
(block_group) { -@@ -4347,6 +4267,7 @@ static noinline int find_free_extent(struct btrfs_root *root, - } - } - search: -+ trace_find_free_extent_search_loop(root, ffe_ctl); - ffe_ctl->have_caching_bg = false; - if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || - ffe_ctl->index == 0) -@@ -4356,6 +4277,7 @@ static noinline int find_free_extent(struct btrfs_root *root, - &space_info->block_groups[ffe_ctl->index], list) { - struct btrfs_block_group *bg_ret; - -+ ffe_ctl->hinted = false; - /* If the block group is read-only, we can skip it entirely. */ - if (unlikely(block_group->ro)) { - if (ffe_ctl->for_treelog) -@@ -4397,6 +4319,7 @@ static noinline int find_free_extent(struct btrfs_root *root, - } - - have_block_group: -+ trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); - ffe_ctl->cached = btrfs_block_group_done(block_group); - if (unlikely(!ffe_ctl->cached)) { - ffe_ctl->have_caching_bg = true; -@@ -4421,6 +4344,9 @@ static noinline int find_free_extent(struct btrfs_root *root, - if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) - goto loop; - -+ if (!find_free_extent_check_size_class(ffe_ctl, block_group)) -+ goto loop; -+ - bg_ret = NULL; - ret = do_allocation(block_group, ffe_ctl, &bg_ret); - if (ret == 0) { -@@ -4455,7 +4381,8 @@ static noinline int find_free_extent(struct btrfs_root *root, - - ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, - ffe_ctl->num_bytes, -- ffe_ctl->delalloc); -+ ffe_ctl->delalloc, -+ ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); - if (ret == -EAGAIN) { - btrfs_add_free_space_unused(block_group, - ffe_ctl->found_offset, -@@ -4468,8 +4395,7 @@ static noinline int find_free_extent(struct btrfs_root *root, - ins->objectid = ffe_ctl->search_start; - ins->offset = ffe_ctl->num_bytes; - -- trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start, -- ffe_ctl->num_bytes); -+ trace_btrfs_reserve_extent(block_group, ffe_ctl); - btrfs_release_block_group(block_group, ffe_ctl->delalloc); - break; - loop: -@@ -4912,7 +4838,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); - - __btrfs_tree_lock(buf, nest); -- btrfs_clean_tree_block(buf); -+ btrfs_clear_buffer_dirty(trans, buf); - clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); - clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); - -@@ -5542,13 +5468,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, - } - } - } -- /* make block locked assertion in btrfs_clean_tree_block happy */ -- if (!path->locks[level] && -- btrfs_header_generation(eb) == trans->transid) { -+ /* Make block locked assertion in btrfs_clear_buffer_dirty happy. 
*/ -+ if (!path->locks[level]) { - btrfs_tree_lock(eb); - path->locks[level] = BTRFS_WRITE_LOCK; - } -- btrfs_clean_tree_block(eb); -+ btrfs_clear_buffer_dirty(trans, eb); - } - - if (eb == root->node) { -diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h -index ae5425253603..0c958fc1b3b8 100644 ---- a/fs/btrfs/extent-tree.h -+++ b/fs/btrfs/extent-tree.h -@@ -3,6 +3,87 @@ - #ifndef BTRFS_EXTENT_TREE_H - #define BTRFS_EXTENT_TREE_H - -+#include "misc.h" -+#include "block-group.h" -+ -+struct btrfs_free_cluster; -+ -+enum btrfs_extent_allocation_policy { -+ BTRFS_EXTENT_ALLOC_CLUSTERED, -+ BTRFS_EXTENT_ALLOC_ZONED, -+}; -+ -+struct find_free_extent_ctl { -+ /* Basic allocation info */ -+ u64 ram_bytes; -+ u64 num_bytes; -+ u64 min_alloc_size; -+ u64 empty_size; -+ u64 flags; -+ int delalloc; -+ -+ /* Where to start the search inside the bg */ -+ u64 search_start; -+ -+ /* For clustered allocation */ -+ u64 empty_cluster; -+ struct btrfs_free_cluster *last_ptr; -+ bool use_cluster; -+ -+ bool have_caching_bg; -+ bool orig_have_caching_bg; -+ -+ /* Allocation is called for tree-log */ -+ bool for_treelog; -+ -+ /* Allocation is called for data relocation */ -+ bool for_data_reloc; -+ -+ /* RAID index, converted from flags */ -+ int index; -+ -+ /* -+ * Current loop number, check find_free_extent_update_loop() for details -+ */ -+ int loop; -+ -+ /* -+ * Whether we're refilling a cluster, if true we need to re-search -+ * current block group but don't try to refill the cluster again. -+ */ -+ bool retry_clustered; -+ -+ /* -+ * Whether we're updating free space cache, if true we need to re-search -+ * current block group but don't try updating free space cache again. -+ */ -+ bool retry_unclustered; -+ -+ /* If current block group is cached */ -+ int cached; -+ -+ /* Max contiguous hole found */ -+ u64 max_extent_size; -+ -+ /* Total free space from free space cache, not always contiguous */ -+ u64 total_free_space; -+ -+ /* Found result */ -+ u64 found_offset; -+ -+ /* Hint where to start looking for an empty space */ -+ u64 hint_byte; -+ -+ /* Allocation policy */ -+ enum btrfs_extent_allocation_policy policy; -+ -+ /* Whether or not the allocator is currently following a hint */ -+ bool hinted; -+ -+ /* Size class of block groups to prefer in early loops */ -+ enum btrfs_block_group_size_class size_class; -+}; -+ - enum btrfs_inline_ref_type { - BTRFS_REF_TYPE_INVALID, - BTRFS_REF_TYPE_BLOCK, -diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c -index 3bbf8703db2a..c25fa74d7615 100644 ---- a/fs/btrfs/extent_io.c -+++ b/fs/btrfs/extent_io.c -@@ -36,6 +36,7 @@ - #include "file.h" - #include "dev-replace.h" - #include "super.h" -+#include "transaction.h" - - static struct kmem_cache *extent_buffer_cache; - -@@ -99,7 +100,6 @@ struct btrfs_bio_ctrl { - struct bio *bio; - int mirror_num; - enum btrfs_compression_type compress_type; -- u32 len_to_stripe_boundary; - u32 len_to_oe_boundary; - btrfs_bio_end_io_t end_io_func; - -@@ -126,7 +126,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) - { - struct bio *bio; - struct bio_vec *bv; -- struct btrfs_inode *inode; -+ struct inode *inode; - int mirror_num; - - if (!bio_ctrl->bio) -@@ -134,15 +134,13 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) - - bio = bio_ctrl->bio; - bv = bio_first_bvec_all(bio); -- inode = BTRFS_I(bv->bv_page->mapping->host); -+ inode = bv->bv_page->mapping->host; - mirror_num = bio_ctrl->mirror_num; - - /* Caller should ensure the bio has at least some range added */ - 
ASSERT(bio->bi_iter.bi_size); - -- btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; -- -- if (!is_data_inode(&inode->vfs_inode)) { -+ if (!is_data_inode(inode)) { - if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - /* - * For metadata read, we should have the parent_check, -@@ -153,14 +151,15 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) - bio_ctrl->parent_check, - sizeof(struct btrfs_tree_parent_check)); - } -- btrfs_submit_metadata_bio(inode, bio, mirror_num); -- } else if (btrfs_op(bio) == BTRFS_MAP_WRITE) { -- btrfs_submit_data_write_bio(inode, bio, mirror_num); -- } else { -- btrfs_submit_data_read_bio(inode, bio, mirror_num, -- bio_ctrl->compress_type); -+ bio->bi_opf |= REQ_META; - } - -+ if (btrfs_op(bio) == BTRFS_MAP_READ && -+ bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) -+ btrfs_submit_compressed_read(inode, bio, mirror_num); -+ else -+ btrfs_submit_bio(bio, mirror_num); -+ - /* The bio is owned by the end_io handler now */ - bio_ctrl->bio = NULL; - } -@@ -515,266 +514,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - start, end, page_ops, NULL); - } - --static int insert_failrec(struct btrfs_inode *inode, -- struct io_failure_record *failrec) --{ -- struct rb_node *exist; -- -- spin_lock(&inode->io_failure_lock); -- exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr, -- &failrec->rb_node); -- spin_unlock(&inode->io_failure_lock); -- -- return (exist == NULL) ? 0 : -EEXIST; --} -- --static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start) --{ -- struct rb_node *node; -- struct io_failure_record *failrec = ERR_PTR(-ENOENT); -- -- spin_lock(&inode->io_failure_lock); -- node = rb_simple_search(&inode->io_failure_tree, start); -- if (node) -- failrec = rb_entry(node, struct io_failure_record, rb_node); -- spin_unlock(&inode->io_failure_lock); -- return failrec; --} -- --static void free_io_failure(struct btrfs_inode *inode, -- struct io_failure_record *rec) --{ -- spin_lock(&inode->io_failure_lock); -- rb_erase(&rec->rb_node, &inode->io_failure_tree); -- spin_unlock(&inode->io_failure_lock); -- -- kfree(rec); --} -- --static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) --{ -- if (cur_mirror == failrec->num_copies) -- return cur_mirror + 1 - failrec->num_copies; -- return cur_mirror + 1; --} -- --static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) --{ -- if (cur_mirror == 1) -- return failrec->num_copies; -- return cur_mirror - 1; --} -- --/* -- * each time an IO finishes, we do a fast check in the IO failure tree -- * to see if we need to process or clean up an io_failure_record -- */ --int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, -- struct page *page, unsigned int pg_offset) --{ -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- struct extent_io_tree *io_tree = &inode->io_tree; -- u64 ino = btrfs_ino(inode); -- u64 locked_start, locked_end; -- struct io_failure_record *failrec; -- int mirror; -- int ret; -- -- failrec = get_failrec(inode, start); -- if (IS_ERR(failrec)) -- return 0; -- -- BUG_ON(!failrec->this_mirror); -- -- if (sb_rdonly(fs_info->sb)) -- goto out; -- -- ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start, -- &locked_end, EXTENT_LOCKED, NULL); -- if (ret || locked_start > failrec->bytenr || -- locked_end < failrec->bytenr + failrec->len - 1) -- goto out; -- -- mirror = failrec->this_mirror; -- do { -- mirror = prev_mirror(failrec, mirror); -- 
btrfs_repair_io_failure(fs_info, ino, start, failrec->len, -- failrec->logical, page, pg_offset, mirror); -- } while (mirror != failrec->failed_mirror); -- --out: -- free_io_failure(inode, failrec); -- return 0; --} -- --/* -- * Can be called when -- * - hold extent lock -- * - under ordered extent -- * - the inode is freeing -- */ --void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) --{ -- struct io_failure_record *failrec; -- struct rb_node *node, *next; -- -- if (RB_EMPTY_ROOT(&inode->io_failure_tree)) -- return; -- -- spin_lock(&inode->io_failure_lock); -- node = rb_simple_search_first(&inode->io_failure_tree, start); -- while (node) { -- failrec = rb_entry(node, struct io_failure_record, rb_node); -- if (failrec->bytenr > end) -- break; -- -- next = rb_next(node); -- rb_erase(&failrec->rb_node, &inode->io_failure_tree); -- kfree(failrec); -- -- node = next; -- } -- spin_unlock(&inode->io_failure_lock); --} -- --static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, -- struct btrfs_bio *bbio, -- unsigned int bio_offset) --{ -- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -- u64 start = bbio->file_offset + bio_offset; -- struct io_failure_record *failrec; -- const u32 sectorsize = fs_info->sectorsize; -- int ret; -- -- failrec = get_failrec(BTRFS_I(inode), start); -- if (!IS_ERR(failrec)) { -- btrfs_debug(fs_info, -- "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", -- failrec->logical, failrec->bytenr, failrec->len); -- /* -- * when data can be on disk more than twice, add to failrec here -- * (e.g. with a list for failed_mirror) to make -- * clean_io_failure() clean all those errors at once. -- */ -- ASSERT(failrec->this_mirror == bbio->mirror_num); -- ASSERT(failrec->len == fs_info->sectorsize); -- return failrec; -- } -- -- failrec = kzalloc(sizeof(*failrec), GFP_NOFS); -- if (!failrec) -- return ERR_PTR(-ENOMEM); -- -- RB_CLEAR_NODE(&failrec->rb_node); -- failrec->bytenr = start; -- failrec->len = sectorsize; -- failrec->failed_mirror = bbio->mirror_num; -- failrec->this_mirror = bbio->mirror_num; -- failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset; -- -- btrfs_debug(fs_info, -- "new io failure record logical %llu start %llu", -- failrec->logical, start); -- -- failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize); -- if (failrec->num_copies == 1) { -- /* -- * We only have a single copy of the data, so don't bother with -- * all the retry and error correction code that follows. No -- * matter what the error is, it is very likely to persist. 
-- */ -- btrfs_debug(fs_info, -- "cannot repair logical %llu num_copies %d", -- failrec->logical, failrec->num_copies); -- kfree(failrec); -- return ERR_PTR(-EIO); -- } -- -- /* Set the bits in the private failure tree */ -- ret = insert_failrec(BTRFS_I(inode), failrec); -- if (ret) { -- kfree(failrec); -- return ERR_PTR(ret); -- } -- -- return failrec; --} -- --int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, -- u32 bio_offset, struct page *page, unsigned int pgoff, -- bool submit_buffered) --{ -- u64 start = failed_bbio->file_offset + bio_offset; -- struct io_failure_record *failrec; -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- struct bio *failed_bio = &failed_bbio->bio; -- const int icsum = bio_offset >> fs_info->sectorsize_bits; -- struct bio *repair_bio; -- struct btrfs_bio *repair_bbio; -- -- btrfs_debug(fs_info, -- "repair read error: read error at %llu", start); -- -- BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); -- -- failrec = btrfs_get_io_failure_record(&inode->vfs_inode, failed_bbio, bio_offset); -- if (IS_ERR(failrec)) -- return PTR_ERR(failrec); -- -- /* -- * There are two premises: -- * a) deliver good data to the caller -- * b) correct the bad sectors on disk -- * -- * Since we're only doing repair for one sector, we only need to get -- * a good copy of the failed sector and if we succeed, we have setup -- * everything for btrfs_repair_io_failure to do the rest for us. -- */ -- failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); -- if (failrec->this_mirror == failrec->failed_mirror) { -- btrfs_debug(fs_info, -- "failed to repair num_copies %d this_mirror %d failed_mirror %d", -- failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); -- free_io_failure(inode, failrec); -- return -EIO; -- } -- -- repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io, -- failed_bbio->private); -- repair_bbio = btrfs_bio(repair_bio); -- repair_bbio->file_offset = start; -- repair_bio->bi_iter.bi_sector = failrec->logical >> 9; -- -- if (failed_bbio->csum) { -- const u32 csum_size = fs_info->csum_size; -- -- repair_bbio->csum = repair_bbio->csum_inline; -- memcpy(repair_bbio->csum, -- failed_bbio->csum + csum_size * icsum, csum_size); -- } -- -- bio_add_page(repair_bio, page, failrec->len, pgoff); -- repair_bbio->iter = repair_bio->bi_iter; -- -- btrfs_debug(fs_info, -- "repair read error: submitting new read to mirror %d", -- failrec->this_mirror); -- -- /* -- * At this point we have a bio, so any errors from bio submission will -- * be handled by the endio on the repair_bio, so we can't return an -- * error here. 
-- */ -- if (submit_buffered) -- btrfs_submit_data_read_bio(inode, repair_bio, -- failrec->this_mirror, 0); -- else -- btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); -- -- return BLK_STS_OK; --} -- - static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) - { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); -@@ -803,79 +542,6 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) - btrfs_subpage_end_reader(fs_info, page, start, len); - } - --static void end_sector_io(struct page *page, u64 offset, bool uptodate) --{ -- struct btrfs_inode *inode = BTRFS_I(page->mapping->host); -- const u32 sectorsize = inode->root->fs_info->sectorsize; -- -- end_page_read(page, uptodate, offset, sectorsize); -- unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL); --} -- --static void submit_data_read_repair(struct inode *inode, -- struct btrfs_bio *failed_bbio, -- u32 bio_offset, const struct bio_vec *bvec, -- unsigned int error_bitmap) --{ -- const unsigned int pgoff = bvec->bv_offset; -- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -- struct page *page = bvec->bv_page; -- const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset; -- const u64 end = start + bvec->bv_len - 1; -- const u32 sectorsize = fs_info->sectorsize; -- const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; -- int i; -- -- BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE); -- -- /* This repair is only for data */ -- ASSERT(is_data_inode(inode)); -- -- /* We're here because we had some read errors or csum mismatch */ -- ASSERT(error_bitmap); -- -- /* -- * We only get called on buffered IO, thus page must be mapped and bio -- * must not be cloned. -- */ -- ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED)); -- -- /* Iterate through all the sectors in the range */ -- for (i = 0; i < nr_bits; i++) { -- const unsigned int offset = i * sectorsize; -- bool uptodate = false; -- int ret; -- -- if (!(error_bitmap & (1U << i))) { -- /* -- * This sector has no error, just end the page read -- * and unlock the range. -- */ -- uptodate = true; -- goto next; -- } -- -- ret = btrfs_repair_one_sector(BTRFS_I(inode), failed_bbio, -- bio_offset + offset, page, pgoff + offset, -- true); -- if (!ret) { -- /* -- * We have submitted the read repair, the page release -- * will be handled by the endio function of the -- * submitted repair bio. -- * Thus we don't need to do any thing here. -- */ -- continue; -- } -- /* -- * Continue on failed repair, otherwise the remaining sectors -- * will not be properly unlocked. 
-- */ --next: -- end_sector_io(page, start + offset, uptodate); -- } --} -- - /* lots and lots of room for performance fixes in the end_bio funcs */ - - void end_extent_writepage(struct page *page, int err, u64 start, u64 end) -@@ -919,7 +585,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) - u64 start; - u64 end; - struct bvec_iter_all iter_all; -- bool first_bvec = true; - - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { -@@ -941,11 +606,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) - start = page_offset(page) + bvec->bv_offset; - end = start + bvec->bv_len - 1; - -- if (first_bvec) { -- btrfs_record_physical_zoned(inode, start, bio); -- first_bvec = false; -- } -- - end_extent_writepage(page, error, start, end); - - btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); -@@ -1093,8 +753,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const u32 sectorsize = fs_info->sectorsize; -- unsigned int error_bitmap = (unsigned int)-1; -- bool repair = false; - u64 start; - u64 end; - u32 len; -@@ -1126,25 +784,14 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) - len = bvec->bv_len; - - mirror = bbio->mirror_num; -- if (likely(uptodate)) { -- if (is_data_inode(inode)) { -- error_bitmap = btrfs_verify_data_csum(bbio, -- bio_offset, page, start, end); -- if (error_bitmap) -- uptodate = false; -- } else { -- if (btrfs_validate_metadata_buffer(bbio, -- page, start, end, mirror)) -- uptodate = false; -- } -- } -+ if (uptodate && !is_data_inode(inode) && -+ btrfs_validate_metadata_buffer(bbio, page, start, end, mirror)) -+ uptodate = false; - - if (likely(uptodate)) { - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; - -- btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0); -- - /* - * Zero out the remaining part if this range straddles - * i_size. -@@ -1161,19 +808,7 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) - zero_user_segment(page, zero_start, - offset_in_page(end) + 1); - } -- } else if (is_data_inode(inode)) { -- /* -- * Only try to repair bios that actually made it to a -- * device. If the bio failed to be submitted mirror -- * is 0 and we need to fail it without retrying. -- * -- * This also includes the high level bios for compressed -- * extents - these never make it to a device and repair -- * is already handled on the lower compressed bio. -- */ -- if (mirror > 0) -- repair = true; -- } else { -+ } else if (!is_data_inode(inode)) { - struct extent_buffer *eb; - - eb = find_extent_buffer_readpage(fs_info, page, start); -@@ -1182,19 +817,10 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) - atomic_dec(&eb->io_pages); - } - -- if (repair) { -- /* -- * submit_data_read_repair() will handle all the good -- * and bad sectors, we just continue to the next bvec. -- */ -- submit_data_read_repair(inode, bbio, bio_offset, bvec, -- error_bitmap); -- } else { -- /* Update page status and unlock */ -- end_page_read(page, uptodate, start, len); -- endio_readpage_release_extent(&processed, BTRFS_I(inode), -- start, end, PageUptodate(page)); -- } -+ /* Update page status and unlock. 
*/ -+ end_page_read(page, uptodate, start, len); -+ endio_readpage_release_extent(&processed, BTRFS_I(inode), -+ start, end, PageUptodate(page)); - - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; -@@ -1202,7 +828,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) - } - /* Release the last extent */ - endio_readpage_release_extent(&processed, NULL, 0, 0, false); -- btrfs_bio_free_csum(bbio); - bio_put(bio); - } - -@@ -1270,11 +895,10 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, - u32 real_size; - const sector_t sector = disk_bytenr >> SECTOR_SHIFT; - bool contig = false; -- int ret; - - ASSERT(bio); - /* The limit should be calculated when bio_ctrl->bio is allocated */ -- ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); -+ ASSERT(bio_ctrl->len_to_oe_boundary); - if (bio_ctrl->compress_type != compress_type) - return 0; - -@@ -1310,9 +934,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, - if (!contig) - return 0; - -- real_size = min(bio_ctrl->len_to_oe_boundary, -- bio_ctrl->len_to_stripe_boundary) - bio_size; -- real_size = min(real_size, size); -+ real_size = min(bio_ctrl->len_to_oe_boundary - bio_size, size); - - /* - * If real_size is 0, never call bio_add_*_page(), as even size is 0, -@@ -1321,82 +943,45 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, - if (real_size == 0) - return 0; - -- if (bio_op(bio) == REQ_OP_ZONE_APPEND) -- ret = bio_add_zone_append_page(bio, page, real_size, pg_offset); -- else -- ret = bio_add_page(bio, page, real_size, pg_offset); -- -- return ret; -+ return bio_add_page(bio, page, real_size, pg_offset); - } - --static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, -- struct btrfs_inode *inode, u64 file_offset) -+static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, -+ struct btrfs_inode *inode, u64 file_offset) - { -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- struct btrfs_io_geometry geom; - struct btrfs_ordered_extent *ordered; -- struct extent_map *em; -- u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT); -- int ret; - - /* -- * Pages for compressed extent are never submitted to disk directly, -- * thus it has no real boundary, just set them to U32_MAX. -- * -- * The split happens for real compressed bio, which happens in -- * btrfs_submit_compressed_read/write(). -+ * Limit the extent to the ordered boundary for Zone Append. -+ * Compressed bios aren't submitted directly, so it doesn't apply to -+ * them. 
- */ -- if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { -- bio_ctrl->len_to_oe_boundary = U32_MAX; -- bio_ctrl->len_to_stripe_boundary = U32_MAX; -- return 0; -- } -- em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); -- if (IS_ERR(em)) -- return PTR_ERR(em); -- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio), -- logical, &geom); -- free_extent_map(em); -- if (ret < 0) { -- return ret; -- } -- if (geom.len > U32_MAX) -- bio_ctrl->len_to_stripe_boundary = U32_MAX; -- else -- bio_ctrl->len_to_stripe_boundary = (u32)geom.len; -- -- if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { -- bio_ctrl->len_to_oe_boundary = U32_MAX; -- return 0; -- } -- -- /* Ordered extent not yet created, so we're good */ -- ordered = btrfs_lookup_ordered_extent(inode, file_offset); -- if (!ordered) { -- bio_ctrl->len_to_oe_boundary = U32_MAX; -- return 0; -+ if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && -+ btrfs_use_zone_append(btrfs_bio(bio_ctrl->bio))) { -+ ordered = btrfs_lookup_ordered_extent(inode, file_offset); -+ if (ordered) { -+ bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, -+ ordered->file_offset + -+ ordered->disk_num_bytes - file_offset); -+ btrfs_put_ordered_extent(ordered); -+ return; -+ } - } - -- bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, -- ordered->disk_bytenr + ordered->disk_num_bytes - logical); -- btrfs_put_ordered_extent(ordered); -- return 0; -+ bio_ctrl->len_to_oe_boundary = U32_MAX; - } - --static int alloc_new_bio(struct btrfs_inode *inode, -- struct btrfs_bio_ctrl *bio_ctrl, -- struct writeback_control *wbc, -- blk_opf_t opf, -- u64 disk_bytenr, u32 offset, u64 file_offset, -- enum btrfs_compression_type compress_type) -+static void alloc_new_bio(struct btrfs_inode *inode, -+ struct btrfs_bio_ctrl *bio_ctrl, -+ struct writeback_control *wbc, blk_opf_t opf, -+ u64 disk_bytenr, u32 offset, u64 file_offset, -+ enum btrfs_compression_type compress_type) - { - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct bio *bio; -- int ret; - -- ASSERT(bio_ctrl->end_io_func); -- -- bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL); -+ bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, inode, bio_ctrl->end_io_func, -+ NULL); - /* - * For compressed page range, its disk_bytenr is always @disk_bytenr - * passed in, no matter if we have added any range into previous bio. -@@ -1405,48 +990,21 @@ static int alloc_new_bio(struct btrfs_inode *inode, - bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; - else - bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; -+ btrfs_bio(bio)->file_offset = file_offset; - bio_ctrl->bio = bio; - bio_ctrl->compress_type = compress_type; -- ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); -- if (ret < 0) -- goto error; -+ calc_bio_boundaries(bio_ctrl, inode, file_offset); - - if (wbc) { - /* -- * For Zone append we need the correct block_device that we are -- * going to write to set in the bio to be able to respect the -- * hardware limitation. Look it up here: -+ * Pick the last added device to support cgroup writeback. For -+ * multi-device file systems this means blk-cgroup policies have -+ * to always be set on the last added/replaced device. -+ * This is a bit odd but has been like that for a long time. 
- */ -- if (bio_op(bio) == REQ_OP_ZONE_APPEND) { -- struct btrfs_device *dev; -- -- dev = btrfs_zoned_get_device(fs_info, disk_bytenr, -- fs_info->sectorsize); -- if (IS_ERR(dev)) { -- ret = PTR_ERR(dev); -- goto error; -- } -- -- bio_set_dev(bio, dev->bdev); -- } else { -- /* -- * Otherwise pick the last added device to support -- * cgroup writeback. For multi-device file systems this -- * means blk-cgroup policies have to always be set on the -- * last added/replaced device. This is a bit odd but has -- * been like that for a long time. -- */ -- bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); -- } -+ bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); - wbc_init_bio(wbc, bio); -- } else { -- ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); - } -- return 0; --error: -- bio_ctrl->bio = NULL; -- btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); -- return ret; - } - - /* -@@ -1472,7 +1030,6 @@ static int submit_extent_page(blk_opf_t opf, - enum btrfs_compression_type compress_type, - bool force_bio_submit) - { -- int ret = 0; - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - unsigned int cur = pg_offset; - -@@ -1492,12 +1049,9 @@ static int submit_extent_page(blk_opf_t opf, - - /* Allocate new bio if needed */ - if (!bio_ctrl->bio) { -- ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, -- disk_bytenr, offset, -- page_offset(page) + cur, -- compress_type); -- if (ret < 0) -- return ret; -+ alloc_new_bio(inode, bio_ctrl, wbc, opf, disk_bytenr, -+ offset, page_offset(page) + cur, -+ compress_type); - } - /* - * We must go through btrfs_bio_add_page() to ensure each -@@ -2054,10 +1608,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, - * find_next_dirty_byte() are all exclusive - */ - iosize = min(min(em_end, end + 1), dirty_range_end) - cur; -- -- if (btrfs_use_zone_append(inode, em->block_start)) -- op = REQ_OP_ZONE_APPEND; -- - free_extent_map(em); - em = NULL; - -@@ -2360,13 +1910,6 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) - */ - mapping_set_error(page->mapping, -EIO); - -- /* -- * If we error out, we should add back the dirty_metadata_bytes -- * to make it consistent. -- */ -- percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -- eb->len, fs_info->dirty_metadata_batch); -- - /* - * If writeback for a btree extent that doesn't belong to a log tree - * failed, increment the counter transaction->eb_write_errors. 
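The two hunks around this point shift the dirty_metadata_bytes accounting: the set_btree_ioerr() hunk above drops the "add back on error" adjustment, and the btrfs_clear_buffer_dirty() hunk below subtracts eb->len only after test_and_clear_bit() confirms the dirty flag was actually cleared. A minimal userspace sketch of that guard pattern, using hypothetical stand-ins (struct buffer, dirty_metadata_bytes) rather than the real kernel types, might look like this:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the extent-buffer dirty bit and the
 * fs-wide dirty_metadata_bytes counter referenced by the hunk below. */
struct buffer {
    atomic_bool dirty;
    uint64_t len;
};

static atomic_llong dirty_metadata_bytes;

static void mark_dirty(struct buffer *b)
{
    /* Only the clean -> dirty transition adds to the counter. */
    if (!atomic_exchange(&b->dirty, true))
        atomic_fetch_add(&dirty_metadata_bytes, (long long)b->len);
}

static void clear_dirty(struct buffer *b)
{
    /* Mirrors the test_and_clear_bit() guard: only the caller that
     * actually clears the flag subtracts the length, so repeated or
     * racing calls cannot unbalance the counter. */
    if (atomic_exchange(&b->dirty, false))
        atomic_fetch_sub(&dirty_metadata_bytes, (long long)b->len);
}

int main(void)
{
    struct buffer eb = { .len = 16384 };
    atomic_init(&eb.dirty, false);

    mark_dirty(&eb);
    clear_dirty(&eb);
    clear_dirty(&eb);   /* second clear is a no-op for the counter */

    printf("%lld\n", (long long)atomic_load(&dirty_metadata_bytes)); /* prints 0 */
    return 0;
}

With the adjustment tied to the flag transition itself, the error path no longer needs its own compensating update, which is why the rollback in set_btree_ioerr() could be removed.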
-@@ -4724,12 +4267,25 @@ static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) - WARN_ON(atomic_read(&eb->refs) == 0); - } - --void clear_extent_buffer_dirty(const struct extent_buffer *eb) -+void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, -+ struct extent_buffer *eb) - { -+ struct btrfs_fs_info *fs_info = eb->fs_info; - int i; - int num_pages; - struct page *page; - -+ btrfs_assert_tree_write_locked(eb); -+ -+ if (trans && btrfs_header_generation(eb) != trans->transid) -+ return; -+ -+ if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) -+ return; -+ -+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, -+ fs_info->dirty_metadata_batch); -+ - if (eb->fs_info->nodesize < PAGE_SIZE) - return clear_subpage_extent_buffer_dirty(eb); - -diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h -index a2c82448b2e0..4341ad978fb8 100644 ---- a/fs/btrfs/extent_io.h -+++ b/fs/btrfs/extent_io.h -@@ -11,6 +11,8 @@ - #include "ulist.h" - #include "misc.h" - -+struct btrfs_trans_handle; -+ - enum { - EXTENT_BUFFER_UPTODATE, - EXTENT_BUFFER_DIRTY, -@@ -60,11 +62,9 @@ enum { - #define BITMAP_LAST_BYTE_MASK(nbits) \ - (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) - --struct btrfs_bio; - struct btrfs_root; - struct btrfs_inode; - struct btrfs_fs_info; --struct io_failure_record; - struct extent_io_tree; - struct btrfs_tree_parent_check; - -@@ -262,7 +262,6 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star - void extent_buffer_bitmap_clear(const struct extent_buffer *eb, - unsigned long start, unsigned long pos, - unsigned long len); --void clear_extent_buffer_dirty(const struct extent_buffer *eb); - bool set_extent_buffer_dirty(struct extent_buffer *eb); - void set_extent_buffer_uptodate(struct extent_buffer *eb); - void clear_extent_buffer_uptodate(struct extent_buffer *eb); -@@ -274,40 +273,13 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - u32 bits_to_clear, unsigned long page_ops); - int extent_invalidate_folio(struct extent_io_tree *tree, - struct folio *folio, size_t offset); -+void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, -+ struct extent_buffer *buf); - - int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); - - void end_extent_writepage(struct page *page, int err, u64 start, u64 end); - --/* -- * When IO fails, either with EIO or csum verification fails, we -- * try other mirrors that might have a good copy of the data. This -- * io_failure_record is used to record state as we go through all the -- * mirrors. If another mirror has good data, the sector is set up to date -- * and things continue. If a good mirror can't be found, the original -- * bio end_io callback is called to indicate things have failed. 
-- */ --struct io_failure_record { -- /* Use rb_simple_node for search/insert */ -- struct { -- struct rb_node rb_node; -- u64 bytenr; -- }; -- struct page *page; -- u64 len; -- u64 logical; -- int this_mirror; -- int failed_mirror; -- int num_copies; --}; -- --int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, -- u32 bio_offset, struct page *page, unsigned int pgoff, -- bool submit_buffered); --void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); --int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, -- struct page *page, unsigned int pg_offset); -- - #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - bool find_lock_delalloc_range(struct inode *inode, - struct page *locked_page, u64 *start, -diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c -index 5de73466b2ca..41c77a100853 100644 ---- a/fs/btrfs/file-item.c -+++ b/fs/btrfs/file-item.c -@@ -380,32 +380,25 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, - /* - * Lookup the checksum for the read bio in csum tree. - * -- * @inode: inode that the bio is for. -- * @bio: bio to look up. -- * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return -- * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If -- * NULL, the checksum buffer is allocated and returned in -- * btrfs_bio(bio)->csum instead. -- * - * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. - */ --blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst) -+blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) - { -- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; -- struct btrfs_bio *bbio = NULL; -+ struct btrfs_inode *inode = bbio->inode; -+ struct btrfs_fs_info *fs_info = inode->root->fs_info; -+ struct extent_io_tree *io_tree = &inode->io_tree; -+ struct bio *bio = &bbio->bio; - struct btrfs_path *path; - const u32 sectorsize = fs_info->sectorsize; - const u32 csum_size = fs_info->csum_size; - u32 orig_len = bio->bi_iter.bi_size; - u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; - u64 cur_disk_bytenr; -- u8 *csum; - const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; - int count = 0; - blk_status_t ret = BLK_STS_OK; - -- if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || -+ if ((inode->flags & BTRFS_INODE_NODATASUM) || - test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) - return BLK_STS_OK; - -@@ -426,21 +419,14 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst - if (!path) - return BLK_STS_RESOURCE; - -- if (!dst) { -- bbio = btrfs_bio(bio); -- -- if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { -- bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); -- if (!bbio->csum) { -- btrfs_free_path(path); -- return BLK_STS_RESOURCE; -- } -- } else { -- bbio->csum = bbio->csum_inline; -+ if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { -+ bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); -+ if (!bbio->csum) { -+ btrfs_free_path(path); -+ return BLK_STS_RESOURCE; - } -- csum = bbio->csum; - } else { -- csum = dst; -+ bbio->csum = bbio->csum_inline; - } - - /* -@@ -456,7 +442,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst - * read from the commit root and sidestep a nasty deadlock - * between reading the free space cache and updating the csum tree. 
- */ -- if (btrfs_is_free_space_inode(BTRFS_I(inode))) { -+ if (btrfs_is_free_space_inode(inode)) { - path->search_commit_root = 1; - path->skip_locking = 1; - } -@@ -479,14 +465,15 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst - ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX); - sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >> - fs_info->sectorsize_bits; -- csum_dst = csum + sector_offset * csum_size; -+ csum_dst = bbio->csum + sector_offset * csum_size; - - count = search_csum_tree(fs_info, path, cur_disk_bytenr, - search_len, csum_dst); - if (count < 0) { - ret = errno_to_blk_status(count); -- if (bbio) -- btrfs_bio_free_csum(bbio); -+ if (bbio->csum != bbio->csum_inline) -+ kfree(bbio->csum); -+ bbio->csum = NULL; - break; - } - -@@ -504,12 +491,13 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst - memset(csum_dst, 0, csum_size); - count = 1; - -- if (BTRFS_I(inode)->root->root_key.objectid == -+ if (inode->root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { - u64 file_offset; - int ret; - -- ret = search_file_offset_in_bio(bio, inode, -+ ret = search_file_offset_in_bio(bio, -+ &inode->vfs_inode, - cur_disk_bytenr, &file_offset); - if (ret) - set_extent_bits(io_tree, file_offset, -@@ -784,23 +772,16 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, - - /* - * Calculate checksums of the data contained inside a bio. -- * -- * @inode: Owner of the data inside the bio -- * @bio: Contains the data to be checksummed -- * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the -- * file offsets are determined from the page offsets in the bio. -- * Otherwise, this is the starting file offset of the bio vecs in -- * @bio, which must be contiguous. -- * @one_ordered: If true, @bio only refers to one ordered extent. 
- */ --blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, -- u64 offset, bool one_ordered) -+blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) - { -+ struct btrfs_inode *inode = bbio->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); -+ struct bio *bio = &bbio->bio; -+ u64 offset = bbio->file_offset; - struct btrfs_ordered_sum *sums; - struct btrfs_ordered_extent *ordered = NULL; -- const bool use_page_offsets = (offset == (u64)-1); - char *data; - struct bvec_iter iter; - struct bio_vec bvec; -@@ -828,9 +809,6 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - shash->tfm = fs_info->csum_shash; - - bio_for_each_segment(bvec, bio, iter) { -- if (use_page_offsets) -- offset = page_offset(bvec.bv_page) + bvec.bv_offset; -- - if (!ordered) { - ordered = btrfs_lookup_ordered_extent(inode, offset); - /* -@@ -852,7 +830,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - - 1); - - for (i = 0; i < blockcount; i++) { -- if (!one_ordered && -+ if (!(bio->bi_opf & REQ_BTRFS_ONE_ORDERED) && - !in_range(offset, ordered->file_offset, - ordered->num_bytes)) { - unsigned long bytes_left; -diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h -index 031225668434..cd7f2ae515c0 100644 ---- a/fs/btrfs/file-item.h -+++ b/fs/btrfs/file-item.h -@@ -38,7 +38,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) - - int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, u64 len); --blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); -+blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio); - int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 objectid, u64 pos, - u64 num_bytes); -@@ -49,8 +49,10 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, - int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_ordered_sum *sums); --blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, -- u64 offset, bool one_ordered); -+blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio); -+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, -+ struct list_head *list, int search_commit, -+ bool nowait); - int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait); -diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c -index af046d22300e..ec5c5355906b 100644 ---- a/fs/btrfs/file.c -+++ b/fs/btrfs/file.c -@@ -1017,7 +1017,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, - unlock_page(pages[i]); - put_page(pages[i]); - } -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - return -EAGAIN; - } -@@ -1465,6 +1465,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) - ssize_t err; - unsigned int ilock_flags = 0; - struct iomap_dio *dio; -+ struct btrfs_ordered_extent *ordered_extent = NULL; - - if (iocb->ki_flags & IOCB_NOWAIT) - ilock_flags |= BTRFS_ILOCK_TRY; -@@ -1526,7 +1527,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) - * got -EFAULT, faulting in the pages before the retry. 
- */ - from->nofault = true; -- dio = btrfs_dio_write(iocb, from, written); -+ dio = btrfs_dio_write(iocb, from, &ordered_extent, written); - from->nofault = false; - - /* -@@ -1569,6 +1570,14 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) - goto relock; - } - } -+ /* -+ * We can't loop back to btrfs_dio_write, so we can drop the cached -+ * ordered extent. Typically btrfs_dio_iomap_end will run and put the -+ * ordered_extent, but this is needed to clean up in case of an error -+ * path breaking out of iomap_iter before the final iomap_end call. -+ */ -+ if (ordered_extent) -+ btrfs_put_ordered_extent(ordered_extent); - - /* - * If 'err' is -ENOTBLK or we have not written all data, then it means -diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c -index c667e878ef1a..4d155a48ec59 100644 ---- a/fs/btrfs/free-space-tree.c -+++ b/fs/btrfs/free-space-tree.c -@@ -1283,7 +1283,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) - list_del(&free_space_root->dirty_list); - - btrfs_tree_lock(free_space_root->node); -- btrfs_clean_tree_block(free_space_root->node); -+ btrfs_clear_buffer_dirty(trans, free_space_root->node); - btrfs_tree_unlock(free_space_root->node); - btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), - free_space_root->node, 0, 1); -diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h -index 3d8156fc8523..4c477eae6891 100644 ---- a/fs/btrfs/fs.h -+++ b/fs/btrfs/fs.h -@@ -3,6 +3,7 @@ - #ifndef BTRFS_FS_H - #define BTRFS_FS_H - -+#include - #include - #include - #include -@@ -748,8 +749,10 @@ struct btrfs_fs_info { - */ - u64 zone_size; - -- /* Max size to emit ZONE_APPEND write command */ -+ /* Constraints for ZONE_APPEND commands: */ -+ struct queue_limits limits; - u64 max_zone_append_size; -+ - struct mutex zoned_meta_io_lock; - spinlock_t treelog_bg_lock; - u64 treelog_bg; -diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c -index 98a800b8bd43..6aaa892474be 100644 ---- a/fs/btrfs/inode.c -+++ b/fs/btrfs/inode.c -@@ -81,30 +81,16 @@ struct btrfs_dio_data { - struct extent_changeset *data_reserved; - bool data_space_reserved; - bool nocow_done; -+ struct btrfs_ordered_extent *ordered; - }; - - struct btrfs_dio_private { -- struct btrfs_inode *inode; -- -- /* -- * Since DIO can use anonymous page, we cannot use page_offset() to -- * grab the file offset, thus need a dedicated member for file offset. -- */ -+ /* Range of I/O */ - u64 file_offset; -- /* Used for bio::bi_size */ - u32 bytes; - -- /* -- * References to this structure. There is one reference per in-flight -- * bio plus one while we're still setting up. -- */ -- refcount_t refs; -- -- /* Array of checksums */ -- u8 *csums; -- - /* This must be last */ -- struct bio bio; -+ struct btrfs_bio bbio; - }; - - static struct bio_set btrfs_dio_bioset; -@@ -228,7 +214,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, - { - unsigned long index = offset >> PAGE_SHIFT; - unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; -- u64 page_start, page_end; -+ u64 page_start = 0, page_end = 0; - struct page *page; - - if (locked_page) { -@@ -2535,19 +2521,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, - } - } - --/* -- * in order to insert checksums into the metadata in large chunks, -- * we wait until bio submission time. All the pages in the bio are -- * checksummed and sums are attached onto the ordered extent record. 
-- * -- * At IO completion time the cums attached on the ordered extent record -- * are inserted into the btree -- */ --blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio) --{ -- return btrfs_csum_one_bio(inode, bio, (u64)-1, false); --} -- - /* - * Split an extent_map at [start, start + len] - * -@@ -2663,19 +2636,19 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, - return ret; - } - --static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, -- struct bio *bio, loff_t file_offset) -+blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio) - { -+ u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; -+ u64 len = bbio->bio.bi_iter.bi_size; -+ struct btrfs_inode *inode = bbio->inode; - struct btrfs_ordered_extent *ordered; -- u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; - u64 file_len; -- u64 len = bio->bi_iter.bi_size; - u64 end = start + len; - u64 ordered_end; - u64 pre, post; - int ret = 0; - -- ordered = btrfs_lookup_ordered_extent(inode, file_offset); -+ ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset); - if (WARN_ON_ONCE(!ordered)) - return BLK_STS_IOERR; - -@@ -2715,7 +2688,7 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, - ret = btrfs_split_ordered_extent(ordered, pre, post); - if (ret) - goto out; -- ret = split_zoned_em(inode, file_offset, file_len, pre, post); -+ ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post); - - out: - btrfs_put_ordered_extent(ordered); -@@ -2723,75 +2696,6 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, - return errno_to_blk_status(ret); - } - --void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) --{ -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- blk_status_t ret; -- -- if (bio_op(bio) == REQ_OP_ZONE_APPEND) { -- ret = extract_ordered_extent(inode, bio, -- page_offset(bio_first_bvec_all(bio)->bv_page)); -- if (ret) { -- btrfs_bio_end_io(btrfs_bio(bio), ret); -- return; -- } -- } -- -- /* -- * If we need to checksum, and the I/O is not issued by fsync and -- * friends, that is ->sync_writers != 0, defer the submission to a -- * workqueue to parallelize it. -- * -- * Csum items for reloc roots have already been cloned at this point, -- * so they are handled as part of the no-checksum case. -- */ -- if (!(inode->flags & BTRFS_INODE_NODATASUM) && -- !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && -- !btrfs_is_data_reloc_root(inode->root)) { -- if (!atomic_read(&inode->sync_writers) && -- btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) -- return; -- -- ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false); -- if (ret) { -- btrfs_bio_end_io(btrfs_bio(bio), ret); -- return; -- } -- } -- btrfs_submit_bio(fs_info, bio, mirror_num); --} -- --void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, -- int mirror_num, enum btrfs_compression_type compress_type) --{ -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- blk_status_t ret; -- -- if (compress_type != BTRFS_COMPRESS_NONE) { -- /* -- * btrfs_submit_compressed_read will handle completing the bio -- * if there were any errors, so just return here. 
-- */ -- btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num); -- return; -- } -- -- /* Save the original iter for read repair */ -- btrfs_bio(bio)->iter = bio->bi_iter; -- -- /* -- * Lookup bio sums does extra checks around whether we need to csum or -- * not, which is why we ignore skip_sum here. -- */ -- ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); -- if (ret) { -- btrfs_bio_end_io(btrfs_bio(bio), ret); -- return; -- } -- -- btrfs_submit_bio(fs_info, bio, mirror_num); --} -- - /* - * given a list of ordered sums record them in the inode. This happens - * at IO completion time based on sums calculated at bio submission time. -@@ -2969,7 +2873,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) - unlock_extent(&inode->io_tree, page_start, page_end, - &cached_state); - unlock_page(page); -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - goto again; - } -@@ -3259,15 +3163,13 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) - goto out; - } - -- /* A valid bdev implies a write on a sequential zone */ -- if (ordered_extent->bdev) { -+ /* A valid ->physical implies a write on a sequential zone. */ -+ if (ordered_extent->physical != (u64)-1) { - btrfs_rewrite_logical_zoned(ordered_extent); - btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes); - } - -- btrfs_free_io_failure_record(inode, start, end); -- - if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { - truncated = true; - logical_len = ordered_extent->truncated_len; -@@ -3474,109 +3376,55 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of - } - - /* -- * check_data_csum - verify checksum of one sector of uncompressed data -- * @inode: inode -- * @bbio: btrfs_bio which contains the csum -+ * Verify the checksum of a single data sector. -+ * -+ * @bbio: btrfs_io_bio which contains the csum -+ * @dev: device the sector is on - * @bio_offset: offset to the beginning of the bio (in bytes) -- * @page: page where is the data to be verified -- * @pgoff: offset inside the page -+ * @bv: bio_vec to check - * -- * The length of such check is always one sector size. -+ * Check if the checksum on a data block is valid. When a checksum mismatch is -+ * detected, report the error and fill the corrupted range with zero. - * -- * When csum mismatch is detected, we will also report the error and fill the -- * corrupted range with zero. (Thus it needs the extra parameters) -+ * Return %true if the sector is ok or had no checksum to start with, else %false. 
- */ --int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, -- u32 bio_offset, struct page *page, u32 pgoff) -+bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, -+ u32 bio_offset, struct bio_vec *bv) - { -+ struct btrfs_inode *inode = bbio->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; -- u32 len = fs_info->sectorsize; -+ u64 file_offset = bbio->file_offset + bio_offset; -+ u64 end = file_offset + bv->bv_len - 1; - u8 *csum_expected; - u8 csum[BTRFS_CSUM_SIZE]; - -- ASSERT(pgoff + len <= PAGE_SIZE); -+ ASSERT(bv->bv_len == fs_info->sectorsize); - -- csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); -+ if (!bbio->csum) -+ return true; - -- if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected)) -+ if (btrfs_is_data_reloc_root(inode->root) && -+ test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, -+ 1, NULL)) { -+ /* Skip the range without csum for data reloc inode */ -+ clear_extent_bits(&inode->io_tree, file_offset, end, -+ EXTENT_NODATASUM); -+ return true; -+ } -+ -+ csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); -+ if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, -+ csum_expected)) - goto zeroit; -- return 0; -+ return true; - - zeroit: -- btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset, -- csum, csum_expected, bbio->mirror_num); -- if (bbio->device) -- btrfs_dev_stat_inc_and_print(bbio->device, -- BTRFS_DEV_STAT_CORRUPTION_ERRS); -- memzero_page(page, pgoff, len); -- return -EIO; --} -- --/* -- * When reads are done, we need to check csums to verify the data is correct. -- * if there's a match, we allow the bio to finish. If not, the code in -- * extent_io.c will try to find good copies for us. -- * -- * @bio_offset: offset to the beginning of the bio (in bytes) -- * @start: file offset of the range start -- * @end: file offset of the range end (inclusive) -- * -- * Return a bitmap where bit set means a csum mismatch, and bit not set means -- * csum match. -- */ --unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, -- u32 bio_offset, struct page *page, -- u64 start, u64 end) --{ -- struct btrfs_inode *inode = BTRFS_I(page->mapping->host); -- struct btrfs_root *root = inode->root; -- struct btrfs_fs_info *fs_info = root->fs_info; -- struct extent_io_tree *io_tree = &inode->io_tree; -- const u32 sectorsize = root->fs_info->sectorsize; -- u32 pg_off; -- unsigned int result = 0; -- -- /* -- * This only happens for NODATASUM or compressed read. -- * Normally this should be covered by above check for compressed read -- * or the next check for NODATASUM. Just do a quicker exit here. 
-- */ -- if (bbio->csum == NULL) -- return 0; -- -- if (inode->flags & BTRFS_INODE_NODATASUM) -- return 0; -- -- if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) -- return 0; -- -- ASSERT(page_offset(page) <= start && -- end <= page_offset(page) + PAGE_SIZE - 1); -- for (pg_off = offset_in_page(start); -- pg_off < offset_in_page(end); -- pg_off += sectorsize, bio_offset += sectorsize) { -- u64 file_offset = pg_off + page_offset(page); -- int ret; -- -- if (btrfs_is_data_reloc_root(root) && -- test_range_bit(io_tree, file_offset, -- file_offset + sectorsize - 1, -- EXTENT_NODATASUM, 1, NULL)) { -- /* Skip the range without csum for data reloc inode */ -- clear_extent_bits(io_tree, file_offset, -- file_offset + sectorsize - 1, -- EXTENT_NODATASUM); -- continue; -- } -- ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); -- if (ret < 0) { -- const int nr_bit = (pg_off - offset_in_page(start)) >> -- root->fs_info->sectorsize_bits; -- -- result |= (1U << nr_bit); -- } -- } -- return result; -+ btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected, -+ bbio->mirror_num); -+ if (dev) -+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); -+ memzero_bvec(bv); -+ return false; - } - - /* -@@ -4987,7 +4835,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, - unlock_extent(io_tree, block_start, block_end, &cached_state); - unlock_page(page); - put_page(page); -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - goto again; - } -@@ -5466,8 +5314,6 @@ void btrfs_evict_inode(struct inode *inode) - if (is_bad_inode(inode)) - goto no_delete; - -- btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); -- - if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) - goto no_delete; - -@@ -7131,6 +6977,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - } - - static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, -+ struct btrfs_dio_data *dio_data, - const u64 start, - const u64 len, - const u64 orig_start, -@@ -7141,7 +6988,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, - const int type) - { - struct extent_map *em = NULL; -- int ret; -+ struct btrfs_ordered_extent *ordered; - - if (type != BTRFS_ORDERED_NOCOW) { - em = create_io_em(inode, start, len, orig_start, block_start, -@@ -7151,18 +6998,21 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, - if (IS_ERR(em)) - goto out; - } -- ret = btrfs_add_ordered_extent(inode, start, len, len, block_start, -- block_len, 0, -- (1 << type) | -- (1 << BTRFS_ORDERED_DIRECT), -- BTRFS_COMPRESS_NONE); -- if (ret) { -+ ordered = btrfs_alloc_ordered_extent(inode, start, len, len, -+ block_start, block_len, 0, -+ (1 << type) | -+ (1 << BTRFS_ORDERED_DIRECT), -+ BTRFS_COMPRESS_NONE); -+ if (IS_ERR(ordered)) { - if (em) { - free_extent_map(em); - btrfs_drop_extent_map_range(inode, start, - start + len - 1, false); - } -- em = ERR_PTR(ret); -+ em = ERR_PTR(PTR_ERR(ordered)); -+ } else { -+ ASSERT(!dio_data->ordered); -+ dio_data->ordered = ordered; - } - out: - -@@ -7170,6 +7020,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, - } - - static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, -+ struct btrfs_dio_data *dio_data, - u64 start, u64 len) - { - struct btrfs_root *root = inode->root; -@@ -7185,7 +7036,8 @@ static struct extent_map 
*btrfs_new_extent_direct(struct btrfs_inode *inode, - if (ret) - return ERR_PTR(ret); - -- em = btrfs_create_dio_extent(inode, start, ins.offset, start, -+ em = btrfs_create_dio_extent(inode, dio_data, -+ start, ins.offset, start, - ins.objectid, ins.offset, ins.offset, - ins.offset, BTRFS_ORDERED_REGULAR); - btrfs_dec_block_group_reservations(fs_info, ins.objectid); -@@ -7392,7 +7244,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - */ - if (writing || - test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - else - ret = nowait ? -EAGAIN : -ENOTBLK; - btrfs_put_ordered_extent(ordered); -@@ -7530,7 +7382,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, - } - space_reserved = true; - -- em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, -+ em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len, - orig_start, block_start, - len, orig_block_len, - ram_bytes, type); -@@ -7572,7 +7424,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, - goto out; - space_reserved = true; - -- em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); -+ em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; -@@ -7676,6 +7528,17 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, - } - } - -+ if (dio_data->ordered) { -+ ASSERT(write); -+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, -+ dio_data->ordered->file_offset, -+ dio_data->ordered->bytes_left); -+ if (IS_ERR(em)) { -+ ret = PTR_ERR(em); -+ goto err; -+ } -+ goto map_iomap; -+ } - memset(dio_data, 0, sizeof(*dio_data)); - - /* -@@ -7817,6 +7680,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, - else - free_extent_state(cached_state); - -+map_iomap: - /* - * Translate extent map information to iomap. - * We trim the extents (and move the addr) even though iomap code does -@@ -7833,10 +7697,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, - iomap->offset = start; - iomap->bdev = fs_info->fs_devices->latest_dev->bdev; - iomap->length = len; -- -- if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) -- iomap->flags |= IOMAP_F_ZONE_APPEND; -- - free_extent_map(em); - - return 0; -@@ -7874,13 +7734,25 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, - if (submitted < length) { - pos += submitted; - length -= submitted; -- if (write) -- btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, -- pos, length, false); -- else -+ if (write) { -+ if (submitted == 0) { -+ btrfs_mark_ordered_io_finished(BTRFS_I(inode), -+ NULL, pos, -+ length, false); -+ btrfs_put_ordered_extent(dio_data->ordered); -+ dio_data->ordered = NULL; -+ } -+ } else { - unlock_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); -+ } - ret = -ENOTBLK; -+ } else { -+ /* On the last bio, release our cached ordered_extent. */ -+ if (write) { -+ btrfs_put_ordered_extent(dio_data->ordered); -+ dio_data->ordered = NULL; -+ } - } - - if (write) -@@ -7888,267 +7760,47 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, - return ret; - } - --static void btrfs_dio_private_put(struct btrfs_dio_private *dip) --{ -- /* -- * This implies a barrier so that stores to dio_bio->bi_status before -- * this and loads of dio_bio->bi_status after this are fully ordered. 
-- */ -- if (!refcount_dec_and_test(&dip->refs)) -- return; -- -- if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { -- btrfs_mark_ordered_io_finished(dip->inode, NULL, -- dip->file_offset, dip->bytes, -- !dip->bio.bi_status); -- } else { -- unlock_extent(&dip->inode->io_tree, -- dip->file_offset, -- dip->file_offset + dip->bytes - 1, NULL); -- } -- -- kfree(dip->csums); -- bio_endio(&dip->bio); --} -- --void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -+static void btrfs_dio_end_io(struct btrfs_bio *bbio) - { -- struct btrfs_dio_private *dip = btrfs_bio(bio)->private; -- -- BUG_ON(bio_op(bio) == REQ_OP_WRITE); -- -- refcount_inc(&dip->refs); -- btrfs_submit_bio(inode->root->fs_info, bio, mirror_num); --} -- --static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, -- struct btrfs_bio *bbio, -- const bool uptodate) --{ -- struct inode *inode = &dip->inode->vfs_inode; -- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; -- const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); -- blk_status_t err = BLK_STS_OK; -- struct bvec_iter iter; -- struct bio_vec bv; -- u32 offset; -- -- btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { -- u64 start = bbio->file_offset + offset; -- -- if (uptodate && -- (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset, -- bv.bv_page, bv.bv_offset))) { -- btrfs_clean_io_failure(BTRFS_I(inode), start, -- bv.bv_page, bv.bv_offset); -- } else { -- int ret; -- -- ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, -- bv.bv_page, bv.bv_offset, false); -- if (ret) -- err = errno_to_blk_status(ret); -- } -- } -- -- return err; --} -- --blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, -- struct bio *bio, -- u64 dio_file_offset) --{ -- return btrfs_csum_one_bio(inode, bio, dio_file_offset, false); --} -- --static void btrfs_end_dio_bio(struct btrfs_bio *bbio) --{ -- struct btrfs_dio_private *dip = bbio->private; -+ struct btrfs_dio_private *dip = -+ container_of(bbio, struct btrfs_dio_private, bbio); -+ struct btrfs_inode *inode = bbio->inode; - struct bio *bio = &bbio->bio; -- blk_status_t err = bio->bi_status; -- -- if (err) -- btrfs_warn(dip->inode->root->fs_info, -- "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", -- btrfs_ino(dip->inode), bio_op(bio), -- bio->bi_opf, bio->bi_iter.bi_sector, -- bio->bi_iter.bi_size, err); -- -- if (bio_op(bio) == REQ_OP_READ) -- err = btrfs_check_read_dio_bio(dip, bbio, !err); - -- if (err) -- dip->bio.bi_status = err; -- -- btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio); -- -- bio_put(bio); -- btrfs_dio_private_put(dip); --} -- --static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, -- u64 file_offset, int async_submit) --{ -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- struct btrfs_dio_private *dip = btrfs_bio(bio)->private; -- blk_status_t ret; -- -- /* Save the original iter for read repair */ -- if (btrfs_op(bio) == BTRFS_MAP_READ) -- btrfs_bio(bio)->iter = bio->bi_iter; -- -- if (inode->flags & BTRFS_INODE_NODATASUM) -- goto map; -+ if (bio->bi_status) { -+ btrfs_warn(inode->root->fs_info, -+ "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", -+ btrfs_ino(inode), bio->bi_opf, -+ dip->file_offset, dip->bytes, bio->bi_status); -+ } - -- if (btrfs_op(bio) == BTRFS_MAP_WRITE) { -- /* Check btrfs_submit_data_write_bio() for async submit rules */ -- if (async_submit && !atomic_read(&inode->sync_writers) 
&& -- btrfs_wq_submit_bio(inode, bio, 0, file_offset, -- WQ_SUBMIT_DATA_DIO)) -- return; -+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) -+ btrfs_mark_ordered_io_finished(inode, NULL, dip->file_offset, -+ dip->bytes, !bio->bi_status); -+ else -+ unlock_extent(&inode->io_tree, dip->file_offset, -+ dip->file_offset + dip->bytes - 1, NULL); - -- /* -- * If we aren't doing async submit, calculate the csum of the -- * bio now. -- */ -- ret = btrfs_csum_one_bio(inode, bio, file_offset, false); -- if (ret) { -- btrfs_bio_end_io(btrfs_bio(bio), ret); -- return; -- } -- } else { -- btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, -- file_offset - dip->file_offset); -- } --map: -- btrfs_submit_bio(fs_info, bio, 0); -+ bbio->bio.bi_private = bbio->private; -+ iomap_dio_bio_end_io(bio); - } - --static void btrfs_submit_direct(const struct iomap_iter *iter, -- struct bio *dio_bio, loff_t file_offset) -+static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, -+ loff_t file_offset) - { -+ struct btrfs_bio *bbio = btrfs_bio(bio); - struct btrfs_dio_private *dip = -- container_of(dio_bio, struct btrfs_dio_private, bio); -- struct inode *inode = iter->inode; -- const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); -- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -- const bool raid56 = (btrfs_data_alloc_profile(fs_info) & -- BTRFS_BLOCK_GROUP_RAID56_MASK); -- struct bio *bio; -- u64 start_sector; -- int async_submit = 0; -- u64 submit_len; -- u64 clone_offset = 0; -- u64 clone_len; -- u64 logical; -- int ret; -- blk_status_t status; -- struct btrfs_io_geometry geom; -+ container_of(bbio, struct btrfs_dio_private, bbio); - struct btrfs_dio_data *dio_data = iter->private; -- struct extent_map *em = NULL; -- -- dip->inode = BTRFS_I(inode); -- dip->file_offset = file_offset; -- dip->bytes = dio_bio->bi_iter.bi_size; -- refcount_set(&dip->refs, 1); -- dip->csums = NULL; -- -- if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { -- unsigned int nr_sectors = -- (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits); -- -- /* -- * Load the csums up front to reduce csum tree searches and -- * contention when submitting bios. -- */ -- status = BLK_STS_RESOURCE; -- dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); -- if (!dip->csums) -- goto out_err; -- -- status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); -- if (status != BLK_STS_OK) -- goto out_err; -- } -- -- start_sector = dio_bio->bi_iter.bi_sector; -- submit_len = dio_bio->bi_iter.bi_size; -- -- do { -- logical = start_sector << 9; -- em = btrfs_get_chunk_map(fs_info, logical, submit_len); -- if (IS_ERR(em)) { -- status = errno_to_blk_status(PTR_ERR(em)); -- em = NULL; -- goto out_err_em; -- } -- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), -- logical, &geom); -- if (ret) { -- status = errno_to_blk_status(ret); -- goto out_err_em; -- } -- -- clone_len = min(submit_len, geom.len); -- ASSERT(clone_len <= UINT_MAX); -- -- /* -- * This will never fail as it's passing GPF_NOFS and -- * the allocation is backed by btrfs_bioset. 
-- */ -- bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len, -- btrfs_end_dio_bio, dip); -- btrfs_bio(bio)->file_offset = file_offset; -- -- if (bio_op(bio) == REQ_OP_ZONE_APPEND) { -- status = extract_ordered_extent(BTRFS_I(inode), bio, -- file_offset); -- if (status) { -- bio_put(bio); -- goto out_err; -- } -- } -- -- ASSERT(submit_len >= clone_len); -- submit_len -= clone_len; -- -- /* -- * Increase the count before we submit the bio so we know -- * the end IO handler won't happen before we increase the -- * count. Otherwise, the dip might get freed before we're -- * done setting it up. -- * -- * We transfer the initial reference to the last bio, so we -- * don't need to increment the reference count for the last one. -- */ -- if (submit_len > 0) { -- refcount_inc(&dip->refs); -- /* -- * If we are submitting more than one bio, submit them -- * all asynchronously. The exception is RAID 5 or 6, as -- * asynchronous checksums make it difficult to collect -- * full stripe writes. -- */ -- if (!raid56) -- async_submit = 1; -- } - -- btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); -+ btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); -+ bbio->file_offset = file_offset; - -- dio_data->submitted += clone_len; -- clone_offset += clone_len; -- start_sector += clone_len >> 9; -- file_offset += clone_len; -- -- free_extent_map(em); -- } while (submit_len > 0); -- return; -+ dip->file_offset = file_offset; -+ dip->bytes = bio->bi_iter.bi_size; - --out_err_em: -- free_extent_map(em); --out_err: -- dio_bio->bi_status = status; -- btrfs_dio_private_put(dip); -+ dio_data->submitted += bio->bi_iter.bi_size; -+ btrfs_submit_bio(bio, 0); - } - - static const struct iomap_ops btrfs_dio_iomap_ops = { -@@ -8157,25 +7809,30 @@ static const struct iomap_ops btrfs_dio_iomap_ops = { - }; - - static const struct iomap_dio_ops btrfs_dio_ops = { -- .submit_io = btrfs_submit_direct, -+ .submit_io = btrfs_dio_submit_io, - .bio_set = &btrfs_dio_bioset, - }; - - ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) - { -- struct btrfs_dio_data data; -+ struct btrfs_dio_data data = { 0 }; - - return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); - } - - struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, -+ struct btrfs_ordered_extent **ordered_extent, - size_t done_before) - { -- struct btrfs_dio_data data; -+ struct btrfs_dio_data dio_data = { .ordered = *ordered_extent }; -+ struct iomap_dio *dio; - -- return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, -- IOMAP_DIO_PARTIAL, &data, done_before); -+ dio = __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, -+ IOMAP_DIO_PARTIAL, &dio_data, done_before); -+ if (!IS_ERR_OR_NULL(dio)) -+ *ordered_extent = dio_data.ordered; -+ return dio; - } - - static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, -@@ -8552,7 +8209,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) - unlock_extent(io_tree, page_start, page_end, &cached_state); - unlock_page(page); - up_read(&BTRFS_I(inode)->i_mmap_lock); -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - goto again; - } -@@ -8850,7 +8507,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) - ei->last_log_commit = 0; - - spin_lock_init(&ei->lock); -- spin_lock_init(&ei->io_failure_lock); - ei->outstanding_extents = 0; - if 
(sb->s_magic != BTRFS_TEST_MAGIC) - btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, -@@ -8870,7 +8526,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) - ei->io_tree.inode = ei; - extent_io_tree_init(fs_info, &ei->file_extent_tree, - IO_TREE_INODE_FILE_EXTENT); -- ei->io_failure_tree = RB_ROOT; - atomic_set(&ei->sync_writers, 0); - mutex_init(&ei->log_mutex); - btrfs_ordered_inode_tree_init(&ei->ordered_tree); -@@ -8994,7 +8649,7 @@ int __init btrfs_init_cachep(void) - goto fail; - - if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, -- offsetof(struct btrfs_dio_private, bio), -+ offsetof(struct btrfs_dio_private, bbio.bio), - BIOSET_NEED_BVECS)) - goto fail; - -@@ -10289,65 +9944,13 @@ struct btrfs_encoded_read_private { - wait_queue_head_t wait; - atomic_t pending; - blk_status_t status; -- bool skip_csum; - }; - --static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, -- struct bio *bio, int mirror_num) --{ -- struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private; -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- blk_status_t ret; -- -- if (!priv->skip_csum) { -- ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); -- if (ret) -- return ret; -- } -- -- atomic_inc(&priv->pending); -- btrfs_submit_bio(fs_info, bio, mirror_num); -- return BLK_STS_OK; --} -- --static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) --{ -- const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); -- struct btrfs_encoded_read_private *priv = bbio->private; -- struct btrfs_inode *inode = priv->inode; -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- u32 sectorsize = fs_info->sectorsize; -- struct bio_vec *bvec; -- struct bvec_iter_all iter_all; -- u32 bio_offset = 0; -- -- if (priv->skip_csum || !uptodate) -- return bbio->bio.bi_status; -- -- bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { -- unsigned int i, nr_sectors, pgoff; -- -- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); -- pgoff = bvec->bv_offset; -- for (i = 0; i < nr_sectors; i++) { -- ASSERT(pgoff < PAGE_SIZE); -- if (btrfs_check_data_csum(inode, bbio, bio_offset, -- bvec->bv_page, pgoff)) -- return BLK_STS_IOERR; -- bio_offset += sectorsize; -- pgoff += sectorsize; -- } -- } -- return BLK_STS_OK; --} -- - static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) - { - struct btrfs_encoded_read_private *priv = bbio->private; -- blk_status_t status; - -- status = btrfs_encoded_read_verify_csum(bbio); -- if (status) { -+ if (bbio->bio.bi_status) { - /* - * The memory barrier implied by the atomic_dec_return() here - * pairs with the memory barrier implied by the -@@ -10356,11 +9959,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) - * write is observed before the load of status in - * btrfs_encoded_read_regular_fill_pages(). 
- */ -- WRITE_ONCE(priv->status, status); -+ WRITE_ONCE(priv->status, bbio->bio.bi_status); - } - if (!atomic_dec_return(&priv->pending)) - wake_up(&priv->wait); -- btrfs_bio_free_csum(bbio); - bio_put(&bbio->bio); - } - -@@ -10368,47 +9970,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, struct page **pages) - { -- struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_encoded_read_private priv = { - .inode = inode, - .file_offset = file_offset, - .pending = ATOMIC_INIT(1), -- .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM), - }; - unsigned long i = 0; - u64 cur = 0; -- int ret; - - init_waitqueue_head(&priv.wait); -- /* -- * Submit bios for the extent, splitting due to bio or stripe limits as -- * necessary. -- */ -+ /* Submit bios for the extent, splitting due to bio limits as necessary. */ - while (cur < disk_io_size) { -- struct extent_map *em; -- struct btrfs_io_geometry geom; - struct bio *bio = NULL; -- u64 remaining; -+ u64 remaining = disk_io_size - cur; - -- em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur, -- disk_io_size - cur); -- if (IS_ERR(em)) { -- ret = PTR_ERR(em); -- } else { -- ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ, -- disk_bytenr + cur, &geom); -- free_extent_map(em); -- } -- if (ret) { -- WRITE_ONCE(priv.status, errno_to_blk_status(ret)); -- break; -- } -- remaining = min(geom.len, disk_io_size - cur); - while (bio || remaining) { - size_t bytes = min_t(u64, remaining, PAGE_SIZE); - - if (!bio) { - bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, -+ inode, - btrfs_encoded_read_endio, - &priv); - bio->bi_iter.bi_sector = -@@ -10417,14 +9998,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - - if (!bytes || - bio_add_page(bio, pages[i], bytes, 0) < bytes) { -- blk_status_t status; -- -- status = submit_encoded_read_bio(inode, bio, 0); -- if (status) { -- WRITE_ONCE(priv.status, status); -- bio_put(bio); -- goto out; -- } -+ atomic_inc(&priv.pending); -+ btrfs_submit_bio(bio, 0); - bio = NULL; - continue; - } -@@ -10435,7 +10010,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - } - } - --out: - if (atomic_dec_return(&priv.pending)) - io_wait_event(priv.wait, !atomic_read(&priv.pending)); - /* See btrfs_encoded_read_endio() for ordering. */ -@@ -10995,9 +10569,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, - return 0; - - max_pages = sis->max - bsi->nr_pages; -- first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; -- next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, -- PAGE_SIZE) >> PAGE_SHIFT; -+ first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; -+ next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; - - if (first_ppage >= next_ppage) - return 0; -diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c -index 7e348bd2ccde..8ea557e22252 100644 ---- a/fs/btrfs/ioctl.c -+++ b/fs/btrfs/ioctl.c -@@ -707,7 +707,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, - * exists). 
- */ - btrfs_tree_lock(leaf); -- btrfs_clean_tree_block(leaf); -+ btrfs_clear_buffer_dirty(trans, leaf); - btrfs_tree_unlock(leaf); - btrfs_free_tree_block(trans, objectid, leaf, 0, 1); - free_extent_buffer(leaf); -diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c -new file mode 100644 -index 000000000000..0fe0ae54ac67 ---- /dev/null -+++ b/fs/btrfs/lru_cache.c -@@ -0,0 +1,166 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include -+#include "lru_cache.h" -+#include "messages.h" -+ -+/* -+ * Initialize a cache object. -+ * -+ * @cache: The cache. -+ * @max_size: Maximum size (number of entries) for the cache. -+ * Use 0 for unlimited size, it's the user's responsability to -+ * trim the cache in that case. -+ */ -+void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size) -+{ -+ INIT_LIST_HEAD(&cache->lru_list); -+ mt_init(&cache->entries); -+ cache->size = 0; -+ cache->max_size = max_size; -+} -+ -+static struct btrfs_lru_cache_entry *match_entry(struct list_head *head, u64 key, -+ u64 gen) -+{ -+ struct btrfs_lru_cache_entry *entry; -+ -+ list_for_each_entry(entry, head, list) { -+ if (entry->key == key && entry->gen == gen) -+ return entry; -+ } -+ -+ return NULL; -+} -+ -+/* -+ * Lookup for an entry in the cache. -+ * -+ * @cache: The cache. -+ * @key: The key of the entry we are looking for. -+ * @gen: Generation associated to the key. -+ * -+ * Returns the entry associated with the key or NULL if none found. -+ */ -+struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, -+ u64 key, u64 gen) -+{ -+ struct list_head *head; -+ struct btrfs_lru_cache_entry *entry; -+ -+ head = mtree_load(&cache->entries, key); -+ if (!head) -+ return NULL; -+ -+ entry = match_entry(head, key, gen); -+ if (entry) -+ list_move_tail(&entry->lru_list, &cache->lru_list); -+ -+ return entry; -+} -+ -+/* -+ * Remove an entry from the cache. -+ * -+ * @cache: The cache to remove from. -+ * @entry: The entry to remove from the cache. -+ * -+ * Note: this also frees the memory used by the entry. -+ */ -+void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, -+ struct btrfs_lru_cache_entry *entry) -+{ -+ struct list_head *prev = entry->list.prev; -+ -+ ASSERT(cache->size > 0); -+ ASSERT(!mtree_empty(&cache->entries)); -+ -+ list_del(&entry->list); -+ list_del(&entry->lru_list); -+ -+ if (list_empty(prev)) { -+ struct list_head *head; -+ -+ /* -+ * If previous element in the list entry->list is now empty, it -+ * means it's a head entry not pointing to any cached entries, -+ * so remove it from the maple tree and free it. -+ */ -+ head = mtree_erase(&cache->entries, entry->key); -+ ASSERT(head == prev); -+ kfree(head); -+ } -+ -+ kfree(entry); -+ cache->size--; -+} -+ -+/* -+ * Store an entry in the cache. -+ * -+ * @cache: The cache. -+ * @entry: The entry to store. -+ * -+ * Returns 0 on success and < 0 on error. 
-+ */ -+int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, -+ struct btrfs_lru_cache_entry *new_entry, -+ gfp_t gfp) -+{ -+ const u64 key = new_entry->key; -+ struct list_head *head; -+ int ret; -+ -+ head = kmalloc(sizeof(*head), gfp); -+ if (!head) -+ return -ENOMEM; -+ -+ ret = mtree_insert(&cache->entries, key, head, gfp); -+ if (ret == 0) { -+ INIT_LIST_HEAD(head); -+ list_add_tail(&new_entry->list, head); -+ } else if (ret == -EEXIST) { -+ kfree(head); -+ head = mtree_load(&cache->entries, key); -+ ASSERT(head != NULL); -+ if (match_entry(head, key, new_entry->gen) != NULL) -+ return -EEXIST; -+ list_add_tail(&new_entry->list, head); -+ } else if (ret < 0) { -+ kfree(head); -+ return ret; -+ } -+ -+ if (cache->max_size > 0 && cache->size == cache->max_size) { -+ struct btrfs_lru_cache_entry *lru_entry; -+ -+ lru_entry = list_first_entry(&cache->lru_list, -+ struct btrfs_lru_cache_entry, -+ lru_list); -+ btrfs_lru_cache_remove(cache, lru_entry); -+ } -+ -+ list_add_tail(&new_entry->lru_list, &cache->lru_list); -+ cache->size++; -+ -+ return 0; -+} -+ -+/* -+ * Empty a cache. -+ * -+ * @cache: The cache to empty. -+ * -+ * Removes all entries from the cache. -+ */ -+void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache) -+{ -+ struct btrfs_lru_cache_entry *entry; -+ struct btrfs_lru_cache_entry *tmp; -+ -+ list_for_each_entry_safe(entry, tmp, &cache->lru_list, lru_list) -+ btrfs_lru_cache_remove(cache, entry); -+ -+ ASSERT(cache->size == 0); -+ ASSERT(mtree_empty(&cache->entries)); -+} -diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h -new file mode 100644 -index 000000000000..de3e18bce24a ---- /dev/null -+++ b/fs/btrfs/lru_cache.h -@@ -0,0 +1,80 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+ -+#ifndef BTRFS_LRU_CACHE_H -+#define BTRFS_LRU_CACHE_H -+ -+#include -+#include -+ -+/* -+ * A cache entry. This is meant to be embedded in a structure of a user of -+ * this module. Similar to how struct list_head and struct rb_node are used. -+ * -+ * Note: it should be embedded as the first element in a struct (offset 0), and -+ * this module assumes it was allocated with kmalloc(), so it calls kfree() when -+ * it needs to free an entry. -+ */ -+struct btrfs_lru_cache_entry { -+ struct list_head lru_list; -+ u64 key; -+ /* -+ * Optional generation associated to a key. Use 0 if not needed/used. -+ * Entries with the same key and different generations are stored in a -+ * linked list, so use this only for cases where there's a small number -+ * of different generations. -+ */ -+ u64 gen; -+ /* -+ * The maple tree uses unsigned long type for the keys, which is 32 bits -+ * on 32 bits systems, and 64 bits on 64 bits systems. So if we want to -+ * use something like inode numbers as keys, which are always a u64, we -+ * have to deal with this in a special way - we store the key in the -+ * entry itself, as a u64, and the values inserted into the maple tree -+ * are linked lists of entries - so in case we are on a 64 bits system, -+ * that list always has a single entry, while on 32 bits systems it -+ * may have more than one, with each entry having the same value for -+ * their lower 32 bits of the u64 key. -+ */ -+ struct list_head list; -+}; -+ -+struct btrfs_lru_cache { -+ struct list_head lru_list; -+ struct maple_tree entries; -+ /* Number of entries stored in the cache. */ -+ unsigned int size; -+ /* Maximum number of entries the cache can have. 
*/ -+ unsigned int max_size; -+}; -+ -+#define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \ -+ list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list) -+ -+static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache) -+{ -+ return cache->size; -+} -+ -+static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache) -+{ -+ return cache->size >= cache->max_size; -+} -+ -+static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( -+ struct btrfs_lru_cache *cache) -+{ -+ return list_first_entry_or_null(&cache->lru_list, -+ struct btrfs_lru_cache_entry, lru_list); -+} -+ -+void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size); -+struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, -+ u64 key, u64 gen); -+int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, -+ struct btrfs_lru_cache_entry *new_entry, -+ gfp_t gfp); -+void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, -+ struct btrfs_lru_cache_entry *entry); -+void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache); -+ -+#endif -diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c -index d5e78cbc8fbc..71f6d8302d50 100644 ---- a/fs/btrfs/lzo.c -+++ b/fs/btrfs/lzo.c -@@ -280,7 +280,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - } - - /* Check if we have reached page boundary */ -- if (IS_ALIGNED(cur_in, PAGE_SIZE)) { -+ if (PAGE_ALIGNED(cur_in)) { - put_page(page_in); - page_in = NULL; - } -diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c -index 625bbbbb2608..fde5aaa6e7c9 100644 ---- a/fs/btrfs/messages.c -+++ b/fs/btrfs/messages.c -@@ -292,36 +292,6 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) - } - #endif - --/* -- * We only mark the transaction aborted and then set the file system read-only. -- * This will prevent new transactions from starting or trying to join this -- * one. -- * -- * This means that error recovery at the call site is limited to freeing -- * any local memory allocations and passing the error code up without -- * further cleanup. The transaction should complete as it normally would -- * in the call path but will return -EIO. -- * -- * We'll complete the cleanup in btrfs_end_transaction and -- * btrfs_commit_transaction. -- */ --__cold --void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, -- const char *function, -- unsigned int line, int errno, bool first_hit) --{ -- struct btrfs_fs_info *fs_info = trans->fs_info; -- -- WRITE_ONCE(trans->aborted, errno); -- WRITE_ONCE(trans->transaction->aborted, errno); -- if (first_hit && errno == -ENOSPC) -- btrfs_dump_space_info_for_trans_abort(fs_info); -- /* Wake up anybody who may be waiting on this transaction */ -- wake_up(&fs_info->transaction_wait); -- wake_up(&fs_info->transaction_blocked_wait); -- __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); --} -- - /* - * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an - * alert, and either panics or BUGs, depending on mount options. -diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h -index 190af1f698d9..8c516ee58ff9 100644 ---- a/fs/btrfs/messages.h -+++ b/fs/btrfs/messages.h -@@ -6,7 +6,6 @@ - #include - - struct btrfs_fs_info; --struct btrfs_trans_handle; - - static inline __printf(2, 3) __cold - void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 
-@@ -178,39 +177,6 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function - - const char * __attribute_const__ btrfs_decode_error(int errno); - --__cold --void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, -- const char *function, -- unsigned int line, int errno, bool first_hit); -- --bool __cold abort_should_print_stack(int errno); -- --/* -- * Call btrfs_abort_transaction as early as possible when an error condition is -- * detected, that way the exact stack trace is reported for some errors. -- */ --#define btrfs_abort_transaction(trans, errno) \ --do { \ -- bool first = false; \ -- /* Report first abort since mount */ \ -- if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ -- &((trans)->fs_info->fs_state))) { \ -- first = true; \ -- if (WARN(abort_should_print_stack(errno), \ -- KERN_ERR \ -- "BTRFS: Transaction aborted (error %d)\n", \ -- (errno))) { \ -- /* Stack trace printed. */ \ -- } else { \ -- btrfs_err((trans)->fs_info, \ -- "Transaction aborted (error %d)", \ -- (errno)); \ -- } \ -- } \ -- __btrfs_abort_transaction((trans), __func__, \ -- __LINE__, (errno), first); \ --} while (0) -- - #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args) -diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c -index 57d8c72737e1..1848d0d1a9c4 100644 ---- a/fs/btrfs/ordered-data.c -+++ b/fs/btrfs/ordered-data.c -@@ -160,14 +160,16 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, - * @compress_type: Compression algorithm used for data. - * - * Most of these parameters correspond to &struct btrfs_file_extent_item. The -- * tree is given a single reference on the ordered extent that was inserted. -+ * tree is given a single reference on the ordered extent that was inserted, and -+ * the returned pointer is given a second reference. - * -- * Return: 0 or -ENOMEM. -+ * Return: the new ordered extent or ERR_PTR(-ENOMEM). - */ --int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, -- u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, -- u64 disk_num_bytes, u64 offset, unsigned flags, -- int compress_type) -+struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( -+ struct btrfs_inode *inode, u64 file_offset, -+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, -+ u64 disk_num_bytes, u64 offset, unsigned long flags, -+ int compress_type) - { - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; -@@ -181,7 +183,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - /* For nocow write, we can release the qgroup rsv right now */ - ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); - if (ret < 0) -- return ret; -+ return ERR_PTR(ret); - ret = 0; - } else { - /* -@@ -190,11 +192,11 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - */ - ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); - if (ret < 0) -- return ret; -+ return ERR_PTR(ret); - } - entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); - if (!entry) -- return -ENOMEM; -+ return ERR_PTR(-ENOMEM); - - entry->file_offset = file_offset; - entry->num_bytes = num_bytes; -@@ -256,6 +258,32 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - btrfs_mod_outstanding_extents(inode, 1); - spin_unlock(&inode->lock); - -+ /* One ref for the returned entry to match semantics of lookup. 
*/ -+ refcount_inc(&entry->refs); -+ -+ return entry; -+} -+ -+/* -+ * Add a new btrfs_ordered_extent for the range, but drop the reference instead -+ * of returning it to the caller. -+ */ -+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, -+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, -+ u64 disk_num_bytes, u64 offset, unsigned long flags, -+ int compress_type) -+{ -+ struct btrfs_ordered_extent *ordered; -+ -+ ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes, -+ ram_bytes, disk_bytenr, -+ disk_num_bytes, offset, flags, -+ compress_type); -+ -+ if (IS_ERR(ordered)) -+ return PTR_ERR(ordered); -+ btrfs_put_ordered_extent(ordered); -+ - return 0; - } - -@@ -616,7 +644,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) - struct btrfs_ordered_extent *ordered; - - ordered = container_of(work, struct btrfs_ordered_extent, flush_work); -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - complete(&ordered->completion); - } - -@@ -716,13 +744,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, - } - - /* -- * Used to start IO or wait for a given ordered extent to finish. -+ * Start IO and wait for a given ordered extent to finish. - * -- * If wait is one, this effectively waits on page writeback for all the pages -- * in the extent, and it waits on the io completion code to insert -- * metadata into the btree corresponding to the extent -+ * Wait on page writeback for all the pages in the extent and the IO completion -+ * code to insert metadata into the btree corresponding to the extent. - */ --void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) -+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) - { - u64 start = entry->file_offset; - u64 end = start + entry->num_bytes - 1; -@@ -744,12 +771,10 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) - */ - if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); -- if (wait) { -- if (!freespace_inode) -- btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); -- wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, -- &entry->flags)); -- } -+ -+ if (!freespace_inode) -+ btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); -+ wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); - } - - /* -@@ -800,7 +825,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) - btrfs_put_ordered_extent(ordered); - break; - } -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - end = ordered->file_offset; - /* - * If the ordered extent had an error save the error but don't -@@ -1061,7 +1086,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, - break; - } - unlock_extent(&inode->io_tree, start, end, cachedp); -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - } - } -diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h -index 89f82b78f590..18007f9c00ad 100644 ---- a/fs/btrfs/ordered-data.h -+++ b/fs/btrfs/ordered-data.h -@@ -157,7 +157,6 @@ struct btrfs_ordered_extent { - * command in a workqueue context - */ - u64 physical; -- struct block_device *bdev; - }; - - static inline void -@@ -179,15 +178,20 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - bool 
btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size); -+struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( -+ struct btrfs_inode *inode, u64 file_offset, -+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, -+ u64 disk_num_bytes, u64 offset, unsigned long flags, -+ int compress_type); - int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, -- u64 disk_num_bytes, u64 offset, unsigned flags, -+ u64 disk_num_bytes, u64 offset, unsigned long flags, - int compress_type); - void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, - struct btrfs_ordered_sum *sum); - struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, - u64 file_offset); --void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); -+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); - int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); - struct btrfs_ordered_extent * - btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c -index af97413abcf4..52a7d2fa2284 100644 ---- a/fs/btrfs/qgroup.c -+++ b/fs/btrfs/qgroup.c -@@ -1304,7 +1304,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) - list_del("a_root->dirty_list); - - btrfs_tree_lock(quota_root->node); -- btrfs_clean_tree_block(quota_root->node); -+ btrfs_clear_buffer_dirty(trans, quota_root->node); - btrfs_tree_unlock(quota_root->node); - btrfs_free_tree_block(trans, btrfs_root_id(quota_root), - quota_root->node, 0, 1); -diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c -index ff4b1d583788..642828c1b299 100644 ---- a/fs/btrfs/raid56.c -+++ b/fs/btrfs/raid56.c -@@ -998,7 +998,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) - } - - /* -- * Return the total numer of errors found in the vertical stripe of @sector_nr. -+ * Return the total number of errors found in the vertical stripe of @sector_nr. - * - * @faila and @failb will also be updated to the first and second stripe - * number of the errors. -@@ -1183,7 +1183,15 @@ static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, - trace_info->stripe_nr = -1; - } - --/* Generate PQ for one veritical stripe. */ -+static inline void bio_list_put(struct bio_list *bio_list) -+{ -+ struct bio *bio; -+ -+ while ((bio = bio_list_pop(bio_list))) -+ bio_put(bio); -+} -+ -+/* Generate PQ for one vertical stripe. */ - static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) - { - void **pointers = rbio->finish_pointers; -@@ -1228,7 +1236,6 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) - static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) - { -- struct bio *bio; - /* The total sector number inside the full stripe. */ - int total_sector_nr; - int sectornr; -@@ -1317,8 +1324,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, - - return 0; - error: -- while ((bio = bio_list_pop(bio_list))) -- bio_put(bio); -+ bio_list_put(bio_list); - return -EIO; - } - -@@ -1357,7 +1363,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) - } - - /* -- * For subpage case, we can no longer set page Uptodate directly for -+ * For subpage case, we can no longer set page Up-to-date directly for - * stripe_pages[], thus we need to locate the sector. 
- */ - static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, -@@ -1425,10 +1431,9 @@ static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bi - int total_sector_nr = get_bio_sector_nr(rbio, bio); - u32 bio_size = 0; - struct bio_vec *bvec; -- struct bvec_iter_all iter_all; - int i; - -- bio_for_each_segment_all(bvec, bio, iter_all) -+ bio_for_each_bvec_all(bvec, bio, i) - bio_size += bvec->bv_len; - - /* -@@ -1498,7 +1503,7 @@ static void raid_wait_read_end_io(struct bio *bio) - wake_up(&rbio->io_wait); - } - --static void submit_read_bios(struct btrfs_raid_bio *rbio, -+static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) - { - struct bio *bio; -@@ -1515,41 +1520,8 @@ static void submit_read_bios(struct btrfs_raid_bio *rbio, - } - submit_bio(bio); - } --} -- --static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, -- struct bio_list *bio_list) --{ -- struct bio *bio; -- int total_sector_nr; -- int ret = 0; -- -- ASSERT(bio_list_size(bio_list) == 0); -- -- /* -- * Build a list of bios to read all sectors (including data and P/Q). -- * -- * This behaviro is to compensate the later csum verification and -- * recovery. -- */ -- for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; -- total_sector_nr++) { -- struct sector_ptr *sector; -- int stripe = total_sector_nr / rbio->stripe_nsectors; -- int sectornr = total_sector_nr % rbio->stripe_nsectors; -- -- sector = rbio_stripe_sector(rbio, stripe, sectornr); -- ret = rbio_add_io_sector(rbio, bio_list, sector, -- stripe, sectornr, REQ_OP_READ); -- if (ret) -- goto cleanup; -- } -- return 0; - --cleanup: -- while ((bio = bio_list_pop(bio_list))) -- bio_put(bio); -- return ret; -+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); - } - - static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) -@@ -1668,12 +1640,12 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) - struct btrfs_raid_bio *rbio; - struct btrfs_plug_cb *plug = NULL; - struct blk_plug_cb *cb; -- int ret = 0; - - rbio = alloc_rbio(fs_info, bioc); - if (IS_ERR(rbio)) { -- ret = PTR_ERR(rbio); -- goto fail; -+ bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); -+ bio_endio(bio); -+ return; - } - rbio->operation = BTRFS_RBIO_WRITE; - rbio_add_bio(rbio, bio); -@@ -1682,31 +1654,24 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) - * Don't plug on full rbios, just get them out the door - * as quickly as we can - */ -- if (rbio_is_full(rbio)) -- goto queue_rbio; -- -- cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); -- if (cb) { -- plug = container_of(cb, struct btrfs_plug_cb, cb); -- if (!plug->info) { -- plug->info = fs_info; -- INIT_LIST_HEAD(&plug->rbio_list); -+ if (!rbio_is_full(rbio)) { -+ cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); -+ if (cb) { -+ plug = container_of(cb, struct btrfs_plug_cb, cb); -+ if (!plug->info) { -+ plug->info = fs_info; -+ INIT_LIST_HEAD(&plug->rbio_list); -+ } -+ list_add_tail(&rbio->plug_list, &plug->rbio_list); -+ return; - } -- list_add_tail(&rbio->plug_list, &plug->rbio_list); -- return; - } --queue_rbio: -+ - /* - * Either we don't have any existing plug, or we're doing a full stripe, -- * can queue the rmw work now. -+ * queue the rmw work now. 
- */ - start_async_work(rbio, rmw_rbio_work); -- -- return; -- --fail: -- bio->bi_status = errno_to_blk_status(ret); -- bio_endio(bio); - } - - static int verify_one_sector(struct btrfs_raid_bio *rbio, -@@ -1773,7 +1738,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, - found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, - &failb); - /* -- * No errors in the veritical stripe, skip it. Can happen for recovery -+ * No errors in the vertical stripe, skip it. Can happen for recovery - * which only part of a stripe failed csum check. - */ - if (!found_errors) -@@ -1949,14 +1914,25 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) - return ret; - } - --static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, -- struct bio_list *bio_list) -+static void recover_rbio(struct btrfs_raid_bio *rbio) - { -- struct bio *bio; -+ struct bio_list bio_list = BIO_EMPTY_LIST; - int total_sector_nr; - int ret = 0; - -- ASSERT(bio_list_size(bio_list) == 0); -+ /* -+ * Either we're doing recover for a read failure or degraded write, -+ * caller should have set error bitmap correctly. -+ */ -+ ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); -+ -+ /* For recovery, we need to read all sectors including P/Q. */ -+ ret = alloc_rbio_pages(rbio); -+ if (ret < 0) -+ goto out; -+ -+ index_rbio_pages(rbio); -+ - /* - * Read everything that hasn't failed. However this time we will - * not trust any cached sector. -@@ -1987,78 +1963,32 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, - } - - sector = rbio_stripe_sector(rbio, stripe, sectornr); -- ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, -+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, - sectornr, REQ_OP_READ); -- if (ret < 0) -- goto error; -+ if (ret < 0) { -+ bio_list_put(&bio_list); -+ goto out; -+ } - } -- return 0; --error: -- while ((bio = bio_list_pop(bio_list))) -- bio_put(bio); -- -- return -EIO; --} -- --static int recover_rbio(struct btrfs_raid_bio *rbio) --{ -- struct bio_list bio_list; -- struct bio *bio; -- int ret; -- -- /* -- * Either we're doing recover for a read failure or degraded write, -- * caller should have set error bitmap correctly. -- */ -- ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); -- bio_list_init(&bio_list); -- -- /* For recovery, we need to read all sectors including P/Q. 
*/ -- ret = alloc_rbio_pages(rbio); -- if (ret < 0) -- goto out; -- -- index_rbio_pages(rbio); -- -- ret = recover_assemble_read_bios(rbio, &bio_list); -- if (ret < 0) -- goto out; -- -- submit_read_bios(rbio, &bio_list); -- wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); - -+ submit_read_wait_bio_list(rbio, &bio_list); - ret = recover_sectors(rbio); -- - out: -- while ((bio = bio_list_pop(&bio_list))) -- bio_put(bio); -- -- return ret; -+ rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } - - static void recover_rbio_work(struct work_struct *work) - { - struct btrfs_raid_bio *rbio; -- int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); -- -- ret = lock_stripe_add(rbio); -- if (ret == 0) { -- ret = recover_rbio(rbio); -- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); -- } -+ if (!lock_stripe_add(rbio)) -+ recover_rbio(rbio); - } - - static void recover_rbio_work_locked(struct work_struct *work) - { -- struct btrfs_raid_bio *rbio; -- int ret; -- -- rbio = container_of(work, struct btrfs_raid_bio, work); -- -- ret = recover_rbio(rbio); -- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); -+ recover_rbio(container_of(work, struct btrfs_raid_bio, work)); - } - - static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) -@@ -2204,11 +2134,9 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) - - static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) - { -- struct bio_list bio_list; -- struct bio *bio; -- int ret; -- -- bio_list_init(&bio_list); -+ struct bio_list bio_list = BIO_EMPTY_LIST; -+ int total_sector_nr; -+ int ret = 0; - - /* - * Fill the data csums we need for data verification. We need to fill -@@ -2217,24 +2145,32 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) - */ - fill_data_csums(rbio); - -- ret = rmw_assemble_read_bios(rbio, &bio_list); -- if (ret < 0) -- goto out; -+ /* -+ * Build a list of bios to read all sectors (including data and P/Q). -+ * -+ * This behavior is to compensate the later csum verification and recovery. -+ */ -+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; -+ total_sector_nr++) { -+ struct sector_ptr *sector; -+ int stripe = total_sector_nr / rbio->stripe_nsectors; -+ int sectornr = total_sector_nr % rbio->stripe_nsectors; - -- submit_read_bios(rbio, &bio_list); -- wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); -+ sector = rbio_stripe_sector(rbio, stripe, sectornr); -+ ret = rbio_add_io_sector(rbio, &bio_list, sector, -+ stripe, sectornr, REQ_OP_READ); -+ if (ret) { -+ bio_list_put(&bio_list); -+ return ret; -+ } -+ } - - /* - * We may or may not have any corrupted sectors (including missing dev - * and csum mismatch), just let recover_sectors() to handle them all. 
- */ -- ret = recover_sectors(rbio); -- return ret; --out: -- while ((bio = bio_list_pop(&bio_list))) -- bio_put(bio); -- -- return ret; -+ submit_read_wait_bio_list(rbio, &bio_list); -+ return recover_sectors(rbio); - } - - static void raid_wait_write_end_io(struct bio *bio) -@@ -2290,7 +2226,7 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) - return false; - } - --static int rmw_rbio(struct btrfs_raid_bio *rbio) -+static void rmw_rbio(struct btrfs_raid_bio *rbio) - { - struct bio_list bio_list; - int sectornr; -@@ -2302,30 +2238,28 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) - */ - ret = alloc_rbio_parity_pages(rbio); - if (ret < 0) -- return ret; -+ goto out; - - /* - * Either full stripe write, or we have every data sector already - * cached, can go to write path immediately. - */ -- if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio)) -- goto write; -- -- /* -- * Now we're doing sub-stripe write, also need all data stripes to do -- * the full RMW. -- */ -- ret = alloc_rbio_data_pages(rbio); -- if (ret < 0) -- return ret; -+ if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { -+ /* -+ * Now we're doing sub-stripe write, also need all data stripes -+ * to do the full RMW. -+ */ -+ ret = alloc_rbio_data_pages(rbio); -+ if (ret < 0) -+ goto out; - -- index_rbio_pages(rbio); -+ index_rbio_pages(rbio); - -- ret = rmw_read_wait_recover(rbio); -- if (ret < 0) -- return ret; -+ ret = rmw_read_wait_recover(rbio); -+ if (ret < 0) -+ goto out; -+ } - --write: - /* - * At this stage we're not allowed to add any new bios to the - * bio list any more, anyone else that wants to change this stripe -@@ -2356,7 +2290,7 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) - bio_list_init(&bio_list); - ret = rmw_assemble_write_bios(rbio, &bio_list); - if (ret < 0) -- return ret; -+ goto out; - - /* We should have at least one bio assembled. 
*/ - ASSERT(bio_list_size(&bio_list)); -@@ -2373,32 +2307,22 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) - break; - } - } -- return ret; -+out: -+ rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } - - static void rmw_rbio_work(struct work_struct *work) - { - struct btrfs_raid_bio *rbio; -- int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); -- -- ret = lock_stripe_add(rbio); -- if (ret == 0) { -- ret = rmw_rbio(rbio); -- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); -- } -+ if (lock_stripe_add(rbio) == 0) -+ rmw_rbio(rbio); - } - - static void rmw_rbio_work_locked(struct work_struct *work) - { -- struct btrfs_raid_bio *rbio; -- int ret; -- -- rbio = container_of(work, struct btrfs_raid_bio, work); -- -- ret = rmw_rbio(rbio); -- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); -+ rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); - } - - /* -@@ -2506,7 +2430,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) - struct sector_ptr p_sector = { 0 }; - struct sector_ptr q_sector = { 0 }; - struct bio_list bio_list; -- struct bio *bio; - int is_replace = 0; - int ret; - -@@ -2637,8 +2560,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) - return 0; - - cleanup: -- while ((bio = bio_list_pop(&bio_list))) -- bio_put(bio); -+ bio_list_put(&bio_list); - return ret; - } - -@@ -2733,15 +2655,12 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) - return ret; - } - --static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, -- struct bio_list *bio_list) -+static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) - { -- struct bio *bio; -+ struct bio_list bio_list = BIO_EMPTY_LIST; - int total_sector_nr; - int ret = 0; - -- ASSERT(bio_list_size(bio_list) == 0); -- - /* Build a list of bios to read all the missing parts. */ - for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; - total_sector_nr++) { -@@ -2770,45 +2689,38 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, - if (sector->uptodate) - continue; - -- ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, -+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, - sectornr, REQ_OP_READ); -- if (ret) -- goto error; -+ if (ret) { -+ bio_list_put(&bio_list); -+ return ret; -+ } - } -+ -+ submit_read_wait_bio_list(rbio, &bio_list); - return 0; --error: -- while ((bio = bio_list_pop(bio_list))) -- bio_put(bio); -- return ret; - } - --static int scrub_rbio(struct btrfs_raid_bio *rbio) -+static void scrub_rbio(struct btrfs_raid_bio *rbio) - { - bool need_check = false; -- struct bio_list bio_list; - int sector_nr; - int ret; -- struct bio *bio; -- -- bio_list_init(&bio_list); - - ret = alloc_rbio_essential_pages(rbio); - if (ret) -- goto cleanup; -+ goto out; - - bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); - -- ret = scrub_assemble_read_bios(rbio, &bio_list); -+ ret = scrub_assemble_read_bios(rbio); - if (ret < 0) -- goto cleanup; -- -- submit_read_bios(rbio, &bio_list); -- wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); -+ goto out; - - /* We may have some failures, recover the failed sectors first. */ - ret = recover_scrub_rbio(rbio); - if (ret < 0) -- goto cleanup; -+ goto out; - - /* - * We have every sector properly prepared. 
Can finish the scrub -@@ -2825,23 +2737,13 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) - break; - } - } -- return ret; -- --cleanup: -- while ((bio = bio_list_pop(&bio_list))) -- bio_put(bio); -- -- return ret; -+out: -+ rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } - - static void scrub_rbio_work_locked(struct work_struct *work) - { -- struct btrfs_raid_bio *rbio; -- int ret; -- -- rbio = container_of(work, struct btrfs_raid_bio, work); -- ret = scrub_rbio(rbio); -- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); -+ scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); - } - - void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) -diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h -index 7c73a443939e..df0e0abdeb1f 100644 ---- a/fs/btrfs/raid56.h -+++ b/fs/btrfs/raid56.h -@@ -65,7 +65,7 @@ struct btrfs_raid_bio { - /* Number of data stripes (no p/q) */ - u8 nr_data; - -- /* Numer of all stripes (including P/Q) */ -+ /* Number of all stripes (including P/Q) */ - u8 real_stripes; - - /* How many pages there are for each stripe */ -@@ -132,7 +132,7 @@ struct btrfs_raid_bio { - - /* - * Checksum buffer if the rbio is for data. The buffer should cover -- * all data sectors (exlcuding P/Q sectors). -+ * all data sectors (excluding P/Q sectors). - */ - u8 *csum_buf; - -diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c -index 31ec4a7658ce..ef13a9d4e370 100644 ---- a/fs/btrfs/relocation.c -+++ b/fs/btrfs/relocation.c -@@ -2825,7 +2825,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( - * - * Here we have to manually invalidate the range (i_size, PAGE_END + 1). - */ -- if (!IS_ALIGNED(i_size, PAGE_SIZE)) { -+ if (!PAGE_ALIGNED(i_size)) { - struct address_space *mapping = inode->vfs_inode.i_mapping; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - const u32 sectorsize = fs_info->sectorsize; -diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c -index a5d026041be4..69c93ae333f6 100644 ---- a/fs/btrfs/scrub.c -+++ b/fs/btrfs/scrub.c -@@ -229,7 +229,7 @@ struct full_stripe_lock { - }; - - #ifndef CONFIG_64BIT --/* This structure is for archtectures whose (void *) is smaller than u64 */ -+/* This structure is for architectures whose (void *) is smaller than u64 */ - struct scrub_page_private { - u64 logical; - }; -diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c -index d50182b6deec..e5c963bb873d 100644 ---- a/fs/btrfs/send.c -+++ b/fs/btrfs/send.c -@@ -32,6 +32,7 @@ - #include "file-item.h" - #include "ioctl.h" - #include "verity.h" -+#include "lru_cache.h" - - /* - * Maximum number of references an extent can have in order for us to attempt to -@@ -80,23 +81,23 @@ struct clone_root { - bool found_ref; - }; - --#define SEND_CTX_MAX_NAME_CACHE_SIZE 128 --#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) -+#define SEND_MAX_NAME_CACHE_SIZE 256 - - /* -- * Limit the root_ids array of struct backref_cache_entry to 12 elements. -- * This makes the size of a cache entry to be exactly 128 bytes on x86_64. -+ * Limit the root_ids array of struct backref_cache_entry to 17 elements. -+ * This makes the size of a cache entry to be exactly 192 bytes on x86_64, which -+ * can be satisfied from the kmalloc-192 slab, without wasting any space. - * The most common case is to have a single root for cloning, which corresponds -- * to the send root. Having the user specify more than 11 clone roots is not -+ * to the send root. 
Having the user specify more than 16 clone roots is not - * common, and in such rare cases we simply don't use caching if the number of -- * cloning roots that lead down to a leaf is more than 12. -+ * cloning roots that lead down to a leaf is more than 17. - */ --#define SEND_MAX_BACKREF_CACHE_ROOTS 12 -+#define SEND_MAX_BACKREF_CACHE_ROOTS 17 - - /* - * Max number of entries in the cache. -- * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding -- * maple tree's internal nodes, is 16K. -+ * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding -+ * maple tree's internal nodes, is 24K. - */ - #define SEND_MAX_BACKREF_CACHE_SIZE 128 - -@@ -107,15 +108,31 @@ struct clone_root { - * x86_64). - */ - struct backref_cache_entry { -- /* List to link to the cache's lru list. */ -- struct list_head list; -- /* The key for this entry in the cache. */ -- u64 key; -+ struct btrfs_lru_cache_entry entry; - u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; - /* Number of valid elements in the root_ids array. */ - int num_roots; - }; - -+/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ -+static_assert(offsetof(struct backref_cache_entry, entry) == 0); -+ -+/* -+ * Max number of entries in the cache that stores directories that were already -+ * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses -+ * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but -+ * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). -+ */ -+#define SEND_MAX_DIR_CREATED_CACHE_SIZE 64 -+ -+/* -+ * Max number of entries in the cache that stores directories that were already -+ * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses -+ * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but -+ * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). -+ */ -+#define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64 -+ - struct send_ctx { - struct file *send_filp; - loff_t send_off; -@@ -174,9 +191,7 @@ struct send_ctx { - struct list_head new_refs; - struct list_head deleted_refs; - -- struct radix_tree_root name_cache; -- struct list_head name_cache_list; -- int name_cache_size; -+ struct btrfs_lru_cache name_cache; - - /* - * The inode we are currently processing. It's not NULL only when we -@@ -285,13 +300,11 @@ struct send_ctx { - struct rb_root rbtree_new_refs; - struct rb_root rbtree_deleted_refs; - -- struct { -- u64 last_reloc_trans; -- struct list_head lru_list; -- struct maple_tree entries; -- /* Number of entries stored in the cache. */ -- int size; -- } backref_cache; -+ struct btrfs_lru_cache backref_cache; -+ u64 backref_cache_last_reloc_trans; -+ -+ struct btrfs_lru_cache dir_created_cache; -+ struct btrfs_lru_cache dir_utimes_cache; - }; - - struct pending_dir_move { -@@ -321,21 +334,15 @@ struct orphan_dir_info { - u64 ino; - u64 gen; - u64 last_dir_index_offset; -+ u64 dir_high_seq_ino; - }; - - struct name_cache_entry { -- struct list_head list; - /* -- * radix_tree has only 32bit entries but we need to handle 64bit inums. -- * We use the lower 32bit of the 64bit inum to store it in the tree. If -- * more then one inum would fall into the same entry, we use radix_list -- * to store the additional entries. radix_list is also used to store -- * entries where two entries have the same inum but different -- * generations. -+ * The key in the entry is an inode number, and the generation matches -+ * the inode's generation. 
- */ -- struct list_head radix_list; -- u64 ino; -- u64 gen; -+ struct btrfs_lru_cache_entry entry; - u64 parent_ino; - u64 parent_gen; - int ret; -@@ -344,6 +351,9 @@ struct name_cache_entry { - char name[]; - }; - -+/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ -+static_assert(offsetof(struct name_cache_entry, entry) == 0); -+ - #define ADVANCE 1 - #define ADVANCE_ONLY_NEXT -1 - -@@ -956,14 +966,12 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, - static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) - { - int ret; -- struct btrfs_inode_info info; -+ struct btrfs_inode_info info = { 0 }; - -- if (!gen) -- return -EPERM; -+ ASSERT(gen); - - ret = get_inode_info(root, ino, &info); -- if (!ret) -- *gen = info.gen; -+ *gen = info.gen; - return ret; - } - -@@ -1388,19 +1396,6 @@ static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, - return 0; - } - --static void empty_backref_cache(struct send_ctx *sctx) --{ -- struct backref_cache_entry *entry; -- struct backref_cache_entry *tmp; -- -- list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list) -- kfree(entry); -- -- INIT_LIST_HEAD(&sctx->backref_cache.lru_list); -- mtree_destroy(&sctx->backref_cache.entries); -- sctx->backref_cache.size = 0; --} -- - static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, - const u64 **root_ids_ret, int *root_count_ret) - { -@@ -1408,9 +1403,10 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, - struct send_ctx *sctx = bctx->sctx; - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; - const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; -+ struct btrfs_lru_cache_entry *raw_entry; - struct backref_cache_entry *entry; - -- if (sctx->backref_cache.size == 0) -+ if (btrfs_lru_cache_size(&sctx->backref_cache) == 0) - return false; - - /* -@@ -1424,18 +1420,18 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, - * transaction handle or holding fs_info->commit_root_sem, so no need - * to take any lock here. - */ -- if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) { -- empty_backref_cache(sctx); -+ if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) { -+ btrfs_lru_cache_clear(&sctx->backref_cache); - return false; - } - -- entry = mtree_load(&sctx->backref_cache.entries, key); -- if (!entry) -+ raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0); -+ if (!raw_entry) - return false; - -+ entry = container_of(raw_entry, struct backref_cache_entry, entry); - *root_ids_ret = entry->root_ids; - *root_count_ret = entry->num_roots; -- list_move_tail(&entry->list, &sctx->backref_cache.lru_list); - - return true; - } -@@ -1461,7 +1457,8 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, - if (!new_entry) - return; - -- new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits; -+ new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits; -+ new_entry->entry.gen = 0; - new_entry->num_roots = 0; - ULIST_ITER_INIT(&uiter); - while ((node = ulist_next(root_ids, &uiter)) != NULL) { -@@ -1489,23 +1486,12 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, - * none of the roots is part of the list of roots from which we are - * allowed to clone. Cache the new entry as it's still useful to avoid - * backref walking to determine which roots have a path to the leaf. 
-+ * -+ * Also use GFP_NOFS because we're called while holding a transaction -+ * handle or while holding fs_info->commit_root_sem. - */ -- -- if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) { -- struct backref_cache_entry *lru_entry; -- struct backref_cache_entry *mt_entry; -- -- lru_entry = list_first_entry(&sctx->backref_cache.lru_list, -- struct backref_cache_entry, list); -- mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key); -- ASSERT(mt_entry == lru_entry); -- list_del(&mt_entry->list); -- kfree(mt_entry); -- sctx->backref_cache.size--; -- } -- -- ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key, -- new_entry, GFP_NOFS); -+ ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry, -+ GFP_NOFS); - ASSERT(ret == 0 || ret == -ENOMEM); - if (ret) { - /* Caching is optional, no worries. */ -@@ -1513,17 +1499,13 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, - return; - } - -- list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list); -- - /* - * We are called from iterate_extent_inodes() while either holding a - * transaction handle or holding fs_info->commit_root_sem, so no need - * to take any lock here. - */ -- if (sctx->backref_cache.size == 0) -- sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans; -- -- sctx->backref_cache.size++; -+ if (btrfs_lru_cache_size(&sctx->backref_cache) == 1) -+ sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; - } - - static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, -@@ -1886,7 +1868,8 @@ enum inode_state { - inode_state_did_delete, - }; - --static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) -+static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, -+ u64 *send_gen, u64 *parent_gen) - { - int ret; - int left_ret; -@@ -1900,6 +1883,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) - goto out; - left_ret = (info.nlink == 0) ? -ENOENT : ret; - left_gen = info.gen; -+ if (send_gen) -+ *send_gen = ((left_ret == -ENOENT) ? 0 : info.gen); - - if (!sctx->parent_root) { - right_ret = -ENOENT; -@@ -1909,6 +1894,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) - goto out; - right_ret = (info.nlink == 0) ? -ENOENT : ret; - right_gen = info.gen; -+ if (parent_gen) -+ *parent_gen = ((right_ret == -ENOENT) ? 
0 : info.gen); - } - - if (!left_ret && !right_ret) { -@@ -1953,14 +1940,15 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) - return ret; - } - --static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) -+static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, -+ u64 *send_gen, u64 *parent_gen) - { - int ret; - - if (ino == BTRFS_FIRST_FREE_OBJECTID) - return 1; - -- ret = get_cur_inode_state(sctx, ino, gen); -+ ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); - if (ret < 0) - goto out; - -@@ -2121,43 +2109,36 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, - const char *name, int name_len, - u64 *who_ino, u64 *who_gen, u64 *who_mode) - { -- int ret = 0; -- u64 gen; -+ int ret; -+ u64 parent_root_dir_gen; - u64 other_inode = 0; - struct btrfs_inode_info info; - - if (!sctx->parent_root) -- goto out; -+ return 0; - -- ret = is_inode_existent(sctx, dir, dir_gen); -+ ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen); - if (ret <= 0) -- goto out; -+ return 0; - - /* - * If we have a parent root we need to verify that the parent dir was - * not deleted and then re-created, if it was then we have no overwrite - * and we can just unlink this entry. -+ * -+ * @parent_root_dir_gen was set to 0 if the inode does not exist in the -+ * parent root. - */ -- if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { -- ret = get_inode_gen(sctx->parent_root, dir, &gen); -- if (ret < 0 && ret != -ENOENT) -- goto out; -- if (ret) { -- ret = 0; -- goto out; -- } -- if (gen != dir_gen) -- goto out; -- } -+ if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID && -+ parent_root_dir_gen != dir_gen) -+ return 0; - - ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, - &other_inode); -- if (ret < 0 && ret != -ENOENT) -- goto out; -- if (ret) { -- ret = 0; -- goto out; -- } -+ if (ret == -ENOENT) -+ return 0; -+ else if (ret < 0) -+ return ret; - - /* - * Check if the overwritten ref was already processed. If yes, the ref -@@ -2168,18 +2149,15 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, - is_waiting_for_move(sctx, other_inode)) { - ret = get_inode_info(sctx->parent_root, other_inode, &info); - if (ret < 0) -- goto out; -+ return ret; - -- ret = 1; - *who_ino = other_inode; - *who_gen = info.gen; - *who_mode = info.mode; -- } else { -- ret = 0; -+ return 1; - } - --out: -- return ret; -+ return 0; - } - - /* -@@ -2194,47 +2172,43 @@ static int did_overwrite_ref(struct send_ctx *sctx, - u64 ino, u64 ino_gen, - const char *name, int name_len) - { -- int ret = 0; -- u64 gen; -+ int ret; - u64 ow_inode; -+ u64 ow_gen = 0; -+ u64 send_root_dir_gen; - - if (!sctx->parent_root) -- goto out; -+ return 0; - -- ret = is_inode_existent(sctx, dir, dir_gen); -+ ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL); - if (ret <= 0) -- goto out; -+ return ret; - -- if (dir != BTRFS_FIRST_FREE_OBJECTID) { -- ret = get_inode_gen(sctx->send_root, dir, &gen); -- if (ret < 0 && ret != -ENOENT) -- goto out; -- if (ret) { -- ret = 0; -- goto out; -- } -- if (gen != dir_gen) -- goto out; -- } -+ /* -+ * @send_root_dir_gen was set to 0 if the inode does not exist in the -+ * send root. 
-+ */ -+ if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen) -+ return 0; - - /* check if the ref was overwritten by another ref */ - ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, - &ow_inode); -- if (ret < 0 && ret != -ENOENT) -- goto out; -- if (ret) { -+ if (ret == -ENOENT) { - /* was never and will never be overwritten */ -- ret = 0; -- goto out; -+ return 0; -+ } else if (ret < 0) { -+ return ret; - } - -- ret = get_inode_gen(sctx->send_root, ow_inode, &gen); -- if (ret < 0) -- goto out; -+ if (ow_inode == ino) { -+ ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); -+ if (ret < 0) -+ return ret; - -- if (ow_inode == ino && gen == ino_gen) { -- ret = 0; -- goto out; -+ /* It's the same inode, so no overwrite happened. */ -+ if (ow_gen == ino_gen) -+ return 0; - } - - /* -@@ -2243,15 +2217,20 @@ static int did_overwrite_ref(struct send_ctx *sctx, - * inode 'ino' to be orphanized, therefore check if ow_inode matches - * the current inode being processed. - */ -- if ((ow_inode < sctx->send_progress) || -- (ino != sctx->cur_ino && ow_inode == sctx->cur_ino && -- gen == sctx->cur_inode_gen)) -- ret = 1; -- else -- ret = 0; -+ if (ow_inode < sctx->send_progress) -+ return 1; - --out: -- return ret; -+ if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) { -+ if (ow_gen == 0) { -+ ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); -+ if (ret < 0) -+ return ret; -+ } -+ if (ow_gen == sctx->cur_inode_gen) -+ return 1; -+ } -+ -+ return 0; - } - - /* -@@ -2285,113 +2264,16 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) - return ret; - } - --/* -- * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, -- * so we need to do some special handling in case we have clashes. This function -- * takes care of this with the help of name_cache_entry::radix_list. -- * In case of error, nce is kfreed. 
-- */ --static int name_cache_insert(struct send_ctx *sctx, -- struct name_cache_entry *nce) -+static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx, -+ u64 ino, u64 gen) - { -- int ret = 0; -- struct list_head *nce_head; -- -- nce_head = radix_tree_lookup(&sctx->name_cache, -- (unsigned long)nce->ino); -- if (!nce_head) { -- nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); -- if (!nce_head) { -- kfree(nce); -- return -ENOMEM; -- } -- INIT_LIST_HEAD(nce_head); -- -- ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); -- if (ret < 0) { -- kfree(nce_head); -- kfree(nce); -- return ret; -- } -- } -- list_add_tail(&nce->radix_list, nce_head); -- list_add_tail(&nce->list, &sctx->name_cache_list); -- sctx->name_cache_size++; -- -- return ret; --} -+ struct btrfs_lru_cache_entry *entry; - --static void name_cache_delete(struct send_ctx *sctx, -- struct name_cache_entry *nce) --{ -- struct list_head *nce_head; -- -- nce_head = radix_tree_lookup(&sctx->name_cache, -- (unsigned long)nce->ino); -- if (!nce_head) { -- btrfs_err(sctx->send_root->fs_info, -- "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", -- nce->ino, sctx->name_cache_size); -- } -- -- list_del(&nce->radix_list); -- list_del(&nce->list); -- sctx->name_cache_size--; -- -- /* -- * We may not get to the final release of nce_head if the lookup fails -- */ -- if (nce_head && list_empty(nce_head)) { -- radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); -- kfree(nce_head); -- } --} -- --static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, -- u64 ino, u64 gen) --{ -- struct list_head *nce_head; -- struct name_cache_entry *cur; -- -- nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); -- if (!nce_head) -+ entry = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen); -+ if (!entry) - return NULL; - -- list_for_each_entry(cur, nce_head, radix_list) { -- if (cur->ino == ino && cur->gen == gen) -- return cur; -- } -- return NULL; --} -- --/* -- * Remove some entries from the beginning of name_cache_list. -- */ --static void name_cache_clean_unused(struct send_ctx *sctx) --{ -- struct name_cache_entry *nce; -- -- if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE) -- return; -- -- while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) { -- nce = list_entry(sctx->name_cache_list.next, -- struct name_cache_entry, list); -- name_cache_delete(sctx, nce); -- kfree(nce); -- } --} -- --static void name_cache_free(struct send_ctx *sctx) --{ -- struct name_cache_entry *nce; -- -- while (!list_empty(&sctx->name_cache_list)) { -- nce = list_entry(sctx->name_cache_list.next, -- struct name_cache_entry, list); -- name_cache_delete(sctx, nce); -- kfree(nce); -- } -+ return container_of(entry, struct name_cache_entry, entry); - } - - /* -@@ -2410,7 +2292,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, - { - int ret; - int nce_ret; -- struct name_cache_entry *nce = NULL; -+ struct name_cache_entry *nce; - - /* - * First check if we already did a call to this function with the same -@@ -2420,17 +2302,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, - nce = name_cache_search(sctx, ino, gen); - if (nce) { - if (ino < sctx->send_progress && nce->need_later_update) { -- name_cache_delete(sctx, nce); -- kfree(nce); -+ btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry); - nce = NULL; - } else { -- /* -- * Removes the entry from the list and adds it back to -- * the end. 
This marks the entry as recently used so -- * that name_cache_clean_unused does not remove it. -- */ -- list_move_tail(&nce->list, &sctx->name_cache_list); -- - *parent_ino = nce->parent_ino; - *parent_gen = nce->parent_gen; - ret = fs_path_add(dest, nce->name, nce->name_len); -@@ -2446,7 +2320,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, - * This should only happen for the parent dir that we determine in - * record_new_ref_if_needed(). - */ -- ret = is_inode_existent(sctx, ino, gen); -+ ret = is_inode_existent(sctx, ino, gen, NULL, NULL); - if (ret < 0) - goto out; - -@@ -2497,8 +2371,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, - goto out; - } - -- nce->ino = ino; -- nce->gen = gen; -+ nce->entry.key = ino; -+ nce->entry.gen = gen; - nce->parent_ino = *parent_ino; - nce->parent_gen = *parent_gen; - nce->name_len = fs_path_len(dest); -@@ -2510,10 +2384,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, - else - nce->need_later_update = 1; - -- nce_ret = name_cache_insert(sctx, nce); -- if (nce_ret < 0) -+ nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); -+ if (nce_ret < 0) { -+ kfree(nce); - ret = nce_ret; -- name_cache_clean_unused(sctx); -+ } - - out: - return ret; -@@ -2883,6 +2758,63 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) - return ret; - } - -+/* -+ * If the cache is full, we can't remove entries from it and do a call to -+ * send_utimes() for each respective inode, because we might be finishing -+ * processing an inode that is a directory and it just got renamed, and existing -+ * entries in the cache may refer to inodes that have the directory in their -+ * full path - in which case we would generate outdated paths (pre-rename) -+ * for the inodes that the cache entries point to. Instead of prunning the -+ * cache when inserting, do it after we finish processing each inode at -+ * finish_inode_if_needed(). -+ */ -+static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) -+{ -+ struct btrfs_lru_cache_entry *entry; -+ int ret; -+ -+ entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen); -+ if (entry != NULL) -+ return 0; -+ -+ /* Caching is optional, don't fail if we can't allocate memory. */ -+ entry = kmalloc(sizeof(*entry), GFP_KERNEL); -+ if (!entry) -+ return send_utimes(sctx, dir, gen); -+ -+ entry->key = dir; -+ entry->gen = gen; -+ -+ ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL); -+ ASSERT(ret != -EEXIST); -+ if (ret) { -+ kfree(entry); -+ return send_utimes(sctx, dir, gen); -+ } -+ -+ return 0; -+} -+ -+static int trim_dir_utimes_cache(struct send_ctx *sctx) -+{ -+ while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) > -+ SEND_MAX_DIR_UTIMES_CACHE_SIZE) { -+ struct btrfs_lru_cache_entry *lru; -+ int ret; -+ -+ lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache); -+ ASSERT(lru != NULL); -+ -+ ret = send_utimes(sctx, lru->key, lru->gen); -+ if (ret) -+ return ret; -+ -+ btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru); -+ } -+ -+ return 0; -+} -+ - /* - * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have - * a valid path yet because we did not process the refs yet. So, the inode -@@ -2971,6 +2903,23 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) - return ret; - } - -+static void cache_dir_created(struct send_ctx *sctx, u64 dir) -+{ -+ struct btrfs_lru_cache_entry *entry; -+ int ret; -+ -+ /* Caching is optional, ignore any failures. 
*/ -+ entry = kmalloc(sizeof(*entry), GFP_KERNEL); -+ if (!entry) -+ return; -+ -+ entry->key = dir; -+ entry->gen = 0; -+ ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL); -+ if (ret < 0) -+ kfree(entry); -+} -+ - /* - * We need some special handling for inodes that get processed before the parent - * directory got created. See process_recorded_refs for details. -@@ -2986,6 +2935,9 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) - struct btrfs_key di_key; - struct btrfs_dir_item *di; - -+ if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0)) -+ return 1; -+ - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; -@@ -3009,6 +2961,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) - if (di_key.type != BTRFS_ROOT_ITEM_KEY && - di_key.objectid < sctx->send_progress) { - ret = 1; -+ cache_dir_created(sctx, dir); - break; - } - } -@@ -3038,7 +2991,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx) - return 0; - } - -- return send_create_inode(sctx, sctx->cur_ino); -+ ret = send_create_inode(sctx, sctx->cur_ino); -+ -+ if (ret == 0 && S_ISDIR(sctx->cur_inode_mode)) -+ cache_dir_created(sctx, sctx->cur_ino); -+ -+ return ret; - } - - struct recorded_ref { -@@ -3166,6 +3124,7 @@ static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx, - odi->ino = dir_ino; - odi->gen = dir_gen; - odi->last_dir_index_offset = 0; -+ odi->dir_high_seq_ino = 0; - - rb_link_node(&odi->node, parent, p); - rb_insert_color(&odi->node, &sctx->orphan_dirs); -@@ -3215,8 +3174,7 @@ static void free_orphan_dir_info(struct send_ctx *sctx, - * We check this by iterating all dir items and checking if the inode behind - * the dir item was already processed. - */ --static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, -- u64 send_progress) -+static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen) - { - int ret = 0; - int iter_ret = 0; -@@ -3227,6 +3185,8 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, - struct btrfs_key loc; - struct btrfs_dir_item *di; - struct orphan_dir_info *odi = NULL; -+ u64 dir_high_seq_ino = 0; -+ u64 last_dir_index_offset = 0; - - /* - * Don't try to rmdir the top/root subvolume dir. -@@ -3234,17 +3194,62 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, - if (dir == BTRFS_FIRST_FREE_OBJECTID) - return 0; - -+ odi = get_orphan_dir_info(sctx, dir, dir_gen); -+ if (odi && sctx->cur_ino < odi->dir_high_seq_ino) -+ return 0; -+ - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; - -+ if (!odi) { -+ /* -+ * Find the inode number associated with the last dir index -+ * entry. This is very likely the inode with the highest number -+ * of all inodes that have an entry in the directory. We can -+ * then use it to avoid future calls to can_rmdir(), when -+ * processing inodes with a lower number, from having to search -+ * the parent root b+tree for dir index keys. -+ */ -+ key.objectid = dir; -+ key.type = BTRFS_DIR_INDEX_KEY; -+ key.offset = (u64)-1; -+ -+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); -+ if (ret < 0) { -+ goto out; -+ } else if (ret > 0) { -+ /* Can't happen, the root is never empty. */ -+ ASSERT(path->slots[0] > 0); -+ if (WARN_ON(path->slots[0] == 0)) { -+ ret = -EUCLEAN; -+ goto out; -+ } -+ path->slots[0]--; -+ } -+ -+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); -+ if (key.objectid != dir || key.type != BTRFS_DIR_INDEX_KEY) { -+ /* No index keys, dir can be removed. 
*/ -+ ret = 1; -+ goto out; -+ } -+ -+ di = btrfs_item_ptr(path->nodes[0], path->slots[0], -+ struct btrfs_dir_item); -+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); -+ dir_high_seq_ino = loc.objectid; -+ if (sctx->cur_ino < dir_high_seq_ino) { -+ ret = 0; -+ goto out; -+ } -+ -+ btrfs_release_path(path); -+ } -+ - key.objectid = dir; - key.type = BTRFS_DIR_INDEX_KEY; -- key.offset = 0; -- -- odi = get_orphan_dir_info(sctx, dir, dir_gen); -- if (odi) -- key.offset = odi->last_dir_index_offset; -+ key.offset = (odi ? odi->last_dir_index_offset : 0); - - btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { - struct waiting_dir_move *dm; -@@ -3257,29 +3262,18 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, - struct btrfs_dir_item); - btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); - -+ dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid); -+ last_dir_index_offset = found_key.offset; -+ - dm = get_waiting_dir_move(sctx, loc.objectid); - if (dm) { -- odi = add_orphan_dir_info(sctx, dir, dir_gen); -- if (IS_ERR(odi)) { -- ret = PTR_ERR(odi); -- goto out; -- } -- odi->gen = dir_gen; -- odi->last_dir_index_offset = found_key.offset; - dm->rmdir_ino = dir; - dm->rmdir_gen = dir_gen; - ret = 0; - goto out; - } - -- if (loc.objectid > send_progress) { -- odi = add_orphan_dir_info(sctx, dir, dir_gen); -- if (IS_ERR(odi)) { -- ret = PTR_ERR(odi); -- goto out; -- } -- odi->gen = dir_gen; -- odi->last_dir_index_offset = found_key.offset; -+ if (loc.objectid > sctx->cur_ino) { - ret = 0; - goto out; - } -@@ -3294,7 +3288,22 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, - - out: - btrfs_free_path(path); -- return ret; -+ -+ if (ret) -+ return ret; -+ -+ if (!odi) { -+ odi = add_orphan_dir_info(sctx, dir, dir_gen); -+ if (IS_ERR(odi)) -+ return PTR_ERR(odi); -+ -+ odi->gen = dir_gen; -+ } -+ -+ odi->last_dir_index_offset = last_dir_index_offset; -+ odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino); -+ -+ return 0; - } - - static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) -@@ -3579,7 +3588,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) - } - gen = odi->gen; - -- ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino); -+ ret = can_rmdir(sctx, rmdir_ino, gen); - if (ret < 0) - goto out; - if (!ret) -@@ -3599,7 +3608,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) - } - - finish: -- ret = send_utimes(sctx, pm->ino, pm->gen); -+ ret = cache_dir_utimes(sctx, pm->ino, pm->gen); - if (ret < 0) - goto out; - -@@ -3619,7 +3628,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) - if (ret < 0) - goto out; - -- ret = send_utimes(sctx, cur->dir, cur->dir_gen); -+ ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); - if (ret < 0) - goto out; - } -@@ -4242,7 +4251,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) - * "testdir_2". - */ - list_for_each_entry(cur, &sctx->new_refs, list) { -- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); -+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); - if (ret < 0) - goto out; - if (ret == inode_state_will_create) -@@ -4288,12 +4297,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) - * the source path when performing its rename - * operation. 
- */ -- if (is_waiting_for_move(sctx, ow_inode)) { -- wdm = get_waiting_dir_move(sctx, -- ow_inode); -- ASSERT(wdm); -+ wdm = get_waiting_dir_move(sctx, ow_inode); -+ if (wdm) - wdm->orphanized = true; -- } - - /* - * Make sure we clear our orphanized inode's -@@ -4306,10 +4312,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) - * and get instead the orphan name. - */ - nce = name_cache_search(sctx, ow_inode, ow_gen); -- if (nce) { -- name_cache_delete(sctx, nce); -- kfree(nce); -- } -+ if (nce) -+ btrfs_lru_cache_remove(&sctx->name_cache, -+ &nce->entry); - - /* - * ow_inode might currently be an ancestor of -@@ -4358,7 +4363,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) - * parent directory out of order. But we need to check if this - * did already happen before due to other refs in the same dir. - */ -- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); -+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); - if (ret < 0) - goto out; - if (ret == inode_state_will_create) { -@@ -4388,6 +4393,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) - ret = send_create_inode(sctx, cur->dir); - if (ret < 0) - goto out; -+ cache_dir_created(sctx, cur->dir); - } - } - -@@ -4470,8 +4476,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) - * later, we do this check again and rmdir it then if possible. - * See the use of check_dirs for more details. - */ -- ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, -- sctx->cur_ino); -+ ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen); - if (ret < 0) - goto out; - if (ret) { -@@ -4564,20 +4569,18 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) - if (cur->dir > sctx->cur_ino) - continue; - -- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); -+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); - if (ret < 0) - goto out; - - if (ret == inode_state_did_create || - ret == inode_state_no_change) { -- /* TODO delayed utimes */ -- ret = send_utimes(sctx, cur->dir, cur->dir_gen); -+ ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); - if (ret < 0) - goto out; - } else if (ret == inode_state_did_delete && - cur->dir != last_dir_ino_rm) { -- ret = can_rmdir(sctx, cur->dir, cur->dir_gen, -- sctx->cur_ino); -+ ret = can_rmdir(sctx, cur->dir, cur->dir_gen); - if (ret < 0) - goto out; - if (ret) { -@@ -5635,7 +5638,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, - * boundary in the send buffer. This means that there may be a gap - * between the beginning of the command and the file data. - */ -- data_offset = ALIGN(sctx->send_size, PAGE_SIZE); -+ data_offset = PAGE_ALIGN(sctx->send_size); - if (data_offset > sctx->send_max_size || - sctx->send_max_size - data_offset < disk_num_bytes) { - ret = -EOVERFLOW; -@@ -5759,7 +5762,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, - sent += size; - } - -- if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { -+ if (sctx->clean_page_cache && PAGE_ALIGNED(end)) { - /* - * Always operate only on ranges that are a multiple of the page - * size. This is not only to prevent zeroing parts of a page in -@@ -6754,12 +6757,26 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) - * it's moved/renamed, therefore we don't need to do it here. 
- */ - sctx->send_progress = sctx->cur_ino + 1; -- ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); -+ -+ /* -+ * If the current inode is a non-empty directory, delay issuing -+ * the utimes command for it, as it's very likely we have inodes -+ * with an higher number inside it. We want to issue the utimes -+ * command only after adding all dentries to it. -+ */ -+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0) -+ ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); -+ else -+ ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); -+ - if (ret < 0) - goto out; - } - - out: -+ if (!ret) -+ ret = trim_dir_utimes_cache(sctx); -+ - return ret; - } - -@@ -8044,6 +8061,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) - int clone_sources_to_rollback = 0; - size_t alloc_size; - int sort_clone_roots = 0; -+ struct btrfs_lru_cache_entry *entry; -+ struct btrfs_lru_cache_entry *tmp; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; -@@ -8094,11 +8113,22 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) - - INIT_LIST_HEAD(&sctx->new_refs); - INIT_LIST_HEAD(&sctx->deleted_refs); -- INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); -- INIT_LIST_HEAD(&sctx->name_cache_list); - -- INIT_LIST_HEAD(&sctx->backref_cache.lru_list); -- mt_init(&sctx->backref_cache.entries); -+ btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE); -+ btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE); -+ btrfs_lru_cache_init(&sctx->dir_created_cache, -+ SEND_MAX_DIR_CREATED_CACHE_SIZE); -+ /* -+ * This cache is periodically trimmed to a fixed size elsewhere, see -+ * cache_dir_utimes() and trim_dir_utimes_cache(). -+ */ -+ btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0); -+ -+ sctx->pending_dir_moves = RB_ROOT; -+ sctx->waiting_dir_moves = RB_ROOT; -+ sctx->orphan_dirs = RB_ROOT; -+ sctx->rbtree_new_refs = RB_ROOT; -+ sctx->rbtree_deleted_refs = RB_ROOT; - - sctx->flags = arg->flags; - -@@ -8165,12 +8195,6 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) - goto out; - } - -- sctx->pending_dir_moves = RB_ROOT; -- sctx->waiting_dir_moves = RB_ROOT; -- sctx->orphan_dirs = RB_ROOT; -- sctx->rbtree_new_refs = RB_ROOT; -- sctx->rbtree_deleted_refs = RB_ROOT; -- - sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), - arg->clone_sources_count + 1, - GFP_KERNEL); -@@ -8279,6 +8303,13 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) - if (ret < 0) - goto out; - -+ btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) { -+ ret = send_utimes(sctx, entry->key, entry->gen); -+ if (ret < 0) -+ goto out; -+ btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry); -+ } -+ - if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { - ret = begin_cmd(sctx, BTRFS_SEND_C_END); - if (ret < 0) -@@ -8358,11 +8389,12 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) - kvfree(sctx->send_buf); - kvfree(sctx->verity_descriptor); - -- name_cache_free(sctx); -- - close_current_inode(sctx); - -- empty_backref_cache(sctx); -+ btrfs_lru_cache_clear(&sctx->name_cache); -+ btrfs_lru_cache_clear(&sctx->backref_cache); -+ btrfs_lru_cache_clear(&sctx->dir_created_cache); -+ btrfs_lru_cache_clear(&sctx->dir_utimes_cache); - - kfree(sctx); - } -diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c -index 433ce221dc5c..581845bc206a 100644 ---- a/fs/btrfs/super.c -+++ b/fs/btrfs/super.c -@@ -58,6 +58,7 @@ - #include 
"scrub.h" - #include "verity.h" - #include "super.h" -+#include "extent-tree.h" - #define CREATE_TRACE_POINTS - #include - -@@ -2049,7 +2050,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) - } - - /* -- * Metadata in mixed block goup profiles are accounted in data -+ * Metadata in mixed block group profiles are accounted in data - */ - if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { - if (found->flags & BTRFS_BLOCK_GROUP_DATA) -diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c -index 108aa3876186..8c5efa5813b3 100644 ---- a/fs/btrfs/sysfs.c -+++ b/fs/btrfs/sysfs.c -@@ -702,7 +702,7 @@ static void release_raid_kobj(struct kobject *kobj) - kfree(to_raid_kobj(kobj)); - } - --static struct kobj_type btrfs_raid_ktype = { -+static const struct kobj_type btrfs_raid_ktype = { - .sysfs_ops = &kobj_sysfs_ops, - .release = release_raid_kobj, - .default_groups = raid_groups, -@@ -900,7 +900,7 @@ static void space_info_release(struct kobject *kobj) - kfree(sinfo); - } - --static struct kobj_type space_info_ktype = { -+static const struct kobj_type space_info_ktype = { - .sysfs_ops = &kobj_sysfs_ops, - .release = space_info_release, - .default_groups = space_info_groups, -@@ -1259,7 +1259,7 @@ static void btrfs_release_fsid_kobj(struct kobject *kobj) - complete(&fs_devs->kobj_unregister); - } - --static struct kobj_type btrfs_ktype = { -+static const struct kobj_type btrfs_ktype = { - .sysfs_ops = &kobj_sysfs_ops, - .release = btrfs_release_fsid_kobj, - }; -@@ -1789,7 +1789,7 @@ static void btrfs_release_devid_kobj(struct kobject *kobj) - complete(&device->kobj_unregister); - } - --static struct kobj_type devid_ktype = { -+static const struct kobj_type devid_ktype = { - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = devid_groups, - .release = btrfs_release_devid_kobj, -@@ -2103,7 +2103,7 @@ static void qgroups_release(struct kobject *kobj) - kfree(kobj); - } - --static struct kobj_type qgroups_ktype = { -+static const struct kobj_type qgroups_ktype = { - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = qgroups_groups, - .release = qgroups_release, -@@ -2173,7 +2173,7 @@ static void qgroup_release(struct kobject *kobj) - memset(&qgroup->kobj, 0, sizeof(*kobj)); - } - --static struct kobj_type qgroup_ktype = { -+static const struct kobj_type qgroup_ktype = { - .sysfs_ops = &kobj_sysfs_ops, - .release = qgroup_release, - .default_groups = qgroup_groups, -diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c -index c5b3a631bf4f..f2f2e11dac4c 100644 ---- a/fs/btrfs/tests/extent-map-tests.c -+++ b/fs/btrfs/tests/extent-map-tests.c -@@ -509,7 +509,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, - goto out_free; - } - -- ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1), -+ ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), - &logical, &out_ndaddrs, &out_stripe_len); - if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { - test_err("didn't rmap anything but expected %d", -diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c -index 8f8d0fce6e4a..18329ebcb1cb 100644 ---- a/fs/btrfs/transaction.c -+++ b/fs/btrfs/transaction.c -@@ -2609,6 +2609,35 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) - return (ret < 0) ? 0 : 1; - } - -+/* -+ * We only mark the transaction aborted and then set the file system read-only. -+ * This will prevent new transactions from starting or trying to join this -+ * one. 
-+ * -+ * This means that error recovery at the call site is limited to freeing -+ * any local memory allocations and passing the error code up without -+ * further cleanup. The transaction should complete as it normally would -+ * in the call path but will return -EIO. -+ * -+ * We'll complete the cleanup in btrfs_end_transaction and -+ * btrfs_commit_transaction. -+ */ -+void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, -+ const char *function, -+ unsigned int line, int errno, bool first_hit) -+{ -+ struct btrfs_fs_info *fs_info = trans->fs_info; -+ -+ WRITE_ONCE(trans->aborted, errno); -+ WRITE_ONCE(trans->transaction->aborted, errno); -+ if (first_hit && errno == -ENOSPC) -+ btrfs_dump_space_info_for_trans_abort(fs_info); -+ /* Wake up anybody who may be waiting on this transaction */ -+ wake_up(&fs_info->transaction_wait); -+ wake_up(&fs_info->transaction_blocked_wait); -+ __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); -+} -+ - int __init btrfs_transaction_init(void) - { - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", -diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h -index 97f6c39f59c8..fa728ab80826 100644 ---- a/fs/btrfs/transaction.h -+++ b/fs/btrfs/transaction.h -@@ -202,6 +202,34 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) - delayed_refs->qgroup_to_skip = 0; - } - -+bool __cold abort_should_print_stack(int errno); -+ -+/* -+ * Call btrfs_abort_transaction as early as possible when an error condition is -+ * detected, that way the exact stack trace is reported for some errors. -+ */ -+#define btrfs_abort_transaction(trans, errno) \ -+do { \ -+ bool first = false; \ -+ /* Report first abort since mount */ \ -+ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ -+ &((trans)->fs_info->fs_state))) { \ -+ first = true; \ -+ if (WARN(abort_should_print_stack(errno), \ -+ KERN_ERR \ -+ "BTRFS: Transaction aborted (error %d)\n", \ -+ (errno))) { \ -+ /* Stack trace printed. */ \ -+ } else { \ -+ btrfs_debug((trans)->fs_info, \ -+ "Transaction aborted (error %d)", \ -+ (errno)); \ -+ } \ -+ } \ -+ __btrfs_abort_transaction((trans), __func__, \ -+ __LINE__, (errno), first); \ -+} while (0) -+ - int btrfs_end_transaction(struct btrfs_trans_handle *trans); - struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - unsigned int num_items); -@@ -236,6 +264,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); - void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); - void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); -+void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, -+ const char *function, -+ unsigned int line, int errno, bool first_hit); - - int __init btrfs_transaction_init(void); - void __cold btrfs_transaction_exit(void); -diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c -index 58599189bd18..200cea6e49e5 100644 ---- a/fs/btrfs/tree-log.c -+++ b/fs/btrfs/tree-log.c -@@ -279,12 +279,6 @@ void btrfs_end_log_trans(struct btrfs_root *root) - } - } - --static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) --{ -- filemap_fdatawait_range(buf->pages[0]->mapping, -- buf->start, buf->start + buf->len - 1); --} -- - /* - * the walk control struct is used to pass state down the chain when - * processing the log tree. 
The stage field tells us which part -@@ -2623,11 +2617,12 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, - return ret; - } - -+ btrfs_tree_lock(next); -+ btrfs_clear_buffer_dirty(trans, next); -+ wait_on_extent_buffer_writeback(next); -+ btrfs_tree_unlock(next); -+ - if (trans) { -- btrfs_tree_lock(next); -- btrfs_clean_tree_block(next); -- btrfs_wait_tree_block_writeback(next); -- btrfs_tree_unlock(next); - ret = btrfs_pin_reserved_extent(trans, - bytenr, blocksize); - if (ret) { -@@ -2637,8 +2632,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, - btrfs_redirty_list_add( - trans->transaction, next); - } else { -- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) -- clear_extent_buffer_dirty(next); - unaccount_log_buffer(fs_info, bytenr); - } - } -@@ -2693,11 +2686,12 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, - - next = path->nodes[*level]; - -+ btrfs_tree_lock(next); -+ btrfs_clear_buffer_dirty(trans, next); -+ wait_on_extent_buffer_writeback(next); -+ btrfs_tree_unlock(next); -+ - if (trans) { -- btrfs_tree_lock(next); -- btrfs_clean_tree_block(next); -- btrfs_wait_tree_block_writeback(next); -- btrfs_tree_unlock(next); - ret = btrfs_pin_reserved_extent(trans, - path->nodes[*level]->start, - path->nodes[*level]->len); -@@ -2706,9 +2700,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, - btrfs_redirty_list_add(trans->transaction, - next); - } else { -- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) -- clear_extent_buffer_dirty(next); -- - unaccount_log_buffer(fs_info, - path->nodes[*level]->start); - } -@@ -2776,19 +2767,18 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, - - next = path->nodes[orig_level]; - -+ btrfs_tree_lock(next); -+ btrfs_clear_buffer_dirty(trans, next); -+ wait_on_extent_buffer_writeback(next); -+ btrfs_tree_unlock(next); -+ - if (trans) { -- btrfs_tree_lock(next); -- btrfs_clean_tree_block(next); -- btrfs_wait_tree_block_writeback(next); -- btrfs_tree_unlock(next); - ret = btrfs_pin_reserved_extent(trans, - next->start, next->len); - if (ret) - goto out; - btrfs_redirty_list_add(trans->transaction, next); - } else { -- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) -- clear_extent_buffer_dirty(next); - unaccount_log_buffer(fs_info, next->start); - } - } -@@ -3652,11 +3642,10 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, - - /* - * If for some unexpected reason the last item's index is not greater -- * than the last index we logged, warn and return an error to fallback -- * to a transaction commit. -+ * than the last index we logged, warn and force a transaction commit. 
- */ - if (WARN_ON(last_index <= inode->last_dir_index_offset)) -- ret = -EUCLEAN; -+ ret = BTRFS_LOG_FORCE_COMMIT; - else - inode->last_dir_index_offset = last_index; - out: -@@ -3794,7 +3783,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - struct btrfs_key min_key; - struct btrfs_root *root = inode->root; - struct btrfs_root *log = root->log_root; -- int err = 0; - int ret; - u64 last_old_dentry_offset = min_offset - 1; - u64 last_offset = (u64)-1; -@@ -3835,8 +3823,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - path->slots[0]); - if (tmp.type == BTRFS_DIR_INDEX_KEY) - last_old_dentry_offset = tmp.offset; -- } else if (ret < 0) { -- err = ret; -+ } else if (ret > 0) { -+ ret = 0; - } - - goto done; -@@ -3859,7 +3847,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - if (tmp.type == BTRFS_DIR_INDEX_KEY) - last_old_dentry_offset = tmp.offset; - } else if (ret < 0) { -- err = ret; - goto done; - } - -@@ -3881,12 +3868,15 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - */ - search: - ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); -- if (ret > 0) -+ if (ret > 0) { - ret = btrfs_next_item(root, path); -+ if (ret > 0) { -+ /* There are no more keys in the inode's root. */ -+ ret = 0; -+ goto done; -+ } -+ } - if (ret < 0) -- err = ret; -- /* If ret is 1, there are no more keys in the inode's root. */ -- if (ret != 0) - goto done; - - /* -@@ -3897,8 +3887,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, - &last_old_dentry_offset); - if (ret != 0) { -- if (ret < 0) -- err = ret; -+ if (ret > 0) -+ ret = 0; - goto done; - } - path->slots[0] = btrfs_header_nritems(path->nodes[0]); -@@ -3909,10 +3899,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - */ - ret = btrfs_next_leaf(root, path); - if (ret) { -- if (ret == 1) -+ if (ret == 1) { - last_offset = (u64)-1; -- else -- err = ret; -+ ret = 0; -+ } - goto done; - } - btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); -@@ -3943,7 +3933,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - btrfs_release_path(path); - btrfs_release_path(dst_path); - -- if (err == 0) { -+ if (ret == 0) { - *last_offset_ret = last_offset; - /* - * In case the leaf was changed in the current transaction but -@@ -3954,15 +3944,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - * a range, last_old_dentry_offset is == to last_offset. - */ - ASSERT(last_old_dentry_offset <= last_offset); -- if (last_old_dentry_offset < last_offset) { -+ if (last_old_dentry_offset < last_offset) - ret = insert_dir_log_key(trans, log, path, ino, - last_old_dentry_offset + 1, - last_offset); -- if (ret) -- err = ret; -- } - } -- return err; -+ -+ return ret; - } - - /* -@@ -5604,10 +5592,8 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, - * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction - * commits. - */ -- if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) { -- btrfs_set_log_full_commit(trans); -+ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) - return BTRFS_LOG_FORCE_COMMIT; -- } - - inode = btrfs_iget(root->fs_info->sb, ino, root); - /* -@@ -6466,7 +6452,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, - * result in losing the file after a log replay. 
- */ - if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { -- btrfs_set_log_full_commit(trans); - ret = BTRFS_LOG_FORCE_COMMIT; - goto out_unlock; - } -diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h -index 85cd24cb0540..bdeb5216718f 100644 ---- a/fs/btrfs/tree-log.h -+++ b/fs/btrfs/tree-log.h -@@ -13,8 +13,13 @@ - /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ - #define BTRFS_NO_LOG_SYNC 256 - --/* We can't use the tree log for whatever reason, force a transaction commit */ --#define BTRFS_LOG_FORCE_COMMIT (1) -+/* -+ * We can't use the tree log for whatever reason, force a transaction commit. -+ * We use a negative value because there are functions through the logging code -+ * that need to return an error (< 0 value), false (0) or true (1). Any negative -+ * value will do, as it will cause the log to be marked for a full sync. -+ */ -+#define BTRFS_LOG_FORCE_COMMIT (-(MAX_ERRNO + 1)) - - struct btrfs_log_ctx { - int log_ret; -diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c -index df43093b7a46..7823168c08a6 100644 ---- a/fs/btrfs/volumes.c -+++ b/fs/btrfs/volumes.c -@@ -728,7 +728,7 @@ static struct btrfs_fs_devices *find_fsid_reverted_metadata( - /* - * Handle the case where the scanned device is part of an fs whose last - * metadata UUID change reverted it to the original FSID. At the same -- * time * fs_devices was first created by another constitutent device -+ * time fs_devices was first created by another constituent device - * which didn't fully observe the operation. This results in an - * btrfs_fs_devices created with metadata/fsid different AND - * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the -@@ -6284,91 +6284,42 @@ static bool need_full_stripe(enum btrfs_map_op op) - return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); - } - --/* -- * Calculate the geometry of a particular (address, len) tuple. This -- * information is used to calculate how big a particular bio can get before it -- * straddles a stripe. -- * -- * @fs_info: the filesystem -- * @em: mapping containing the logical extent -- * @op: type of operation - write or read -- * @logical: address that we want to figure out the geometry of -- * @io_geom: pointer used to return values -- * -- * Returns < 0 in case a chunk for the given logical address cannot be found, -- * usually shouldn't happen unless @logical is corrupted, 0 otherwise. -- */ --int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, -- enum btrfs_map_op op, u64 logical, -- struct btrfs_io_geometry *io_geom) -+static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, -+ u64 offset, u64 *stripe_nr, u64 *stripe_offset, -+ u64 *full_stripe_start) - { -- struct map_lookup *map; -- u64 len; -- u64 offset; -- u64 stripe_offset; -- u64 stripe_nr; -- u32 stripe_len; -- u64 raid56_full_stripe_start = (u64)-1; -- int data_stripes; -+ u32 stripe_len = map->stripe_len; - - ASSERT(op != BTRFS_MAP_DISCARD); - -- map = em->map_lookup; -- /* Offset of this logical address in the chunk */ -- offset = logical - em->start; -- /* Len of a stripe in a chunk */ -- stripe_len = map->stripe_len; - /* -- * Stripe_nr is where this block falls in -- * stripe_offset is the offset of this block in its stripe. -+ * Stripe_nr is the stripe where this block falls. stripe_offset is -+ * the offset of this block in its stripe. 
- */ -- stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset); -- ASSERT(stripe_offset < U32_MAX); -+ *stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset); -+ ASSERT(*stripe_offset < U32_MAX); - -- data_stripes = nr_data_stripes(map); -+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { -+ unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); - -- /* Only stripe based profiles needs to check against stripe length. */ -- if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { -- u64 max_len = stripe_len - stripe_offset; -+ *full_stripe_start = -+ div64_u64(offset, full_stripe_len) * full_stripe_len; - - /* -- * In case of raid56, we need to know the stripe aligned start -+ * For writes to RAID56, allow to write a full stripe set, but -+ * no straddling of stripe sets. - */ -- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { -- unsigned long full_stripe_len = stripe_len * data_stripes; -- raid56_full_stripe_start = offset; -- -- /* -- * Allow a write of a full stripe, but make sure we -- * don't allow straddling of stripes -- */ -- raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, -- full_stripe_len); -- raid56_full_stripe_start *= full_stripe_len; -- -- /* -- * For writes to RAID[56], allow a full stripeset across -- * all disks. For other RAID types and for RAID[56] -- * reads, just allow a single stripe (on a single disk). -- */ -- if (op == BTRFS_MAP_WRITE) { -- max_len = stripe_len * data_stripes - -- (offset - raid56_full_stripe_start); -- } -- } -- len = min_t(u64, em->len - offset, max_len); -- } else { -- len = em->len - offset; -+ if (op == BTRFS_MAP_WRITE) -+ return full_stripe_len - (offset - *full_stripe_start); - } - -- io_geom->len = len; -- io_geom->offset = offset; -- io_geom->stripe_len = stripe_len; -- io_geom->stripe_nr = stripe_nr; -- io_geom->stripe_offset = stripe_offset; -- io_geom->raid56_stripe_offset = raid56_full_stripe_start; -- -- return 0; -+ /* -+ * For other RAID types and for RAID56 reads, allow a single stripe (on -+ * a single disk). 
-+ */ -+ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) -+ return stripe_len - *stripe_offset; -+ return U64_MAX; - } - - static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, -@@ -6387,6 +6338,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - { - struct extent_map *em; - struct map_lookup *map; -+ u64 map_offset; - u64 stripe_offset; - u64 stripe_nr; - u64 stripe_len; -@@ -6405,7 +6357,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - int patch_the_first_stripe_for_dev_replace = 0; - u64 physical_to_patch_in_first_stripe = 0; - u64 raid56_full_stripe_start = (u64)-1; -- struct btrfs_io_geometry geom; -+ u64 max_len; - - ASSERT(bioc_ret); - ASSERT(op != BTRFS_MAP_DISCARD); -@@ -6413,18 +6365,14 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - em = btrfs_get_chunk_map(fs_info, logical, *length); - ASSERT(!IS_ERR(em)); - -- ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); -- if (ret < 0) -- return ret; -- - map = em->map_lookup; -- -- *length = geom.len; -- stripe_len = geom.stripe_len; -- stripe_nr = geom.stripe_nr; -- stripe_offset = geom.stripe_offset; -- raid56_full_stripe_start = geom.raid56_stripe_offset; - data_stripes = nr_data_stripes(map); -+ stripe_len = map->stripe_len; -+ -+ map_offset = logical - em->start; -+ max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, -+ &stripe_offset, &raid56_full_stripe_start); -+ *length = min_t(u64, em->len - map_offset, max_len); - - down_read(&dev_replace->rwsem); - dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); -diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h -index 6b7a05f6cf82..7e51f2238f72 100644 ---- a/fs/btrfs/volumes.h -+++ b/fs/btrfs/volumes.h -@@ -53,21 +53,6 @@ enum btrfs_raid_types { - BTRFS_NR_RAID_TYPES - }; - --struct btrfs_io_geometry { -- /* remaining bytes before crossing a stripe */ -- u64 len; -- /* offset of logical address in chunk */ -- u64 offset; -- /* length of single IO stripe */ -- u32 stripe_len; -- /* offset of address in stripe */ -- u32 stripe_offset; -- /* number of stripe where address falls */ -- u64 stripe_nr; -- /* offset of raid56 stripe into the chunk */ -- u64 raid56_stripe_offset; --}; -- - /* - * Use sequence counter to get consistent device stat data on - * 32-bit processors. 
-@@ -545,9 +530,6 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, - u64 logical, u64 *length_ret, - u32 *num_stripes); --int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, -- enum btrfs_map_op op, u64 logical, -- struct btrfs_io_geometry *io_geom); - int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); - int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); - struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, -diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c -index 1f503e8e42d4..f95b2c94d619 100644 ---- a/fs/btrfs/zoned.c -+++ b/fs/btrfs/zoned.c -@@ -17,6 +17,7 @@ - #include "space-info.h" - #include "fs.h" - #include "accessors.h" -+#include "bio.h" - - /* Maximum number of zones to report per blkdev_report_zones() call */ - #define BTRFS_REPORT_NR_ZONES 4096 -@@ -160,7 +161,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, - */ - static inline u32 sb_zone_number(int shift, int mirror) - { -- u64 zone; -+ u64 zone = U64_MAX; - - ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); - switch (mirror) { -@@ -220,7 +221,6 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, - struct blk_zone *zones, unsigned int *nr_zones) - { - struct btrfs_zoned_device_info *zinfo = device->zone_info; -- u32 zno; - int ret; - - if (!*nr_zones) -@@ -235,6 +235,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, - /* Check cache */ - if (zinfo->zone_cache) { - unsigned int i; -+ u32 zno; - - ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); - zno = pos >> zinfo->zone_size_shift; -@@ -274,9 +275,12 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, - return -EIO; - - /* Populate cache */ -- if (zinfo->zone_cache) -+ if (zinfo->zone_cache) { -+ u32 zno = pos >> zinfo->zone_size_shift; -+ - memcpy(zinfo->zone_cache + zno, zones, - sizeof(*zinfo->zone_cache) * *nr_zones); -+ } - - return 0; - } -@@ -417,25 +421,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) - nr_sectors = bdev_nr_sectors(bdev); - zone_info->zone_size_shift = ilog2(zone_info->zone_size); - zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); -- /* -- * We limit max_zone_append_size also by max_segments * -- * PAGE_SIZE. Technically, we can have multiple pages per segment. But, -- * since btrfs adds the pages one by one to a bio, and btrfs cannot -- * increase the metadata reservation even if it increases the number of -- * extents, it is safe to stick with the limit. -- * -- * With the zoned emulation, we can have non-zoned device on the zoned -- * mode. In this case, we don't have a valid max zone append size. So, -- * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size. 
-- */ -- if (bdev_is_zoned(bdev)) { -- zone_info->max_zone_append_size = min_t(u64, -- (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, -- (u64)bdev_max_segments(bdev) << PAGE_SHIFT); -- } else { -- zone_info->max_zone_append_size = -- (u64)bdev_max_segments(bdev) << PAGE_SHIFT; -- } - if (!IS_ALIGNED(nr_sectors, zone_sectors)) - zone_info->nr_zones++; - -@@ -715,9 +700,9 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) - - int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) - { -+ struct queue_limits *lim = &fs_info->limits; - struct btrfs_device *device; - u64 zone_size = 0; -- u64 max_zone_append_size = 0; - int ret; - - /* -@@ -727,6 +712,8 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) - if (!btrfs_fs_incompat(fs_info, ZONED)) - return btrfs_check_for_zoned_device(fs_info); - -+ blk_set_stacking_limits(lim); -+ - list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { - struct btrfs_zoned_device_info *zone_info = device->zone_info; - -@@ -741,10 +728,17 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) - zone_info->zone_size, zone_size); - return -EINVAL; - } -- if (!max_zone_append_size || -- (zone_info->max_zone_append_size && -- zone_info->max_zone_append_size < max_zone_append_size)) -- max_zone_append_size = zone_info->max_zone_append_size; -+ -+ /* -+ * With the zoned emulation, we can have non-zoned device on the -+ * zoned mode. In this case, we don't have a valid max zone -+ * append size. -+ */ -+ if (bdev_is_zoned(device->bdev)) { -+ blk_stack_limits(lim, -+ &bdev_get_queue(device->bdev)->limits, -+ 0); -+ } - } - - /* -@@ -765,8 +759,18 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) - } - - fs_info->zone_size = zone_size; -- fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, -- fs_info->sectorsize); -+ /* -+ * Also limit max_zone_append_size by max_segments * PAGE_SIZE. -+ * Technically, we can have multiple pages per segment. But, since -+ * we add the pages one by one to a bio, and cannot increase the -+ * metadata reservation even if it increases the number of extents, it -+ * is safe to stick with the limit. -+ */ -+ fs_info->max_zone_append_size = ALIGN_DOWN( -+ min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT, -+ (u64)lim->max_sectors << SECTOR_SHIFT, -+ (u64)lim->max_segments << PAGE_SHIFT), -+ fs_info->sectorsize); - fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; - if (fs_info->max_zone_append_size < fs_info->max_extent_size) - fs_info->max_extent_size = fs_info->max_zone_append_size; -@@ -1623,8 +1627,10 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans) - spin_unlock(&trans->releasing_ebs_lock); - } - --bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) -+bool btrfs_use_zone_append(struct btrfs_bio *bbio) - { -+ u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); -+ struct btrfs_inode *inode = bbio->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_block_group *cache; - bool ret = false; -@@ -1635,6 +1641,9 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) - if (!is_data_inode(&inode->vfs_inode)) - return false; - -+ if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) -+ return false; -+ - /* - * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the - * extent layout the relocation code has. 
-@@ -1657,22 +1666,16 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) - return ret; - } - --void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, -- struct bio *bio) -+void btrfs_record_physical_zoned(struct btrfs_bio *bbio) - { -+ const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; - struct btrfs_ordered_extent *ordered; -- const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - -- if (bio_op(bio) != REQ_OP_ZONE_APPEND) -- return; -- -- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset); -+ ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); - if (WARN_ON(!ordered)) - return; - - ordered->physical = physical; -- ordered->bdev = bio->bi_bdev; -- - btrfs_put_ordered_extent(ordered); - } - -@@ -1684,43 +1687,46 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) - struct extent_map *em; - struct btrfs_ordered_sum *sum; - u64 orig_logical = ordered->disk_bytenr; -- u64 *logical = NULL; -- int nr, stripe_len; -+ struct map_lookup *map; -+ u64 physical = ordered->physical; -+ u64 chunk_start_phys; -+ u64 logical; - -- /* Zoned devices should not have partitions. So, we can assume it is 0 */ -- ASSERT(!bdev_is_partition(ordered->bdev)); -- if (WARN_ON(!ordered->bdev)) -+ em = btrfs_get_chunk_map(fs_info, orig_logical, 1); -+ if (IS_ERR(em)) - return; -+ map = em->map_lookup; -+ chunk_start_phys = map->stripes[0].physical; - -- if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev, -- ordered->physical, &logical, &nr, -- &stripe_len))) -- goto out; -- -- WARN_ON(nr != 1); -+ if (WARN_ON_ONCE(map->num_stripes > 1) || -+ WARN_ON_ONCE((map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) || -+ WARN_ON_ONCE(physical < chunk_start_phys) || -+ WARN_ON_ONCE(physical > chunk_start_phys + em->orig_block_len)) { -+ free_extent_map(em); -+ return; -+ } -+ logical = em->start + (physical - map->stripes[0].physical); -+ free_extent_map(em); - -- if (orig_logical == *logical) -- goto out; -+ if (orig_logical == logical) -+ return; - -- ordered->disk_bytenr = *logical; -+ ordered->disk_bytenr = logical; - - em_tree = &inode->extent_tree; - write_lock(&em_tree->lock); - em = search_extent_mapping(em_tree, ordered->file_offset, - ordered->num_bytes); -- em->block_start = *logical; -+ em->block_start = logical; - free_extent_map(em); - write_unlock(&em_tree->lock); - - list_for_each_entry(sum, &ordered->list, list) { -- if (*logical < orig_logical) -- sum->bytenr -= orig_logical - *logical; -+ if (logical < orig_logical) -+ sum->bytenr -= orig_logical - logical; - else -- sum->bytenr += *logical - orig_logical; -+ sum->bytenr += logical - orig_logical; - } -- --out: -- kfree(logical); - } - - bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, -@@ -1845,26 +1851,6 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, - return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); - } - --struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, -- u64 logical, u64 length) --{ -- struct btrfs_device *device; -- struct extent_map *em; -- struct map_lookup *map; -- -- em = btrfs_get_chunk_map(fs_info, logical, length); -- if (IS_ERR(em)) -- return ERR_CAST(em); -- -- map = em->map_lookup; -- /* We only support single profile for now */ -- device = map->stripes[0].dev; -- -- free_extent_map(em); -- -- return device; --} -- - /* - * Activate block group and underlying device zones - * -diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h 
-index f43990985d80..c0570d35fea2 100644 ---- a/fs/btrfs/zoned.h -+++ b/fs/btrfs/zoned.h -@@ -20,7 +20,6 @@ struct btrfs_zoned_device_info { - */ - u64 zone_size; - u8 zone_size_shift; -- u64 max_zone_append_size; - u32 nr_zones; - unsigned int max_active_zones; - atomic_t active_zones_left; -@@ -56,9 +55,8 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); - void btrfs_redirty_list_add(struct btrfs_transaction *trans, - struct extent_buffer *eb); - void btrfs_free_redirty_list(struct btrfs_transaction *trans); --bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start); --void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, -- struct bio *bio); -+bool btrfs_use_zone_append(struct btrfs_bio *bbio); -+void btrfs_record_physical_zoned(struct btrfs_bio *bbio); - void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); - bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, -@@ -68,8 +66,6 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, - int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); - int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, - u64 physical_start, u64 physical_pos); --struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, -- u64 logical, u64 length); - bool btrfs_zone_activate(struct btrfs_block_group *block_group); - int btrfs_zone_finish(struct btrfs_block_group *block_group); - bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); -@@ -185,13 +181,12 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, - struct extent_buffer *eb) { } - static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } - --static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) -+static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio) - { - return false; - } - --static inline void btrfs_record_physical_zoned(struct inode *inode, -- u64 file_offset, struct bio *bio) -+static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) - { - } - -@@ -224,13 +219,6 @@ static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, - return -EOPNOTSUPP; - } - --static inline struct btrfs_device *btrfs_zoned_get_device( -- struct btrfs_fs_info *fs_info, -- u64 logical, u64 length) --{ -- return ERR_PTR(-EOPNOTSUPP); --} -- - static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group) - { - return true; -diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c -index 9de1c9d1a13d..3559ea6b0781 100644 ---- a/fs/ext4/extents.c -+++ b/fs/ext4/extents.c -@@ -3251,7 +3251,7 @@ static int ext4_split_extent_at(handle_t *handle, - ext4_ext_mark_unwritten(ex2); - - err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); -- if (err != -ENOSPC && err != -EDQUOT) -+ if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) - goto out; - - if (EXT4_EXT_MAY_ZEROOUT & split_flag) { -diff --git a/fs/ext4/file.c b/fs/ext4/file.c -index 7ac0a81bd371..6e9f198ecacf 100644 ---- a/fs/ext4/file.c -+++ b/fs/ext4/file.c -@@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len) - return false; - } - --/* Is IO overwriting allocated and initialized blocks? */ --static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) -+/* Is IO overwriting allocated or initialized blocks? 
*/ -+static bool ext4_overwrite_io(struct inode *inode, -+ loff_t pos, loff_t len, bool *unwritten) - { - struct ext4_map_blocks map; - unsigned int blkbits = inode->i_blkbits; -@@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) - blklen = map.m_len; - - err = ext4_map_blocks(NULL, inode, &map, 0); -+ if (err != blklen) -+ return false; - /* - * 'err==len' means that all of the blocks have been preallocated, -- * regardless of whether they have been initialized or not. To exclude -- * unwritten extents, we need to check m_flags. -+ * regardless of whether they have been initialized or not. We need to -+ * check m_flags to distinguish the unwritten extents. - */ -- return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); -+ *unwritten = !(map.m_flags & EXT4_MAP_MAPPED); -+ return true; - } - - static ssize_t ext4_generic_write_checks(struct kiocb *iocb, -@@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { - * - For extending writes case we don't take the shared lock, since it requires - * updating inode i_disksize and/or orphan handling with exclusive lock. - * -- * - shared locking will only be true mostly with overwrites. Otherwise we will -- * switch to exclusive i_rwsem lock. -+ * - shared locking will only be true mostly with overwrites, including -+ * initialized blocks and unwritten blocks. For overwrite unwritten blocks -+ * we protect splitting extents by i_data_sem in ext4_inode_info, so we can -+ * also release exclusive i_rwsem lock. -+ * -+ * - Otherwise we will switch to exclusive i_rwsem lock. - */ - static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, -- bool *ilock_shared, bool *extend) -+ bool *ilock_shared, bool *extend, -+ bool *unwritten) - { - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); -@@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, - * in file_modified(). - */ - if (*ilock_shared && (!IS_NOSEC(inode) || *extend || -- !ext4_overwrite_io(inode, offset, count))) { -+ !ext4_overwrite_io(inode, offset, count, unwritten))) { - if (iocb->ki_flags & IOCB_NOWAIT) { - ret = -EAGAIN; - goto out; -@@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) - loff_t offset = iocb->ki_pos; - size_t count = iov_iter_count(from); - const struct iomap_ops *iomap_ops = &ext4_iomap_ops; -- bool extend = false, unaligned_io = false; -+ bool extend = false, unaligned_io = false, unwritten = false; - bool ilock_shared = true; - - /* -@@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) - return ext4_buffered_write_iter(iocb, from); - } - -- ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend); -+ ret = ext4_dio_write_checks(iocb, from, -+ &ilock_shared, &extend, &unwritten); - if (ret <= 0) - return ret; - -@@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) - ext4_journal_stop(handle); - } - -- if (ilock_shared) -+ if (ilock_shared && !unwritten) - iomap_ops = &ext4_iomap_overwrite_ops; - ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, - (unaligned_io || extend) ? 
IOMAP_DIO_FORCE_WAIT : 0, -diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c -index 9d9f414f99fe..24128f6cd1b0 100644 ---- a/fs/ext4/inode.c -+++ b/fs/ext4/inode.c -@@ -136,7 +136,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, - new_size); - } - --static int __ext4_journalled_writepage(struct page *page, unsigned int len); - static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents); - -@@ -1005,30 +1004,6 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, - return ret; - } - --/* -- * To preserve ordering, it is essential that the hole instantiation and -- * the data write be encapsulated in a single transaction. We cannot -- * close off a transaction and start a new one between the ext4_get_block() -- * and the commit_write(). So doing the jbd2_journal_start at the start of -- * prepare_write() is the right place. -- * -- * Also, this function can nest inside ext4_writepage(). In that case, we -- * *know* that ext4_writepage() has generated enough buffer credits to do the -- * whole page. So we won't block on the journal in that case, which is good, -- * because the caller may be PF_MEMALLOC. -- * -- * By accident, ext4 can be reentered when a transaction is open via -- * quota file writes. If we were to commit the transaction while thus -- * reentered, there can be a deadlock - we would be holding a quota -- * lock, and the commit would never complete if another thread had a -- * transaction open and was blocking on the quota lock - a ranking -- * violation. -- * -- * So what we do is to rely on the fact that jbd2_journal_stop/journal_start -- * will _not_ run commit under these circumstances because handle->h_ref -- * is elevated. We'll still have enough credits for the tiny quotafile -- * write. -- */ - int do_journal_get_write_access(handle_t *handle, struct inode *inode, - struct buffer_head *bh) - { -@@ -1149,6 +1124,13 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, - } - #endif - -+/* -+ * To preserve ordering, it is essential that the hole instantiation and -+ * the data write be encapsulated in a single transaction. We cannot -+ * close off a transaction and start a new one between the ext4_get_block() -+ * and the ext4_write_end(). So doing the jbd2_journal_start at the start of -+ * ext4_write_begin() is the right place. 
-+ */ - static int ext4_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **pagep, void **fsdata) -@@ -1649,12 +1631,6 @@ static void ext4_print_free_blocks(struct inode *inode) - return; - } - --static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode, -- struct buffer_head *bh) --{ -- return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); --} -- - /* - * ext4_insert_delayed_block - adds a delayed block to the extents status - * tree, incrementing the reserved cluster/block -@@ -1887,216 +1863,10 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, - return 0; - } - --static int __ext4_journalled_writepage(struct page *page, -- unsigned int len) -+static void mpage_page_done(struct mpage_da_data *mpd, struct page *page) - { -- struct address_space *mapping = page->mapping; -- struct inode *inode = mapping->host; -- handle_t *handle = NULL; -- int ret = 0, err = 0; -- int inline_data = ext4_has_inline_data(inode); -- struct buffer_head *inode_bh = NULL; -- loff_t size; -- -- ClearPageChecked(page); -- -- if (inline_data) { -- BUG_ON(page->index != 0); -- BUG_ON(len > ext4_get_max_inline_size(inode)); -- inode_bh = ext4_journalled_write_inline_data(inode, len, page); -- if (inode_bh == NULL) -- goto out; -- } -- /* -- * We need to release the page lock before we start the -- * journal, so grab a reference so the page won't disappear -- * out from under us. -- */ -- get_page(page); -- unlock_page(page); -- -- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, -- ext4_writepage_trans_blocks(inode)); -- if (IS_ERR(handle)) { -- ret = PTR_ERR(handle); -- put_page(page); -- goto out_no_pagelock; -- } -- BUG_ON(!ext4_handle_valid(handle)); -- -- lock_page(page); -- put_page(page); -- size = i_size_read(inode); -- if (page->mapping != mapping || page_offset(page) > size) { -- /* The page got truncated from under us */ -- ext4_journal_stop(handle); -- ret = 0; -- goto out; -- } -- -- if (inline_data) { -- ret = ext4_mark_inode_dirty(handle, inode); -- } else { -- struct buffer_head *page_bufs = page_buffers(page); -- -- if (page->index == size >> PAGE_SHIFT) -- len = size & ~PAGE_MASK; -- else -- len = PAGE_SIZE; -- -- ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, -- NULL, do_journal_get_write_access); -- -- err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, -- NULL, write_end_fn); -- } -- if (ret == 0) -- ret = err; -- err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); -- if (ret == 0) -- ret = err; -- EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; -- err = ext4_journal_stop(handle); -- if (!ret) -- ret = err; -- -- ext4_set_inode_state(inode, EXT4_STATE_JDATA); --out: -+ mpd->first_page++; - unlock_page(page); --out_no_pagelock: -- brelse(inode_bh); -- return ret; --} -- --/* -- * Note that we don't need to start a transaction unless we're journaling data -- * because we should have holes filled from ext4_page_mkwrite(). We even don't -- * need to file the inode to the transaction's list in ordered mode because if -- * we are writing back data added by write(), the inode is already there and if -- * we are writing back data modified via mmap(), no one guarantees in which -- * transaction the data will hit the disk. In case we are journaling data, we -- * cannot start transaction directly because transaction start ranks above page -- * lock so we have to do some magic. -- * -- * This function can get called via... 
-- * - ext4_writepages after taking page lock (have journal handle) -- * - journal_submit_inode_data_buffers (no journal handle) -- * - shrink_page_list via the kswapd/direct reclaim (no journal handle) -- * - grab_page_cache when doing write_begin (have journal handle) -- * -- * We don't do any block allocation in this function. If we have page with -- * multiple blocks we need to write those buffer_heads that are mapped. This -- * is important for mmaped based write. So if we do with blocksize 1K -- * truncate(f, 1024); -- * a = mmap(f, 0, 4096); -- * a[0] = 'a'; -- * truncate(f, 4096); -- * we have in the page first buffer_head mapped via page_mkwrite call back -- * but other buffer_heads would be unmapped but dirty (dirty done via the -- * do_wp_page). So writepage should write the first block. If we modify -- * the mmap area beyond 1024 we will again get a page_fault and the -- * page_mkwrite callback will do the block allocation and mark the -- * buffer_heads mapped. -- * -- * We redirty the page if we have any buffer_heads that is either delay or -- * unwritten in the page. -- * -- * We can get recursively called as show below. -- * -- * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> -- * ext4_writepage() -- * -- * But since we don't do any block allocation we should not deadlock. -- * Page also have the dirty flag cleared so we don't get recurive page_lock. -- */ --static int ext4_writepage(struct page *page, -- struct writeback_control *wbc) --{ -- struct folio *folio = page_folio(page); -- int ret = 0; -- loff_t size; -- unsigned int len; -- struct buffer_head *page_bufs = NULL; -- struct inode *inode = page->mapping->host; -- struct ext4_io_submit io_submit; -- -- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { -- folio_invalidate(folio, 0, folio_size(folio)); -- folio_unlock(folio); -- return -EIO; -- } -- -- trace_ext4_writepage(page); -- size = i_size_read(inode); -- if (page->index == size >> PAGE_SHIFT && -- !ext4_verity_in_progress(inode)) -- len = size & ~PAGE_MASK; -- else -- len = PAGE_SIZE; -- -- /* Should never happen but for bugs in other kernel subsystems */ -- if (!page_has_buffers(page)) { -- ext4_warning_inode(inode, -- "page %lu does not have buffers attached", page->index); -- ClearPageDirty(page); -- unlock_page(page); -- return 0; -- } -- -- page_bufs = page_buffers(page); -- /* -- * We cannot do block allocation or other extent handling in this -- * function. If there are buffers needing that, we have to redirty -- * the page. But we may reach here when we do a journal commit via -- * journal_submit_inode_data_buffers() and in that case we must write -- * allocated buffers to achieve data=ordered mode guarantees. -- * -- * Also, if there is only one buffer per page (the fs block -- * size == the page size), if one buffer needs block -- * allocation or needs to modify the extent tree to clear the -- * unwritten flag, we know that the page can't be written at -- * all, so we might as well refuse the write immediately. -- * Unfortunately if the block size != page size, we can't as -- * easily detect this case using ext4_walk_page_buffers(), but -- * for the extremely common case, this is an optimization that -- * skips a useless round trip through ext4_bio_write_page(). 
-- */ -- if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, -- ext4_bh_delay_or_unwritten)) { -- redirty_page_for_writepage(wbc, page); -- if ((current->flags & PF_MEMALLOC) || -- (inode->i_sb->s_blocksize == PAGE_SIZE)) { -- /* -- * For memory cleaning there's no point in writing only -- * some buffers. So just bail out. Warn if we came here -- * from direct reclaim. -- */ -- WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) -- == PF_MEMALLOC); -- unlock_page(page); -- return 0; -- } -- } -- -- if (PageChecked(page) && ext4_should_journal_data(inode)) -- /* -- * It's mmapped pagecache. Add buffers and journal it. There -- * doesn't seem much point in redirtying the page here. -- */ -- return __ext4_journalled_writepage(page, len); -- -- ext4_io_submit_init(&io_submit, wbc); -- io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); -- if (!io_submit.io_end) { -- redirty_page_for_writepage(wbc, page); -- unlock_page(page); -- return -ENOMEM; -- } -- ret = ext4_bio_write_page(&io_submit, page, len); -- ext4_io_submit(&io_submit); -- /* Drop io_end reference we got from init */ -- ext4_put_io_end_defer(io_submit.io_end); -- return ret; - } - - static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) -@@ -2129,7 +1899,6 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) - err = ext4_bio_write_page(&mpd->io_submit, page, len); - if (!err) - mpd->wbc->nr_to_write--; -- mpd->first_page++; - - return err; - } -@@ -2243,6 +2012,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, - err = mpage_submit_page(mpd, head->b_page); - if (err < 0) - return err; -+ mpage_page_done(mpd, head->b_page); - } - if (lblk >= blocks) { - mpd->scanned_until_end = 1; -@@ -2374,6 +2144,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) - err = mpage_submit_page(mpd, page); - if (err < 0) - goto out; -+ mpage_page_done(mpd, page); - } - folio_batch_release(&fbatch); - } -@@ -2572,6 +2343,50 @@ static bool ext4_page_nomap_can_writeout(struct page *page) - return false; - } - -+static int ext4_journal_page_buffers(handle_t *handle, struct page *page, -+ int len) -+{ -+ struct buffer_head *page_bufs = page_buffers(page); -+ struct inode *inode = page->mapping->host; -+ int ret, err; -+ -+ ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, -+ NULL, do_journal_get_write_access); -+ err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, -+ NULL, write_end_fn); -+ if (ret == 0) -+ ret = err; -+ err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); -+ if (ret == 0) -+ ret = err; -+ EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; -+ -+ ext4_set_inode_state(inode, EXT4_STATE_JDATA); -+ -+ return ret; -+} -+ -+static int mpage_journal_page_buffers(handle_t *handle, -+ struct mpage_da_data *mpd, -+ struct page *page) -+{ -+ struct inode *inode = mpd->inode; -+ loff_t size = i_size_read(inode); -+ int len; -+ -+ ClearPageChecked(page); -+ clear_page_dirty_for_io(page); -+ mpd->wbc->nr_to_write--; -+ -+ if (page->index == size >> PAGE_SHIFT && -+ !ext4_verity_in_progress(inode)) -+ len = size & ~PAGE_MASK; -+ else -+ len = PAGE_SIZE; -+ -+ return ext4_journal_page_buffers(handle, page, len); -+} -+ - /* - * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages - * needing mapping, submit mapped pages -@@ -2597,7 +2412,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) - struct address_space *mapping = mpd->inode->i_mapping; - struct pagevec 
pvec; - unsigned int nr_pages; -- long left = mpd->wbc->nr_to_write; - pgoff_t index = mpd->first_page; - pgoff_t end = mpd->last_page; - xa_mark_t tag; -@@ -2605,12 +2419,20 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) - int blkbits = mpd->inode->i_blkbits; - ext4_lblk_t lblk; - struct buffer_head *head; -+ handle_t *handle = NULL; -+ int bpp = ext4_journal_blocks_per_page(mpd->inode); - - if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; - -+ if (ext4_should_journal_data(mpd->inode)) { -+ handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, -+ bpp); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ } - pagevec_init(&pvec); - mpd->map.m_len = 0; - mpd->next_page = index; -@@ -2631,13 +2453,22 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) - * newly appeared dirty pages, but have not synced all - * of the old dirty pages. - */ -- if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) -+ if (mpd->wbc->sync_mode == WB_SYNC_NONE && -+ mpd->wbc->nr_to_write <= -+ mpd->map.m_len >> (PAGE_SHIFT - blkbits)) - goto out; - - /* If we can't merge this page, we are done. */ - if (mpd->map.m_len > 0 && mpd->next_page != page->index) - goto out; - -+ if (handle) { -+ err = ext4_journal_ensure_credits(handle, bpp, -+ 0); -+ if (err < 0) -+ goto out; -+ } -+ - lock_page(page); - /* - * If the page is no longer dirty, or its mapping no -@@ -2677,18 +2508,30 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) - mpd->first_page = page->index; - mpd->next_page = page->index + 1; - /* -- * Writeout for transaction commit where we cannot -- * modify metadata is simple. Just submit the page. -+ * Writeout when we cannot modify metadata is simple. -+ * Just submit the page. For data=journal mode we -+ * first handle writeout of the page for checkpoint and -+ * only after that handle delayed page dirtying. This -+ * is crutial so that forcing a transaction commit and -+ * then calling filemap_write_and_wait() guarantees -+ * current state of data is in its final location. Such -+ * sequence is used for example by insert/collapse -+ * range operations before discarding the page cache. - */ - if (!mpd->can_map) { - if (ext4_page_nomap_can_writeout(page)) { - err = mpage_submit_page(mpd, page); - if (err < 0) - goto out; -- } else { -- unlock_page(page); -- mpd->first_page++; - } -+ /* Pending dirtying of journalled data? 
*/ -+ if (PageChecked(page)) { -+ err = mpage_journal_page_buffers(handle, -+ mpd, page); -+ if (err < 0) -+ goto out; -+ } -+ mpage_page_done(mpd, page); - } else { - /* Add all dirty buffers to mpd */ - lblk = ((ext4_lblk_t)page->index) << -@@ -2700,24 +2543,21 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) - goto out; - err = 0; - } -- left--; - } - pagevec_release(&pvec); - cond_resched(); - } - mpd->scanned_until_end = 1; -+ if (handle) -+ ext4_journal_stop(handle); - return 0; - out: - pagevec_release(&pvec); -+ if (handle) -+ ext4_journal_stop(handle); - return err; - } - --static int ext4_writepage_cb(struct page *page, struct writeback_control *wbc, -- void *data) --{ -- return ext4_writepage(page, wbc); --} -- - static int ext4_do_writepages(struct mpage_da_data *mpd) - { - struct writeback_control *wbc = mpd->wbc; -@@ -2743,13 +2583,6 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) - if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - goto out_writepages; - -- if (ext4_should_journal_data(inode)) { -- blk_start_plug(&plug); -- ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL); -- blk_finish_plug(&plug); -- goto out_writepages; -- } -- - /* - * If the filesystem has aborted, it is read-only, so return - * right away instead of dumping stack traces later on that -@@ -2784,6 +2617,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) - ext4_journal_stop(handle); - } - -+ /* -+ * data=journal mode does not do delalloc so we just need to writeout / -+ * journal already mapped buffers -+ */ -+ if (ext4_should_journal_data(inode)) -+ mpd->can_map = 0; -+ - if (ext4_should_dioread_nolock(inode)) { - /* - * We may need to convert up to one extent per block in -@@ -3160,9 +3000,8 @@ static int ext4_da_write_end(struct file *file, - * i_disksize since writeback will push i_disksize upto i_size - * eventually. If the end of the current write is > i_size and - * inside an allocated block (ext4_da_should_update_i_disksize() -- * check), we need to update i_disksize here as neither -- * ext4_writepage() nor certain ext4_writepages() paths not -- * allocating blocks update i_disksize. -+ * check), we need to update i_disksize here as certain -+ * ext4_writepages() paths not allocating blocks update i_disksize. - * - * Note that we defer inode dirtying to generic_write_end() / - * ext4_da_write_inline_data_end(). -@@ -3687,24 +3526,26 @@ const struct iomap_ops ext4_iomap_report_ops = { - }; - - /* -- * Whenever the folio is being dirtied, corresponding buffers should already -- * be attached to the transaction (we take care of this in ext4_page_mkwrite() -- * and ext4_write_begin()). However we cannot move buffers to dirty transaction -- * lists here because ->dirty_folio is called under VFS locks and the folio -- * is not necessarily locked. -- * -- * We cannot just dirty the folio and leave attached buffers clean, because the -- * buffers' dirty state is "definitive". We cannot just set the buffers dirty -- * or jbddirty because all the journalling code will explode. -- * -- * So what we do is to mark the folio "pending dirty" and next time writepage -- * is called, propagate that into the buffers appropriately. -+ * For data=journal mode, folio should be marked dirty only when it was -+ * writeably mapped. When that happens, it was already attached to the -+ * transaction and marked as jbddirty (we take care of this in -+ * ext4_page_mkwrite()). 
On transaction commit, we writeprotect page mappings -+ * so we should have nothing to do here, except for the case when someone -+ * had the page pinned and dirtied the page through this pin (e.g. by doing -+ * direct IO to it). In that case we'd need to attach buffers here to the -+ * transaction but we cannot due to lock ordering. We cannot just dirty the -+ * folio and leave attached buffers clean, because the buffers' dirty state is -+ * "definitive". We cannot just set the buffers dirty or jbddirty because all -+ * the journalling code will explode. So what we do is to mark the folio -+ * "pending dirty" and next time ext4_writepages() is called, attach buffers -+ * to the transaction appropriately. - */ - static bool ext4_journalled_dirty_folio(struct address_space *mapping, - struct folio *folio) - { - WARN_ON_ONCE(!folio_buffers(folio)); -- folio_set_checked(folio); -+ if (folio_maybe_dma_pinned(folio)) -+ folio_set_checked(folio); - return filemap_dirty_folio(mapping, folio); - } - -@@ -4872,13 +4713,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, - goto bad_inode; - raw_inode = ext4_raw_inode(&iloc); - -- if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { -- ext4_error_inode(inode, function, line, 0, -- "iget: root inode unallocated"); -- ret = -EFSCORRUPTED; -- goto bad_inode; -- } -- - if ((flags & EXT4_IGET_HANDLE) && - (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { - ret = -ESTALE; -@@ -4951,11 +4785,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, - * NeilBrown 1999oct15 - */ - if (inode->i_nlink == 0) { -- if ((inode->i_mode == 0 || -+ if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && - ino != EXT4_BOOT_LOADER_INO) { -- /* this inode is deleted */ -- ret = -ESTALE; -+ /* this inode is deleted or unallocated */ -+ if (flags & EXT4_IGET_SPECIAL) { -+ ext4_error_inode(inode, function, line, 0, -+ "iget: special inode unallocated"); -+ ret = -EFSCORRUPTED; -+ } else -+ ret = -ESTALE; - goto bad_inode; - } - /* The only unlinked inodes we let through here have -@@ -5382,7 +5221,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) - * If the folio is fully truncated, we don't need to wait for any commit - * (and we even should not as __ext4_journalled_invalidate_folio() may - * strip all buffers from the folio but keep the folio dirty which can then -- * confuse e.g. concurrent ext4_writepage() seeing dirty folio without -+ * confuse e.g. concurrent ext4_writepages() seeing dirty folio without - * buffers). Also we don't need to wait for any commit if all buffers in - * the folio remain valid. This is most beneficial for the common case of - * blocksize == PAGESIZE. 
-@@ -5788,7 +5627,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); - int gdpblocks; - int idxblocks; -- int ret = 0; -+ int ret; - - /* - * How many index blocks need to touch to map @lblocks logical blocks -@@ -6320,18 +6159,8 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) - err = __block_write_begin(page, 0, len, ext4_get_block); - if (!err) { - ret = VM_FAULT_SIGBUS; -- if (ext4_walk_page_buffers(handle, inode, -- page_buffers(page), 0, len, NULL, -- do_journal_get_write_access)) -- goto out_error; -- if (ext4_walk_page_buffers(handle, inode, -- page_buffers(page), 0, len, NULL, -- write_end_fn)) -- goto out_error; -- if (ext4_jbd2_inode_add_write(handle, inode, -- page_offset(page), len)) -+ if (ext4_journal_page_buffers(handle, page, len)) - goto out_error; -- ext4_set_inode_state(inode, EXT4_STATE_JDATA); - } else { - unlock_page(page); - } -diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c -index 8067ccda34e4..2e8c34036313 100644 ---- a/fs/ext4/ioctl.c -+++ b/fs/ext4/ioctl.c -@@ -155,9 +155,6 @@ static int ext4_update_backup_sb(struct super_block *sb, - set_buffer_uptodate(bh); - unlock_buffer(bh); - -- if (err) -- goto out_bh; -- - if (handle) { - err = ext4_handle_dirty_metadata(handle, NULL, bh); - if (err) -diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c -index dd28453d6ea3..270fbcba75b6 100644 ---- a/fs/ext4/namei.c -+++ b/fs/ext4/namei.c -@@ -3872,9 +3872,16 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, - if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) - goto end_rename; - } -+ /* -+ * We need to protect against old.inode directory getting -+ * converted from inline directory format into a normal one. -+ */ -+ inode_lock_nested(old.inode, I_MUTEX_NONDIR2); - retval = ext4_rename_dir_prepare(handle, &old); -- if (retval) -+ if (retval) { -+ inode_unlock(old.inode); - goto end_rename; -+ } - } - /* - * If we're renaming a file within an inline_data dir and adding or -@@ -4006,6 +4013,8 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, - } else { - ext4_journal_stop(handle); - } -+ if (old.dir_bh) -+ inode_unlock(old.inode); - release_bh: - brelse(old.dir_bh); - brelse(old.bh); -diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c -index beaec6d81074..3bc7c7c5b99d 100644 ---- a/fs/ext4/page-io.c -+++ b/fs/ext4/page-io.c -@@ -500,7 +500,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, - - /* Nothing to submit? Just unlock the page... */ - if (!nr_to_submit) -- goto unlock; -+ return 0; - - bh = head = page_buffers(page); - -@@ -548,7 +548,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, - } - bh = bh->b_this_page; - } while (bh != head); -- goto unlock; -+ -+ return ret; - } - } - -@@ -564,7 +565,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, - io_submit_add_bh(io, inode, - bounce_page ? 
bounce_page : page, bh); - } while ((bh = bh->b_this_page) != head); --unlock: -- unlock_page(page); -- return ret; -+ -+ return 0; - } -diff --git a/fs/ext4/super.c b/fs/ext4/super.c -index c81fa0fa9901..2192b4111442 100644 ---- a/fs/ext4/super.c -+++ b/fs/ext4/super.c -@@ -4751,7 +4751,6 @@ static int ext4_group_desc_init(struct super_block *sb, - struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned int db_count; - ext4_fsblk_t block; -- int ret; - int i; - - db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / -@@ -4791,8 +4790,7 @@ static int ext4_group_desc_init(struct super_block *sb, - ext4_msg(sb, KERN_ERR, - "can't read group descriptor %d", i); - sbi->s_gdb_count = i; -- ret = PTR_ERR(bh); -- goto out; -+ return PTR_ERR(bh); - } - rcu_read_lock(); - rcu_dereference(sbi->s_group_desc)[i] = bh; -@@ -4801,13 +4799,10 @@ static int ext4_group_desc_init(struct super_block *sb, - sbi->s_gdb_count = db_count; - if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { - ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); -- ret = -EFSCORRUPTED; -- goto out; -+ return -EFSCORRUPTED; - } -+ - return 0; --out: -- ext4_group_desc_free(sbi); -- return ret; - } - - static int ext4_load_and_init_journal(struct super_block *sb, -@@ -5234,14 +5229,14 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) - if (ext4_geometry_check(sb, es)) - goto failed_mount; - -- err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); -- if (err) -- goto failed_mount; -- - timer_setup(&sbi->s_err_report, print_daily_error_info, 0); - spin_lock_init(&sbi->s_error_lock); - INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); - -+ err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); -+ if (err) -+ goto failed_mount3; -+ - /* Register extent status tree shrinker */ - if (ext4_es_register_shrinker(sbi)) - goto failed_mount3; -@@ -5967,8 +5962,11 @@ static int ext4_load_journal(struct super_block *sb, - if (!really_read_only && journal_devnum && - journal_devnum != le32_to_cpu(es->s_journal_dev)) { - es->s_journal_dev = cpu_to_le32(journal_devnum); -- -- /* Make sure we flush the recovery flag to disk. 
*/ -+ ext4_commit_super(sb); -+ } -+ if (!really_read_only && journal_inum && -+ journal_inum != le32_to_cpu(es->s_journal_inum)) { -+ es->s_journal_inum = cpu_to_le32(journal_inum); - ext4_commit_super(sb); - } - -diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c -index 0c6b011a91b3..62f2ec599218 100644 ---- a/fs/ext4/xattr.c -+++ b/fs/ext4/xattr.c -@@ -184,27 +184,73 @@ ext4_xattr_handler(int name_index) - } - - static int --ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, -- void *value_start) -+check_xattrs(struct inode *inode, struct buffer_head *bh, -+ struct ext4_xattr_entry *entry, void *end, void *value_start, -+ const char *function, unsigned int line) - { - struct ext4_xattr_entry *e = entry; -+ int err = -EFSCORRUPTED; -+ char *err_str; -+ -+ if (bh) { -+ if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || -+ BHDR(bh)->h_blocks != cpu_to_le32(1)) { -+ err_str = "invalid header"; -+ goto errout; -+ } -+ if (buffer_verified(bh)) -+ return 0; -+ if (!ext4_xattr_block_csum_verify(inode, bh)) { -+ err = -EFSBADCRC; -+ err_str = "invalid checksum"; -+ goto errout; -+ } -+ } else { -+ struct ext4_xattr_ibody_header *header = value_start; -+ -+ header -= 1; -+ if (end - (void *)header < sizeof(*header) + sizeof(u32)) { -+ err_str = "in-inode xattr block too small"; -+ goto errout; -+ } -+ if (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { -+ err_str = "bad magic number in in-inode xattr"; -+ goto errout; -+ } -+ } - - /* Find the end of the names list */ - while (!IS_LAST_ENTRY(e)) { - struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); -- if ((void *)next >= end) -- return -EFSCORRUPTED; -- if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) -- return -EFSCORRUPTED; -+ if ((void *)next >= end) { -+ err_str = "e_name out of bounds"; -+ goto errout; -+ } -+ if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) { -+ err_str = "bad e_name length"; -+ goto errout; -+ } - e = next; - } - - /* Check the values */ - while (!IS_LAST_ENTRY(entry)) { - u32 size = le32_to_cpu(entry->e_value_size); -+ unsigned long ea_ino = le32_to_cpu(entry->e_value_inum); - -- if (size > EXT4_XATTR_SIZE_MAX) -- return -EFSCORRUPTED; -+ if (!ext4_has_feature_ea_inode(inode->i_sb) && ea_ino) { -+ err_str = "ea_inode specified without ea_inode feature enabled"; -+ goto errout; -+ } -+ if (ea_ino && ((ea_ino == EXT4_ROOT_INO) || -+ !ext4_valid_inum(inode->i_sb, ea_ino))) { -+ err_str = "invalid ea_ino"; -+ goto errout; -+ } -+ if (size > EXT4_XATTR_SIZE_MAX) { -+ err_str = "e_value size too large"; -+ goto errout; -+ } - - if (size != 0 && entry->e_value_inum == 0) { - u16 offs = le16_to_cpu(entry->e_value_offs); -@@ -216,66 +262,54 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, - * the padded and unpadded sizes, since the size may - * overflow to 0 when adding padding. 
- */ -- if (offs > end - value_start) -- return -EFSCORRUPTED; -+ if (offs > end - value_start) { -+ err_str = "e_value out of bounds"; -+ goto errout; -+ } - value = value_start + offs; - if (value < (void *)e + sizeof(u32) || - size > end - value || -- EXT4_XATTR_SIZE(size) > end - value) -- return -EFSCORRUPTED; -+ EXT4_XATTR_SIZE(size) > end - value) { -+ err_str = "overlapping e_value "; -+ goto errout; -+ } - } - entry = EXT4_XATTR_NEXT(entry); - } -- -+ if (bh) -+ set_buffer_verified(bh); - return 0; -+ -+errout: -+ if (bh) -+ __ext4_error_inode(inode, function, line, 0, -err, -+ "corrupted xattr block %llu: %s", -+ (unsigned long long) bh->b_blocknr, -+ err_str); -+ else -+ __ext4_error_inode(inode, function, line, 0, -err, -+ "corrupted in-inode xattr: %s", err_str); -+ return err; - } - - static inline int - __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, - const char *function, unsigned int line) - { -- int error = -EFSCORRUPTED; -- -- if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || -- BHDR(bh)->h_blocks != cpu_to_le32(1)) -- goto errout; -- if (buffer_verified(bh)) -- return 0; -- -- error = -EFSBADCRC; -- if (!ext4_xattr_block_csum_verify(inode, bh)) -- goto errout; -- error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size, -- bh->b_data); --errout: -- if (error) -- __ext4_error_inode(inode, function, line, 0, -error, -- "corrupted xattr block %llu", -- (unsigned long long) bh->b_blocknr); -- else -- set_buffer_verified(bh); -- return error; -+ return check_xattrs(inode, bh, BFIRST(bh), bh->b_data + bh->b_size, -+ bh->b_data, function, line); - } - - #define ext4_xattr_check_block(inode, bh) \ - __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) - - --static int -+static inline int - __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, - void *end, const char *function, unsigned int line) - { -- int error = -EFSCORRUPTED; -- -- if (end - (void *)header < sizeof(*header) + sizeof(u32) || -- (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) -- goto errout; -- error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header)); --errout: -- if (error) -- __ext4_error_inode(inode, function, line, 0, -error, -- "corrupted in-inode xattr"); -- return error; -+ return check_xattrs(inode, NULL, IFIRST(header), end, IFIRST(header), -+ function, line); - } - - #define xattr_check_inode(inode, header, end) \ -@@ -388,6 +422,17 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, - struct inode *inode; - int err; - -+ /* -+ * We have to check for this corruption early as otherwise -+ * iget_locked() could wait indefinitely for the state of our -+ * parent inode. 
-+ */ -+ if (parent->i_ino == ea_ino) { -+ ext4_error(parent->i_sb, -+ "Parent and EA inode have the same ino %lu", ea_ino); -+ return -EFSCORRUPTED; -+ } -+ - inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); -diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c -index e7537fd305dd..e191ecfb1fde 100644 ---- a/fs/gfs2/bmap.c -+++ b/fs/gfs2/bmap.c -@@ -956,26 +956,40 @@ static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, - goto out; - } - --static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos, -- unsigned len) -+static struct folio * -+gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len) - { -+ struct inode *inode = iter->inode; - unsigned int blockmask = i_blocksize(inode) - 1; - struct gfs2_sbd *sdp = GFS2_SB(inode); - unsigned int blocks; -+ struct folio *folio; -+ int status; - - blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits; -- return gfs2_trans_begin(sdp, RES_DINODE + blocks, 0); -+ status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0); -+ if (status) -+ return ERR_PTR(status); -+ -+ folio = iomap_get_folio(iter, pos); -+ if (IS_ERR(folio)) -+ gfs2_trans_end(sdp); -+ return folio; - } - --static void gfs2_iomap_page_done(struct inode *inode, loff_t pos, -- unsigned copied, struct page *page) -+static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos, -+ unsigned copied, struct folio *folio) - { - struct gfs2_trans *tr = current->journal_info; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - -- if (page && !gfs2_is_stuffed(ip)) -- gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied); -+ if (!gfs2_is_stuffed(ip)) -+ gfs2_page_add_databufs(ip, &folio->page, offset_in_page(pos), -+ copied); -+ -+ folio_unlock(folio); -+ folio_put(folio); - - if (tr->tr_num_buf_new) - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); -@@ -983,9 +997,9 @@ static void gfs2_iomap_page_done(struct inode *inode, loff_t pos, - gfs2_trans_end(sdp); - } - --static const struct iomap_page_ops gfs2_iomap_page_ops = { -- .page_prepare = gfs2_iomap_page_prepare, -- .page_done = gfs2_iomap_page_done, -+static const struct iomap_folio_ops gfs2_iomap_folio_ops = { -+ .get_folio = gfs2_iomap_get_folio, -+ .put_folio = gfs2_iomap_put_folio, - }; - - static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, -@@ -1061,7 +1075,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, - } - - if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip)) -- iomap->page_ops = &gfs2_iomap_page_ops; -+ iomap->folio_ops = &gfs2_iomap_folio_ops; - return 0; - - out_trans_end: -@@ -1277,7 +1291,7 @@ int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, - /* - * NOTE: Never call gfs2_block_zero_range with an open transaction because it - * uses iomap write to perform its actions, which begin their own transactions -- * (iomap_begin, page_prepare, etc.) -+ * (iomap_begin, get_folio, etc.) 
- */ - static int gfs2_block_zero_range(struct inode *inode, loff_t from, - unsigned int length) -diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c -index 356193e44cf0..d3c300563eb8 100644 ---- a/fs/iomap/buffered-io.c -+++ b/fs/iomap/buffered-io.c -@@ -457,6 +457,33 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) - } - EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); - -+/** -+ * iomap_get_folio - get a folio reference for writing -+ * @iter: iteration structure -+ * @pos: start offset of write -+ * -+ * Returns a locked reference to the folio at @pos, or an error pointer if the -+ * folio could not be obtained. -+ */ -+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) -+{ -+ unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; -+ struct folio *folio; -+ -+ if (iter->flags & IOMAP_NOWAIT) -+ fgp |= FGP_NOWAIT; -+ -+ folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, -+ fgp, mapping_gfp_mask(iter->inode->i_mapping)); -+ if (folio) -+ return folio; -+ -+ if (iter->flags & IOMAP_NOWAIT) -+ return ERR_PTR(-EAGAIN); -+ return ERR_PTR(-ENOMEM); -+} -+EXPORT_SYMBOL_GPL(iomap_get_folio); -+ - bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags) - { - trace_iomap_release_folio(folio->mapping->host, folio_pos(folio), -@@ -575,6 +602,30 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, - return 0; - } - -+static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos, -+ size_t len) -+{ -+ const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; -+ -+ if (folio_ops && folio_ops->get_folio) -+ return folio_ops->get_folio(iter, pos, len); -+ else -+ return iomap_get_folio(iter, pos); -+} -+ -+static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret, -+ struct folio *folio) -+{ -+ const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; -+ -+ if (folio_ops && folio_ops->put_folio) { -+ folio_ops->put_folio(iter->inode, pos, ret, folio); -+ } else { -+ folio_unlock(folio); -+ folio_put(folio); -+ } -+} -+ - static int iomap_write_begin_inline(const struct iomap_iter *iter, - struct folio *folio) - { -@@ -587,15 +638,11 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter, - static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, - size_t len, struct folio **foliop) - { -- const struct iomap_page_ops *page_ops = iter->iomap.page_ops; -+ const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; - const struct iomap *srcmap = iomap_iter_srcmap(iter); - struct folio *folio; -- unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; - int status = 0; - -- if (iter->flags & IOMAP_NOWAIT) -- fgp |= FGP_NOWAIT; -- - BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); - if (srcmap != &iter->iomap) - BUG_ON(pos + len > srcmap->offset + srcmap->length); -@@ -606,18 +653,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, - if (!mapping_large_folio_support(iter->inode->i_mapping)) - len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); - -- if (page_ops && page_ops->page_prepare) { -- status = page_ops->page_prepare(iter->inode, pos, len); -- if (status) -- return status; -- } -- -- folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, -- fgp, mapping_gfp_mask(iter->inode->i_mapping)); -- if (!folio) { -- status = (iter->flags & IOMAP_NOWAIT) ? 
-EAGAIN : -ENOMEM; -- goto out_no_page; -- } -+ folio = __iomap_get_folio(iter, pos, len); -+ if (IS_ERR(folio)) -+ return PTR_ERR(folio); - - /* - * Now we have a locked folio, before we do anything with it we need to -@@ -629,9 +667,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, - * could do the wrong thing here (zero a page range incorrectly or fail - * to zero) and corrupt data. - */ -- if (page_ops && page_ops->iomap_valid) { -- bool iomap_valid = page_ops->iomap_valid(iter->inode, -- &iter->iomap); -+ if (folio_ops && folio_ops->iomap_valid) { -+ bool iomap_valid = folio_ops->iomap_valid(iter->inode, -+ &iter->iomap); - if (!iomap_valid) { - iter->iomap.flags |= IOMAP_F_STALE; - status = 0; -@@ -656,13 +694,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, - return 0; - - out_unlock: -- folio_unlock(folio); -- folio_put(folio); -+ __iomap_put_folio(iter, pos, 0, folio); - iomap_write_failed(iter->inode, pos, len); - --out_no_page: -- if (page_ops && page_ops->page_done) -- page_ops->page_done(iter->inode, pos, 0, NULL); - return status; - } - -@@ -712,7 +746,6 @@ static size_t iomap_write_end_inline(const struct iomap_iter *iter, - static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, - size_t copied, struct folio *folio) - { -- const struct iomap_page_ops *page_ops = iter->iomap.page_ops; - const struct iomap *srcmap = iomap_iter_srcmap(iter); - loff_t old_size = iter->inode->i_size; - size_t ret; -@@ -735,14 +768,10 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, - i_size_write(iter->inode, pos + ret); - iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; - } -- folio_unlock(folio); -+ __iomap_put_folio(iter, pos, ret, folio); - - if (old_size < pos) - pagecache_isize_extended(iter->inode, old_size, pos); -- if (page_ops && page_ops->page_done) -- page_ops->page_done(iter->inode, pos, ret, &folio->page); -- folio_put(folio); -- - if (ret < len) - iomap_write_failed(iter->inode, pos + ret, len - ret); - return ret; -diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c -index 9804714b1751..f771001574d0 100644 ---- a/fs/iomap/direct-io.c -+++ b/fs/iomap/direct-io.c -@@ -217,16 +217,10 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, - { - blk_opf_t opflags = REQ_SYNC | REQ_IDLE; - -- if (!(dio->flags & IOMAP_DIO_WRITE)) { -- WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND); -+ if (!(dio->flags & IOMAP_DIO_WRITE)) - return REQ_OP_READ; -- } -- -- if (iomap->flags & IOMAP_F_ZONE_APPEND) -- opflags |= REQ_OP_ZONE_APPEND; -- else -- opflags |= REQ_OP_WRITE; - -+ opflags |= REQ_OP_WRITE; - if (use_fua) - opflags |= REQ_FUA; - else -diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c -index 989cf341779b..f8ff81c3de76 100644 ---- a/fs/xfs/libxfs/xfs_alloc.c -+++ b/fs/xfs/libxfs/xfs_alloc.c -@@ -2472,20 +2472,20 @@ xfs_defer_agfl_block( - struct xfs_owner_info *oinfo) - { - struct xfs_mount *mp = tp->t_mountp; -- struct xfs_extent_free_item *new; /* new element */ -+ struct xfs_extent_free_item *xefi; - - ASSERT(xfs_extfree_item_cache != NULL); - ASSERT(oinfo != NULL); - -- new = kmem_cache_zalloc(xfs_extfree_item_cache, -+ xefi = kmem_cache_zalloc(xfs_extfree_item_cache, - GFP_KERNEL | __GFP_NOFAIL); -- new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); -- new->xefi_blockcount = 1; -- new->xefi_owner = oinfo->oi_owner; -+ xefi->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); -+ xefi->xefi_blockcount = 1; -+ xefi->xefi_owner = oinfo->oi_owner; - - 
trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); - -- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list); -+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); - } - - /* -@@ -2500,7 +2500,7 @@ __xfs_free_extent_later( - const struct xfs_owner_info *oinfo, - bool skip_discard) - { -- struct xfs_extent_free_item *new; /* new element */ -+ struct xfs_extent_free_item *xefi; - #ifdef DEBUG - struct xfs_mount *mp = tp->t_mountp; - xfs_agnumber_t agno; -@@ -2519,27 +2519,27 @@ __xfs_free_extent_later( - #endif - ASSERT(xfs_extfree_item_cache != NULL); - -- new = kmem_cache_zalloc(xfs_extfree_item_cache, -+ xefi = kmem_cache_zalloc(xfs_extfree_item_cache, - GFP_KERNEL | __GFP_NOFAIL); -- new->xefi_startblock = bno; -- new->xefi_blockcount = (xfs_extlen_t)len; -+ xefi->xefi_startblock = bno; -+ xefi->xefi_blockcount = (xfs_extlen_t)len; - if (skip_discard) -- new->xefi_flags |= XFS_EFI_SKIP_DISCARD; -+ xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; - if (oinfo) { - ASSERT(oinfo->oi_offset == 0); - - if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) -- new->xefi_flags |= XFS_EFI_ATTR_FORK; -+ xefi->xefi_flags |= XFS_EFI_ATTR_FORK; - if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) -- new->xefi_flags |= XFS_EFI_BMBT_BLOCK; -- new->xefi_owner = oinfo->oi_owner; -+ xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK; -+ xefi->xefi_owner = oinfo->oi_owner; - } else { -- new->xefi_owner = XFS_RMAP_OWN_NULL; -+ xefi->xefi_owner = XFS_RMAP_OWN_NULL; - } - trace_xfs_bmap_free_defer(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, - XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); -- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list); -+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); - } - - #ifdef DEBUG -diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c -index 0d56a8d862e8..c8c65387136c 100644 ---- a/fs/xfs/libxfs/xfs_bmap.c -+++ b/fs/xfs/libxfs/xfs_bmap.c -@@ -6146,39 +6146,37 @@ xfs_bmap_unmap_extent( - int - xfs_bmap_finish_one( - struct xfs_trans *tp, -- struct xfs_inode *ip, -- enum xfs_bmap_intent_type type, -- int whichfork, -- xfs_fileoff_t startoff, -- xfs_fsblock_t startblock, -- xfs_filblks_t *blockcount, -- xfs_exntst_t state) -+ struct xfs_bmap_intent *bi) - { -+ struct xfs_bmbt_irec *bmap = &bi->bi_bmap; - int error = 0; - - ASSERT(tp->t_firstblock == NULLFSBLOCK); - - trace_xfs_bmap_deferred(tp->t_mountp, -- XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, -- XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), -- ip->i_ino, whichfork, startoff, *blockcount, state); -+ XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock), -+ bi->bi_type, -+ XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock), -+ bi->bi_owner->i_ino, bi->bi_whichfork, -+ bmap->br_startoff, bmap->br_blockcount, -+ bmap->br_state); - -- if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK)) -+ if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK)) - return -EFSCORRUPTED; - - if (XFS_TEST_ERROR(false, tp->t_mountp, - XFS_ERRTAG_BMAP_FINISH_ONE)) - return -EIO; - -- switch (type) { -+ switch (bi->bi_type) { - case XFS_BMAP_MAP: -- error = xfs_bmapi_remap(tp, ip, startoff, *blockcount, -- startblock, 0); -- *blockcount = 0; -+ error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff, -+ bmap->br_blockcount, bmap->br_startblock, 0); -+ bmap->br_blockcount = 0; - break; - case XFS_BMAP_UNMAP: -- error = __xfs_bunmapi(tp, ip, startoff, blockcount, -- XFS_BMAPI_REMAP, 1); -+ error = __xfs_bunmapi(tp, bi->bi_owner, bmap->br_startoff, -+ &bmap->br_blockcount, XFS_BMAPI_REMAP, 1); - break; - default: - 
ASSERT(0); -diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h -index 16db95b11589..01c2df35c3e3 100644 ---- a/fs/xfs/libxfs/xfs_bmap.h -+++ b/fs/xfs/libxfs/xfs_bmap.h -@@ -234,10 +234,7 @@ struct xfs_bmap_intent { - struct xfs_bmbt_irec bi_bmap; - }; - --int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip, -- enum xfs_bmap_intent_type type, int whichfork, -- xfs_fileoff_t startoff, xfs_fsblock_t startblock, -- xfs_filblks_t *blockcount, xfs_exntst_t state); -+int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi); - void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, - struct xfs_bmbt_irec *imap); - void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, -diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c -index 35f574421670..da8c769887fd 100644 ---- a/fs/xfs/libxfs/xfs_btree.c -+++ b/fs/xfs/libxfs/xfs_btree.c -@@ -2913,9 +2913,22 @@ xfs_btree_split_worker( - } - - /* -- * BMBT split requests often come in with little stack to work on. Push -+ * BMBT split requests often come in with little stack to work on so we push - * them off to a worker thread so there is lots of stack to use. For the other - * btree types, just call directly to avoid the context switch overhead here. -+ * -+ * Care must be taken here - the work queue rescuer thread introduces potential -+ * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new -+ * AGFs to allocate blocks. A task being run by the rescuer could attempt to -+ * lock an AGF that is already locked by a task queued to run by the rescuer, -+ * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to -+ * release it until the current thread it is running gains the lock. -+ * -+ * To avoid this issue, we only ever queue BMBT splits that don't have an AGF -+ * already locked to allocate from. The only place that doesn't hold an AGF -+ * locked is unwritten extent conversion at IO completion, but that has already -+ * been offloaded to a worker thread and hence has no stack consumption issues -+ * we have to worry about. 
- */ - STATIC int /* error */ - xfs_btree_split( -@@ -2929,7 +2942,8 @@ xfs_btree_split( - struct xfs_btree_split_args args; - DECLARE_COMPLETION_ONSTACK(done); - -- if (cur->bc_btnum != XFS_BTNUM_BMAP) -+ if (cur->bc_btnum != XFS_BTNUM_BMAP || -+ cur->bc_tp->t_firstblock == NULLFSBLOCK) - return __xfs_btree_split(cur, level, ptrp, key, curp, stat); - - args.cur = cur; -diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c -index 6f7ed9288fe4..bcf46aa0d08b 100644 ---- a/fs/xfs/libxfs/xfs_refcount.c -+++ b/fs/xfs/libxfs/xfs_refcount.c -@@ -1213,37 +1213,33 @@ xfs_refcount_adjust_extents( - STATIC int - xfs_refcount_adjust( - struct xfs_btree_cur *cur, -- xfs_agblock_t agbno, -- xfs_extlen_t aglen, -- xfs_agblock_t *new_agbno, -- xfs_extlen_t *new_aglen, -+ xfs_agblock_t *agbno, -+ xfs_extlen_t *aglen, - enum xfs_refc_adjust_op adj) - { - bool shape_changed; - int shape_changes = 0; - int error; - -- *new_agbno = agbno; -- *new_aglen = aglen; - if (adj == XFS_REFCOUNT_ADJUST_INCREASE) -- trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.pag->pag_agno, -- agbno, aglen); -+ trace_xfs_refcount_increase(cur->bc_mp, -+ cur->bc_ag.pag->pag_agno, *agbno, *aglen); - else -- trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.pag->pag_agno, -- agbno, aglen); -+ trace_xfs_refcount_decrease(cur->bc_mp, -+ cur->bc_ag.pag->pag_agno, *agbno, *aglen); - - /* - * Ensure that no rcextents cross the boundary of the adjustment range. - */ - error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, -- agbno, &shape_changed); -+ *agbno, &shape_changed); - if (error) - goto out_error; - if (shape_changed) - shape_changes++; - - error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, -- agbno + aglen, &shape_changed); -+ *agbno + *aglen, &shape_changed); - if (error) - goto out_error; - if (shape_changed) -@@ -1253,7 +1249,7 @@ xfs_refcount_adjust( - * Try to merge with the left or right extents of the range. 
- */ - error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED, -- new_agbno, new_aglen, adj, &shape_changed); -+ agbno, aglen, adj, &shape_changed); - if (error) - goto out_error; - if (shape_changed) -@@ -1262,7 +1258,7 @@ xfs_refcount_adjust( - cur->bc_ag.refc.shape_changes++; - - /* Now that we've taken care of the ends, adjust the middle extents */ -- error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, adj); -+ error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj); - if (error) - goto out_error; - -@@ -1298,21 +1294,20 @@ xfs_refcount_finish_one_cleanup( - static inline int - xfs_refcount_continue_op( - struct xfs_btree_cur *cur, -- xfs_fsblock_t startblock, -- xfs_agblock_t new_agbno, -- xfs_extlen_t new_len, -- xfs_fsblock_t *new_fsbno) -+ struct xfs_refcount_intent *ri, -+ xfs_agblock_t new_agbno) - { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_perag *pag = cur->bc_ag.pag; - -- if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len))) -+ if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, -+ ri->ri_blockcount))) - return -EFSCORRUPTED; - -- *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); -+ ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); - -- ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len)); -- ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno)); -+ ASSERT(xfs_verify_fsbext(mp, ri->ri_startblock, ri->ri_blockcount)); -+ ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); - - return 0; - } -@@ -1327,11 +1322,7 @@ xfs_refcount_continue_op( - int - xfs_refcount_finish_one( - struct xfs_trans *tp, -- enum xfs_refcount_intent_type type, -- xfs_fsblock_t startblock, -- xfs_extlen_t blockcount, -- xfs_fsblock_t *new_fsb, -- xfs_extlen_t *new_len, -+ struct xfs_refcount_intent *ri, - struct xfs_btree_cur **pcur) - { - struct xfs_mount *mp = tp->t_mountp; -@@ -1339,17 +1330,16 @@ xfs_refcount_finish_one( - struct xfs_buf *agbp = NULL; - int error = 0; - xfs_agblock_t bno; -- xfs_agblock_t new_agbno; - unsigned long nr_ops = 0; - int shape_changes = 0; - struct xfs_perag *pag; - -- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); -- bno = XFS_FSB_TO_AGBNO(mp, startblock); -+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); -+ bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock); - -- trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock), -- type, XFS_FSB_TO_AGBNO(mp, startblock), -- blockcount); -+ trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock), -+ ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock), -+ ri->ri_blockcount); - - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) { - error = -EIO; -@@ -1380,42 +1370,42 @@ xfs_refcount_finish_one( - } - *pcur = rcur; - -- switch (type) { -+ switch (ri->ri_type) { - case XFS_REFCOUNT_INCREASE: -- error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, -- new_len, XFS_REFCOUNT_ADJUST_INCREASE); -+ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, -+ XFS_REFCOUNT_ADJUST_INCREASE); - if (error) - goto out_drop; -- if (*new_len > 0) -- error = xfs_refcount_continue_op(rcur, startblock, -- new_agbno, *new_len, new_fsb); -+ if (ri->ri_blockcount > 0) -+ error = xfs_refcount_continue_op(rcur, ri, bno); - break; - case XFS_REFCOUNT_DECREASE: -- error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, -- new_len, XFS_REFCOUNT_ADJUST_DECREASE); -+ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, -+ XFS_REFCOUNT_ADJUST_DECREASE); - if (error) - goto out_drop; -- if 
(*new_len > 0) -- error = xfs_refcount_continue_op(rcur, startblock, -- new_agbno, *new_len, new_fsb); -+ if (ri->ri_blockcount > 0) -+ error = xfs_refcount_continue_op(rcur, ri, bno); - break; - case XFS_REFCOUNT_ALLOC_COW: -- *new_fsb = startblock + blockcount; -- *new_len = 0; -- error = __xfs_refcount_cow_alloc(rcur, bno, blockcount); -+ error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount); -+ if (error) -+ goto out_drop; -+ ri->ri_blockcount = 0; - break; - case XFS_REFCOUNT_FREE_COW: -- *new_fsb = startblock + blockcount; -- *new_len = 0; -- error = __xfs_refcount_cow_free(rcur, bno, blockcount); -+ error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount); -+ if (error) -+ goto out_drop; -+ ri->ri_blockcount = 0; - break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - } -- if (!error && *new_len > 0) -- trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, type, -- bno, blockcount, new_agbno, *new_len); -+ if (!error && ri->ri_blockcount > 0) -+ trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, -+ ri->ri_type, bno, ri->ri_blockcount); - out_drop: - xfs_perag_put(pag); - return error; -diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h -index 452f30556f5a..c633477ce3ce 100644 ---- a/fs/xfs/libxfs/xfs_refcount.h -+++ b/fs/xfs/libxfs/xfs_refcount.h -@@ -75,9 +75,7 @@ void xfs_refcount_decrease_extent(struct xfs_trans *tp, - extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, - struct xfs_btree_cur *rcur, int error); - extern int xfs_refcount_finish_one(struct xfs_trans *tp, -- enum xfs_refcount_intent_type type, xfs_fsblock_t startblock, -- xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb, -- xfs_extlen_t *new_len, struct xfs_btree_cur **pcur); -+ struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); - - extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, - xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, -diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c -index b56aca1e7c66..df720041cd3d 100644 ---- a/fs/xfs/libxfs/xfs_rmap.c -+++ b/fs/xfs/libxfs/xfs_rmap.c -@@ -2390,13 +2390,7 @@ xfs_rmap_finish_one_cleanup( - int - xfs_rmap_finish_one( - struct xfs_trans *tp, -- enum xfs_rmap_intent_type type, -- uint64_t owner, -- int whichfork, -- xfs_fileoff_t startoff, -- xfs_fsblock_t startblock, -- xfs_filblks_t blockcount, -- xfs_exntst_t state, -+ struct xfs_rmap_intent *ri, - struct xfs_btree_cur **pcur) - { - struct xfs_mount *mp = tp->t_mountp; -@@ -2408,11 +2402,13 @@ xfs_rmap_finish_one( - xfs_agblock_t bno; - bool unwritten; - -- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); -- bno = XFS_FSB_TO_AGBNO(mp, startblock); -+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock)); -+ bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock); - -- trace_xfs_rmap_deferred(mp, pag->pag_agno, type, bno, owner, whichfork, -- startoff, blockcount, state); -+ trace_xfs_rmap_deferred(mp, pag->pag_agno, ri->ri_type, bno, -+ ri->ri_owner, ri->ri_whichfork, -+ ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount, -+ ri->ri_bmap.br_state); - - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) { - error = -EIO; -@@ -2448,35 +2444,37 @@ xfs_rmap_finish_one( - } - *pcur = rcur; - -- xfs_rmap_ino_owner(&oinfo, owner, whichfork, startoff); -- unwritten = state == XFS_EXT_UNWRITTEN; -- bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, startblock); -+ xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork, -+ ri->ri_bmap.br_startoff); -+ unwritten = ri->ri_bmap.br_state == 
XFS_EXT_UNWRITTEN; -+ bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock); - -- switch (type) { -+ switch (ri->ri_type) { - case XFS_RMAP_ALLOC: - case XFS_RMAP_MAP: -- error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo); -+ error = xfs_rmap_map(rcur, bno, ri->ri_bmap.br_blockcount, -+ unwritten, &oinfo); - break; - case XFS_RMAP_MAP_SHARED: -- error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten, -- &oinfo); -+ error = xfs_rmap_map_shared(rcur, bno, -+ ri->ri_bmap.br_blockcount, unwritten, &oinfo); - break; - case XFS_RMAP_FREE: - case XFS_RMAP_UNMAP: -- error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten, -- &oinfo); -+ error = xfs_rmap_unmap(rcur, bno, ri->ri_bmap.br_blockcount, -+ unwritten, &oinfo); - break; - case XFS_RMAP_UNMAP_SHARED: -- error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten, -- &oinfo); -+ error = xfs_rmap_unmap_shared(rcur, bno, -+ ri->ri_bmap.br_blockcount, unwritten, &oinfo); - break; - case XFS_RMAP_CONVERT: -- error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten, -- &oinfo); -+ error = xfs_rmap_convert(rcur, bno, ri->ri_bmap.br_blockcount, -+ !unwritten, &oinfo); - break; - case XFS_RMAP_CONVERT_SHARED: -- error = xfs_rmap_convert_shared(rcur, bno, blockcount, -- !unwritten, &oinfo); -+ error = xfs_rmap_convert_shared(rcur, bno, -+ ri->ri_bmap.br_blockcount, !unwritten, &oinfo); - break; - default: - ASSERT(0); -diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h -index 54741a591a17..2dac88cea28d 100644 ---- a/fs/xfs/libxfs/xfs_rmap.h -+++ b/fs/xfs/libxfs/xfs_rmap.h -@@ -179,10 +179,8 @@ void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, - - void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, - struct xfs_btree_cur *rcur, int error); --int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, -- uint64_t owner, int whichfork, xfs_fileoff_t startoff, -- xfs_fsblock_t startblock, xfs_filblks_t blockcount, -- xfs_exntst_t state, struct xfs_btree_cur **pcur); -+int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri, -+ struct xfs_btree_cur **pcur); - - int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, - uint64_t owner, uint64_t offset, unsigned int flags, -diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c -index 41323da523d1..6e2f0013380a 100644 ---- a/fs/xfs/xfs_bmap_item.c -+++ b/fs/xfs/xfs_bmap_item.c -@@ -246,18 +246,11 @@ static int - xfs_trans_log_finish_bmap_update( - struct xfs_trans *tp, - struct xfs_bud_log_item *budp, -- enum xfs_bmap_intent_type type, -- struct xfs_inode *ip, -- int whichfork, -- xfs_fileoff_t startoff, -- xfs_fsblock_t startblock, -- xfs_filblks_t *blockcount, -- xfs_exntst_t state) -+ struct xfs_bmap_intent *bi) - { - int error; - -- error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff, -- startblock, blockcount, state); -+ error = xfs_bmap_finish_one(tp, bi); - - /* - * Mark the transaction dirty, even on error. This ensures the -@@ -290,24 +283,24 @@ xfs_bmap_update_diff_items( - /* Set the map extent flags for this mapping. 
*/ - static void - xfs_trans_set_bmap_flags( -- struct xfs_map_extent *bmap, -+ struct xfs_map_extent *map, - enum xfs_bmap_intent_type type, - int whichfork, - xfs_exntst_t state) - { -- bmap->me_flags = 0; -+ map->me_flags = 0; - switch (type) { - case XFS_BMAP_MAP: - case XFS_BMAP_UNMAP: -- bmap->me_flags = type; -+ map->me_flags = type; - break; - default: - ASSERT(0); - } - if (state == XFS_EXT_UNWRITTEN) -- bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; -+ map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; - if (whichfork == XFS_ATTR_FORK) -- bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; -+ map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; - } - - /* Log bmap updates in the intent item. */ -@@ -315,7 +308,7 @@ STATIC void - xfs_bmap_update_log_item( - struct xfs_trans *tp, - struct xfs_bui_log_item *buip, -- struct xfs_bmap_intent *bmap) -+ struct xfs_bmap_intent *bi) - { - uint next_extent; - struct xfs_map_extent *map; -@@ -331,12 +324,12 @@ xfs_bmap_update_log_item( - next_extent = atomic_inc_return(&buip->bui_next_extent) - 1; - ASSERT(next_extent < buip->bui_format.bui_nextents); - map = &buip->bui_format.bui_extents[next_extent]; -- map->me_owner = bmap->bi_owner->i_ino; -- map->me_startblock = bmap->bi_bmap.br_startblock; -- map->me_startoff = bmap->bi_bmap.br_startoff; -- map->me_len = bmap->bi_bmap.br_blockcount; -- xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork, -- bmap->bi_bmap.br_state); -+ map->me_owner = bi->bi_owner->i_ino; -+ map->me_startblock = bi->bi_bmap.br_startblock; -+ map->me_startoff = bi->bi_bmap.br_startoff; -+ map->me_len = bi->bi_bmap.br_blockcount; -+ xfs_trans_set_bmap_flags(map, bi->bi_type, bi->bi_whichfork, -+ bi->bi_bmap.br_state); - } - - static struct xfs_log_item * -@@ -348,15 +341,15 @@ xfs_bmap_update_create_intent( - { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_bui_log_item *buip = xfs_bui_init(mp); -- struct xfs_bmap_intent *bmap; -+ struct xfs_bmap_intent *bi; - - ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); - - xfs_trans_add_item(tp, &buip->bui_item); - if (sort) - list_sort(mp, items, xfs_bmap_update_diff_items); -- list_for_each_entry(bmap, items, bi_list) -- xfs_bmap_update_log_item(tp, buip, bmap); -+ list_for_each_entry(bi, items, bi_list) -+ xfs_bmap_update_log_item(tp, buip, bi); - return &buip->bui_item; - } - -@@ -378,25 +371,17 @@ xfs_bmap_update_finish_item( - struct list_head *item, - struct xfs_btree_cur **state) - { -- struct xfs_bmap_intent *bmap; -- xfs_filblks_t count; -+ struct xfs_bmap_intent *bi; - int error; - -- bmap = container_of(item, struct xfs_bmap_intent, bi_list); -- count = bmap->bi_bmap.br_blockcount; -- error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), -- bmap->bi_type, -- bmap->bi_owner, bmap->bi_whichfork, -- bmap->bi_bmap.br_startoff, -- bmap->bi_bmap.br_startblock, -- &count, -- bmap->bi_bmap.br_state); -- if (!error && count > 0) { -- ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); -- bmap->bi_bmap.br_blockcount = count; -+ bi = container_of(item, struct xfs_bmap_intent, bi_list); -+ -+ error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi); -+ if (!error && bi->bi_bmap.br_blockcount > 0) { -+ ASSERT(bi->bi_type == XFS_BMAP_UNMAP); - return -EAGAIN; - } -- kmem_cache_free(xfs_bmap_intent_cache, bmap); -+ kmem_cache_free(xfs_bmap_intent_cache, bi); - return error; - } - -@@ -413,10 +398,10 @@ STATIC void - xfs_bmap_update_cancel_item( - struct list_head *item) - { -- struct xfs_bmap_intent *bmap; -+ struct xfs_bmap_intent *bi; - -- bmap = container_of(item, struct xfs_bmap_intent, 
bi_list); -- kmem_cache_free(xfs_bmap_intent_cache, bmap); -+ bi = container_of(item, struct xfs_bmap_intent, bi_list); -+ kmem_cache_free(xfs_bmap_intent_cache, bi); - } - - const struct xfs_defer_op_type xfs_bmap_update_defer_type = { -@@ -434,18 +419,18 @@ xfs_bui_validate( - struct xfs_mount *mp, - struct xfs_bui_log_item *buip) - { -- struct xfs_map_extent *bmap; -+ struct xfs_map_extent *map; - - /* Only one mapping operation per BUI... */ - if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) - return false; - -- bmap = &buip->bui_format.bui_extents[0]; -+ map = &buip->bui_format.bui_extents[0]; - -- if (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS) -+ if (map->me_flags & ~XFS_BMAP_EXTENT_FLAGS) - return false; - -- switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { -+ switch (map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { - case XFS_BMAP_MAP: - case XFS_BMAP_UNMAP: - break; -@@ -453,13 +438,13 @@ xfs_bui_validate( - return false; - } - -- if (!xfs_verify_ino(mp, bmap->me_owner)) -+ if (!xfs_verify_ino(mp, map->me_owner)) - return false; - -- if (!xfs_verify_fileext(mp, bmap->me_startoff, bmap->me_len)) -+ if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) - return false; - -- return xfs_verify_fsbext(mp, bmap->me_startblock, bmap->me_len); -+ return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); - } - - /* -@@ -471,17 +456,13 @@ xfs_bui_item_recover( - struct xfs_log_item *lip, - struct list_head *capture_list) - { -- struct xfs_bmbt_irec irec; -+ struct xfs_bmap_intent fake = { }; - struct xfs_bui_log_item *buip = BUI_ITEM(lip); - struct xfs_trans *tp; - struct xfs_inode *ip = NULL; - struct xfs_mount *mp = lip->li_log->l_mp; -- struct xfs_map_extent *bmap; -+ struct xfs_map_extent *map; - struct xfs_bud_log_item *budp; -- xfs_filblks_t count; -- xfs_exntst_t state; -- unsigned int bui_type; -- int whichfork; - int iext_delta; - int error = 0; - -@@ -491,14 +472,12 @@ xfs_bui_item_recover( - return -EFSCORRUPTED; - } - -- bmap = &buip->bui_format.bui_extents[0]; -- state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? -- XFS_EXT_UNWRITTEN : XFS_EXT_NORM; -- whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? -+ map = &buip->bui_format.bui_extents[0]; -+ fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; -- bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; -+ fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; - -- error = xlog_recover_iget(mp, bmap->me_owner, &ip); -+ error = xlog_recover_iget(mp, map->me_owner, &ip); - if (error) - return error; - -@@ -512,34 +491,34 @@ xfs_bui_item_recover( - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - -- if (bui_type == XFS_BMAP_MAP) -+ if (fake.bi_type == XFS_BMAP_MAP) - iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; - else - iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; - -- error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta); -+ error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, iext_delta); - if (error) - goto err_cancel; - -- count = bmap->me_len; -- error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, -- whichfork, bmap->me_startoff, bmap->me_startblock, -- &count, state); -+ fake.bi_owner = ip; -+ fake.bi_bmap.br_startblock = map->me_startblock; -+ fake.bi_bmap.br_startoff = map->me_startoff; -+ fake.bi_bmap.br_blockcount = map->me_len; -+ fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? 
-+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; -+ -+ error = xfs_trans_log_finish_bmap_update(tp, budp, &fake); - if (error == -EFSCORRUPTED) -- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bmap, -- sizeof(*bmap)); -+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, -+ sizeof(*map)); - if (error) - goto err_cancel; - -- if (count > 0) { -- ASSERT(bui_type == XFS_BMAP_UNMAP); -- irec.br_startblock = bmap->me_startblock; -- irec.br_blockcount = count; -- irec.br_startoff = bmap->me_startoff; -- irec.br_state = state; -- xfs_bmap_unmap_extent(tp, ip, &irec); -+ if (fake.bi_bmap.br_blockcount > 0) { -+ ASSERT(fake.bi_type == XFS_BMAP_UNMAP); -+ xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap); - } - - /* -@@ -579,18 +558,18 @@ xfs_bui_item_relog( - { - struct xfs_bud_log_item *budp; - struct xfs_bui_log_item *buip; -- struct xfs_map_extent *extp; -+ struct xfs_map_extent *map; - unsigned int count; - - count = BUI_ITEM(intent)->bui_format.bui_nextents; -- extp = BUI_ITEM(intent)->bui_format.bui_extents; -+ map = BUI_ITEM(intent)->bui_format.bui_extents; - - tp->t_flags |= XFS_TRANS_DIRTY; - budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); - - buip = xfs_bui_init(tp->t_mountp); -- memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp)); -+ memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map)); - atomic_set(&buip->bui_next_extent, count); - xfs_trans_add_item(tp, &buip->bui_item); - set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); -diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c -index ae082808cfed..b2cbbba3e15a 100644 ---- a/fs/xfs/xfs_error.c -+++ b/fs/xfs/xfs_error.c -@@ -228,7 +228,7 @@ static struct attribute *xfs_errortag_attrs[] = { - }; - ATTRIBUTE_GROUPS(xfs_errortag); - --static struct kobj_type xfs_errortag_ktype = { -+static const struct kobj_type xfs_errortag_ktype = { - .release = xfs_sysfs_release, - .sysfs_ops = &xfs_errortag_sysfs_ops, - .default_groups = xfs_errortag_groups, -diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h -index dbe6c37dc697..0b9c5ba8a598 100644 ---- a/fs/xfs/xfs_error.h -+++ b/fs/xfs/xfs_error.h -@@ -75,7 +75,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); - - /* - * XFS panic tags -- allow a call to xfs_alert_tag() be turned into -- * a panic by setting xfs_panic_mask in a sysctl. -+ * a panic by setting fs.xfs.panic_mask in a sysctl. 
- */ - #define XFS_NO_PTAG 0u - #define XFS_PTAG_IFLUSH (1u << 0) -@@ -88,6 +88,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); - #define XFS_PTAG_FSBLOCK_ZERO (1u << 7) - #define XFS_PTAG_VERIFIER_ERROR (1u << 8) - -+#define XFS_PTAG_MASK (XFS_PTAG_IFLUSH | \ -+ XFS_PTAG_LOGRES | \ -+ XFS_PTAG_AILDELETE | \ -+ XFS_PTAG_ERROR_REPORT | \ -+ XFS_PTAG_SHUTDOWN_CORRUPT | \ -+ XFS_PTAG_SHUTDOWN_IOERROR | \ -+ XFS_PTAG_SHUTDOWN_LOGERROR | \ -+ XFS_PTAG_FSBLOCK_ZERO | \ -+ XFS_PTAG_VERIFIER_ERROR) -+ - #define XFS_PTAG_STRINGS \ - { XFS_NO_PTAG, "none" }, \ - { XFS_PTAG_IFLUSH, "iflush" }, \ -diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c -index d5130d1fcfae..011b50469301 100644 ---- a/fs/xfs/xfs_extfree_item.c -+++ b/fs/xfs/xfs_extfree_item.c -@@ -345,23 +345,30 @@ static int - xfs_trans_free_extent( - struct xfs_trans *tp, - struct xfs_efd_log_item *efdp, -- xfs_fsblock_t start_block, -- xfs_extlen_t ext_len, -- const struct xfs_owner_info *oinfo, -- bool skip_discard) -+ struct xfs_extent_free_item *xefi) - { -+ struct xfs_owner_info oinfo = { }; - struct xfs_mount *mp = tp->t_mountp; - struct xfs_extent *extp; - uint next_extent; -- xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block); -+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, -+ xefi->xefi_startblock); - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, -- start_block); -+ xefi->xefi_startblock); - int error; - -- trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); -+ oinfo.oi_owner = xefi->xefi_owner; -+ if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) -+ oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; -+ if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) -+ oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; -+ -+ trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, -+ xefi->xefi_blockcount); - -- error = __xfs_free_extent(tp, start_block, ext_len, -- oinfo, XFS_AG_RESV_NONE, skip_discard); -+ error = __xfs_free_extent(tp, xefi->xefi_startblock, -+ xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE, -+ xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); - /* - * Mark the transaction dirty, even on error. 
This ensures the - * transaction is aborted, which: -@@ -375,8 +382,8 @@ xfs_trans_free_extent( - next_extent = efdp->efd_next_extent; - ASSERT(next_extent < efdp->efd_format.efd_nextents); - extp = &(efdp->efd_format.efd_extents[next_extent]); -- extp->ext_start = start_block; -- extp->ext_len = ext_len; -+ extp->ext_start = xefi->xefi_startblock; -+ extp->ext_len = xefi->xefi_blockcount; - efdp->efd_next_extent++; - - return error; -@@ -404,7 +411,7 @@ STATIC void - xfs_extent_free_log_item( - struct xfs_trans *tp, - struct xfs_efi_log_item *efip, -- struct xfs_extent_free_item *free) -+ struct xfs_extent_free_item *xefi) - { - uint next_extent; - struct xfs_extent *extp; -@@ -420,8 +427,8 @@ xfs_extent_free_log_item( - next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; - ASSERT(next_extent < efip->efi_format.efi_nextents); - extp = &efip->efi_format.efi_extents[next_extent]; -- extp->ext_start = free->xefi_startblock; -- extp->ext_len = free->xefi_blockcount; -+ extp->ext_start = xefi->xefi_startblock; -+ extp->ext_len = xefi->xefi_blockcount; - } - - static struct xfs_log_item * -@@ -433,15 +440,15 @@ xfs_extent_free_create_intent( - { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_efi_log_item *efip = xfs_efi_init(mp, count); -- struct xfs_extent_free_item *free; -+ struct xfs_extent_free_item *xefi; - - ASSERT(count > 0); - - xfs_trans_add_item(tp, &efip->efi_item); - if (sort) - list_sort(mp, items, xfs_extent_free_diff_items); -- list_for_each_entry(free, items, xefi_list) -- xfs_extent_free_log_item(tp, efip, free); -+ list_for_each_entry(xefi, items, xefi_list) -+ xfs_extent_free_log_item(tp, efip, xefi); - return &efip->efi_item; - } - -@@ -463,21 +470,13 @@ xfs_extent_free_finish_item( - struct list_head *item, - struct xfs_btree_cur **state) - { -- struct xfs_owner_info oinfo = { }; -- struct xfs_extent_free_item *free; -+ struct xfs_extent_free_item *xefi; - int error; - -- free = container_of(item, struct xfs_extent_free_item, xefi_list); -- oinfo.oi_owner = free->xefi_owner; -- if (free->xefi_flags & XFS_EFI_ATTR_FORK) -- oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; -- if (free->xefi_flags & XFS_EFI_BMBT_BLOCK) -- oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; -- error = xfs_trans_free_extent(tp, EFD_ITEM(done), -- free->xefi_startblock, -- free->xefi_blockcount, -- &oinfo, free->xefi_flags & XFS_EFI_SKIP_DISCARD); -- kmem_cache_free(xfs_extfree_item_cache, free); -+ xefi = container_of(item, struct xfs_extent_free_item, xefi_list); -+ -+ error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); -+ kmem_cache_free(xfs_extfree_item_cache, xefi); - return error; - } - -@@ -494,10 +493,10 @@ STATIC void - xfs_extent_free_cancel_item( - struct list_head *item) - { -- struct xfs_extent_free_item *free; -+ struct xfs_extent_free_item *xefi; - -- free = container_of(item, struct xfs_extent_free_item, xefi_list); -- kmem_cache_free(xfs_extfree_item_cache, free); -+ xefi = container_of(item, struct xfs_extent_free_item, xefi_list); -+ kmem_cache_free(xfs_extfree_item_cache, xefi); - } - - const struct xfs_defer_op_type xfs_extent_free_defer_type = { -@@ -523,7 +522,7 @@ xfs_agfl_free_finish_item( - struct xfs_owner_info oinfo = { }; - struct xfs_mount *mp = tp->t_mountp; - struct xfs_efd_log_item *efdp = EFD_ITEM(done); -- struct xfs_extent_free_item *free; -+ struct xfs_extent_free_item *xefi; - struct xfs_extent *extp; - struct xfs_buf *agbp; - int error; -@@ -532,13 +531,13 @@ xfs_agfl_free_finish_item( - uint next_extent; - struct xfs_perag *pag; - -- free = 
container_of(item, struct xfs_extent_free_item, xefi_list); -- ASSERT(free->xefi_blockcount == 1); -- agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock); -- agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock); -- oinfo.oi_owner = free->xefi_owner; -+ xefi = container_of(item, struct xfs_extent_free_item, xefi_list); -+ ASSERT(xefi->xefi_blockcount == 1); -+ agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock); -+ agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); -+ oinfo.oi_owner = xefi->xefi_owner; - -- trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount); -+ trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, xefi->xefi_blockcount); - - pag = xfs_perag_get(mp, agno); - error = xfs_alloc_read_agf(pag, tp, 0, &agbp); -@@ -559,11 +558,11 @@ xfs_agfl_free_finish_item( - next_extent = efdp->efd_next_extent; - ASSERT(next_extent < efdp->efd_format.efd_nextents); - extp = &(efdp->efd_format.efd_extents[next_extent]); -- extp->ext_start = free->xefi_startblock; -- extp->ext_len = free->xefi_blockcount; -+ extp->ext_start = xefi->xefi_startblock; -+ extp->ext_len = xefi->xefi_blockcount; - efdp->efd_next_extent++; - -- kmem_cache_free(xfs_extfree_item_cache, free); -+ kmem_cache_free(xfs_extfree_item_cache, xefi); - return error; - } - -@@ -599,7 +598,6 @@ xfs_efi_item_recover( - struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_efd_log_item *efdp; - struct xfs_trans *tp; -- struct xfs_extent *extp; - int i; - int error = 0; - -@@ -624,10 +622,17 @@ xfs_efi_item_recover( - efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); - - for (i = 0; i < efip->efi_format.efi_nextents; i++) { -+ struct xfs_extent_free_item fake = { -+ .xefi_owner = XFS_RMAP_OWN_UNKNOWN, -+ }; -+ struct xfs_extent *extp; -+ - extp = &efip->efi_format.efi_extents[i]; -- error = xfs_trans_free_extent(tp, efdp, extp->ext_start, -- extp->ext_len, -- &XFS_RMAP_OINFO_ANY_OWNER, false); -+ -+ fake.xefi_startblock = extp->ext_start; -+ fake.xefi_blockcount = extp->ext_len; -+ -+ error = xfs_trans_free_extent(tp, efdp, &fake); - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - extp, sizeof(*extp)); -diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c -index 88a88506ffff..92ca2017eded 100644 ---- a/fs/xfs/xfs_fsmap.c -+++ b/fs/xfs/xfs_fsmap.c -@@ -761,6 +761,7 @@ xfs_getfsmap_datadev_bnobt( - { - struct xfs_alloc_rec_incore akeys[2]; - -+ memset(akeys, 0, sizeof(akeys)); - info->missing_owner = XFS_FMR_OWN_UNKNOWN; - return __xfs_getfsmap_datadev(tp, keys, info, - xfs_getfsmap_datadev_bnobt_query, &akeys[0]); -diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c -index 4d0a98f920ca..9edc1f2bc939 100644 ---- a/fs/xfs/xfs_globals.c -+++ b/fs/xfs/xfs_globals.c -@@ -4,6 +4,7 @@ - * All Rights Reserved. - */ - #include "xfs.h" -+#include "xfs_error.h" - - /* - * Tunable XFS parameters. 
xfs_params is required even when CONFIG_SYSCTL=n, -@@ -15,7 +16,7 @@ xfs_param_t xfs_params = { - /* MIN DFLT MAX */ - .sgid_inherit = { 0, 0, 1 }, - .symlink_mode = { 0, 0, 1 }, -- .panic_mask = { 0, 0, 256 }, -+ .panic_mask = { 0, 0, XFS_PTAG_MASK}, - .error_level = { 0, 3, 11 }, - .syncd_timer = { 1*100, 30*100, 7200*100}, - .stats_clear = { 0, 0, 1 }, -diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c -index fc1946f80a4a..69dbe7814128 100644 ---- a/fs/xfs/xfs_iomap.c -+++ b/fs/xfs/xfs_iomap.c -@@ -83,7 +83,7 @@ xfs_iomap_valid( - return true; - } - --static const struct iomap_page_ops xfs_iomap_page_ops = { -+static const struct iomap_folio_ops xfs_iomap_folio_ops = { - .iomap_valid = xfs_iomap_valid, - }; - -@@ -133,7 +133,7 @@ xfs_bmbt_to_iomap( - iomap->flags |= IOMAP_F_DIRTY; - - iomap->validity_cookie = sequence_cookie; -- iomap->page_ops = &xfs_iomap_page_ops; -+ iomap->folio_ops = &xfs_iomap_folio_ops; - return 0; - } - -diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c -index 858e3e9eb4a8..48d771a76add 100644 ---- a/fs/xfs/xfs_refcount_item.c -+++ b/fs/xfs/xfs_refcount_item.c -@@ -252,17 +252,12 @@ static int - xfs_trans_log_finish_refcount_update( - struct xfs_trans *tp, - struct xfs_cud_log_item *cudp, -- enum xfs_refcount_intent_type type, -- xfs_fsblock_t startblock, -- xfs_extlen_t blockcount, -- xfs_fsblock_t *new_fsb, -- xfs_extlen_t *new_len, -+ struct xfs_refcount_intent *ri, - struct xfs_btree_cur **pcur) - { - int error; - -- error = xfs_refcount_finish_one(tp, type, startblock, -- blockcount, new_fsb, new_len, pcur); -+ error = xfs_refcount_finish_one(tp, ri, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the -@@ -297,16 +292,16 @@ xfs_refcount_update_diff_items( - /* Set the phys extent flags for this reverse mapping. 
*/ - static void - xfs_trans_set_refcount_flags( -- struct xfs_phys_extent *refc, -+ struct xfs_phys_extent *pmap, - enum xfs_refcount_intent_type type) - { -- refc->pe_flags = 0; -+ pmap->pe_flags = 0; - switch (type) { - case XFS_REFCOUNT_INCREASE: - case XFS_REFCOUNT_DECREASE: - case XFS_REFCOUNT_ALLOC_COW: - case XFS_REFCOUNT_FREE_COW: -- refc->pe_flags |= type; -+ pmap->pe_flags |= type; - break; - default: - ASSERT(0); -@@ -318,10 +313,10 @@ STATIC void - xfs_refcount_update_log_item( - struct xfs_trans *tp, - struct xfs_cui_log_item *cuip, -- struct xfs_refcount_intent *refc) -+ struct xfs_refcount_intent *ri) - { - uint next_extent; -- struct xfs_phys_extent *ext; -+ struct xfs_phys_extent *pmap; - - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); -@@ -333,10 +328,10 @@ xfs_refcount_update_log_item( - */ - next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1; - ASSERT(next_extent < cuip->cui_format.cui_nextents); -- ext = &cuip->cui_format.cui_extents[next_extent]; -- ext->pe_startblock = refc->ri_startblock; -- ext->pe_len = refc->ri_blockcount; -- xfs_trans_set_refcount_flags(ext, refc->ri_type); -+ pmap = &cuip->cui_format.cui_extents[next_extent]; -+ pmap->pe_startblock = ri->ri_startblock; -+ pmap->pe_len = ri->ri_blockcount; -+ xfs_trans_set_refcount_flags(pmap, ri->ri_type); - } - - static struct xfs_log_item * -@@ -348,15 +343,15 @@ xfs_refcount_update_create_intent( - { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count); -- struct xfs_refcount_intent *refc; -+ struct xfs_refcount_intent *ri; - - ASSERT(count > 0); - - xfs_trans_add_item(tp, &cuip->cui_item); - if (sort) - list_sort(mp, items, xfs_refcount_update_diff_items); -- list_for_each_entry(refc, items, ri_list) -- xfs_refcount_update_log_item(tp, cuip, refc); -+ list_for_each_entry(ri, items, ri_list) -+ xfs_refcount_update_log_item(tp, cuip, ri); - return &cuip->cui_item; - } - -@@ -378,25 +373,20 @@ xfs_refcount_update_finish_item( - struct list_head *item, - struct xfs_btree_cur **state) - { -- struct xfs_refcount_intent *refc; -- xfs_fsblock_t new_fsb; -- xfs_extlen_t new_aglen; -+ struct xfs_refcount_intent *ri; - int error; - -- refc = container_of(item, struct xfs_refcount_intent, ri_list); -- error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), -- refc->ri_type, refc->ri_startblock, refc->ri_blockcount, -- &new_fsb, &new_aglen, state); -+ ri = container_of(item, struct xfs_refcount_intent, ri_list); -+ error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri, -+ state); - - /* Did we run out of reservation? Requeue what we didn't finish. 
*/ -- if (!error && new_aglen > 0) { -- ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE || -- refc->ri_type == XFS_REFCOUNT_DECREASE); -- refc->ri_startblock = new_fsb; -- refc->ri_blockcount = new_aglen; -+ if (!error && ri->ri_blockcount > 0) { -+ ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE || -+ ri->ri_type == XFS_REFCOUNT_DECREASE); - return -EAGAIN; - } -- kmem_cache_free(xfs_refcount_intent_cache, refc); -+ kmem_cache_free(xfs_refcount_intent_cache, ri); - return error; - } - -@@ -413,10 +403,10 @@ STATIC void - xfs_refcount_update_cancel_item( - struct list_head *item) - { -- struct xfs_refcount_intent *refc; -+ struct xfs_refcount_intent *ri; - -- refc = container_of(item, struct xfs_refcount_intent, ri_list); -- kmem_cache_free(xfs_refcount_intent_cache, refc); -+ ri = container_of(item, struct xfs_refcount_intent, ri_list); -+ kmem_cache_free(xfs_refcount_intent_cache, ri); - } - - const struct xfs_defer_op_type xfs_refcount_update_defer_type = { -@@ -433,15 +423,15 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { - static inline bool - xfs_cui_validate_phys( - struct xfs_mount *mp, -- struct xfs_phys_extent *refc) -+ struct xfs_phys_extent *pmap) - { - if (!xfs_has_reflink(mp)) - return false; - -- if (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS) -+ if (pmap->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS) - return false; - -- switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { -+ switch (pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { - case XFS_REFCOUNT_INCREASE: - case XFS_REFCOUNT_DECREASE: - case XFS_REFCOUNT_ALLOC_COW: -@@ -451,7 +441,7 @@ xfs_cui_validate_phys( - return false; - } - -- return xfs_verify_fsbext(mp, refc->pe_startblock, refc->pe_len); -+ return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len); - } - - /* -@@ -463,18 +453,13 @@ xfs_cui_item_recover( - struct xfs_log_item *lip, - struct list_head *capture_list) - { -- struct xfs_bmbt_irec irec; - struct xfs_cui_log_item *cuip = CUI_ITEM(lip); -- struct xfs_phys_extent *refc; - struct xfs_cud_log_item *cudp; - struct xfs_trans *tp; - struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = lip->li_log->l_mp; -- xfs_fsblock_t new_fsb; -- xfs_extlen_t new_len; - unsigned int refc_type; - bool requeue_only = false; -- enum xfs_refcount_intent_type type; - int i; - int error = 0; - -@@ -513,14 +498,17 @@ xfs_cui_item_recover( - cudp = xfs_trans_get_cud(tp, cuip); - - for (i = 0; i < cuip->cui_format.cui_nextents; i++) { -- refc = &cuip->cui_format.cui_extents[i]; -- refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; -+ struct xfs_refcount_intent fake = { }; -+ struct xfs_phys_extent *pmap; -+ -+ pmap = &cuip->cui_format.cui_extents[i]; -+ refc_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; - switch (refc_type) { - case XFS_REFCOUNT_INCREASE: - case XFS_REFCOUNT_DECREASE: - case XFS_REFCOUNT_ALLOC_COW: - case XFS_REFCOUNT_FREE_COW: -- type = refc_type; -+ fake.ri_type = refc_type; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, -@@ -529,13 +517,12 @@ xfs_cui_item_recover( - error = -EFSCORRUPTED; - goto abort_error; - } -- if (requeue_only) { -- new_fsb = refc->pe_startblock; -- new_len = refc->pe_len; -- } else -+ -+ fake.ri_startblock = pmap->pe_startblock; -+ fake.ri_blockcount = pmap->pe_len; -+ if (!requeue_only) - error = xfs_trans_log_finish_refcount_update(tp, cudp, -- type, refc->pe_startblock, refc->pe_len, -- &new_fsb, &new_len, &rcur); -+ &fake, &rcur); - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, 
mp, - &cuip->cui_format, -@@ -544,10 +531,13 @@ xfs_cui_item_recover( - goto abort_error; - - /* Requeue what we didn't finish. */ -- if (new_len > 0) { -- irec.br_startblock = new_fsb; -- irec.br_blockcount = new_len; -- switch (type) { -+ if (fake.ri_blockcount > 0) { -+ struct xfs_bmbt_irec irec = { -+ .br_startblock = fake.ri_startblock, -+ .br_blockcount = fake.ri_blockcount, -+ }; -+ -+ switch (fake.ri_type) { - case XFS_REFCOUNT_INCREASE: - xfs_refcount_increase_extent(tp, &irec); - break; -@@ -596,18 +586,18 @@ xfs_cui_item_relog( - { - struct xfs_cud_log_item *cudp; - struct xfs_cui_log_item *cuip; -- struct xfs_phys_extent *extp; -+ struct xfs_phys_extent *pmap; - unsigned int count; - - count = CUI_ITEM(intent)->cui_format.cui_nextents; -- extp = CUI_ITEM(intent)->cui_format.cui_extents; -+ pmap = CUI_ITEM(intent)->cui_format.cui_extents; - - tp->t_flags |= XFS_TRANS_DIRTY; - cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); - - cuip = xfs_cui_init(tp->t_mountp, count); -- memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp)); -+ memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); - atomic_set(&cuip->cui_next_extent, count); - xfs_trans_add_item(tp, &cuip->cui_item); - set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); -diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c -index 534504ede1a3..a1619d67015f 100644 ---- a/fs/xfs/xfs_rmap_item.c -+++ b/fs/xfs/xfs_rmap_item.c -@@ -244,40 +244,40 @@ xfs_trans_get_rud( - /* Set the map extent flags for this reverse mapping. */ - static void - xfs_trans_set_rmap_flags( -- struct xfs_map_extent *rmap, -+ struct xfs_map_extent *map, - enum xfs_rmap_intent_type type, - int whichfork, - xfs_exntst_t state) - { -- rmap->me_flags = 0; -+ map->me_flags = 0; - if (state == XFS_EXT_UNWRITTEN) -- rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; -+ map->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; - if (whichfork == XFS_ATTR_FORK) -- rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; -+ map->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; - switch (type) { - case XFS_RMAP_MAP: -- rmap->me_flags |= XFS_RMAP_EXTENT_MAP; -+ map->me_flags |= XFS_RMAP_EXTENT_MAP; - break; - case XFS_RMAP_MAP_SHARED: -- rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; -+ map->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; - break; - case XFS_RMAP_UNMAP: -- rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP; -+ map->me_flags |= XFS_RMAP_EXTENT_UNMAP; - break; - case XFS_RMAP_UNMAP_SHARED: -- rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; -+ map->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; - break; - case XFS_RMAP_CONVERT: -- rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT; -+ map->me_flags |= XFS_RMAP_EXTENT_CONVERT; - break; - case XFS_RMAP_CONVERT_SHARED: -- rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; -+ map->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; - break; - case XFS_RMAP_ALLOC: -- rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC; -+ map->me_flags |= XFS_RMAP_EXTENT_ALLOC; - break; - case XFS_RMAP_FREE: -- rmap->me_flags |= XFS_RMAP_EXTENT_FREE; -+ map->me_flags |= XFS_RMAP_EXTENT_FREE; - break; - default: - ASSERT(0); -@@ -293,19 +293,12 @@ static int - xfs_trans_log_finish_rmap_update( - struct xfs_trans *tp, - struct xfs_rud_log_item *rudp, -- enum xfs_rmap_intent_type type, -- uint64_t owner, -- int whichfork, -- xfs_fileoff_t startoff, -- xfs_fsblock_t startblock, -- xfs_filblks_t blockcount, -- xfs_exntst_t state, -+ struct xfs_rmap_intent *ri, - struct xfs_btree_cur **pcur) - { - int error; - -- error = xfs_rmap_finish_one(tp, 
type, owner, whichfork, startoff, -- startblock, blockcount, state, pcur); -+ error = xfs_rmap_finish_one(tp, ri, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the -@@ -342,7 +335,7 @@ STATIC void - xfs_rmap_update_log_item( - struct xfs_trans *tp, - struct xfs_rui_log_item *ruip, -- struct xfs_rmap_intent *rmap) -+ struct xfs_rmap_intent *ri) - { - uint next_extent; - struct xfs_map_extent *map; -@@ -358,12 +351,12 @@ xfs_rmap_update_log_item( - next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1; - ASSERT(next_extent < ruip->rui_format.rui_nextents); - map = &ruip->rui_format.rui_extents[next_extent]; -- map->me_owner = rmap->ri_owner; -- map->me_startblock = rmap->ri_bmap.br_startblock; -- map->me_startoff = rmap->ri_bmap.br_startoff; -- map->me_len = rmap->ri_bmap.br_blockcount; -- xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork, -- rmap->ri_bmap.br_state); -+ map->me_owner = ri->ri_owner; -+ map->me_startblock = ri->ri_bmap.br_startblock; -+ map->me_startoff = ri->ri_bmap.br_startoff; -+ map->me_len = ri->ri_bmap.br_blockcount; -+ xfs_trans_set_rmap_flags(map, ri->ri_type, ri->ri_whichfork, -+ ri->ri_bmap.br_state); - } - - static struct xfs_log_item * -@@ -375,15 +368,15 @@ xfs_rmap_update_create_intent( - { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count); -- struct xfs_rmap_intent *rmap; -+ struct xfs_rmap_intent *ri; - - ASSERT(count > 0); - - xfs_trans_add_item(tp, &ruip->rui_item); - if (sort) - list_sort(mp, items, xfs_rmap_update_diff_items); -- list_for_each_entry(rmap, items, ri_list) -- xfs_rmap_update_log_item(tp, ruip, rmap); -+ list_for_each_entry(ri, items, ri_list) -+ xfs_rmap_update_log_item(tp, ruip, ri); - return &ruip->rui_item; - } - -@@ -405,16 +398,14 @@ xfs_rmap_update_finish_item( - struct list_head *item, - struct xfs_btree_cur **state) - { -- struct xfs_rmap_intent *rmap; -+ struct xfs_rmap_intent *ri; - int error; - -- rmap = container_of(item, struct xfs_rmap_intent, ri_list); -- error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), -- rmap->ri_type, rmap->ri_owner, rmap->ri_whichfork, -- rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock, -- rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state, -+ ri = container_of(item, struct xfs_rmap_intent, ri_list); -+ -+ error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri, - state); -- kmem_cache_free(xfs_rmap_intent_cache, rmap); -+ kmem_cache_free(xfs_rmap_intent_cache, ri); - return error; - } - -@@ -431,10 +422,10 @@ STATIC void - xfs_rmap_update_cancel_item( - struct list_head *item) - { -- struct xfs_rmap_intent *rmap; -+ struct xfs_rmap_intent *ri; - -- rmap = container_of(item, struct xfs_rmap_intent, ri_list); -- kmem_cache_free(xfs_rmap_intent_cache, rmap); -+ ri = container_of(item, struct xfs_rmap_intent, ri_list); -+ kmem_cache_free(xfs_rmap_intent_cache, ri); - } - - const struct xfs_defer_op_type xfs_rmap_update_defer_type = { -@@ -451,15 +442,15 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = { - static inline bool - xfs_rui_validate_map( - struct xfs_mount *mp, -- struct xfs_map_extent *rmap) -+ struct xfs_map_extent *map) - { - if (!xfs_has_rmapbt(mp)) - return false; - -- if (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS) -+ if (map->me_flags & ~XFS_RMAP_EXTENT_FLAGS) - return false; - -- switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { -+ switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { - case XFS_RMAP_EXTENT_MAP: - case XFS_RMAP_EXTENT_MAP_SHARED: - case 
XFS_RMAP_EXTENT_UNMAP: -@@ -473,14 +464,14 @@ xfs_rui_validate_map( - return false; - } - -- if (!XFS_RMAP_NON_INODE_OWNER(rmap->me_owner) && -- !xfs_verify_ino(mp, rmap->me_owner)) -+ if (!XFS_RMAP_NON_INODE_OWNER(map->me_owner) && -+ !xfs_verify_ino(mp, map->me_owner)) - return false; - -- if (!xfs_verify_fileext(mp, rmap->me_startoff, rmap->me_len)) -+ if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) - return false; - -- return xfs_verify_fsbext(mp, rmap->me_startblock, rmap->me_len); -+ return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); - } - - /* -@@ -493,15 +484,11 @@ xfs_rui_item_recover( - struct list_head *capture_list) - { - struct xfs_rui_log_item *ruip = RUI_ITEM(lip); -- struct xfs_map_extent *rmap; - struct xfs_rud_log_item *rudp; - struct xfs_trans *tp; - struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = lip->li_log->l_mp; -- enum xfs_rmap_intent_type type; -- xfs_exntst_t state; - int i; -- int whichfork; - int error = 0; - - /* -@@ -526,35 +513,34 @@ xfs_rui_item_recover( - rudp = xfs_trans_get_rud(tp, ruip); - - for (i = 0; i < ruip->rui_format.rui_nextents; i++) { -- rmap = &ruip->rui_format.rui_extents[i]; -- state = (rmap->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? -- XFS_EXT_UNWRITTEN : XFS_EXT_NORM; -- whichfork = (rmap->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? -- XFS_ATTR_FORK : XFS_DATA_FORK; -- switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { -+ struct xfs_rmap_intent fake = { }; -+ struct xfs_map_extent *map; -+ -+ map = &ruip->rui_format.rui_extents[i]; -+ switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { - case XFS_RMAP_EXTENT_MAP: -- type = XFS_RMAP_MAP; -+ fake.ri_type = XFS_RMAP_MAP; - break; - case XFS_RMAP_EXTENT_MAP_SHARED: -- type = XFS_RMAP_MAP_SHARED; -+ fake.ri_type = XFS_RMAP_MAP_SHARED; - break; - case XFS_RMAP_EXTENT_UNMAP: -- type = XFS_RMAP_UNMAP; -+ fake.ri_type = XFS_RMAP_UNMAP; - break; - case XFS_RMAP_EXTENT_UNMAP_SHARED: -- type = XFS_RMAP_UNMAP_SHARED; -+ fake.ri_type = XFS_RMAP_UNMAP_SHARED; - break; - case XFS_RMAP_EXTENT_CONVERT: -- type = XFS_RMAP_CONVERT; -+ fake.ri_type = XFS_RMAP_CONVERT; - break; - case XFS_RMAP_EXTENT_CONVERT_SHARED: -- type = XFS_RMAP_CONVERT_SHARED; -+ fake.ri_type = XFS_RMAP_CONVERT_SHARED; - break; - case XFS_RMAP_EXTENT_ALLOC: -- type = XFS_RMAP_ALLOC; -+ fake.ri_type = XFS_RMAP_ALLOC; - break; - case XFS_RMAP_EXTENT_FREE: -- type = XFS_RMAP_FREE; -+ fake.ri_type = XFS_RMAP_FREE; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, -@@ -563,13 +549,21 @@ xfs_rui_item_recover( - error = -EFSCORRUPTED; - goto abort_error; - } -- error = xfs_trans_log_finish_rmap_update(tp, rudp, type, -- rmap->me_owner, whichfork, -- rmap->me_startoff, rmap->me_startblock, -- rmap->me_len, state, &rcur); -+ -+ fake.ri_owner = map->me_owner; -+ fake.ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? -+ XFS_ATTR_FORK : XFS_DATA_FORK; -+ fake.ri_bmap.br_startblock = map->me_startblock; -+ fake.ri_bmap.br_startoff = map->me_startoff; -+ fake.ri_bmap.br_blockcount = map->me_len; -+ fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? 
-+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; -+ -+ error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake, -+ &rcur); - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, -- rmap, sizeof(*rmap)); -+ map, sizeof(*map)); - if (error) - goto abort_error; - -@@ -600,18 +594,18 @@ xfs_rui_item_relog( - { - struct xfs_rud_log_item *rudp; - struct xfs_rui_log_item *ruip; -- struct xfs_map_extent *extp; -+ struct xfs_map_extent *map; - unsigned int count; - - count = RUI_ITEM(intent)->rui_format.rui_nextents; -- extp = RUI_ITEM(intent)->rui_format.rui_extents; -+ map = RUI_ITEM(intent)->rui_format.rui_extents; - - tp->t_flags |= XFS_TRANS_DIRTY; - rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); - - ruip = xfs_rui_init(tp->t_mountp, count); -- memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp)); -+ memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); - atomic_set(&ruip->rui_next_extent, count); - xfs_trans_add_item(tp, &ruip->rui_item); - set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); -diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c -index f7faf6e70d7f..a3c6b1548723 100644 ---- a/fs/xfs/xfs_sysfs.c -+++ b/fs/xfs/xfs_sysfs.c -@@ -69,7 +69,7 @@ static struct attribute *xfs_mp_attrs[] = { - }; - ATTRIBUTE_GROUPS(xfs_mp); - --struct kobj_type xfs_mp_ktype = { -+const struct kobj_type xfs_mp_ktype = { - .release = xfs_sysfs_release, - .sysfs_ops = &xfs_sysfs_ops, - .default_groups = xfs_mp_groups, -@@ -266,7 +266,7 @@ static struct attribute *xfs_dbg_attrs[] = { - }; - ATTRIBUTE_GROUPS(xfs_dbg); - --struct kobj_type xfs_dbg_ktype = { -+const struct kobj_type xfs_dbg_ktype = { - .release = xfs_sysfs_release, - .sysfs_ops = &xfs_sysfs_ops, - .default_groups = xfs_dbg_groups, -@@ -324,7 +324,7 @@ static struct attribute *xfs_stats_attrs[] = { - }; - ATTRIBUTE_GROUPS(xfs_stats); - --struct kobj_type xfs_stats_ktype = { -+const struct kobj_type xfs_stats_ktype = { - .release = xfs_sysfs_release, - .sysfs_ops = &xfs_sysfs_ops, - .default_groups = xfs_stats_groups, -@@ -410,7 +410,7 @@ static struct attribute *xfs_log_attrs[] = { - }; - ATTRIBUTE_GROUPS(xfs_log); - --struct kobj_type xfs_log_ktype = { -+const struct kobj_type xfs_log_ktype = { - .release = xfs_sysfs_release, - .sysfs_ops = &xfs_sysfs_ops, - .default_groups = xfs_log_groups, -@@ -564,13 +564,13 @@ static struct attribute *xfs_error_attrs[] = { - }; - ATTRIBUTE_GROUPS(xfs_error); - --static struct kobj_type xfs_error_cfg_ktype = { -+static const struct kobj_type xfs_error_cfg_ktype = { - .release = xfs_sysfs_release, - .sysfs_ops = &xfs_sysfs_ops, - .default_groups = xfs_error_groups, - }; - --static struct kobj_type xfs_error_ktype = { -+static const struct kobj_type xfs_error_ktype = { - .release = xfs_sysfs_release, - .sysfs_ops = &xfs_sysfs_ops, - }; -diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h -index 513095e353a5..148893ebfdef 100644 ---- a/fs/xfs/xfs_sysfs.h -+++ b/fs/xfs/xfs_sysfs.h -@@ -7,10 +7,10 @@ - #ifndef __XFS_SYSFS_H__ - #define __XFS_SYSFS_H__ - --extern struct kobj_type xfs_mp_ktype; /* xfs_mount */ --extern struct kobj_type xfs_dbg_ktype; /* debug */ --extern struct kobj_type xfs_log_ktype; /* xlog */ --extern struct kobj_type xfs_stats_ktype; /* stats */ -+extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */ -+extern const struct kobj_type xfs_dbg_ktype; /* debug */ -+extern const struct kobj_type xfs_log_ktype; /* xlog */ -+extern const struct kobj_type xfs_stats_ktype; /* stats */ - - static inline 
struct xfs_kobj * - to_kobj(struct kobject *kobject) -@@ -28,7 +28,7 @@ xfs_sysfs_release(struct kobject *kobject) - static inline int - xfs_sysfs_init( - struct xfs_kobj *kobj, -- struct kobj_type *ktype, -+ const struct kobj_type *ktype, - struct xfs_kobj *parent_kobj, - const char *name) - { -diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h -index 421d1e504ac4..6b0e9ae7c513 100644 ---- a/fs/xfs/xfs_trace.h -+++ b/fs/xfs/xfs_trace.h -@@ -3207,17 +3207,14 @@ DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred); - - TRACE_EVENT(xfs_refcount_finish_one_leftover, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, -- int type, xfs_agblock_t agbno, xfs_extlen_t len, -- xfs_agblock_t new_agbno, xfs_extlen_t new_len), -- TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len), -+ int type, xfs_agblock_t agbno, xfs_extlen_t len), -+ TP_ARGS(mp, agno, type, agbno, len), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_agnumber_t, agno) - __field(int, type) - __field(xfs_agblock_t, agbno) - __field(xfs_extlen_t, len) -- __field(xfs_agblock_t, new_agbno) -- __field(xfs_extlen_t, new_len) - ), - TP_fast_assign( - __entry->dev = mp->m_super->s_dev; -@@ -3225,17 +3222,13 @@ TRACE_EVENT(xfs_refcount_finish_one_leftover, - __entry->type = type; - __entry->agbno = agbno; - __entry->len = len; -- __entry->new_agbno = new_agbno; -- __entry->new_len = new_len; - ), -- TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x new_agbno 0x%x new_fsbcount 0x%x", -+ TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, - __entry->agno, - __entry->agbno, -- __entry->len, -- __entry->new_agbno, -- __entry->new_len) -+ __entry->len) - ); - - /* simple inode-based error/%ip tracepoint class */ -diff --git a/include/linux/bio.h b/include/linux/bio.h -index c1da63f6c808..d766be7152e1 100644 ---- a/include/linux/bio.h -+++ b/include/linux/bio.h -@@ -12,6 +12,8 @@ - - #define BIO_MAX_VECS 256U - -+struct queue_limits; -+ - static inline unsigned int bio_max_segs(unsigned int nr_segs) - { - return min(nr_segs, BIO_MAX_VECS); -@@ -375,6 +377,8 @@ static inline void bip_set_seed(struct bio_integrity_payload *bip, - void bio_trim(struct bio *bio, sector_t offset, sector_t size); - extern struct bio *bio_split(struct bio *bio, int sectors, - gfp_t gfp, struct bio_set *bs); -+struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, -+ unsigned *segs, struct bio_set *bs, unsigned max_bytes); - - /** - * bio_next_split - get next @sectors from a bio, splitting if necessary -diff --git a/include/linux/iomap.h b/include/linux/iomap.h -index 0983dfc9a203..0f8123504e5e 100644 ---- a/include/linux/iomap.h -+++ b/include/linux/iomap.h -@@ -13,6 +13,7 @@ - struct address_space; - struct fiemap_extent_info; - struct inode; -+struct iomap_iter; - struct iomap_dio; - struct iomap_writepage_ctx; - struct iov_iter; -@@ -58,8 +59,7 @@ struct vm_fault; - #define IOMAP_F_SHARED (1U << 2) - #define IOMAP_F_MERGED (1U << 3) - #define IOMAP_F_BUFFER_HEAD (1U << 4) --#define IOMAP_F_ZONE_APPEND (1U << 5) --#define IOMAP_F_XATTR (1U << 6) -+#define IOMAP_F_XATTR (1U << 5) - - /* - * Flags set by the core iomap code during operations: -@@ -85,7 +85,7 @@ struct vm_fault; - */ - #define IOMAP_NULL_ADDR -1ULL /* addr is not valid */ - --struct iomap_page_ops; -+struct iomap_folio_ops; - - struct iomap { - u64 addr; /* disk offset of mapping, bytes */ -@@ -97,7 +97,7 @@ struct iomap { - struct dax_device *dax_dev; /* dax_dev for dax operations 
*/ - void *inline_data; - void *private; /* filesystem private */ -- const struct iomap_page_ops *page_ops; -+ const struct iomap_folio_ops *folio_ops; - u64 validity_cookie; /* used with .iomap_valid() */ - }; - -@@ -125,19 +125,20 @@ static inline bool iomap_inline_data_valid(const struct iomap *iomap) - } - - /* -- * When a filesystem sets page_ops in an iomap mapping it returns, page_prepare -- * and page_done will be called for each page written to. This only applies to -- * buffered writes as unbuffered writes will not typically have pages -+ * When a filesystem sets folio_ops in an iomap mapping it returns, get_folio -+ * and put_folio will be called for each folio written to. This only applies -+ * to buffered writes as unbuffered writes will not typically have folios - * associated with them. - * -- * When page_prepare succeeds, page_done will always be called to do any -- * cleanup work necessary. In that page_done call, @page will be NULL if the -- * associated page could not be obtained. -+ * When get_folio succeeds, put_folio will always be called to do any -+ * cleanup work necessary. put_folio is responsible for unlocking and putting -+ * @folio. - */ --struct iomap_page_ops { -- int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len); -- void (*page_done)(struct inode *inode, loff_t pos, unsigned copied, -- struct page *page); -+struct iomap_folio_ops { -+ struct folio *(*get_folio)(struct iomap_iter *iter, loff_t pos, -+ unsigned len); -+ void (*put_folio)(struct inode *inode, loff_t pos, unsigned copied, -+ struct folio *folio); - - /* - * Check that the cached iomap still maps correctly to the filesystem's -@@ -260,6 +261,7 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, - int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); - void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); - bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); -+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos); - bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); - void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); - int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, -diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h -index 6548b5b5aa60..75d7d22c3a27 100644 ---- a/include/trace/events/btrfs.h -+++ b/include/trace/events/btrfs.h -@@ -32,6 +32,7 @@ struct prelim_ref; - struct btrfs_space_info; - struct btrfs_raid_bio; - struct raid56_bio_trace_info; -+struct find_free_extent_ctl; - - #define show_ref_type(type) \ - __print_symbolic(type, \ -@@ -1241,76 +1242,156 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, - - TRACE_EVENT(find_free_extent, - -- TP_PROTO(const struct btrfs_root *root, u64 num_bytes, -- u64 empty_size, u64 data), -+ TP_PROTO(const struct btrfs_root *root, -+ const struct find_free_extent_ctl *ffe_ctl), - -- TP_ARGS(root, num_bytes, empty_size, data), -+ TP_ARGS(root, ffe_ctl), - - TP_STRUCT__entry_btrfs( - __field( u64, root_objectid ) - __field( u64, num_bytes ) - __field( u64, empty_size ) -- __field( u64, data ) -+ __field( u64, flags ) - ), - - TP_fast_assign_btrfs(root->fs_info, - __entry->root_objectid = root->root_key.objectid; -- __entry->num_bytes = num_bytes; -- __entry->empty_size = empty_size; -- __entry->data = data; -+ __entry->num_bytes = ffe_ctl->num_bytes; -+ __entry->empty_size = ffe_ctl->empty_size; -+ __entry->flags = ffe_ctl->flags; - ), - - 
TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)", - show_root_type(__entry->root_objectid), -- __entry->num_bytes, __entry->empty_size, __entry->data, -- __print_flags((unsigned long)__entry->data, "|", -+ __entry->num_bytes, __entry->empty_size, __entry->flags, -+ __print_flags((unsigned long)__entry->flags, "|", -+ BTRFS_GROUP_FLAGS)) -+); -+ -+TRACE_EVENT(find_free_extent_search_loop, -+ -+ TP_PROTO(const struct btrfs_root *root, -+ const struct find_free_extent_ctl *ffe_ctl), -+ -+ TP_ARGS(root, ffe_ctl), -+ -+ TP_STRUCT__entry_btrfs( -+ __field( u64, root_objectid ) -+ __field( u64, num_bytes ) -+ __field( u64, empty_size ) -+ __field( u64, flags ) -+ __field( u64, loop ) -+ ), -+ -+ TP_fast_assign_btrfs(root->fs_info, -+ __entry->root_objectid = root->root_key.objectid; -+ __entry->num_bytes = ffe_ctl->num_bytes; -+ __entry->empty_size = ffe_ctl->empty_size; -+ __entry->flags = ffe_ctl->flags; -+ __entry->loop = ffe_ctl->loop; -+ ), -+ -+ TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu", -+ show_root_type(__entry->root_objectid), -+ __entry->num_bytes, __entry->empty_size, __entry->flags, -+ __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), -+ __entry->loop) -+); -+ -+TRACE_EVENT(find_free_extent_have_block_group, -+ -+ TP_PROTO(const struct btrfs_root *root, -+ const struct find_free_extent_ctl *ffe_ctl, -+ const struct btrfs_block_group *block_group), -+ -+ TP_ARGS(root, ffe_ctl, block_group), -+ -+ TP_STRUCT__entry_btrfs( -+ __field( u64, root_objectid ) -+ __field( u64, num_bytes ) -+ __field( u64, empty_size ) -+ __field( u64, flags ) -+ __field( u64, loop ) -+ __field( bool, hinted ) -+ __field( u64, bg_start ) -+ __field( u64, bg_flags ) -+ ), -+ -+ TP_fast_assign_btrfs(root->fs_info, -+ __entry->root_objectid = root->root_key.objectid; -+ __entry->num_bytes = ffe_ctl->num_bytes; -+ __entry->empty_size = ffe_ctl->empty_size; -+ __entry->flags = ffe_ctl->flags; -+ __entry->loop = ffe_ctl->loop; -+ __entry->hinted = ffe_ctl->hinted; -+ __entry->bg_start = block_group->start; -+ __entry->bg_flags = block_group->flags; -+ ), -+ -+ TP_printk_btrfs( -+"root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu hinted=%d block_group=%llu bg_flags=%llu(%s)", -+ show_root_type(__entry->root_objectid), -+ __entry->num_bytes, __entry->empty_size, __entry->flags, -+ __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), -+ __entry->loop, __entry->hinted, -+ __entry->bg_start, __entry->bg_flags, -+ __print_flags((unsigned long)__entry->bg_flags, "|", - BTRFS_GROUP_FLAGS)) - ); - - DECLARE_EVENT_CLASS(btrfs__reserve_extent, - -- TP_PROTO(const struct btrfs_block_group *block_group, u64 start, -- u64 len), -+ TP_PROTO(const struct btrfs_block_group *block_group, -+ const struct find_free_extent_ctl *ffe_ctl), - -- TP_ARGS(block_group, start, len), -+ TP_ARGS(block_group, ffe_ctl), - - TP_STRUCT__entry_btrfs( - __field( u64, bg_objectid ) - __field( u64, flags ) -+ __field( int, bg_size_class ) - __field( u64, start ) - __field( u64, len ) -+ __field( u64, loop ) -+ __field( bool, hinted ) -+ __field( int, size_class ) - ), - - TP_fast_assign_btrfs(block_group->fs_info, - __entry->bg_objectid = block_group->start; - __entry->flags = block_group->flags; -- __entry->start = start; -- __entry->len = len; -+ __entry->bg_size_class = block_group->size_class; -+ __entry->start = ffe_ctl->search_start; -+ __entry->len = ffe_ctl->num_bytes; -+ __entry->loop = ffe_ctl->loop; -+ __entry->hinted = 
ffe_ctl->hinted; -+ __entry->size_class = ffe_ctl->size_class; - ), - -- TP_printk_btrfs("root=%llu(%s) block_group=%llu flags=%llu(%s) " -- "start=%llu len=%llu", -+ TP_printk_btrfs( -+"root=%llu(%s) block_group=%llu flags=%llu(%s) bg_size_class=%d start=%llu len=%llu loop=%llu hinted=%d size_class=%d", - show_root_type(BTRFS_EXTENT_TREE_OBJECTID), - __entry->bg_objectid, - __entry->flags, __print_flags((unsigned long)__entry->flags, - "|", BTRFS_GROUP_FLAGS), -- __entry->start, __entry->len) -+ __entry->bg_size_class, __entry->start, __entry->len, -+ __entry->loop, __entry->hinted, __entry->size_class) - ); - - DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, - -- TP_PROTO(const struct btrfs_block_group *block_group, u64 start, -- u64 len), -+ TP_PROTO(const struct btrfs_block_group *block_group, -+ const struct find_free_extent_ctl *ffe_ctl), - -- TP_ARGS(block_group, start, len) -+ TP_ARGS(block_group, ffe_ctl) - ); - - DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, - -- TP_PROTO(const struct btrfs_block_group *block_group, u64 start, -- u64 len), -+ TP_PROTO(const struct btrfs_block_group *block_group, -+ const struct find_free_extent_ctl *ffe_ctl), - -- TP_ARGS(block_group, start, len) -+ TP_ARGS(block_group, ffe_ctl) - ); - - TRACE_EVENT(btrfs_find_cluster, -diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h -index 77b426ae0064..ebccf6a6aa1b 100644 ---- a/include/trace/events/ext4.h -+++ b/include/trace/events/ext4.h -@@ -584,13 +584,6 @@ DECLARE_EVENT_CLASS(ext4__page_op, - (unsigned long) __entry->index) - ); - --DEFINE_EVENT(ext4__page_op, ext4_writepage, -- -- TP_PROTO(struct page *page), -- -- TP_ARGS(page) --); -- - DEFINE_EVENT(ext4__page_op, ext4_readpage, - - TP_PROTO(struct page *page), --- -2.40.0.rc2 - -From 31bc464783789781c2a6885b36f63fcb3751a5bb Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Fri, 10 Mar 2023 18:05:48 +0100 -Subject: [PATCH 08/16] Implement amd-pstate-epp and amd-pstate-guided driver - -Signed-off-by: Peter Jung ---- - .../admin-guide/kernel-parameters.txt | 33 +- - Documentation/admin-guide/pm/amd-pstate.rst | 95 ++- - drivers/acpi/cppc_acpi.c | 188 ++++- - drivers/cpufreq/amd-pstate.c | 794 +++++++++++++++++- - drivers/cpufreq/brcmstb-avs-cpufreq.c | 5 +- - drivers/cpufreq/cpufreq.c | 8 +- - drivers/cpufreq/mediatek-cpufreq-hw.c | 4 +- - drivers/cpufreq/omap-cpufreq.c | 4 +- - drivers/cpufreq/qcom-cpufreq-hw.c | 4 +- - include/acpi/cppc_acpi.h | 23 + - include/linux/amd-pstate.h | 34 + - include/linux/cpufreq.h | 2 +- - 12 files changed, 1136 insertions(+), 58 deletions(-) + .../admin-guide/kernel-parameters.txt | 40 ++-- + Documentation/admin-guide/pm/amd-pstate.rst | 31 ++- + drivers/acpi/cppc_acpi.c | 121 +++++++++++- + drivers/cpufreq/amd-pstate.c | 177 +++++++++++++----- + include/acpi/cppc_acpi.h | 11 ++ + include/linux/amd-pstate.h | 2 + + 6 files changed, 302 insertions(+), 80 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 9595abf34974..f39b8f05392c 100644 +index 4f6761a93715..bf2a402af231 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -339,6 +339,29 @@ @@ -26717,7 +9065,7 @@ index 9595abf34974..f39b8f05392c 100644 amijoy.map= [HW,JOY] Amiga joystick support Map of devices attached to JOY0DAT and JOY1DAT Format: , -@@ -7019,13 +7042,3 @@ +@@ -7068,20 +7091,3 @@ xmon commands. off xmon is disabled. 
@@ -26731,57 +9079,28 @@ index 9595abf34974..f39b8f05392c 100644 - management firmware translates the requests into actual - hardware states (core frequency, data fabric and memory - clocks etc.) +- active +- Use amd_pstate_epp driver instance as the scaling driver, +- driver provides a hint to the hardware if software wants +- to bias toward performance (0x0) or energy efficiency (0xff) +- to the CPPC firmware. then CPPC power algorithm will +- calculate the runtime workload and adjust the realtime cores +- frequency. diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst -index 5376d53faaa8..f24a90007e98 100644 +index 6e5298b521b1..1cf40f69278c 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst -@@ -230,8 +230,8 @@ with :c:macro:`MSR_AMD_CPPC_ENABLE` or ``cppc_set_enable``, it will respond - to the request from AMD P-States. +@@ -303,13 +303,18 @@ efficiency frequency management method on AMD processors. + AMD Pstate Driver Operation Modes + ================================= - --User Space Interface in ``sysfs`` --================================== -+User Space Interface in ``sysfs`` - Per-policy control -+====================================================== - - ``amd-pstate`` exposes several global attributes (files) in ``sysfs`` to - control its functionality at the system level. They are located in the -@@ -262,6 +262,25 @@ lowest non-linear performance in `AMD CPPC Performance Capability - `_.) - This attribute is read-only. - -+``energy_performance_available_preferences`` -+ -+A list of all the supported EPP preferences that could be used for -+``energy_performance_preference`` on this system. -+These profiles represent different hints that are provided -+to the low-level firmware about the user's desired energy vs efficiency -+tradeoff. ``default`` represents the epp value is set by platform -+firmware. This attribute is read-only. -+ -+``energy_performance_preference`` -+ -+The current energy performance preference can be read from this attribute. -+and user can change current preference according to energy or performance needs -+Please get all support profiles list from -+``energy_performance_available_preferences`` attribute, all the profiles are -+integer values defined between 0 to 255 when EPP feature is enabled by platform -+firmware, if EPP feature is disabled, driver will ignore the written value -+This attribute is read-write. -+ - Other performance and frequency values can be read back from - ``/sys/devices/system/cpu/cpuX/acpi_cppc/``, see :ref:`cppc_sysfs`. - -@@ -280,8 +299,35 @@ module which supports the new AMD P-States mechanism on most of the future AMD - platforms. The AMD P-States mechanism is the more performance and energy - efficiency frequency management method on AMD processors. - --Kernel Module Options for ``amd-pstate`` --========================================= -+ -+AMD Pstate Driver Operation Modes -+================================= -+ +-``amd_pstate`` CPPC has two operation modes: CPPC Autonomous(active) mode and +-CPPC non-autonomous(passive) mode. +-active mode and passive mode can be chosen by different kernel parameters. +-When in Autonomous mode, CPPC ignores requests done in the Desired Performance +-Target register and takes into account only the values set to the Minimum requested +-performance, Maximum requested performance, and Energy Performance Preference +-registers. 
When Autonomous is disabled, it only considers the Desired Performance Target. +``amd_pstate`` CPPC has 3 operation modes: autonomous (active) mode, +non-autonomous (passive) mode and guided autonomous (guided) mode. +Active/passive/guided mode can be chosen by different kernel parameters. @@ -26794,23 +9113,10 @@ index 5376d53faaa8..f24a90007e98 100644 +- In guided-autonomous mode, platform sets operating performance level + autonomously according to the current workload and within the limits set by + OS through min and max performance registers. -+ -+Active Mode -+------------ -+ -+``amd_pstate=active`` -+ -+This is the low-level firmware control mode which is implemented by ``amd_pstate_epp`` -+driver with ``amd_pstate=active`` passed to the kernel in the command line. -+In this mode, ``amd_pstate_epp`` driver provides a hint to the hardware if software -+wants to bias toward performance (0x0) or energy efficiency (0xff) to the CPPC firmware. -+then CPPC power algorithm will calculate the runtime workload and adjust the realtime -+cores frequency according to the power supply and thermal, core voltage and some other -+hardware conditions. - Passive Mode + Active Mode ------------ -@@ -297,6 +343,47 @@ to the Performance Reduction Tolerance register. Above the nominal performance l +@@ -338,6 +343,15 @@ to the Performance Reduction Tolerance register. Above the nominal performance l processor must provide at least nominal performance requested and go higher if current operating conditions allow. @@ -26823,123 +9129,27 @@ index 5376d53faaa8..f24a90007e98 100644 +is activated. In this mode, driver requests minimum and maximum performance +level and the platform autonomously selects a performance level in this range +and appropriate to the current workload. -+ -+User Space Interface in ``sysfs`` - General -+=========================================== -+ -+Global Attributes -+----------------- -+ -+``amd-pstate`` exposes several global attributes (files) in ``sysfs`` to -+control its functionality at the system level. They are located in the -+``/sys/devices/system/cpu/amd-pstate/`` directory and affect all CPUs. -+ -+``status`` -+ Operation mode of the driver: "active", "passive" or "disable". -+ -+ "active" -+ The driver is functional and in the ``active mode`` -+ -+ "passive" -+ The driver is functional and in the ``passive mode`` -+ + + User Space Interface in ``sysfs`` - General + =========================================== +@@ -358,6 +372,9 @@ control its functionality at the system level. They are located in the + "passive" + The driver is functional and in the ``passive mode`` + + "guided" + The driver is functional and in the ``guided mode`` + -+ "disable" -+ The driver is unregistered and not functional now. -+ -+ This attribute can be written to in order to change the driver's -+ operation mode or to unregister it. The string written to it must be -+ one of the possible values of it and, if successful, writing one of -+ these values to the sysfs file will cause the driver to switch over -+ to the operation mode represented by that string - or to be -+ unregistered in the "disable" case. + "disable" + The driver is unregistered and not functional now. 
- ``cpupower`` tool support for ``amd-pstate`` - =============================================== diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c -index 0f17b1c32718..0efdbeed6ada 100644 +index c51d3ccb4cca..02a4bfb54967 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c -@@ -1153,6 +1153,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) - return cppc_get_perf(cpunum, NOMINAL_PERF, nominal_perf); +@@ -1433,6 +1433,103 @@ int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) } + EXPORT_SYMBOL_GPL(cppc_set_epp_perf); -+/** -+ * cppc_get_epp_perf - Get the epp register value. -+ * @cpunum: CPU from which to get epp preference value. -+ * @epp_perf: Return address. -+ * -+ * Return: 0 for success, -EIO otherwise. -+ */ -+int cppc_get_epp_perf(int cpunum, u64 *epp_perf) -+{ -+ return cppc_get_perf(cpunum, ENERGY_PERF, epp_perf); -+} -+EXPORT_SYMBOL_GPL(cppc_get_epp_perf); -+ - /** - * cppc_get_perf_caps - Get a CPU's performance capabilities. - * @cpunum: CPU from which to get capabilities info. -@@ -1365,6 +1378,157 @@ int cppc_get_perf_ctrs(int cpunum, struct cppc_perf_fb_ctrs *perf_fb_ctrs) - } - EXPORT_SYMBOL_GPL(cppc_get_perf_ctrs); - -+/* -+ * Set Energy Performance Preference Register value through -+ * Performance Controls Interface -+ */ -+int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) -+{ -+ int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); -+ struct cpc_register_resource *epp_set_reg; -+ struct cpc_register_resource *auto_sel_reg; -+ struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); -+ struct cppc_pcc_data *pcc_ss_data = NULL; -+ int ret; -+ -+ if (!cpc_desc) { -+ pr_debug("No CPC descriptor for CPU:%d\n", cpu); -+ return -ENODEV; -+ } -+ -+ auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE]; -+ epp_set_reg = &cpc_desc->cpc_regs[ENERGY_PERF]; -+ -+ if (CPC_IN_PCC(epp_set_reg) || CPC_IN_PCC(auto_sel_reg)) { -+ if (pcc_ss_id < 0) { -+ pr_debug("Invalid pcc_ss_id for CPU:%d\n", cpu); -+ return -ENODEV; -+ } -+ -+ if (CPC_SUPPORTED(auto_sel_reg)) { -+ ret = cpc_write(cpu, auto_sel_reg, enable); -+ if (ret) -+ return ret; -+ } -+ -+ if (CPC_SUPPORTED(epp_set_reg)) { -+ ret = cpc_write(cpu, epp_set_reg, perf_ctrls->energy_perf); -+ if (ret) -+ return ret; -+ } -+ -+ pcc_ss_data = pcc_data[pcc_ss_id]; -+ -+ down_write(&pcc_ss_data->pcc_lock); -+ /* after writing CPC, transfer the ownership of PCC to platform */ -+ ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE); -+ up_write(&pcc_ss_data->pcc_lock); -+ } else { -+ ret = -ENOTSUPP; -+ pr_debug("_CPC in PCC is not supported\n"); -+ } -+ -+ return ret; -+} -+EXPORT_SYMBOL_GPL(cppc_set_epp_perf); -+ +/* + * cppc_get_auto_sel_caps - Read autonomous selection register. + * @cpunum : CPU from which to read register. @@ -27040,7 +9250,7 @@ index 0f17b1c32718..0efdbeed6ada 100644 /** * cppc_set_enable - Set to enable CPPC on the processor by writing the * Continuous Performance Control package EnableRegister field. 
-@@ -1420,7 +1584,7 @@ EXPORT_SYMBOL_GPL(cppc_set_enable); +@@ -1488,7 +1585,7 @@ EXPORT_SYMBOL_GPL(cppc_set_enable); int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) { struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); @@ -27049,7 +9259,7 @@ index 0f17b1c32718..0efdbeed6ada 100644 int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); struct cppc_pcc_data *pcc_ss_data = NULL; int ret = 0; -@@ -1431,6 +1595,8 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) +@@ -1499,6 +1596,8 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) } desired_reg = &cpc_desc->cpc_regs[DESIRED_PERF]; @@ -27058,7 +9268,7 @@ index 0f17b1c32718..0efdbeed6ada 100644 /* * This is Phase-I where we want to write to CPC registers -@@ -1439,7 +1605,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) +@@ -1507,7 +1606,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) * Since read_lock can be acquired by multiple CPUs simultaneously we * achieve that goal here */ @@ -27067,7 +9277,7 @@ index 0f17b1c32718..0efdbeed6ada 100644 if (pcc_ss_id < 0) { pr_debug("Invalid pcc_ss_id\n"); return -ENODEV; -@@ -1462,13 +1628,19 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) +@@ -1530,13 +1629,19 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) cpc_desc->write_cmd_status = 0; } @@ -27092,7 +9302,7 @@ index 0f17b1c32718..0efdbeed6ada 100644 up_read(&pcc_ss_data->pcc_lock); /* END Phase-I */ /* * This is Phase-II where we transfer the ownership of PCC to Platform -@@ -1516,7 +1688,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) +@@ -1584,7 +1689,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) * case during a CMD_READ and if there are pending writes it delivers * the write command before servicing the read command */ @@ -27102,207 +9312,19 @@ index 0f17b1c32718..0efdbeed6ada 100644 /* Update only if there are pending write commands */ if (pcc_ss_data->pending_pcc_write_cmd) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c -index c17bd845f5fc..f4f96baae500 100644 +index 73c7643b2697..7955cfc91c31 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c -@@ -59,8 +59,173 @@ - * we disable it by default to go acpi-cpufreq on these processors and add a - * module parameter to be able to enable it manually for debugging. - */ -+static struct cpufreq_driver *current_pstate_driver; - static struct cpufreq_driver amd_pstate_driver; --static int cppc_load __initdata; -+static struct cpufreq_driver amd_pstate_epp_driver; -+static int cppc_state = AMD_PSTATE_DISABLE; -+struct kobject *amd_pstate_kobj; -+ -+/* -+ * AMD Energy Preference Performance (EPP) -+ * The EPP is used in the CCLK DPM controller to drive -+ * the frequency that a core is going to operate during -+ * short periods of activity. 
EPP values will be utilized for -+ * different OS profiles (balanced, performance, power savings) -+ * display strings corresponding to EPP index in the -+ * energy_perf_strings[] -+ * index String -+ *------------------------------------- -+ * 0 default -+ * 1 performance -+ * 2 balance_performance -+ * 3 balance_power -+ * 4 power -+ */ -+enum energy_perf_value_index { -+ EPP_INDEX_DEFAULT = 0, -+ EPP_INDEX_PERFORMANCE, -+ EPP_INDEX_BALANCE_PERFORMANCE, -+ EPP_INDEX_BALANCE_POWERSAVE, -+ EPP_INDEX_POWERSAVE, -+}; -+ -+static const char * const energy_perf_strings[] = { -+ [EPP_INDEX_DEFAULT] = "default", -+ [EPP_INDEX_PERFORMANCE] = "performance", -+ [EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance", -+ [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power", -+ [EPP_INDEX_POWERSAVE] = "power", -+ NULL -+}; -+ -+static unsigned int epp_values[] = { -+ [EPP_INDEX_DEFAULT] = 0, -+ [EPP_INDEX_PERFORMANCE] = AMD_CPPC_EPP_PERFORMANCE, -+ [EPP_INDEX_BALANCE_PERFORMANCE] = AMD_CPPC_EPP_BALANCE_PERFORMANCE, -+ [EPP_INDEX_BALANCE_POWERSAVE] = AMD_CPPC_EPP_BALANCE_POWERSAVE, -+ [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE, -+ }; -+ +@@ -106,6 +106,8 @@ static unsigned int epp_values[] = { + [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE, + }; + +typedef int (*cppc_mode_transition_fn)(int); + -+static inline int get_mode_idx_from_str(const char *str, size_t size) -+{ -+ int i; -+ -+ for (i=0; i < AMD_PSTATE_MAX; i++) { -+ if (!strncmp(str, amd_pstate_mode_string[i], size)) -+ return i; -+ } -+ return -EINVAL; -+} -+ -+static DEFINE_MUTEX(amd_pstate_limits_lock); -+static DEFINE_MUTEX(amd_pstate_driver_lock); -+ -+static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) -+{ -+ u64 epp; -+ int ret; -+ -+ if (boot_cpu_has(X86_FEATURE_CPPC)) { -+ if (!cppc_req_cached) { -+ epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, -+ &cppc_req_cached); -+ if (epp) -+ return epp; -+ } -+ epp = (cppc_req_cached >> 24) & 0xFF; -+ } else { -+ ret = cppc_get_epp_perf(cpudata->cpu, &epp); -+ if (ret < 0) { -+ pr_debug("Could not retrieve energy perf value (%d)\n", ret); -+ return -EIO; -+ } -+ } -+ -+ return (s16)(epp & 0xff); -+} -+ -+static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata) -+{ -+ s16 epp; -+ int index = -EINVAL; -+ -+ epp = amd_pstate_get_epp(cpudata, 0); -+ if (epp < 0) -+ return epp; -+ -+ switch (epp) { -+ case AMD_CPPC_EPP_PERFORMANCE: -+ index = EPP_INDEX_PERFORMANCE; -+ break; -+ case AMD_CPPC_EPP_BALANCE_PERFORMANCE: -+ index = EPP_INDEX_BALANCE_PERFORMANCE; -+ break; -+ case AMD_CPPC_EPP_BALANCE_POWERSAVE: -+ index = EPP_INDEX_BALANCE_POWERSAVE; -+ break; -+ case AMD_CPPC_EPP_POWERSAVE: -+ index = EPP_INDEX_POWERSAVE; -+ break; -+ default: -+ break; -+ } -+ -+ return index; -+} -+ -+static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) -+{ -+ int ret; -+ struct cppc_perf_ctrls perf_ctrls; -+ -+ if (boot_cpu_has(X86_FEATURE_CPPC)) { -+ u64 value = READ_ONCE(cpudata->cppc_req_cached); -+ -+ value &= ~GENMASK_ULL(31, 24); -+ value |= (u64)epp << 24; -+ WRITE_ONCE(cpudata->cppc_req_cached, value); -+ -+ ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); -+ if (!ret) -+ cpudata->epp_cached = epp; -+ } else { -+ perf_ctrls.energy_perf = epp; -+ ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); -+ if (ret) { -+ pr_debug("failed to set energy perf value (%d)\n", ret); -+ return ret; -+ } -+ cpudata->epp_cached = epp; -+ } -+ -+ return ret; -+} -+ -+static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, -+ int 
pref_index) -+{ -+ int epp = -EINVAL; -+ int ret; -+ -+ if (!pref_index) { -+ pr_debug("EPP pref_index is invalid\n"); -+ return -EINVAL; -+ } -+ -+ if (epp == -EINVAL) -+ epp = epp_values[pref_index]; -+ -+ if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { -+ pr_debug("EPP cannot be set under performance policy\n"); -+ return -EBUSY; -+ } -+ -+ ret = amd_pstate_set_epp(cpudata, epp); -+ -+ return ret; -+} - - static inline int pstate_enable(bool enable) + static inline int get_mode_idx_from_str(const char *str, size_t size) { -@@ -70,11 +235,21 @@ static inline int pstate_enable(bool enable) - static int cppc_enable(bool enable) - { - int cpu, ret = 0; -+ struct cppc_perf_ctrls perf_ctrls; - - for_each_present_cpu(cpu) { - ret = cppc_set_enable(cpu, enable); - if (ret) - return ret; -+ -+ /* Enable autonomous mode for EPP */ -+ if (cppc_state == AMD_PSTATE_ACTIVE) { -+ /* Set desired perf as zero to allow EPP firmware control */ -+ perf_ctrls.desired_perf = 0; -+ ret = cppc_set_perf(cpu, &perf_ctrls); -+ if (ret) -+ return ret; -+ } - } - - return ret; -@@ -135,7 +310,22 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) + int i; +@@ -308,7 +310,22 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) cppc_perf.lowest_nonlinear_perf); WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf); @@ -27326,7 +9348,7 @@ index c17bd845f5fc..f4f96baae500 100644 } DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf); -@@ -212,12 +402,18 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) +@@ -385,12 +402,18 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) } static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, @@ -27346,7 +9368,7 @@ index c17bd845f5fc..f4f96baae500 100644 value &= ~AMD_CPPC_MIN_PERF(~0L); value |= AMD_CPPC_MIN_PERF(min_perf); -@@ -272,7 +468,7 @@ static int amd_pstate_target(struct cpufreq_policy *policy, +@@ -445,7 +468,7 @@ static int amd_pstate_target(struct cpufreq_policy *policy, cpufreq_freq_transition_begin(policy, &freqs); amd_pstate_update(cpudata, min_perf, des_perf, @@ -27355,7 +9377,7 @@ index c17bd845f5fc..f4f96baae500 100644 cpufreq_freq_transition_end(policy, &freqs, false); return 0; -@@ -306,7 +502,8 @@ static void amd_pstate_adjust_perf(unsigned int cpu, +@@ -479,7 +502,8 @@ static void amd_pstate_adjust_perf(unsigned int cpu, if (max_perf < min_perf) max_perf = min_perf; @@ -27365,99 +9387,10 @@ index c17bd845f5fc..f4f96baae500 100644 cpufreq_cpu_put(policy); } -@@ -418,7 +615,7 @@ static void amd_pstate_boost_init(struct amd_cpudata *cpudata) - return; - - cpudata->boost_supported = true; -- amd_pstate_driver.boost_enabled = true; -+ current_pstate_driver->boost_enabled = true; +@@ -816,6 +840,98 @@ static ssize_t show_energy_performance_preference( + return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]); } - static void amd_perf_ctl_reset(unsigned int cpu) -@@ -501,6 +698,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) - policy->driver_data = cpudata; - - amd_pstate_boost_init(cpudata); -+ if (!current_pstate_driver->adjust_perf) -+ current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; - - return 0; - -@@ -561,7 +760,7 @@ static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy, - if (max_freq < 0) - return max_freq; - -- return sprintf(&buf[0], "%u\n", max_freq); -+ return sysfs_emit(buf, "%u\n", max_freq); - } - - static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *policy, -@@ -574,7 +773,7 @@ static 
ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *poli - if (freq < 0) - return freq; - -- return sprintf(&buf[0], "%u\n", freq); -+ return sysfs_emit(buf, "%u\n", freq); - } - - /* -@@ -589,13 +788,208 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, - - perf = READ_ONCE(cpudata->highest_perf); - -- return sprintf(&buf[0], "%u\n", perf); -+ return sysfs_emit(buf, "%u\n", perf); -+} -+ -+static ssize_t show_energy_performance_available_preferences( -+ struct cpufreq_policy *policy, char *buf) -+{ -+ int i = 0; -+ int offset = 0; -+ -+ while (energy_perf_strings[i] != NULL) -+ offset += sysfs_emit_at(buf, offset, "%s ", energy_perf_strings[i++]); -+ -+ sysfs_emit_at(buf, offset, "\n"); -+ -+ return offset; -+} -+ -+static ssize_t store_energy_performance_preference( -+ struct cpufreq_policy *policy, const char *buf, size_t count) -+{ -+ struct amd_cpudata *cpudata = policy->driver_data; -+ char str_preference[21]; -+ ssize_t ret; -+ -+ ret = sscanf(buf, "%20s", str_preference); -+ if (ret != 1) -+ return -EINVAL; -+ -+ ret = match_string(energy_perf_strings, -1, str_preference); -+ if (ret < 0) -+ return -EINVAL; -+ -+ mutex_lock(&amd_pstate_limits_lock); -+ ret = amd_pstate_set_energy_pref_index(cpudata, ret); -+ mutex_unlock(&amd_pstate_limits_lock); -+ -+ return ret ?: count; -+} -+ -+static ssize_t show_energy_performance_preference( -+ struct cpufreq_policy *policy, char *buf) -+{ -+ struct amd_cpudata *cpudata = policy->driver_data; -+ int preference; -+ -+ preference = amd_pstate_get_energy_pref_index(cpudata); -+ if (preference < 0) -+ return preference; -+ -+ return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]); -+} -+ +static void amd_pstate_driver_cleanup(void) +{ + amd_pstate_enable(false); @@ -27550,633 +9483,117 @@ index c17bd845f5fc..f4f96baae500 100644 + }, +}; + -+static ssize_t amd_pstate_show_status(char *buf) -+{ -+ if (!current_pstate_driver) -+ return sysfs_emit(buf, "disable\n"); -+ -+ return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]); -+} -+ -+static int amd_pstate_update_status(const char *buf, size_t size) -+{ -+ int mode_idx; -+ -+ if (size > strlen("passive") || size < strlen("active")) -+ return -EINVAL; -+ -+ mode_idx = get_mode_idx_from_str(buf, size); -+ -+ if (mode_idx < 0 || mode_idx >= AMD_PSTATE_MAX) -+ return -EINVAL; -+ -+ if (mode_state_machine[cppc_state][mode_idx]) -+ return mode_state_machine[cppc_state][mode_idx](mode_idx); -+ -+ return 0; -+} -+ -+static ssize_t show_status(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ ssize_t ret; -+ -+ mutex_lock(&amd_pstate_driver_lock); -+ ret = amd_pstate_show_status(buf); -+ mutex_unlock(&amd_pstate_driver_lock); -+ -+ return ret; -+} -+ -+static ssize_t store_status(struct kobject *a, struct kobj_attribute *b, -+ const char *buf, size_t count) -+{ -+ char *p = memchr(buf, '\n', count); -+ int ret; -+ -+ mutex_lock(&amd_pstate_driver_lock); -+ ret = amd_pstate_update_status(buf, p ? p - buf : count); -+ mutex_unlock(&amd_pstate_driver_lock); -+ -+ return ret < 0 ? 
ret : count; + static ssize_t amd_pstate_show_status(char *buf) + { + if (!current_pstate_driver) +@@ -824,57 +940,22 @@ static ssize_t amd_pstate_show_status(char *buf) + return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]); } - cpufreq_freq_attr_ro(amd_pstate_max_freq); - cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); - - cpufreq_freq_attr_ro(amd_pstate_highest_perf); -+cpufreq_freq_attr_rw(energy_performance_preference); -+cpufreq_freq_attr_ro(energy_performance_available_preferences); -+define_one_global_rw(status); - - static struct freq_attr *amd_pstate_attr[] = { - &amd_pstate_max_freq, -@@ -604,6 +998,313 @@ static struct freq_attr *amd_pstate_attr[] = { - NULL, - }; - -+static struct freq_attr *amd_pstate_epp_attr[] = { -+ &amd_pstate_max_freq, -+ &amd_pstate_lowest_nonlinear_freq, -+ &amd_pstate_highest_perf, -+ &energy_performance_preference, -+ &energy_performance_available_preferences, -+ NULL, -+}; -+ -+static struct attribute *pstate_global_attributes[] = { -+ &status.attr, -+ NULL -+}; -+ -+static const struct attribute_group amd_pstate_global_attr_group = { -+ .attrs = pstate_global_attributes, -+}; -+ -+static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) -+{ -+ int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; -+ struct amd_cpudata *cpudata; -+ struct device *dev; -+ u64 value; -+ -+ /* -+ * Resetting PERF_CTL_MSR will put the CPU in P0 frequency, -+ * which is ideal for initialization process. -+ */ -+ amd_perf_ctl_reset(policy->cpu); -+ dev = get_cpu_device(policy->cpu); -+ if (!dev) -+ return -ENODEV; -+ -+ cpudata = kzalloc(sizeof(*cpudata), GFP_KERNEL); -+ if (!cpudata) -+ return -ENOMEM; -+ -+ cpudata->cpu = policy->cpu; -+ cpudata->epp_policy = 0; -+ -+ ret = amd_pstate_init_perf(cpudata); -+ if (ret) -+ goto free_cpudata1; -+ -+ min_freq = amd_get_min_freq(cpudata); -+ max_freq = amd_get_max_freq(cpudata); -+ nominal_freq = amd_get_nominal_freq(cpudata); -+ lowest_nonlinear_freq = amd_get_lowest_nonlinear_freq(cpudata); -+ if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) { -+ dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n", -+ min_freq, max_freq); -+ ret = -EINVAL; -+ goto free_cpudata1; -+ } -+ -+ policy->cpuinfo.min_freq = min_freq; -+ policy->cpuinfo.max_freq = max_freq; -+ /* It will be updated by governor */ -+ policy->cur = policy->cpuinfo.min_freq; -+ -+ /* Initial processor data capability frequencies */ -+ cpudata->max_freq = max_freq; -+ cpudata->min_freq = min_freq; -+ cpudata->nominal_freq = nominal_freq; -+ cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq; -+ -+ policy->driver_data = cpudata; -+ -+ cpudata->epp_cached = amd_pstate_get_epp(cpudata, 0); -+ -+ policy->min = policy->cpuinfo.min_freq; -+ policy->max = policy->cpuinfo.max_freq; -+ -+ /* -+ * Set the policy to powersave to provide a valid fallback value in case -+ * the default cpufreq governor is neither powersave nor performance. 
-+ */ -+ policy->policy = CPUFREQ_POLICY_POWERSAVE; -+ -+ if (boot_cpu_has(X86_FEATURE_CPPC)) { -+ policy->fast_switch_possible = true; -+ ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); -+ if (ret) -+ return ret; -+ WRITE_ONCE(cpudata->cppc_req_cached, value); -+ -+ ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &value); -+ if (ret) -+ return ret; -+ WRITE_ONCE(cpudata->cppc_cap1_cached, value); -+ } -+ amd_pstate_boost_init(cpudata); -+ -+ return 0; -+ -+free_cpudata1: -+ kfree(cpudata); -+ return ret; -+} -+ -+static int amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) -+{ -+ pr_debug("CPU %d exiting\n", policy->cpu); -+ policy->fast_switch_possible = false; -+ return 0; -+} -+ -+static void amd_pstate_epp_init(unsigned int cpu) -+{ -+ struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); -+ struct amd_cpudata *cpudata = policy->driver_data; -+ u32 max_perf, min_perf; -+ u64 value; -+ s16 epp; -+ -+ max_perf = READ_ONCE(cpudata->highest_perf); -+ min_perf = READ_ONCE(cpudata->lowest_perf); -+ -+ value = READ_ONCE(cpudata->cppc_req_cached); -+ -+ if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) -+ min_perf = max_perf; -+ -+ /* Initial min/max values for CPPC Performance Controls Register */ -+ value &= ~AMD_CPPC_MIN_PERF(~0L); -+ value |= AMD_CPPC_MIN_PERF(min_perf); -+ -+ value &= ~AMD_CPPC_MAX_PERF(~0L); -+ value |= AMD_CPPC_MAX_PERF(max_perf); -+ -+ /* CPPC EPP feature require to set zero to the desire perf bit */ -+ value &= ~AMD_CPPC_DES_PERF(~0L); -+ value |= AMD_CPPC_DES_PERF(0); -+ -+ if (cpudata->epp_policy == cpudata->policy) -+ goto skip_epp; -+ -+ cpudata->epp_policy = cpudata->policy; -+ -+ /* Get BIOS pre-defined epp value */ -+ epp = amd_pstate_get_epp(cpudata, value); -+ if (epp < 0) { -+ /** -+ * This return value can only be negative for shared_memory -+ * systems where EPP register read/write not supported. 
-+ */ -+ goto skip_epp; -+ } -+ -+ if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) -+ epp = 0; -+ -+ /* Set initial EPP value */ -+ if (boot_cpu_has(X86_FEATURE_CPPC)) { -+ value &= ~GENMASK_ULL(31, 24); -+ value |= (u64)epp << 24; -+ } -+ -+ WRITE_ONCE(cpudata->cppc_req_cached, value); -+ amd_pstate_set_epp(cpudata, epp); -+skip_epp: -+ cpufreq_cpu_put(policy); -+} -+ -+static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) -+{ -+ struct amd_cpudata *cpudata = policy->driver_data; -+ -+ if (!policy->cpuinfo.max_freq) -+ return -ENODEV; -+ -+ pr_debug("set_policy: cpuinfo.max %u policy->max %u\n", -+ policy->cpuinfo.max_freq, policy->max); -+ -+ cpudata->policy = policy->policy; -+ -+ amd_pstate_epp_init(policy->cpu); -+ -+ return 0; -+} -+ -+static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) -+{ -+ struct cppc_perf_ctrls perf_ctrls; -+ u64 value, max_perf; -+ int ret; -+ -+ ret = amd_pstate_enable(true); -+ if (ret) -+ pr_err("failed to enable amd pstate during resume, return %d\n", ret); -+ -+ value = READ_ONCE(cpudata->cppc_req_cached); -+ max_perf = READ_ONCE(cpudata->highest_perf); -+ -+ if (boot_cpu_has(X86_FEATURE_CPPC)) { -+ wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); -+ } else { -+ perf_ctrls.max_perf = max_perf; -+ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached); -+ cppc_set_perf(cpudata->cpu, &perf_ctrls); -+ } -+} -+ -+static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) -+{ -+ struct amd_cpudata *cpudata = policy->driver_data; -+ -+ pr_debug("AMD CPU Core %d going online\n", cpudata->cpu); -+ -+ if (cppc_state == AMD_PSTATE_ACTIVE) { -+ amd_pstate_epp_reenable(cpudata); -+ cpudata->suspended = false; -+ } -+ -+ return 0; -+} -+ -+static void amd_pstate_epp_offline(struct cpufreq_policy *policy) -+{ -+ struct amd_cpudata *cpudata = policy->driver_data; -+ struct cppc_perf_ctrls perf_ctrls; -+ int min_perf; -+ u64 value; -+ -+ min_perf = READ_ONCE(cpudata->lowest_perf); -+ value = READ_ONCE(cpudata->cppc_req_cached); -+ -+ mutex_lock(&amd_pstate_limits_lock); -+ if (boot_cpu_has(X86_FEATURE_CPPC)) { -+ cpudata->epp_policy = CPUFREQ_POLICY_UNKNOWN; -+ -+ /* Set max perf same as min perf */ -+ value &= ~AMD_CPPC_MAX_PERF(~0L); -+ value |= AMD_CPPC_MAX_PERF(min_perf); -+ value &= ~AMD_CPPC_MIN_PERF(~0L); -+ value |= AMD_CPPC_MIN_PERF(min_perf); -+ wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); -+ } else { -+ perf_ctrls.desired_perf = 0; -+ perf_ctrls.max_perf = min_perf; -+ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE); -+ cppc_set_perf(cpudata->cpu, &perf_ctrls); -+ } -+ mutex_unlock(&amd_pstate_limits_lock); -+} -+ -+static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) -+{ -+ struct amd_cpudata *cpudata = policy->driver_data; -+ -+ pr_debug("AMD CPU Core %d going offline\n", cpudata->cpu); -+ -+ if (cpudata->suspended) -+ return 0; -+ -+ if (cppc_state == AMD_PSTATE_ACTIVE) -+ amd_pstate_epp_offline(policy); -+ -+ return 0; -+} -+ -+static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy) -+{ -+ cpufreq_verify_within_cpu_limits(policy); -+ pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min); -+ return 0; -+} -+ -+static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) -+{ -+ struct amd_cpudata *cpudata = policy->driver_data; -+ int ret; -+ -+ /* avoid suspending when EPP is not enabled */ -+ if (cppc_state != AMD_PSTATE_ACTIVE) -+ return 0; -+ -+ /* set this flag to avoid setting core offline*/ -+ 
cpudata->suspended = true; -+ -+ /* disable CPPC in lowlevel firmware */ -+ ret = amd_pstate_enable(false); -+ if (ret) -+ pr_err("failed to suspend, return %d\n", ret); -+ -+ return 0; -+} -+ -+static int amd_pstate_epp_resume(struct cpufreq_policy *policy) -+{ -+ struct amd_cpudata *cpudata = policy->driver_data; -+ -+ if (cpudata->suspended) { -+ mutex_lock(&amd_pstate_limits_lock); -+ -+ /* enable amd pstate from suspend state*/ -+ amd_pstate_epp_reenable(cpudata); -+ -+ mutex_unlock(&amd_pstate_limits_lock); -+ -+ cpudata->suspended = false; -+ } -+ -+ return 0; -+} -+ - static struct cpufreq_driver amd_pstate_driver = { - .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS, - .verify = amd_pstate_verify, -@@ -617,6 +1318,20 @@ static struct cpufreq_driver amd_pstate_driver = { - .attr = amd_pstate_attr, - }; - -+static struct cpufreq_driver amd_pstate_epp_driver = { -+ .flags = CPUFREQ_CONST_LOOPS, -+ .verify = amd_pstate_epp_verify_policy, -+ .setpolicy = amd_pstate_epp_set_policy, -+ .init = amd_pstate_epp_cpu_init, -+ .exit = amd_pstate_epp_cpu_exit, -+ .offline = amd_pstate_epp_cpu_offline, -+ .online = amd_pstate_epp_cpu_online, -+ .suspend = amd_pstate_epp_suspend, -+ .resume = amd_pstate_epp_resume, -+ .name = "amd_pstate_epp", -+ .attr = amd_pstate_epp_attr, -+}; -+ - static int __init amd_pstate_init(void) +-static void amd_pstate_driver_cleanup(void) +-{ +- current_pstate_driver = NULL; +-} +- + static int amd_pstate_update_status(const char *buf, size_t size) { - int ret; -@@ -626,10 +1341,10 @@ static int __init amd_pstate_init(void) - /* - * by default the pstate driver is disabled to load - * enable the amd_pstate passive mode driver explicitly -- * with amd_pstate=passive in kernel command line -+ * with amd_pstate=passive or other modes in kernel command line - */ -- if (!cppc_load) { -- pr_debug("driver load is disabled, boot with amd_pstate=passive to enable this\n"); -+ if (cppc_state == AMD_PSTATE_DISABLE) { -+ pr_info("driver load is disabled, boot with specific mode to enable this\n"); - return -ENODEV; - } +- int ret = 0; + int mode_idx; -@@ -645,7 +1360,8 @@ static int __init amd_pstate_init(void) +- if (size > 7 || size < 6) ++ if (size > strlen("passive") || size < strlen("active")) + return -EINVAL; +- mode_idx = get_mode_idx_from_str(buf, size); + +- switch(mode_idx) { +- case AMD_PSTATE_DISABLE: +- if (!current_pstate_driver) +- return -EINVAL; +- if (cppc_state == AMD_PSTATE_ACTIVE) +- return -EBUSY; +- cpufreq_unregister_driver(current_pstate_driver); +- amd_pstate_driver_cleanup(); +- break; +- case AMD_PSTATE_PASSIVE: +- if (current_pstate_driver) { +- if (current_pstate_driver == &amd_pstate_driver) +- return 0; +- cpufreq_unregister_driver(current_pstate_driver); +- cppc_state = AMD_PSTATE_PASSIVE; +- current_pstate_driver = &amd_pstate_driver; +- } ++ mode_idx = get_mode_idx_from_str(buf, size); + +- ret = cpufreq_register_driver(current_pstate_driver); +- break; +- case AMD_PSTATE_ACTIVE: +- if (current_pstate_driver) { +- if (current_pstate_driver == &amd_pstate_epp_driver) +- return 0; +- cpufreq_unregister_driver(current_pstate_driver); +- current_pstate_driver = &amd_pstate_epp_driver; +- cppc_state = AMD_PSTATE_ACTIVE; +- } ++ if (mode_idx < 0 || mode_idx >= AMD_PSTATE_MAX) ++ return -EINVAL; + +- ret = cpufreq_register_driver(current_pstate_driver); +- break; +- default: +- ret = -EINVAL; +- break; +- } ++ if (mode_state_machine[cppc_state][mode_idx]) ++ return mode_state_machine[cppc_state][mode_idx](mode_idx); + +- return ret; ++ 
return 0; + } + + static ssize_t show_status(struct kobject *kobj, +@@ -1279,7 +1360,7 @@ static int __init amd_pstate_init(void) /* capability check */ if (boot_cpu_has(X86_FEATURE_CPPC)) { pr_debug("AMD CPPC MSR based functionality is supported\n"); -- amd_pstate_driver.adjust_perf = amd_pstate_adjust_perf; +- if (cppc_state == AMD_PSTATE_PASSIVE) + if (cppc_state != AMD_PSTATE_ACTIVE) -+ current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; + current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; } else { pr_debug("AMD CPPC shared memory based functionality is supported\n"); - static_call_update(amd_pstate_enable, cppc_enable); -@@ -656,31 +1372,63 @@ static int __init amd_pstate_init(void) - /* enable amd pstate feature */ - ret = amd_pstate_enable(true); - if (ret) { -- pr_err("failed to enable amd-pstate with return %d\n", ret); -+ pr_err("failed to enable with return %d\n", ret); - return ret; - } +@@ -1341,7 +1422,7 @@ static int __init amd_pstate_param(char *str) + if (cppc_state == AMD_PSTATE_ACTIVE) + current_pstate_driver = &amd_pstate_epp_driver; -- ret = cpufreq_register_driver(&amd_pstate_driver); -+ ret = cpufreq_register_driver(current_pstate_driver); - if (ret) -- pr_err("failed to register amd_pstate_driver with return %d\n", -- ret); -+ pr_err("failed to register with return %d\n", ret); -+ -+ amd_pstate_kobj = kobject_create_and_add("amd_pstate", &cpu_subsys.dev_root->kobj); -+ if (!amd_pstate_kobj) { -+ ret = -EINVAL; -+ pr_err("global sysfs registration failed.\n"); -+ goto kobject_free; -+ } - -+ ret = sysfs_create_group(amd_pstate_kobj, &amd_pstate_global_attr_group); -+ if (ret) { -+ pr_err("sysfs attribute export failed with error %d.\n", ret); -+ goto global_attr_free; -+ } -+ -+ return ret; -+ -+global_attr_free: -+ kobject_put(amd_pstate_kobj); -+kobject_free: -+ cpufreq_unregister_driver(current_pstate_driver); - return ret; - } - device_initcall(amd_pstate_init); - - static int __init amd_pstate_param(char *str) - { -+ size_t size; -+ int mode_idx; -+ - if (!str) - return -EINVAL; - -- if (!strcmp(str, "disable")) { -- cppc_load = 0; -- pr_info("driver is explicitly disabled\n"); -- } else if (!strcmp(str, "passive")) -- cppc_load = 1; -+ size = strlen(str); -+ mode_idx = get_mode_idx_from_str(str, size); - -- return 0; -+ if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) { -+ cppc_state = mode_idx; -+ if (cppc_state == AMD_PSTATE_DISABLE) -+ pr_info("driver is explicitly disabled\n"); -+ -+ if (cppc_state == AMD_PSTATE_ACTIVE) -+ current_pstate_driver = &amd_pstate_epp_driver; -+ +- if (cppc_state == AMD_PSTATE_PASSIVE) + if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED) -+ current_pstate_driver = &amd_pstate_driver; -+ -+ return 0; -+ } -+ -+ return -EINVAL; - } - early_param("amd_pstate", amd_pstate_param); + current_pstate_driver = &amd_pstate_driver; -diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c -index 4153150e20db..ffea6402189d 100644 ---- a/drivers/cpufreq/brcmstb-avs-cpufreq.c -+++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c -@@ -751,10 +751,7 @@ static int brcm_avs_cpufreq_probe(struct platform_device *pdev) - - static int brcm_avs_cpufreq_remove(struct platform_device *pdev) - { -- int ret; -- -- ret = cpufreq_unregister_driver(&brcm_avs_driver); -- WARN_ON(ret); -+ cpufreq_unregister_driver(&brcm_avs_driver); - - brcm_avs_prepare_uninit(pdev); - -diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c -index 7e56a42750ea..85a0bea2dbf1 100644 
---- a/drivers/cpufreq/cpufreq.c -+++ b/drivers/cpufreq/cpufreq.c -@@ -2904,12 +2904,12 @@ EXPORT_SYMBOL_GPL(cpufreq_register_driver); - * Returns zero if successful, and -EINVAL if the cpufreq_driver is - * currently not initialised. - */ --int cpufreq_unregister_driver(struct cpufreq_driver *driver) -+void cpufreq_unregister_driver(struct cpufreq_driver *driver) - { - unsigned long flags; - -- if (!cpufreq_driver || (driver != cpufreq_driver)) -- return -EINVAL; -+ if (WARN_ON(!cpufreq_driver || (driver != cpufreq_driver))) -+ return; - - pr_debug("unregistering driver %s\n", driver->name); - -@@ -2926,8 +2926,6 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) - - write_unlock_irqrestore(&cpufreq_driver_lock, flags); - cpus_read_unlock(); -- -- return 0; - } - EXPORT_SYMBOL_GPL(cpufreq_unregister_driver); - -diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c -index f80339779084..f21a9e3df53d 100644 ---- a/drivers/cpufreq/mediatek-cpufreq-hw.c -+++ b/drivers/cpufreq/mediatek-cpufreq-hw.c -@@ -317,7 +317,9 @@ static int mtk_cpufreq_hw_driver_probe(struct platform_device *pdev) - - static int mtk_cpufreq_hw_driver_remove(struct platform_device *pdev) - { -- return cpufreq_unregister_driver(&cpufreq_mtk_hw_driver); -+ cpufreq_unregister_driver(&cpufreq_mtk_hw_driver); -+ -+ return 0; - } - - static const struct of_device_id mtk_cpufreq_hw_match[] = { -diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c -index 1b50df06c6bc..81649a1969b6 100644 ---- a/drivers/cpufreq/omap-cpufreq.c -+++ b/drivers/cpufreq/omap-cpufreq.c -@@ -184,7 +184,9 @@ static int omap_cpufreq_probe(struct platform_device *pdev) - - static int omap_cpufreq_remove(struct platform_device *pdev) - { -- return cpufreq_unregister_driver(&omap_driver); -+ cpufreq_unregister_driver(&omap_driver); -+ -+ return 0; - } - - static struct platform_driver omap_cpufreq_platdrv = { -diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c -index d3f55ca06ed3..2f581d2d617d 100644 ---- a/drivers/cpufreq/qcom-cpufreq-hw.c -+++ b/drivers/cpufreq/qcom-cpufreq-hw.c -@@ -770,7 +770,9 @@ static int qcom_cpufreq_hw_driver_probe(struct platform_device *pdev) - - static int qcom_cpufreq_hw_driver_remove(struct platform_device *pdev) - { -- return cpufreq_unregister_driver(&cpufreq_qcom_hw_driver); -+ cpufreq_unregister_driver(&cpufreq_qcom_hw_driver); -+ -+ return 0; - } - - static struct platform_driver qcom_cpufreq_hw_driver = { + return 0; diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h -index c5614444031f..6126c977ece0 100644 +index 6b487a5bd638..6126c977ece0 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h -@@ -108,12 +108,15 @@ struct cppc_perf_caps { - u32 lowest_nonlinear_perf; +@@ -109,6 +109,7 @@ struct cppc_perf_caps { u32 lowest_freq; u32 nominal_freq; -+ u32 energy_perf; + u32 energy_perf; + bool auto_sel; }; struct cppc_perf_ctrls { - u32 max_perf; - u32 min_perf; - u32 desired_perf; -+ u32 energy_perf; - }; - - struct cppc_perf_fb_ctrs { -@@ -149,6 +152,10 @@ extern bool cpc_ffh_supported(void); - extern bool cpc_supported_by_cpu(void); - extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); +@@ -153,6 +154,8 @@ extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); extern int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val); -+extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf); -+extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, 
bool enable); + extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf); + extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable); +extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps); +extern int cppc_set_auto_sel(int cpu, bool enable); #else /* !CONFIG_ACPI_CPPC_LIB */ static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) { -@@ -202,6 +209,22 @@ static inline int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) +@@ -214,6 +217,14 @@ static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf) { return -ENOTSUPP; } -+static inline int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) -+{ -+ return -ENOTSUPP; -+} -+static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf) -+{ -+ return -ENOTSUPP; -+} +static inline int cppc_set_auto_sel(int cpu, bool enable) +{ + return -ENOTSUPP; @@ -28189,84 +9606,32 @@ index c5614444031f..6126c977ece0 100644 #endif /* _CPPC_ACPI_H*/ diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h -index 1c4b8659f171..c10ebf8c42e6 100644 +index f5f22418e64b..c10ebf8c42e6 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h -@@ -12,6 +12,11 @@ - - #include - -+#define AMD_CPPC_EPP_PERFORMANCE 0x00 -+#define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80 -+#define AMD_CPPC_EPP_BALANCE_POWERSAVE 0xBF -+#define AMD_CPPC_EPP_POWERSAVE 0xFF -+ - /********************************************************************* - * AMD P-state INTERFACE * - *********************************************************************/ -@@ -47,6 +52,10 @@ struct amd_aperf_mperf { - * @prev: Last Aperf/Mperf/tsc count value read from register - * @freq: current cpu frequency value - * @boost_supported: check whether the Processor or SBIOS supports boost mode -+ * @epp_policy: Last saved policy used to set energy-performance preference -+ * @epp_cached: Cached CPPC energy-performance preference value -+ * @policy: Cpufreq policy value -+ * @cppc_cap1_cached Cached MSR_AMD_CPPC_CAP1 register value - * - * The amd_cpudata is key private data for each CPU thread in AMD P-State, and - * represents all the attributes and goals that AMD P-State requests at runtime. 
-@@ -72,6 +81,31 @@ struct amd_cpudata { - - u64 freq; - bool boost_supported; -+ -+ /* EPP feature related attributes*/ -+ s16 epp_policy; -+ s16 epp_cached; -+ u32 policy; -+ u64 cppc_cap1_cached; -+ bool suspended; +@@ -97,6 +97,7 @@ enum amd_pstate_mode { + AMD_PSTATE_DISABLE = 0, + AMD_PSTATE_PASSIVE, + AMD_PSTATE_ACTIVE, ++ AMD_PSTATE_GUIDED, + AMD_PSTATE_MAX, }; -+/* -+ * enum amd_pstate_mode - driver working mode of amd pstate -+ */ -+enum amd_pstate_mode { -+ AMD_PSTATE_DISABLE = 0, -+ AMD_PSTATE_PASSIVE, -+ AMD_PSTATE_ACTIVE, -+ AMD_PSTATE_GUIDED, -+ AMD_PSTATE_MAX, -+}; -+ -+static const char * const amd_pstate_mode_string[] = { -+ [AMD_PSTATE_DISABLE] = "disable", -+ [AMD_PSTATE_PASSIVE] = "passive", -+ [AMD_PSTATE_ACTIVE] = "active", +@@ -104,6 +105,7 @@ static const char * const amd_pstate_mode_string[] = { + [AMD_PSTATE_DISABLE] = "disable", + [AMD_PSTATE_PASSIVE] = "passive", + [AMD_PSTATE_ACTIVE] = "active", + [AMD_PSTATE_GUIDED] = "guided", -+ NULL, -+}; + NULL, + }; #endif /* _LINUX_AMD_PSTATE_H */ -diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h -index 6a94a6eaad27..65623233ab2f 100644 ---- a/include/linux/cpufreq.h -+++ b/include/linux/cpufreq.h -@@ -448,7 +448,7 @@ struct cpufreq_driver { - #define CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING BIT(6) - - int cpufreq_register_driver(struct cpufreq_driver *driver_data); --int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); -+void cpufreq_unregister_driver(struct cpufreq_driver *driver_data); - - bool cpufreq_driver_test_flags(u16 flags); - const char *cpufreq_get_current_driver(void); -- -2.40.0.rc2 +2.40.0 -From 501028b1bc1da95eeb61b26a0ee82ef93873d5d7 Mon Sep 17 00:00:00 2001 +From 3c01171fc23cece3cf05ed3380e25fa10cd3393d Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 22 Jan 2023 13:41:50 +0100 -Subject: [PATCH 09/16] ksm +Date: Sun, 9 Apr 2023 21:22:26 +0200 +Subject: [PATCH 06/10] ksm Signed-off-by: Peter Jung --- @@ -28292,9 +9657,9 @@ Signed-off-by: Peter Jung include/linux/syscalls.h | 1 + include/uapi/asm-generic/unistd.h | 5 +- kernel/sys_ni.c | 1 + - mm/ksm.c | 88 +++++++++------ - mm/madvise.c | 113 ++++++++++++++++++++ - 24 files changed, 198 insertions(+), 34 deletions(-) + mm/ksm.c | 82 +++++++++----- + mm/madvise.c | 117 ++++++++++++++++++++ + 24 files changed, 199 insertions(+), 31 deletions(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 8ebacf37a8cf..c9d25f85d86d 100644 @@ -28470,7 +9835,7 @@ index 52c94ab5c205..1518e261d882 100644 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/include/linux/ksm.h b/include/linux/ksm.h -index 7e232ba59b86..632a1a792ebb 100644 +index 7e232ba59b86..57ed92987717 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -16,6 +16,10 @@ @@ -28478,9 +9843,9 @@ index 7e232ba59b86..632a1a792ebb 100644 #ifdef CONFIG_KSM +int ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma, -+ unsigned long *vm_flags); ++ const vm_flags_t *vm_flags); +int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start, -+ unsigned long end, unsigned long *vm_flags); ++ unsigned long end, const vm_flags_t *vm_flags); int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); int __ksm_enter(struct mm_struct *mm); @@ -28526,17 +9891,17 @@ index 860b2dcf3ac4..810e1fcaff94 100644 COND_SYSCALL(mbind); COND_SYSCALL(get_mempolicy); diff --git a/mm/ksm.c b/mm/ksm.c -index 
ee60890cf9b1..bc920121bce9 100644 +index 82029f1d454b..0c206bd8007d 100644 --- a/mm/ksm.c +++ b/mm/ksm.c -@@ -2582,54 +2582,78 @@ static int ksm_scan_thread(void *nothing) +@@ -2576,52 +2576,76 @@ static int ksm_scan_thread(void *nothing) return 0; } -int ksm_madvise(struct vm_area_struct *vma, unsigned long start, - unsigned long end, int advice, unsigned long *vm_flags) +int ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma, -+ unsigned long *vm_flags) ++ const vm_flags_t *vm_flags) { - struct mm_struct *mm = vma->vm_mm; int err; @@ -28584,24 +9949,12 @@ index ee60890cf9b1..bc920121bce9 100644 + if (err) + return err; + } - -- *vm_flags |= VM_MERGEABLE; -- break; -+ *vm_flags |= VM_MERGEABLE; - -- case MADV_UNMERGEABLE: -- if (!(*vm_flags & VM_MERGEABLE)) -- return 0; /* just ignore the advice */ ++ + return 0; +} - -- if (vma->anon_vma) { -- err = unmerge_ksm_pages(vma, start, end); -- if (err) -- return err; -- } ++ +int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start, -+ unsigned long end, unsigned long *vm_flags) ++ unsigned long end, const vm_flags_t *vm_flags) +{ + int err; + @@ -28613,9 +9966,6 @@ index ee60890cf9b1..bc920121bce9 100644 + if (err) + return err; + } - -- *vm_flags &= ~VM_MERGEABLE; -+ *vm_flags &= ~VM_MERGEABLE; + + return 0; +} @@ -28631,20 +9981,30 @@ index ee60890cf9b1..bc920121bce9 100644 + err = ksm_madvise_merge(mm, vma, vm_flags); + if (err) + return err; -+ break; -+ -+ case MADV_UNMERGEABLE: + + *vm_flags |= VM_MERGEABLE; + break; + + case MADV_UNMERGEABLE: +- if (!(*vm_flags & VM_MERGEABLE)) +- return 0; /* just ignore the advice */ +- +- if (vma->anon_vma) { +- err = unmerge_ksm_pages(vma, start, end); +- if (err) +- return err; +- } + err = ksm_madvise_unmerge(vma, start, end, vm_flags); + if (err) + return err; - break; - } + *vm_flags &= ~VM_MERGEABLE; + break; diff --git a/mm/madvise.c b/mm/madvise.c -index b6ea204d4e23..0064dcafb812 100644 +index 340125d08c03..36e756355f04 100644 --- a/mm/madvise.c +++ b/mm/madvise.c -@@ -1527,3 +1527,116 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, +@@ -1522,3 +1522,120 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, out: return ret; } @@ -28710,9 +10070,13 @@ index b6ea204d4e23..0064dcafb812 100644 + switch (behaviour) { + case MADV_MERGEABLE: + ret = ksm_madvise_merge(vma->vm_mm, vma, &vma->vm_flags); ++ if (!ret) ++ vm_flags_set(vma, VM_MERGEABLE); + break; + case MADV_UNMERGEABLE: + ret = ksm_madvise_unmerge(vma, vma->vm_start, vma->vm_end, &vma->vm_flags); ++ if (!ret) ++ vm_flags_clear(vma, VM_MERGEABLE); + break; + default: + /* look, ma, no brain */ @@ -28762,54 +10126,27 @@ index b6ea204d4e23..0064dcafb812 100644 +subsys_initcall(pmadv_sysfs_init); +#endif /* CONFIG_KSM */ -- -2.40.0.rc2 +2.40.0 -From abf71738a315ea5ad029cd3976ec7b2d9456c432 Mon Sep 17 00:00:00 2001 +From d349cbde64499039351b1bb146999948a1319b71 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 10 Mar 2023 18:06:12 +0100 -Subject: [PATCH 10/16] maple-lru +Date: Sun, 9 Apr 2023 21:24:33 +0200 +Subject: [PATCH 07/10] maple-lru Signed-off-by: Peter Jung --- - Documentation/mm/multigen_lru.rst | 128 +++- - include/linux/fs.h | 2 + - include/linux/maple_tree.h | 6 - - include/linux/memcontrol.h | 10 + - include/linux/mm_inline.h | 19 +- - include/linux/mmzone.h | 124 +++- - lib/maple_tree.c | 149 ++-- - mm/fadvise.c | 5 +- - mm/memcontrol.c | 12 + - mm/memory.c | 7 +- - mm/page_alloc.c | 1 + - mm/rmap.c | 42 +- - mm/vmscan.c | 1083 
++++++++++++++++++----------- - mm/workingset.c | 4 +- - tools/testing/radix-tree/maple.c | 18 +- - 15 files changed, 1066 insertions(+), 544 deletions(-) + Documentation/mm/multigen_lru.rst | 44 +++++++++++++++++++++++--- + include/linux/mmzone.h | 2 +- + lib/maple_tree.c | 51 ++++++------------------------- + mm/vmscan.c | 24 ++++++--------- + tools/testing/radix-tree/maple.c | 24 +++++++++++++++ + 5 files changed, 84 insertions(+), 61 deletions(-) diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst -index d7062c6a8946..52ed5092022f 100644 +index 5f1f6ecbb79b..52ed5092022f 100644 --- a/Documentation/mm/multigen_lru.rst +++ b/Documentation/mm/multigen_lru.rst -@@ -89,21 +89,22 @@ variables are monotonically increasing. - - Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)`` - bits in order to fit into the gen counter in ``folio->flags``. Each --truncated generation number is an index to ``lrugen->lists[]``. The -+truncated generation number is an index to ``lrugen->folios[]``. The - sliding window technique is used to track at least ``MIN_NR_GENS`` and - at most ``MAX_NR_GENS`` generations. The gen counter stores a value - within ``[1, MAX_NR_GENS]`` while a page is on one of --``lrugen->lists[]``; otherwise it stores zero. -+``lrugen->folios[]``; otherwise it stores zero. - - Each generation is divided into multiple tiers. A page accessed ``N`` - times through file descriptors is in tier ``order_base_2(N)``. Unlike --generations, tiers do not have dedicated ``lrugen->lists[]``. In -+generations, tiers do not have dedicated ``lrugen->folios[]``. In - contrast to moving across generations, which requires the LRU lock, - moving across tiers only involves atomic operations on +@@ -103,7 +103,8 @@ moving across tiers only involves atomic operations on ``folio->flags`` and therefore has a negligible cost. A feedback loop modeled after the PID controller monitors refaults over all the tiers from anon and file types and decides which tiers from which types to @@ -28819,34 +10156,10 @@ index d7062c6a8946..52ed5092022f 100644 There are two conceptually independent procedures: the aging and the eviction. They form a closed-loop system, i.e., the page reclaim. -@@ -127,7 +128,7 @@ page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``. - Eviction - -------- - The eviction consumes old generations. Given an ``lruvec``, it --increments ``min_seq`` when ``lrugen->lists[]`` indexed by -+increments ``min_seq`` when ``lrugen->folios[]`` indexed by - ``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to - evict from, it first compares ``min_seq[]`` to select the older type. - If both types are equally old, it selects the one whose first tier has -@@ -141,15 +142,124 @@ loop has detected outlying refaults from the tier this page is in. To - this end, the feedback loop uses the first tier as the baseline, for - the reason stated earlier. +@@ -156,6 +157,27 @@ This time-based approach has the following advantages: + and memory sizes. + 2. It is more reliable because it is directly wired to the OOM killer. -+Working set protection -+---------------------- -+Each generation is timestamped at birth. If ``lru_gen_min_ttl`` is -+set, an ``lruvec`` is protected from the eviction when its oldest -+generation was born within ``lru_gen_min_ttl`` milliseconds. In other -+words, it prevents the working set of ``lru_gen_min_ttl`` milliseconds -+from getting evicted. The OOM killer is triggered if this working set -+cannot be kept in memory. 
-+ -+This time-based approach has the following advantages: -+ -+1. It is easier to configure because it is agnostic to applications -+ and memory sizes. -+2. It is more reliable because it is directly wired to the OOM killer. -+ +``mm_struct`` list +------------------ +An ``mm_struct`` list is maintained for each memcg, and an @@ -28868,36 +10181,22 @@ index d7062c6a8946..52ed5092022f 100644 +context switches so that page table walkers can skip processes that +have been sleeping since the last iteration. + -+Rmap/PT walk feedback -+--------------------- -+Searching the rmap for PTEs mapping each page on an LRU list (to test -+and clear the accessed bit) can be expensive because pages from -+different VMAs (PA space) are not cache friendly to the rmap (VA -+space). For workloads mostly using mapped pages, searching the rmap -+can incur the highest CPU cost in the reclaim path. -+ -+``lru_gen_look_around()`` exploits spatial locality to reduce the -+trips into the rmap. It scans the adjacent PTEs of a young PTE and -+promotes hot pages. If the scan was done cacheline efficiently, it -+adds the PMD entry pointing to the PTE table to the Bloom filter. This -+forms a feedback loop between the eviction and the aging. -+ + Rmap/PT walk feedback + --------------------- + Searching the rmap for PTEs mapping each page on an LRU list (to test +@@ -170,7 +192,7 @@ promotes hot pages. If the scan was done cacheline efficiently, it + adds the PMD entry pointing to the PTE table to the Bloom filter. This + forms a feedback loop between the eviction and the aging. + +-Bloom Filters +Bloom filters -+------------- -+Bloom filters are a space and memory efficient data structure for set -+membership test, i.e., test if an element is not in the set or may be -+in the set. -+ -+In the eviction path, specifically, in ``lru_gen_look_around()``, if a -+PMD has a sufficient number of hot pages, its address is placed in the -+filter. In the aging path, set membership means that the PTE range -+will be scanned for young pages. -+ -+Note that Bloom filters are probabilistic on set membership. If a test -+is false positive, the cost is an additional scan of a range of PTEs, -+which may yield hot pages anyway. Parameters of the filter itself can -+control the false positive rate in the limit. -+ + ------------- + Bloom filters are a space and memory efficient data structure for set + membership test, i.e., test if an element is not in the set or may be +@@ -186,6 +208,18 @@ is false positive, the cost is an additional scan of a range of PTEs, + which may yield hot pages anyway. Parameters of the filter itself can + control the false positive rate in the limit. + +PID controller +-------------- +A feedback loop modeled after the Proportional-Integral-Derivative @@ -28910,41 +10209,10 @@ index d7062c6a8946..52ed5092022f 100644 +varying memory pressure. It calculates a moving average for each new +generation to avoid being permanently locked in a suboptimal state. + -+Memcg LRU -+--------- -+An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs, -+since each node and memcg combination has an LRU of folios (see -+``mem_cgroup_lruvec()``). Its goal is to improve the scalability of -+global reclaim, which is critical to system-wide memory overcommit in -+data centers. Note that memcg LRU only applies to global reclaim. -+ -+The basic structure of an memcg LRU can be understood by an analogy to -+the active/inactive LRU (of folios): -+ -+1. 
It has the young and the old (generations), i.e., the counterparts -+ to the active and the inactive; -+2. The increment of ``max_seq`` triggers promotion, i.e., the -+ counterpart to activation; -+3. Other events trigger similar operations, e.g., offlining an memcg -+ triggers demotion, i.e., the counterpart to deactivation. -+ -+In terms of global reclaim, it has two distinct features: -+ -+1. Sharding, which allows each thread to start at a random memcg (in -+ the old generation) and improves parallelism; -+2. Eventual fairness, which allows direct reclaim to bail out at will -+ and reduces latency without affecting fairness over some time. -+ -+In terms of traversing memcgs during global reclaim, it improves the -+best-case complexity from O(n) to O(1) and does not affect the -+worst-case complexity O(n). Therefore, on average, it has a sublinear -+complexity. -+ - Summary - ------- --The multi-gen LRU can be disassembled into the following parts: -+The multi-gen LRU (of folios) can be disassembled into the following -+parts: + Memcg LRU + --------- + An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs, +@@ -223,9 +257,9 @@ parts: * Generations * Rmap walks @@ -28957,696 +10225,105 @@ index d7062c6a8946..52ed5092022f 100644 The aging and the eviction form a producer-consumer model; specifically, the latter drives the former by the sliding window over -diff --git a/include/linux/fs.h b/include/linux/fs.h -index c1769a2c5d70..d353c262d669 100644 ---- a/include/linux/fs.h -+++ b/include/linux/fs.h -@@ -166,6 +166,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, - /* File supports DIRECT IO */ - #define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000) - -+#define FMODE_NOREUSE ((__force fmode_t)0x800000) -+ - /* File was opened by fanotify and shouldn't generate fanotify events */ - #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) - -diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h -index e594db58a0f1..815a27661517 100644 ---- a/include/linux/maple_tree.h -+++ b/include/linux/maple_tree.h -@@ -12,7 +12,6 @@ - #include - #include - /* #define CONFIG_MAPLE_RCU_DISABLED */ --/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */ - - /* - * Allocated nodes are mutable until they have been inserted into the tree, -@@ -483,9 +482,6 @@ static inline bool mas_is_paused(struct ma_state *mas) - return mas->node == MAS_PAUSE; - } - --void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas); --void mas_dup_store(struct ma_state *mas, void *entry); -- - /* - * This finds an empty area from the highest address to the lowest. - * AKA "Topdown" version, -@@ -517,7 +513,6 @@ static inline void mas_reset(struct ma_state *mas) - * entry. - * - * Note: may return the zero entry. 
-- * - */ - #define mas_for_each(__mas, __entry, __max) \ - while (((__entry) = mas_find((__mas), (__max))) != NULL) -@@ -639,7 +634,6 @@ static inline void mt_set_in_rcu(struct maple_tree *mt) - } - - static inline unsigned int mt_height(const struct maple_tree *mt) -- - { - return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; - } -diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h -index 85dc9b88ea37..8e0be0680005 100644 ---- a/include/linux/memcontrol.h -+++ b/include/linux/memcontrol.h -@@ -794,6 +794,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) - percpu_ref_put(&objcg->refcnt); - } - -+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) -+{ -+ return !memcg || css_tryget(&memcg->css); -+} -+ - static inline void mem_cgroup_put(struct mem_cgroup *memcg) - { - if (memcg) -@@ -1301,6 +1306,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) - { - } - -+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) -+{ -+ return true; -+} -+ - static inline void mem_cgroup_put(struct mem_cgroup *memcg) - { - } -diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h -index ff3f3f23f649..de1e622dd366 100644 ---- a/include/linux/mm_inline.h -+++ b/include/linux/mm_inline.h -@@ -178,7 +178,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli - int zone = folio_zonenum(folio); - int delta = folio_nr_pages(folio); - enum lru_list lru = type * LRU_INACTIVE_FILE; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); - VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); -@@ -224,7 +224,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, - int gen = folio_lru_gen(folio); - int type = folio_is_file_lru(folio); - int zone = folio_zonenum(folio); -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); - -@@ -256,9 +256,9 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, - lru_gen_update_size(lruvec, folio, -1, gen); - /* for folio_rotate_reclaimable() */ - if (reclaiming) -- list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]); -+ list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); - else -- list_add(&folio->lru, &lrugen->lists[gen][type][zone]); -+ list_add(&folio->lru, &lrugen->folios[gen][type][zone]); - - return true; - } -@@ -577,4 +577,15 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, - #endif - } - -+static inline bool vma_has_recency(struct vm_area_struct *vma) -+{ -+ if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) -+ return false; -+ -+ if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) -+ return false; -+ -+ return true; -+} -+ - #endif diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index cd28a100d9e4..70bd7f55bdd2 100644 +index 9fb1b03b83b2..bf8786d45b31 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h -@@ -7,6 +7,7 @@ - - #include - #include -+#include - #include - #include - #include -@@ -312,7 +313,7 @@ enum lruvec_flags { - * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An - * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the - * corresponding generation. The gen counter in folio->flags stores gen+1 while -- * a page is on one of lrugen->lists[]. 
Otherwise it stores 0. -+ * a page is on one of lrugen->folios[]. Otherwise it stores 0. - * - * A page is added to the youngest generation on faulting. The aging needs to - * check the accessed bit at least twice before handing this page over to the -@@ -324,8 +325,8 @@ enum lruvec_flags { - * rest of generations, if they exist, are considered inactive. See - * lru_gen_is_active(). - * -- * PG_active is always cleared while a page is on one of lrugen->lists[] so that -- * the aging needs not to worry about it. And it's set again when a page -+ * PG_active is always cleared while a page is on one of lrugen->folios[] so -+ * that the aging needs not to worry about it. And it's set again when a page - * considered active is isolated for non-reclaiming purposes, e.g., migration. - * See lru_gen_add_folio() and lru_gen_del_folio(). - * -@@ -404,7 +405,7 @@ enum { - * The number of pages in each generation is eventually consistent and therefore - * can be transiently negative when reset_batch_size() is pending. - */ --struct lru_gen_struct { -+struct lru_gen_folio { - /* the aging increments the youngest generation number */ - unsigned long max_seq; - /* the eviction increments the oldest generation numbers */ -@@ -412,7 +413,7 @@ struct lru_gen_struct { - /* the birth time of each generation in jiffies */ - unsigned long timestamps[MAX_NR_GENS]; - /* the multi-gen LRU lists, lazily sorted on eviction */ -- struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+ struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; - /* the multi-gen LRU sizes, eventually consistent */ - long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; - /* the exponential moving average of refaulted */ -@@ -426,6 +427,14 @@ struct lru_gen_struct { - atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; - /* whether the multi-gen LRU is enabled */ - bool enabled; -+#ifdef CONFIG_MEMCG -+ /* the memcg generation this lru_gen_folio belongs to */ -+ u8 gen; -+ /* the list segment this lru_gen_folio belongs to */ -+ u8 seg; -+ /* per-node lru_gen_folio list for global reclaim */ -+ struct hlist_nulls_node list; -+#endif - }; - - enum { -@@ -461,7 +470,7 @@ struct lru_gen_mm_state { - struct lru_gen_mm_walk { - /* the lruvec under reclaim */ - struct lruvec *lruvec; -- /* unstable max_seq from lru_gen_struct */ -+ /* unstable max_seq from lru_gen_folio */ - unsigned long max_seq; - /* the next address within an mm to scan */ - unsigned long next_addr; -@@ -479,12 +488,87 @@ void lru_gen_init_lruvec(struct lruvec *lruvec); - void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); - - #ifdef CONFIG_MEMCG -+ -+/* -+ * For each node, memcgs are divided into two generations: the old and the -+ * young. For each generation, memcgs are randomly sharded into multiple bins -+ * to improve scalability. For each bin, the hlist_nulls is virtually divided -+ * into three segments: the head, the tail and the default. -+ * -+ * An onlining memcg is added to the tail of a random bin in the old generation. -+ * The eviction starts at the head of a random bin in the old generation. The -+ * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes -+ * the old generation, is incremented when all its bins become empty. -+ * -+ * There are four operations: -+ * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its -+ * current generation (old or young) and updates its "seg" to "head"; -+ * 2. 
MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its -+ * current generation (old or young) and updates its "seg" to "tail"; -+ * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old -+ * generation, updates its "gen" to "old" and resets its "seg" to "default"; -+ * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the -+ * young generation, updates its "gen" to "young" and resets its "seg" to -+ * "default". -+ * -+ * The events that trigger the above operations are: -+ * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; -+ * 2. The first attempt to reclaim an memcg below low, which triggers -+ * MEMCG_LRU_TAIL; -+ * 3. The first attempt to reclaim an memcg below reclaimable size threshold, -+ * which triggers MEMCG_LRU_TAIL; -+ * 4. The second attempt to reclaim an memcg below reclaimable size threshold, -+ * which triggers MEMCG_LRU_YOUNG; -+ * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; -+ * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; -+ * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. -+ * -+ * Note that memcg LRU only applies to global reclaim, and the round-robin -+ * incrementing of their max_seq counters ensures the eventual fairness to all -+ * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). -+ */ -+#define MEMCG_NR_GENS 2 -+#define MEMCG_NR_BINS 8 -+ -+struct lru_gen_memcg { -+ /* the per-node memcg generation counter */ -+ unsigned long seq; -+ /* each memcg has one lru_gen_folio per node */ -+ unsigned long nr_memcgs[MEMCG_NR_GENS]; -+ /* per-node lru_gen_folio list for global reclaim */ -+ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; -+ /* protects the above */ -+ spinlock_t lock; -+}; -+ -+void lru_gen_init_pgdat(struct pglist_data *pgdat); -+ - void lru_gen_init_memcg(struct mem_cgroup *memcg); - void lru_gen_exit_memcg(struct mem_cgroup *memcg); --#endif -+void lru_gen_online_memcg(struct mem_cgroup *memcg); -+void lru_gen_offline_memcg(struct mem_cgroup *memcg); -+void lru_gen_release_memcg(struct mem_cgroup *memcg); -+void lru_gen_soft_reclaim(struct lruvec *lruvec); -+ -+#else /* !CONFIG_MEMCG */ -+ -+#define MEMCG_NR_GENS 1 -+ -+struct lru_gen_memcg { -+}; -+ -+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) -+{ -+} -+ -+#endif /* CONFIG_MEMCG */ - - #else /* !CONFIG_LRU_GEN */ - -+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) -+{ -+} -+ - static inline void lru_gen_init_lruvec(struct lruvec *lruvec) - { - } -@@ -494,6 +578,7 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - } - - #ifdef CONFIG_MEMCG -+ - static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) - { - } -@@ -501,7 +586,24 @@ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) - static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) - { - } --#endif -+ -+static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) -+{ -+} -+ -+static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) -+{ -+} -+ -+static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) -+{ -+} -+ -+static inline void lru_gen_soft_reclaim(struct lruvec *lruvec) -+{ -+} -+ -+#endif /* CONFIG_MEMCG */ - - #endif /* CONFIG_LRU_GEN */ - -@@ -524,7 +626,7 @@ struct lruvec { - unsigned long flags; - #ifdef CONFIG_LRU_GEN - /* evictable pages divided into generations */ -- struct lru_gen_struct lrugen; -+ struct lru_gen_folio lrugen; - /* to 
concurrently iterate lru_gen_mm_list */ - struct lru_gen_mm_state mm_state; - #endif -@@ -1242,7 +1344,9 @@ typedef struct pglist_data { +@@ -1369,7 +1369,7 @@ typedef struct pglist_data { #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ - struct lru_gen_mm_walk mm_walk; + struct lru_gen_mm_walk mm_walk; -+ /* lru_gen_folio list */ -+ struct lru_gen_memcg memcg_lru; + /* lru_gen_folio list */ + struct lru_gen_memcg memcg_lru; #endif - - CACHELINE_PADDING(_pad2_); diff --git a/lib/maple_tree.c b/lib/maple_tree.c -index 5a976393c9ae..a73f83d0eb0e 100644 +index db60edb55f2f..4df6a0ce1c1b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c -@@ -146,16 +146,22 @@ struct maple_subtree_state { - struct maple_big_node *bn; - }; - -+#ifdef CONFIG_KASAN_STACK -+/* Prevent mas_wr_bnode() from exceeding the stack frame limit */ -+#define noinline_for_kasan noinline_for_stack -+#else -+#define noinline_for_kasan inline -+#endif -+ - /* Functions */ - static inline struct maple_node *mt_alloc_one(gfp_t gfp) - { -- return kmem_cache_alloc(maple_node_cache, gfp | __GFP_ZERO); -+ return kmem_cache_alloc(maple_node_cache, gfp); - } - - static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) - { -- return kmem_cache_alloc_bulk(maple_node_cache, gfp | __GFP_ZERO, size, -- nodes); -+ return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); - } - - static inline void mt_free_bulk(size_t size, void __rcu **nodes) -@@ -183,7 +189,6 @@ static void ma_free_rcu(struct maple_node *node) - call_rcu(&node->rcu, mt_free_rcu); - } - -- - static void mas_set_height(struct ma_state *mas) - { - unsigned int new_flags = mas->tree->ma_flags; -@@ -468,7 +473,7 @@ static inline - void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, - unsigned char slot) - { -- unsigned long val = (unsigned long) parent; -+ unsigned long val = (unsigned long)parent; - unsigned long shift; - unsigned long type; - enum maple_type p_type = mte_node_type(parent); -@@ -502,10 +507,9 @@ void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, - */ - static inline unsigned int mte_parent_slot(const struct maple_enode *enode) - { -- unsigned long val = (unsigned long) mte_to_node(enode)->parent; -+ unsigned long val = (unsigned long)mte_to_node(enode)->parent; - -- /* Root. */ -- if (val & 1) -+ if (val & MA_ROOT_PARENT) - return 0; - - /* -@@ -1128,9 +1132,10 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) - { - struct maple_alloc *ret, *node = mas->alloc; - unsigned long total = mas_allocated(mas); -+ unsigned int req = mas_alloc_req(mas); - - /* nothing or a request pending. */ -- if (unlikely(!total)) -+ if (WARN_ON(!total)) - return NULL; - - if (total == 1) { -@@ -1140,27 +1145,25 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) - goto single_node; - } - -- if (!node->node_count) { -+ if (node->node_count == 1) { - /* Single allocation in this node. 
*/ - mas->alloc = node->slot[0]; -- node->slot[0] = NULL; - mas->alloc->total = node->total - 1; - ret = node; - goto new_head; - } -- - node->total--; -- ret = node->slot[node->node_count]; -- node->slot[node->node_count--] = NULL; -+ ret = node->slot[--node->node_count]; -+ node->slot[node->node_count] = NULL; - - single_node: - new_head: -- ret->total = 0; -- ret->node_count = 0; -- if (ret->request_count) { -- mas_set_alloc_req(mas, ret->request_count + 1); -- ret->request_count = 0; -+ if (req) { -+ req++; -+ mas_set_alloc_req(mas, req); - } -+ -+ memset(ret, 0, sizeof(*ret)); - return (struct maple_node *)ret; - } - -@@ -1179,21 +1182,20 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) - unsigned long count; - unsigned int requested = mas_alloc_req(mas); - -- memset(reuse, 0, sizeof(*reuse)); - count = mas_allocated(mas); - -- if (count && (head->node_count < MAPLE_ALLOC_SLOTS - 1)) { -- if (head->slot[0]) -- head->node_count++; -- head->slot[head->node_count] = reuse; -+ reuse->request_count = 0; -+ reuse->node_count = 0; -+ if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) { -+ head->slot[head->node_count++] = reuse; - head->total++; - goto done; - } - - reuse->total = 1; - if ((head) && !((unsigned long)head & 0x1)) { -- head->request_count = 0; - reuse->slot[0] = head; -+ reuse->node_count = 1; - reuse->total += head->total; - } - -@@ -1212,7 +1214,6 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) - { - struct maple_alloc *node; - unsigned long allocated = mas_allocated(mas); -- unsigned long success = allocated; - unsigned int requested = mas_alloc_req(mas); - unsigned int count; - void **slots = NULL; -@@ -1228,24 +1229,29 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) - WARN_ON(!allocated); - } - -- if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS - 1) { -+ if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { - node = (struct maple_alloc *)mt_alloc_one(gfp); - if (!node) - goto nomem_one; - -- if (allocated) -+ if (allocated) { - node->slot[0] = mas->alloc; -+ node->node_count = 1; -+ } else { -+ node->node_count = 0; -+ } - -- success++; - mas->alloc = node; -+ node->total = ++allocated; - requested--; - } - +@@ -1303,26 +1303,18 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) node = mas->alloc; -+ node->request_count = 0; + node->request_count = 0; while (requested) { - max_req = MAPLE_ALLOC_SLOTS; -- if (node->slot[0]) { -- unsigned int offset = node->node_count + 1; -+ if (node->node_count) { -+ unsigned int offset = node->node_count; - - slots = (void **)&node->slot[offset]; - max_req -= offset; -@@ -1259,15 +1265,13 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) +- max_req = MAPLE_ALLOC_SLOTS; +- if (node->node_count) { +- unsigned int offset = node->node_count; +- +- slots = (void **)&node->slot[offset]; +- max_req -= offset; +- } else { +- slots = (void **)&node->slot; +- } +- ++ max_req = MAPLE_ALLOC_SLOTS - node->node_count; ++ slots = (void **)&node->slot[node->node_count]; + max_req = min(requested, max_req); + count = mt_alloc_bulk(gfp, max_req, slots); + if (!count) goto nomem_bulk; ++ if (node->node_count == 0) ++ node->slot[0]->node_count = 0; node->node_count += count; -- /* zero indexed. 
*/ -- if (slots == (void **)&node->slot) -- node->node_count--; -- -- success += count; -+ allocated += count; + allocated += count; node = node->slot[0]; -+ node->node_count = 0; -+ node->request_count = 0; +- node->node_count = 0; +- node->request_count = 0; requested -= count; } -- mas->alloc->total = success; -+ mas->alloc->total = allocated; - return; + mas->alloc->total = allocated; +@@ -2317,9 +2309,7 @@ static inline struct maple_enode *mte_node_or_none(struct maple_enode *enode) + static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) + { + struct ma_state *mas = wr_mas->mas; +- unsigned char count; +- unsigned char offset; +- unsigned long index, min, max; ++ unsigned char count, offset; - nomem_bulk: -@@ -1276,10 +1280,8 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) - nomem_one: - mas_set_alloc_req(mas, requested); - if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) -- mas->alloc->total = success; -+ mas->alloc->total = allocated; - mas_set_err(mas, -ENOMEM); -- return; + if (unlikely(ma_is_dense(wr_mas->type))) { + wr_mas->r_max = wr_mas->r_min = mas->index; +@@ -2332,34 +2322,12 @@ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) + count = wr_mas->node_end = ma_data_end(wr_mas->node, wr_mas->type, + wr_mas->pivots, mas->max); + offset = mas->offset; +- min = mas_safe_min(mas, wr_mas->pivots, offset); +- if (unlikely(offset == count)) +- goto max; - +- max = wr_mas->pivots[offset]; +- index = mas->index; +- if (unlikely(index <= max)) +- goto done; +- +- if (unlikely(!max && offset)) +- goto max; + +- min = max + 1; +- while (++offset < count) { +- max = wr_mas->pivots[offset]; +- if (index <= max) +- goto done; +- else if (unlikely(!max)) +- break; ++ while (offset < count && mas->index > wr_mas->pivots[offset]) ++ offset++; + +- min = max + 1; +- } +- +-max: +- max = mas->max; +-done: +- wr_mas->r_max = max; +- wr_mas->r_min = min; ++ wr_mas->r_max = offset < count ? 
wr_mas->pivots[offset] : mas->max; ++ wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, offset); + wr_mas->offset_end = mas->offset = offset; } - /* -@@ -1887,10 +1889,9 @@ static inline int mab_calc_split(struct ma_state *mas, - - /* Avoid ending a node on a NULL entry */ - split = mab_no_null_split(bn, split, slot_count); -- if (!(*mid_split)) -- return split; - -- *mid_split = mab_no_null_split(bn, *mid_split, slot_count); -+ if (unlikely(*mid_split)) -+ *mid_split = mab_no_null_split(bn, *mid_split, slot_count); - - return split; - } -@@ -2113,7 +2114,7 @@ static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end, - * - * Return: The actual end of the data stored in @b_node - */ --static inline void mas_store_b_node(struct ma_wr_state *wr_mas, -+static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, - struct maple_big_node *b_node, unsigned char offset_end) - { - unsigned char slot; -@@ -2947,7 +2948,7 @@ static inline void *mtree_range_walk(struct ma_state *mas) - mas->min = prev_min; - mas->max = prev_max; - mas->node = last; -- return (void *) next; -+ return (void *)next; - - dead_node: - mas_reset(mas); -@@ -3467,7 +3468,6 @@ static inline bool mas_push_data(struct ma_state *mas, int height, - */ - static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) - { -- - struct maple_subtree_state mast; - int height = 0; - unsigned char mid_split, split = 0; -@@ -3586,7 +3586,7 @@ static inline bool mas_reuse_node(struct ma_wr_state *wr_mas, - * @b_node: The maple big node - * @end: The end of the data. - */ --static inline int mas_commit_b_node(struct ma_wr_state *wr_mas, -+static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, - struct maple_big_node *b_node, unsigned char end) - { - struct maple_node *node; -@@ -3893,7 +3893,7 @@ static inline void *mtree_lookup_walk(struct ma_state *mas) - goto dead_node; - } while (!ma_is_leaf(type)); - -- return (void *) next; -+ return (void *)next; - - dead_node: - mas_reset(mas); -@@ -4711,15 +4711,11 @@ static inline void *mas_next_nentry(struct ma_state *mas, - - static inline void mas_rewalk(struct ma_state *mas, unsigned long index) - { -- - retry: - mas_set(mas, index); - mas_state_walk(mas); - if (mas_is_start(mas)) - goto retry; -- -- return; -- - } - - /* -@@ -5093,35 +5089,21 @@ static inline bool mas_rewind_node(struct ma_state *mas) - */ - static inline bool mas_skip_node(struct ma_state *mas) - { -- unsigned char slot, slot_count; -- unsigned long *pivots; -- enum maple_type mt; -+ if (mas_is_err(mas)) -+ return false; - -- mt = mte_node_type(mas->node); -- slot_count = mt_slots[mt] - 1; - do { - if (mte_is_root(mas->node)) { -- slot = mas->offset; -- if (slot > slot_count) { -+ if (mas->offset >= mas_data_end(mas)) { - mas_set_err(mas, -EBUSY); - return false; - } - } else { - mas_ascend(mas); -- slot = mas->offset; -- mt = mte_node_type(mas->node); -- slot_count = mt_slots[mt] - 1; - } -- } while (slot > slot_count); -- -- mas->offset = ++slot; -- pivots = ma_pivots(mas_mn(mas), mt); -- if (slot > 0) -- mas->min = pivots[slot - 1] + 1; -- -- if (slot <= slot_count) -- mas->max = pivots[slot]; -+ } while (mas->offset >= mas_data_end(mas)); - -+ mas->offset++; - return true; - } - -@@ -5590,8 +5572,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, - - /* - * mte_destroy_walk() - Free a tree or sub-tree. -- * @enode - the encoded maple node (maple_enode) to start -- * @mn - the tree to free - needed for node types. 
-+ * @enode: the encoded maple node (maple_enode) to start -+ * @mt: the tree to free - needed for node types. - * - * Must hold the write lock. - */ -@@ -5620,7 +5602,6 @@ static void mas_wr_store_setup(struct ma_wr_state *wr_mas) - mas_reset(wr_mas->mas); - } - } -- - } - - /* Interface */ -@@ -5733,6 +5714,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) +@@ -5819,6 +5787,7 @@ int mas_preallocate(struct ma_state *mas, gfp_t gfp) mas_reset(mas); return ret; } @@ -29654,519 +10331,11 @@ index 5a976393c9ae..a73f83d0eb0e 100644 /* * mas_destroy() - destroy a maple state. -@@ -5745,6 +5727,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) - void mas_destroy(struct ma_state *mas) - { - struct maple_alloc *node; -+ unsigned long total; - - /* - * When using mas_for_each() to insert an expected number of elements, -@@ -5767,14 +5750,20 @@ void mas_destroy(struct ma_state *mas) - } - mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); - -- while (mas->alloc && !((unsigned long)mas->alloc & 0x1)) { -+ total = mas_allocated(mas); -+ while (total) { - node = mas->alloc; - mas->alloc = node->slot[0]; -- if (node->node_count > 0) -- mt_free_bulk(node->node_count, -- (void __rcu **)&node->slot[1]); -+ if (node->node_count > 1) { -+ size_t count = node->node_count - 1; -+ -+ mt_free_bulk(count, (void __rcu **)&node->slot[1]); -+ total -= count; -+ } - kmem_cache_free(maple_node_cache, node); -+ total--; - } -+ - mas->alloc = NULL; - } - EXPORT_SYMBOL_GPL(mas_destroy); -@@ -6734,7 +6723,7 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, - - if (i < (MAPLE_RANGE64_SLOTS - 1)) - last = node->pivot[i]; -- else if (!node->slot[i] && max != mt_max[mte_node_type(entry)]) -+ else if (!node->slot[i] && max != mt_node_max(entry)) - break; - if (last == 0 && i > 0) - break; -@@ -6841,7 +6830,7 @@ void mt_dump(const struct maple_tree *mt) - if (!xa_is_node(entry)) - mt_dump_entry(entry, 0, 0, 0); - else if (entry) -- mt_dump_node(mt, entry, 0, mt_max[mte_node_type(entry)], 0); -+ mt_dump_node(mt, entry, 0, mt_node_max(entry), 0); - } - EXPORT_SYMBOL_GPL(mt_dump); - -diff --git a/mm/fadvise.c b/mm/fadvise.c -index bf04fec87f35..fb7c5f43fd2a 100644 ---- a/mm/fadvise.c -+++ b/mm/fadvise.c -@@ -80,7 +80,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) - case POSIX_FADV_NORMAL: - file->f_ra.ra_pages = bdi->ra_pages; - spin_lock(&file->f_lock); -- file->f_mode &= ~FMODE_RANDOM; -+ file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE); - spin_unlock(&file->f_lock); - break; - case POSIX_FADV_RANDOM: -@@ -107,6 +107,9 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) - force_page_cache_readahead(mapping, file, start_index, nrpages); - break; - case POSIX_FADV_NOREUSE: -+ spin_lock(&file->f_lock); -+ file->f_mode |= FMODE_NOREUSE; -+ spin_unlock(&file->f_lock); - break; - case POSIX_FADV_DONTNEED: - __filemap_fdatawrite_range(mapping, offset, endbyte, -diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index 2eee092f8f11..802d3868d097 100644 ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c -@@ -477,6 +477,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) - struct mem_cgroup_per_node *mz; - struct mem_cgroup_tree_per_node *mctz; - -+ if (lru_gen_enabled()) { -+ if (soft_limit_excess(memcg)) -+ lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec); -+ return; -+ } -+ - mctz = soft_limit_tree.rb_tree_per_node[nid]; - if (!mctz) - return; -@@ -3526,6 +3532,9 @@ unsigned long 
mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, - struct mem_cgroup_tree_per_node *mctz; - unsigned long excess; - -+ if (lru_gen_enabled()) -+ return 0; -+ - if (order > 0) - return 0; - -@@ -5386,6 +5395,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) - if (unlikely(mem_cgroup_is_root(memcg))) - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, - 2UL*HZ); -+ lru_gen_online_memcg(memcg); - return 0; - offline_kmem: - memcg_offline_kmem(memcg); -@@ -5417,6 +5427,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) - memcg_offline_kmem(memcg); - reparent_shrinker_deferred(memcg); - wb_memcg_offline(memcg); -+ lru_gen_offline_memcg(memcg); - - drain_all_stock(memcg); - -@@ -5428,6 +5439,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - - invalidate_reclaim_iterators(memcg); -+ lru_gen_release_memcg(memcg); - } - - static void mem_cgroup_css_free(struct cgroup_subsys_state *css) -diff --git a/mm/memory.c b/mm/memory.c -index f526b9152bef..4ad62eba3cb7 100644 ---- a/mm/memory.c -+++ b/mm/memory.c -@@ -1392,8 +1392,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, - force_flush = 1; - } - } -- if (pte_young(ptent) && -- likely(!(vma->vm_flags & VM_SEQ_READ))) -+ if (pte_young(ptent) && likely(vma_has_recency(vma))) - mark_page_accessed(page); - } - rss[mm_counter(page)]--; -@@ -5140,8 +5139,8 @@ static inline void mm_account_fault(struct pt_regs *regs, - #ifdef CONFIG_LRU_GEN - static void lru_gen_enter_fault(struct vm_area_struct *vma) - { -- /* the LRU algorithm doesn't apply to sequential or random reads */ -- current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)); -+ /* the LRU algorithm only applies to accesses with recency */ -+ current->in_lru_fault = vma_has_recency(vma); - } - - static void lru_gen_exit_fault(void) -diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 3aec9a6a9cb7..6658cbf43f5d 100644 ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -7943,6 +7943,7 @@ static void __init free_area_init_node(int nid) - pgdat_set_deferred_range(pgdat); - - free_area_init_core(pgdat); -+ lru_gen_init_pgdat(pgdat); - } - - static void __init free_area_init_memoryless_node(int nid) -diff --git a/mm/rmap.c b/mm/rmap.c -index 3b45d049069e..c8701608bb0d 100644 ---- a/mm/rmap.c -+++ b/mm/rmap.c -@@ -823,25 +823,14 @@ static bool folio_referenced_one(struct folio *folio, - } - - if (pvmw.pte) { -- if (lru_gen_enabled() && pte_young(*pvmw.pte) && -- !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) { -+ if (lru_gen_enabled() && pte_young(*pvmw.pte)) { - lru_gen_look_around(&pvmw); - referenced++; - } - - if (ptep_clear_flush_young_notify(vma, address, -- pvmw.pte)) { -- /* -- * Don't treat a reference through -- * a sequentially read mapping as such. -- * If the folio has been used in another mapping, -- * we will catch it; if this other mapping is -- * already gone, the unmap path will have set -- * the referenced flag or activated the folio. 
-- */ -- if (likely(!(vma->vm_flags & VM_SEQ_READ))) -- referenced++; -- } -+ pvmw.pte)) -+ referenced++; - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - if (pmdp_clear_flush_young_notify(vma, address, - pvmw.pmd)) -@@ -875,7 +864,20 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) - struct folio_referenced_arg *pra = arg; - struct mem_cgroup *memcg = pra->memcg; - -- if (!mm_match_cgroup(vma->vm_mm, memcg)) -+ /* -+ * Ignore references from this mapping if it has no recency. If the -+ * folio has been used in another mapping, we will catch it; if this -+ * other mapping is already gone, the unmap path will have set the -+ * referenced flag or activated the folio in zap_pte_range(). -+ */ -+ if (!vma_has_recency(vma)) -+ return true; -+ -+ /* -+ * If we are reclaiming on behalf of a cgroup, skip counting on behalf -+ * of references from different cgroups. -+ */ -+ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) - return true; - - return false; -@@ -906,6 +908,7 @@ int folio_referenced(struct folio *folio, int is_locked, - .arg = (void *)&pra, - .anon_lock = folio_lock_anon_vma_read, - .try_lock = true, -+ .invalid_vma = invalid_folio_referenced_vma, - }; - - *vm_flags = 0; -@@ -921,15 +924,6 @@ int folio_referenced(struct folio *folio, int is_locked, - return 1; - } - -- /* -- * If we are reclaiming on behalf of a cgroup, skip -- * counting on behalf of references from different -- * cgroups -- */ -- if (memcg) { -- rwc.invalid_vma = invalid_folio_referenced_vma; -- } -- - rmap_walk(folio, &rwc); - *vm_flags = pra.vm_flags; - diff --git a/mm/vmscan.c b/mm/vmscan.c -index 160acbbdf111..1a8f3b1c0bad 100644 +index 71a7f4517e5a..8dadd1772661 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c -@@ -55,6 +55,8 @@ - #include - #include - #include -+#include -+#include - - #include - #include -@@ -135,12 +137,6 @@ struct scan_control { - /* Always discard instead of demoting to lower tier memory */ - unsigned int no_demotion:1; - --#ifdef CONFIG_LRU_GEN -- /* help kswapd make better choices among multiple memcgs */ -- unsigned int memcgs_need_aging:1; -- unsigned long last_reclaimed; --#endif -- - /* Allocation order */ - s8 order; - -@@ -453,6 +449,11 @@ static bool cgroup_reclaim(struct scan_control *sc) - return sc->target_mem_cgroup; - } - -+static bool global_reclaim(struct scan_control *sc) -+{ -+ return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); -+} -+ - /** - * writeback_throttling_sane - is the usual dirty throttling mechanism available? 
- * @sc: scan_control in question -@@ -503,6 +504,11 @@ static bool cgroup_reclaim(struct scan_control *sc) - return false; - } - -+static bool global_reclaim(struct scan_control *sc) -+{ -+ return true; -+} -+ - static bool writeback_throttling_sane(struct scan_control *sc) - { - return true; -@@ -3184,6 +3190,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); - for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ - for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) - -+#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) -+#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) -+ - static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) - { - struct pglist_data *pgdat = NODE_DATA(nid); -@@ -3209,6 +3218,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) - struct mem_cgroup *memcg = lruvec_memcg(lruvec); - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - -+ if (!sc->may_swap) -+ return 0; -+ - if (!can_demote(pgdat->node_id, sc) && - mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) - return 0; -@@ -3223,12 +3235,104 @@ static int get_nr_gens(struct lruvec *lruvec, int type) - - static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) - { -- /* see the comment on lru_gen_struct */ -+ /* see the comment on lru_gen_folio */ - return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && - get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && - get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; - } - -+/****************************************************************************** -+ * Bloom filters -+ ******************************************************************************/ -+ -+/* -+ * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when -+ * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of -+ * bits in a bitmap, k is the number of hash functions and n is the number of -+ * inserted items. -+ * -+ * Page table walkers use one of the two filters to reduce their search space. -+ * To get rid of non-leaf entries that no longer have enough leaf entries, the -+ * aging uses the double-buffering technique to flip to the other filter each -+ * time it produces a new generation. For non-leaf entries that have enough -+ * leaf entries, the aging carries them over to the next generation in -+ * walk_pmd_range(); the eviction also report them when walking the rmap -+ * in lru_gen_look_around(). -+ * -+ * For future optimizations: -+ * 1. It's not necessary to keep both filters all the time. The spare one can be -+ * freed after the RCU grace period and reallocated if needed again. -+ * 2. And when reallocating, it's worth scaling its size according to the number -+ * of inserted entries in the other filter, to reduce the memory overhead on -+ * small systems and false positives on large systems. -+ * 3. Jenkins' hash function is an alternative to Knuth's. 
-+ */ -+#define BLOOM_FILTER_SHIFT 15 -+ -+static inline int filter_gen_from_seq(unsigned long seq) -+{ -+ return seq % NR_BLOOM_FILTERS; -+} -+ -+static void get_item_key(void *item, int *key) -+{ -+ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); -+ -+ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); -+ -+ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); -+ key[1] = hash >> BLOOM_FILTER_SHIFT; -+} -+ -+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -+{ -+ int key[2]; -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ filter = READ_ONCE(lruvec->mm_state.filters[gen]); -+ if (!filter) -+ return true; -+ -+ get_item_key(item, key); -+ -+ return test_bit(key[0], filter) && test_bit(key[1], filter); -+} -+ -+static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -+{ -+ int key[2]; -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ filter = READ_ONCE(lruvec->mm_state.filters[gen]); -+ if (!filter) -+ return; -+ -+ get_item_key(item, key); -+ -+ if (!test_bit(key[0], filter)) -+ set_bit(key[0], filter); -+ if (!test_bit(key[1], filter)) -+ set_bit(key[1], filter); -+} -+ -+static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) -+{ -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ filter = lruvec->mm_state.filters[gen]; -+ if (filter) { -+ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); -+ return; -+ } -+ -+ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), -+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); -+ WRITE_ONCE(lruvec->mm_state.filters[gen], filter); -+} -+ - /****************************************************************************** - * mm_struct list - ******************************************************************************/ -@@ -3348,94 +3452,6 @@ void lru_gen_migrate_mm(struct mm_struct *mm) - } - #endif - --/* -- * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when -- * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of -- * bits in a bitmap, k is the number of hash functions and n is the number of -- * inserted items. -- * -- * Page table walkers use one of the two filters to reduce their search space. -- * To get rid of non-leaf entries that no longer have enough leaf entries, the -- * aging uses the double-buffering technique to flip to the other filter each -- * time it produces a new generation. For non-leaf entries that have enough -- * leaf entries, the aging carries them over to the next generation in -- * walk_pmd_range(); the eviction also report them when walking the rmap -- * in lru_gen_look_around(). -- * -- * For future optimizations: -- * 1. It's not necessary to keep both filters all the time. The spare one can be -- * freed after the RCU grace period and reallocated if needed again. -- * 2. And when reallocating, it's worth scaling its size according to the number -- * of inserted entries in the other filter, to reduce the memory overhead on -- * small systems and false positives on large systems. -- * 3. Jenkins' hash function is an alternative to Knuth's. 
-- */ --#define BLOOM_FILTER_SHIFT 15 -- --static inline int filter_gen_from_seq(unsigned long seq) --{ -- return seq % NR_BLOOM_FILTERS; --} -- --static void get_item_key(void *item, int *key) --{ -- u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); -- -- BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); -- -- key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); -- key[1] = hash >> BLOOM_FILTER_SHIFT; --} -- --static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) --{ -- unsigned long *filter; -- int gen = filter_gen_from_seq(seq); -- -- filter = lruvec->mm_state.filters[gen]; -- if (filter) { -- bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); -- return; -- } -- -- filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), -- __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); -- WRITE_ONCE(lruvec->mm_state.filters[gen], filter); --} -- --static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) --{ -- int key[2]; -- unsigned long *filter; -- int gen = filter_gen_from_seq(seq); -- -- filter = READ_ONCE(lruvec->mm_state.filters[gen]); -- if (!filter) -- return; -- -- get_item_key(item, key); -- -- if (!test_bit(key[0], filter)) -- set_bit(key[0], filter); -- if (!test_bit(key[1], filter)) -- set_bit(key[1], filter); --} -- --static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) --{ -- int key[2]; -- unsigned long *filter; -- int gen = filter_gen_from_seq(seq); -- -- filter = READ_ONCE(lruvec->mm_state.filters[gen]); -- if (!filter) -- return true; -- -- get_item_key(item, key); -- -- return test_bit(key[0], filter) && test_bit(key[1], filter); --} -- - static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) - { - int i; -@@ -3592,7 +3608,7 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) +@@ -3608,7 +3608,7 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) } /****************************************************************************** @@ -30175,1280 +10344,7 @@ index 160acbbdf111..1a8f3b1c0bad 100644 ******************************************************************************/ /* -@@ -3623,7 +3639,7 @@ struct ctrl_pos { - static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, - struct ctrl_pos *pos) - { -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - int hist = lru_hist_from_seq(lrugen->min_seq[type]); - - pos->refaulted = lrugen->avg_refaulted[type][tier] + -@@ -3638,7 +3654,7 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, - static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) - { - int hist, tier; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; - unsigned long seq = carryover ? 
lrugen->min_seq[type] : lrugen->max_seq + 1; - -@@ -3715,7 +3731,7 @@ static int folio_update_gen(struct folio *folio, int gen) - static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) - { - int type = folio_is_file_lru(folio); -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); - -@@ -3760,7 +3776,7 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, - static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) - { - int gen, type, zone; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - walk->batched = 0; - -@@ -3793,7 +3809,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal - if (is_vm_hugetlb_page(vma)) - return true; - -- if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) -+ if (!vma_has_recency(vma)) -+ return true; -+ -+ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) - return true; - - if (vma == get_gate_vma(vma->vm_mm)) -@@ -3988,8 +4007,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, - } - - #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) --static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, -- struct mm_walk *args, unsigned long *bitmap, unsigned long *start) -+static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, -+ struct mm_walk *args, unsigned long *bitmap, unsigned long *first) - { - int i; - pmd_t *pmd; -@@ -4002,18 +4021,19 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area - VM_WARN_ON_ONCE(pud_leaf(*pud)); - - /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ -- if (*start == -1) { -- *start = next; -+ if (*first == -1) { -+ *first = addr; -+ bitmap_zero(bitmap, MIN_LRU_BATCH); - return; - } - -- i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); -+ i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first); - if (i && i <= MIN_LRU_BATCH) { - __set_bit(i - 1, bitmap); - return; - } - -- pmd = pmd_offset(pud, *start); -+ pmd = pmd_offset(pud, *first); - - ptl = pmd_lockptr(args->mm, pmd); - if (!spin_trylock(ptl)) -@@ -4024,15 +4044,16 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area - do { - unsigned long pfn; - struct folio *folio; -- unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start; -+ -+ /* don't round down the first address */ -+ addr = i ? 
(*first & PMD_MASK) + i * PMD_SIZE : *first; - - pfn = get_pmd_pfn(pmd[i], vma, addr); - if (pfn == -1) - goto next; - - if (!pmd_trans_huge(pmd[i])) { -- if (arch_has_hw_nonleaf_pmd_young() && -- get_cap(LRU_GEN_NONLEAF_YOUNG)) -+ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) - pmdp_test_and_clear_young(vma, addr, pmd + i); - goto next; - } -@@ -4061,12 +4082,11 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area - arch_leave_lazy_mmu_mode(); - spin_unlock(ptl); - done: -- *start = -1; -- bitmap_zero(bitmap, MIN_LRU_BATCH); -+ *first = -1; - } - #else --static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, -- struct mm_walk *args, unsigned long *bitmap, unsigned long *start) -+static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, -+ struct mm_walk *args, unsigned long *bitmap, unsigned long *first) - { - } - #endif -@@ -4079,9 +4099,9 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, - unsigned long next; - unsigned long addr; - struct vm_area_struct *vma; -- unsigned long pos = -1; -+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; -+ unsigned long first = -1; - struct lru_gen_mm_walk *walk = args->private; -- unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; - - VM_WARN_ON_ONCE(pud_leaf(*pud)); - -@@ -4120,18 +4140,17 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, - if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) - continue; - -- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); -+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); - continue; - } - #endif - walk->mm_stats[MM_NONLEAF_TOTAL]++; - -- if (arch_has_hw_nonleaf_pmd_young() && -- get_cap(LRU_GEN_NONLEAF_YOUNG)) { -+ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) { - if (!pmd_young(val)) - continue; - -- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); -+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); - } - - if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) -@@ -4148,7 +4167,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, - update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); - } - -- walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); -+ walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); - - if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) - goto restart; -@@ -4238,7 +4257,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ - } while (err == -EAGAIN); - } - --static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) -+static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc) - { - struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; - -@@ -4246,7 +4265,7 @@ static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) - VM_WARN_ON_ONCE(walk); - - walk = &pgdat->mm_walk; -- } else if (!pgdat && !walk) { -+ } else if (!walk && force_alloc) { - VM_WARN_ON_ONCE(current_is_kswapd()); - - walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); -@@ -4274,7 +4293,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) - { - int zone; - int remaining = MAX_LRU_BATCH; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - int new_gen, 
old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - - if (type == LRU_GEN_ANON && !can_swap) -@@ -4282,7 +4301,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) - - /* prevent cold/hot inversion if force_scan is true */ - for (zone = 0; zone < MAX_NR_ZONES; zone++) { -- struct list_head *head = &lrugen->lists[old_gen][type][zone]; -+ struct list_head *head = &lrugen->folios[old_gen][type][zone]; - - while (!list_empty(head)) { - struct folio *folio = lru_to_folio(head); -@@ -4293,7 +4312,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) - VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); - - new_gen = folio_inc_gen(lruvec, folio, false); -- list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]); -+ list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); - - if (!--remaining) - return false; -@@ -4310,7 +4329,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) - { - int gen, type, zone; - bool success = false; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - DEFINE_MIN_SEQ(lruvec); - - VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); -@@ -4321,7 +4340,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) - gen = lru_gen_from_seq(min_seq[type]); - - for (zone = 0; zone < MAX_NR_ZONES; zone++) { -- if (!list_empty(&lrugen->lists[gen][type][zone])) -+ if (!list_empty(&lrugen->folios[gen][type][zone])) - goto next; - } - -@@ -4331,7 +4350,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) - ; - } - -- /* see the comment on lru_gen_struct */ -+ /* see the comment on lru_gen_folio */ - if (can_swap) { - min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); - min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); -@@ -4353,7 +4372,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) - { - int prev, next; - int type, zone; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - spin_lock_irq(&lruvec->lru_lock); - -@@ -4411,7 +4430,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, - bool success; - struct lru_gen_mm_walk *walk; - struct mm_struct *mm = NULL; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); - -@@ -4427,12 +4446,12 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, - * handful of PTEs. Spreading the work out over a period of time usually - * is less efficient, but it avoids bursty page faults. 
- */ -- if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { -+ if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) { - success = iterate_mm_list_nowalk(lruvec, max_seq); - goto done; - } - -- walk = set_mm_walk(NULL); -+ walk = set_mm_walk(NULL, true); - if (!walk) { - success = iterate_mm_list_nowalk(lruvec, max_seq); - goto done; -@@ -4455,8 +4474,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, - if (sc->priority <= DEF_PRIORITY - 2) - wait_event_killable(lruvec->mm_state.wait, - max_seq < READ_ONCE(lrugen->max_seq)); -- -- return max_seq < READ_ONCE(lrugen->max_seq); -+ return false; - } - - VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); -@@ -4469,97 +4487,56 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, - return true; - } - --static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, -- struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) -+/****************************************************************************** -+ * working set protection -+ ******************************************************************************/ -+ -+static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) - { - int gen, type, zone; -- unsigned long old = 0; -- unsigned long young = 0; - unsigned long total = 0; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ bool can_swap = get_swappiness(lruvec, sc); -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ DEFINE_MAX_SEQ(lruvec); -+ DEFINE_MIN_SEQ(lruvec); - - for (type = !can_swap; type < ANON_AND_FILE; type++) { - unsigned long seq; - - for (seq = min_seq[type]; seq <= max_seq; seq++) { -- unsigned long size = 0; -- - gen = lru_gen_from_seq(seq); - - for (zone = 0; zone < MAX_NR_ZONES; zone++) -- size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); -- -- total += size; -- if (seq == max_seq) -- young += size; -- else if (seq + MIN_NR_GENS == max_seq) -- old += size; -+ total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); - } - } - -- /* try to scrape all its memory if this memcg was deleted */ -- *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; -- -- /* -- * The aging tries to be lazy to reduce the overhead, while the eviction -- * stalls when the number of generations reaches MIN_NR_GENS. Hence, the -- * ideal number of generations is MIN_NR_GENS+1. -- */ -- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) -- return true; -- if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) -- return false; -- -- /* -- * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) -- * of the total number of pages for each generation. A reasonable range -- * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The -- * aging cares about the upper bound of hot pages, while the eviction -- * cares about the lower bound of cold pages. -- */ -- if (young * MIN_NR_GENS > total) -- return true; -- if (old * (MIN_NR_GENS + 2) < total) -- return true; -- -- return false; -+ /* whether the size is big enough to be helpful */ -+ return mem_cgroup_online(memcg) ? 
(total >> sc->priority) : total; - } - --static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) -+static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, -+ unsigned long min_ttl) - { -- bool need_aging; -- unsigned long nr_to_scan; -- int swappiness = get_swappiness(lruvec, sc); -+ int gen; -+ unsigned long birth; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); -- DEFINE_MAX_SEQ(lruvec); - DEFINE_MIN_SEQ(lruvec); - -- VM_WARN_ON_ONCE(sc->memcg_low_reclaim); -- -- mem_cgroup_calculate_protection(NULL, memcg); -+ /* see the comment on lru_gen_folio */ -+ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); -+ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); - -- if (mem_cgroup_below_min(NULL, memcg)) -+ if (time_is_after_jiffies(birth + min_ttl)) - return false; - -- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); -- -- if (min_ttl) { -- int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); -- unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); -- -- if (time_is_after_jiffies(birth + min_ttl)) -- return false; -- -- /* the size is likely too small to be helpful */ -- if (!nr_to_scan && sc->priority != DEF_PRIORITY) -- return false; -- } -+ if (!lruvec_is_sizable(lruvec, sc)) -+ return false; - -- if (need_aging) -- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); -+ mem_cgroup_calculate_protection(NULL, memcg); - -- return true; -+ return !mem_cgroup_below_min(NULL, memcg); - } - - /* to protect the working set of the last N jiffies */ -@@ -4572,46 +4549,30 @@ static unsigned long lru_gen_min_ttl __read_mostly; - static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - { - struct mem_cgroup *memcg; -- bool success = false; - unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); - - VM_WARN_ON_ONCE(!current_is_kswapd()); - -- sc->last_reclaimed = sc->nr_reclaimed; -- -- /* -- * To reduce the chance of going into the aging path, which can be -- * costly, optimistically skip it if the flag below was cleared in the -- * eviction path. This improves the overall performance when multiple -- * memcgs are available. -- */ -- if (!sc->memcgs_need_aging) { -- sc->memcgs_need_aging = true; -+ /* check the order to exclude compaction-induced reclaim */ -+ if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) - return; -- } -- -- set_mm_walk(pgdat); - - memcg = mem_cgroup_iter(NULL, NULL, NULL); - do { - struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); - -- if (age_lruvec(lruvec, sc, min_ttl)) -- success = true; -+ if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) { -+ mem_cgroup_iter_break(NULL, memcg); -+ return; -+ } - - cond_resched(); - } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); - -- clear_mm_walk(); -- -- /* check the order to exclude compaction-induced reclaim */ -- if (success || !min_ttl || sc->order) -- return; -- - /* - * The main goal is to OOM kill if every generation from all memcgs is - * younger than min_ttl. However, another possibility is all memcgs are -- * either below min or empty. -+ * either too small or below min. 
- */ - if (mutex_trylock(&oom_lock)) { - struct oom_control oc = { -@@ -4624,6 +4585,10 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - } - } - -+/****************************************************************************** -+ * rmap/PT walk feedback -+ ******************************************************************************/ -+ - /* - * This function exploits spatial locality when shrink_folio_list() walks the - * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If -@@ -4634,13 +4599,12 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - { - int i; -- pte_t *pte; - unsigned long start; - unsigned long end; -- unsigned long addr; - struct lru_gen_mm_walk *walk; - int young = 0; -- unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; -+ pte_t *pte = pvmw->pte; -+ unsigned long addr = pvmw->address; - struct folio *folio = pfn_folio(pvmw->pfn); - struct mem_cgroup *memcg = folio_memcg(folio); - struct pglist_data *pgdat = folio_pgdat(folio); -@@ -4657,25 +4621,28 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - /* avoid taking the LRU lock under the PTL when possible */ - walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; - -- start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); -- end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; -+ start = max(addr & PMD_MASK, pvmw->vma->vm_start); -+ end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; - - if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { -- if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) -+ if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) - end = start + MIN_LRU_BATCH * PAGE_SIZE; -- else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) -+ else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2) - start = end - MIN_LRU_BATCH * PAGE_SIZE; - else { -- start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; -- end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; -+ start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2; -+ end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2; - } - } - -- pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; -+ /* folio_update_gen() requires stable folio_memcg() */ -+ if (!mem_cgroup_trylock_pages(memcg)) -+ return; - -- rcu_read_lock(); - arch_enter_lazy_mmu_mode(); - -+ pte -= (addr - start) / PAGE_SIZE; -+ - for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { - unsigned long pfn; - -@@ -4700,58 +4667,171 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); - -+ if (walk) { -+ old_gen = folio_update_gen(folio, new_gen); -+ if (old_gen >= 0 && old_gen != new_gen) -+ update_batch_size(walk, folio, old_gen, new_gen); -+ -+ continue; -+ } -+ - old_gen = folio_lru_gen(folio); - if (old_gen < 0) - folio_set_referenced(folio); - else if (old_gen != new_gen) -- __set_bit(i, bitmap); -+ folio_activate(folio); - } - - arch_leave_lazy_mmu_mode(); -- rcu_read_unlock(); -+ mem_cgroup_unlock_pages(); - - /* feedback from rmap walkers to page table walkers */ - if (suitable_to_scan(i, young)) - update_bloom_filter(lruvec, max_seq, pvmw->pmd); -+} - -- if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { -- for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { -- folio = pfn_folio(pte_pfn(pte[i])); -- folio_activate(folio); -- } -- return; 
-+/****************************************************************************** -+ * memcg LRU -+ ******************************************************************************/ -+ -+/* see the comment on MEMCG_NR_GENS */ -+enum { -+ MEMCG_LRU_NOP, -+ MEMCG_LRU_HEAD, -+ MEMCG_LRU_TAIL, -+ MEMCG_LRU_OLD, -+ MEMCG_LRU_YOUNG, -+}; -+ -+#ifdef CONFIG_MEMCG -+ -+static int lru_gen_memcg_seg(struct lruvec *lruvec) -+{ -+ return READ_ONCE(lruvec->lrugen.seg); -+} -+ -+static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) -+{ -+ int seg; -+ int old, new; -+ int bin = get_random_u32_below(MEMCG_NR_BINS); -+ struct pglist_data *pgdat = lruvec_pgdat(lruvec); -+ -+ spin_lock(&pgdat->memcg_lru.lock); -+ -+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); -+ -+ seg = 0; -+ new = old = lruvec->lrugen.gen; -+ -+ /* see the comment on MEMCG_NR_GENS */ -+ if (op == MEMCG_LRU_HEAD) -+ seg = MEMCG_LRU_HEAD; -+ else if (op == MEMCG_LRU_TAIL) -+ seg = MEMCG_LRU_TAIL; -+ else if (op == MEMCG_LRU_OLD) -+ new = get_memcg_gen(pgdat->memcg_lru.seq); -+ else if (op == MEMCG_LRU_YOUNG) -+ new = get_memcg_gen(pgdat->memcg_lru.seq + 1); -+ else -+ VM_WARN_ON_ONCE(true); -+ -+ hlist_nulls_del_rcu(&lruvec->lrugen.list); -+ -+ if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) -+ hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); -+ else -+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); -+ -+ pgdat->memcg_lru.nr_memcgs[old]--; -+ pgdat->memcg_lru.nr_memcgs[new]++; -+ -+ lruvec->lrugen.gen = new; -+ WRITE_ONCE(lruvec->lrugen.seg, seg); -+ -+ if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) -+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); -+ -+ spin_unlock(&pgdat->memcg_lru.lock); -+} -+ -+void lru_gen_online_memcg(struct mem_cgroup *memcg) -+{ -+ int gen; -+ int nid; -+ int bin = get_random_u32_below(MEMCG_NR_BINS); -+ -+ for_each_node(nid) { -+ struct pglist_data *pgdat = NODE_DATA(nid); -+ struct lruvec *lruvec = get_lruvec(memcg, nid); -+ -+ spin_lock(&pgdat->memcg_lru.lock); -+ -+ VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); -+ -+ gen = get_memcg_gen(pgdat->memcg_lru.seq); -+ -+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); -+ pgdat->memcg_lru.nr_memcgs[gen]++; -+ -+ lruvec->lrugen.gen = gen; -+ -+ spin_unlock(&pgdat->memcg_lru.lock); - } -+} - -- /* folio_update_gen() requires stable folio_memcg() */ -- if (!mem_cgroup_trylock_pages(memcg)) -- return; -+void lru_gen_offline_memcg(struct mem_cgroup *memcg) -+{ -+ int nid; - -- if (!walk) { -- spin_lock_irq(&lruvec->lru_lock); -- new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); -+ for_each_node(nid) { -+ struct lruvec *lruvec = get_lruvec(memcg, nid); -+ -+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); - } -+} - -- for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { -- folio = pfn_folio(pte_pfn(pte[i])); -- if (folio_memcg_rcu(folio) != memcg) -- continue; -+void lru_gen_release_memcg(struct mem_cgroup *memcg) -+{ -+ int gen; -+ int nid; - -- old_gen = folio_update_gen(folio, new_gen); -- if (old_gen < 0 || old_gen == new_gen) -- continue; -+ for_each_node(nid) { -+ struct pglist_data *pgdat = NODE_DATA(nid); -+ struct lruvec *lruvec = get_lruvec(memcg, nid); - -- if (walk) -- update_batch_size(walk, folio, old_gen, new_gen); -- else -- lru_gen_update_size(lruvec, folio, old_gen, new_gen); -+ spin_lock(&pgdat->memcg_lru.lock); -+ -+ 
VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); -+ -+ gen = lruvec->lrugen.gen; -+ -+ hlist_nulls_del_rcu(&lruvec->lrugen.list); -+ pgdat->memcg_lru.nr_memcgs[gen]--; -+ -+ if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) -+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); -+ -+ spin_unlock(&pgdat->memcg_lru.lock); - } -+} -+ -+void lru_gen_soft_reclaim(struct lruvec *lruvec) -+{ -+ /* see the comment on MEMCG_NR_GENS */ -+ if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) -+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); -+} - -- if (!walk) -- spin_unlock_irq(&lruvec->lru_lock); -+#else /* !CONFIG_MEMCG */ - -- mem_cgroup_unlock_pages(); -+static int lru_gen_memcg_seg(struct lruvec *lruvec) -+{ -+ return 0; - } - -+#endif -+ - /****************************************************************************** - * the eviction - ******************************************************************************/ -@@ -4765,7 +4845,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) - int delta = folio_nr_pages(folio); - int refs = folio_lru_refs(folio); - int tier = lru_tier_from_refs(refs); -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); - -@@ -4790,7 +4870,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) - - /* promoted */ - if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { -- list_move(&folio->lru, &lrugen->lists[gen][type][zone]); -+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]); - return true; - } - -@@ -4799,7 +4879,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) - int hist = lru_hist_from_seq(lrugen->min_seq[type]); - - gen = folio_inc_gen(lruvec, folio, false); -- list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]); -+ list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); - - WRITE_ONCE(lrugen->protected[hist][type][tier - 1], - lrugen->protected[hist][type][tier - 1] + delta); -@@ -4811,7 +4891,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) - if (folio_test_locked(folio) || folio_test_writeback(folio) || - (type == LRU_GEN_FILE && folio_test_dirty(folio))) { - gen = folio_inc_gen(lruvec, folio, true); -- list_move(&folio->lru, &lrugen->lists[gen][type][zone]); -+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]); - return true; - } - -@@ -4822,12 +4902,8 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca - { - bool success; - -- /* unmapping inhibited */ -- if (!sc->may_unmap && folio_mapped(folio)) -- return false; -- - /* swapping inhibited */ -- if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && -+ if (!(sc->gfp_mask & __GFP_IO) && - (folio_test_dirty(folio) || - (folio_test_anon(folio) && !folio_test_swapcache(folio)))) - return false; -@@ -4865,7 +4941,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, - int scanned = 0; - int isolated = 0; - int remaining = MAX_LRU_BATCH; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); - - VM_WARN_ON_ONCE(!list_empty(list)); -@@ -4878,7 +4954,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, - for (zone = sc->reclaim_idx; zone >= 0; zone--) { - LIST_HEAD(moved); - int skipped = 0; -- struct list_head *head = 
&lrugen->lists[gen][type][zone]; -+ struct list_head *head = &lrugen->folios[gen][type][zone]; - - while (!list_empty(head)) { - struct folio *folio = lru_to_folio(head); -@@ -4924,9 +5000,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, - __count_vm_events(PGSCAN_ANON + type, isolated); - - /* -- * There might not be eligible pages due to reclaim_idx, may_unmap and -- * may_writepage. Check the remaining to prevent livelock if it's not -- * making progress. -+ * There might not be eligible folios due to reclaim_idx. Check the -+ * remaining to prevent livelock if it's not making progress. - */ - return isolated || !remaining ? scanned : 0; - } -@@ -5021,8 +5096,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw - return scanned; - } - --static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -- bool *need_swapping) -+static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) - { - int type; - int scanned; -@@ -5111,153 +5185,348 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap - goto retry; - } - -- if (need_swapping && type == LRU_GEN_ANON) -- *need_swapping = true; -- - return scanned; - } - -+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, -+ struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) -+{ -+ int gen, type, zone; -+ unsigned long old = 0; -+ unsigned long young = 0; -+ unsigned long total = 0; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ DEFINE_MIN_SEQ(lruvec); -+ -+ /* whether this lruvec is completely out of cold folios */ -+ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { -+ *nr_to_scan = 0; -+ return true; -+ } -+ -+ for (type = !can_swap; type < ANON_AND_FILE; type++) { -+ unsigned long seq; -+ -+ for (seq = min_seq[type]; seq <= max_seq; seq++) { -+ unsigned long size = 0; -+ -+ gen = lru_gen_from_seq(seq); -+ -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) -+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); -+ -+ total += size; -+ if (seq == max_seq) -+ young += size; -+ else if (seq + MIN_NR_GENS == max_seq) -+ old += size; -+ } -+ } -+ -+ /* try to scrape all its memory if this memcg was deleted */ -+ *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; -+ -+ /* -+ * The aging tries to be lazy to reduce the overhead, while the eviction -+ * stalls when the number of generations reaches MIN_NR_GENS. Hence, the -+ * ideal number of generations is MIN_NR_GENS+1. -+ */ -+ if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) -+ return false; -+ -+ /* -+ * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) -+ * of the total number of pages for each generation. A reasonable range -+ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The -+ * aging cares about the upper bound of hot pages, while the eviction -+ * cares about the lower bound of cold pages. -+ */ -+ if (young * MIN_NR_GENS > total) -+ return true; -+ if (old * (MIN_NR_GENS + 2) < total) -+ return true; -+ -+ return false; -+} -+ - /* - * For future optimizations: - * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg - * reclaim. 
- */ --static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, -- bool can_swap, bool *need_aging) -+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) - { - unsigned long nr_to_scan; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); - DEFINE_MAX_SEQ(lruvec); -- DEFINE_MIN_SEQ(lruvec); - -- if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) || -- (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) && -- !sc->memcg_low_reclaim)) -+ if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) - return 0; - -- *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); -- if (!*need_aging) -+ if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) - return nr_to_scan; - - /* skip the aging path at the default priority */ - if (sc->priority == DEF_PRIORITY) -- goto done; -+ return nr_to_scan; - -- /* leave the work to lru_gen_age_node() */ -- if (current_is_kswapd()) -- return 0; -+ /* skip this lruvec as it's low on cold folios */ -+ return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0; -+} - -- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) -- return nr_to_scan; --done: -- return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; -+static unsigned long get_nr_to_reclaim(struct scan_control *sc) -+{ -+ /* don't abort memcg reclaim to ensure fairness */ -+ if (!global_reclaim(sc)) -+ return -1; -+ -+ return max(sc->nr_to_reclaim, compact_gap(sc->order)); - } - --static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, -- struct scan_control *sc, bool need_swapping) -+static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) - { -- int i; -- DEFINE_MAX_SEQ(lruvec); -+ long nr_to_scan; -+ unsigned long scanned = 0; -+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); -+ int swappiness = get_swappiness(lruvec, sc); - -- if (!current_is_kswapd()) { -- /* age each memcg at most once to ensure fairness */ -- if (max_seq - seq > 1) -- return true; -+ /* clean file folios are more likely to exist */ -+ if (swappiness && !(sc->gfp_mask & __GFP_IO)) -+ swappiness = 1; - -- /* over-swapping can increase allocation latency */ -- if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping) -- return true; -+ while (true) { -+ int delta; - -- /* give this thread a chance to exit and free its memory */ -- if (fatal_signal_pending(current)) { -- sc->nr_reclaimed += MIN_LRU_BATCH; -- return true; -- } -+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); -+ if (nr_to_scan <= 0) -+ break; - -- if (cgroup_reclaim(sc)) -- return false; -- } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim) -- return false; -+ delta = evict_folios(lruvec, sc, swappiness); -+ if (!delta) -+ break; - -- /* keep scanning at low priorities to ensure fairness */ -- if (sc->priority > DEF_PRIORITY - 2) -- return false; -+ scanned += delta; -+ if (scanned >= nr_to_scan) -+ break; - -- /* -- * A minimum amount of work was done under global memory pressure. For -- * kswapd, it may be overshooting. For direct reclaim, the allocation -- * may succeed if all suitable zones are somewhat safe. In either case, -- * it's better to stop now, and restart later if necessary. 
-- */ -- for (i = 0; i <= sc->reclaim_idx; i++) { -- unsigned long wmark; -- struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; -+ if (sc->nr_reclaimed >= nr_to_reclaim) -+ break; - -- if (!managed_zone(zone)) -+ cond_resched(); -+ } -+ -+ /* whether try_to_inc_max_seq() was successful */ -+ return nr_to_scan < 0; -+} -+ -+static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) -+{ -+ bool success; -+ unsigned long scanned = sc->nr_scanned; -+ unsigned long reclaimed = sc->nr_reclaimed; -+ int seg = lru_gen_memcg_seg(lruvec); -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ struct pglist_data *pgdat = lruvec_pgdat(lruvec); -+ -+ /* see the comment on MEMCG_NR_GENS */ -+ if (!lruvec_is_sizable(lruvec, sc)) -+ return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; -+ -+ mem_cgroup_calculate_protection(NULL, memcg); -+ -+ if (mem_cgroup_below_min(NULL, memcg)) -+ return MEMCG_LRU_YOUNG; -+ -+ if (mem_cgroup_below_low(NULL, memcg)) { -+ /* see the comment on MEMCG_NR_GENS */ -+ if (seg != MEMCG_LRU_TAIL) -+ return MEMCG_LRU_TAIL; -+ -+ memcg_memory_event(memcg, MEMCG_LOW); -+ } -+ -+ success = try_to_shrink_lruvec(lruvec, sc); -+ -+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); -+ -+ if (!sc->proactive) -+ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, -+ sc->nr_reclaimed - reclaimed); -+ -+ sc->nr_reclaimed += current->reclaim_state->reclaimed_slab; -+ current->reclaim_state->reclaimed_slab = 0; -+ -+ return success ? MEMCG_LRU_YOUNG : 0; -+} -+ -+#ifdef CONFIG_MEMCG -+ -+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+ int op; -+ int gen; -+ int bin; -+ int first_bin; -+ struct lruvec *lruvec; -+ struct lru_gen_folio *lrugen; -+ struct mem_cgroup *memcg; -+ const struct hlist_nulls_node *pos; -+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); -+ -+ bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); -+restart: -+ op = 0; -+ memcg = NULL; -+ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); -+ -+ rcu_read_lock(); -+ -+ hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { -+ if (op) -+ lru_gen_rotate_memcg(lruvec, op); -+ -+ mem_cgroup_put(memcg); -+ -+ lruvec = container_of(lrugen, struct lruvec, lrugen); -+ memcg = lruvec_memcg(lruvec); -+ -+ if (!mem_cgroup_tryget(memcg)) { -+ op = 0; -+ memcg = NULL; - continue; -+ } - -- wmark = current_is_kswapd() ? 
high_wmark_pages(zone) : low_wmark_pages(zone); -- if (wmark > zone_page_state(zone, NR_FREE_PAGES)) -- return false; -+ rcu_read_unlock(); -+ -+ op = shrink_one(lruvec, sc); -+ -+ rcu_read_lock(); -+ -+ if (sc->nr_reclaimed >= nr_to_reclaim) -+ break; - } - -- sc->nr_reclaimed += MIN_LRU_BATCH; -+ rcu_read_unlock(); - -- return true; -+ if (op) -+ lru_gen_rotate_memcg(lruvec, op); -+ -+ mem_cgroup_put(memcg); -+ -+ if (sc->nr_reclaimed >= nr_to_reclaim) -+ return; -+ -+ /* restart if raced with lru_gen_rotate_memcg() */ -+ if (gen != get_nulls_value(pos)) -+ goto restart; -+ -+ /* try the rest of the bins of the current generation */ -+ bin = get_memcg_bin(bin + 1); -+ if (bin != first_bin) -+ goto restart; - } - - static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) - { - struct blk_plug plug; -- bool need_aging = false; -- bool need_swapping = false; -- unsigned long scanned = 0; -- unsigned long reclaimed = sc->nr_reclaimed; -- DEFINE_MAX_SEQ(lruvec); -+ -+ VM_WARN_ON_ONCE(global_reclaim(sc)); -+ VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); - - lru_add_drain(); - - blk_start_plug(&plug); - -- set_mm_walk(lruvec_pgdat(lruvec)); -+ set_mm_walk(NULL, sc->proactive); - -- while (true) { -- int delta; -- int swappiness; -- unsigned long nr_to_scan; -+ if (try_to_shrink_lruvec(lruvec, sc)) -+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); - -- if (sc->may_swap) -- swappiness = get_swappiness(lruvec, sc); -- else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) -- swappiness = 1; -- else -- swappiness = 0; -+ clear_mm_walk(); - -- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging); -- if (!nr_to_scan) -- goto done; -+ blk_finish_plug(&plug); -+} - -- delta = evict_folios(lruvec, sc, swappiness, &need_swapping); -- if (!delta) -- goto done; -+#else /* !CONFIG_MEMCG */ - -- scanned += delta; -- if (scanned >= nr_to_scan) -- break; -+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+ BUILD_BUG(); -+} - -- if (should_abort_scan(lruvec, max_seq, sc, need_swapping)) -- break; -+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -+{ -+ BUILD_BUG(); -+} - -- cond_resched(); -- } -+#endif -+ -+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+ int priority; -+ unsigned long reclaimable; -+ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); -+ -+ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) -+ return; -+ /* -+ * Determine the initial priority based on ((total / MEMCG_NR_GENS) >> -+ * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the -+ * estimated reclaimed_to_scanned_ratio = inactive / total. -+ */ -+ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); -+ if (get_swappiness(lruvec, sc)) -+ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); -+ -+ reclaimable /= MEMCG_NR_GENS; -+ -+ /* round down reclaimable and round up sc->nr_to_reclaim */ -+ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); -+ -+ sc->priority = clamp(priority, 0, DEF_PRIORITY); -+} -+ -+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+ struct blk_plug plug; -+ unsigned long reclaimed = sc->nr_reclaimed; -+ -+ VM_WARN_ON_ONCE(!global_reclaim(sc)); -+ -+ /* -+ * Unmapped clean folios are already prioritized. Scanning for more of -+ * them is likely futile and can cause high reclaim latency when there -+ * is a large number of memcgs. 
-+ */ -+ if (!sc->may_writepage || !sc->may_unmap) -+ goto done; -+ -+ lru_add_drain(); -+ -+ blk_start_plug(&plug); -+ -+ set_mm_walk(pgdat, sc->proactive); -+ -+ set_initial_priority(pgdat, sc); -+ -+ if (current_is_kswapd()) -+ sc->nr_reclaimed = 0; -+ -+ if (mem_cgroup_disabled()) -+ shrink_one(&pgdat->__lruvec, sc); -+ else -+ shrink_many(pgdat, sc); -+ -+ if (current_is_kswapd()) -+ sc->nr_reclaimed += reclaimed; - -- /* see the comment in lru_gen_age_node() */ -- if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging) -- sc->memcgs_need_aging = false; --done: - clear_mm_walk(); - - blk_finish_plug(&plug); -+done: -+ /* kswapd should never fail */ -+ pgdat->kswapd_failures = 0; - } - - /****************************************************************************** -@@ -5266,7 +5535,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc - - static bool __maybe_unused state_is_valid(struct lruvec *lruvec) - { -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - if (lrugen->enabled) { - enum lru_list lru; -@@ -5279,7 +5548,7 @@ static bool __maybe_unused state_is_valid(struct lruvec *lruvec) - int gen, type, zone; - - for_each_gen_type_zone(gen, type, zone) { -- if (!list_empty(&lrugen->lists[gen][type][zone])) -+ if (!list_empty(&lrugen->folios[gen][type][zone])) - return false; - } - } -@@ -5324,7 +5593,7 @@ static bool drain_evictable(struct lruvec *lruvec) - int remaining = MAX_LRU_BATCH; - - for_each_gen_type_zone(gen, type, zone) { -- struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; -+ struct list_head *head = &lruvec->lrugen.folios[gen][type][zone]; - - while (!list_empty(head)) { - bool success; -@@ -5402,14 +5671,14 @@ static void lru_gen_change_state(bool enabled) +@@ -5671,14 +5671,14 @@ static void lru_gen_change_state(bool enabled) * sysfs interface ******************************************************************************/ @@ -31467,7 +10363,7 @@ index 160acbbdf111..1a8f3b1c0bad 100644 { unsigned int msecs; -@@ -5421,11 +5690,9 @@ static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, +@@ -5690,11 +5690,9 @@ static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, return len; } @@ -31481,7 +10377,7 @@ index 160acbbdf111..1a8f3b1c0bad 100644 { unsigned int caps = 0; -@@ -5442,7 +5709,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c +@@ -5711,7 +5709,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ @@ -31490,7 +10386,7 @@ index 160acbbdf111..1a8f3b1c0bad 100644 const char *buf, size_t len) { int i; -@@ -5469,9 +5736,7 @@ static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, +@@ -5738,9 +5736,7 @@ static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, return len; } @@ -31501,7 +10397,7 @@ index 160acbbdf111..1a8f3b1c0bad 100644 static struct attribute *lru_gen_attrs[] = { &lru_gen_min_ttl_attr.attr, -@@ -5479,7 +5744,7 @@ static struct attribute *lru_gen_attrs[] = { +@@ -5748,7 +5744,7 @@ static struct attribute *lru_gen_attrs[] = { NULL }; @@ -31510,1959 +10406,2106 @@ index 160acbbdf111..1a8f3b1c0bad 100644 .name = "lru_gen", .attrs = lru_gen_attrs, }; -@@ -5545,7 +5810,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, - int i; - int type, tier; - int hist = lru_hist_from_seq(seq); -- struct 
lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - for (tier = 0; tier < MAX_NR_TIERS; tier++) { - seq_printf(m, " %10d", tier); -@@ -5595,7 +5860,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) - unsigned long seq; - bool full = !debugfs_real_fops(m->file)->write; - struct lruvec *lruvec = v; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - int nid = lruvec_pgdat(lruvec)->node_id; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); - DEFINE_MAX_SEQ(lruvec); -@@ -5692,7 +5957,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co - if (sc->nr_reclaimed >= nr_to_reclaim) - return 0; - -- if (!evict_folios(lruvec, sc, swappiness, NULL)) -+ if (!evict_folios(lruvec, sc, swappiness)) - return 0; - - cond_resched(); -@@ -5713,11 +5978,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, - - if (!mem_cgroup_disabled()) { - rcu_read_lock(); -+ - memcg = mem_cgroup_from_id(memcg_id); --#ifdef CONFIG_MEMCG -- if (memcg && !css_tryget(&memcg->css)) -+ if (!mem_cgroup_tryget(memcg)) - memcg = NULL; --#endif -+ - rcu_read_unlock(); - - if (!memcg) -@@ -5777,7 +6042,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, - set_task_reclaim_state(current, &sc.reclaim_state); - flags = memalloc_noreclaim_save(); - blk_start_plug(&plug); -- if (!set_mm_walk(NULL)) { -+ if (!set_mm_walk(NULL, true)) { - err = -ENOMEM; - goto done; - } -@@ -5849,7 +6114,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) - { - int i; - int gen, type, zone; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - lrugen->max_seq = MIN_NR_GENS + 1; - lrugen->enabled = lru_gen_enabled(); -@@ -5858,13 +6123,26 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) - lrugen->timestamps[i] = jiffies; - - for_each_gen_type_zone(gen, type, zone) -- INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); -+ INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); - - lruvec->mm_state.seq = MIN_NR_GENS; - init_waitqueue_head(&lruvec->mm_state.wait); - } - - #ifdef CONFIG_MEMCG -+ -+void lru_gen_init_pgdat(struct pglist_data *pgdat) -+{ -+ int i, j; -+ -+ spin_lock_init(&pgdat->memcg_lru.lock); -+ -+ for (i = 0; i < MEMCG_NR_GENS; i++) { -+ for (j = 0; j < MEMCG_NR_BINS; j++) -+ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); -+ } -+} -+ - void lru_gen_init_memcg(struct mem_cgroup *memcg) - { - INIT_LIST_HEAD(&memcg->mm_list.fifo); -@@ -5876,19 +6154,25 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) - int i; - int nid; - -+ VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo)); -+ - for_each_node(nid) { - struct lruvec *lruvec = get_lruvec(memcg, nid); - -+ VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers); - VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, - sizeof(lruvec->lrugen.nr_pages))); - -+ lruvec->lrugen.list.next = LIST_POISON1; -+ - for (i = 0; i < NR_BLOOM_FILTERS; i++) { - bitmap_free(lruvec->mm_state.filters[i]); - lruvec->mm_state.filters[i] = NULL; - } - } - } --#endif -+ -+#endif /* CONFIG_MEMCG */ - - static int __init init_lru_gen(void) - { -@@ -5915,6 +6199,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc - { - } - -+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+} -+ - #endif /* CONFIG_LRU_GEN */ - - static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -@@ -5928,7 +6216,7 @@ 
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) - bool proportional_reclaim; - struct blk_plug plug; - -- if (lru_gen_enabled()) { -+ if (lru_gen_enabled() && !global_reclaim(sc)) { - lru_gen_shrink_lruvec(lruvec, sc); - return; - } -@@ -6171,6 +6459,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) - struct lruvec *target_lruvec; - bool reclaimable = false; - -+ if (lru_gen_enabled() && global_reclaim(sc)) { -+ lru_gen_shrink_node(pgdat, sc); -+ return; -+ } -+ - target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); - - again: -diff --git a/mm/workingset.c b/mm/workingset.c -index 1a86645b7b3c..fd666584515c 100644 ---- a/mm/workingset.c -+++ b/mm/workingset.c -@@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct folio *folio) - unsigned long token; - unsigned long min_seq; - struct lruvec *lruvec; -- struct lru_gen_struct *lrugen; -+ struct lru_gen_folio *lrugen; - int type = folio_is_file_lru(folio); - int delta = folio_nr_pages(folio); - int refs = folio_lru_refs(folio); -@@ -252,7 +252,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) - unsigned long token; - unsigned long min_seq; - struct lruvec *lruvec; -- struct lru_gen_struct *lrugen; -+ struct lru_gen_folio *lrugen; - struct mem_cgroup *memcg; - struct pglist_data *pgdat; - int type = folio_is_file_lru(folio); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c -index 81fa7ec2e66a..1f36bc1c5d36 100644 +index 4c89ff333f6f..9286d3baa12d 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c -@@ -173,11 +173,11 @@ static noinline void check_new_node(struct maple_tree *mt) - - if (!MAPLE_32BIT) { - if (i >= 35) -- e = i - 35; -+ e = i - 34; - else if (i >= 5) -- e = i - 5; -+ e = i - 4; - else if (i >= 2) -- e = i - 2; -+ e = i - 1; - } else { - if (i >= 4) - e = i - 4; -@@ -305,17 +305,17 @@ static noinline void check_new_node(struct maple_tree *mt) - MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); -- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); -+ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); - - mn = mas_pop_node(&mas); /* get the next node. 
*/ - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS); -- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 2); -+ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); - - mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); -- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); -+ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); - - /* Check the limit of pop/push/pop */ - mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 2); /* Request */ -@@ -323,14 +323,14 @@ static noinline void check_new_node(struct maple_tree *mt) - MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - MT_BUG_ON(mt, mas_alloc_req(&mas)); -- MT_BUG_ON(mt, mas.alloc->node_count); -+ MT_BUG_ON(mt, mas.alloc->node_count != 1); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); -- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); -+ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); - mas_push_node(&mas, mn); -- MT_BUG_ON(mt, mas.alloc->node_count); -+ MT_BUG_ON(mt, mas.alloc->node_count != 1); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); --- -2.40.0.rc2 - -From d9e434e1093f450c71f9a327b2201f7bdcc75743 Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Fri, 17 Feb 2023 13:41:20 +0100 -Subject: [PATCH 11/16] mm/kvm: lockless accessed bit harvest - -TLDR -==== -This patchset RCU-protects KVM page tables and compare-and-exchanges -KVM PTEs with the accessed bit set by hardware. It significantly -improves the performance of guests when the host is under heavy -memory pressure. - -ChromeOS has been using a similar approach [1] since mid 2021 and it -was proven successful on tens of millions devices. - -[1] https://crrev.com/c/2987928 - -Overview -======== -The goal of this patchset is to optimize the performance of guests -when the host memory is overcommitted. It focuses on the vast -majority of VMs that are not nested and run on hardware that sets the -accessed bit in KVM page tables. - -Note that nested VMs and hardware that does not support the accessed -bit are both out of scope. - -This patchset relies on two techniques, RCU and cmpxchg, to safely -test and clear the accessed bit without taking kvm->mmu_lock. The -former protects KVM page tables from being freed while the latter -clears the accessed bit atomically against both hardware and other -software page table walkers. - -A new MMU notifier API, mmu_notifier_test_clear_young(), is -introduced. It follows two design patterns: fallback and batching. -For any unsupported cases, it can optionally fall back to -mmu_notifier_ops->clear_young(). For a range of KVM PTEs, it can test -or test and clear their accessed bits according to a bitmap provided -by the caller. - -This patchset only applies mmu_notifier_test_clear_young() to MGLRU. -A follow-up patchset will apply it to /proc/PID/pagemap and -/prod/PID/clear_refs. - -Evaluation -========== -An existing selftest can quickly demonstrate the effectiveness of -this patchset. 
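(For orientation before the numbers: the per-architecture kvm_arch_test_clear_young() walkers in this series all follow one bitmap convention. The caller indexes the bitmap by (lsb_gfn - gfn), and the walker clears a PTE's accessed bit with a compare-and-exchange only when it can first flip the corresponding bitmap bit. Below is a self-contained userspace model of that convention, not kernel code: ACCESSED_BIT, test_clear_young() and test_and_change_bit() are illustrative stand-ins, compiler atomic builtins stand in for cmpxchg64(), and reading the flipped bits as feedback to the caller is an assumption based on the walker code in this series.)

  /*
   * Userspace model of the test-and-clear-young bitmap convention:
   * a PTE's accessed bit is cleared only when the bitmap bit at
   * (lsb_gfn - gfn) was set by the caller, and that bit is flipped
   * either way, which lets a caller infer which PTEs were found young.
   */
  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define ACCESSED_BIT (1ULL << 10)  /* stand-in for the hardware accessed/AF bit */
  #define NR_GFNS      8

  /* models __test_and_change_bit(): flip bit nr, return its old value */
  static bool test_and_change_bit(unsigned long nr, unsigned long *bitmap)
  {
      unsigned long mask = 1UL << nr;
      bool old = *bitmap & mask;

      *bitmap ^= mask;
      return old;
  }

  static void test_clear_young(uint64_t *ptes, uint64_t start_gfn,
                               uint64_t lsb_gfn, unsigned long *bitmap)
  {
      for (uint64_t gfn = start_gfn; gfn < start_gfn + NR_GFNS; gfn++) {
          uint64_t old_pte = __atomic_load_n(&ptes[gfn - start_gfn], __ATOMIC_RELAXED);
          uint64_t new_pte = old_pte & ~ACCESSED_BIT;

          if (new_pte == old_pte)
              continue;  /* not young (or already clear): nothing to do */

          /* clear the accessed bit only for GFNs the caller asked about */
          if (test_and_change_bit(lsb_gfn - gfn, bitmap))
              __atomic_compare_exchange_n(&ptes[gfn - start_gfn], &old_pte, new_pte,
                                          false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
      }
  }

  int main(void)
  {
      uint64_t ptes[NR_GFNS] = { [0] = ACCESSED_BIT, [3] = ACCESSED_BIT, [5] = ACCESSED_BIT };
      unsigned long bitmap = 0;
      uint64_t start_gfn = 100, lsb_gfn = start_gfn + NR_GFNS - 1;

      /* the caller asks about gfn 100 and gfn 103 only */
      bitmap |= 1UL << (lsb_gfn - 100);
      bitmap |= 1UL << (lsb_gfn - 103);

      test_clear_young(ptes, start_gfn, lsb_gfn, &bitmap);

      for (int i = 0; i < NR_GFNS; i++)
          printf("gfn %llu: accessed=%d\n",
                 (unsigned long long)(start_gfn + i), !!(ptes[i] & ACCESSED_BIT));
      return 0;
  }

It builds with any C11-era compiler (e.g. cc -O2 model.c) and prints which model PTEs still have the accessed bit set; gfn 105 stays young because the caller did not request it.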
On a generic workstation equipped with 64 CPUs and -256GB DRAM: - - $ sudo max_guest_memory_test -c 64 -m 256 -s 256 - - MGLRU run2 - --------------- - Before ~600s - After ~50s - Off ~250s - - kswapd (MGLRU before) - 100.00% balance_pgdat - 100.00% shrink_node - 100.00% shrink_one - 99.97% try_to_shrink_lruvec - 99.06% evict_folios - 97.41% shrink_folio_list - 31.33% folio_referenced - 31.06% rmap_walk_file - 30.89% folio_referenced_one - 20.83% __mmu_notifier_clear_flush_young - 20.54% kvm_mmu_notifier_clear_flush_young - => 19.34% _raw_write_lock - - kswapd (MGLRU after) - 100.00% balance_pgdat - 100.00% shrink_node - 100.00% shrink_one - 99.97% try_to_shrink_lruvec - 99.51% evict_folios - 71.70% shrink_folio_list - 7.08% folio_referenced - 6.78% rmap_walk_file - 6.72% folio_referenced_one - 5.60% lru_gen_look_around - => 1.53% __mmu_notifier_test_clear_young - - kswapd (MGLRU off) - 100.00% balance_pgdat - 100.00% shrink_node - 99.92% shrink_lruvec - 69.95% shrink_folio_list - 19.35% folio_referenced - 18.37% rmap_walk_file - 17.88% folio_referenced_one - 13.20% __mmu_notifier_clear_flush_young - 11.64% kvm_mmu_notifier_clear_flush_young - => 9.93% _raw_write_lock - 26.23% shrink_active_list - 25.50% folio_referenced - 25.35% rmap_walk_file - 25.28% folio_referenced_one - 23.87% __mmu_notifier_clear_flush_young - 23.69% kvm_mmu_notifier_clear_flush_young - => 18.98% _raw_write_lock - -Comprehensive benchmarks are coming soon. - -Yu Zhao (5): - mm/kvm: add mmu_notifier_test_clear_young() - kvm/x86: add kvm_arch_test_clear_young() - kvm/arm64: add kvm_arch_test_clear_young() - kvm/powerpc: add kvm_arch_test_clear_young() - mm: multi-gen LRU: use mmu_notifier_test_clear_young() - -Signed-off-by: Peter Jung ---- - arch/arm64/include/asm/kvm_host.h | 7 ++ - arch/arm64/include/asm/kvm_pgtable.h | 8 ++ - arch/arm64/include/asm/stage2_pgtable.h | 43 ++++++++ - arch/arm64/kvm/arm.c | 1 + - arch/arm64/kvm/hyp/pgtable.c | 51 ++-------- - arch/arm64/kvm/mmu.c | 77 +++++++++++++- - arch/powerpc/include/asm/kvm_host.h | 18 ++++ - arch/powerpc/include/asm/kvm_ppc.h | 14 +-- - arch/powerpc/kvm/book3s.c | 7 ++ - arch/powerpc/kvm/book3s.h | 2 + - arch/powerpc/kvm/book3s_64_mmu_radix.c | 78 ++++++++++++++- - arch/powerpc/kvm/book3s_hv.c | 10 +- - arch/x86/include/asm/kvm_host.h | 27 +++++ - arch/x86/kvm/mmu/spte.h | 12 --- - arch/x86/kvm/mmu/tdp_mmu.c | 41 ++++++++ - include/linux/kvm_host.h | 29 ++++++ - include/linux/mmu_notifier.h | 40 ++++++++ - include/linux/mmzone.h | 6 +- - mm/mmu_notifier.c | 26 +++++ - mm/rmap.c | 8 +- - mm/vmscan.c | 127 +++++++++++++++++++++--- - virt/kvm/kvm_main.c | 58 +++++++++++ - 22 files changed, 593 insertions(+), 97 deletions(-) - -diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h -index 35a159d131b5..572bcd321586 100644 ---- a/arch/arm64/include/asm/kvm_host.h -+++ b/arch/arm64/include/asm/kvm_host.h -@@ -1031,4 +1031,11 @@ static inline void kvm_hyp_reserve(void) { } - void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu); - bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu); - -+/* see the comments on the generic kvm_arch_has_test_clear_young() */ -+#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young -+static inline bool kvm_arch_has_test_clear_young(void) -+{ -+ return IS_ENABLED(CONFIG_KVM) && cpu_has_hw_af() && !is_protected_kvm_enabled(); -+} -+ - #endif /* __ARM64_KVM_HOST_H__ */ -diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h -index 63f81b27a4e3..8c9a04388c88 100644 ---- 
a/arch/arm64/include/asm/kvm_pgtable.h -+++ b/arch/arm64/include/asm/kvm_pgtable.h -@@ -105,6 +105,7 @@ static inline bool kvm_level_supports_block_mapping(u32 level) - * @put_page: Decrement the refcount on a page. When the - * refcount reaches 0 the page is automatically - * freed. -+ * @put_page_rcu: RCU variant of put_page(). - * @page_count: Return the refcount of a page. - * @phys_to_virt: Convert a physical address into a virtual - * address mapped in the current context. -@@ -122,6 +123,7 @@ struct kvm_pgtable_mm_ops { - void (*free_removed_table)(void *addr, u32 level); - void (*get_page)(void *addr); - void (*put_page)(void *addr); -+ void (*put_page_rcu)(void *addr); - int (*page_count)(void *addr); - void* (*phys_to_virt)(phys_addr_t phys); - phys_addr_t (*virt_to_phys)(void *addr); -@@ -188,6 +190,12 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, - * children. - * @KVM_PGTABLE_WALK_SHARED: Indicates the page-tables may be shared - * with other software walkers. -+ * -+ * kvm_arch_test_clear_young() is a special case. It relies on two -+ * techniques, RCU and cmpxchg, to safely test and clear the accessed -+ * bit without taking the MMU lock. The former protects KVM page tables -+ * from being freed while the latter clears the accessed bit atomically -+ * against both the hardware and other software page table walkers. - */ - enum kvm_pgtable_walk_flags { - KVM_PGTABLE_WALK_LEAF = BIT(0), -diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h -index c8dca8ae359c..350437661d4b 100644 ---- a/arch/arm64/include/asm/stage2_pgtable.h -+++ b/arch/arm64/include/asm/stage2_pgtable.h -@@ -30,4 +30,47 @@ - */ - #define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) - -+#define KVM_PTE_TYPE BIT(1) -+#define KVM_PTE_TYPE_BLOCK 0 -+#define KVM_PTE_TYPE_PAGE 1 -+#define KVM_PTE_TYPE_TABLE 1 -+ -+#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) -+ -+#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) -+#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) -+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 -+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 -+#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) -+#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 -+#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) -+ -+#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) -+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) -+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) -+#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) -+#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 -+#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) -+ -+#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) -+ -+#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) -+ -+#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) -+ -+#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) -+ -+#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ -+ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ -+ KVM_PTE_LEAF_ATTR_HI_S2_XN) -+ -+#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) -+#define KVM_MAX_OWNER_ID 1 -+ -+/* -+ * Used to indicate a pte for which a 'break-before-make' sequence is in -+ * progress. 
-+ */ -+#define KVM_INVALID_PTE_LOCKED BIT(10) -+ - #endif /* __ARM64_S2_PGTABLE_H_ */ -diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c -index 9c5573bc4614..6770bc47f5c9 100644 ---- a/arch/arm64/kvm/arm.c -+++ b/arch/arm64/kvm/arm.c -@@ -191,6 +191,7 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) - */ - void kvm_arch_destroy_vm(struct kvm *kvm) - { -+ kvm_free_stage2_pgd(&kvm->arch.mmu); - bitmap_free(kvm->arch.pmu_filter); - free_cpumask_var(kvm->arch.supported_cpus); - -diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c -index b11cf2c618a6..8d65ee4767f1 100644 ---- a/arch/arm64/kvm/hyp/pgtable.c -+++ b/arch/arm64/kvm/hyp/pgtable.c -@@ -12,49 +12,6 @@ - #include - - --#define KVM_PTE_TYPE BIT(1) --#define KVM_PTE_TYPE_BLOCK 0 --#define KVM_PTE_TYPE_PAGE 1 --#define KVM_PTE_TYPE_TABLE 1 -- --#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) -- --#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) --#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) --#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 --#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 --#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) --#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 --#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) -- --#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) --#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) --#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) --#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) --#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 --#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) -- --#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) -- --#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) -- --#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) -- --#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) -- --#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ -- KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ -- KVM_PTE_LEAF_ATTR_HI_S2_XN) -- --#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) --#define KVM_MAX_OWNER_ID 1 -- --/* -- * Used to indicate a pte for which a 'break-before-make' sequence is in -- * progress. 
-- */ --#define KVM_INVALID_PTE_LOCKED BIT(10) -- - struct kvm_pgtable_walk_data { - struct kvm_pgtable_walker *walker; - -@@ -994,8 +951,12 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, - mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), - kvm_granule_size(ctx->level)); - -- if (childp) -- mm_ops->put_page(childp); -+ if (childp) { -+ if (mm_ops->put_page_rcu) -+ mm_ops->put_page_rcu(childp); -+ else -+ mm_ops->put_page(childp); -+ } - - return 0; - } -diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c -index a3ee3b605c9b..761fffc788f5 100644 ---- a/arch/arm64/kvm/mmu.c -+++ b/arch/arm64/kvm/mmu.c -@@ -171,6 +171,21 @@ static int kvm_host_page_count(void *addr) - return page_count(virt_to_page(addr)); - } - -+static void kvm_s2_rcu_put_page(struct rcu_head *head) -+{ -+ put_page(container_of(head, struct page, rcu_head)); -+} -+ -+static void kvm_s2_put_page_rcu(void *addr) -+{ -+ struct page *page = virt_to_page(addr); -+ -+ if (kvm_host_page_count(addr) == 1) -+ kvm_account_pgtable_pages(addr, -1); -+ -+ call_rcu(&page->rcu_head, kvm_s2_rcu_put_page); -+} -+ - static phys_addr_t kvm_host_pa(void *addr) - { - return __pa(addr); -@@ -684,6 +699,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { - .free_removed_table = stage2_free_removed_table, - .get_page = kvm_host_get_page, - .put_page = kvm_s2_put_page, -+ .put_page_rcu = kvm_s2_put_page_rcu, - .page_count = kvm_host_page_count, - .phys_to_virt = kvm_host_va, - .virt_to_phys = kvm_host_pa, -@@ -1624,6 +1640,66 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) - return pte_valid(pte) && pte_young(pte); - } - -+struct test_clear_young_arg { -+ struct kvm_gfn_range *range; -+ gfn_t lsb_gfn; -+ unsigned long *bitmap; -+}; -+ -+static int stage2_test_clear_young(const struct kvm_pgtable_visit_ctx *ctx, -+ enum kvm_pgtable_walk_flags flags) -+{ -+ struct test_clear_young_arg *arg = ctx->arg; -+ gfn_t gfn = ctx->addr / PAGE_SIZE; -+ kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF; -+ -+ VM_WARN_ON_ONCE(!page_count(virt_to_page(ctx->ptep))); -+ VM_WARN_ON_ONCE(gfn < arg->range->start || gfn >= arg->range->end); -+ -+ if (!kvm_pte_valid(new)) -+ return 0; -+ -+ if (new == ctx->old) -+ return 0; -+ -+ /* see the comments on the generic kvm_arch_has_test_clear_young() */ -+ if (__test_and_change_bit(arg->lsb_gfn - gfn, arg->bitmap)) -+ cmpxchg64(ctx->ptep, ctx->old, new); -+ -+ return 0; -+} -+ -+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, -+ gfn_t lsb_gfn, unsigned long *bitmap) -+{ -+ u64 start = range->start * PAGE_SIZE; -+ u64 end = range->end * PAGE_SIZE; -+ struct test_clear_young_arg arg = { -+ .range = range, -+ .lsb_gfn = lsb_gfn, -+ .bitmap = bitmap, -+ }; -+ struct kvm_pgtable_walker walker = { -+ .cb = stage2_test_clear_young, -+ .arg = &arg, -+ .flags = KVM_PGTABLE_WALK_LEAF, -+ }; -+ -+ BUILD_BUG_ON(is_hyp_code()); -+ -+ if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young())) -+ return false; -+ -+ /* see the comments on kvm_pgtable_walk_flags */ -+ rcu_read_lock(); -+ -+ kvm_pgtable_walk(kvm->arch.mmu.pgt, start, end - start, &walker); -+ -+ rcu_read_unlock(); -+ -+ return true; -+} -+ - bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) - { - if (!kvm->arch.mmu.pgt) -@@ -1848,7 +1924,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) - - void kvm_arch_flush_shadow_all(struct kvm *kvm) - { -- kvm_free_stage2_pgd(&kvm->arch.mmu); - } - - void kvm_arch_flush_shadow_memslot(struct kvm *kvm, -diff 
--git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h -index caea15dcb91d..996850029ce0 100644 ---- a/arch/powerpc/include/asm/kvm_host.h -+++ b/arch/powerpc/include/asm/kvm_host.h -@@ -886,4 +886,22 @@ static inline void kvm_arch_exit(void) {} - static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} - static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} - -+static inline int kvmppc_radix_possible(void) -+{ -+ return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled(); -+} -+ -+static inline bool kvmhv_on_pseries(void) -+{ -+ return IS_ENABLED(CONFIG_PPC_PSERIES) && !cpu_has_feature(CPU_FTR_HVMODE); -+} -+ -+/* see the comments on the generic kvm_arch_has_test_clear_young() */ -+#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young -+static inline bool kvm_arch_has_test_clear_young(void) -+{ -+ return IS_ENABLED(CONFIG_KVM) && IS_ENABLED(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && -+ kvmppc_radix_possible() && !kvmhv_on_pseries(); -+} -+ - #endif /* __POWERPC_KVM_HOST_H__ */ -diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h -index eae9619b6190..0bb772fc12b1 100644 ---- a/arch/powerpc/include/asm/kvm_ppc.h -+++ b/arch/powerpc/include/asm/kvm_ppc.h -@@ -277,6 +277,8 @@ struct kvmppc_ops { - bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range); - bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); - bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); -+ bool (*test_clear_young)(struct kvm *kvm, struct kvm_gfn_range *range, -+ gfn_t lsb_gfn, unsigned long *bitmap); - bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); - void (*free_memslot)(struct kvm_memory_slot *slot); - int (*init_vm)(struct kvm *kvm); -@@ -580,18 +582,6 @@ static inline bool kvm_hv_mode_active(void) { return false; } - - #endif - --#ifdef CONFIG_PPC_PSERIES --static inline bool kvmhv_on_pseries(void) --{ -- return !cpu_has_feature(CPU_FTR_HVMODE); --} --#else --static inline bool kvmhv_on_pseries(void) --{ -- return false; --} --#endif -- - #ifdef CONFIG_KVM_XICS - static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) - { -diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c -index 6d525285dbe8..f4cf330e3e81 100644 ---- a/arch/powerpc/kvm/book3s.c -+++ b/arch/powerpc/kvm/book3s.c -@@ -877,6 +877,13 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) - return kvm->arch.kvm_ops->test_age_gfn(kvm, range); - } - -+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, -+ gfn_t lsb_gfn, unsigned long *bitmap) -+{ -+ return kvm->arch.kvm_ops->test_clear_young && -+ kvm->arch.kvm_ops->test_clear_young(kvm, range, lsb_gfn, bitmap); -+} -+ - bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) - { - return kvm->arch.kvm_ops->set_spte_gfn(kvm, range); -diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h -index 58391b4b32ed..fe9cac423817 100644 ---- a/arch/powerpc/kvm/book3s.h -+++ b/arch/powerpc/kvm/book3s.h -@@ -12,6 +12,8 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, - extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range); - extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); - extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); -+extern bool kvmhv_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, -+ gfn_t lsb_gfn, unsigned long *bitmap); - extern bool kvm_set_spte_gfn_hv(struct kvm 
*kvm, struct kvm_gfn_range *range); - - extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu); -diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c -index 9d3743ca16d5..8476646c554c 100644 ---- a/arch/powerpc/kvm/book3s_64_mmu_radix.c -+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c -@@ -1083,6 +1083,78 @@ bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - return ref; - } - -+bool kvmhv_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, -+ gfn_t lsb_gfn, unsigned long *bitmap) -+{ -+ bool success; -+ gfn_t gfn = range->start; -+ -+ if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young())) -+ return false; -+ -+ /* -+ * This function relies on two techniques, RCU and cmpxchg, to safely -+ * test and clear the accessed bit without taking the MMU lock. The -+ * former protects KVM page tables from being freed while the latter -+ * clears the accessed bit atomically against both the hardware and -+ * other software page table walkers. -+ */ -+ rcu_read_lock(); -+ -+ success = kvm_is_radix(kvm); -+ if (!success) -+ goto unlock; -+ -+ /* -+ * case 1: this function kvmppc_switch_mmu_to_hpt() -+ * -+ * rcu_read_lock() -+ * test kvm_is_radix() kvm->arch.radix = 0 -+ * use kvm->arch.pgtable -+ * rcu_read_unlock() -+ * synchronize_rcu() -+ * kvmppc_free_radix() -+ * -+ * -+ * case 2: this function kvmppc_switch_mmu_to_radix() -+ * -+ * kvmppc_init_vm_radix() -+ * smp_wmb() -+ * test kvm_is_radix() kvm->arch.radix = 1 -+ * smp_rmb() -+ * use kvm->arch.pgtable -+ */ -+ smp_rmb(); -+ -+ while (gfn < range->end) { -+ pte_t *ptep; -+ pte_t old, new; -+ unsigned int shift; -+ -+ ptep = find_kvm_secondary_pte_unlocked(kvm, gfn * PAGE_SIZE, &shift); -+ if (!ptep) -+ goto next; -+ -+ VM_WARN_ON_ONCE(!page_count(virt_to_page(ptep))); -+ -+ old = READ_ONCE(*ptep); -+ if (!pte_present(old) || !pte_young(old)) -+ goto next; -+ -+ new = pte_mkold(old); -+ -+ /* see the comments on the generic kvm_arch_has_test_clear_young() */ -+ if (__test_and_change_bit(lsb_gfn - gfn, bitmap)) -+ pte_xchg(ptep, old, new); -+next: -+ gfn += shift ? 
BIT(shift - PAGE_SHIFT) : 1; -+ } -+unlock: -+ rcu_read_unlock(); -+ -+ return success; -+} -+ - /* Returns the number of PAGE_SIZE pages that are dirty */ - static int kvm_radix_test_clear_dirty(struct kvm *kvm, - struct kvm_memory_slot *memslot, int pagenum) -@@ -1464,13 +1536,15 @@ int kvmppc_radix_init(void) - { - unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE; - -- kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor); -+ kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, -+ SLAB_TYPESAFE_BY_RCU, pte_ctor); - if (!kvm_pte_cache) - return -ENOMEM; - - size = sizeof(void *) << RADIX_PMD_INDEX_SIZE; - -- kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor); -+ kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, -+ SLAB_TYPESAFE_BY_RCU, pmd_ctor); - if (!kvm_pmd_cache) { - kmem_cache_destroy(kvm_pte_cache); - return -ENOMEM; -diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c -index 6ba68dd6190b..17b415661282 100644 ---- a/arch/powerpc/kvm/book3s_hv.c -+++ b/arch/powerpc/kvm/book3s_hv.c -@@ -5242,6 +5242,8 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) - spin_lock(&kvm->mmu_lock); - kvm->arch.radix = 0; - spin_unlock(&kvm->mmu_lock); -+ /* see the comments in kvmhv_test_clear_young() */ -+ synchronize_rcu(); - kvmppc_free_radix(kvm); - - lpcr = LPCR_VPM1; -@@ -5266,6 +5268,8 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm) - if (err) - return err; - kvmppc_rmap_reset(kvm); -+ /* see the comments in kvmhv_test_clear_young() */ -+ smp_wmb(); - /* Mutual exclusion with kvm_unmap_gfn_range etc. */ - spin_lock(&kvm->mmu_lock); - kvm->arch.radix = 1; -@@ -6165,6 +6169,7 @@ static struct kvmppc_ops kvm_ops_hv = { - .unmap_gfn_range = kvm_unmap_gfn_range_hv, - .age_gfn = kvm_age_gfn_hv, - .test_age_gfn = kvm_test_age_gfn_hv, -+ .test_clear_young = kvmhv_test_clear_young, - .set_spte_gfn = kvm_set_spte_gfn_hv, - .free_memslot = kvmppc_core_free_memslot_hv, - .init_vm = kvmppc_core_init_vm_hv, -@@ -6225,11 +6230,6 @@ static int kvm_init_subcore_bitmap(void) - return 0; - } - --static int kvmppc_radix_possible(void) --{ -- return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled(); --} -- - static int kvmppc_book3s_init_hv(void) - { - int r; -diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h -index 6aaae18f1854..d2995c9e8f07 100644 ---- a/arch/x86/include/asm/kvm_host.h -+++ b/arch/x86/include/asm/kvm_host.h -@@ -1367,6 +1367,12 @@ struct kvm_arch { - * the MMU lock in read mode + the tdp_mmu_pages_lock or - * the MMU lock in write mode - * -+ * kvm_arch_test_clear_young() is a special case. It relies on two -+ * techniques, RCU and cmpxchg, to safely test and clear the accessed -+ * bit without taking the MMU lock. The former protects KVM page tables -+ * from being freed while the latter clears the accessed bit atomically -+ * against both the hardware and other software page table walkers. -+ * - * Roots will remain in the list until their tdp_mmu_root_count - * drops to zero, at which point the thread that decremented the - * count to zero should removed the root from the list and clean -@@ -2171,4 +2177,25 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); - KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ - KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) - -+extern u64 __read_mostly shadow_accessed_mask; -+ -+/* -+ * Returns true if A/D bits are supported in hardware and are enabled by KVM. -+ * When enabled, KVM uses A/D bits for all non-nested MMUs. 
Because L1 can -+ * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the -+ * scenario where KVM is using A/D bits for L1, but not L2. -+ */ -+static inline bool kvm_ad_enabled(void) -+{ -+ return shadow_accessed_mask; -+} -+ -+/* see the comments on the generic kvm_arch_has_test_clear_young() */ -+#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young -+static inline bool kvm_arch_has_test_clear_young(void) -+{ -+ return IS_ENABLED(CONFIG_KVM) && IS_ENABLED(CONFIG_X86_64) && -+ (!IS_REACHABLE(CONFIG_KVM) || (kvm_ad_enabled() && tdp_enabled)); -+} -+ - #endif /* _ASM_X86_KVM_HOST_H */ -diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h -index 6f54dc9409c9..0dc7fed1f3fd 100644 ---- a/arch/x86/kvm/mmu/spte.h -+++ b/arch/x86/kvm/mmu/spte.h -@@ -153,7 +153,6 @@ extern u64 __read_mostly shadow_mmu_writable_mask; - extern u64 __read_mostly shadow_nx_mask; - extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ - extern u64 __read_mostly shadow_user_mask; --extern u64 __read_mostly shadow_accessed_mask; - extern u64 __read_mostly shadow_dirty_mask; - extern u64 __read_mostly shadow_mmio_value; - extern u64 __read_mostly shadow_mmio_mask; -@@ -247,17 +246,6 @@ static inline bool is_shadow_present_pte(u64 pte) - return !!(pte & SPTE_MMU_PRESENT_MASK); - } - --/* -- * Returns true if A/D bits are supported in hardware and are enabled by KVM. -- * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can -- * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the -- * scenario where KVM is using A/D bits for L1, but not L2. -- */ --static inline bool kvm_ad_enabled(void) --{ -- return !!shadow_accessed_mask; --} -- - static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) - { - return sp->role.ad_disabled; -diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c -index d6df38d371a0..9028e09f1aab 100644 ---- a/arch/x86/kvm/mmu/tdp_mmu.c -+++ b/arch/x86/kvm/mmu/tdp_mmu.c -@@ -1309,6 +1309,47 @@ bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) - return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); - } - -+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, -+ gfn_t lsb_gfn, unsigned long *bitmap) -+{ -+ struct kvm_mmu_page *root; -+ -+ if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young())) -+ return false; -+ -+ if (kvm_memslots_have_rmaps(kvm)) -+ return false; -+ -+ /* see the comments on kvm_arch->tdp_mmu_roots */ -+ rcu_read_lock(); -+ -+ list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) { -+ struct tdp_iter iter; -+ -+ if (kvm_mmu_page_as_id(root) != range->slot->as_id) -+ continue; -+ -+ tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) { -+ u64 *sptep = rcu_dereference(iter.sptep); -+ u64 new_spte = iter.old_spte & ~shadow_accessed_mask; -+ -+ VM_WARN_ON_ONCE(!page_count(virt_to_page(sptep))); -+ VM_WARN_ON_ONCE(iter.gfn < range->start || iter.gfn >= range->end); -+ -+ if (new_spte == iter.old_spte) -+ continue; -+ -+ /* see the comments on the generic kvm_arch_has_test_clear_young() */ -+ if (__test_and_change_bit(lsb_gfn - iter.gfn, bitmap)) -+ cmpxchg64(sptep, iter.old_spte, new_spte); -+ } -+ } -+ -+ rcu_read_unlock(); -+ -+ return true; -+} -+ - static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range) - { -diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h -index 4f26b244f6d0..df46fc815c8b 100644 ---- a/include/linux/kvm_host.h -+++ 
b/include/linux/kvm_host.h -@@ -2281,4 +2281,33 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) - /* Max number of entries allowed for each kvm dirty ring */ - #define KVM_DIRTY_RING_MAX_ENTRIES 65536 - -+/* -+ * Architectures that implement kvm_arch_test_clear_young() should override -+ * kvm_arch_has_test_clear_young(). -+ * -+ * kvm_arch_has_test_clear_young() is allowed to return false positive. It can -+ * return true if kvm_arch_test_clear_young() is supported but disabled due to -+ * some runtime constraint. In this case, kvm_arch_test_clear_young() should -+ * return false. -+ * -+ * The last parameter to kvm_arch_test_clear_young() is a bitmap with the -+ * following specifications: -+ * 1. The offset of each bit is relative to the second to the last parameter -+ * lsb_gfn. E.g., the offset corresponding to gfn is lsb_gfn-gfn. This is to -+ * better suit batching while forward looping. -+ * 2. For each KVM PTE with the accessed bit set, the implementation should flip -+ * the corresponding bit in the bitmap. It should only clear the accessed bit -+ * if the old value is 1. This allows the caller to test or test and clear -+ * the accessed bit. -+ */ -+#ifndef kvm_arch_has_test_clear_young -+static inline bool kvm_arch_has_test_clear_young(void) -+{ -+ return false; -+} -+#endif -+ -+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, -+ gfn_t lsb_gfn, unsigned long *bitmap); -+ - #endif -diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h -index d6c06e140277..521f71ad0467 100644 ---- a/include/linux/mmu_notifier.h -+++ b/include/linux/mmu_notifier.h -@@ -122,6 +122,11 @@ struct mmu_notifier_ops { - struct mm_struct *mm, - unsigned long address); - -+ /* see the comments on mmu_notifier_test_clear_young() */ -+ bool (*test_clear_young)(struct mmu_notifier *mn, struct mm_struct *mm, -+ unsigned long start, unsigned long end, -+ unsigned long *bitmap); -+ - /* - * change_pte is called in cases that pte mapping to page is changed: - * for example, when ksm remaps pte to point to a new shared page. -@@ -391,6 +396,9 @@ extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, - extern int __mmu_notifier_clear_young(struct mm_struct *mm, - unsigned long start, - unsigned long end); -+extern int __mmu_notifier_test_clear_young(struct mm_struct *mm, -+ unsigned long start, unsigned long end, -+ bool fallback, unsigned long *bitmap); - extern int __mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address); - extern void __mmu_notifier_change_pte(struct mm_struct *mm, -@@ -433,6 +441,31 @@ static inline int mmu_notifier_clear_young(struct mm_struct *mm, - return 0; - } - -+/* -+ * This function always returns 0 if fallback is not allowed. If fallback -+ * happens, its return value is similar to that of mmu_notifier_clear_young(). -+ * -+ * The bitmap has the following specifications: -+ * 1. The number of bits should be at least (end-start)/PAGE_SIZE. -+ * 2. The offset of each bit is relative to the end. E.g., the offset -+ * corresponding to addr is (end-addr)/PAGE_SIZE-1. This is to better suit -+ * batching while forward looping. -+ * 3. For each KVM PTE with the accessed bit set (young), this function flips -+ * the corresponding bit in the bitmap. It only clears the accessed bit if -+ * the old value is 1. A caller can test or test and clear the accessed bit -+ * by setting the corresponding bit in the bitmap to 0 or 1, and the new -+ * value will be 1 or 0 for a young KVM PTE. 
-+ */ -+static inline int mmu_notifier_test_clear_young(struct mm_struct *mm, -+ unsigned long start, unsigned long end, -+ bool fallback, unsigned long *bitmap) -+{ -+ if (mm_has_notifiers(mm)) -+ return __mmu_notifier_test_clear_young(mm, start, end, fallback, bitmap); -+ -+ return 0; -+} -+ - static inline int mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address) - { -@@ -687,6 +720,13 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, - return 0; - } - -+static inline int mmu_notifier_test_clear_young(struct mm_struct *mm, -+ unsigned long start, unsigned long end, -+ bool fallback, unsigned long *bitmap) -+{ -+ return 0; -+} -+ - static inline int mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address) - { -diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index 70bd7f55bdd2..0ddbf712708d 100644 ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -379,6 +379,7 @@ enum { - LRU_GEN_CORE, - LRU_GEN_MM_WALK, - LRU_GEN_NONLEAF_YOUNG, -+ LRU_GEN_SPTE_WALK, - NR_LRU_GEN_CAPS +@@ -55,6 +55,28 @@ struct rcu_reader_struct { + struct rcu_test_struct2 *test; }; -@@ -485,7 +486,7 @@ struct lru_gen_mm_walk { - }; - - void lru_gen_init_lruvec(struct lruvec *lruvec); --void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); -+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw); - - #ifdef CONFIG_MEMCG - -@@ -573,8 +574,9 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) - { - } - --static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) -+static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - { -+ return false; - } - - #ifdef CONFIG_MEMCG -diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c -index f45ff1b7626a..324799848fed 100644 ---- a/mm/mmu_notifier.c -+++ b/mm/mmu_notifier.c -@@ -402,6 +402,32 @@ int __mmu_notifier_clear_young(struct mm_struct *mm, - return young; - } - -+/* see the comments on mmu_notifier_test_clear_young() */ -+int __mmu_notifier_test_clear_young(struct mm_struct *mm, -+ unsigned long start, unsigned long end, -+ bool fallback, unsigned long *bitmap) ++static int get_alloc_node_count(struct ma_state *mas) +{ -+ int key; -+ struct mmu_notifier *mn; -+ int young = 0; ++ int count = 1; ++ struct maple_alloc *node = mas->alloc; + -+ key = srcu_read_lock(&srcu); -+ -+ hlist_for_each_entry_srcu(mn, &mm->notifier_subscriptions->list, -+ hlist, srcu_read_lock_held(&srcu)) { -+ if (mn->ops->test_clear_young && -+ mn->ops->test_clear_young(mn, mm, start, end, bitmap)) -+ continue; -+ -+ if (fallback && mn->ops->clear_young) -+ young |= mn->ops->clear_young(mn, mm, start, end); ++ if (!node || ((unsigned long)node & 0x1)) ++ return 0; ++ while (node->node_count) { ++ count += node->node_count; ++ node = node->slot[0]; + } -+ -+ srcu_read_unlock(&srcu, key); -+ -+ return young; ++ return count; +} + - int __mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address) - { -diff --git a/mm/rmap.c b/mm/rmap.c -index c8701608bb0d..8ecbbadab752 100644 ---- a/mm/rmap.c -+++ b/mm/rmap.c -@@ -822,12 +822,10 @@ static bool folio_referenced_one(struct folio *folio, - return false; /* To break the loop */ - } - -- if (pvmw.pte) { -- if (lru_gen_enabled() && pte_young(*pvmw.pte)) { -- lru_gen_look_around(&pvmw); -+ if (lru_gen_enabled() && pvmw.pte) { -+ if (lru_gen_look_around(&pvmw)) - referenced++; -- } -- -+ } else if (pvmw.pte) { - if (ptep_clear_flush_young_notify(vma, address, - pvmw.pte)) - referenced++; -diff --git a/mm/vmscan.c 
b/mm/vmscan.c -index 1a8f3b1c0bad..ec0142165ce7 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -57,6 +57,8 @@ - #include - #include - #include -+#include -+#include - - #include - #include -@@ -3927,6 +3929,55 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, - return folio; - } - -+static bool test_spte_young(struct mm_struct *mm, unsigned long addr, unsigned long end, -+ unsigned long *bitmap, unsigned long *last) ++static void check_mas_alloc_node_count(struct ma_state *mas) +{ -+ if (!kvm_arch_has_test_clear_young() || !get_cap(LRU_GEN_SPTE_WALK)) -+ return false; -+ -+ if (*last > addr) -+ goto done; -+ -+ *last = end - addr > MIN_LRU_BATCH * PAGE_SIZE ? -+ addr + MIN_LRU_BATCH * PAGE_SIZE - 1 : end - 1; -+ bitmap_zero(bitmap, MIN_LRU_BATCH); -+ -+ mmu_notifier_test_clear_young(mm, addr, *last + 1, false, bitmap); -+done: -+ return test_bit((*last - addr) / PAGE_SIZE, bitmap); -+} -+ -+static void clear_spte_young(struct mm_struct *mm, unsigned long addr, -+ unsigned long *bitmap, unsigned long *last) -+{ -+ int i; -+ unsigned long start, end = *last + 1; -+ -+ if (addr + PAGE_SIZE != end) -+ return; -+ -+ i = find_last_bit(bitmap, MIN_LRU_BATCH); -+ if (i == MIN_LRU_BATCH) -+ return; -+ -+ start = end - (i + 1) * PAGE_SIZE; -+ -+ i = find_first_bit(bitmap, MIN_LRU_BATCH); -+ -+ end -= i * PAGE_SIZE; -+ -+ mmu_notifier_test_clear_young(mm, start, end, false, bitmap); -+} -+ -+static void skip_spte_young(struct mm_struct *mm, unsigned long addr, -+ unsigned long *bitmap, unsigned long *last) -+{ -+ if (*last > addr) -+ __clear_bit((*last - addr) / PAGE_SIZE, bitmap); -+ -+ clear_spte_young(mm, addr, bitmap, last); -+} -+ - static bool suitable_to_scan(int total, int young) - { - int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); -@@ -3942,6 +3993,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, - pte_t *pte; - spinlock_t *ptl; - unsigned long addr; -+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; -+ unsigned long last = 0; - int total = 0; - int young = 0; - struct lru_gen_mm_walk *walk = args->private; -@@ -3960,6 +4013,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, - pte = pte_offset_map(pmd, start & PMD_MASK); - restart: - for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { -+ bool success; - unsigned long pfn; - struct folio *folio; - -@@ -3967,20 +4021,27 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, - walk->mm_stats[MM_LEAF_TOTAL]++; - - pfn = get_pte_pfn(pte[i], args->vma, addr); -- if (pfn == -1) -+ if (pfn == -1) { -+ skip_spte_young(args->vma->vm_mm, addr, bitmap, &last); - continue; -+ } - -- if (!pte_young(pte[i])) { -+ success = test_spte_young(args->vma->vm_mm, addr, end, bitmap, &last); -+ if (!success && !pte_young(pte[i])) { -+ skip_spte_young(args->vma->vm_mm, addr, bitmap, &last); - walk->mm_stats[MM_LEAF_OLD]++; - continue; - } - - folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); -- if (!folio) -+ if (!folio) { -+ skip_spte_young(args->vma->vm_mm, addr, bitmap, &last); - continue; -+ } - -- if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) -- VM_WARN_ON_ONCE(true); -+ clear_spte_young(args->vma->vm_mm, addr, bitmap, &last); -+ if (pte_young(pte[i])) -+ ptep_test_and_clear_young(args->vma, addr, pte + i); - - young++; - walk->mm_stats[MM_LEAF_YOUNG]++; -@@ -4589,6 +4650,24 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - * 
rmap/PT walk feedback - ******************************************************************************/ - -+static bool should_look_around(struct vm_area_struct *vma, unsigned long addr, -+ pte_t *pte, int *young) -+{ -+ unsigned long old = true; -+ -+ *young = mmu_notifier_test_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE, true, &old); -+ if (!old) -+ *young = true; -+ -+ if (pte_young(*pte)) { -+ ptep_test_and_clear_young(vma, addr, pte); -+ *young = true; -+ return true; -+ } -+ -+ return !old && get_cap(LRU_GEN_SPTE_WALK); ++ mas_node_count_gfp(mas, MAPLE_ALLOC_SLOTS + 1, GFP_KERNEL); ++ mas_node_count_gfp(mas, MAPLE_ALLOC_SLOTS + 3, GFP_KERNEL); ++ MT_BUG_ON(mas->tree, get_alloc_node_count(mas) != mas->alloc->total); ++ mas_destroy(mas); +} + /* - * This function exploits spatial locality when shrink_folio_list() walks the - * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If -@@ -4596,12 +4675,14 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - * the PTE table to the Bloom filter. This forms a feedback loop between the - * eviction and the aging. - */ --void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) -+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - { - int i; - unsigned long start; - unsigned long end; - struct lru_gen_mm_walk *walk; -+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; -+ unsigned long last = 0; - int young = 0; - pte_t *pte = pvmw->pte; - unsigned long addr = pvmw->address; -@@ -4615,8 +4696,11 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - lockdep_assert_held(pvmw->ptl); - VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); + * check_new_node() - Check the creation of new nodes and error path + * verification. +@@ -69,6 +91,8 @@ static noinline void check_new_node(struct maple_tree *mt) -+ if (!should_look_around(pvmw->vma, addr, pte, &young)) -+ return young; + MA_STATE(mas, mt, 0, 0); + ++ check_mas_alloc_node_count(&mas); + - if (spin_is_contended(pvmw->ptl)) -- return; -+ return young; + /* Try allocating 3 nodes */ + mtree_lock(mt); + mt_set_non_kernel(0); +-- +2.40.0 + +From c9249daec15495e2d4e2a0519e75421784e31ddc Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sun, 9 Apr 2023 21:25:25 +0200 +Subject: [PATCH 08/10] Per-VMA locks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Previous versions: +v3: https://lore.kernel.org/all/20230216051750.3125598-1-surenb@google.com/ +v2: https://lore.kernel.org/lkml/20230127194110.533103-1-surenb@google.com/ +v1: https://lore.kernel.org/all/20230109205336.3665937-1-surenb@google.com/ +RFC: https://lore.kernel.org/all/20220901173516.702122-1-surenb@google.com/ + +LWN article describing the feature: +https://lwn.net/Articles/906852/ + +Per-vma locks idea that was discussed during SPF [1] discussion at LSF/MM +last year [2], which concluded with suggestion that “a reader/writer +semaphore could be put into the VMA itself; that would have the effect of +using the VMA as a sort of range lock. There would still be contention at +the VMA level, but it would be an improvement.” This patchset implements +this suggested approach. + +When handling page faults we lookup the VMA that contains the faulting +page under RCU protection and try to acquire its lock. If that fails we +fall back to using mmap_lock, similar to how SPF handled this situation. + +One notable way the implementation deviates from the proposal is the way +VMAs are read-locked. 
During some of mm updates, multiple VMAs need to be +locked until the end of the update (e.g. vma_merge, split_vma, etc). +Tracking all the locked VMAs, avoiding recursive locks, figuring out when +it's safe to unlock previously locked VMAs would make the code more +complex. So, instead of the usual lock/unlock pattern, the proposed +solution marks a VMA as locked and provides an efficient way to: +1. Identify locked VMAs. +2. Unlock all locked VMAs in bulk. +We also postpone unlocking the locked VMAs until the end of the update, +when we do mmap_write_unlock. Potentially this keeps a VMA locked for +longer than is absolutely necessary but it results in a big reduction of +code complexity. +Read-locking a VMA is done using two sequence numbers - one in the +vm_area_struct and one in the mm_struct. VMA is considered read-locked +when these sequence numbers are equal. To read-lock a VMA we set the +sequence number in vm_area_struct to be equal to the sequence number in +mm_struct. To unlock all VMAs we increment mm_struct's seq number. This +allows for an efficient way to track locked VMAs and to drop the locks on +all VMAs at the end of the update. + +The patchset implements per-VMA locking only for anonymous pages which +are not in swap and avoids userfaultfs as their implementation is more +complex. Additional support for file-back page faults, swapped and user +pages can be added incrementally. + +Performance benchmarks show similar although slightly smaller benefits as +with SPF patchset (~75% of SPF benefits). Still, with lower complexity +this approach might be more desirable. + +Since RFC was posted in September 2022, two separate Google teams outside +of Android evaluated the patchset and confirmed positive results. Here are +the known usecases when per-VMA locks show benefits: + +Android: +Apps with high number of threads (~100) launch times improve by up to 20%. +Each thread mmaps several areas upon startup (Stack and Thread-local +storage (TLS), thread signal stack, indirect ref table), which requires +taking mmap_lock in write mode. Page faults take mmap_lock in read mode. +During app launch, both thread creation and page faults establishing the +active workinget are happening in parallel and that causes lock contention +between mm writers and readers even if updates and page faults are +happening in different VMAs. Per-vma locks prevent this contention by +providing more granular lock. + +Google Fibers: +We have several dynamically sized thread pools that spawn new threads +under increased load and reduce their number when idling. For example, +Google's in-process scheduling/threading framework, UMCG/Fibers, is backed +by such a thread pool. When idling, only a small number of idle worker +threads are available; when a spike of incoming requests arrive, each +request is handled in its own "fiber", which is a work item posted onto a +UMCG worker thread; quite often these spikes lead to a number of new +threads spawning. Each new thread needs to allocate and register an RSEQ +section on its TLS, then register itself with the kernel as a UMCG worker +thread, and only after that it can be considered by the in-process +UMCG/Fiber scheduler as available to do useful work. In short, during an +incoming workload spike new threads have to be spawned, and they perform +several syscalls (RSEQ registration, UMCG worker registration, memory +allocations) before they can actually start doing useful work. 
Removing +any bottlenecks on this thread startup path will greatly improve our +services' latencies when faced with request/workload spikes. +At high scale, mmap_lock contention during thread creation and stack page +faults leads to user-visible multi-second serving latencies in a similar +pattern to Android app startup. Per-VMA locking patchset has been run +successfully in limited experiments with user-facing production workloads. +In these experiments, we observed that the peak thread creation rate was +high enough that thread creation is no longer a bottleneck. + +TCP zerocopy receive: +From the point of view of TCP zerocopy receive, the per-vma lock patch is +massively beneficial. +In today's implementation, a process with N threads where N - 1 are +performing zerocopy receive and 1 thread is performing madvise() with the +write lock taken (e.g. needs to change vm_flags) will result in all N -1 +receive threads blocking until the madvise is done. Conversely, on a busy +process receiving a lot of data, an madvise operation that does need to +take the mmap lock in write mode will need to wait for all of the receives +to be done - a lose:lose proposition. Per-VMA locking _removes_ by +definition this source of contention entirely. +There are other benefits for receive as well, chiefly a reduction in +cacheline bouncing across receiving threads for locking/unlocking the +single mmap lock. On an RPC style synthetic workload with 4KB RPCs: +1a) The find+lock+unlock VMA path in the base case, without the per-vma +lock patchset, is about 0.7% of cycles as measured by perf. +1b) mmap_read_lock + mmap_read_unlock in the base case is about 0.5% +cycles overall - most of this is within the TCP read hotpath (a small +fraction is 'other' usage in the system). +2a) The find+lock+unlock VMA path, with the per-vma patchset and a trivial +patch written to take advantage of it in TCP, is about 0.4% of cycles +(down from 0.7% above) +2b) mmap_read_lock + mmap_read_unlock in the per-vma patchset is < 0.1% +cycles and is out of the TCP read hotpath entirely (down from 0.5% before, +the remaining usage is the 'other' usage in the system). +So, in addition to entirely removing an onerous source of contention, it +also reduces the CPU cycles of TCP receive zerocopy by about 0.5%+ +(compared to overall cycles in perf) for the 'small' RPC scenario. + +The patchset structure is: +0001-0008: Enable maple-tree RCU mode +0009-0031: Main per-vma locks patchset +0032-0033: Performance optimizations + +Changes since v3: +- Changed patch [3] to move vma_prepare before vma_adjust_trans_huge +- Dropped patch [4] from the set as unnecessary, per Hyeonggon Yoo +- Changed patch [5] to do VMA locking inside vma_prepare, per Liam Howlett +- Dropped patch [6] from the set as unnecessary, per Liam Howlett + +[1] https://lore.kernel.org/all/20220128131006.67712-1-michel@lespinasse.org/ +[2] https://lwn.net/Articles/893906/ +[3] https://lore.kernel.org/all/20230216051750.3125598-15-surenb@google.com/ +[4] https://lore.kernel.org/all/20230216051750.3125598-17-surenb@google.com/ +[5] https://lore.kernel.org/all/20230216051750.3125598-18-surenb@google.com/ +[6] https://lore.kernel.org/all/20230216051750.3125598-22-surenb@google.com/ + +The patchset applies cleanly over mm-unstable branch. 
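
For illustration only (this sketch is not part of the patch): the sequence-number scheme described above can be modelled in a few lines of user-space C. A VMA counts as write-locked while its vm_lock_seq equals mm_lock_seq, a page-fault reader takes the per-VMA lock only when the two differ, and dropping the mmap write lock releases every marked VMA at once by bumping mm_lock_seq. The names mirror the helpers added later in this patch (vma_start_read(), vma_start_write(), vma_end_write_all()), but pthread rwlocks stand in for the kernel's rw_semaphore and all memory ordering is omitted.

/*
 * Simplified user-space model of the per-VMA locking scheme, for
 * illustration only; not the kernel implementation.
 */
#include <pthread.h>
#include <stdbool.h>

struct mm {
	pthread_rwlock_t mmap_lock;	/* stand-in for mmap_lock */
	int mm_lock_seq;		/* bumped when all VMA locks are dropped */
};

struct vma {
	struct mm *mm;
	pthread_rwlock_t lock;		/* stand-in for vma->vm_lock->lock */
	int vm_lock_seq;		/* equals mm_lock_seq while write-locked */
};

/* Page-fault style reader: false means "fall back to mmap_lock". */
static bool vma_start_read(struct vma *vma)
{
	if (vma->vm_lock_seq == vma->mm->mm_lock_seq)
		return false;			/* marked by a writer */
	if (pthread_rwlock_tryrdlock(&vma->lock) != 0)
		return false;
	if (vma->vm_lock_seq == vma->mm->mm_lock_seq) {
		pthread_rwlock_unlock(&vma->lock);	/* lost a race with a writer */
		return false;
	}
	return true;
}

static void vma_end_read(struct vma *vma)
{
	pthread_rwlock_unlock(&vma->lock);
}

/* Writer path; the caller is assumed to hold mmap_lock for writing. */
static void vma_start_write(struct vma *vma)
{
	if (vma->vm_lock_seq == vma->mm->mm_lock_seq)
		return;				/* already marked in this cycle */
	pthread_rwlock_wrlock(&vma->lock);	/* waits for current readers */
	vma->vm_lock_seq = vma->mm->mm_lock_seq;
	pthread_rwlock_unlock(&vma->lock);
}

/* Called when the mmap write lock is dropped: releases every marked VMA in bulk. */
static void vma_end_write_all(struct mm *mm)
{
	mm->mm_lock_seq++;
}

In the kernel helpers further down in this patch, vma_end_read() additionally wraps the unlock in an RCU read-side section to keep the VMA alive, and vma_end_write_all() is driven from mmap_write_unlock() and mmap_write_downgrade().
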
+ +Laurent Dufour (1): + powerc/mm: try VMA lock-based page fault handling first + +Liam Howlett (4): + maple_tree: Be more cautious about dead nodes + maple_tree: Detect dead nodes in mas_start() + maple_tree: Fix freeing of nodes in rcu mode + maple_tree: remove extra smp_wmb() from mas_dead_leaves() + +Liam R. Howlett (4): + maple_tree: Fix write memory barrier of nodes once dead for RCU mode + maple_tree: Add smp_rmb() to dead node detection + maple_tree: Add RCU lock checking to rcu callback functions + mm: Enable maple tree RCU mode by default. + +Michel Lespinasse (1): + mm: rcu safe VMA freeing + +Suren Baghdasaryan (23): + mm: introduce CONFIG_PER_VMA_LOCK + mm: move mmap_lock assert function definitions + mm: add per-VMA lock and helper functions to control it + mm: mark VMA as being written when changing vm_flags + mm/mmap: move vma_prepare before vma_adjust_trans_huge + mm/khugepaged: write-lock VMA while collapsing a huge page + mm/mmap: write-lock VMAs in vma_prepare before modifying them + mm/mremap: write-lock VMA while remapping it to a new address range + mm: write-lock VMAs before removing them from VMA tree + mm: conditionally write-lock VMA in free_pgtables + kernel/fork: assert no VMA readers during its destruction + mm/mmap: prevent pagefault handler from racing with mmu_notifier + registration + mm: introduce vma detached flag + mm: introduce lock_vma_under_rcu to be used from arch-specific code + mm: fall back to mmap_lock if vma->anon_vma is not yet set + mm: add FAULT_FLAG_VMA_LOCK flag + mm: prevent do_swap_page from handling page faults under VMA lock + mm: prevent userfaults to be handled under per-vma lock + mm: introduce per-VMA lock statistics + x86/mm: try VMA lock-based page fault handling first + arm64/mm: try VMA lock-based page fault handling first + mm/mmap: free vm_area_struct without call_rcu in exit_mmap + mm: separate vma->lock from vm_area_struct + +Signed-off-by: Peter Jung +--- + Documentation/admin-guide/mm/userfaultfd.rst | 17 +++ + arch/arm64/Kconfig | 1 + + arch/arm64/mm/fault.c | 36 +++++ + arch/powerpc/mm/fault.c | 37 +++++ + arch/powerpc/platforms/powernv/Kconfig | 1 + + arch/powerpc/platforms/pseries/Kconfig | 1 + + arch/s390/Kconfig | 1 + + arch/s390/mm/fault.c | 24 +++ + arch/x86/Kconfig | 1 + + arch/x86/mm/fault.c | 36 +++++ + fs/userfaultfd.c | 16 ++ + include/linux/mm.h | 127 +++++++++++++++- + include/linux/mm_inline.h | 6 + + include/linux/mm_types.h | 30 +++- + include/linux/mmap_lock.h | 37 +++-- + include/linux/userfaultfd_k.h | 23 +++ + include/linux/vm_event_item.h | 6 + + include/linux/vmstat.h | 6 + + include/uapi/linux/userfaultfd.h | 10 +- + kernel/fork.c | 96 ++++++++++-- + mm/Kconfig | 12 ++ + mm/Kconfig.debug | 6 + + mm/hugetlb.c | 4 + + mm/init-mm.c | 3 + + mm/internal.h | 2 +- + mm/khugepaged.c | 10 +- + mm/memory.c | 146 ++++++++++++++++--- + mm/mmap.c | 48 ++++-- + mm/mprotect.c | 51 +++++-- + mm/mremap.c | 1 + + mm/rmap.c | 31 ++-- + mm/vmstat.c | 6 + + tools/testing/selftests/mm/userfaultfd.c | 45 +++++- + 33 files changed, 783 insertions(+), 94 deletions(-) + +diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst +index 7dc823b56ca4..bd2226299583 100644 +--- a/Documentation/admin-guide/mm/userfaultfd.rst ++++ b/Documentation/admin-guide/mm/userfaultfd.rst +@@ -219,6 +219,23 @@ former will have ``UFFD_PAGEFAULT_FLAG_WP`` set, the latter + you still need to supply a page when ``UFFDIO_REGISTER_MODE_MISSING`` was + used. 
- /* avoid taking the LRU lock under the PTL when possible */ - walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; -@@ -4624,6 +4708,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - start = max(addr & PMD_MASK, pvmw->vma->vm_start); - end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; - -+ if (end - start == PAGE_SIZE) -+ return young; ++Userfaultfd write-protect mode currently behave differently on none ptes ++(when e.g. page is missing) over different types of memories. + - if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { - if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) - end = start + MIN_LRU_BATCH * PAGE_SIZE; -@@ -4637,28 +4724,37 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - - /* folio_update_gen() requires stable folio_memcg() */ - if (!mem_cgroup_trylock_pages(memcg)) -- return; -+ return young; - - arch_enter_lazy_mmu_mode(); - - pte -= (addr - start) / PAGE_SIZE; - - for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { -+ bool success; - unsigned long pfn; - - pfn = get_pte_pfn(pte[i], pvmw->vma, addr); -- if (pfn == -1) -+ if (pfn == -1) { -+ skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); - continue; -+ } - -- if (!pte_young(pte[i])) -+ success = test_spte_young(pvmw->vma->vm_mm, addr, end, bitmap, &last); -+ if (!success && !pte_young(pte[i])) { -+ skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); - continue; -+ } - - folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); -- if (!folio) -+ if (!folio) { -+ skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); - continue; -+ } - -- if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) -- VM_WARN_ON_ONCE(true); -+ clear_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); -+ if (pte_young(pte[i])) -+ ptep_test_and_clear_young(pvmw->vma, addr, pte + i); - - young++; - -@@ -4688,6 +4784,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - /* feedback from rmap walkers to page table walkers */ - if (suitable_to_scan(i, young)) - update_bloom_filter(lruvec, max_seq, pvmw->pmd); ++For anonymous memory, ``ioctl(UFFDIO_WRITEPROTECT)`` will ignore none ptes ++(e.g. when pages are missing and not populated). For file-backed memories ++like shmem and hugetlbfs, none ptes will be write protected just like a ++present pte. In other words, there will be a userfaultfd write fault ++message generated when writing to a missing page on file typed memories, ++as long as the page range was write-protected before. Such a message will ++not be generated on anonymous memories by default. + -+ return young; - } ++If the application wants to be able to write protect none ptes on anonymous ++memory, one can pre-populate the memory with e.g. MADV_POPULATE_READ. On ++newer kernels, one can also detect the feature UFFD_FEATURE_WP_UNPOPULATED ++and set the feature bit in advance to make sure none ptes will also be ++write protected even upon anonymous memory. 
++ + QEMU/KVM + ======== - /****************************************************************************** -@@ -5705,6 +5803,9 @@ static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, c - if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) - caps |= BIT(LRU_GEN_NONLEAF_YOUNG); +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 1023e896d46b..6f104c829731 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -95,6 +95,7 @@ config ARM64 + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 + select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_PAGE_TABLE_CHECK ++ select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_BPF_JIT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c +index f4cb0f85ccf4..9e0db5c387e3 100644 +--- a/arch/arm64/mm/fault.c ++++ b/arch/arm64/mm/fault.c +@@ -535,6 +535,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, + unsigned long vm_flags; + unsigned int mm_flags = FAULT_FLAG_DEFAULT; + unsigned long addr = untagged_addr(far); ++#ifdef CONFIG_PER_VMA_LOCK ++ struct vm_area_struct *vma; ++#endif -+ if (kvm_arch_has_test_clear_young() && get_cap(LRU_GEN_SPTE_WALK)) -+ caps |= BIT(LRU_GEN_SPTE_WALK); -+ - return sysfs_emit(buf, "0x%04x\n", caps); - } + if (kprobe_page_fault(regs, esr)) + return 0; +@@ -585,6 +588,36 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, -diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c -index 07aae60288f9..a115a27b375e 100644 ---- a/virt/kvm/kvm_main.c -+++ b/virt/kvm/kvm_main.c -@@ -875,6 +875,63 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, - return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn); - } + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); -+static bool kvm_test_clear_young(struct kvm *kvm, unsigned long start, -+ unsigned long end, unsigned long *bitmap) -+{ -+ int i; -+ int key; -+ bool success = true; ++#ifdef CONFIG_PER_VMA_LOCK ++ if (!(mm_flags & FAULT_FLAG_USER)) ++ goto lock_mmap; + -+ trace_kvm_age_hva(start, end); ++ vma = lock_vma_under_rcu(mm, addr); ++ if (!vma) ++ goto lock_mmap; + -+ key = srcu_read_lock(&kvm->srcu); ++ if (!(vma->vm_flags & vm_flags)) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } ++ fault = handle_mm_fault(vma, addr & PAGE_MASK, ++ mm_flags | FAULT_FLAG_VMA_LOCK, regs); ++ vma_end_read(vma); + -+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { -+ struct interval_tree_node *node; -+ struct kvm_memslots *slots = __kvm_memslots(kvm, i); ++ if (!(fault & VM_FAULT_RETRY)) { ++ count_vm_vma_lock_event(VMA_LOCK_SUCCESS); ++ goto done; ++ } ++ count_vm_vma_lock_event(VMA_LOCK_RETRY); + -+ kvm_for_each_memslot_in_hva_range(node, slots, start, end - 1) { -+ gfn_t lsb_gfn; -+ unsigned long hva_start, hva_end; -+ struct kvm_gfn_range range = { -+ .slot = container_of(node, struct kvm_memory_slot, -+ hva_node[slots->node_idx]), -+ }; ++ /* Quick path to respond to signals */ ++ if (fault_signal_pending(fault, regs)) { ++ if (!user_mode(regs)) ++ goto no_context; ++ return 0; ++ } ++lock_mmap: ++#endif /* CONFIG_PER_VMA_LOCK */ + /* + * As per x86, we may deadlock here. 
However, since the kernel only + * validly references user space from well defined areas of the code, +@@ -628,6 +661,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, + } + mmap_read_unlock(mm); + ++#ifdef CONFIG_PER_VMA_LOCK ++done: ++#endif + /* + * Handle the "normal" (no error) case first. + */ +diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c +index af46aa88422b..531177a4ee08 100644 +--- a/arch/powerpc/mm/fault.c ++++ b/arch/powerpc/mm/fault.c +@@ -474,6 +474,40 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, + if (is_exec) + flags |= FAULT_FLAG_INSTRUCTION; + ++#ifdef CONFIG_PER_VMA_LOCK ++ if (!(flags & FAULT_FLAG_USER)) ++ goto lock_mmap; + -+ hva_start = max(start, range.slot->userspace_addr); -+ hva_end = min(end - 1, range.slot->userspace_addr + -+ range.slot->npages * PAGE_SIZE - 1); ++ vma = lock_vma_under_rcu(mm, address); ++ if (!vma) ++ goto lock_mmap; + -+ range.start = hva_to_gfn_memslot(hva_start, range.slot); -+ range.end = hva_to_gfn_memslot(hva_end, range.slot) + 1; -+ -+ if (WARN_ON_ONCE(range.end <= range.start)) -+ continue; -+ -+ /* see the comments on the generic kvm_arch_has_test_clear_young() */ -+ lsb_gfn = hva_to_gfn_memslot(end - 1, range.slot); -+ -+ success = kvm_arch_test_clear_young(kvm, &range, lsb_gfn, bitmap); -+ if (!success) -+ break; -+ } ++ if (unlikely(access_pkey_error(is_write, is_exec, ++ (error_code & DSISR_KEYFAULT), vma))) { ++ vma_end_read(vma); ++ goto lock_mmap; + } + -+ srcu_read_unlock(&kvm->srcu, key); ++ if (unlikely(access_error(is_write, is_exec, vma))) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } + -+ return success; ++ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); ++ vma_end_read(vma); ++ ++ if (!(fault & VM_FAULT_RETRY)) { ++ count_vm_vma_lock_event(VMA_LOCK_SUCCESS); ++ goto done; ++ } ++ count_vm_vma_lock_event(VMA_LOCK_RETRY); ++ ++ if (fault_signal_pending(fault, regs)) ++ return user_mode(regs) ? 0 : SIGBUS; ++ ++lock_mmap: ++#endif /* CONFIG_PER_VMA_LOCK */ ++ + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. 
Unfortunately, in the case of an +@@ -550,6 +584,9 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, + + mmap_read_unlock(current->mm); + ++#ifdef CONFIG_PER_VMA_LOCK ++done: ++#endif + if (unlikely(fault & VM_FAULT_ERROR)) + return mm_fault_error(regs, address, fault); + +diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig +index ae248a161b43..70a46acc70d6 100644 +--- a/arch/powerpc/platforms/powernv/Kconfig ++++ b/arch/powerpc/platforms/powernv/Kconfig +@@ -16,6 +16,7 @@ config PPC_POWERNV + select PPC_DOORBELL + select MMU_NOTIFIER + select FORCE_SMP ++ select ARCH_SUPPORTS_PER_VMA_LOCK + default y + + config OPAL_PRD +diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig +index 21b22bf16ce6..4ebf2ef2845d 100644 +--- a/arch/powerpc/platforms/pseries/Kconfig ++++ b/arch/powerpc/platforms/pseries/Kconfig +@@ -22,6 +22,7 @@ config PPC_PSERIES + select HOTPLUG_CPU + select FORCE_SMP + select SWIOTLB ++ select ARCH_SUPPORTS_PER_VMA_LOCK + default y + + config PARAVIRT +diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig +index 9809c74e1240..548b5b587003 100644 +--- a/arch/s390/Kconfig ++++ b/arch/s390/Kconfig +@@ -120,6 +120,7 @@ config S390 + select ARCH_SUPPORTS_DEBUG_PAGEALLOC + select ARCH_SUPPORTS_HUGETLBFS + select ARCH_SUPPORTS_NUMA_BALANCING ++ select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_WANTS_DYNAMIC_TASK_STRUCT +diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c +index a2632fd97d00..b65144c392b0 100644 +--- a/arch/s390/mm/fault.c ++++ b/arch/s390/mm/fault.c +@@ -407,6 +407,30 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) + access = VM_WRITE; + if (access == VM_WRITE) + flags |= FAULT_FLAG_WRITE; ++#ifdef CONFIG_PER_VMA_LOCK ++ if (!(flags & FAULT_FLAG_USER)) ++ goto lock_mmap; ++ vma = lock_vma_under_rcu(mm, address); ++ if (!vma) ++ goto lock_mmap; ++ if (!(vma->vm_flags & access)) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } ++ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); ++ vma_end_read(vma); ++ if (!(fault & VM_FAULT_RETRY)) { ++ count_vm_vma_lock_event(VMA_LOCK_SUCCESS); ++ goto out; ++ } ++ count_vm_vma_lock_event(VMA_LOCK_RETRY); ++ /* Quick path to respond to signals */ ++ if (fault_signal_pending(fault, regs)) { ++ fault = VM_FAULT_SIGNAL; ++ goto out; ++ } ++lock_mmap: ++#endif /* CONFIG_PER_VMA_LOCK */ + mmap_read_lock(mm); + + gmap = NULL; +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index a825bf031f49..df21fba77db1 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -27,6 +27,7 @@ config X86_64 + # Options that are inherently 64-bit kernel only: + select ARCH_HAS_GIGANTIC_PAGE + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 ++ select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_USE_CMPXCHG_LOCKREF + select HAVE_ARCH_SOFT_DIRTY + select MODULES_USE_ELF_RELA +diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c +index a498ae1fbe66..e4399983c50c 100644 +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -19,6 +19,7 @@ + #include /* faulthandler_disabled() */ + #include /* efi_crash_gracefully_on_page_fault()*/ + #include ++#include /* find_and_lock_vma() */ + + #include /* boot_cpu_has, ... */ + #include /* dotraplinkage, ... 
*/ +@@ -1333,6 +1334,38 @@ void do_user_addr_fault(struct pt_regs *regs, + } + #endif + ++#ifdef CONFIG_PER_VMA_LOCK ++ if (!(flags & FAULT_FLAG_USER)) ++ goto lock_mmap; ++ ++ vma = lock_vma_under_rcu(mm, address); ++ if (!vma) ++ goto lock_mmap; ++ ++ if (unlikely(access_error(error_code, vma))) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } ++ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); ++ vma_end_read(vma); ++ ++ if (!(fault & VM_FAULT_RETRY)) { ++ count_vm_vma_lock_event(VMA_LOCK_SUCCESS); ++ goto done; ++ } ++ count_vm_vma_lock_event(VMA_LOCK_RETRY); ++ ++ /* Quick path to respond to signals */ ++ if (fault_signal_pending(fault, regs)) { ++ if (!user_mode(regs)) ++ kernelmode_fixup_or_oops(regs, error_code, address, ++ SIGBUS, BUS_ADRERR, ++ ARCH_DEFAULT_PKEY); ++ return; ++ } ++lock_mmap: ++#endif /* CONFIG_PER_VMA_LOCK */ ++ + /* + * Kernel-mode access to the user address space should only occur + * on well-defined single instructions listed in the exception +@@ -1433,6 +1466,9 @@ void do_user_addr_fault(struct pt_regs *regs, + } + + mmap_read_unlock(mm); ++#ifdef CONFIG_PER_VMA_LOCK ++done: ++#endif + if (likely(!(fault & VM_FAULT_ERROR))) + return; + +diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c +index 44d1ee429eb0..881e9c82b9d1 100644 +--- a/fs/userfaultfd.c ++++ b/fs/userfaultfd.c +@@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) + return ctx->features & UFFD_FEATURE_INITIALIZED; + } + ++/* ++ * Whether WP_UNPOPULATED is enabled on the uffd context. It is only ++ * meaningful when userfaultfd_wp()==true on the vma and when it's ++ * anonymous. ++ */ ++bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) ++{ ++ struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; ++ ++ if (!ctx) ++ return false; ++ ++ return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; +} + -+static bool kvm_mmu_notifier_test_clear_young(struct mmu_notifier *mn, struct mm_struct *mm, -+ unsigned long start, unsigned long end, -+ unsigned long *bitmap) + static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, + vm_flags_t flags) + { +@@ -1971,6 +1986,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, + #endif + #ifndef CONFIG_PTE_MARKER_UFFD_WP + uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; ++ uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; + #endif + uffdio_api.ioctls = UFFD_API_IOCTLS; + ret = -EFAULT; +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 1f79667824eb..c4c9de7d1916 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -256,6 +256,8 @@ void setup_initial_init_mm(void *start_code, void *end_code, + struct vm_area_struct *vm_area_alloc(struct mm_struct *); + struct vm_area_struct *vm_area_dup(struct vm_area_struct *); + void vm_area_free(struct vm_area_struct *); ++/* Use only if VMA has no other users */ ++void __vm_area_free(struct vm_area_struct *vma); + + #ifndef CONFIG_MMU + extern struct rb_root nommu_region_tree; +@@ -478,7 +480,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags) + { FAULT_FLAG_USER, "USER" }, \ + { FAULT_FLAG_REMOTE, "REMOTE" }, \ + { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ +- { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" } ++ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ ++ { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" } + + /* + * vm_fault is filled by the pagefault handler and passed to the vma's +@@ -623,6 +626,117 @@ struct vm_operations_struct { + unsigned long addr); + }; + ++#ifdef CONFIG_PER_VMA_LOCK ++/* ++ * 
Try to read-lock a vma. The function is allowed to occasionally yield false ++ * locked result to avoid performance overhead, in which case we fall back to ++ * using mmap_lock. The function should never yield false unlocked result. ++ */ ++static inline bool vma_start_read(struct vm_area_struct *vma) +{ -+ if (kvm_arch_has_test_clear_young()) -+ return kvm_test_clear_young(mmu_notifier_to_kvm(mn), start, end, bitmap); ++ /* Check before locking. A race might cause false locked result. */ ++ if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) ++ return false; + ++ if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) ++ return false; ++ ++ /* ++ * Overflow might produce false locked result. ++ * False unlocked result is impossible because we modify and check ++ * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq ++ * modification invalidates all existing locks. ++ */ ++ if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) { ++ up_read(&vma->vm_lock->lock); ++ return false; ++ } ++ return true; ++} ++ ++static inline void vma_end_read(struct vm_area_struct *vma) ++{ ++ rcu_read_lock(); /* keeps vma alive till the end of up_read */ ++ up_read(&vma->vm_lock->lock); ++ rcu_read_unlock(); ++} ++ ++static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) ++{ ++ mmap_assert_write_locked(vma->vm_mm); ++ ++ /* ++ * current task is holding mmap_write_lock, both vma->vm_lock_seq and ++ * mm->mm_lock_seq can't be concurrently modified. ++ */ ++ *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq); ++ return (vma->vm_lock_seq == *mm_lock_seq); ++} ++ ++static inline void vma_start_write(struct vm_area_struct *vma) ++{ ++ int mm_lock_seq; ++ ++ if (__is_vma_write_locked(vma, &mm_lock_seq)) ++ return; ++ ++ down_write(&vma->vm_lock->lock); ++ vma->vm_lock_seq = mm_lock_seq; ++ up_write(&vma->vm_lock->lock); ++} ++ ++static inline bool vma_try_start_write(struct vm_area_struct *vma) ++{ ++ int mm_lock_seq; ++ ++ if (__is_vma_write_locked(vma, &mm_lock_seq)) ++ return true; ++ ++ if (!down_write_trylock(&vma->vm_lock->lock)) ++ return false; ++ ++ vma->vm_lock_seq = mm_lock_seq; ++ up_write(&vma->vm_lock->lock); ++ return true; ++} ++ ++static inline void vma_assert_write_locked(struct vm_area_struct *vma) ++{ ++ int mm_lock_seq; ++ ++ VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); ++} ++ ++static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) ++{ ++ /* When detaching vma should be write-locked */ ++ if (detached) ++ vma_assert_write_locked(vma); ++ vma->detached = detached; ++} ++ ++struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, ++ unsigned long address); ++ ++#else /* CONFIG_PER_VMA_LOCK */ ++ ++static inline void vma_init_lock(struct vm_area_struct *vma) {} ++static inline bool vma_start_read(struct vm_area_struct *vma) ++ { return false; } ++static inline void vma_end_read(struct vm_area_struct *vma) {} ++static inline void vma_start_write(struct vm_area_struct *vma) {} ++static inline bool vma_try_start_write(struct vm_area_struct *vma) ++ { return true; } ++static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} ++static inline void vma_mark_detached(struct vm_area_struct *vma, ++ bool detached) {} ++ ++#endif /* CONFIG_PER_VMA_LOCK */ ++ ++/* ++ * WARNING: vma_init does not initialize vma->vm_lock. ++ * Use vm_area_alloc()/vm_area_free() if vma needs locking. 
++ */ + static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) + { + static const struct vm_operations_struct dummy_vm_ops = {}; +@@ -631,6 +745,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) + vma->vm_mm = mm; + vma->vm_ops = &dummy_vm_ops; + INIT_LIST_HEAD(&vma->anon_vma_chain); ++ vma_mark_detached(vma, false); + } + + /* Use when VMA is not part of the VMA tree and needs no locking */ +@@ -644,28 +759,28 @@ static inline void vm_flags_init(struct vm_area_struct *vma, + static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + vm_flags_init(vma, flags); + } + + static inline void vm_flags_reset_once(struct vm_area_struct *vma, + vm_flags_t flags) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); + } + + static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + ACCESS_PRIVATE(vma, __vm_flags) |= flags; + } + + static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; + } + +@@ -686,7 +801,7 @@ static inline void __vm_flags_mod(struct vm_area_struct *vma, + static inline void vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + __vm_flags_mod(vma, set, clear); + } + +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index de1e622dd366..0e1d239a882c 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -557,6 +557,12 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, + /* The current status of the pte should be "cleared" before calling */ + WARN_ON_ONCE(!pte_none(*pte)); + ++ /* ++ * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole ++ * thing, because when zapping either it means it's dropping the ++ * page, or in TTU where the present pte will be quickly replaced ++ * with a swap pte. There's no way of leaking the bit. ++ */ + if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) + return; + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 22b2ac82bffd..ef74ea892c5b 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -471,6 +471,10 @@ struct anon_vma_name { + char name[]; + }; + ++struct vma_lock { ++ struct rw_semaphore lock; ++}; ++ + /* + * This struct describes a virtual memory area. There is one of these + * per VM-area/task. A VM area is any part of the process virtual memory +@@ -480,9 +484,16 @@ struct anon_vma_name { + struct vm_area_struct { + /* The first cache line has the info for VMA tree walking. */ + +- unsigned long vm_start; /* Our start address within vm_mm. */ +- unsigned long vm_end; /* The first byte after our end address +- within vm_mm. */ ++ union { ++ struct { ++ /* VMA covers [vm_start; vm_end) addresses within mm */ ++ unsigned long vm_start; ++ unsigned long vm_end; ++ }; ++#ifdef CONFIG_PER_VMA_LOCK ++ struct rcu_head vm_rcu; /* Used for deferred freeing. */ ++#endif ++ }; + + struct mm_struct *vm_mm; /* The address space we belong to. 
*/ + +@@ -501,6 +512,14 @@ struct vm_area_struct { + vm_flags_t __private __vm_flags; + }; + ++#ifdef CONFIG_PER_VMA_LOCK ++ int vm_lock_seq; ++ struct vma_lock *vm_lock; ++ ++ /* Flag to indicate areas detached from the mm->mm_mt tree */ ++ bool detached; ++#endif ++ + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap interval tree. +@@ -637,6 +656,9 @@ struct mm_struct { + * init_mm.mmlist, and are protected + * by mmlist_lock + */ ++#ifdef CONFIG_PER_VMA_LOCK ++ int mm_lock_seq; ++#endif + + + unsigned long hiwater_rss; /* High-watermark of RSS usage */ +@@ -1042,6 +1064,7 @@ typedef struct { + * mapped after the fault. + * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. + * We should only access orig_pte if this flag set. ++ * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock. + * + * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify + * whether we would allow page faults to retry by specifying these two +@@ -1079,6 +1102,7 @@ enum fault_flag { + FAULT_FLAG_INTERRUPTIBLE = 1 << 9, + FAULT_FLAG_UNSHARE = 1 << 10, + FAULT_FLAG_ORIG_PTE_VALID = 1 << 11, ++ FAULT_FLAG_VMA_LOCK = 1 << 12, + }; + + typedef unsigned int __bitwise zap_flags_t; +diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h +index 96e113e23d04..aab8f1b28d26 100644 +--- a/include/linux/mmap_lock.h ++++ b/include/linux/mmap_lock.h +@@ -60,6 +60,29 @@ static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) + + #endif /* CONFIG_TRACING */ + ++static inline void mmap_assert_locked(struct mm_struct *mm) ++{ ++ lockdep_assert_held(&mm->mmap_lock); ++ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); ++} ++ ++static inline void mmap_assert_write_locked(struct mm_struct *mm) ++{ ++ lockdep_assert_held_write(&mm->mmap_lock); ++ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); ++} ++ ++#ifdef CONFIG_PER_VMA_LOCK ++static inline void vma_end_write_all(struct mm_struct *mm) ++{ ++ mmap_assert_write_locked(mm); ++ /* No races during update due to exclusive mmap_lock being held */ ++ WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1); ++} ++#else ++static inline void vma_end_write_all(struct mm_struct *mm) {} ++#endif ++ + static inline void mmap_init_lock(struct mm_struct *mm) + { + init_rwsem(&mm->mmap_lock); +@@ -102,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm) + static inline void mmap_write_unlock(struct mm_struct *mm) + { + __mmap_lock_trace_released(mm, true); ++ vma_end_write_all(mm); + up_write(&mm->mmap_lock); + } + + static inline void mmap_write_downgrade(struct mm_struct *mm) + { + __mmap_lock_trace_acquire_returned(mm, false, true); ++ vma_end_write_all(mm); + downgrade_write(&mm->mmap_lock); + } + +@@ -150,18 +175,6 @@ static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) + up_read_non_owner(&mm->mmap_lock); + } + +-static inline void mmap_assert_locked(struct mm_struct *mm) +-{ +- lockdep_assert_held(&mm->mmap_lock); +- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); +-} +- +-static inline void mmap_assert_write_locked(struct mm_struct *mm) +-{ +- lockdep_assert_held_write(&mm->mmap_lock); +- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); +-} +- + static inline int mmap_lock_is_contended(struct mm_struct *mm) + { + return rwsem_is_contended(&mm->mmap_lock); +diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h +index 3767f18114ef..0cf8880219da 100644 +--- a/include/linux/userfaultfd_k.h ++++ 
b/include/linux/userfaultfd_k.h +@@ -179,6 +179,7 @@ extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf); + extern void userfaultfd_unmap_complete(struct mm_struct *mm, + struct list_head *uf); ++extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); + + #else /* CONFIG_USERFAULTFD */ + +@@ -274,8 +275,30 @@ static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) + return false; + } + ++static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) ++{ + return false; +} + - static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address) -@@ -903,6 +960,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { - .clear_flush_young = kvm_mmu_notifier_clear_flush_young, - .clear_young = kvm_mmu_notifier_clear_young, - .test_young = kvm_mmu_notifier_test_young, -+ .test_clear_young = kvm_mmu_notifier_test_clear_young, - .change_pte = kvm_mmu_notifier_change_pte, - .release = kvm_mmu_notifier_release, + #endif /* CONFIG_USERFAULTFD */ + ++static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) ++{ ++ /* Only wr-protect mode uses pte markers */ ++ if (!userfaultfd_wp(vma)) ++ return false; ++ ++ /* File-based uffd-wp always need markers */ ++ if (!vma_is_anonymous(vma)) ++ return true; ++ ++ /* ++ * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED ++ * enabled (to apply markers on zero pages). ++ */ ++ return userfaultfd_wp_unpopulated(vma); ++} ++ + static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) + { + #ifdef CONFIG_PTE_MARKER_UFFD_WP +diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h +index 7f5d1caf5890..8abfa1240040 100644 +--- a/include/linux/vm_event_item.h ++++ b/include/linux/vm_event_item.h +@@ -149,6 +149,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, + #ifdef CONFIG_X86 + DIRECT_MAP_LEVEL2_SPLIT, + DIRECT_MAP_LEVEL3_SPLIT, ++#endif ++#ifdef CONFIG_PER_VMA_LOCK_STATS ++ VMA_LOCK_SUCCESS, ++ VMA_LOCK_ABORT, ++ VMA_LOCK_RETRY, ++ VMA_LOCK_MISS, + #endif + NR_VM_EVENT_ITEMS }; --- -2.40.0.rc2 - -From c63e61e48ac0d492af1918ba84350e07a5c95d17 Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Mon, 13 Feb 2023 09:26:09 +0100 -Subject: [PATCH 12/16] objtool - -Signed-off-by: Peter Jung ---- - tools/objtool/.gitignore | 1 + - tools/objtool/Build | 2 - - tools/objtool/Documentation/objtool.txt | 8 +++ - tools/objtool/Makefile | 66 +++++++++++++++++-------- - tools/objtool/builtin-check.c | 2 +- - tools/objtool/check.c | 7 +++ - tools/objtool/elf.c | 42 ++++++++-------- - tools/objtool/include/objtool/builtin.h | 2 - - tools/objtool/include/objtool/elf.h | 9 ++-- - tools/objtool/include/objtool/special.h | 2 +- - tools/objtool/special.c | 6 +-- - 11 files changed, 93 insertions(+), 54 deletions(-) - -diff --git a/tools/objtool/.gitignore b/tools/objtool/.gitignore -index 14236db3677f..4faa4dd72f35 100644 ---- a/tools/objtool/.gitignore -+++ b/tools/objtool/.gitignore -@@ -2,3 +2,4 @@ - arch/x86/lib/inat-tables.c - /objtool - fixdep -+libsubcmd/ -diff --git a/tools/objtool/Build b/tools/objtool/Build -index 33f2ee5a46d3..a3cdf8af6635 100644 ---- a/tools/objtool/Build -+++ b/tools/objtool/Build -@@ -16,8 +16,6 @@ objtool-y += libctype.o - objtool-y += str_error_r.o - objtool-y += librbtree.o +diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h +index 19cf5b6892ce..fed855bae6d8 100644 +--- a/include/linux/vmstat.h ++++ b/include/linux/vmstat.h 
+@@ -125,6 +125,12 @@ static inline void vm_events_fold_cpu(int cpu) + #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) + #endif --CFLAGS += -I$(srctree)/tools/lib -- - $(OUTPUT)libstring.o: ../lib/string.c FORCE - $(call rule_mkdir) - $(call if_changed_dep,cc_o_c) -diff --git a/tools/objtool/Documentation/objtool.txt b/tools/objtool/Documentation/objtool.txt -index 8a671902a187..8e53fc6735ef 100644 ---- a/tools/objtool/Documentation/objtool.txt -+++ b/tools/objtool/Documentation/objtool.txt -@@ -410,6 +410,14 @@ the objtool maintainers. - can remove this warning by putting the ANNOTATE_INTRA_FUNCTION_CALL - directive right before the call. - -+12. file.o: warning: func(): not an indirect call target ++#ifdef CONFIG_PER_VMA_LOCK_STATS ++#define count_vm_vma_lock_event(x) count_vm_event(x) ++#else ++#define count_vm_vma_lock_event(x) do {} while (0) ++#endif + -+ This means that objtool is running with --ibt and a function expected -+ to be an indirect call target is not. In particular, this happens for -+ init_module() or cleanup_module() if a module relies on these special -+ names and does not use module_init() / module_exit() macros to create -+ them. + #define __count_zid_vm_events(item, zid, delta) \ + __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) + +diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h +index 005e5e306266..90c958952bfc 100644 +--- a/include/uapi/linux/userfaultfd.h ++++ b/include/uapi/linux/userfaultfd.h +@@ -38,7 +38,8 @@ + UFFD_FEATURE_MINOR_HUGETLBFS | \ + UFFD_FEATURE_MINOR_SHMEM | \ + UFFD_FEATURE_EXACT_ADDRESS | \ +- UFFD_FEATURE_WP_HUGETLBFS_SHMEM) ++ UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ ++ UFFD_FEATURE_WP_UNPOPULATED) + #define UFFD_API_IOCTLS \ + ((__u64)1 << _UFFDIO_REGISTER | \ + (__u64)1 << _UFFDIO_UNREGISTER | \ +@@ -203,6 +204,12 @@ struct uffdio_api { + * + * UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd + * write-protection mode is supported on both shmem and hugetlbfs. ++ * ++ * UFFD_FEATURE_WP_UNPOPULATED indicates that userfaultfd ++ * write-protection mode will always apply to unpopulated pages ++ * (i.e. empty ptes). This will be the default behavior for shmem ++ * & hugetlbfs, so this flag only affects anonymous memory behavior ++ * when userfault write-protection mode is registered. + */ + #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) + #define UFFD_FEATURE_EVENT_FORK (1<<1) +@@ -217,6 +224,7 @@ struct uffdio_api { + #define UFFD_FEATURE_MINOR_SHMEM (1<<10) + #define UFFD_FEATURE_EXACT_ADDRESS (1<<11) + #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) ++#define UFFD_FEATURE_WP_UNPOPULATED (1<<13) + __u64 features; + + __u64 ioctls; +diff --git a/kernel/fork.c b/kernel/fork.c +index 49c173e367d2..346ce90d1f33 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -455,13 +455,49 @@ static struct kmem_cache *vm_area_cachep; + /* SLAB cache for mm_struct structures (tsk->mm) */ + static struct kmem_cache *mm_cachep; + ++#ifdef CONFIG_PER_VMA_LOCK + - - If the error doesn't seem to make sense, it could be a bug in objtool. - Feel free to ask the objtool maintainer for help. 
-diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile -index a3a9cc24e0e3..83b100c1e7f6 100644 ---- a/tools/objtool/Makefile -+++ b/tools/objtool/Makefile -@@ -2,19 +2,18 @@ - include ../scripts/Makefile.include - include ../scripts/Makefile.arch - --# always use the host compiler --AR = $(HOSTAR) --CC = $(HOSTCC) --LD = $(HOSTLD) -- - ifeq ($(srctree),) - srctree := $(patsubst %/,%,$(dir $(CURDIR))) - srctree := $(patsubst %/,%,$(dir $(srctree))) - endif - --SUBCMD_SRCDIR = $(srctree)/tools/lib/subcmd/ --LIBSUBCMD_OUTPUT = $(or $(OUTPUT),$(CURDIR)/) --LIBSUBCMD = $(LIBSUBCMD_OUTPUT)libsubcmd.a -+LIBSUBCMD_DIR = $(srctree)/tools/lib/subcmd/ -+ifneq ($(OUTPUT),) -+ LIBSUBCMD_OUTPUT = $(abspath $(OUTPUT))/libsubcmd -+else -+ LIBSUBCMD_OUTPUT = $(CURDIR)/libsubcmd -+endif -+LIBSUBCMD = $(LIBSUBCMD_OUTPUT)/libsubcmd.a - - OBJTOOL := $(OUTPUT)objtool - OBJTOOL_IN := $(OBJTOOL)-in.o -@@ -28,16 +27,29 @@ INCLUDES := -I$(srctree)/tools/include \ - -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \ - -I$(srctree)/tools/arch/$(SRCARCH)/include \ - -I$(srctree)/tools/objtool/include \ -- -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include -+ -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include \ -+ -I$(LIBSUBCMD_OUTPUT)/include -+# Note, EXTRA_WARNINGS here was determined for CC and not HOSTCC, it -+# is passed here to match a legacy behavior. - WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed -Wno-nested-externs --CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS) --LDFLAGS += $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS) -+OBJTOOL_CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS) -+OBJTOOL_LDFLAGS := $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS) - - # Allow old libelf to be used: --elfshdr := $(shell echo '$(pound)include ' | $(CC) $(CFLAGS) -x c -E - | grep elf_getshdr) --CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED) -+elfshdr := $(shell echo '$(pound)include ' | $(HOSTCC) $(OBJTOOL_CFLAGS) -x c -E - | grep elf_getshdr) -+OBJTOOL_CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED) ++/* SLAB cache for vm_area_struct.lock */ ++static struct kmem_cache *vma_lock_cachep; + -+# Always want host compilation. 
-+HOST_OVERRIDES := CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" - - AWK = awk -+MKDIR = mkdir ++static bool vma_lock_alloc(struct vm_area_struct *vma) ++{ ++ vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); ++ if (!vma->vm_lock) ++ return false; + -+ifeq ($(V),1) -+ Q = -+else -+ Q = @ -+endif - - BUILD_ORC := n - -@@ -49,21 +61,33 @@ export BUILD_ORC - export srctree OUTPUT CFLAGS SRCARCH AWK - include $(srctree)/tools/build/Makefile.include - --$(OBJTOOL_IN): fixdep FORCE -- @$(CONFIG_SHELL) ./sync-check.sh -- @$(MAKE) $(build)=objtool -+$(OBJTOOL_IN): fixdep $(LIBSUBCMD) FORCE -+ $(Q)$(CONFIG_SHELL) ./sync-check.sh -+ $(Q)$(MAKE) $(build)=objtool $(HOST_OVERRIDES) CFLAGS="$(OBJTOOL_CFLAGS)" \ -+ LDFLAGS="$(OBJTOOL_LDFLAGS)" ++ init_rwsem(&vma->vm_lock->lock); ++ vma->vm_lock_seq = -1; + - - $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN) -- $(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@ -+ $(QUIET_LINK)$(HOSTCC) $(OBJTOOL_IN) $(OBJTOOL_LDFLAGS) -o $@ ++ return true; ++} + ++static inline void vma_lock_free(struct vm_area_struct *vma) ++{ ++ kmem_cache_free(vma_lock_cachep, vma->vm_lock); ++} + -+$(LIBSUBCMD_OUTPUT): -+ $(Q)$(MKDIR) -p $@ ++#else /* CONFIG_PER_VMA_LOCK */ ++ ++static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } ++static inline void vma_lock_free(struct vm_area_struct *vma) {} ++ ++#endif /* CONFIG_PER_VMA_LOCK */ ++ + struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) + { + struct vm_area_struct *vma; -+$(LIBSUBCMD): fixdep $(LIBSUBCMD_OUTPUT) FORCE -+ $(Q)$(MAKE) -C $(LIBSUBCMD_DIR) O=$(LIBSUBCMD_OUTPUT) \ -+ DESTDIR=$(LIBSUBCMD_OUTPUT) prefix= subdir= \ -+ $(HOST_OVERRIDES) EXTRA_CFLAGS="$(OBJTOOL_CFLAGS)" \ -+ $@ install_headers - --$(LIBSUBCMD): fixdep FORCE -- $(Q)$(MAKE) -C $(SUBCMD_SRCDIR) OUTPUT=$(LIBSUBCMD_OUTPUT) -+$(LIBSUBCMD)-clean: -+ $(call QUIET_CLEAN, libsubcmd) -+ $(Q)$(RM) -r -- $(LIBSUBCMD_OUTPUT) - --clean: -+clean: $(LIBSUBCMD)-clean - $(call QUIET_CLEAN, objtool) $(RM) $(OBJTOOL) - $(Q)find $(OUTPUT) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete -- $(Q)$(RM) $(OUTPUT)arch/x86/lib/inat-tables.c $(OUTPUT)fixdep $(LIBSUBCMD) -+ $(Q)$(RM) $(OUTPUT)arch/x86/lib/inat-tables.c $(OUTPUT)fixdep - - FORCE: - -diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c -index a4f39407bf59..7c175198d09f 100644 ---- a/tools/objtool/builtin-check.c -+++ b/tools/objtool/builtin-check.c -@@ -65,7 +65,7 @@ static int parse_hacks(const struct option *opt, const char *str, int unset) - return found ? 
0 : -1; + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); +- if (vma) +- vma_init(vma, mm); ++ if (!vma) ++ return NULL; ++ ++ vma_init(vma, mm); ++ if (!vma_lock_alloc(vma)) { ++ kmem_cache_free(vm_area_cachep, vma); ++ return NULL; ++ } ++ + return vma; } --const struct option check_options[] = { -+static const struct option check_options[] = { - OPT_GROUP("Actions:"), - OPT_CALLBACK_OPTARG('h', "hacks", NULL, NULL, "jump_label,noinstr,skylake", "patch toolchain bugs/limitations", parse_hacks), - OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"), -diff --git a/tools/objtool/check.c b/tools/objtool/check.c -index ea1e7cdeb1b3..384b7df3fbb2 100644 ---- a/tools/objtool/check.c -+++ b/tools/objtool/check.c -@@ -856,8 +856,15 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file) - list_for_each_entry(insn, &file->endbr_list, call_node) { +@@ -469,26 +505,54 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) + { + struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - int *site = (int *)sec->data->d_buf + idx; -+ struct symbol *sym = insn->sym; - *site = 0; - -+ if (opts.module && sym && sym->type == STT_FUNC && -+ insn->offset == sym->offset && -+ (!strcmp(sym->name, "init_module") || -+ !strcmp(sym->name, "cleanup_module"))) -+ WARN("%s(): not an indirect call target", sym->name); +- if (new) { +- ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); +- ASSERT_EXCLUSIVE_WRITER(orig->vm_file); +- /* +- * orig->shared.rb may be modified concurrently, but the clone +- * will be reinitialized. +- */ +- data_race(memcpy(new, orig, sizeof(*new))); +- INIT_LIST_HEAD(&new->anon_vma_chain); +- dup_anon_vma_name(orig, new); ++ if (!new) ++ return NULL; + - if (elf_add_reloc_to_insn(file->elf, sec, - idx * sizeof(int), - R_X86_64_PC32, -diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c -index 64443a7f4bbf..6806ce01d933 100644 ---- a/tools/objtool/elf.c -+++ b/tools/objtool/elf.c -@@ -284,13 +284,13 @@ static int read_sections(struct elf *elf) - !elf_alloc_hash(section_name, sections_nr)) - return -1; ++ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); ++ ASSERT_EXCLUSIVE_WRITER(orig->vm_file); ++ /* ++ * orig->shared.rb may be modified concurrently, but the clone ++ * will be reinitialized. ++ */ ++ data_race(memcpy(new, orig, sizeof(*new))); ++ if (!vma_lock_alloc(new)) { ++ kmem_cache_free(vm_area_cachep, new); ++ return NULL; + } ++ INIT_LIST_HEAD(&new->anon_vma_chain); ++ dup_anon_vma_name(orig, new); ++ + return new; + } -+ elf->section_data = calloc(sections_nr, sizeof(*sec)); -+ if (!elf->section_data) { -+ perror("calloc"); -+ return -1; -+ } - for (i = 0; i < sections_nr; i++) { -- sec = malloc(sizeof(*sec)); -- if (!sec) { -- perror("malloc"); -- return -1; -- } -- memset(sec, 0, sizeof(*sec)); -+ sec = &elf->section_data[i]; +-void vm_area_free(struct vm_area_struct *vma) ++void __vm_area_free(struct vm_area_struct *vma) + { + free_anon_vma_name(vma); ++ vma_lock_free(vma); + kmem_cache_free(vm_area_cachep, vma); + } - INIT_LIST_HEAD(&sec->symbol_list); - INIT_LIST_HEAD(&sec->reloc_list); -@@ -422,13 +422,13 @@ static int read_symbols(struct elf *elf) - !elf_alloc_hash(symbol_name, symbols_nr)) - return -1; ++#ifdef CONFIG_PER_VMA_LOCK ++static void vm_area_free_rcu_cb(struct rcu_head *head) ++{ ++ struct vm_area_struct *vma = container_of(head, struct vm_area_struct, ++ vm_rcu); ++ ++ /* The vma should not be locked while being destroyed. 
*/ ++ VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); ++ __vm_area_free(vma); ++} ++#endif ++ ++void vm_area_free(struct vm_area_struct *vma) ++{ ++#ifdef CONFIG_PER_VMA_LOCK ++ call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); ++#else ++ __vm_area_free(vma); ++#endif ++} ++ + static void account_kernel_stack(struct task_struct *tsk, int account) + { + if (IS_ENABLED(CONFIG_VMAP_STACK)) { +@@ -1132,6 +1196,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + seqcount_init(&mm->write_protect_seq); + mmap_init_lock(mm); + INIT_LIST_HEAD(&mm->mmlist); ++#ifdef CONFIG_PER_VMA_LOCK ++ mm->mm_lock_seq = 0; ++#endif + mm_pgtables_bytes_init(mm); + mm->map_count = 0; + mm->locked_vm = 0; +@@ -3073,6 +3140,9 @@ void __init proc_caches_init(void) + NULL); -+ elf->symbol_data = calloc(symbols_nr, sizeof(*sym)); -+ if (!elf->symbol_data) { -+ perror("calloc"); -+ return -1; -+ } - for (i = 0; i < symbols_nr; i++) { -- sym = malloc(sizeof(*sym)); -- if (!sym) { -- perror("malloc"); -- return -1; -- } -- memset(sym, 0, sizeof(*sym)); -+ sym = &elf->symbol_data[i]; + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); ++#ifdef CONFIG_PER_VMA_LOCK ++ vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); ++#endif + mmap_init(); + nsproxy_cache_init(); + } +diff --git a/mm/Kconfig b/mm/Kconfig +index cf2e47030fe8..459af2123189 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -1202,6 +1202,18 @@ config LRU_GEN_STATS + This option has a per-memcg and per-node memory overhead. + # } - sym->idx = i; ++config ARCH_SUPPORTS_PER_VMA_LOCK ++ def_bool n ++ ++config PER_VMA_LOCK ++ def_bool y ++ depends on ARCH_SUPPORTS_PER_VMA_LOCK && MMU && SMP ++ help ++ Allow per-vma locking during page fault handling. ++ ++ This feature allows locking each virtual memory area separately when ++ handling page faults instead of taking mmap_lock. ++ + source "mm/damon/Kconfig" -@@ -918,13 +918,13 @@ static int read_relocs(struct elf *elf) - sec->base->reloc = sec; + endmenu +diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug +index c3547a373c9c..4965a7333a3f 100644 +--- a/mm/Kconfig.debug ++++ b/mm/Kconfig.debug +@@ -279,3 +279,9 @@ config DEBUG_KMEMLEAK_AUTO_SCAN - nr_reloc = 0; -+ sec->reloc_data = calloc(sec->sh.sh_size / sec->sh.sh_entsize, sizeof(*reloc)); -+ if (!sec->reloc_data) { -+ perror("calloc"); -+ return -1; -+ } - for (i = 0; i < sec->sh.sh_size / sec->sh.sh_entsize; i++) { -- reloc = malloc(sizeof(*reloc)); -- if (!reloc) { -- perror("malloc"); -- return -1; -- } -- memset(reloc, 0, sizeof(*reloc)); -+ reloc = &sec->reloc_data[i]; - switch (sec->sh.sh_type) { - case SHT_REL: - if (read_rel_reloc(sec, i, reloc, &symndx)) -@@ -1453,16 +1453,16 @@ void elf_close(struct elf *elf) - list_for_each_entry_safe(sym, tmpsym, &sec->symbol_list, list) { - list_del(&sym->list); - hash_del(&sym->hash); -- free(sym); - } - list_for_each_entry_safe(reloc, tmpreloc, &sec->reloc_list, list) { - list_del(&reloc->list); - hash_del(&reloc->hash); -- free(reloc); - } - list_del(&sec->list); -- free(sec); -+ free(sec->reloc_data); + If unsure, say Y. + ++config PER_VMA_LOCK_STATS ++ bool "Statistics for per-vma locks" ++ depends on PER_VMA_LOCK ++ default y ++ help ++ Statistics for per-vma locks. 
+diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index 245038a9fe4e..4d860b53a14a 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -6004,6 +6004,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, + int need_wait_lock = 0; + unsigned long haddr = address & huge_page_mask(h); + ++ /* TODO: Handle faults under the VMA lock */ ++ if (flags & FAULT_FLAG_VMA_LOCK) ++ return VM_FAULT_RETRY; ++ + /* + * Serialize hugepage allocation and instantiation, so that we don't + * get spurious allocation failures if two CPUs race to instantiate +diff --git a/mm/init-mm.c b/mm/init-mm.c +index c9327abb771c..33269314e060 100644 +--- a/mm/init-mm.c ++++ b/mm/init-mm.c +@@ -37,6 +37,9 @@ struct mm_struct init_mm = { + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), ++#ifdef CONFIG_PER_VMA_LOCK ++ .mm_lock_seq = 0, ++#endif + .user_ns = &init_user_ns, + .cpu_bitmap = CPU_BITS_NONE, + #ifdef CONFIG_IOMMU_SVA +diff --git a/mm/internal.h b/mm/internal.h +index 7920a8b7982e..0c455d6e4e3e 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -105,7 +105,7 @@ void folio_activate(struct folio *folio); + + void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *start_vma, unsigned long floor, +- unsigned long ceiling); ++ unsigned long ceiling, bool mm_wr_locked); + void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); + + struct zap_details; +diff --git a/mm/khugepaged.c b/mm/khugepaged.c +index 92e6f56a932d..042007f0bfa1 100644 +--- a/mm/khugepaged.c ++++ b/mm/khugepaged.c +@@ -1049,6 +1049,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, + if (result != SCAN_SUCCEED) + goto out_up_write; + ++ vma_start_write(vma); + anon_vma_lock_write(vma->anon_vma); + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, +@@ -1172,7 +1173,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, + * enabled swap entries. Please see + * comment below for pte_uffd_wp(). 
+ */ +- if (pte_swp_uffd_wp(pteval)) { ++ if (pte_swp_uffd_wp_any(pteval)) { + result = SCAN_PTE_UFFD_WP; + goto out_unmap; + } +@@ -1512,6 +1513,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + goto drop_hpage; } -+ free(elf->symbol_data); -+ free(elf->section_data); - free(elf); - } -diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h -index fa45044e3863..2a108e648b7a 100644 ---- a/tools/objtool/include/objtool/builtin.h -+++ b/tools/objtool/include/objtool/builtin.h -@@ -7,8 +7,6 @@ - - #include - --extern const struct option check_options[]; -- - struct opts { - /* actions: */ - bool dump_orc; -diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h -index bb60fd42b46f..ad0024da262b 100644 ---- a/tools/objtool/include/objtool/elf.h -+++ b/tools/objtool/include/objtool/elf.h -@@ -39,6 +39,7 @@ struct section { - char *name; - int idx; - bool changed, text, rodata, noinstr, init, truncate; -+ struct reloc *reloc_data; - }; - - struct symbol { -@@ -49,12 +50,11 @@ struct symbol { - GElf_Sym sym; - struct section *sec; - char *name; -- unsigned int idx; -- unsigned char bind, type; -+ unsigned int idx, len; - unsigned long offset; -- unsigned int len; - unsigned long __subtree_last; - struct symbol *pfunc, *cfunc, *alias; -+ unsigned char bind, type; - u8 uaccess_safe : 1; - u8 static_call_tramp : 1; - u8 retpoline_thunk : 1; -@@ -104,6 +104,9 @@ struct elf { - struct hlist_head *section_hash; - struct hlist_head *section_name_hash; - struct hlist_head *reloc_hash; ++ /* Lock the vma before taking i_mmap and page table locks */ ++ vma_start_write(vma); + -+ struct section *section_data; -+ struct symbol *symbol_data; - }; + /* + * We need to lock the mapping so that from here on, only GUP-fast and + * hardware page walks can access the parts of the page tables that +@@ -1689,6 +1693,10 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, + result = SCAN_PTE_MAPPED_HUGEPAGE; + if ((cc->is_khugepaged || is_target) && + mmap_write_trylock(mm)) { ++ /* trylock for the same lock inversion as above */ ++ if (!vma_try_start_write(vma)) ++ goto unlock_next; ++ + /* + * Re-check whether we have an ->anon_vma, because + * collapse_and_free_pmd() requires that either no +diff --git a/mm/memory.c b/mm/memory.c +index 01a23ad48a04..e7ffdadb684d 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -104,6 +104,20 @@ EXPORT_SYMBOL(mem_map); + #endif - #define OFFSET_STRIDE_BITS 4 -diff --git a/tools/objtool/include/objtool/special.h b/tools/objtool/include/objtool/special.h -index dc4721e19002..86d4af9c5aa9 100644 ---- a/tools/objtool/include/objtool/special.h -+++ b/tools/objtool/include/objtool/special.h -@@ -19,6 +19,7 @@ struct special_alt { - bool skip_orig; - bool skip_alt; - bool jump_or_nop; -+ u8 key_addend; + static vm_fault_t do_fault(struct vm_fault *vmf); ++static vm_fault_t do_anonymous_page(struct vm_fault *vmf); ++static bool vmf_pte_changed(struct vm_fault *vmf); ++ ++/* ++ * Return true if the original pte was a uffd-wp pte marker (so the pte was ++ * wr-protected). 
++ */ ++static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) ++{ ++ if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) ++ return false; ++ ++ return pte_marker_uffd_wp(vmf->orig_pte); ++} - struct section *orig_sec; - unsigned long orig_off; -@@ -27,7 +28,6 @@ struct special_alt { - unsigned long new_off; + /* + * A number of key systems in x86 including ioremap() rely on the assumption +@@ -348,7 +362,7 @@ void free_pgd_range(struct mmu_gather *tlb, - unsigned int orig_len, new_len; /* group only */ -- u8 key_addend; - }; + void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *vma, unsigned long floor, +- unsigned long ceiling) ++ unsigned long ceiling, bool mm_wr_locked) + { + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); - int special_get_alts(struct elf *elf, struct list_head *alts); -diff --git a/tools/objtool/special.c b/tools/objtool/special.c -index 9c8d827f69af..baa85c31526b 100644 ---- a/tools/objtool/special.c -+++ b/tools/objtool/special.c -@@ -26,7 +26,7 @@ struct special_entry { - unsigned char key; /* jump_label key */ - }; +@@ -366,6 +380,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + * Hide vma from rmap and truncate_pagecache before freeing + * pgtables + */ ++ if (mm_wr_locked) ++ vma_start_write(vma); + unlink_anon_vmas(vma); + unlink_file_vma(vma); --struct special_entry entries[] = { -+static const struct special_entry entries[] = { - { - .sec = ".altinstructions", - .group = true, -@@ -65,7 +65,7 @@ static void reloc_to_sec_off(struct reloc *reloc, struct section **sec, - *off = reloc->sym->offset + reloc->addend; +@@ -380,6 +396,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + && !is_vm_hugetlb_page(next)) { + vma = next; + next = mas_find(&mas, ceiling - 1); ++ if (mm_wr_locked) ++ vma_start_write(vma); + unlink_anon_vmas(vma); + unlink_file_vma(vma); + } +@@ -1345,6 +1363,10 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte, + struct zap_details *details, pte_t pteval) + { ++ /* Zap on anonymous always means dropping everything */ ++ if (vma_is_anonymous(vma)) ++ return; ++ + if (zap_drop_file_uffd_wp(details)) + return; + +@@ -1451,8 +1473,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, + continue; + rss[mm_counter(page)]--; + } else if (pte_marker_entry_uffd_wp(entry)) { +- /* Only drop the uffd-wp marker if explicitly requested */ +- if (!zap_drop_file_uffd_wp(details)) ++ /* ++ * For anon: always drop the marker; for file: only ++ * drop the marker if explicitly requested. 
++ */ ++ if (!vma_is_anonymous(vma) && ++ !zap_drop_file_uffd_wp(details)) + continue; + } else if (is_hwpoison_entry(entry) || + is_swapin_error_entry(entry)) { +@@ -3322,6 +3348,9 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) + struct vm_area_struct *vma = vmf->vma; + struct folio *folio = NULL; + ++ if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vma)) ++ return VM_FAULT_RETRY; ++ + if (likely(!unshare)) { + if (userfaultfd_pte_wp(vma, *vmf->pte)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); +@@ -3633,6 +3662,14 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) + return 0; } --static int get_alt_entry(struct elf *elf, struct special_entry *entry, -+static int get_alt_entry(struct elf *elf, const struct special_entry *entry, - struct section *sec, int idx, - struct special_alt *alt) - { -@@ -139,7 +139,7 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, ++static vm_fault_t do_pte_missing(struct vm_fault *vmf) ++{ ++ if (vma_is_anonymous(vmf->vma)) ++ return do_anonymous_page(vmf); ++ else ++ return do_fault(vmf); ++} ++ + /* + * This is actually a page-missing access, but with uffd-wp special pte + * installed. It means this pte was wr-protected before being unmapped. +@@ -3643,11 +3680,10 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) + * Just in case there're leftover special ptes even after the region + * got unregistered - we can simply clear them. + */ +- if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma))) ++ if (unlikely(!userfaultfd_wp(vmf->vma))) + return pte_marker_clear(vmf); + +- /* do_fault() can handle pte markers too like none pte */ +- return do_fault(vmf); ++ return do_pte_missing(vmf); + } + + static vm_fault_t handle_pte_marker(struct vm_fault *vmf) +@@ -3698,6 +3734,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) + if (!pte_unmap_same(vmf)) + goto out; + ++ if (vmf->flags & FAULT_FLAG_VMA_LOCK) { ++ ret = VM_FAULT_RETRY; ++ goto out; ++ } ++ + entry = pte_to_swp_entry(vmf->orig_pte); + if (unlikely(non_swap_entry(entry))) { + if (is_migration_entry(entry)) { +@@ -4012,6 +4053,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ - int special_get_alts(struct elf *elf, struct list_head *alts) + static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { -- struct special_entry *entry; -+ const struct special_entry *entry; - struct section *sec; - unsigned int nr_entries; - struct special_alt *alt; ++ bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); + struct vm_area_struct *vma = vmf->vma; + struct folio *folio; + vm_fault_t ret = 0; +@@ -4045,7 +4087,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) + vma->vm_page_prot)); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); +- if (!pte_none(*vmf->pte)) { ++ if (vmf_pte_changed(vmf)) { + update_mmu_tlb(vma, vmf->address, vmf->pte); + goto unlock; + } +@@ -4085,7 +4127,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) + + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); +- if (!pte_none(*vmf->pte)) { ++ if (vmf_pte_changed(vmf)) { + update_mmu_tlb(vma, vmf->address, vmf->pte); + goto release; + } +@@ -4105,6 +4147,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) + folio_add_new_anon_rmap(folio, vma, vmf->address); + folio_add_lru_vma(folio, vma); + setpte: ++ if (uffd_wp) ++ entry = pte_mkuffd_wp(entry); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); + + /* No need to invalidate - it was non-present before */ +@@ -4272,7 
+4316,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) + void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) + { + struct vm_area_struct *vma = vmf->vma; +- bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte); ++ bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); + bool write = vmf->flags & FAULT_FLAG_WRITE; + bool prefault = vmf->address != addr; + pte_t entry; +@@ -4503,6 +4547,8 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) + return ret; + } + ++ if (vmf->flags & FAULT_FLAG_VMA_LOCK) ++ return VM_FAULT_RETRY; + ret = __do_fault(vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; +@@ -4519,6 +4565,9 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) + struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret; + ++ if (vmf->flags & FAULT_FLAG_VMA_LOCK) ++ return VM_FAULT_RETRY; ++ + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + +@@ -4558,6 +4607,9 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) + struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret, tmp; + ++ if (vmf->flags & FAULT_FLAG_VMA_LOCK) ++ return VM_FAULT_RETRY; ++ + ret = __do_fault(vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; +@@ -4916,12 +4968,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) + } + } + +- if (!vmf->pte) { +- if (vma_is_anonymous(vmf->vma)) +- return do_anonymous_page(vmf); +- else +- return do_fault(vmf); +- } ++ if (!vmf->pte) ++ return do_pte_missing(vmf); + + if (!pte_present(vmf->orig_pte)) + return do_swap_page(vmf); +@@ -4929,6 +4977,9 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) + if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) + return do_numa_page(vmf); + ++ if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vmf->vma)) ++ return VM_FAULT_RETRY; ++ + vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + entry = vmf->orig_pte; +@@ -4965,10 +5016,10 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) + } + + /* +- * By the time we get here, we already hold the mm semaphore +- * +- * The mmap_lock may have been released depending on flags and our +- * return value. See filemap_fault() and __folio_lock_or_retry(). ++ * On entry, we hold either the VMA lock or the mmap_lock ++ * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in ++ * the result, the mmap_lock is not held on exit. See filemap_fault() ++ * and __folio_lock_or_retry(). + */ + static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +@@ -5230,6 +5281,63 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, + } + EXPORT_SYMBOL_GPL(handle_mm_fault); + ++#ifdef CONFIG_PER_VMA_LOCK ++/* ++ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be ++ * stable and not isolated. If the VMA is not found or is being modified the ++ * function returns NULL. 
++ */ ++struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, ++ unsigned long address) ++{ ++ MA_STATE(mas, &mm->mm_mt, address, address); ++ struct vm_area_struct *vma; ++ ++ rcu_read_lock(); ++retry: ++ vma = mas_walk(&mas); ++ if (!vma) ++ goto inval; ++ ++ /* find_mergeable_anon_vma uses adjacent vmas which are not locked */ ++ if (vma_is_anonymous(vma) && !vma->anon_vma) ++ goto inval; ++ ++ if (!vma_start_read(vma)) ++ goto inval; ++ ++ /* ++ * Due to the possibility of userfault handler dropping mmap_lock, avoid ++ * it for now and fall back to page fault handling under mmap_lock. ++ */ ++ if (userfaultfd_armed(vma)) { ++ vma_end_read(vma); ++ goto inval; ++ } ++ ++ /* Check since vm_start/vm_end might change before we lock the VMA */ ++ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { ++ vma_end_read(vma); ++ goto inval; ++ } ++ ++ /* Check if the VMA got isolated after we found it */ ++ if (vma->detached) { ++ vma_end_read(vma); ++ count_vm_vma_lock_event(VMA_LOCK_MISS); ++ /* The area was replaced with another one */ ++ goto retry; ++ } ++ ++ rcu_read_unlock(); ++ return vma; ++inval: ++ rcu_read_unlock(); ++ count_vm_vma_lock_event(VMA_LOCK_ABORT); ++ return NULL; ++} ++#endif /* CONFIG_PER_VMA_LOCK */ ++ + #ifndef __PAGETABLE_P4D_FOLDED + /* + * Allocate p4d page table. +diff --git a/mm/mmap.c b/mm/mmap.c +index ff68a67a2a7c..a2bc2d9432b8 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -133,7 +133,7 @@ void unlink_file_vma(struct vm_area_struct *vma) + /* + * Close a vm structure and free it. + */ +-static void remove_vma(struct vm_area_struct *vma) ++static void remove_vma(struct vm_area_struct *vma, bool unreachable) + { + might_sleep(); + if (vma->vm_ops && vma->vm_ops->close) +@@ -141,7 +141,10 @@ static void remove_vma(struct vm_area_struct *vma) + if (vma->vm_file) + fput(vma->vm_file); + mpol_put(vma_policy(vma)); +- vm_area_free(vma); ++ if (unreachable) ++ __vm_area_free(vma); ++ else ++ vm_area_free(vma); + } + + static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, +@@ -502,6 +505,15 @@ static inline void init_vma_prep(struct vma_prepare *vp, + */ + static inline void vma_prepare(struct vma_prepare *vp) + { ++ vma_start_write(vp->vma); ++ if (vp->adj_next) ++ vma_start_write(vp->adj_next); ++ /* vp->insert is always a newly created VMA, no need for locking */ ++ if (vp->remove) ++ vma_start_write(vp->remove); ++ if (vp->remove2) ++ vma_start_write(vp->remove2); ++ + if (vp->file) { + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); + +@@ -590,6 +602,7 @@ static inline void vma_complete(struct vma_prepare *vp, + + if (vp->remove) { + again: ++ vma_mark_detached(vp->remove, true); + if (vp->file) { + uprobe_munmap(vp->remove, vp->remove->vm_start, + vp->remove->vm_end); +@@ -683,12 +696,12 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, + if (vma_iter_prealloc(vmi)) + goto nomem; + ++ vma_prepare(&vp); + vma_adjust_trans_huge(vma, start, end, 0); + /* VMA iterator points to previous, so set to start if necessary */ + if (vma_iter_addr(vmi) != start) + vma_iter_set(vmi, start); + +- vma_prepare(&vp); + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; +@@ -723,8 +736,8 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, + return -ENOMEM; + + init_vma_prep(&vp, vma); +- vma_adjust_trans_huge(vma, start, end, 0); + vma_prepare(&vp); ++ vma_adjust_trans_huge(vma, start, end, 0); + + if (vma->vm_start < start) + vma_iter_clear(vmi, vma->vm_start, 
start); +@@ -994,12 +1007,12 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, + if (vma_iter_prealloc(vmi)) + return NULL; + +- vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next); + init_multi_vma_prep(&vp, vma, adjust, remove, remove2); + VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && + vp.anon_vma != adjust->anon_vma); + + vma_prepare(&vp); ++ vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next); + if (vma_start < vma->vm_start || vma_end > vma->vm_end) + vma_expanded = true; + +@@ -2119,7 +2132,7 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) + if (vma->vm_flags & VM_ACCOUNT) + nr_accounted += nrpages; + vm_stat_account(mm, vma->vm_flags, -nrpages); +- remove_vma(vma); ++ remove_vma(vma, false); + } + vm_unacct_memory(nr_accounted); + validate_mm(mm); +@@ -2142,7 +2155,8 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, + update_hiwater_rss(mm); + unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked); + free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, +- next ? next->vm_start : USER_PGTABLES_CEILING); ++ next ? next->vm_start : USER_PGTABLES_CEILING, ++ mm_wr_locked); + tlb_finish_mmu(&tlb); + } + +@@ -2198,10 +2212,10 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + +- vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); + init_vma_prep(&vp, vma); + vp.insert = new; + vma_prepare(&vp); ++ vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); + + if (new_below) { + vma->vm_start = addr; +@@ -2245,10 +2259,12 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + static inline int munmap_sidetree(struct vm_area_struct *vma, + struct ma_state *mas_detach) + { ++ vma_start_write(vma); + mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1); + if (mas_store_gfp(mas_detach, vma, GFP_KERNEL)) + return -ENOMEM; + ++ vma_mark_detached(vma, true); + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm -= vma_pages(vma); + +@@ -2904,9 +2920,9 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, + if (vma_iter_prealloc(vmi)) + goto unacct_fail; + +- vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); + init_vma_prep(&vp, vma); + vma_prepare(&vp); ++ vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); + vma->vm_end = addr + len; + vm_flags_set(vma, VM_SOFTDIRTY); + vma_iter_store(vmi, vma); +@@ -3039,7 +3055,7 @@ void exit_mmap(struct mm_struct *mm) + mmap_write_lock(mm); + mt_clear_in_rcu(&mm->mm_mt); + free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, +- USER_PGTABLES_CEILING); ++ USER_PGTABLES_CEILING, true); + tlb_finish_mmu(&tlb); + + /* +@@ -3050,7 +3066,7 @@ void exit_mmap(struct mm_struct *mm) + do { + if (vma->vm_flags & VM_ACCOUNT) + nr_accounted += vma_pages(vma); +- remove_vma(vma); ++ remove_vma(vma, true); + count++; + cond_resched(); + } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); +@@ -3173,6 +3189,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); ++ vma_start_write(new_vma); + if (vma_link(mm, new_vma)) + goto out_vma_link; + *need_rmap_locks = false; +@@ -3467,6 +3484,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) + * of mm/rmap.c: + * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for + * hugetlb mapping); ++ 
* - all vmas marked locked + * - all i_mmap_rwsem locks; + * - all anon_vma->rwseml + * +@@ -3489,6 +3507,13 @@ int mm_take_all_locks(struct mm_struct *mm) + + mutex_lock(&mm_all_locks_mutex); + ++ mas_for_each(&mas, vma, ULONG_MAX) { ++ if (signal_pending(current)) ++ goto out_unlock; ++ vma_start_write(vma); ++ } ++ ++ mas_set(&mas, 0); + mas_for_each(&mas, vma, ULONG_MAX) { + if (signal_pending(current)) + goto out_unlock; +@@ -3578,6 +3603,7 @@ void mm_drop_all_locks(struct mm_struct *mm) + if (vma->vm_file && vma->vm_file->f_mapping) + vm_unlock_mapping(vma->vm_file->f_mapping); + } ++ vma_end_write_all(mm); + + mutex_unlock(&mm_all_locks_mutex); + } +diff --git a/mm/mprotect.c b/mm/mprotect.c +index 13e84d8c0797..b9da9a5f87fe 100644 +--- a/mm/mprotect.c ++++ b/mm/mprotect.c +@@ -276,7 +276,15 @@ static long change_pte_range(struct mmu_gather *tlb, + } else { + /* It must be an none page, or what else?.. */ + WARN_ON_ONCE(!pte_none(oldpte)); +- if (unlikely(uffd_wp && !vma_is_anonymous(vma))) { ++ ++ /* ++ * Nobody plays with any none ptes besides ++ * userfaultfd when applying the protections. ++ */ ++ if (likely(!uffd_wp)) ++ continue; ++ ++ if (userfaultfd_wp_use_markers(vma)) { + /* + * For file-backed mem, we need to be able to + * wr-protect a none pte, because even if the +@@ -320,23 +328,46 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) + return 0; + } + +-/* Return true if we're uffd wr-protecting file-backed memory, or false */ ++/* ++ * Return true if we want to split THPs into PTE mappings in change ++ * protection procedure, false otherwise. ++ */ + static inline bool +-uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) ++pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags) + { ++ /* ++ * pte markers only resides in pte level, if we need pte markers, ++ * we need to split. We cannot wr-protect shmem thp because file ++ * thp is handled differently when split by erasing the pmd so far. ++ */ + return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma); + } + + /* +- * If wr-protecting the range for file-backed, populate pgtable for the case +- * when pgtable is empty but page cache exists. When {pte|pmd|...}_alloc() +- * failed we treat it the same way as pgtable allocation failures during +- * page faults by kicking OOM and returning error. ++ * Return true if we want to populate pgtables in change protection ++ * procedure, false otherwise ++ */ ++static inline bool ++pgtable_populate_needed(struct vm_area_struct *vma, unsigned long cp_flags) ++{ ++ /* If not within ioctl(UFFDIO_WRITEPROTECT), then don't bother */ ++ if (!(cp_flags & MM_CP_UFFD_WP)) ++ return false; ++ ++ /* Populate if the userfaultfd mode requires pte markers */ ++ return userfaultfd_wp_use_markers(vma); ++} ++ ++/* ++ * Populate the pgtable underneath for whatever reason if requested. ++ * When {pte|pmd|...}_alloc() failed we treat it the same way as pgtable ++ * allocation failures during page faults by kicking OOM and returning ++ * error. 
+ */ + #define change_pmd_prepare(vma, pmd, cp_flags) \ + ({ \ + long err = 0; \ +- if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ ++ if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \ + if (pte_alloc(vma->vm_mm, pmd)) \ + err = -ENOMEM; \ + } \ +@@ -351,7 +382,7 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) + #define change_prepare(vma, high, low, addr, cp_flags) \ + ({ \ + long err = 0; \ +- if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ ++ if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \ + low##_t *p = low##_alloc(vma->vm_mm, high, addr); \ + if (p == NULL) \ + err = -ENOMEM; \ +@@ -404,7 +435,7 @@ static inline long change_pmd_range(struct mmu_gather *tlb, + + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if ((next - addr != HPAGE_PMD_SIZE) || +- uffd_wp_protect_file(vma, cp_flags)) { ++ pgtable_split_needed(vma, cp_flags)) { + __split_huge_pmd(vma, pmd, addr, false, NULL); + /* + * For file-backed, the pmd could have been +diff --git a/mm/mremap.c b/mm/mremap.c +index 411a85682b58..dd541e59edda 100644 +--- a/mm/mremap.c ++++ b/mm/mremap.c +@@ -623,6 +623,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, + return -ENOMEM; + } + ++ vma_start_write(vma); + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); + new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, + &need_rmap_locks); +diff --git a/mm/rmap.c b/mm/rmap.c +index 8632e02661ac..cfdaa56cad3e 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -25,21 +25,22 @@ + * mapping->invalidate_lock (in filemap_fault) + * page->flags PG_locked (lock_page) + * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) +- * mapping->i_mmap_rwsem +- * anon_vma->rwsem +- * mm->page_table_lock or pte_lock +- * swap_lock (in swap_duplicate, swap_info_get) +- * mmlist_lock (in mmput, drain_mmlist and others) +- * mapping->private_lock (in block_dirty_folio) +- * folio_lock_memcg move_lock (in block_dirty_folio) +- * i_pages lock (widely used) +- * lruvec->lru_lock (in folio_lruvec_lock_irq) +- * inode->i_lock (in set_page_dirty's __mark_inode_dirty) +- * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) +- * sb_lock (within inode_lock in fs/fs-writeback.c) +- * i_pages lock (widely used, in set_page_dirty, +- * in arch-dependent flush_dcache_mmap_lock, +- * within bdi.wb->list_lock in __sync_single_inode) ++ * vma_start_write ++ * mapping->i_mmap_rwsem ++ * anon_vma->rwsem ++ * mm->page_table_lock or pte_lock ++ * swap_lock (in swap_duplicate, swap_info_get) ++ * mmlist_lock (in mmput, drain_mmlist and others) ++ * mapping->private_lock (in block_dirty_folio) ++ * folio_lock_memcg move_lock (in block_dirty_folio) ++ * i_pages lock (widely used) ++ * lruvec->lru_lock (in folio_lruvec_lock_irq) ++ * inode->i_lock (in set_page_dirty's __mark_inode_dirty) ++ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) ++ * sb_lock (within inode_lock in fs/fs-writeback.c) ++ * i_pages lock (widely used, in set_page_dirty, ++ * in arch-dependent flush_dcache_mmap_lock, ++ * within bdi.wb->list_lock in __sync_single_inode) + * + * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) + * ->tasklist_lock +diff --git a/mm/vmstat.c b/mm/vmstat.c +index 1ea6a5ce1c41..4f1089a1860e 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -1399,6 +1399,12 @@ const char * const vmstat_text[] = { + "direct_map_level2_splits", + "direct_map_level3_splits", + #endif ++#ifdef CONFIG_PER_VMA_LOCK_STATS ++ 
"vma_lock_success", ++ "vma_lock_abort", ++ "vma_lock_retry", ++ "vma_lock_miss", ++#endif + #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ + }; + #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ +diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c +index 7f22844ed704..e030d63c031a 100644 +--- a/tools/testing/selftests/mm/userfaultfd.c ++++ b/tools/testing/selftests/mm/userfaultfd.c +@@ -1444,6 +1444,43 @@ static int pagemap_test_fork(bool present) + return result; + } + ++static void userfaultfd_wp_unpopulated_test(int pagemap_fd) ++{ ++ uint64_t value; ++ ++ /* Test applying pte marker to anon unpopulated */ ++ wp_range(uffd, (uint64_t)area_dst, page_size, true); ++ value = pagemap_read_vaddr(pagemap_fd, area_dst); ++ pagemap_check_wp(value, true); ++ ++ /* Test unprotect on anon pte marker */ ++ wp_range(uffd, (uint64_t)area_dst, page_size, false); ++ value = pagemap_read_vaddr(pagemap_fd, area_dst); ++ pagemap_check_wp(value, false); ++ ++ /* Test zap on anon marker */ ++ wp_range(uffd, (uint64_t)area_dst, page_size, true); ++ if (madvise(area_dst, page_size, MADV_DONTNEED)) ++ err("madvise(MADV_DONTNEED) failed"); ++ value = pagemap_read_vaddr(pagemap_fd, area_dst); ++ pagemap_check_wp(value, false); ++ ++ /* Test fault in after marker removed */ ++ *area_dst = 1; ++ value = pagemap_read_vaddr(pagemap_fd, area_dst); ++ pagemap_check_wp(value, false); ++ /* Drop it to make pte none again */ ++ if (madvise(area_dst, page_size, MADV_DONTNEED)) ++ err("madvise(MADV_DONTNEED) failed"); ++ ++ /* Test read-zero-page upon pte marker */ ++ wp_range(uffd, (uint64_t)area_dst, page_size, true); ++ *(volatile char *)area_dst; ++ /* Drop it to make pte none again */ ++ if (madvise(area_dst, page_size, MADV_DONTNEED)) ++ err("madvise(MADV_DONTNEED) failed"); ++} ++ + static void userfaultfd_pagemap_test(unsigned int test_pgsize) + { + struct uffdio_register uffdio_register; +@@ -1462,7 +1499,7 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize) + /* Flush so it doesn't flush twice in parent/child later */ + fflush(stdout); + +- uffd_test_ctx_init(0); ++ uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED); + + if (test_pgsize > page_size) { + /* This is a thp test */ +@@ -1482,6 +1519,10 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize) + + pagemap_fd = pagemap_open(); + ++ /* Smoke test WP_UNPOPULATED first when it's still empty */ ++ if (test_pgsize == page_size) ++ userfaultfd_wp_unpopulated_test(pagemap_fd); ++ + /* Touch the page */ + *area_dst = 1; + wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); +@@ -1526,7 +1567,7 @@ static int userfaultfd_stress(void) + struct uffdio_register uffdio_register; + struct uffd_stats uffd_stats[nr_cpus]; + +- uffd_test_ctx_init(0); ++ uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED); + + if (posix_memalign(&area, page_size, page_size)) + err("out of memory"); -- -2.40.0.rc2 +2.40.0 -From 56bbff019101b84507c1e796512b1be6840c6eda Mon Sep 17 00:00:00 2001 +From d0f327c32c39cafbdeefac5fd65a0087a603e76f Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 3 Mar 2023 17:02:07 +0100 -Subject: [PATCH 13/16] sched +Date: Sun, 9 Apr 2023 21:25:55 +0200 +Subject: [PATCH 09/10] sched Signed-off-by: Peter Jung --- - arch/x86/kernel/itmt.c | 23 +- - arch/x86/kernel/smpboot.c | 2 +- - include/linux/sched.h | 3 + - include/linux/sched/sd_flags.h | 5 +- - kernel/sched/core.c | 77 ++-- - kernel/sched/cpufreq_schedutil.c | 43 +-- - kernel/sched/deadline.c | 42 ++- - 
kernel/sched/debug.c | 1 + - kernel/sched/fair.c | 591 ++++++++++++++++++++----------- - kernel/sched/features.h | 1 + - kernel/sched/pelt.c | 60 ++++ - kernel/sched/pelt.h | 42 ++- - kernel/sched/sched.h | 28 +- - 13 files changed, 601 insertions(+), 317 deletions(-) + arch/x86/kernel/itmt.c | 23 +-- + arch/x86/kernel/smpboot.c | 4 +- + include/linux/sched.h | 3 + + include/linux/sched/sd_flags.h | 5 +- + kernel/sched/core.c | 4 +- + kernel/sched/debug.c | 1 + + kernel/sched/fair.c | 265 ++++++++++++++++++++------------- + kernel/sched/features.h | 1 + + kernel/sched/pelt.c | 60 ++++++++ + kernel/sched/pelt.h | 42 +++++- + kernel/sched/sched.h | 23 ++- + 11 files changed, 294 insertions(+), 137 deletions(-) diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 9ff480e94511..6510883c5e81 100644 @@ -33507,7 +12550,7 @@ index 9ff480e94511..6510883c5e81 100644 + per_cpu(sched_core_priority, cpu) = prio; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c -index 55cad72715d9..0213d066a9a9 100644 +index 9013bb28255a..cea297d97034 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -547,7 +547,7 @@ static int x86_core_flags(void) @@ -33519,8 +12562,17 @@ index 55cad72715d9..0213d066a9a9 100644 } #endif #ifdef CONFIG_SCHED_CLUSTER +@@ -578,7 +578,7 @@ static struct sched_domain_topology_level x86_hybrid_topology[] = { + #ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, + #endif +- { cpu_cpu_mask, SD_INIT_NAME(DIE) }, ++ { cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(DIE) }, + { NULL, }, + }; + diff --git a/include/linux/sched.h b/include/linux/sched.h -index 853d08f7562b..28ce1be0ba47 100644 +index 63d242164b1a..6d398b337b0d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -557,6 +557,9 @@ struct sched_entity { @@ -33534,7 +12586,7 @@ index 853d08f7562b..28ce1be0ba47 100644 #ifdef CONFIG_FAIR_GROUP_SCHED int depth; diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h -index 57bde66d95f7..800238854ba5 100644 +index 57bde66d95f7..fad77b5172e2 100644 --- a/include/linux/sched/sd_flags.h +++ b/include/linux/sched/sd_flags.h @@ -132,12 +132,9 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) @@ -33547,15 +12599,15 @@ index 57bde66d95f7..800238854ba5 100644 * NEEDS_GROUPS: Load balancing flag. */ -SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) -+SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS) ++SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS) /* * Prefer to place tasks in a sibling domain diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 2a4918a1faa9..5237639786b7 100644 +index 0d18c3969f90..17bb9637f314 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -722,7 +722,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) +@@ -724,7 +724,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) update_irq_load_avg(rq, irq_delta + steal); #endif @@ -33564,112 +12616,7 @@ index 2a4918a1faa9..5237639786b7 100644 } void update_rq_clock(struct rq *rq) -@@ -3675,14 +3675,39 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) - } - - /* -- * Mark the task runnable and perform wakeup-preemption. -+ * Mark the task runnable. 
- */ --static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, -- struct rq_flags *rf) -+static inline void ttwu_do_wakeup(struct task_struct *p) - { -- check_preempt_curr(rq, p, wake_flags); - WRITE_ONCE(p->__state, TASK_RUNNING); - trace_sched_wakeup(p); -+} -+ -+static void -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, -+ struct rq_flags *rf) -+{ -+ int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; -+ -+ lockdep_assert_rq_held(rq); -+ -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible--; -+ -+#ifdef CONFIG_SMP -+ if (wake_flags & WF_MIGRATED) -+ en_flags |= ENQUEUE_MIGRATED; -+ else -+#endif -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+ activate_task(rq, p, en_flags); -+ check_preempt_curr(rq, p, wake_flags); -+ -+ ttwu_do_wakeup(p); - - #ifdef CONFIG_SMP - if (p->sched_class->task_woken) { -@@ -3712,31 +3737,6 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, - #endif - } - --static void --ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, -- struct rq_flags *rf) --{ -- int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; -- -- lockdep_assert_rq_held(rq); -- -- if (p->sched_contributes_to_load) -- rq->nr_uninterruptible--; -- --#ifdef CONFIG_SMP -- if (wake_flags & WF_MIGRATED) -- en_flags |= ENQUEUE_MIGRATED; -- else --#endif -- if (p->in_iowait) { -- delayacct_blkio_end(p); -- atomic_dec(&task_rq(p)->nr_iowait); -- } -- -- activate_task(rq, p, en_flags); -- ttwu_do_wakeup(rq, p, wake_flags, rf); --} -- - /* - * Consider @p being inside a wait loop: - * -@@ -3770,9 +3770,15 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) - - rq = __task_rq_lock(p, &rf); - if (task_on_rq_queued(p)) { -- /* check_preempt_curr() may use rq clock */ -- update_rq_clock(rq); -- ttwu_do_wakeup(rq, p, wake_flags, &rf); -+ if (!task_on_cpu(rq, p)) { -+ /* -+ * When on_rq && !on_cpu the task is preempted, see if -+ * it should preempt the task that is current now. 
-+ */ -+ update_rq_clock(rq); -+ check_preempt_curr(rq, p, wake_flags); -+ } -+ ttwu_do_wakeup(p); - ret = 1; - } - __task_rq_unlock(rq, &rf); -@@ -4138,8 +4144,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) - goto out; - - trace_sched_waking(p); -- WRITE_ONCE(p->__state, TASK_RUNNING); -- trace_sched_wakeup(p); -+ ttwu_do_wakeup(p); - goto out; - } - -@@ -4424,6 +4429,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4434,6 +4434,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; @@ -33678,215 +12625,6 @@ index 2a4918a1faa9..5237639786b7 100644 INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_FAIR_GROUP_SCHED -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index 1207c78f85c1..5c840151f3bb 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -48,7 +48,6 @@ struct sugov_cpu { - - unsigned long util; - unsigned long bw_dl; -- unsigned long max; - - /* The field below is for single-CPU policies only: */ - #ifdef CONFIG_NO_HZ_COMMON -@@ -158,7 +157,6 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) - { - struct rq *rq = cpu_rq(sg_cpu->cpu); - -- sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); - sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), - FREQUENCY_UTIL, NULL); -@@ -238,6 +236,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, - * sugov_iowait_apply() - Apply the IO boost to a CPU. - * @sg_cpu: the sugov data for the cpu to boost - * @time: the update time from the caller -+ * @max_cap: the max CPU capacity - * - * A CPU running a task which woken up after an IO operation can have its - * utilization boosted to speed up the completion of those IO operations. -@@ -251,7 +250,8 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, - * This mechanism is designed to boost high frequently IO waiting tasks, while - * being more conservative on tasks which does sporadic IO operations. - */ --static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) -+static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, -+ unsigned long max_cap) - { - unsigned long boost; - -@@ -280,7 +280,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) - * sg_cpu->util is already in capacity scale; convert iowait_boost - * into the same scale so we can compare. 
- */ -- boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT; -+ boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; - boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); - if (sg_cpu->util < boost) - sg_cpu->util = boost; -@@ -310,7 +310,8 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) - } - - static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, -- u64 time, unsigned int flags) -+ u64 time, unsigned long max_cap, -+ unsigned int flags) - { - sugov_iowait_boost(sg_cpu, time, flags); - sg_cpu->last_update = time; -@@ -321,7 +322,7 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, - return false; - - sugov_get_util(sg_cpu); -- sugov_iowait_apply(sg_cpu, time); -+ sugov_iowait_apply(sg_cpu, time, max_cap); - - return true; - } -@@ -332,12 +333,15 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, - struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); - struct sugov_policy *sg_policy = sg_cpu->sg_policy; - unsigned int cached_freq = sg_policy->cached_raw_freq; -+ unsigned long max_cap; - unsigned int next_f; - -- if (!sugov_update_single_common(sg_cpu, time, flags)) -+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); -+ -+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) - return; - -- next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max); -+ next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); - /* - * Do not reduce the frequency if the CPU has not been idle - * recently, as the reduction is likely to be premature then. -@@ -374,6 +378,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, - { - struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); - unsigned long prev_util = sg_cpu->util; -+ unsigned long max_cap; - - /* - * Fall back to the "frequency" path if frequency invariance is not -@@ -385,7 +390,9 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, - return; - } - -- if (!sugov_update_single_common(sg_cpu, time, flags)) -+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); -+ -+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) - return; - - /* -@@ -399,7 +406,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, - sg_cpu->util = prev_util; - - cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), -- map_util_perf(sg_cpu->util), sg_cpu->max); -+ map_util_perf(sg_cpu->util), max_cap); - - sg_cpu->sg_policy->last_freq_update_time = time; - } -@@ -408,25 +415,21 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) - { - struct sugov_policy *sg_policy = sg_cpu->sg_policy; - struct cpufreq_policy *policy = sg_policy->policy; -- unsigned long util = 0, max = 1; -+ unsigned long util = 0, max_cap; - unsigned int j; - -+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); -+ - for_each_cpu(j, policy->cpus) { - struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); -- unsigned long j_util, j_max; - - sugov_get_util(j_sg_cpu); -- sugov_iowait_apply(j_sg_cpu, time); -- j_util = j_sg_cpu->util; -- j_max = j_sg_cpu->max; -+ sugov_iowait_apply(j_sg_cpu, time, max_cap); - -- if (j_util * max > j_max * util) { -- util = j_util; -- max = j_max; -- } -+ util = max(j_sg_cpu->util, util); - } - -- return get_next_freq(sg_policy, util, max); -+ return get_next_freq(sg_policy, util, max_cap); - } - - static void -diff --git a/kernel/sched/deadline.c 
b/kernel/sched/deadline.c -index 0d97d54276cc..71b24371a6f7 100644 ---- a/kernel/sched/deadline.c -+++ b/kernel/sched/deadline.c -@@ -2663,17 +2663,20 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) - static void prio_changed_dl(struct rq *rq, struct task_struct *p, - int oldprio) - { -- if (task_on_rq_queued(p) || task_current(rq, p)) { -+ if (!task_on_rq_queued(p)) -+ return; -+ - #ifdef CONFIG_SMP -- /* -- * This might be too much, but unfortunately -- * we don't have the old deadline value, and -- * we can't argue if the task is increasing -- * or lowering its prio, so... -- */ -- if (!rq->dl.overloaded) -- deadline_queue_pull_task(rq); -+ /* -+ * This might be too much, but unfortunately -+ * we don't have the old deadline value, and -+ * we can't argue if the task is increasing -+ * or lowering its prio, so... -+ */ -+ if (!rq->dl.overloaded) -+ deadline_queue_pull_task(rq); - -+ if (task_current(rq, p)) { - /* - * If we now have a earlier deadline task than p, - * then reschedule, provided p is still on this -@@ -2681,15 +2684,24 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, - */ - if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline)) - resched_curr(rq); --#else -+ } else { - /* -- * Again, we don't know if p has a earlier -- * or later deadline, so let's blindly set a -- * (maybe not needed) rescheduling point. -+ * Current may not be deadline in case p was throttled but we -+ * have just replenished it (e.g. rt_mutex_setprio()). -+ * -+ * Otherwise, if p was given an earlier deadline, reschedule. - */ -- resched_curr(rq); --#endif /* CONFIG_SMP */ -+ if (!dl_task(rq->curr) || -+ dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) -+ resched_curr(rq); - } -+#else -+ /* -+ * We don't know if p has a earlier or later deadline, so let's blindly -+ * set a (maybe not needed) rescheduling point. -+ */ -+ resched_curr(rq); -+#endif - } - - DEFINE_SCHED_CLASS(dl) = { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 1637b65ba07a..8d64fba16cfe 100644 --- a/kernel/sched/debug.c @@ -33900,7 +12638,7 @@ index 1637b65ba07a..8d64fba16cfe 100644 P(se.avg.load_sum); P(se.avg.runnable_sum); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 86a988c830ef..84254f52c56a 100644 +index dcdd8422de72..115be8a965f2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1082,6 +1082,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -33951,311 +12689,7 @@ index 86a988c830ef..84254f52c56a 100644 struct task_numa_env { struct task_struct *p; -@@ -4494,17 +4494,9 @@ static inline int util_fits_cpu(unsigned long util, - * - * For uclamp_max, we can tolerate a drop in performance level as the - * goal is to cap the task. So it's okay if it's getting less. -- * -- * In case of capacity inversion we should honour the inverted capacity -- * for both uclamp_min and uclamp_max all the time. - */ -- capacity_orig = cpu_in_capacity_inversion(cpu); -- if (capacity_orig) { -- capacity_orig_thermal = capacity_orig; -- } else { -- capacity_orig = capacity_orig_of(cpu); -- capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); -- } -+ capacity_orig = capacity_orig_of(cpu); -+ capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); - - /* - * We want to force a task to fit a cpu as implied by uclamp_max. -@@ -4579,8 +4571,8 @@ static inline int util_fits_cpu(unsigned long util, - * handle the case uclamp_min > uclamp_max. 
- */ - uclamp_min = min(uclamp_min, uclamp_max); -- if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE) -- fits = fits && (uclamp_min <= capacity_orig_thermal); -+ if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) -+ return -1; - - return fits; - } -@@ -4590,7 +4582,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu) - unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); - unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); - unsigned long util = task_util_est(p); -- return util_fits_cpu(util, uclamp_min, uclamp_max, cpu); -+ /* -+ * Return true only if the cpu fully fits the task requirements, which -+ * include the utilization but also the performance hints. -+ */ -+ return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0); - } - - static inline void update_misfit_status(struct task_struct *p, struct rq *rq) -@@ -4674,6 +4670,7 @@ static void - place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) - { - u64 vruntime = cfs_rq->min_vruntime; -+ u64 sleep_time; - - /* - * The 'current' period is already promised to the current tasks, -@@ -4703,8 +4700,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) - vruntime -= thresh; - } - -- /* ensure we never gain time by being placed backwards. */ -- se->vruntime = max_vruntime(se->vruntime, vruntime); -+ /* -+ * Pull vruntime of the entity being placed to the base level of -+ * cfs_rq, to prevent boosting it if placed backwards. If the entity -+ * slept for a long time, don't even try to compare its vruntime with -+ * the base as it may be too far off and the comparison may get -+ * inversed due to s64 overflow. -+ */ -+ sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; -+ if ((s64)sleep_time > 60LL * NSEC_PER_SEC) -+ se->vruntime = vruntime; -+ else -+ se->vruntime = max_vruntime(se->vruntime, vruntime); - } - - static void check_enqueue_throttle(struct cfs_rq *cfs_rq); -@@ -4914,7 +4921,13 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) - struct sched_entity *se; - s64 delta; - -- ideal_runtime = sched_slice(cfs_rq, curr); -+ /* -+ * When many tasks blow up the sched_period; it is possible that -+ * sched_slice() reports unusually large results (when many tasks are -+ * very light for example). Therefore impose a maximum. -+ */ -+ ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); -+ - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) { - resched_curr(rq_of(cfs_rq)); -@@ -5479,22 +5492,105 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) - resched_curr(rq); - } - --static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) -+#ifdef CONFIG_SMP -+static void __cfsb_csd_unthrottle(void *arg) - { -- struct cfs_rq *cfs_rq; -+ struct cfs_rq *cursor, *tmp; -+ struct rq *rq = arg; -+ struct rq_flags rf; -+ -+ rq_lock(rq, &rf); -+ -+ /* -+ * Since we hold rq lock we're safe from concurrent manipulation of -+ * the CSD list. However, this RCU critical section annotates the -+ * fact that we pair with sched_free_group_rcu(), so that we cannot -+ * race with group being freed in the window between removing it -+ * from the list and advancing to the next entry in the list. 
-+ */ -+ rcu_read_lock(); -+ -+ list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list, -+ throttled_csd_list) { -+ list_del_init(&cursor->throttled_csd_list); -+ -+ if (cfs_rq_throttled(cursor)) -+ unthrottle_cfs_rq(cursor); -+ } -+ -+ rcu_read_unlock(); -+ -+ rq_unlock(rq, &rf); -+} -+ -+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) -+{ -+ struct rq *rq = rq_of(cfs_rq); -+ bool first; -+ -+ if (rq == this_rq()) { -+ unthrottle_cfs_rq(cfs_rq); -+ return; -+ } -+ -+ /* Already enqueued */ -+ if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) -+ return; -+ -+ first = list_empty(&rq->cfsb_csd_list); -+ list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list); -+ if (first) -+ smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); -+} -+#else -+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) -+{ -+ unthrottle_cfs_rq(cfs_rq); -+} -+#endif -+ -+static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) -+{ -+ lockdep_assert_rq_held(rq_of(cfs_rq)); -+ -+ if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || -+ cfs_rq->runtime_remaining <= 0)) -+ return; -+ -+ __unthrottle_cfs_rq_async(cfs_rq); -+} -+ -+static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) -+{ -+ struct cfs_rq *local_unthrottle = NULL; -+ int this_cpu = smp_processor_id(); - u64 runtime, remaining = 1; -+ bool throttled = false; -+ struct cfs_rq *cfs_rq; -+ struct rq_flags rf; -+ struct rq *rq; - - rcu_read_lock(); - list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, - throttled_list) { -- struct rq *rq = rq_of(cfs_rq); -- struct rq_flags rf; -+ rq = rq_of(cfs_rq); -+ -+ if (!remaining) { -+ throttled = true; -+ break; -+ } - - rq_lock_irqsave(rq, &rf); - if (!cfs_rq_throttled(cfs_rq)) - goto next; - -- /* By the above check, this should never be true */ -+#ifdef CONFIG_SMP -+ /* Already queued for async unthrottle */ -+ if (!list_empty(&cfs_rq->throttled_csd_list)) -+ goto next; -+#endif -+ -+ /* By the above checks, this should never be true */ - SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); - - raw_spin_lock(&cfs_b->lock); -@@ -5508,16 +5604,30 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) - cfs_rq->runtime_remaining += runtime; - - /* we check whether we're throttled above */ -- if (cfs_rq->runtime_remaining > 0) -- unthrottle_cfs_rq(cfs_rq); -+ if (cfs_rq->runtime_remaining > 0) { -+ if (cpu_of(rq) != this_cpu || -+ SCHED_WARN_ON(local_unthrottle)) -+ unthrottle_cfs_rq_async(cfs_rq); -+ else -+ local_unthrottle = cfs_rq; -+ } else { -+ throttled = true; -+ } - - next: - rq_unlock_irqrestore(rq, &rf); -- -- if (!remaining) -- break; - } - rcu_read_unlock(); -+ -+ if (local_unthrottle) { -+ rq = cpu_rq(this_cpu); -+ rq_lock_irqsave(rq, &rf); -+ if (cfs_rq_throttled(local_unthrottle)) -+ unthrottle_cfs_rq(local_unthrottle); -+ rq_unlock_irqrestore(rq, &rf); -+ } -+ -+ return throttled; - } - - /* -@@ -5562,10 +5672,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u - while (throttled && cfs_b->runtime > 0) { - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); - /* we can't nest cfs_b->lock while distributing bandwidth */ -- distribute_cfs_runtime(cfs_b); -+ throttled = distribute_cfs_runtime(cfs_b); - raw_spin_lock_irqsave(&cfs_b->lock, flags); -- -- throttled = !list_empty(&cfs_b->throttled_cfs_rq); - } - - /* -@@ -5842,6 +5950,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) - { - cfs_rq->runtime_enabled = 0; - INIT_LIST_HEAD(&cfs_rq->throttled_list); -+#ifdef CONFIG_SMP -+ 
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); -+#endif - } - - void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -@@ -5858,12 +5969,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) - - static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) - { -+ int __maybe_unused i; -+ - /* init_cfs_bandwidth() was not called */ - if (!cfs_b->throttled_cfs_rq.next) - return; - - hrtimer_cancel(&cfs_b->period_timer); - hrtimer_cancel(&cfs_b->slack_timer); -+ -+ /* -+ * It is possible that we still have some cfs_rq's pending on a CSD -+ * list, though this race is very rare. In order for this to occur, we -+ * must have raced with the last task leaving the group while there -+ * exist throttled cfs_rq(s), and the period_timer must have queued the -+ * CSD item but the remote cpu has not yet processed it. To handle this, -+ * we can simply flush all pending CSD work inline here. We're -+ * guaranteed at this point that no additional cfs_rq of this group can -+ * join a CSD list. -+ */ -+#ifdef CONFIG_SMP -+ for_each_possible_cpu(i) { -+ struct rq *rq = cpu_rq(i); -+ unsigned long flags; -+ -+ if (list_empty(&rq->cfsb_csd_list)) -+ continue; -+ -+ local_irq_save(flags); -+ __cfsb_csd_unthrottle(rq); -+ local_irq_restore(flags); -+ } -+#endif - } - - /* -@@ -6026,6 +6163,7 @@ static inline bool cpu_overutilized(int cpu) - unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); - unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); - -+ /* Return true only if the utilization doesn't fit CPU's capacity */ - return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); - } - -@@ -6159,6 +6297,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6333,6 +6333,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) static void set_next_buddy(struct sched_entity *se); @@ -34274,7 +12708,7 @@ index 86a988c830ef..84254f52c56a 100644 /* * The dequeue_task method is called before nr_running is * decreased. We remove the task from the rbtree and -@@ -6231,6 +6381,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6405,6 +6417,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) dequeue_throttle: util_est_update(&rq->cfs, p, task_sleep); @@ -34282,7 +12716,7 @@ index 86a988c830ef..84254f52c56a 100644 hrtick_update(rq); } -@@ -6364,6 +6515,23 @@ static int wake_wide(struct task_struct *p) +@@ -6538,6 +6551,23 @@ static int wake_wide(struct task_struct *p) return 1; } @@ -34306,7 +12740,7 @@ index 86a988c830ef..84254f52c56a 100644 /* * The purpose of wake_affine() is to quickly determine on which CPU we can run * soonest. 
For the purpose of speed we only consider the waking and previous -@@ -6400,6 +6568,11 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) +@@ -6574,6 +6604,11 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) if (available_idle_cpu(prev_cpu)) return prev_cpu; @@ -34318,7 +12752,7 @@ index 86a988c830ef..84254f52c56a 100644 return nr_cpumask_bits; } -@@ -6774,6 +6947,20 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool +@@ -6948,6 +6983,20 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool /* overloaded LLC is unlikely to have idle cpu/core */ if (nr == 1) return -1; @@ -34339,262 +12773,28 @@ index 86a988c830ef..84254f52c56a 100644 } } -@@ -6819,6 +7006,7 @@ static int - select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) - { - unsigned long task_util, util_min, util_max, best_cap = 0; -+ int fits, best_fits = 0; - int cpu, best_cpu = -1; - struct cpumask *cpus; - -@@ -6834,12 +7022,28 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) - - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) - continue; -- if (util_fits_cpu(task_util, util_min, util_max, cpu)) -+ -+ fits = util_fits_cpu(task_util, util_min, util_max, cpu); -+ -+ /* This CPU fits with all requirements */ -+ if (fits > 0) - return cpu; -+ /* -+ * Only the min performance hint (i.e. uclamp_min) doesn't fit. -+ * Look for the CPU with best capacity. -+ */ -+ else if (fits < 0) -+ cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu)); - -- if (cpu_cap > best_cap) { -+ /* -+ * First, select CPU which fits better (-1 being better than 0). -+ * Then, select the one with best capacity at same level. -+ */ -+ if ((fits < best_fits) || -+ ((fits == best_fits) && (cpu_cap > best_cap))) { - best_cap = cpu_cap; - best_cpu = cpu; -+ best_fits = fits; - } - } - -@@ -6852,7 +7056,11 @@ static inline bool asym_fits_cpu(unsigned long util, - int cpu) - { - if (sched_asym_cpucap_active()) -- return util_fits_cpu(util, util_min, util_max, cpu); -+ /* -+ * Return true only if the cpu fully fits the task requirements -+ * which include the utilization and the performance hints. -+ */ -+ return (util_fits_cpu(util, util_min, util_max, cpu) > 0); - - return true; +@@ -9288,96 +9337,65 @@ group_type group_classify(unsigned int imbalance_pct, } -@@ -7219,6 +7427,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) - unsigned long p_util_max = uclamp_is_used() ? 
uclamp_eff_value(p, UCLAMP_MAX) : 1024; - struct root_domain *rd = this_rq()->rd; - int cpu, best_energy_cpu, target = -1; -+ int prev_fits = -1, best_fits = -1; -+ unsigned long best_thermal_cap = 0; -+ unsigned long prev_thermal_cap = 0; - struct sched_domain *sd; - struct perf_domain *pd; - struct energy_env eenv; -@@ -7254,6 +7465,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) - unsigned long prev_spare_cap = 0; - int max_spare_cap_cpu = -1; - unsigned long base_energy; -+ int fits, max_fits = -1; - cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); - -@@ -7303,7 +7515,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) - util_min = max(rq_util_min, p_util_min); - util_max = max(rq_util_max, p_util_max); - } -- if (!util_fits_cpu(util, util_min, util_max, cpu)) -+ -+ fits = util_fits_cpu(util, util_min, util_max, cpu); -+ if (!fits) - continue; - - lsub_positive(&cpu_cap, util); -@@ -7311,7 +7525,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) - if (cpu == prev_cpu) { - /* Always use prev_cpu as a candidate. */ - prev_spare_cap = cpu_cap; -- } else if (cpu_cap > max_spare_cap) { -+ prev_fits = fits; -+ } else if ((fits > max_fits) || -+ ((fits == max_fits) && (cpu_cap > max_spare_cap))) { - /* - * Find the CPU with the maximum spare capacity - * among the remaining CPUs in the performance -@@ -7319,6 +7535,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) - */ - max_spare_cap = cpu_cap; - max_spare_cap_cpu = cpu; -+ max_fits = fits; - } - } - -@@ -7337,26 +7554,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) - if (prev_delta < base_energy) - goto unlock; - prev_delta -= base_energy; -+ prev_thermal_cap = cpu_thermal_cap; - best_delta = min(best_delta, prev_delta); - } - - /* Evaluate the energy impact of using max_spare_cap_cpu. */ - if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) { -+ /* Current best energy cpu fits better */ -+ if (max_fits < best_fits) -+ continue; -+ -+ /* -+ * Both don't fit performance hint (i.e. uclamp_min) -+ * but best energy cpu has better capacity. -+ */ -+ if ((max_fits < 0) && -+ (cpu_thermal_cap <= best_thermal_cap)) -+ continue; -+ - cur_delta = compute_energy(&eenv, pd, cpus, p, - max_spare_cap_cpu); - /* CPU utilization has changed */ - if (cur_delta < base_energy) - goto unlock; - cur_delta -= base_energy; -- if (cur_delta < best_delta) { -- best_delta = cur_delta; -- best_energy_cpu = max_spare_cap_cpu; -- } -+ -+ /* -+ * Both fit for the task but best energy cpu has lower -+ * energy impact. 
-+ */ -+ if ((max_fits > 0) && (best_fits > 0) && -+ (cur_delta >= best_delta)) -+ continue; -+ -+ best_delta = cur_delta; -+ best_energy_cpu = max_spare_cap_cpu; -+ best_fits = max_fits; -+ best_thermal_cap = cpu_thermal_cap; - } - } - rcu_read_unlock(); - -- if (best_delta < prev_delta) -+ if ((best_fits > prev_fits) || -+ ((best_fits > 0) && (best_delta < prev_delta)) || -+ ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap))) - target = best_energy_cpu; - - return target; -@@ -8856,82 +9097,16 @@ static unsigned long scale_rt_capacity(int cpu) - - static void update_cpu_capacity(struct sched_domain *sd, int cpu) - { -- unsigned long capacity_orig = arch_scale_cpu_capacity(cpu); - unsigned long capacity = scale_rt_capacity(cpu); - struct sched_group *sdg = sd->groups; -- struct rq *rq = cpu_rq(cpu); - -- rq->cpu_capacity_orig = capacity_orig; -+ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); - - if (!capacity) - capacity = 1; - -- rq->cpu_capacity = capacity; -- -- /* -- * Detect if the performance domain is in capacity inversion state. -- * -- * Capacity inversion happens when another perf domain with equal or -- * lower capacity_orig_of() ends up having higher capacity than this -- * domain after subtracting thermal pressure. -- * -- * We only take into account thermal pressure in this detection as it's -- * the only metric that actually results in *real* reduction of -- * capacity due to performance points (OPPs) being dropped/become -- * unreachable due to thermal throttling. -- * -- * We assume: -- * * That all cpus in a perf domain have the same capacity_orig -- * (same uArch). -- * * Thermal pressure will impact all cpus in this perf domain -- * equally. -- */ -- if (sched_energy_enabled()) { -- unsigned long inv_cap = capacity_orig - thermal_load_avg(rq); -- struct perf_domain *pd; -- -- rcu_read_lock(); -- -- pd = rcu_dereference(rq->rd->pd); -- rq->cpu_capacity_inverted = 0; -- -- for (; pd; pd = pd->next) { -- struct cpumask *pd_span = perf_domain_span(pd); -- unsigned long pd_cap_orig, pd_cap; -- -- /* We can't be inverted against our own pd */ -- if (cpumask_test_cpu(cpu_of(rq), pd_span)) -- continue; -- -- cpu = cpumask_any(pd_span); -- pd_cap_orig = arch_scale_cpu_capacity(cpu); -- -- if (capacity_orig < pd_cap_orig) -- continue; -- -- /* -- * handle the case of multiple perf domains have the -- * same capacity_orig but one of them is under higher -- * thermal pressure. We record it as capacity -- * inversion. 
-- */ -- if (capacity_orig == pd_cap_orig) { -- pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu)); -- -- if (pd_cap > inv_cap) { -- rq->cpu_capacity_inverted = inv_cap; -- break; -- } -- } else if (pd_cap_orig > inv_cap) { -- rq->cpu_capacity_inverted = inv_cap; -- break; -- } -- } -- -- rcu_read_unlock(); -- } -- -- trace_sched_cpu_capacity_tp(rq); -+ cpu_rq(cpu)->cpu_capacity = capacity; -+ trace_sched_cpu_capacity_tp(cpu_rq(cpu)); - - sdg->sgc->capacity = capacity; - sdg->sgc->min_capacity = capacity; -@@ -9135,20 +9310,15 @@ group_type group_classify(unsigned int imbalance_pct, - * @sgs: Load-balancing statistics of the candidate busiest group - * @sg: The candidate busiest group - * + /** +- * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks +- * @dst_cpu: Destination CPU of the load balancing +- * @sds: Load-balancing data with statistics of the local group +- * @sgs: Load-balancing statistics of the candidate busiest group +- * @sg: The candidate busiest group +- * - * Check the state of the SMT siblings of both @sds::local and @sg and decide - * if @dst_cpu can pull tasks. -+ * Check the state of the SMT siblings of @sg and decide if @dst_cpu can pull -+ * tasks. ++ * sched_use_asym_prio - Check whether asym_packing priority must be used ++ * @sd: The scheduling domain of the load balancing ++ * @cpu: A CPU * - * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of - * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks - * only if @dst_cpu has higher priority. -+ * This function must be called only if all the SMT siblings of @dst_cpu are -+ * idle, if any. ++ * Always use CPU priority when balancing load between SMT siblings. When ++ * balancing load between cores, it is not sufficient that @cpu is idle. Only ++ * use CPU priority if the whole core is idle. * - * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more - * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority. @@ -34603,24 +12803,24 @@ index 86a988c830ef..84254f52c56a 100644 - * - * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings - * of @dst_cpu are idle and @sg has lower priority. -+ * @dst_cpu can pull tasks if @sg has exactly one busy CPU (i.e., one more than -+ * @sds::local) and has lower group priority than @sds::local. Bigger imbalances -+ * in the number of busy CPUs will be dealt with in find_busiest_group(). - * - * Return: true if @dst_cpu can pull tasks, false otherwise. +- * +- * Return: true if @dst_cpu can pull tasks, false otherwise. ++ * Returns: True if the priority of @cpu must be followed. False otherwise. 
*/ -@@ -9157,51 +9327,16 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, - struct sched_group *sg) +-static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, +- struct sg_lb_stats *sgs, +- struct sched_group *sg) ++static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) { #ifdef CONFIG_SCHED_SMT - bool local_is_smt, sg_is_smt; - int sg_busy_cpus; - +- int sg_busy_cpus; +- - local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; - sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY; - - sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; - +- sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; +- - if (!local_is_smt) { - /* - * If we are here, @dst_cpu is idle and does not have SMT @@ -34652,36 +12852,70 @@ index 86a988c830ef..84254f52c56a 100644 - return false; - } - - /* +- /* - * @sg does not have SMT siblings. Ensure that @sds::local does not end - * up with more than one busy SMT sibling and only pull tasks if there - * are not busy CPUs (i.e., no CPU has running tasks). -+ * If the difference in the number of busy CPUs is two or more, let -+ * find_busiest_group() take care of it. We only care if @sg has -+ * exactly one busy CPU. This covers SMT and non-SMT sched groups. - */ +- */ - if (!sds->local_stat.sum_nr_running) -+ if (sg_busy_cpus == 1) - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); +- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); ++ if (!sched_smt_active()) ++ return true; - return false; -@@ -9215,7 +9350,14 @@ static inline bool +- return false; ++ return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu); + #else +- /* Always return false so that callers deal with non-SMT cases. */ +- return false; ++ return true; + #endif + } + ++/** ++ * sched_asym - Check if the destination CPU can do asym_packing load balance ++ * @env: The load balancing environment ++ * @sds: Load-balancing data with statistics of the local group ++ * @sgs: Load-balancing statistics of the candidate busiest group ++ * @group: The candidate busiest group ++ * ++ * @env::dst_cpu can do asym_packing if it has higher priority than the ++ * preferred CPU of @group. ++ * ++ * SMT is a special case. If we are balancing load between cores, @env::dst_cpu ++ * can do asym_packing balance only if all its SMT siblings are idle. Also, it ++ * can only do it if @group is an SMT group and has exactly on busy CPU. Larger ++ * imbalances in the number of CPUS are dealt with in find_busiest_group(). ++ * ++ * If we are balancing load within an SMT core, or at DIE domain level, always ++ * proceed. ++ * ++ * Return: true if @env::dst_cpu can do with asym_packing load balance. False ++ * otherwise. ++ */ + static inline bool sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, struct sched_group *group) { - /* Only do SMT checks if either local or candidate have SMT siblings */ -+ /* -+ * If the destination CPU has SMT siblings, env->idle != CPU_NOT_IDLE -+ * is not sufficient. We need to make sure the whole core is idle. -+ */ -+ if (sds->local->flags & SD_SHARE_CPUCAPACITY && !is_core_idle(env->dst_cpu)) +- if ((sds->local->flags & SD_SHARE_CPUCAPACITY) || +- (group->flags & SD_SHARE_CPUCAPACITY)) +- return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); ++ /* Ensure that the whole local core is idle, if applicable. */ ++ if (!sched_use_asym_prio(env->sd, env->dst_cpu)) + return false; + -+ /* Only do SMT checks if either local or candidate have SMT siblings. 
*/ - if ((sds->local->flags & SD_SHARE_CPUCAPACITY) || - (group->flags & SD_SHARE_CPUCAPACITY)) - return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); -@@ -9408,10 +9550,22 @@ static bool update_sd_pick_busiest(struct lb_env *env, ++ /* ++ * CPU priorities does not make sense for SMT cores with more than one ++ * busy sibling. ++ */ ++ if (group->flags & SD_SHARE_CPUCAPACITY) { ++ if (sgs->group_weight - sgs->idle_cpus != 1) ++ return false; ++ } + + return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); + } +@@ -9567,10 +9585,22 @@ static bool update_sd_pick_busiest(struct lb_env *env, * contention when accessing shared HW resources. * * XXX for now avg_load is not computed and always 0 so we @@ -34706,7 +12940,7 @@ index 86a988c830ef..84254f52c56a 100644 break; case group_has_spare: -@@ -9886,7 +10040,6 @@ static void update_idle_cpu_scan(struct lb_env *env, +@@ -10045,7 +10075,6 @@ static void update_idle_cpu_scan(struct lb_env *env, static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { @@ -34714,72 +12948,35 @@ index 86a988c830ef..84254f52c56a 100644 struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; -@@ -9927,9 +10080,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd +@@ -10086,8 +10115,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sg = sg->next; } while (sg != env->sd->groups); - /* Tag domain that child domain prefers tasks go to siblings first */ - sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; -- + /* -+ * Tag domain that @env::sd prefers to spread excess tasks among -+ * sibling sched groups. ++ * Indicate that the child domain of the busiest group prefers tasks ++ * go to a child's sibling domains first. NB the flags of a sched group ++ * are those of the child domain. + */ -+ sds->prefer_sibling = env->sd->flags & SD_PREFER_SIBLING; ++ if (sds->busiest) ++ sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING); + if (env->sd->flags & SD_NUMA) - env->fbq_type = fbq_classify_group(&sds->busiest_stat); -@@ -10159,24 +10314,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env) - */ - update_sd_lb_stats(env, &sds); - -- if (sched_energy_enabled()) { -- struct root_domain *rd = env->dst_rq->rd; -- -- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) -- goto out_balanced; -- } -- -- local = &sds.local_stat; -- busiest = &sds.busiest_stat; -- - /* There is no busy sibling group to pull tasks from */ - if (!sds.busiest) - goto out_balanced; - -+ busiest = &sds.busiest_stat; -+ - /* Misfit tasks should be dealt with regardless of the avg load */ - if (busiest->group_type == group_misfit_task) - goto force_balance; - -+ if (sched_energy_enabled()) { -+ struct root_domain *rd = env->dst_rq->rd; -+ -+ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) -+ goto out_balanced; -+ } -+ - /* ASYM feature bypasses nice load balance check */ - if (busiest->group_type == group_asym_packing) - goto force_balance; -@@ -10189,6 +10343,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) - if (busiest->group_type == group_imbalanced) - goto force_balance; - -+ local = &sds.local_stat; - /* - * If the local group is busier than the selected busiest group - * don't try and pull any tasks. 
-@@ -10228,7 +10383,6 @@ static struct sched_group *find_busiest_group(struct lb_env *env) +@@ -10387,7 +10421,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto out_balanced; } - /* Try to move all excess tasks to child's sibling domain */ ++ /* ++ * Try to move all excess tasks to a sibling domain of the busiest ++ * group's child domain. ++ */ if (sds.prefer_sibling && local->group_type == group_has_spare && busiest->sum_nr_running > local->sum_nr_running + 1) goto force_balance; -@@ -10330,11 +10484,20 @@ static struct rq *find_busiest_queue(struct lb_env *env, +@@ -10489,8 +10526,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, nr_running == 1) continue; @@ -34792,72 +12989,49 @@ index 86a988c830ef..84254f52c56a 100644 + * SMT cores with more than one busy sibling. + */ if ((env->sd->flags & SD_ASYM_PACKING) && ++ sched_use_asym_prio(env->sd, i) && sched_asym_prefer(i, env->dst_cpu) && -- nr_running == 1) -- continue; -+ nr_running == 1) { -+ if (env->sd->flags & SD_SHARE_CPUCAPACITY || -+ (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && is_core_idle(i))) -+ continue; -+ } - - switch (env->migration_type) { - case migrate_load: -@@ -10424,8 +10587,20 @@ asym_active_balance(struct lb_env *env) - * lower priority CPUs in order to pack all tasks in the - * highest priority CPUs. + nr_running == 1) + continue; +@@ -10579,12 +10623,19 @@ static inline bool + asym_active_balance(struct lb_env *env) + { + /* +- * ASYM_PACKING needs to force migrate tasks from busy but +- * lower priority CPUs in order to pack all tasks in the +- * highest priority CPUs. ++ * ASYM_PACKING needs to force migrate tasks from busy but lower ++ * priority CPUs in order to pack all tasks in the highest priority ++ * CPUs. When done between cores, do it only if the whole core if the ++ * whole core is idle. ++ * ++ * If @env::src_cpu is an SMT core with busy siblings, let ++ * the lower priority @env::dst_cpu help it. Do not follow ++ * CPU priority. */ -- return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && + return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && - sched_asym_prefer(env->dst_cpu, env->src_cpu); -+ if (env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING)) { -+ /* Always obey priorities between SMT siblings. */ -+ if (env->sd->flags & SD_SHARE_CPUCAPACITY) -+ return sched_asym_prefer(env->dst_cpu, env->src_cpu); -+ -+ /* -+ * A lower priority CPU can help an SMT core with more than one -+ * busy sibling. -+ */ -+ return sched_asym_prefer(env->dst_cpu, env->src_cpu) || -+ !is_core_idle(env->src_cpu); -+ } -+ -+ return false; ++ sched_use_asym_prio(env->sd, env->dst_cpu) && ++ (sched_asym_prefer(env->dst_cpu, env->src_cpu) || ++ !sched_use_asym_prio(env->sd, env->src_cpu)); } static inline bool -@@ -11162,8 +11337,17 @@ static void nohz_balancer_kick(struct rq *rq) +@@ -11318,9 +11369,13 @@ static void nohz_balancer_kick(struct rq *rq) + * When ASYM_PACKING; see if there's a more preferred CPU + * currently idle; in which case, kick the ILB to move tasks + * around. ++ * ++ * When balancing betwen cores, all the SMT siblings of the ++ * preferred CPU must be idle. */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { - if (sched_asym_prefer(i, cpu)) { -- flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; -- goto unlock; -+ /* -+ * Always do ASYM_PACKING balance in the SMT -+ * domain. In upper domains, the core must be -+ * fully idle. 
-+ */ -+ if (sd->flags & SD_SHARE_CPUCAPACITY || -+ (!(sd->flags & SD_SHARE_CPUCAPACITY) && -+ is_core_idle(i))) { -+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; -+ goto unlock; -+ } +- if (sched_asym_prefer(i, cpu)) { ++ if (sched_use_asym_prio(sd, i) && ++ sched_asym_prefer(i, cpu)) { + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; + goto unlock; } - } - } -@@ -12498,6 +12682,11 @@ __init void init_sched_fair_class(void) - for_each_possible_cpu(i) { - zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); - zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); -+ -+#ifdef CONFIG_CFS_BANDWIDTH -+ INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i)); -+ INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list); -+#endif - } - - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index ee7f23c76bd3..efdc29c42161 100644 --- a/kernel/sched/features.h @@ -35025,20 +13199,10 @@ index 3a0e0dc28721..9b35b5072bae 100644 static inline void update_idle_rq_clock_pelt(struct rq *rq) { } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 771f8ddb7053..9e8bb6278604 100644 +index 3e8df6d31c1e..7331d436ebc4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h -@@ -645,6 +645,9 @@ struct cfs_rq { - int throttled; - int throttle_count; - struct list_head throttled_list; -+#ifdef CONFIG_SMP -+ struct list_head throttled_csd_list; -+#endif - #endif /* CONFIG_CFS_BANDWIDTH */ - #endif /* CONFIG_FAIR_GROUP_SCHED */ - }; -@@ -1015,6 +1018,7 @@ struct rq { +@@ -1018,6 +1018,7 @@ struct rq { u64 clock; /* Ensure that all clocks are in the same cache line */ u64 clock_task ____cacheline_aligned; @@ -35046,560 +13210,67 @@ index 771f8ddb7053..9e8bb6278604 100644 u64 clock_pelt; unsigned long lost_idle_time; u64 clock_pelt_idle; -@@ -1041,7 +1045,6 @@ struct rq { +@@ -1772,6 +1773,13 @@ queue_balance_callback(struct rq *rq, + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ + __sd; __sd = __sd->parent) - unsigned long cpu_capacity; - unsigned long cpu_capacity_orig; -- unsigned long cpu_capacity_inverted; - - struct balance_callback *balance_callback; - -@@ -1154,6 +1157,11 @@ struct rq { - - /* Scratch cpumask to be temporarily used under rq_lock */ - cpumask_var_t scratch_mask; ++/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */ ++#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) | ++static const unsigned int SD_SHARED_CHILD_MASK = ++#include ++0; ++#undef SD_FLAG + -+#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP) -+ call_single_data_t cfsb_csd; -+ struct list_head cfsb_csd_list; -+#endif - }; - - #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -2893,24 +2901,6 @@ static inline unsigned long capacity_orig_of(int cpu) - return cpu_rq(cpu)->cpu_capacity_orig; - } - --/* -- * Returns inverted capacity if the CPU is in capacity inversion state. -- * 0 otherwise. -- * -- * Capacity inversion detection only considers thermal impact where actual -- * performance points (OPPs) gets dropped. -- * -- * Capacity inversion state happens when another performance domain that has -- * equal or lower capacity_orig_of() becomes effectively larger than the perf -- * domain this CPU belongs to due to thermal pressure throttling it hard. -- * -- * See comment in update_cpu_capacity(). 
-- */ --static inline unsigned long cpu_in_capacity_inversion(int cpu) --{ -- return cpu_rq(cpu)->cpu_capacity_inverted; --} -- /** - * enum cpu_util_type - CPU utilization type - * @FREQUENCY_UTIL: Utilization used to select frequency --- -2.40.0.rc2 - -From e0cfd01287f19367a61351b05d43cf4471156ffd Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Mon, 6 Feb 2023 09:53:13 +0100 -Subject: [PATCH 14/16] zram - -Signed-off-by: Peter Jung ---- - Documentation/admin-guide/blockdev/zram.rst | 2 + - drivers/block/zram/zram_drv.c | 319 +++++++++++++++++++- - drivers/block/zram/zram_drv.h | 7 + - 3 files changed, 322 insertions(+), 6 deletions(-) - -diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst -index e4551579cb12..a1dd202efca1 100644 ---- a/Documentation/admin-guide/blockdev/zram.rst -+++ b/Documentation/admin-guide/blockdev/zram.rst -@@ -209,6 +209,7 @@ compact WO trigger memory compaction - debug_stat RO this file is used for zram debugging purposes - backing_dev RW set up backend storage for zram to write out - idle WO mark allocated slot as idle -+merge WO trigger merge identical pages - ====================== ====== =============================================== - - -@@ -267,6 +268,7 @@ line of text and contains the following stats separated by whitespace: - pages_compacted the number of pages freed during compaction - huge_pages the number of incompressible pages - huge_pages_since the number of incompressible pages since zram set up -+ pages_merged the number of identical pages merged into single one - ================ ============================================================= - - File /sys/block/zram/bd_stat -diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c -index e290d6d97047..084f8f830bde 100644 ---- a/drivers/block/zram/zram_drv.c -+++ b/drivers/block/zram/zram_drv.c -@@ -33,12 +33,15 @@ - #include - #include - #include -+#include -+#include - - #include "zram_drv.h" - - static DEFINE_IDR(zram_index_idr); - /* idr index must be protected */ - static DEFINE_MUTEX(zram_index_mutex); -+static DEFINE_MUTEX(zram_rbtree_mutex); - - static int zram_major; - static const char *default_compressor = CONFIG_ZRAM_DEF_COMP; -@@ -57,6 +60,16 @@ static void zram_free_page(struct zram *zram, size_t index); - static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset, struct bio *bio); - -+struct zram_rbtree_node { -+ struct rb_node node; -+ unsigned long key; -+ unsigned long cnt; -+}; -+ -+struct zram_hash_node { -+ unsigned long index; -+ struct hlist_node next; -+}; - - static int zram_slot_trylock(struct zram *zram, u32 index) + * highest_flag_domain - Return highest sched_domain containing flag. + * @cpu: The CPU whose highest level of sched domain is to +@@ -1779,16 +1787,25 @@ queue_balance_callback(struct rq *rq, + * @flag: The flag to check for the highest sched_domain + * for the given CPU. + * +- * Returns the highest sched_domain of a CPU which contains the given flag. ++ * Returns the highest sched_domain of a CPU which contains @flag. If @flag has ++ * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag. 
+ */ + static inline struct sched_domain *highest_flag_domain(int cpu, int flag) { -@@ -1140,7 +1153,7 @@ static ssize_t recomp_algorithm_store(struct device *dev, - while (*args) { - args = next_arg(args, ¶m, &val); + struct sched_domain *sd, *hsd = NULL; -- if (!*val) -+ if (!val || !*val) - return -EINVAL; - - if (!strcmp(param, "algo")) { -@@ -1184,6 +1197,30 @@ static ssize_t compact_store(struct device *dev, - return len; - } - -+static int zram_do_scan(struct zram *zram); -+ -+static ssize_t merge_store(struct device *dev, -+ struct device_attribute *attr, const char *buf, size_t len) -+{ -+ struct zram *zram = dev_to_zram(dev); -+ int ret; -+ -+ down_read(&zram->init_lock); -+ if (!init_done(zram)) { -+ up_read(&zram->init_lock); -+ return -EINVAL; -+ } -+ -+ ret = zram_do_scan(zram); -+ if (ret != 0) { -+ up_read(&zram->init_lock); -+ return -ENOMEM; -+ } -+ -+ up_read(&zram->init_lock); -+ return len; -+} -+ - static ssize_t io_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) - { -@@ -1223,7 +1260,7 @@ static ssize_t mm_stat_show(struct device *dev, - max_used = atomic_long_read(&zram->stats.max_used_pages); - - ret = scnprintf(buf, PAGE_SIZE, -- "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n", -+ "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu %8llu\n", - orig_size << PAGE_SHIFT, - (u64)atomic64_read(&zram->stats.compr_data_size), - mem_used << PAGE_SHIFT, -@@ -1232,7 +1269,8 @@ static ssize_t mm_stat_show(struct device *dev, - (u64)atomic64_read(&zram->stats.same_pages), - atomic_long_read(&pool_stats.pages_compacted), - (u64)atomic64_read(&zram->stats.huge_pages), -- (u64)atomic64_read(&zram->stats.huge_pages_since)); -+ (u64)atomic64_read(&zram->stats.huge_pages_since), -+ (u64)atomic64_read(&zram->stats.pages_merged)); - up_read(&zram->init_lock); - - return ret; -@@ -1283,6 +1321,248 @@ static DEVICE_ATTR_RO(bd_stat); - #endif - static DEVICE_ATTR_RO(debug_stat); - -+static bool zram_rbtree_insert(struct rb_root *root, struct zram_rbtree_node *data) -+{ -+ struct rb_node **new = &(root->rb_node), *parent = NULL; -+ struct zram_rbtree_node *this; -+ -+ while (*new) { -+ this = rb_entry(*new, struct zram_rbtree_node, node); -+ parent = *new; -+ if (data->key < this->key) -+ new = &((*new)->rb_left); -+ else if (data->key > this->key) -+ new = &((*new)->rb_right); -+ else -+ return false; -+ } -+ -+ rb_link_node(&data->node, parent, new); -+ rb_insert_color(&data->node, root); -+ return true; -+} -+ -+static struct zram_rbtree_node *zram_rbtree_search(struct rb_root *root, -+ unsigned long key) -+{ -+ struct rb_node *node = root->rb_node; -+ struct zram_rbtree_node *data; -+ -+ while (node) { -+ data = rb_entry(node, struct zram_rbtree_node, node); -+ if (key < data->key) -+ node = node->rb_left; -+ else if (key > data->key) -+ node = node->rb_right; -+ else -+ return data; -+ } -+ -+ return NULL; -+} -+ -+static unsigned long zram_calc_hash(void *src, size_t len) -+{ -+ return xxhash(src, len, 0); -+} -+ -+static int zram_cmp_obj_and_merge(struct zram *zram, struct hlist_head *htable, -+ size_t htable_size, size_t index) -+{ -+ struct zram_rbtree_node *rb_node; -+ struct zram_hash_node *node; -+ unsigned long handle, cur_handle; -+ size_t obj_size; -+ char *src, *buf; -+ unsigned long hash; -+ int ret = 0; -+ -+ handle = zram_get_handle(zram, index); -+ if (!handle) -+ return ret; -+ -+ obj_size = zram_get_obj_size(zram, index); -+ buf = kmalloc(obj_size, GFP_KERNEL); -+ if (!buf) { -+ pr_err("Failed to allocate zs_map_object buffer\n"); -+ 
return -ENOMEM; -+ } -+ -+ src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); -+ memcpy(buf, src, obj_size); -+ zs_unmap_object(zram->mem_pool, handle); -+ hash = zram_calc_hash(buf, obj_size); -+ -+ mutex_lock(&zram_rbtree_mutex); -+ hlist_for_each_entry(node, &htable[hash % htable_size], next) { -+ int cmp; -+ -+ zram_slot_lock(zram, node->index); + for_each_domain(cpu, sd) { +- if (!(sd->flags & flag)) ++ if (sd->flags & flag) { ++ hsd = sd; ++ continue; ++ } + + /* -+ * Page may change as the hash table is being formed, -+ * so the checks below are necessary. ++ * Stop the search if @flag is known to be shared at lower ++ * levels. It will not be found further up. + */ -+ cur_handle = zram_get_handle(zram, node->index); -+ if (handle == cur_handle || -+ obj_size != zram_get_obj_size(zram, node->index)) { -+ zram_slot_unlock(zram, node->index); -+ continue; -+ } -+ -+ src = zs_map_object(zram->mem_pool, cur_handle, ZS_MM_RO); -+ cmp = memcmp(buf, src, obj_size); -+ zs_unmap_object(zram->mem_pool, cur_handle); -+ -+ if (!cmp) { -+ rb_node = zram_rbtree_search(&zram->sph_rbtree, handle); -+ -+ /* -+ * This check is necessary in order not to zs_free an object -+ * that someone already refers to. This situation is possible -+ * when with repeated calls to zram_do_scan(). For example: -+ * -+ * [slot0] [slot1] [slot2] [slot3] [slot4] -+ * [obj0] [obj1] [obj2] [obj3] [obj4] -+ * -+ * Let's imagine that obj2 and obj3 are equal, and we called -+ * zram_do_scan() function: -+ * -+ * [slot0] [slot1] [slot2] [slot3] [slot4] -+ * [obj0] [obj1] [obj2] [obj2] [obj4] -+ * -+ * Now, slot2 and slot3 refers to obj2 zsmalloc object. -+ * Time passed, now slot0 refres to obj0_n, which is equal -+ * to obj2: -+ * -+ * [slot0] [slot1] [slot2] [slot3] [slot4] -+ * [obj0_n] [obj1] [obj2] [obj2] [obj4] -+ * -+ * Now we call zram_do_scan() function again. We get to slot2, -+ * and we understand that obj2 and obj0_n hashes are the same. We -+ * try to zs_free(obj2), but slot3 also already refers to it. -+ * -+ * This is not correct! 
-+ */ -+ if (unlikely(rb_node)) -+ if (rb_node->cnt > 1) { -+ zram_slot_unlock(zram, node->index); -+ continue; -+ } -+ -+ zram_set_handle(zram, index, cur_handle); -+ zs_free(zram->mem_pool, handle); -+ -+ rb_node = zram_rbtree_search(&zram->sph_rbtree, cur_handle); -+ -+ if (!rb_node) { -+ rb_node = kzalloc(sizeof(struct zram_rbtree_node), -+ GFP_KERNEL); -+ if (!rb_node) { -+ pr_err("Failed to allocate rb_node\n"); -+ ret = -ENOMEM; -+ zram_slot_unlock(zram, node->index); -+ mutex_unlock(&zram_rbtree_mutex); -+ goto merged_or_err; -+ } -+ -+ rb_node->key = cur_handle; -+ /* Two slots refers to an zsmalloc object with cur_handle key */ -+ rb_node->cnt = 2; -+ zram_rbtree_insert(&zram->sph_rbtree, rb_node); -+ } else { -+ rb_node->cnt++; -+ } -+ -+ atomic64_inc(&zram->stats.pages_merged); -+ atomic64_sub(obj_size, &zram->stats.compr_data_size); -+ zram_set_flag(zram, index, ZRAM_MERGED); -+ zram_set_flag(zram, node->index, ZRAM_MERGED); -+ -+ zram_slot_unlock(zram, node->index); -+ mutex_unlock(&zram_rbtree_mutex); -+ goto merged_or_err; -+ } -+ -+ zram_slot_unlock(zram, node->index); -+ } -+ -+ mutex_unlock(&zram_rbtree_mutex); -+ -+ node = kmalloc(sizeof(struct zram_hash_node), GFP_KERNEL); -+ if (!node) { -+ ret = -ENOMEM; -+ goto merged_or_err; -+ } -+ -+ node->index = index; -+ hlist_add_head(&node->next, &htable[hash % htable_size]); -+ -+merged_or_err: -+ kfree(buf); -+ return ret; -+} -+ -+static void zram_free_htable_entries(struct hlist_head *htable, -+ size_t htable_size) -+{ -+ struct hlist_node *n; -+ struct zram_hash_node *node; -+ -+ hlist_for_each_entry_safe(node, n, htable, next) { -+ hlist_del(&node->next); -+ kfree(node); -+ } -+} -+ -+static int zram_do_scan(struct zram *zram) -+{ -+ size_t num_pages = zram->disksize >> PAGE_SHIFT; -+ size_t htable_size = num_pages; -+ size_t index; -+ struct hlist_head *htable; -+ int i, ret = 0; -+ -+ htable = vzalloc(htable_size * sizeof(struct hlist_head)); -+ if (!htable) { -+ pr_err("Failed to allocate hash table\n"); -+ return -ENOMEM; -+ } -+ -+ for (i = 0; i < htable_size; i++) -+ INIT_HLIST_HEAD(&htable[i]); -+ -+ for (index = 0; index < num_pages; index++) { -+ zram_slot_lock(zram, index); -+ -+ if (!zram_allocated(zram, index)) { -+ zram_slot_unlock(zram, index); -+ continue; -+ } -+ -+ if (zram_test_flag(zram, index, ZRAM_UNDER_WB) || -+ zram_test_flag(zram, index, ZRAM_WB) || -+ zram_test_flag(zram, index, ZRAM_SAME)) { -+ zram_slot_unlock(zram, index); -+ continue; -+ } -+ -+ /* Ignore pages that have been recompressed */ -+ if (zram_get_priority(zram, index) != 0) -+ continue; -+ -+ ret = zram_cmp_obj_and_merge(zram, htable, htable_size, index); -+ zram_slot_unlock(zram, index); -+ if (ret != 0) -+ goto out; -+ } -+ -+out: -+ zram_free_htable_entries(htable, htable_size); -+ vfree(htable); -+ return ret; -+} -+ - static void zram_meta_free(struct zram *zram, u64 disksize) - { - size_t num_pages = disksize >> PAGE_SHIFT; -@@ -1324,6 +1604,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) - static void zram_free_page(struct zram *zram, size_t index) - { - unsigned long handle; -+ struct zram_rbtree_node *node; ++ if (flag & SD_SHARED_CHILD_MASK) + break; +- hsd = sd; + } - #ifdef CONFIG_ZRAM_MEMORY_TRACKING - zram->table[index].ac_time = 0; -@@ -1361,7 +1642,28 @@ static void zram_free_page(struct zram *zram, size_t index) - if (!handle) - return; - -- zs_free(zram->mem_pool, handle); -+ if (zram_test_flag(zram, index, ZRAM_MERGED)) { -+ zram_clear_flag(zram, index, ZRAM_MERGED); -+ 
mutex_lock(&zram_rbtree_mutex); -+ -+ node = zram_rbtree_search(&zram->sph_rbtree, handle); -+ BUG_ON(!node); -+ -+ node->cnt--; -+ if (node->cnt == 0) { -+ rb_erase(&node->node, &zram->sph_rbtree); -+ mutex_unlock(&zram_rbtree_mutex); -+ -+ zs_free(zram->mem_pool, handle); -+ kfree(node); -+ } else { -+ mutex_unlock(&zram_rbtree_mutex); -+ } -+ -+ atomic64_dec(&zram->stats.pages_merged); -+ } else { -+ zs_free(zram->mem_pool, handle); -+ } - - atomic64_sub(zram_get_obj_size(zram, index), - &zram->stats.compr_data_size); -@@ -1824,7 +2126,7 @@ static ssize_t recompress_store(struct device *dev, - while (*args) { - args = next_arg(args, ¶m, &val); - -- if (!*val) -+ if (!val || !*val) - return -EINVAL; - - if (!strcmp(param, "type")) { -@@ -1909,7 +2211,8 @@ static ssize_t recompress_store(struct device *dev, - if (zram_test_flag(zram, index, ZRAM_WB) || - zram_test_flag(zram, index, ZRAM_UNDER_WB) || - zram_test_flag(zram, index, ZRAM_SAME) || -- zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) -+ zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE) || -+ zram_test_flag(zram, index, ZRAM_MERGED)) - goto next; - - err = zram_recompress(zram, index, page, threshold, -@@ -2295,6 +2598,7 @@ static const struct block_device_operations zram_devops = { - }; - - static DEVICE_ATTR_WO(compact); -+static DEVICE_ATTR_WO(merge); - static DEVICE_ATTR_RW(disksize); - static DEVICE_ATTR_RO(initstate); - static DEVICE_ATTR_WO(reset); -@@ -2335,6 +2639,7 @@ static struct attribute *zram_disk_attrs[] = { - #ifdef CONFIG_ZRAM_WRITEBACK - &dev_attr_bd_stat.attr, - #endif -+ &dev_attr_merge.attr, - &dev_attr_debug_stat.attr, - #ifdef CONFIG_ZRAM_MULTI_COMP - &dev_attr_recomp_algorithm.attr, -@@ -2421,6 +2726,8 @@ static int zram_add(void) - - comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); - -+ zram->sph_rbtree = RB_ROOT; -+ - zram_debugfs_register(zram); - pr_info("Added device: %s\n", zram->disk->disk_name); - return device_id; -diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h -index c5254626f051..2afdbf76a1aa 100644 ---- a/drivers/block/zram/zram_drv.h -+++ b/drivers/block/zram/zram_drv.h -@@ -56,6 +56,7 @@ enum zram_pageflags { - - ZRAM_COMP_PRIORITY_BIT1, /* First bit of comp priority index */ - ZRAM_COMP_PRIORITY_BIT2, /* Second bit of comp priority index */ -+ ZRAM_MERGED, /* page was merged */ - - __NR_ZRAM_PAGEFLAGS, - }; -@@ -87,6 +88,7 @@ struct zram_stats { - atomic_long_t max_used_pages; /* no. of maximum pages stored */ - atomic64_t writestall; /* no. of write slow paths */ - atomic64_t miss_free; /* no. of missed free */ -+ atomic64_t pages_merged; /* no. of pages, which merged into single one */ - #ifdef CONFIG_ZRAM_WRITEBACK - atomic64_t bd_count; /* no. of pages in backing device */ - atomic64_t bd_reads; /* no. 
of reads from backing device */ -@@ -140,5 +142,10 @@ struct zram { - #ifdef CONFIG_ZRAM_MEMORY_TRACKING - struct dentry *debugfs_dir; - #endif -+ /* -+ * This is same pages handle's rb tree, where the key is a handle -+ * to same pages and the value is a link counter -+ */ -+ struct rb_root sph_rbtree; - }; - #endif + return hsd; -- -2.40.0.rc2 +2.40.0 -From 02b507dfef3f09d3de2785ed80164e15c8ed7844 Mon Sep 17 00:00:00 2001 +From 6c867f735d5efe4f7df3cc9cf96dc0928914c438 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Tue, 14 Feb 2023 22:02:09 +0100 -Subject: [PATCH 15/16] zstd import v1.5.4 +Date: Sun, 9 Apr 2023 21:26:12 +0200 +Subject: [PATCH 10/10] zstd Signed-off-by: Peter Jung --- include/linux/zstd.h | 2 +- include/linux/zstd_errors.h | 23 +- - include/linux/zstd_lib.h | 569 +++++-- + include/linux/zstd_lib.h | 703 +++++-- + kernel/module/decompress.c | 2 +- lib/zstd/Makefile | 2 +- - lib/zstd/common/bits.h | 124 ++ - lib/zstd/common/bitstream.h | 51 +- + lib/zstd/common/allocations.h | 56 + + lib/zstd/common/bits.h | 149 ++ + lib/zstd/common/bitstream.h | 53 +- lib/zstd/common/compiler.h | 14 +- lib/zstd/common/cpu.h | 3 +- lib/zstd/common/debug.c | 3 +- @@ -35612,46 +13283,47 @@ Signed-off-by: Peter Jung lib/zstd/common/huf.h | 222 +-- lib/zstd/common/mem.h | 2 +- lib/zstd/common/portability_macros.h | 26 +- - lib/zstd/common/zstd_common.c | 3 +- + lib/zstd/common/zstd_common.c | 38 +- lib/zstd/common/zstd_deps.h | 2 +- - lib/zstd/common/zstd_internal.h | 94 +- + lib/zstd/common/zstd_internal.h | 99 +- lib/zstd/compress/clevels.h | 3 +- lib/zstd/compress/fse_compress.c | 59 +- lib/zstd/compress/hist.c | 3 +- lib/zstd/compress/hist.h | 3 +- lib/zstd/compress/huf_compress.c | 372 ++-- - lib/zstd/compress/zstd_compress.c | 1491 ++++++++++++----- - lib/zstd/compress/zstd_compress_internal.h | 267 +-- + lib/zstd/compress/zstd_compress.c | 1762 ++++++++++++----- + lib/zstd/compress/zstd_compress_internal.h | 333 +++- lib/zstd/compress/zstd_compress_literals.c | 155 +- lib/zstd/compress/zstd_compress_literals.h | 25 +- lib/zstd/compress/zstd_compress_sequences.c | 7 +- lib/zstd/compress/zstd_compress_sequences.h | 3 +- lib/zstd/compress/zstd_compress_superblock.c | 47 +- lib/zstd/compress/zstd_compress_superblock.h | 3 +- - lib/zstd/compress/zstd_cwksp.h | 5 +- + lib/zstd/compress/zstd_cwksp.h | 149 +- lib/zstd/compress/zstd_double_fast.c | 129 +- lib/zstd/compress/zstd_double_fast.h | 6 +- - lib/zstd/compress/zstd_fast.c | 582 +++++-- + lib/zstd/compress/zstd_fast.c | 582 ++++-- lib/zstd/compress/zstd_fast.h | 6 +- - lib/zstd/compress/zstd_lazy.c | 364 ++-- + lib/zstd/compress/zstd_lazy.c | 518 ++--- lib/zstd/compress/zstd_lazy.h | 7 +- lib/zstd/compress/zstd_ldm.c | 11 +- lib/zstd/compress/zstd_ldm.h | 3 +- lib/zstd/compress/zstd_ldm_geartab.h | 3 +- - lib/zstd/compress/zstd_opt.c | 185 +- + lib/zstd/compress/zstd_opt.c | 187 +- lib/zstd/compress/zstd_opt.h | 3 +- - lib/zstd/decompress/huf_decompress.c | 731 ++++---- - lib/zstd/decompress/zstd_ddict.c | 8 +- + lib/zstd/decompress/huf_decompress.c | 731 ++++--- + lib/zstd/decompress/zstd_ddict.c | 9 +- lib/zstd/decompress/zstd_ddict.h | 3 +- - lib/zstd/decompress/zstd_decompress.c | 215 ++- - lib/zstd/decompress/zstd_decompress_block.c | 252 ++- - lib/zstd/decompress/zstd_decompress_block.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 269 ++- + lib/zstd/decompress/zstd_decompress_block.c | 283 ++- + lib/zstd/decompress/zstd_decompress_block.h | 8 +- .../decompress/zstd_decompress_internal.h | 7 +- lib/zstd/decompress_sources.h | 2 
+- - lib/zstd/zstd_common_module.c | 2 +- + lib/zstd/zstd_common_module.c | 5 +- lib/zstd/zstd_compress_module.c | 2 +- lib/zstd/zstd_decompress_module.c | 4 +- - 57 files changed, 4086 insertions(+), 2268 deletions(-) + 59 files changed, 4732 insertions(+), 2612 deletions(-) + create mode 100644 lib/zstd/common/allocations.h create mode 100644 lib/zstd/common/bits.h diff --git a/include/linux/zstd.h b/include/linux/zstd.h @@ -35733,7 +13405,7 @@ index 58b6dd45a969..6d5cf55f0bf3 100644 } ZSTD_ErrorCode; diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h -index 79d55465d5c1..dc7e9605a624 100644 +index 79d55465d5c1..738fe8ea4ead 100644 --- a/include/linux/zstd_lib.h +++ b/include/linux/zstd_lib.h @@ -1,5 +1,6 @@ @@ -35796,11 +13468,21 @@ index 79d55465d5c1..dc7e9605a624 100644 #define ZSTD_VERSION_MAJOR 1 #define ZSTD_VERSION_MINOR 5 -#define ZSTD_VERSION_RELEASE 2 -+#define ZSTD_VERSION_RELEASE 4 ++#define ZSTD_VERSION_RELEASE 5 #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) /*! ZSTD_versionNumber() : -@@ -156,7 +176,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t +@@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionString(void); + ***************************************/ + /*! ZSTD_compress() : + * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, +@@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t * "empty", "unknown" and "error" results to the same return value (0), * while ZSTD_getFrameContentSize() gives them separate return values. * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ @@ -35811,7 +13493,7 @@ index 79d55465d5c1..dc7e9605a624 100644 /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ * `src` should point to the start of a ZSTD frame or skippable frame. -@@ -168,8 +190,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) +@@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) /*====== Helper functions ======*/ @@ -35844,7 +13526,7 @@ index 79d55465d5c1..dc7e9605a624 100644 ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ -@@ -412,6 +456,9 @@ typedef enum { +@@ -412,6 +457,9 @@ typedef enum { * ZSTD_c_validateSequences * ZSTD_c_useBlockSplitter * ZSTD_c_useRowMatchFinder @@ -35854,7 +13536,7 @@ index 79d55465d5c1..dc7e9605a624 100644 * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. * note : never ever use experimentalParam? names directly; * also, the enums values themselves are unstable and can still change. 
-@@ -430,7 +477,11 @@ typedef enum { +@@ -430,7 +478,11 @@ typedef enum { ZSTD_c_experimentalParam12=1009, ZSTD_c_experimentalParam13=1010, ZSTD_c_experimentalParam14=1011, @@ -35867,7 +13549,7 @@ index 79d55465d5c1..dc7e9605a624 100644 } ZSTD_cParameter; typedef struct { -@@ -493,7 +544,7 @@ typedef enum { +@@ -493,7 +545,7 @@ typedef enum { * They will be used to compress next frame. * Resetting session never fails. * - The parameters : changes all parameters back to "default". @@ -35876,7 +13558,17 @@ index 79d55465d5c1..dc7e9605a624 100644 * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) * - Both : similar to resetting the session, followed by resetting parameters. -@@ -543,13 +594,15 @@ typedef enum { +@@ -506,7 +558,8 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data, though it is possible it fails for other reasons. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +@@ -543,13 +596,15 @@ typedef enum { * ZSTD_d_stableOutBuffer * ZSTD_d_forceIgnoreChecksum * ZSTD_d_refMultipleDDicts @@ -35893,7 +13585,7 @@ index 79d55465d5c1..dc7e9605a624 100644 } ZSTD_dParameter; -@@ -728,8 +781,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output +@@ -728,8 +783,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output * This following is a legacy streaming API, available since v1.0+ . * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). * It is redundant, but remains fully supported. @@ -35902,7 +13594,7 @@ index 79d55465d5c1..dc7e9605a624 100644 ******************************************************************************/ /*! -@@ -738,6 +789,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output +@@ -738,6 +791,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); @@ -35912,7 +13604,7 @@ index 79d55465d5c1..dc7e9605a624 100644 */ ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); /*! 
-@@ -788,13 +842,31 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer +@@ -788,13 +844,31 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer /*===== Streaming decompression functions =====*/ @@ -35945,7 +13637,7 @@ index 79d55465d5c1..dc7e9605a624 100644 ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ -@@ -913,7 +985,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); +@@ -913,7 +987,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); * If @return == 0, the dictID could not be decoded. * This could for one of the following reasons : * - The frame does not require a dictionary to be decoded (most common case). @@ -35954,7 +13646,22 @@ index 79d55465d5c1..dc7e9605a624 100644 * Note : this use case also happens when using a non-conformant dictionary. * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). * - This is not a Zstandard frame. -@@ -937,8 +1009,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); +@@ -925,9 +999,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Advanced dictionary and prefix API (Requires v1.4.0+) + * + * This API allows dictionaries to be used with ZSTD_compress2(), +- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and +- * only reset with the context is reset with ZSTD_reset_parameters or +- * ZSTD_reset_session_and_parameters. Prefixes are single-use. ++ * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). ++ * Dictionaries are sticky, they remain valid when same context is re-used, ++ * they only reset when the context is reset ++ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. ++ * In contrast, Prefixes are single-use. + ******************************************************************************/ + + +@@ -937,8 +1013,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); * @result : 0, or an error code (which can be tested with ZSTD_isError()). * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, * meaning "return to no-dictionary mode". @@ -35966,7 +13673,16 @@ index 79d55465d5c1..dc7e9605a624 100644 * Note 2 : Loading a dictionary involves building tables. * It's also a CPU consuming operation, with non-negligible impact on latency. * Tables are dependent on compression parameters, and for this reason, -@@ -951,7 +1024,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); +@@ -947,11 +1024,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() +- * to precisely select how dictionary content must be interpreted. */ ++ * to precisely select how dictionary content must be interpreted. ++ * Note 5 : This method does not benefit from LDM (long distance mode). ++ * If you want to employ LDM on some large dictionary content, ++ * prefer employing ZSTD_CCtx_refPrefix() described below. 
++ */ ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ @@ -35975,7 +13691,15 @@ index 79d55465d5c1..dc7e9605a624 100644 * Note that compression parameters are enforced from within CDict, * and supersede any compression parameter previously set within CCtx. * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. -@@ -986,9 +1059,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, +@@ -970,6 +1051,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). ++ * This method is compatible with LDM (long distance mode). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. +@@ -986,9 +1068,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize); /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ @@ -35988,7 +13712,7 @@ index 79d55465d5c1..dc7e9605a624 100644 * @result : 0, or an error code (which can be tested with ZSTD_isError()). * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, * meaning "return to no-dictionary mode". -@@ -1012,9 +1085,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s +@@ -1012,9 +1094,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s * The memory for the table is allocated on the first call to refDDict, and can be * freed with ZSTD_freeDCtx(). * @@ -36001,7 +13725,7 @@ index 79d55465d5c1..dc7e9605a624 100644 * Special: referencing a NULL DDict means "return to no-dictionary mode". * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. */ -@@ -1071,24 +1145,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); +@@ -1071,24 +1154,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE #endif @@ -36026,7 +13750,7 @@ index 79d55465d5c1..dc7e9605a624 100644 /* ************************************************************************************** * experimental API (static linking only) **************************************************************************************** -@@ -1123,6 +1179,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); +@@ -1123,6 +1188,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ #define ZSTD_STRATEGY_MIN ZSTD_fast #define ZSTD_STRATEGY_MAX ZSTD_btultra2 @@ -36034,10 +13758,43 @@ index 79d55465d5c1..dc7e9605a624 100644 #define ZSTD_OVERLAPLOG_MIN 0 -@@ -1350,29 +1407,85 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size +@@ -1303,7 +1369,7 @@ typedef enum { + } ZSTD_paramSwitch_e; + + /* ************************************* +-* Frame size functions ++* Frame header and size functions + ***************************************/ + + /*! 
ZSTD_findDecompressedSize() : +@@ -1350,29 +1416,109 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size * or an error code (if srcSize is too small) */ ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); ++typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; ++typedef struct { ++ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ ++ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ ++ unsigned blockSizeMax; ++ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ ++ unsigned headerSize; ++ unsigned dictID; ++ unsigned checksumFlag; ++ unsigned _reserved1; ++ unsigned _reserved2; ++} ZSTD_frameHeader; ++ ++/*! ZSTD_getFrameHeader() : ++ * decode Frame Header, or requires larger `srcSize`. ++ * @return : 0, `zfhPtr` is correctly filled, ++ * >0, `srcSize` is too small, value is wanted `srcSize` amount, ++ * or an error code, which can be tested using ZSTD_isError() */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ ++/*! ZSTD_getFrameHeader_advanced() : ++ * same as ZSTD_getFrameHeader(), ++ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); ++ +/*! ZSTD_decompressionMargin() : + * Zstd supports in-place decompression, where the input and output buffers overlap. + * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, @@ -36125,7 +13882,7 @@ index 79d55465d5c1..dc7e9605a624 100644 /*! ZSTD_mergeBlockDelimiters() : * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals -@@ -1388,7 +1501,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o +@@ -1388,7 +1534,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); /*! ZSTD_compressSequences() : @@ -36136,7 +13893,7 @@ index 79d55465d5c1..dc7e9605a624 100644 * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) * The entire source is compressed into a single frame. * -@@ -1413,11 +1528,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si +@@ -1413,11 +1561,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, * and cannot emit an RLE block that disagrees with the repcode history @@ -36153,7 +13910,7 @@ index 79d55465d5c1..dc7e9605a624 100644 /*! ZSTD_writeSkippableFrame() : -@@ -1481,8 +1597,11 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); +@@ -1481,8 +1630,11 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). 
* Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. * @@ -36166,7 +13923,7 @@ index 79d55465d5c1..dc7e9605a624 100644 */ ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); -@@ -1501,7 +1620,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); +@@ -1501,7 +1653,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), * an internal ?Dict will be created, which additional size is not estimated here. @@ -36180,16 +13937,30 @@ index 79d55465d5c1..dc7e9605a624 100644 ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); -@@ -1649,22 +1773,31 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); +@@ -1649,22 +1806,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); * This function never fails (wide contract) */ ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); +/*! ZSTD_CCtx_setCParams() : -+ * Set all parameters provided within @cparams into the working @cctx. ++ * Set all parameters provided within @p cparams into the working @p cctx. + * Note : if modifying parameters during compression (MT mode only), + * note that changes to the .windowLog parameter will be ignored. -+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */ ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ * On failure, no parameters are updated. ++ */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); ++ ++/*! ZSTD_CCtx_setFParams() : ++ * Set all parameters provided within @p fparams into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); ++ ++/*! ZSTD_CCtx_setParams() : ++ * Set all parameters provided within @p params into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); + /*! ZSTD_compress_advanced() : * Note : this function is now DEPRECATED. @@ -36216,7 +13987,7 @@ index 79d55465d5c1..dc7e9605a624 100644 size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, -@@ -1808,13 +1941,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo +@@ -1808,13 +1988,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo * Experimental parameter. * Default is 0 == disabled. Set to 1 to enable. 
* @@ -36240,7 +14011,7 @@ index 79d55465d5c1..dc7e9605a624 100644 * * When this flag is enabled zstd won't allocate an input window buffer, * because the user guarantees it can reference the ZSTD_inBuffer until -@@ -1822,18 +1958,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo +@@ -1822,18 +2005,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also * avoid the memcpy() from the input buffer to the input window buffer. * @@ -36264,7 +14035,7 @@ index 79d55465d5c1..dc7e9605a624 100644 */ #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 -@@ -1878,7 +2011,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo +@@ -1878,7 +2058,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo * Without validation, providing a sequence that does not conform to the zstd spec will cause * undefined behavior, and may produce a corrupted block. * @@ -36273,7 +14044,7 @@ index 79d55465d5c1..dc7e9605a624 100644 * specifics regarding offset/matchlength requirements) then the function will bail out and * return an error. * -@@ -1928,6 +2061,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo +@@ -1928,6 +2108,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo */ #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 @@ -36321,7 +14092,7 @@ index 79d55465d5c1..dc7e9605a624 100644 + * This parameter can be used to set an upper bound on the blocksize + * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper + * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make -+ * compressBound() innacurate). Only currently meant to be used for testing. ++ * compressBound() inaccurate). Only currently meant to be used for testing. + * + */ +#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 @@ -36353,7 +14124,7 @@ index 79d55465d5c1..dc7e9605a624 100644 /*! ZSTD_CCtx_getParameter() : * Get the requested compression parameter value, selected by enum ZSTD_cParameter, * and store it into int* value. -@@ -2084,7 +2290,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete +@@ -2084,7 +2337,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete * in the range [dst, dst + pos) MUST not be modified during decompression * or you will get data corruption. * @@ -36362,7 +14133,7 @@ index 79d55465d5c1..dc7e9605a624 100644 * it can write directly to the ZSTD_outBuffer, but it will still allocate * an input buffer large enough to fit any compressed block. This will also * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. -@@ -2137,6 +2343,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete +@@ -2137,6 +2390,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete */ #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 @@ -36380,7 +14151,7 @@ index 79d55465d5c1..dc7e9605a624 100644 /*! ZSTD_DCtx_setFormat() : * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). -@@ -2145,6 +2362,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete +@@ -2145,6 +2409,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete * such ZSTD_f_zstd1_magicless for example. 
* @return : 0, or an error code (which can be tested using ZSTD_isError()). */ ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") @@ -36388,7 +14159,7 @@ index 79d55465d5c1..dc7e9605a624 100644 size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); /*! ZSTD_decompressStream_simpleArgs() : -@@ -2181,6 +2399,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( +@@ -2181,6 +2446,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") @@ -36396,7 +14167,7 @@ index 79d55465d5c1..dc7e9605a624 100644 size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize); -@@ -2198,6 +2417,7 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, +@@ -2198,17 +2464,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") @@ -36404,7 +14175,20 @@ index 79d55465d5c1..dc7e9605a624 100644 size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); -@@ -2218,6 +2438,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + + /*! ZSTD_initCStream_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd parameter and leave the rest as-is. +- * for ((param, value) : params) { +- * ZSTD_CCtx_setParameter(zcs, param, value); +- * } ++ * ZSTD_CCtx_setParams(zcs, params); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * +@@ -2218,6 +2482,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") @@ -36412,7 +14196,7 @@ index 79d55465d5c1..dc7e9605a624 100644 size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize, ZSTD_parameters params, -@@ -2232,6 +2453,7 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, +@@ -2232,15 +2497,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") @@ -36420,7 +14204,18 @@ index 79d55465d5c1..dc7e9605a624 100644 size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); /*! ZSTD_initCStream_usingCDict_advanced() : -@@ -2250,6 +2472,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. +- * for ((fParam, value) : fParams) { +- * ZSTD_CCtx_setParameter(zcs, fParam, value); +- * } ++ * ZSTD_CCtx_setFParams(zcs, fParams); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * +@@ -2250,6 +2513,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); * This prototype will generate compilation warnings. 
*/ ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") @@ -36428,7 +14223,7 @@ index 79d55465d5c1..dc7e9605a624 100644 size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams, -@@ -2274,6 +2497,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, +@@ -2274,6 +2538,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") @@ -36436,7 +14231,7 @@ index 79d55465d5c1..dc7e9605a624 100644 size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); -@@ -2319,8 +2543,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); +@@ -2319,8 +2584,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); * * note: no dictionary will be used if dict == NULL or dictSize < 8 @@ -36446,7 +14241,7 @@ index 79d55465d5c1..dc7e9605a624 100644 ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); /*! -@@ -2330,20 +2554,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo +@@ -2330,27 +2595,185 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo * ZSTD_DCtx_refDDict(zds, ddict); * * note : ddict is referenced, it must outlive decompression session @@ -36464,98 +14259,7 @@ index 79d55465d5c1..dc7e9605a624 100644 - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x - */ -ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); -- - - /* ******************************************************************* - * Buffer-less and synchronous inner streaming functions -@@ -2362,7 +2576,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); - - Start by initializing a context. - Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. -- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() - - Then, consume your input using ZSTD_compressContinue(). - There are some important considerations to keep in mind when using this advanced function : -@@ -2387,15 +2600,20 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); - ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); - ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); - ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ --ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + -+ZSTD_DEPRECATED("This function will likely be removed in a future release. 
It is misleading and has very limited utility.") -+ZSTDLIB_STATIC_API -+size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ - - ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); - ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); - - /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ - ZSTD_DEPRECATED("use advanced API to access custom parameters") -+ZSTDLIB_STATIC_API - size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ - ZSTD_DEPRECATED("use advanced API to access custom parameters") -+ZSTDLIB_STATIC_API - size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ - /* - Buffer-less streaming decompression (synchronous mode) -@@ -2408,8 +2626,8 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ - Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. - Data fragment must be large enough to ensure successful decoding. - `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. -- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. -- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. -+ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. -+ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. - errorCode, which can be tested using ZSTD_isError(). - - It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, -@@ -2428,7 +2646,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ - - The most memory efficient way is to use a round buffer of sufficient size. - Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), -- which can @return an error code if required value is too large for current system (in 32-bits mode). -+ which can return an error code if required value is too large for current system (in 32-bits mode). - In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, - up to the moment there is not enough room left in the buffer to guarantee decoding another full block, - which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. -@@ -2448,7 +2666,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ - ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). - ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. - -- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). 
-+ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). - It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. - It can also be an error code, which can be tested with ZSTD_isError(). - -@@ -2480,6 +2698,8 @@ typedef struct { - unsigned headerSize; - unsigned dictID; - unsigned checksumFlag; -+ unsigned _reserved1; -+ unsigned _reserved2; - } ZSTD_frameHeader; - - /*! ZSTD_getFrameHeader() : -@@ -2502,6 +2722,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); - ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); - - /* misc */ -+ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") - ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); - typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; - ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); -@@ -2524,7 +2745,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); - - It is necessary to init context before starting - + compression : any ZSTD_compressBegin*() variant, including with dictionary - + decompression : any ZSTD_decompressBegin*() variant, including with dictionary -- + copyCCtx() and copyDCtx() can be used too - - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB - + If input is larger than a block size, it's necessary to split input data into multiple blocks - + For inputs larger than a single block, consider using regular ZSTD_compress() instead. -@@ -2547,5 +2767,166 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_ - ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ - - +/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* + * + * *** OVERVIEW *** @@ -36716,9 +14420,194 @@ index 79d55465d5c1..dc7e9605a624 100644 + void* sequenceProducerState, + ZSTD_sequenceProducer_F* sequenceProducer +); + + + /* ******************************************************************* +-* Buffer-less and synchronous inner streaming functions ++* Buffer-less and synchronous inner streaming functions (DEPRECATED) + * +-* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +-* But it's also a complex one, with several restrictions, documented below. +-* Prefer normal streaming API for an easier experience. ++* This API is deprecated, and will be removed in a future version. ++* It allows streaming (de)compression with user allocated buffers. ++* However, it is hard to use, and not as well tested as the rest of ++* our API. ++* ++* Please use the normal streaming API instead: ZSTD_compressStream2, ++* and ZSTD_decompressStream. ++* If there is functionality that you need, but it doesn't provide, ++* please open an issue on our GitHub. + ********************************************************************* */ + + /* +@@ -2362,7 +2785,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + Start by initializing a context. 
+ Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. +- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : +@@ -2384,18 +2806,28 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + */ + + /*===== Buffer-less streaming compression functions =====*/ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + ++ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") ++ZSTDLIB_STATIC_API ++size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ + /* + Buffer-less streaming decompression (synchronous mode) +@@ -2408,8 +2840,8 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. 
+- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. +- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. ++ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. ++ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, +@@ -2428,7 +2860,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), +- which can @return an error code if required value is too large for current system (in 32-bits mode). ++ which can return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. +@@ -2448,7 +2880,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + +- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). ++ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + +@@ -2471,27 +2903,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + */ + + /*===== Buffer-less streaming decompression functions =====*/ +-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +-typedef struct { +- unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ +- unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ +- unsigned blockSizeMax; +- ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ +- unsigned headerSize; +- unsigned dictID; +- unsigned checksumFlag; +-} ZSTD_frameHeader; + +-/*! ZSTD_getFrameHeader() : +- * decode Frame Header, or requires larger `srcSize`. +- * @return : 0, `zfhPtr` is correctly filled, +- * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ +-/*! 
ZSTD_getFrameHeader_advanced() : +- * same as ZSTD_getFrameHeader(), +- * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + + ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +@@ -2502,6 +2914,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); + ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* misc */ ++ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") + ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); + typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; + ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +@@ -2509,11 +2922,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + +-/* ============================ */ +-/* Block level API */ +-/* ============================ */ ++/* ========================================= */ ++/* Block level API (DEPRECATED) */ ++/* ========================================= */ + + /*! ++ ++ This API is deprecated in favor of the regular compression API. ++ You can get the frame header down to 2 bytes by setting: ++ - ZSTD_c_format = ZSTD_f_zstd1_magicless ++ - ZSTD_c_contentSizeFlag = 0 ++ - ZSTD_c_checksumFlag = 0 ++ - ZSTD_c_dictIDFlag = 0 ++ ++ This API is not as well tested as our normal API, so we recommend not using it. ++ We will be removing it in a future version. If the normal API doesn't provide ++ the functionality you need, please open a GitHub issue. ++ + Block functions produce and decode raw zstd blocks, without frame metadata. + Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. +@@ -2524,7 +2949,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary +- + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. +@@ -2541,11 +2965,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + */ + + /*===== Raw zstd block functions =====*/ ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. 
See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + +- #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ +diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c +index bb79ac1a6d8f..7ddc87bee274 100644 +--- a/kernel/module/decompress.c ++++ b/kernel/module/decompress.c +@@ -267,7 +267,7 @@ static ssize_t module_zstd_decompress(struct load_info *info, + zstd_dec.size = PAGE_SIZE; + + ret = zstd_decompress_stream(dstream, &zstd_dec, &zstd_buf); +- kunmap(page); ++ kunmap_local(zstd_dec.dst); + retval = zstd_get_error_code(ret); + if (retval) + break; diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile index 20f08c644b71..464c410b2768 100644 --- a/lib/zstd/Makefile @@ -36731,12 +14620,74 @@ index 20f08c644b71..464c410b2768 100644 # All rights reserved. # # This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h +new file mode 100644 +index 000000000000..05adbbeccaa9 +--- /dev/null ++++ b/lib/zstd/common/allocations.h +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */ ++ ++/* This file provides custom allocation primitives ++ */ ++ ++#define ZSTD_DEPS_NEED_MALLOC ++#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ ++ ++#include "mem.h" /* MEM_STATIC */ ++#define ZSTD_STATIC_LINKING_ONLY ++#include /* ZSTD_customMem */ ++ ++#ifndef ZSTD_ALLOCATIONS_H ++#define ZSTD_ALLOCATIONS_H ++ ++/* custom memory allocation functions */ ++ ++MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) ++ return customMem.customAlloc(customMem.opaque, size); ++ return ZSTD_malloc(size); ++} ++ ++MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) { ++ /* calloc implemented as malloc+memset; ++ * not as efficient as calloc, but next best guess for custom malloc */ ++ void* const ptr = customMem.customAlloc(customMem.opaque, size); ++ ZSTD_memset(ptr, 0, size); ++ return ptr; ++ } ++ return ZSTD_calloc(1, size); ++} ++ ++MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) ++{ ++ if (ptr!=NULL) { ++ if (customMem.customFree) ++ customMem.customFree(customMem.opaque, ptr); ++ else ++ ZSTD_free(ptr); ++ } ++} ++ ++#endif /* ZSTD_ALLOCATIONS_H */ diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h new file mode 100644 -index 000000000000..bb7967def569 +index 000000000000..aa3487ec4b6a --- /dev/null +++ b/lib/zstd/common/bits.h -@@ -0,0 +1,124 @@ +@@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. @@ -36757,7 +14708,7 @@ index 000000000000..bb7967def569 +{ + assert(val != 0); + { -+ static const int DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, ++ static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, + 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, + 26, 12, 18, 6, 11, 5, 10, 9}; @@ -36860,9 +14811,34 @@ index 000000000000..bb7967def569 + return 31 - ZSTD_countLeadingZeros32(val); +} + ++/* ZSTD_rotateRight_*(): ++ * Rotates a bitfield to the right by "count" bits. ++ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts ++ */ ++MEM_STATIC ++U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { ++ assert(count < 64); ++ count &= 0x3F; /* for fickle pattern recognition */ ++ return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); ++} ++ ++MEM_STATIC ++U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { ++ assert(count < 32); ++ count &= 0x1F; /* for fickle pattern recognition */ ++ return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); ++} ++ ++MEM_STATIC ++U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { ++ assert(count < 16); ++ count &= 0x0F; /* for fickle pattern recognition */ ++ return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++} ++ +#endif /* ZSTD_BITS_H */ diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h -index feef3a1b1d60..83a180c65faf 100644 +index feef3a1b1d60..444dc4f85c64 100644 --- a/lib/zstd/common/bitstream.h +++ b/lib/zstd/common/bitstream.h @@ -1,7 +1,8 @@ @@ -36979,6 +14955,15 @@ index feef3a1b1d60..83a180c65faf 100644 MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) { size_t const value = BIT_lookBitsFast(bitD, nbBits); +@@ -408,7 +383,7 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) + * This function is safe, it guarantees it will not read beyond src buffer. 
+ * @return : status of `BIT_DStream_t` internal register. + * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ +-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) ++MEM_STATIC FORCE_INLINE_ATTR BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) + { + if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ + return BIT_DStream_overflow; diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h index c42d39faf9bd..c437e0975575 100644 --- a/lib/zstd/common/compiler.h @@ -37933,7 +15918,7 @@ index 0e3b2c0a527d..7ede8cf1ffe5 100644 + #endif /* ZSTD_PORTABILITY_MACROS_H */ diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c -index 3d7e35b309b5..5a9abca10944 100644 +index 3d7e35b309b5..44b95b25344a 100644 --- a/lib/zstd/common/zstd_common.c +++ b/lib/zstd/common/zstd_common.c @@ -1,5 +1,6 @@ @@ -37944,8 +15929,54 @@ index 3d7e35b309b5..5a9abca10944 100644 * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,6 @@ + * Dependencies + ***************************************/ + #define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ + #include "error_private.h" + #include "zstd_internal.h" + +@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); } + /*! ZSTD_getErrorString() : + * provides error code string from enum */ + const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); } +- +- +- +-/*=************************************************************** +-* Custom allocator +-****************************************************************/ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) +- return customMem.customAlloc(customMem.opaque, size); +- return ZSTD_malloc(size); +-} +- +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) { +- /* calloc implemented as malloc+memset; +- * not as efficient as calloc, but next best guess for custom malloc */ +- void* const ptr = customMem.customAlloc(customMem.opaque, size); +- ZSTD_memset(ptr, 0, size); +- return ptr; +- } +- return ZSTD_calloc(1, size); +-} +- +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) +-{ +- if (ptr!=NULL) { +- if (customMem.customFree) +- customMem.customFree(customMem.opaque, ptr); +- else +- ZSTD_free(ptr); +- } +-} diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h -index 7a5bf44839c9..925161416033 100644 +index f06df065dec0..670c5fa2a952 100644 --- a/lib/zstd/common/zstd_deps.h +++ b/lib/zstd/common/zstd_deps.h @@ -1,6 +1,6 @@ @@ -37957,7 +15988,7 @@ index 7a5bf44839c9..925161416033 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h -index 93305d9b41bb..170cd1db4819 100644 +index 93305d9b41bb..7f023e4d4774 100644 --- a/lib/zstd/common/zstd_internal.h +++ b/lib/zstd/common/zstd_internal.h @@ -1,5 +1,6 @@ @@ -38067,7 +16098,7 @@ index 93305d9b41bb..170cd1db4819 100644 } } return seqLen; -@@ -337,12 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore +@@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` */ typedef 
struct { @@ -38078,14 +16109,13 @@ index 93305d9b41bb..170cd1db4819 100644 const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ -void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ -+int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ - - /* custom memory allocation functions */ - void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); -@@ -350,61 +347,6 @@ void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); - void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); - - +- +-/* custom memory allocation functions */ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); +- +- -MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ -{ - assert(val != 0); @@ -38139,11 +16169,10 @@ index 93305d9b41bb..170cd1db4819 100644 -# endif - } -} -- -- ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ + + /* ZSTD_invalidateRepCodes() : - * ensures next compression will not use repcodes from previous block. - * Note : only works with regular variant; diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h index d9a76112ec3a..6ab8be6532ef 100644 --- a/lib/zstd/compress/clevels.h @@ -39061,7 +17090,7 @@ index 74ef0db47621..83241abafe35 100644 } - diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c -index f620cafca633..81b8cd119cd8 100644 +index f620cafca633..c1c316e9e289 100644 --- a/lib/zstd/compress/zstd_compress.c +++ b/lib/zstd/compress/zstd_compress.c @@ -1,5 +1,6 @@ @@ -39072,7 +17101,13 @@ index f620cafca633..81b8cd119cd8 100644 * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the -@@ -16,7 +17,6 @@ +@@ -11,12 +12,12 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ + #include "../common/mem.h" #include "hist.h" /* HIST_countFast_wksp */ #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ #include "../common/fse.h" @@ -39080,15 +17115,15 @@ index f620cafca633..81b8cd119cd8 100644 #include "../common/huf.h" #include "zstd_compress_internal.h" #include "zstd_compress_sequences.h" -@@ -27,6 +27,7 @@ +@@ -27,6 +28,7 @@ #include "zstd_opt.h" #include "zstd_ldm.h" #include "zstd_compress_superblock.h" -+#include "../common/bits.h" /* ZSTD_highbit32 */ ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ /* *************************************************************** * Tuning parameters -@@ -55,14 +56,17 @@ +@@ -55,14 +57,17 @@ * Helper functions ***************************************/ /* ZSTD_compressBound() @@ -39112,7 +17147,7 @@ index f620cafca633..81b8cd119cd8 100644 } -@@ -171,12 +175,9 @@ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) +@@ -171,12 +176,9 @@ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) if (cctx==NULL) return 0; /* support free on NULL */ RETURN_ERROR_IF(cctx->staticSize, memory_allocation, "not compatible with static CCtx"); @@ -39127,7 +17162,7 @@ index f620cafca633..81b8cd119cd8 100644 } return 0; } -@@ -257,9 +258,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, +@@ -257,9 +259,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); } @@ -39139,7 +17174,7 @@ index f620cafca633..81b8cd119cd8 100644 */ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, const ZSTD_compressionParameters* const cParams) { -@@ -267,6 +268,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, +@@ -267,6 +269,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? 
ZSTD_ps_enable : ZSTD_ps_disable; } @@ -39174,7 +17209,7 @@ index f620cafca633..81b8cd119cd8 100644 static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( ZSTD_compressionParameters cParams) { -@@ -284,6 +313,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( +@@ -284,6 +314,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( } cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); @@ -39185,7 +17220,7 @@ index f620cafca633..81b8cd119cd8 100644 assert(!ZSTD_checkCParams(cParams)); return cctxParams; } -@@ -329,10 +362,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) +@@ -329,10 +363,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) #define ZSTD_NO_CLEVEL 0 /* @@ -39201,7 +17236,7 @@ index f620cafca633..81b8cd119cd8 100644 { assert(!ZSTD_checkCParams(params->cParams)); ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); -@@ -345,6 +381,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par +@@ -345,6 +382,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); @@ -39211,7 +17246,7 @@ index f620cafca633..81b8cd119cd8 100644 DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); } -@@ -359,7 +398,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete +@@ -359,7 +399,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete /* * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. 
@@ -39220,7 +17255,7 @@ index f620cafca633..81b8cd119cd8 100644 */ static void ZSTD_CCtxParams_setZstdParams( ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) -@@ -455,8 +494,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) +@@ -455,8 +495,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) return bounds; case ZSTD_c_enableLongDistanceMatching: @@ -39231,7 +17266,7 @@ index f620cafca633..81b8cd119cd8 100644 return bounds; case ZSTD_c_ldmHashLog: -@@ -549,6 +588,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) +@@ -549,6 +589,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) bounds.upperBound = 1; return bounds; @@ -39258,7 +17293,7 @@ index f620cafca633..81b8cd119cd8 100644 default: bounds.error = ERROR(parameter_unsupported); return bounds; -@@ -613,6 +672,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) +@@ -613,6 +673,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) case ZSTD_c_useBlockSplitter: case ZSTD_c_useRowMatchFinder: case ZSTD_c_deterministicRefPrefix: @@ -39269,7 +17304,7 @@ index f620cafca633..81b8cd119cd8 100644 default: return 0; } -@@ -625,7 +688,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) +@@ -625,7 +689,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) if (ZSTD_isUpdateAuthorized(param)) { cctx->cParamsChanged = 1; } else { @@ -39278,7 +17313,7 @@ index f620cafca633..81b8cd119cd8 100644 } } switch(param) -@@ -668,6 +731,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) +@@ -668,6 +732,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) case ZSTD_c_useBlockSplitter: case ZSTD_c_useRowMatchFinder: case ZSTD_c_deterministicRefPrefix: @@ -39289,7 +17324,7 @@ index f620cafca633..81b8cd119cd8 100644 break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); -@@ -723,12 +790,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, +@@ -723,12 +791,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, case ZSTD_c_minMatch : if (value!=0) /* 0 => use default */ BOUNDCHECK(ZSTD_c_minMatch, value); @@ -39304,7 +17339,7 @@ index f620cafca633..81b8cd119cd8 100644 return CCtxParams->cParams.targetLength; case ZSTD_c_strategy : -@@ -741,12 +808,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, +@@ -741,12 +809,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, /* Content size written in frame header _when known_ (default:1) */ DEBUGLOG(4, "set content size flag = %u", (value!=0)); CCtxParams->fParams.contentSizeFlag = value != 0; @@ -39319,7 +17354,7 @@ index f620cafca633..81b8cd119cd8 100644 case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); -@@ -755,18 +822,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, +@@ -755,18 +823,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, case ZSTD_c_forceMaxWindow : CCtxParams->forceWindow = (value != 0); @@ -39341,7 +17376,7 @@ index f620cafca633..81b8cd119cd8 100644 CCtxParams->literalCompressionMode = lcm; return CCtxParams->literalCompressionMode; } -@@ -789,47 +856,48 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, +@@ -789,47 +857,48 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, case ZSTD_c_enableDedicatedDictSearch : 
CCtxParams->enableDedicatedDictSearch = (value!=0); @@ -39397,7 +17432,7 @@ index f620cafca633..81b8cd119cd8 100644 case ZSTD_c_stableInBuffer: BOUNDCHECK(ZSTD_c_stableInBuffer, value); -@@ -866,6 +934,27 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, +@@ -866,6 +935,27 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, CCtxParams->deterministicRefPrefix = !!value; return CCtxParams->deterministicRefPrefix; @@ -39425,7 +17460,7 @@ index f620cafca633..81b8cd119cd8 100644 default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } } -@@ -980,6 +1069,18 @@ size_t ZSTD_CCtxParams_getParameter( +@@ -980,6 +1070,18 @@ size_t ZSTD_CCtxParams_getParameter( case ZSTD_c_deterministicRefPrefix: *value = (int)CCtxParams->deterministicRefPrefix; break; @@ -39444,22 +17479,45 @@ index f620cafca633..81b8cd119cd8 100644 default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } return 0; -@@ -1006,9 +1107,24 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( +@@ -1006,9 +1108,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( return 0; } +size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) +{ ++ ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); + DEBUGLOG(4, "ZSTD_CCtx_setCParams"); -+ assert(cctx != NULL); -+ if (cctx->streamStage != zcss_init) { -+ /* All parameters in @cparams are allowed to be updated during MT compression. -+ * This must be signaled, so that MT compression picks up the changes */ -+ cctx->cParamsChanged = 1; -+ } -+ /* only update if parameters are valid */ ++ /* only update if all parameters are valid */ + FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); -+ cctx->requestedParams.cParams = cparams; ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setFParams"); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) ++{ ++ DEBUGLOG(4, "ZSTD_CCtx_setParams"); ++ /* First check cParams, because we want to update all or none. */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); ++ /* Next set fParams, because this could fail if the cctx isn't in init stage. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); ++ /* Finally set cParams, which should succeed. 
*/ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); + return 0; +} + @@ -39470,15 +17528,82 @@ index f620cafca633..81b8cd119cd8 100644 RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, "Can't set pledgedSrcSize when not in init stage."); cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; -@@ -1151,6 +1267,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) +@@ -1024,9 +1164,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams); + + /* +- * Initializes the local dict using the requested parameters. +- * NOTE: This does not use the pledged src size, because it may be used for more +- * than one compression. ++ * Initializes the local dictionary using requested parameters. ++ * NOTE: Initialization does not employ the pledged src size, ++ * because the dictionary may be used for multiple compressions. + */ + static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + { +@@ -1039,8 +1179,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + return 0; + } + if (dl->cdict != NULL) { +- assert(cctx->cdict == dl->cdict); + /* Local dictionary already initialized. */ ++ assert(cctx->cdict == dl->cdict); + return 0; + } + assert(dl->dictSize > 0); +@@ -1060,26 +1200,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + } + + size_t ZSTD_CCtx_loadDictionary_advanced( +- ZSTD_CCtx* cctx, const void* dict, size_t dictSize, +- ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) ++ ZSTD_CCtx* cctx, ++ const void* dict, size_t dictSize, ++ ZSTD_dictLoadMethod_e dictLoadMethod, ++ ZSTD_dictContentType_e dictContentType) + { +- RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't load a dictionary when ctx is not in init stage."); + DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); +- ZSTD_clearAllDicts(cctx); /* in case one already exists */ +- if (dict == NULL || dictSize == 0) /* no dictionary mode */ ++ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, ++ "Can't load a dictionary when cctx is not in init stage."); ++ ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */ ++ if (dict == NULL || dictSize == 0) /* no dictionary */ + return 0; + if (dictLoadMethod == ZSTD_dlm_byRef) { + cctx->localDict.dict = dict; + } else { ++ /* copy dictionary content inside CCtx to own its lifetime */ + void* dictBuffer; + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, +- "no malloc for static CCtx"); ++ "static CCtx can't allocate for an internal copy of dictionary"); + dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem); +- RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!"); ++ RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation, ++ "allocation failed for dictionary content"); + ZSTD_memcpy(dictBuffer, dict, dictSize); +- cctx->localDict.dictBuffer = dictBuffer; +- cctx->localDict.dict = dictBuffer; ++ cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */ ++ cctx->localDict.dict = dictBuffer; /* read-only reference */ + } + cctx->localDict.dictSize = dictSize; + cctx->localDict.dictContentType = dictContentType; +@@ -1149,8 +1293,9 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, - "Can't reset parameters only when not in init stage."); +- "Can't reset parameters only when not in init stage."); ++ "Reset parameters is only 
possible during init stage."); ZSTD_clearAllDicts(cctx); + ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx)); return ZSTD_CCtxParams_reset(&cctx->requestedParams); } return 0; -@@ -1247,7 +1364,8 @@ static ZSTD_compressionParameters +@@ -1247,7 +1392,8 @@ static ZSTD_compressionParameters ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize, @@ -39488,7 +17613,7 @@ index f620cafca633..81b8cd119cd8 100644 { const U64 minSrcSize = 513; /* (1<<9) + 1 */ const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); -@@ -1281,8 +1399,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, +@@ -1281,8 +1427,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, } /* resize windowLog if input is small enough, to use less memory */ @@ -39499,7 +17624,7 @@ index f620cafca633..81b8cd119cd8 100644 U32 const tSize = (U32)(srcSize + dictSize); static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : -@@ -1300,6 +1418,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, +@@ -1300,6 +1446,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ @@ -39542,7 +17667,7 @@ index f620cafca633..81b8cd119cd8 100644 return cPar; } -@@ -1310,7 +1464,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, +@@ -1310,7 +1492,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, { cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; @@ -39551,7 +17676,7 @@ index f620cafca633..81b8cd119cd8 100644 } static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); -@@ -1341,7 +1495,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( +@@ -1341,7 +1523,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); assert(!ZSTD_checkCParams(cParams)); /* srcSizeHint == 0 means 0 */ @@ -39560,7 +17685,16 @@ index f620cafca633..81b8cd119cd8 100644 } static size_t -@@ -1386,6 +1540,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, +@@ -1370,7 +1552,7 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)) + + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); + size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder) +- ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) ++ ? ZSTD_cwksp_aligned_alloc_size(hSize) + : 0; + size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) + ? 
optPotentialSpace +@@ -1386,6 +1568,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; } @@ -39574,7 +17708,7 @@ index f620cafca633..81b8cd119cd8 100644 static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( const ZSTD_compressionParameters* cParams, const ldmParams_t* ldmParams, -@@ -1393,12 +1554,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( +@@ -1393,12 +1582,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( const ZSTD_paramSwitch_e useRowMatchFinder, const size_t buffInSize, const size_t buffOutSize, @@ -39592,7 +17726,7 @@ index f620cafca633..81b8cd119cd8 100644 size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); -@@ -1417,6 +1579,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( +@@ -1417,6 +1607,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; @@ -39604,7 +17738,7 @@ index f620cafca633..81b8cd119cd8 100644 size_t const neededSpace = cctxSpace + entropySpace + -@@ -1425,7 +1592,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( +@@ -1425,7 +1620,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( ldmSeqSpace + matchStateSize + tokenSpace + @@ -39614,7 +17748,7 @@ index f620cafca633..81b8cd119cd8 100644 DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); return neededSpace; -@@ -1443,7 +1611,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) +@@ -1443,7 +1639,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) * be needed. However, we still allocate two 0-sized buffers, which can * take space under ASAN. */ return ZSTD_estimateCCtxSize_usingCCtxParams_internal( @@ -39623,7 +17757,7 @@ index f620cafca633..81b8cd119cd8 100644 } size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) -@@ -1493,7 +1661,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) +@@ -1493,7 +1689,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); { ZSTD_compressionParameters const cParams = ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); @@ -39632,7 +17766,7 @@ index f620cafca633..81b8cd119cd8 100644 size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) ? 
((size_t)1 << cParams.windowLog) + blockSize : 0; -@@ -1504,7 +1672,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) +@@ -1504,7 +1700,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) return ZSTD_estimateCCtxSize_usingCCtxParams_internal( &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, @@ -39641,7 +17775,83 @@ index f620cafca633..81b8cd119cd8 100644 } } -@@ -1768,6 +1936,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, +@@ -1637,6 +1833,19 @@ typedef enum { + ZSTD_resetTarget_CCtx + } ZSTD_resetTarget_e; + ++/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */ ++static U64 ZSTD_bitmix(U64 val, U64 len) { ++ val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24); ++ val *= 0x9FB21C651E98DF25ULL; ++ val ^= (val >> 35) + len ; ++ val *= 0x9FB21C651E98DF25ULL; ++ return val ^ (val >> 28); ++} ++ ++/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */ ++static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) { ++ ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4); ++} + + static size_t + ZSTD_reset_matchState(ZSTD_matchState_t* ms, +@@ -1664,6 +1873,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + } + + ms->hashLog3 = hashLog3; ++ ms->lazySkipping = 0; + + ZSTD_invalidateMatchState(ms); + +@@ -1685,6 +1895,27 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + ZSTD_cwksp_clean_tables(ws); + } + ++ if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { ++ /* Row match finder needs an additional table of hashes ("tags") */ ++ size_t const tagTableSize = hSize; ++ /* We want to generate a new salt in case we reset a Cctx, but we always want to use ++ * 0 when we reset a Cdict */ ++ if(forWho == ZSTD_resetTarget_CCtx) { ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); ++ ZSTD_advanceHashSalt(ms); ++ } else { ++ /* When we are not salting we want to always memset the memory */ ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize); ++ ZSTD_memset(ms->tagTable, 0, tagTableSize); ++ ms->hashSalt = 0; ++ } ++ { /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); ++ assert(cParams->hashLog >= rowLog); ++ ms->rowHashLog = cParams->hashLog - rowLog; ++ } ++ } ++ + /* opt parser space */ + if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { + DEBUGLOG(4, "reserving optimal parser space"); +@@ -1696,19 +1927,6 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); + } + +- if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { +- { /* Row match finder needs an additional table of hashes ("tags") */ +- size_t const tagTableSize = hSize*sizeof(U16); +- ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize); +- if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); +- } +- { /* Switch to 32-entry rows if searchLog is 5 (or more) */ +- U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); +- assert(cParams->hashLog >= rowLog); +- ms->rowHashLog = cParams->hashLog - rowLog; +- } +- } +- + ms->cParams = *cParams; + + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, +@@ -1768,6 +1986,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, assert(params->useRowMatchFinder != ZSTD_ps_auto); assert(params->useBlockSplitter != 
ZSTD_ps_auto); assert(params->ldmParams.enableLdm != ZSTD_ps_auto); @@ -39649,7 +17859,7 @@ index f620cafca633..81b8cd119cd8 100644 if (params->ldmParams.enableLdm == ZSTD_ps_enable) { /* Adjust long distance matching parameters */ ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); -@@ -1776,9 +1945,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, +@@ -1776,9 +1995,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, } { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); @@ -39661,7 +17871,7 @@ index f620cafca633..81b8cd119cd8 100644 size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) ? ZSTD_compressBound(blockSize) + 1 : 0; -@@ -1795,7 +1963,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, +@@ -1795,7 +2013,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, size_t const neededSpace = ZSTD_estimateCCtxSize_usingCCtxParams_internal( ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, @@ -39670,7 +17880,7 @@ index f620cafca633..81b8cd119cd8 100644 int resizeWorkspace; FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); -@@ -1838,6 +2006,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, +@@ -1838,6 +2056,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, /* init params */ zc->blockState.matchState.cParams = params->cParams; @@ -39678,10 +17888,34 @@ index f620cafca633..81b8cd119cd8 100644 zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; zc->consumedSrcSize = 0; zc->producedCSize = 0; -@@ -1907,6 +2076,14 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, - zc->ldmState.loadedDictEnd = 0; - } +@@ -1854,13 +2073,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + ++ FORWARD_IF_ERROR(ZSTD_reset_matchState( ++ &zc->blockState.matchState, ++ ws, ++ ¶ms->cParams, ++ params->useRowMatchFinder, ++ crp, ++ needsIndexReset, ++ ZSTD_resetTarget_CCtx), ""); ++ ++ zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); ++ ++ /* ldm hash table */ ++ if (params->ldmParams.enableLdm == ZSTD_ps_enable) { ++ /* TODO: avoid memset? */ ++ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; ++ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); ++ ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); ++ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); ++ zc->maxNbLdmSequences = maxNbLdmSeq; ++ ++ ZSTD_window_init(&zc->ldmState.window); ++ zc->ldmState.loadedDictEnd = 0; ++ } ++ + /* reserve space for block-level external sequences */ + if (params->useSequenceProducer) { + size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); @@ -39690,10 +17924,53 @@ index f620cafca633..81b8cd119cd8 100644 + (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); + } + - DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); - assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); ++ /* buffers */ ++ + /* ZSTD_wildcopy() is used to copy into the literals buffer, + * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. 
+ */ + zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.maxNbLit = blockSize; -@@ -1980,7 +2157,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, +- /* buffers */ + zc->bufferedPolicy = zbuff; + zc->inBuffSize = buffInSize; + zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); +@@ -1883,32 +2135,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); +- zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); +- +- FORWARD_IF_ERROR(ZSTD_reset_matchState( +- &zc->blockState.matchState, +- ws, +- ¶ms->cParams, +- params->useRowMatchFinder, +- crp, +- needsIndexReset, +- ZSTD_resetTarget_CCtx), ""); +- +- /* ldm hash table */ +- if (params->ldmParams.enableLdm == ZSTD_ps_enable) { +- /* TODO: avoid memset? */ +- size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; +- zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); +- ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); +- zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); +- zc->maxNbLdmSequences = maxNbLdmSeq; +- +- ZSTD_window_init(&zc->ldmState.window); +- zc->ldmState.loadedDictEnd = 0; +- } + + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); +- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); ++ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); + + zc->initialized = 1; + +@@ -1980,7 +2209,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, } params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, @@ -39703,7 +17980,7 @@ index f620cafca633..81b8cd119cd8 100644 params.cParams.windowLog = windowLog; params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, -@@ -2019,6 +2197,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, +@@ -2019,6 +2249,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, return 0; } @@ -39726,7 +18003,7 @@ index f620cafca633..81b8cd119cd8 100644 static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, ZSTD_CCtx_params params, -@@ -2054,14 +2248,15 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, +@@ -2054,21 +2300,23 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, : 0; size_t const hSize = (size_t)1 << cdict_cParams->hashLog; @@ -39748,7 +18025,18 @@ index f620cafca633..81b8cd119cd8 100644 } /* copy tag table */ if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { -@@ -2147,6 +2342,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, +- size_t const tagTableSize = hSize*sizeof(U16); ++ size_t const tagTableSize = hSize; + ZSTD_memcpy(cctx->blockState.matchState.tagTable, +- cdict->matchState.tagTable, +- tagTableSize); ++ cdict->matchState.tagTable, ++ tagTableSize); ++ cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; + } + } + +@@ -2147,6 +2395,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; params.ldmParams = 
srcCCtx->appliedParams.ldmParams; params.fParams = fParams; @@ -39756,7 +18044,7 @@ index f620cafca633..81b8cd119cd8 100644 ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, /* loadedDictSize */ 0, ZSTDcrp_leaveDirty, zbuff); -@@ -2294,7 +2490,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par +@@ -2294,7 +2543,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par /* See doc/zstd_compression_format.md for detailed format description */ @@ -39765,7 +18053,7 @@ index f620cafca633..81b8cd119cd8 100644 { const seqDef* const sequences = seqStorePtr->sequencesStart; BYTE* const llCodeTable = seqStorePtr->llCode; -@@ -2302,18 +2498,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) +@@ -2302,18 +2551,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) BYTE* const mlCodeTable = seqStorePtr->mlCode; U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); U32 u; @@ -39791,7 +18079,7 @@ index f620cafca633..81b8cd119cd8 100644 } /* ZSTD_useTargetCBlockSize(): -@@ -2347,6 +2549,7 @@ typedef struct { +@@ -2347,6 +2602,7 @@ typedef struct { U32 MLtype; size_t size; size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ @@ -39799,7 +18087,7 @@ index f620cafca633..81b8cd119cd8 100644 } ZSTD_symbolEncodingTypeStats_t; /* ZSTD_buildSequencesStatistics(): -@@ -2357,11 +2560,13 @@ typedef struct { +@@ -2357,11 +2613,13 @@ typedef struct { * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) */ static ZSTD_symbolEncodingTypeStats_t @@ -39818,7 +18106,7 @@ index f620cafca633..81b8cd119cd8 100644 BYTE* const ostart = dst; const BYTE* const oend = dstEnd; BYTE* op = ostart; -@@ -2375,7 +2580,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +@@ -2375,7 +2633,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, stats.lastCountSize = 0; /* convert length/distances into codes */ @@ -39827,7 +18115,7 @@ index f620cafca633..81b8cd119cd8 100644 assert(op <= oend); assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ /* build CTable for Literal Lengths */ -@@ -2480,22 +2685,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +@@ -2480,22 +2738,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, */ #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 MEM_STATIC size_t @@ -39859,7 +18147,7 @@ index f620cafca633..81b8cd119cd8 100644 const BYTE* const ofCodeTable = seqStorePtr->ofCode; const BYTE* const llCodeTable = seqStorePtr->llCode; const BYTE* const mlCodeTable = seqStorePtr->mlCode; -@@ -2503,29 +2708,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +@@ -2503,29 +2761,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, BYTE* const oend = ostart + dstCapacity; BYTE* op = ostart; size_t lastCountSize; @@ -39898,7 +18186,7 @@ index f620cafca633..81b8cd119cd8 100644 FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); assert(cSize <= dstCapacity); op += cSize; -@@ -2551,11 +2758,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +@@ -2551,11 +2811,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); return (size_t)(op - ostart); } @@ -39913,7 +18201,7 @@ index f620cafca633..81b8cd119cd8 100644 &prevEntropy->fse, &nextEntropy->fse, op, oend, strategy, count, -@@ -2564,6 
+2770,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +@@ -2564,6 +2823,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); lastCountSize = stats.lastCountSize; op += stats.size; @@ -39921,7 +18209,7 @@ index f620cafca633..81b8cd119cd8 100644 } { size_t const bitstreamSize = ZSTD_encodeSequences( -@@ -2598,14 +2805,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +@@ -2598,14 +2858,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, } MEM_STATIC size_t @@ -39945,7 +18233,7 @@ index f620cafca633..81b8cd119cd8 100644 { size_t const cSize = ZSTD_entropyCompressSeqStore_internal( seqStorePtr, prevEntropy, nextEntropy, cctxParams, -@@ -2615,15 +2823,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, +@@ -2615,15 +2876,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. */ @@ -39969,7 +18257,7 @@ index f620cafca633..81b8cd119cd8 100644 return cSize; } -@@ -2718,6 +2932,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) +@@ -2718,6 +2985,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) ssPtr->longLengthType = ZSTD_llt_none; } @@ -40042,7 +18330,7 @@ index f620cafca633..81b8cd119cd8 100644 typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) -@@ -2727,7 +3007,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2727,7 +3060,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) assert(srcSize <= ZSTD_BLOCKSIZE_MAX); /* Assert that we have correctly flushed the ctx params into the ms's copy */ ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); @@ -40053,7 +18341,7 @@ index f620cafca633..81b8cd119cd8 100644 if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); } else { -@@ -2763,6 +3045,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2763,6 +3098,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) } if (zc->externSeqStore.pos < zc->externSeqStore.size) { assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); @@ -40069,7 +18357,7 @@ index f620cafca633..81b8cd119cd8 100644 /* Updates ldmSeqStore.pos */ lastLLSize = ZSTD_ldm_blockCompress(&zc->externSeqStore, -@@ -2774,6 +3065,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2774,6 +3118,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { rawSeqStore_t ldmSeqStore = kNullRawSeqStore; @@ -40084,7 +18372,7 @@ index f620cafca633..81b8cd119cd8 100644 ldmSeqStore.seq = zc->ldmSequences; ldmSeqStore.capacity = zc->maxNbLdmSequences; /* Updates ldmSeqStore.size */ -@@ -2788,7 +3087,68 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2788,7 +3140,68 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) zc->appliedParams.useRowMatchFinder, src, srcSize); assert(ldmSeqStore.pos == ldmSeqStore.size); @@ -40154,7 +18442,7 @@ index f620cafca633..81b8cd119cd8 100644 
ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, zc->appliedParams.useRowMatchFinder, dictMode); -@@ -2849,7 +3209,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) +@@ -2849,7 +3262,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode so we provide seqStoreSeqs[i].offset - 1 */ ZSTD_updateRep(updatedRepcodes.rep, @@ -40163,7 +18451,7 @@ index f620cafca633..81b8cd119cd8 100644 seqStoreSeqs[i].litLength == 0); literalsRead += outSeqs[i].litLength; } -@@ -2865,6 +3225,10 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) +@@ -2865,6 +3278,10 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) zc->seqCollector.seqIndex += seqStoreSeqSize; } @@ -40174,7 +18462,7 @@ index f620cafca633..81b8cd119cd8 100644 size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, size_t outSeqsSize, const void* src, size_t srcSize) { -@@ -2910,19 +3274,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { +@@ -2910,19 +3327,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { const size_t unrollMask = unrollSize - 1; const size_t prefixLength = length & unrollMask; size_t i; @@ -40196,7 +18484,7 @@ index f620cafca633..81b8cd119cd8 100644 return 1; } -@@ -2938,7 +3300,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) +@@ -2938,7 +3353,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) return nbSeqs < 4 && nbLits < 10; } @@ -40206,7 +18494,7 @@ index f620cafca633..81b8cd119cd8 100644 { ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; bs->prevCBlock = bs->nextCBlock; -@@ -2946,7 +3309,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c +@@ -2946,7 +3362,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c } /* Writes the block header */ @@ -40217,7 +18505,7 @@ index f620cafca633..81b8cd119cd8 100644 U32 const cBlockHeader = cSize == 1 ? lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); -@@ -2959,13 +3324,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB +@@ -2959,13 +3377,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB * Stores literals block type (raw, rle, compressed, repeat) and * huffman description table to hufMetadata. * Requires ENTROPY_WORKSPACE_SIZE workspace @@ -40241,7 +18529,7 @@ index f620cafca633..81b8cd119cd8 100644 { BYTE* const wkspStart = (BYTE*)workspace; BYTE* const wkspEnd = wkspStart + wkspSize; -@@ -2973,9 +3341,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi +@@ -2973,9 +3394,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi unsigned* const countWksp = (unsigned*)workspace; const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); BYTE* const nodeWksp = countWkspStart + countWkspSize; @@ -40253,7 +18541,7 @@ index f620cafca633..81b8cd119cd8 100644 HUF_repeat repeat = prevHuf->repeatMode; DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); -@@ -2990,73 +3358,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi +@@ -2990,73 +3411,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi /* small ? 
don't even attempt compression (speed opt) */ #ifndef COMPRESS_LITERALS_SIZE_MIN @@ -40365,7 +18653,7 @@ index f620cafca633..81b8cd119cd8 100644 } } -@@ -3066,8 +3438,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi +@@ -3066,8 +3491,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi * and updates nextEntropy to the appropriate repeatMode. */ static ZSTD_symbolEncodingTypeStats_t @@ -40377,7 +18665,7 @@ index f620cafca633..81b8cd119cd8 100644 nextEntropy->litlength_repeatMode = FSE_repeat_none; nextEntropy->offcode_repeatMode = FSE_repeat_none; nextEntropy->matchlength_repeatMode = FSE_repeat_none; -@@ -3078,16 +3451,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { +@@ -3078,16 +3504,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { * Builds entropy for the sequences. * Stores symbol compression modes and fse table to fseMetadata. * Requires ENTROPY_WORKSPACE_SIZE wksp. @@ -40404,7 +18692,7 @@ index f620cafca633..81b8cd119cd8 100644 BYTE* const ostart = fseMetadata->fseTablesBuffer; BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); BYTE* op = ostart; -@@ -3114,23 +3489,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, +@@ -3114,23 +3542,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, /* ZSTD_buildBlockEntropyStats() : * Builds entropy for the block. * Requires workspace size ENTROPY_WORKSPACE_SIZE @@ -40444,7 +18732,7 @@ index f620cafca633..81b8cd119cd8 100644 FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); entropyMetadata->fseMetadata.fseTablesSize = ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, -@@ -3143,11 +3523,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +@@ -3143,11 +3576,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, } /* Returns the size estimate for the literals section (header + content) of a block */ @@ -40462,7 +18750,7 @@ index f620cafca633..81b8cd119cd8 100644 { unsigned* const countWksp = (unsigned*)workspace; unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; -@@ -3169,12 +3550,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz +@@ -3169,12 +3603,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz } /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ @@ -40482,7 +18770,7 @@ index f620cafca633..81b8cd119cd8 100644 { unsigned* const countWksp = (unsigned*)workspace; const BYTE* ctp = codeTable; -@@ -3206,99 +3588,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, +@@ -3206,99 +3641,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, } /* Returns the size estimate for the sequences section (header + content) of a block */ @@ -40635,7 +18923,7 @@ index f620cafca633..81b8cd119cd8 100644 return matchBytes; } -@@ -3307,15 +3697,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { +@@ -3307,15 +3750,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { */ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, const seqStore_t* originalSeqStore, @@ -40654,7 +18942,7 @@ index f620cafca633..81b8cd119cd8 100644 } /* Move longLengthPos into the correct position if necessary */ -@@ -3328,13 +3715,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* 
resultSeqStore, +@@ -3328,13 +3768,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, } resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; @@ -40671,7 +18959,7 @@ index f620cafca633..81b8cd119cd8 100644 } resultSeqStore->llCode += startIdx; resultSeqStore->mlCode += startIdx; -@@ -3342,20 +3728,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, +@@ -3342,20 +3781,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, } /* @@ -40708,7 +18996,7 @@ index f620cafca633..81b8cd119cd8 100644 } /* -@@ -3371,30 +3763,32 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c +@@ -3371,30 +3816,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c * 1-3 : repcode 1-3 * 4+ : real_offset+3 */ @@ -40719,14 +19007,16 @@ index f620cafca633..81b8cd119cd8 100644 + const seqStore_t* const seqStore, U32 const nbSeq) +{ U32 idx = 0; ++ U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? seqStore->longLengthPos : nbSeq; for (; idx < nbSeq; ++idx) { seqDef* const seq = seqStore->sequencesStart + idx; - U32 const ll0 = (seq->litLength == 0); +- U32 const ll0 = (seq->litLength == 0); - U32 const offCode = OFFBASE_TO_STORED(seq->offBase); - assert(seq->offBase > 0); - if (STORED_IS_REPCODE(offCode)) { - U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); - U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); ++ U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); + U32 const offBase = seq->offBase; + assert(offBase > 0); + if (OFFBASE_IS_REPCODE(offBase)) { @@ -40751,7 +19041,7 @@ index f620cafca633..81b8cd119cd8 100644 } } -@@ -3404,10 +3798,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_ +@@ -3404,10 +3852,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_ * Returns the total size of that block (including header) or a ZSTD error code. */ static size_t @@ -40765,7 +19055,7 @@ index f620cafca633..81b8cd119cd8 100644 U32 lastBlock, U32 isPartition) { const U32 rleMaxLength = 25; -@@ -3481,45 +3876,49 @@ typedef struct { +@@ -3481,45 +3930,49 @@ typedef struct { /* Helper function to perform the recursive search for block splits. * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. @@ -40825,7 +19115,7 @@ index f620cafca633..81b8cd119cd8 100644 ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); splits->splitLocations[splits->idx] = (U32)midIdx; splits->idx++; -@@ -3527,14 +3926,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end +@@ -3527,14 +3980,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end } } @@ -40849,7 +19139,7 @@ index f620cafca633..81b8cd119cd8 100644 /* Refuse to try and split anything with less than 4 sequences */ return 0; } -@@ -3550,18 +3953,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) +@@ -3550,18 +4007,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) * Returns combined size of all blocks (which includes headers), or a ZSTD error code. 
*/ static size_t @@ -40876,7 +19166,7 @@ index f620cafca633..81b8cd119cd8 100644 /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two -@@ -3583,30 +3988,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac +@@ -3583,30 +4042,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); @@ -40917,7 +19207,7 @@ index f620cafca633..81b8cd119cd8 100644 srcBytesTotal += srcBytes; if (lastPartition) { /* This is the final partition, need to account for possible last literals */ -@@ -3621,7 +4027,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac +@@ -3621,7 +4081,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac op, dstCapacity, ip, srcBytes, lastBlockEntireSrc, 1 /* isPartition */); @@ -40927,7 +19217,7 @@ index f620cafca633..81b8cd119cd8 100644 FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); ip += srcBytes; -@@ -3629,10 +4036,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac +@@ -3629,10 +4090,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac dstCapacity -= cSizeChunk; cSize += cSizeChunk; *currSeqStore = *nextSeqStore; @@ -40941,7 +19231,7 @@ index f620cafca633..81b8cd119cd8 100644 */ ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); return cSize; -@@ -3643,8 +4050,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, +@@ -3643,8 +4104,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) { @@ -40950,7 +19240,7 @@ index f620cafca633..81b8cd119cd8 100644 U32 nbSeq; size_t cSize; DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); -@@ -3655,7 +4060,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, +@@ -3655,7 +4114,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, if (bss == ZSTDbss_noCompress) { if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; @@ -40959,7 +19249,7 @@ index f620cafca633..81b8cd119cd8 100644 FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); return cSize; -@@ -3673,9 +4078,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, +@@ -3673,9 +4132,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 frame) { @@ -40972,7 +19262,7 @@ index f620cafca633..81b8cd119cd8 100644 */ const U32 rleMaxLength = 25; size_t cSize; -@@ -3767,10 +4172,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, +@@ -3767,10 +4226,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, * * cSize >= blockBound(srcSize): We have expanded the block too much so * emit an uncompressed block. 
*/ @@ -40987,7 +19277,7 @@ index f620cafca633..81b8cd119cd8 100644 FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); -@@ -3778,7 +4184,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, +@@ -3778,7 +4238,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, } } } @@ -40996,7 +19286,7 @@ index f620cafca633..81b8cd119cd8 100644 DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); /* Superblock compression failed, attempt to emit a single no compress block. -@@ -3836,7 +4242,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, +@@ -3836,7 +4296,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, * All blocks will be terminated, all input will be consumed. * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. * Frame is supposed already started (header already produced) @@ -41005,7 +19295,7 @@ index f620cafca633..81b8cd119cd8 100644 */ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, -@@ -3860,7 +4266,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, +@@ -3860,7 +4320,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, ZSTD_matchState_t* const ms = &cctx->blockState.matchState; U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); @@ -41016,7 +19306,7 @@ index f620cafca633..81b8cd119cd8 100644 dstSize_tooSmall, "not enough space to store compressed block"); if (remaining < blockSize) blockSize = remaining; -@@ -3899,7 +4307,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, +@@ -3899,7 +4361,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, MEM_writeLE24(op, cBlockHeader); cSize += ZSTD_blockHeaderSize; } @@ -41025,7 +19315,31 @@ index f620cafca633..81b8cd119cd8 100644 ip += blockSize; -@@ -4091,7 +4499,7 @@ size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) +@@ -4078,31 +4540,51 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + } + } + +-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressContinue_public() */ ++size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize); ++} + +-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) { ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; assert(!ZSTD_checkCParams(cParams)); @@ -41033,8 +19347,34 @@ index f620cafca633..81b8cd119cd8 100644 + return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); } - size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) -@@ -4111,31 +4519,47 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, +-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++/* NOTE: Must just wrap 
ZSTD_getBlockSize_deprecated() */ ++size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++{ ++ return ZSTD_getBlockSize_deprecated(cctx); ++} ++ ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); +- { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); ++ { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++{ ++ return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); ++} ++ + /*! ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +@@ -4111,25 +4593,36 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, ZSTD_cwksp* ws, ZSTD_CCtx_params const* params, const void* src, size_t srcSize, @@ -41079,19 +19419,50 @@ index f620cafca633..81b8cd119cd8 100644 /* If the dictionary is too large, only load the suffix of the dictionary. */ if (srcSize > maxDictSize) { ip = iend - maxDictSize; - src = ip; - srcSize = maxDictSize; -- } -+ } } -+ +@@ -4138,30 +4631,46 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } + } + +- DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); + if (srcSize > ZSTD_CHUNKSIZE_MAX) { + /* We must have cleared our windows when our source is this large. */ + assert(ZSTD_window_isEmpty(ms->window)); + if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); ++ } + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); +- ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); +- ms->forceNonContiguous = params->deterministicRefPrefix; + +- if (loadLdmDict) { ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ ++ if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); + ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); ++ ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); ++ } ++ ++ /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */ ++ if (params->cParams.strategy < ZSTD_btultra) { ++ U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28); ++ if (srcSize > maxDictSize) { ++ ip = iend - maxDictSize; ++ src = ip; ++ srcSize = maxDictSize; ++ } } - DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); -@@ -4158,10 +4582,10 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, ++ ms->nextToUpdate = (U32)(ip - ms->window.base); ++ ms->loadedDictEnd = params->forceWindow ? 
0 : (U32)(iend - ms->window.base); ++ ms->forceNonContiguous = params->deterministicRefPrefix; ++ + if (srcSize <= HASH_READ_SIZE) return 0; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); + +- if (loadLdmDict) +- ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); +- switch(params->cParams.strategy) { case ZSTD_fast: @@ -41104,7 +19475,16 @@ index f620cafca633..81b8cd119cd8 100644 break; case ZSTD_greedy: -@@ -4327,6 +4751,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, +@@ -4174,7 +4683,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { +- size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); ++ size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); +@@ -4327,6 +4836,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_CCtx_params const* params, const void* dict, size_t dictSize, ZSTD_dictTableLoadMethod_e dtlm, @@ -41112,7 +19492,7 @@ index f620cafca633..81b8cd119cd8 100644 void* workspace) { const BYTE* dictPtr = (const BYTE*)dict; -@@ -4345,7 +4770,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, +@@ -4345,7 +4855,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, { size_t const dictContentSize = (size_t)(dictEnd - dictPtr); FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( @@ -41121,7 +19501,7 @@ index f620cafca633..81b8cd119cd8 100644 } return dictID; } -@@ -4361,6 +4786,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, +@@ -4361,6 +4871,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, ZSTD_dictTableLoadMethod_e dtlm, @@ -41129,7 +19509,7 @@ index f620cafca633..81b8cd119cd8 100644 void* workspace) { DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); -@@ -4373,13 +4799,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, +@@ -4373,13 +4884,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, /* dict restricted modes */ if (dictContentType == ZSTD_dct_rawContent) @@ -41145,7 +19525,7 @@ index f620cafca633..81b8cd119cd8 100644 } RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); assert(0); /* impossible */ -@@ -4387,13 +4813,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, +@@ -4387,13 +4898,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, /* dict as full zstd dictionary */ return ZSTD_loadZstdDictionary( @@ -41161,7 +19541,7 @@ index f620cafca633..81b8cd119cd8 100644 * @return : 0, or an error code */ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, -@@ -4426,11 +4853,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, +@@ -4426,11 +4938,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, cctx->blockState.prevCBlock, &cctx->blockState.matchState, &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, cdict->dictContentSize, cdict->dictContentType, dtlm, @@ -41175,13 +19555,13 @@ index f620cafca633..81b8cd119cd8 100644 FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); assert(dictID <= UINT_MAX); cctx->dictID 
= (U32)dictID; -@@ -4471,11 +4898,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, +@@ -4471,11 +4983,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, &cctxParams, pledgedSrcSize); } -size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) -+size_t -+ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++static size_t ++ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) { ZSTD_CCtx_params cctxParams; - { @@ -41190,7 +19570,61 @@ index f620cafca633..81b8cd119cd8 100644 ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel); } DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); -@@ -4709,7 +5136,7 @@ static size_t ZSTD_initCDict_internal( +@@ -4483,9 +4995,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); + } + ++size_t ++ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++{ ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); ++} ++ + size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) + { +- return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); + } + + +@@ -4537,9 +5055,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) + (void)extraCSize; + } + +-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, +@@ -4563,6 +5081,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, + return cSize + endResult; + } + ++/* NOTE: Must just wrap ZSTD_compressEnd_public() */ ++size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); ++} ++ + size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -4591,7 +5117,7 @@ size_t ZSTD_compress_advanced_internal( + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, +@@ -4709,7 +5235,7 @@ static size_t ZSTD_initCDict_internal( { size_t const dictID = ZSTD_compress_insertDictionary( &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, ¶ms, cdict->dictContent, cdict->dictContentSize, @@ -41199,7 +19633,43 @@ index f620cafca633..81b8cd119cd8 100644 FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); assert(dictID <= (size_t)(U32)-1); cdict->dictID = (U32)dictID; -@@ -5197,30 +5624,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) +@@ -4906,6 +5432,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; 
++ cdict->compressionLevel = ZSTD_NO_CLEVEL; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, +@@ -4985,12 +5512,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced( + + /* ZSTD_compressBegin_usingCDict() : + * cdict must be != NULL */ +-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) + { + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); + } + ++size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++{ ++ return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); ++} ++ + /*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. + */ +@@ -5000,7 +5532,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) + { + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + /*! ZSTD_compress_usingCDict_advanced(): +@@ -5197,30 +5729,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) { @@ -41253,7 +19723,7 @@ index f620cafca633..81b8cd119cd8 100644 if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { assert(zcs->inBuff != NULL); assert(zcs->inBuffSize > 0); -@@ -5229,8 +5667,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, +@@ -5229,8 +5772,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, assert(zcs->outBuff != NULL); assert(zcs->outBuffSize > 0); } @@ -41265,7 +19735,16 @@ index f620cafca633..81b8cd119cd8 100644 assert((U32)flushMode <= (U32)ZSTD_e_end); while (someMoreWork) { -@@ -5262,8 +5702,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, +@@ -5245,7 +5790,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ +- size_t const cSize = ZSTD_compressEnd(zcs, ++ size_t const cSize = ZSTD_compressEnd_public(zcs, + op, oend-op, ip, iend-ip); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); +@@ -5262,8 +5807,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, zcs->inBuff + zcs->inBuffPos, toLoad, ip, iend-ip); zcs->inBuffPos += loaded; @@ -41275,7 +19754,7 @@ index f620cafca633..81b8cd119cd8 100644 if ( (flushMode == ZSTD_e_continue) && (zcs->inBuffPos < zcs->inBuffTarget) ) { /* not enough input to fill full block : stop here */ -@@ -5274,6 +5713,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, +@@ -5274,6 +5818,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, /* empty */ someMoreWork = 0; break; } @@ -41296,7 +19775,7 @@ index f620cafca633..81b8cd119cd8 100644 } /* compress current block (note : this stage cannot be stopped in the middle) */ DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); -@@ -5281,9 +5734,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, +@@ 
-5281,9 +5839,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, void* cDst; size_t cSize; size_t oSize = oend-op; @@ -41308,7 +19787,19 @@ index f620cafca633..81b8cd119cd8 100644 if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) cDst = op; /* compress into output buffer, to skip flush stage */ else -@@ -5306,19 +5758,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, +@@ -5291,9 +5848,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (inputBuffered) { + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ++ ZSTD_compressEnd_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ++ ZSTD_compressContinue_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +@@ -5306,19 +5863,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, if (!lastBlock) assert(zcs->inBuffTarget <= zcs->inBuffSize); zcs->inToCompress = zcs->inBuffPos; @@ -41318,8 +19809,10 @@ index f620cafca633..81b8cd119cd8 100644 + } else { /* !inputBuffered, hence ZSTD_bm_stable */ + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); cSize = lastBlock ? - ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : - ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); +- ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); ++ ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : ++ ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); /* Consume the input prior to error checking to mirror buffered mode. */ - if (iSize > 0) - ip += iSize; @@ -41332,7 +19825,7 @@ index f620cafca633..81b8cd119cd8 100644 } if (cDst == op) { /* no need to flush */ op += cSize; -@@ -5388,8 +5837,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf +@@ -5388,8 +5942,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf /* After a compression call set the expected input/output buffer. * This is validated at the start of the next compression call. */ @@ -41344,7 +19837,7 @@ index f620cafca633..81b8cd119cd8 100644 if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { cctx->expectedInBuffer = *input; } -@@ -5408,22 +5859,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, +@@ -5408,22 +5964,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, { if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ZSTD_inBuffer const expect = cctx->expectedInBuffer; @@ -41373,7 +19866,7 @@ index f620cafca633..81b8cd119cd8 100644 ZSTD_CCtx_params params = cctx->requestedParams; ZSTD_prefixDict const prefixDict = cctx->prefixDict; FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */ -@@ -5437,9 +5888,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, +@@ -5437,9 +5993,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, params.compressionLevel = cctx->cdict->compressionLevel; } DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); @@ -41386,7 +19879,7 @@ index f620cafca633..81b8cd119cd8 100644 ? prefixDict.dictSize : (cctx->cdict ? 
cctx->cdict->dictContentSize : 0); ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); -@@ -5451,6 +5902,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, +@@ -5451,6 +6007,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); @@ -41396,7 +19889,7 @@ index f620cafca633..81b8cd119cd8 100644 { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); -@@ -5477,6 +5931,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, +@@ -5477,6 +6036,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, return 0; } @@ -41405,7 +19898,7 @@ index f620cafca633..81b8cd119cd8 100644 size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input, -@@ -5491,8 +5947,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, +@@ -5491,8 +6052,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, /* transparent initialization stage */ if (cctx->streamStage == zcss_init) { @@ -41435,7 +19928,7 @@ index f620cafca633..81b8cd119cd8 100644 } /* end of transparent initialization stage */ -@@ -5510,13 +5985,20 @@ size_t ZSTD_compressStream2_simpleArgs ( +@@ -5510,13 +6090,20 @@ size_t ZSTD_compressStream2_simpleArgs ( const void* src, size_t srcSize, size_t* srcPos, ZSTD_EndDirective endOp) { @@ -41462,7 +19955,7 @@ index f620cafca633..81b8cd119cd8 100644 } size_t ZSTD_compress2(ZSTD_CCtx* cctx, -@@ -5539,6 +6021,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5539,6 +6126,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, /* Reset to the original values. 
*/ cctx->requestedParams.inBufferMode = originalInBufferMode; cctx->requestedParams.outBufferMode = originalOutBufferMode; @@ -41470,7 +19963,7 @@ index f620cafca633..81b8cd119cd8 100644 FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); if (result != 0) { /* compression not completed, due to lack of output space */ assert(oPos == dstCapacity); -@@ -5549,64 +6032,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5549,64 +6137,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, } } @@ -41555,7 +20048,7 @@ index f620cafca633..81b8cd119cd8 100644 if (cctx->cdict) { dictSize = (U32)cctx->cdict->dictContentSize; } else if (cctx->prefixDict.dict) { -@@ -5615,25 +6095,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, +@@ -5615,25 +6200,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, dictSize = 0; } ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); @@ -41620,7 +20113,7 @@ index f620cafca633..81b8cd119cd8 100644 ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); if (inSeqs[idx].litLength) { -@@ -5642,26 +6152,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, +@@ -5642,26 +6257,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ip += inSeqs[idx].litLength; seqPos->posInSrc += inSeqs[idx].litLength; } @@ -41650,7 +20143,7 @@ index f620cafca633..81b8cd119cd8 100644 { U32 idx = seqPos->idx; U32 startPosInSequence = seqPos->posInSequence; -@@ -5673,6 +6172,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5673,6 +6277,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* U32 bytesAdjustment = 0; U32 finalMatchSplit = 0; @@ -41660,7 +20153,7 @@ index f620cafca633..81b8cd119cd8 100644 if (cctx->cdict) { dictSize = cctx->cdict->dictContentSize; } else if (cctx->prefixDict.dict) { -@@ -5680,7 +6182,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5680,7 +6287,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* } else { dictSize = 0; } @@ -41669,7 +20162,7 @@ index f620cafca633..81b8cd119cd8 100644 DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { -@@ -5688,7 +6190,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5688,7 +6295,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* U32 litLength = currSeq.litLength; U32 matchLength = currSeq.matchLength; U32 const rawOffset = currSeq.offset; @@ -41678,7 +20171,7 @@ index f620cafca633..81b8cd119cd8 100644 /* Modify the sequence depending on where endPosInSequence lies */ if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { -@@ -5702,7 +6204,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5702,7 +6309,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* /* Move to the next sequence */ endPosInSequence -= currSeq.litLength + currSeq.matchLength; startPosInSequence = 0; @@ -41686,7 +20179,7 @@ index f620cafca633..81b8cd119cd8 100644 } else { /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence does not reach the end of the match. 
So, we have to split the sequence */ -@@ -5742,21 +6243,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5742,21 +6348,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* } /* Check if this offset can be represented with a repcode */ { U32 const ll0 = (litLength == 0); @@ -41717,7 +20210,7 @@ index f620cafca633..81b8cd119cd8 100644 } DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); -@@ -5779,7 +6282,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5779,7 +6387,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, @@ -41726,7 +20219,7 @@ index f620cafca633..81b8cd119cd8 100644 static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) { ZSTD_sequenceCopier sequenceCopier = NULL; -@@ -5793,6 +6296,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) +@@ -5793,6 +6401,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) return sequenceCopier; } @@ -41784,7 +20277,7 @@ index f620cafca633..81b8cd119cd8 100644 /* Compress, block-by-block, all of the sequences given. * * Returns the cumulative size of all compressed blocks (including their headers), -@@ -5805,9 +6359,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5805,9 +6464,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, const void* src, size_t srcSize) { size_t cSize = 0; @@ -41794,7 +20287,7 @@ index f620cafca633..81b8cd119cd8 100644 size_t remaining = srcSize; ZSTD_sequencePosition seqPos = {0, 0, 0}; -@@ -5827,22 +6378,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5827,22 +6483,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, } while (remaining) { @@ -41830,7 +20323,7 @@ index f620cafca633..81b8cd119cd8 100644 cSize += cBlockSize; ip += blockSize; op += cBlockSize; -@@ -5851,6 +6409,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5851,6 +6514,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, continue; } @@ -41838,7 +20331,7 @@ index f620cafca633..81b8cd119cd8 100644 compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, &cctx->appliedParams, -@@ -5859,11 +6418,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5859,11 +6523,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, cctx->bmi2); FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); @@ -41852,7 +20345,7 @@ index f620cafca633..81b8cd119cd8 100644 /* We don't want to emit our first block as a RLE even if it qualifies because * doing so will cause the decoder (cli only) to throw a "should consume all input error." 
* This is only an issue for zstd <= v1.4.3 -@@ -5874,12 +6433,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5874,12 +6538,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, if (compressedSeqsSize == 0) { /* ZSTD_noCompressBlock writes the block header as well */ cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); @@ -41869,7 +20362,7 @@ index f620cafca633..81b8cd119cd8 100644 } else { U32 cBlockHeader; /* Error checking and repcodes update */ -@@ -5891,11 +6450,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5891,11 +6555,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); MEM_writeLE24(op, cBlockHeader); cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; @@ -41882,7 +20375,7 @@ index f620cafca633..81b8cd119cd8 100644 if (lastBlock) { break; -@@ -5906,12 +6464,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5906,12 +6569,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, dstCapacity -= cBlockSize; cctx->isFirstBlock = 0; } @@ -41899,7 +20392,7 @@ index f620cafca633..81b8cd119cd8 100644 const ZSTD_Sequence* inSeqs, size_t inSeqsSize, const void* src, size_t srcSize) { -@@ -5921,7 +6482,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci +@@ -5921,7 +6587,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci size_t frameHeaderSize = 0; /* Transparent initialization stage, same as compressStream2() */ @@ -41908,7 +20401,7 @@ index f620cafca633..81b8cd119cd8 100644 assert(cctx != NULL); FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); /* Begin writing output, starting with frame header */ -@@ -5949,26 +6510,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci +@@ -5949,26 +6615,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci cSize += 4; } @@ -41947,7 +20440,7 @@ index f620cafca633..81b8cd119cd8 100644 if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ /* single thread mode : attempt to calculate remaining to flush more precisely */ { size_t const lastBlockSize = zcs->frameEnded ? 
0 : ZSTD_BLOCKHEADERSIZE; -@@ -6090,7 +6659,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, +@@ -6090,7 +6764,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, cp.targetLength = (unsigned)(-clampedCompressionLevel); } /* refine parameters based on srcSize & dictSize */ @@ -41956,7 +20449,7 @@ index f620cafca633..81b8cd119cd8 100644 } } -@@ -6125,3 +6694,21 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH +@@ -6125,3 +6799,21 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); } @@ -41979,7 +20472,7 @@ index f620cafca633..81b8cd119cd8 100644 + } +} diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h -index 71697a11ae30..0198c8f5cac0 100644 +index 71697a11ae30..899f5e2de8e9 100644 --- a/lib/zstd/compress/zstd_compress_internal.h +++ b/lib/zstd/compress/zstd_compress_internal.h @@ -1,5 +1,6 @@ @@ -42031,7 +20524,19 @@ index 71697a11ae30..0198c8f5cac0 100644 UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; typedef struct { -@@ -228,6 +237,11 @@ struct ZSTD_matchState_t { +@@ -212,8 +221,10 @@ struct ZSTD_matchState_t { + U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ +- U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ ++ BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ ++ U64 hashSalt; /* For row-based matchFinder: salts the hash for re-use of tag table */ ++ U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ + + U32* hashTable; + U32* hashTable3; +@@ -228,6 +239,18 @@ struct ZSTD_matchState_t { const ZSTD_matchState_t* dictMatchState; ZSTD_compressionParameters cParams; const rawSeqStore_t* ldmSeqStore; @@ -42040,10 +20545,17 @@ index 71697a11ae30..0198c8f5cac0 100644 + * This behavior is controlled from the cctx ms. + * This parameter has no effect in the cdict ms. */ + int prefetchCDictTables; ++ ++ /* When == 0, lazy match finders insert every position. ++ * When != 0, lazy match finders only insert positions they search. ++ * This allows them to skip much faster over incompressible data, ++ * at a small cost to compression ratio. ++ */ ++ int lazySkipping; }; typedef struct { -@@ -324,6 +338,24 @@ struct ZSTD_CCtx_params_s { +@@ -324,6 +347,24 @@ struct ZSTD_CCtx_params_s { /* Internal use, for createCCtxParams() and freeCCtxParams() only */ ZSTD_customMem customMem; @@ -42068,7 +20580,7 @@ index 71697a11ae30..0198c8f5cac0 100644 }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) -@@ -355,6 +387,14 @@ typedef struct { +@@ -355,6 +396,14 @@ typedef struct { ZSTD_entropyCTablesMetadata_t entropyMetadata; } ZSTD_blockSplitCtx; @@ -42083,7 +20595,7 @@ index 71697a11ae30..0198c8f5cac0 100644 struct ZSTD_CCtx_s { ZSTD_compressionStage_e stage; int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. 
Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */ -@@ -404,6 +444,7 @@ struct ZSTD_CCtx_s { +@@ -404,6 +453,7 @@ struct ZSTD_CCtx_s { /* Stable in/out buffer verification */ ZSTD_inBuffer expectedInBuffer; @@ -42091,7 +20603,7 @@ index 71697a11ae30..0198c8f5cac0 100644 size_t expectedOutBufferSize; /* Dictionary */ -@@ -417,9 +458,13 @@ struct ZSTD_CCtx_s { +@@ -417,9 +467,13 @@ struct ZSTD_CCtx_s { /* Workspace for block splitter */ ZSTD_blockSplitCtx blockSplitCtx; @@ -42105,7 +20617,7 @@ index 71697a11ae30..0198c8f5cac0 100644 typedef enum { ZSTD_noDict = 0, -@@ -441,7 +486,7 @@ typedef enum { +@@ -441,7 +495,7 @@ typedef enum { * In this mode we take both the source size and the dictionary size * into account when selecting and adjusting the parameters. */ @@ -42114,7 +20626,7 @@ index 71697a11ae30..0198c8f5cac0 100644 * We don't know what these parameters are for. We default to the legacy * behavior of taking both the source size and the dict size into account * when selecting and adjusting parameters. -@@ -500,9 +545,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) +@@ -500,9 +554,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) /* ZSTD_noCompressBlock() : * Writes uncompressed block to dst buffer from given src. * Returns the size of the block */ @@ -42127,7 +20639,7 @@ index 71697a11ae30..0198c8f5cac0 100644 RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, dstSize_tooSmall, "dst buf too small for uncompressed block"); MEM_writeLE24(dst, cBlockHeader24); -@@ -510,7 +557,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi +@@ -510,7 +566,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi return ZSTD_blockHeaderSize + srcSize; } @@ -42137,7 +20649,7 @@ index 71697a11ae30..0198c8f5cac0 100644 { BYTE* const op = (BYTE*)dst; U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); -@@ -529,7 +577,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) +@@ -529,7 +586,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) { U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6; ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); @@ -42146,7 +20658,7 @@ index 71697a11ae30..0198c8f5cac0 100644 return (srcSize >> minlog) + 2; } -@@ -565,29 +613,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con +@@ -565,29 +622,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con while (ip < iend) *op++ = *ip++; } @@ -42190,7 +20702,7 @@ index 71697a11ae30..0198c8f5cac0 100644 size_t matchLength) { BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; -@@ -596,8 +642,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, +@@ -596,8 +651,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, static const BYTE* g_start = NULL; if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ { U32 const pos = (U32)((const BYTE*)literals - g_start); @@ -42201,7 +20713,7 @@ index 71697a11ae30..0198c8f5cac0 100644 } #endif assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); -@@ -607,9 +653,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, +@@ -607,9 +662,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, assert(literals + litLength <= litLimit); if (litEnd <= litLimit_w) { /* Common case we can use wildcopy. 
@@ -42214,7 +20726,7 @@ index 71697a11ae30..0198c8f5cac0 100644 ZSTD_copy16(seqStorePtr->lit, literals); if (litLength > 16) { ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); -@@ -628,7 +674,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, +@@ -628,7 +683,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, seqStorePtr->sequences[0].litLength = (U16)litLength; /* match offset */ @@ -42223,7 +20735,7 @@ index 71697a11ae30..0198c8f5cac0 100644 /* match Length */ assert(matchLength >= MINMATCH); -@@ -646,17 +692,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, +@@ -646,17 +701,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, /* ZSTD_updateRep() : * updates in-place @rep (array of repeat offsets) @@ -42246,7 +20758,7 @@ index 71697a11ae30..0198c8f5cac0 100644 if (repCode > 0) { /* note : if repCode==0, no change */ U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; rep[2] = (repCode >= 2) ? rep[1] : rep[2]; -@@ -673,11 +719,11 @@ typedef struct repcodes_s { +@@ -673,11 +728,11 @@ typedef struct repcodes_s { } repcodes_t; MEM_STATIC repcodes_t @@ -42260,7 +20772,7 @@ index 71697a11ae30..0198c8f5cac0 100644 return newReps; } -@@ -685,59 +731,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 +@@ -685,59 +740,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 /*-************************************* * Match length counter ***************************************/ @@ -42320,39 +20832,51 @@ index 71697a11ae30..0198c8f5cac0 100644 MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) { const BYTE* const pStart = pIn; -@@ -783,32 +776,36 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, +@@ -783,32 +785,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, * Hashes ***************************************/ static const U32 prime3bytes = 506832829U; -static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } -+static U32 ZSTD_hash3(U32 u, U32 h) { assert(h <= 32); return ((u << (32-24)) * prime3bytes) >> (32-h) ; } - MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ +-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ ++static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } ++MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ ++MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } static const U32 prime4bytes = 2654435761U; -static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } -static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } -+static U32 ZSTD_hash4(U32 u, U32 h) { assert(h <= 32); return (u * prime4bytes) >> (32-h) ; } -+static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h); } ++static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } ++static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } ++static size_t ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); } static const U64 prime5bytes = 889523592379ULL; 
-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } -+static size_t ZSTD_hash5(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } - static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } +-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } ++static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } static const U64 prime6bytes = 227718039650203ULL; -static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } -+static size_t ZSTD_hash6(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } - static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } +-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } ++static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } static const U64 prime7bytes = 58295818150454627ULL; -static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } -+static size_t ZSTD_hash7(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } - static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } +-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } ++static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; -static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } -+static size_t ZSTD_hash8(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u) * prime8bytes) >> (64-h)) ; } - static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } +-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } ++static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } ++ MEM_STATIC FORCE_INLINE_ATTR size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) @@ -42364,7 +20888,32 @@ index 71697a11ae30..0198c8f5cac0 100644 switch(mls) { default: -@@ -1167,10 +1164,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, +@@ -820,6 +833,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + } + } + ++MEM_STATIC FORCE_INLINE_ATTR ++size_t ZSTD_hashPtrSalted(const void* 
p, U32 hBits, U32 mls, const U64 hashSalt) { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ ++ switch(mls) ++ { ++ default: ++ case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); ++ case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); ++ case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); ++ case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); ++ case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); ++ } ++} ++ ++ + /* ZSTD_ipow() : + * Return base^exponent. + */ +@@ -1167,10 +1198,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); assert(blockEndIdx >= loadedDictEnd); @@ -42381,7 +20930,7 @@ index 71697a11ae30..0198c8f5cac0 100644 */ DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); *loadedDictEndPtr = 0; -@@ -1302,6 +1304,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) +@@ -1302,6 +1338,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) #endif @@ -42424,7 +20973,7 @@ index 71697a11ae30..0198c8f5cac0 100644 /* =============================================================== -@@ -1396,4 +1434,31 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); +@@ -1396,4 +1468,51 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); */ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); @@ -42454,6 +21003,26 @@ index 71697a11ae30..0198c8f5cac0 100644 +ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++ ++/* =============================================================== ++ * Deprecated definitions that are still used internally to avoid ++ * deprecation warnings. These functions are exactly equivalent to ++ * their public variants, but avoid the deprecation warnings. ++ * =============================================================== */ ++ ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); ++ ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ + #endif /* ZSTD_COMPRESS_H */ diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c @@ -42896,7 +21465,7 @@ index 224ece79546e..826bbc9e029b 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h -index 349fc923c355..ef5e65cfcf9a 100644 +index 349fc923c355..65ea53b62844 100644 --- a/lib/zstd/compress/zstd_cwksp.h +++ b/lib/zstd/compress/zstd_cwksp.h @@ -1,5 +1,6 @@ @@ -42907,7 +21476,246 @@ index 349fc923c355..ef5e65cfcf9a 100644 * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the -@@ -451,7 +452,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { +@@ -14,7 +15,9 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_internal.h" ++#include "../common/portability_macros.h" + + + /*-************************************* +@@ -41,8 +44,9 @@ + ***************************************/ + typedef enum { + ZSTD_cwksp_alloc_objects, +- ZSTD_cwksp_alloc_buffers, +- ZSTD_cwksp_alloc_aligned ++ ZSTD_cwksp_alloc_aligned_init_once, ++ ZSTD_cwksp_alloc_aligned, ++ ZSTD_cwksp_alloc_buffers + } ZSTD_cwksp_alloc_phase_e; + + /* +@@ -95,8 +99,8 @@ typedef enum { + * + * Workspace Layout: + * +- * [ ... workspace ... ] +- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] ++ * [ ... workspace ... ] ++ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] + * + * The various objects that live in the workspace are divided into the + * following categories, and are allocated separately: +@@ -120,9 +124,18 @@ typedef enum { + * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). + * Their sizes depend on the cparams. These tables are 64-byte aligned. + * +- * - Aligned: these buffers are used for various purposes that require 4 byte +- * alignment, but don't require any initialization before they're used. These +- * buffers are each aligned to 64 bytes. ++ * - Init once: these buffers require to be initialized at least once before ++ * use. They should be used when we want to skip memory initialization ++ * while not triggering memory checkers (like Valgrind) when reading from ++ * from this memory without writing to it first. ++ * These buffers should be used carefully as they might contain data ++ * from previous compressions. ++ * Buffers are aligned to 64 bytes. ++ * ++ * - Aligned: these buffers don't require any initialization before they're ++ * used. The user of the buffer should make sure they write into a buffer ++ * location before reading from it. ++ * Buffers are aligned to 64 bytes. + * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can +@@ -134,8 +147,9 @@ typedef enum { + * correctly packed into the workspace buffer. That order is: + * + * 1. Objects +- * 2. Buffers +- * 3. Aligned/Tables ++ * 2. Init once / Tables ++ * 3. Aligned / Tables ++ * 4. Buffers / Tables + * + * Attempts to reserve objects of different types out of order will fail. 
+ */ +@@ -147,6 +161,7 @@ typedef struct { + void* tableEnd; + void* tableValidEnd; + void* allocStart; ++ void* initOnceStart; + + BYTE allocFailed; + int workspaceOversizedDuration; +@@ -159,6 +174,7 @@ typedef struct { + ***************************************/ + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); + + MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + (void)ws; +@@ -168,6 +184,8 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + assert(ws->tableEnd <= ws->allocStart); + assert(ws->tableValidEnd <= ws->allocStart); + assert(ws->allocStart <= ws->workspaceEnd); ++ assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); ++ assert(ws->workspace <= ws->initOnceStart); + } + + /* +@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { + * for internal purposes (currently only alignment). + */ + MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { +- /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes +- * to align the beginning of tables section, as well as another n_2=[0, 63] bytes +- * to align the beginning of the aligned section. +- * +- * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and +- * aligneds being sized in multiples of 64 bytes. ++ /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES ++ * bytes to align the beginning of tables section and end of buffers; + */ +- size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; ++ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; + return slackSpace; + } + +@@ -230,10 +244,18 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; + assert((alignBytes & alignBytesMask) == 0); +- assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); ++ assert(bytes < alignBytes); + return bytes; + } + ++/* ++ * Returns the initial value for allocStart which is used to determine the position from ++ * which we can allocate from the end of the workspace. ++ */ ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) { ++ return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1)); ++} ++ + /* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, +@@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + { + assert(phase >= ws->phase); + if (phase > ws->phase) { +- /* Going from allocating objects to allocating buffers */ +- if (ws->phase < ZSTD_cwksp_alloc_buffers && +- phase >= ZSTD_cwksp_alloc_buffers) { ++ /* Going from allocating objects to allocating initOnce / tables */ ++ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && ++ phase >= ZSTD_cwksp_alloc_aligned_init_once) { + ws->tableValidEnd = ws->objectEnd; +- } ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + +- /* Going from allocating buffers to allocating aligneds/tables */ +- if (ws->phase < ZSTD_cwksp_alloc_aligned && +- phase >= ZSTD_cwksp_alloc_aligned) { +- { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. 
*/ +- size_t const bytesToAlign = +- ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); +- DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); +- ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ +- RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), +- memory_allocation, "aligned phase - alignment initial allocation failed!"); +- } + { /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */ +- void* const alloc = ws->objectEnd; ++ void *const alloc = ws->objectEnd; + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); +- void* const objectEnd = (BYTE*)alloc + bytesToAlign; ++ void *const objectEnd = (BYTE *) alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, + "table phase - alignment initial allocation failed!"); +@@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + ws->tableEnd = objectEnd; /* table area starts being empty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; +- } } } ++ } ++ } ++ } + ws->phase = phase; + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + */ + MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) + { +- return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); ++ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); + } + + /* +@@ -343,6 +356,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) + return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); + } + ++/* ++ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ * This memory has been initialized at least once in the past. ++ * This doesn't mean it has been initialized this time, and it might contain data from previous ++ * operations. ++ * The main usage is for algorithms that might need read access into uninitialized memory. ++ * The algorithm must maintain safety under these conditions and must make sure it doesn't ++ * leak any of the past data (directly or in side channels). ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) ++{ ++ size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); ++ void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ if(ptr && ptr < ws->initOnceStart) { ++ /* We assume the memory following the current allocation is either: ++ * 1. Not usable as initOnce memory (end of workspace) ++ * 2. Another initOnce buffer that has been allocated before (and so was previously memset) ++ * 3. An ASAN redzone, in which case we don't want to write on it ++ * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. ++ * Note that we assume here that MSAN and ASAN cannot run in the same time. 
*/ ++ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); ++ ws->initOnceStart = ptr; ++ } ++ return ptr; ++} ++ + /* + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). + */ +@@ -361,13 +401,17 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) + */ + MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + { +- const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; ++ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; + void* alloc; + void* end; + void* top; + +- if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { +- return NULL; ++ /* We can only start allocating tables after we are done reserving space for objects at the ++ * start of the workspace */ ++ if(ws->phase < phase) { ++ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { ++ return NULL; ++ } + } + alloc = ws->tableEnd; + end = (BYTE *)alloc + bytes; +@@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { assert(ws->tableValidEnd >= ws->objectEnd); assert(ws->tableValidEnd <= ws->allocStart); if (ws->tableValidEnd < ws->tableEnd) { @@ -42916,6 +21724,51 @@ index 349fc923c355..ef5e65cfcf9a 100644 } ZSTD_cwksp_mark_tables_clean(ws); } +@@ -478,10 +522,10 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { + + + ws->tableEnd = ws->objectEnd; +- ws->allocStart = ws->workspaceEnd; ++ ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); + ws->allocFailed = 0; +- if (ws->phase > ZSTD_cwksp_alloc_buffers) { +- ws->phase = ZSTD_cwksp_alloc_buffers; ++ if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { ++ ws->phase = ZSTD_cwksp_alloc_aligned_init_once; + } + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -498,6 +542,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c + ws->workspaceEnd = (BYTE*)start + size; + ws->objectEnd = ws->workspace; + ws->tableValidEnd = ws->objectEnd; ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + ws->phase = ZSTD_cwksp_alloc_objects; + ws->isStatic = isStatic; + ZSTD_cwksp_clear(ws); +@@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + * Returns if the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, +- size_t const estimatedSpace, int resizedWorkspace) { +- if (resizedWorkspace) { +- /* Resized/newly allocated wksp should have exact bounds */ +- return ZSTD_cwksp_used(ws) == estimatedSpace; +- } else { +- /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes +- * than estimatedSpace. See the comments in zstd_cwksp.h for details. 
+- */ +- return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); +- } ++MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { ++ /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice ++ * the alignment bytes difference between estimation and actual usage */ ++ return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && ++ ZSTD_cwksp_used(ws) <= estimatedSpace; + } + + diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c index 76933dea2624..ab9440a99603 100644 --- a/lib/zstd/compress/zstd_double_fast.c @@ -44057,7 +22910,7 @@ index fddc2f532d21..e64d9e1b2d39 100644 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c -index 0298a01a7504..83727cd46f91 100644 +index 0298a01a7504..f6b4978ceba7 100644 --- a/lib/zstd/compress/zstd_lazy.c +++ b/lib/zstd/compress/zstd_lazy.c @@ -1,5 +1,6 @@ @@ -44068,15 +22921,17 @@ index 0298a01a7504..83727cd46f91 100644 * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the -@@ -10,6 +11,7 @@ +@@ -10,6 +11,9 @@ #include "zstd_compress_internal.h" #include "zstd_lazy.h" +#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ ++ ++#define kLazySkippingStep 8 /*-************************************* -@@ -197,8 +199,8 @@ ZSTD_DUBT_findBetterDictMatch ( +@@ -197,8 +201,8 @@ ZSTD_DUBT_findBetterDictMatch ( U32 matchIndex = dictMatchIndex + dictIndexDelta; if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", @@ -44087,7 +22942,7 @@ index 0298a01a7504..83727cd46f91 100644 } if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ break; /* drop, to guarantee consistency (miss a little bit of compression) */ -@@ -218,7 +220,7 @@ ZSTD_DUBT_findBetterDictMatch ( +@@ -218,7 +222,7 @@ ZSTD_DUBT_findBetterDictMatch ( } if (bestLength >= MINMATCH) { @@ -44096,7 +22951,7 @@ index 0298a01a7504..83727cd46f91 100644 DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", curr, (U32)bestLength, (U32)*offsetPtr, mIndex); } -@@ -230,7 +232,7 @@ ZSTD_DUBT_findBetterDictMatch ( +@@ -230,7 +234,7 @@ ZSTD_DUBT_findBetterDictMatch ( static size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, @@ -44105,7 +22960,7 @@ index 0298a01a7504..83727cd46f91 100644 U32 const mls, const ZSTD_dictMode_e dictMode) { -@@ -327,8 +329,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, +@@ -327,8 +331,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, if (matchLength > bestLength) { if (matchLength > matchEndIdx - matchIndex) matchEndIdx = matchIndex + (U32)matchLength; @@ -44116,7 +22971,7 @@ index 0298a01a7504..83727cd46f91 100644 if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ if (dictMode == ZSTD_dictMatchState) { nbCompares = 0; /* in addition to avoiding checking any -@@ -361,16 +363,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, +@@ -361,16 +365,16 @@ 
ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, if (dictMode == ZSTD_dictMatchState && nbCompares) { bestLength = ZSTD_DUBT_findBetterDictMatch( ms, ip, iend, @@ -44136,7 +22991,7 @@ index 0298a01a7504..83727cd46f91 100644 } return bestLength; } -@@ -381,14 +383,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, +@@ -381,14 +385,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, FORCE_INLINE_TEMPLATE size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, @@ -44153,7 +23008,7 @@ index 0298a01a7504..83727cd46f91 100644 } /* ********************************* -@@ -561,7 +563,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb +@@ -561,7 +565,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb /* save best solution */ if (currentMl > ml) { ml = currentMl; @@ -44162,7 +23017,7 @@ index 0298a01a7504..83727cd46f91 100644 if (ip+currentMl == iLimit) { /* best possible, avoids read overflow on next attempt */ return ml; -@@ -598,7 +600,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb +@@ -598,7 +602,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb /* save best solution */ if (currentMl > ml) { ml = currentMl; @@ -44171,7 +23026,43 @@ index 0298a01a7504..83727cd46f91 100644 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } } -@@ -691,7 +693,8 @@ size_t ZSTD_HcFindBestMatch( +@@ -617,7 +621,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, +- const BYTE* ip, U32 const mls) ++ const BYTE* ip, U32 const mls, U32 const lazySkipping) + { + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; +@@ -632,6 +636,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; ++ /* Stop inserting every position when in the lazy skipping mode. 
*/ ++ if (lazySkipping) ++ break; + } + + ms->nextToUpdate = target; +@@ -640,7 +647,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); ++ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); + } + + /* inlining is important to hardwire a hot branch (template emulation) */ +@@ -684,14 +691,15 @@ size_t ZSTD_HcFindBestMatch( + } + + /* HC4 match finder */ +- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); ++ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); + + for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { const BYTE* const match = base + matchIndex; assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ @@ -44181,7 +23072,7 @@ index 0298a01a7504..83727cd46f91 100644 currentMl = ZSTD_count(ip, match, iLimit); } else { const BYTE* const match = dictBase + matchIndex; -@@ -703,7 +706,7 @@ size_t ZSTD_HcFindBestMatch( +@@ -703,7 +711,7 @@ size_t ZSTD_HcFindBestMatch( /* save best solution */ if (currentMl > ml) { ml = currentMl; @@ -44190,7 +23081,7 @@ index 0298a01a7504..83727cd46f91 100644 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } -@@ -739,7 +742,7 @@ size_t ZSTD_HcFindBestMatch( +@@ -739,7 +747,7 @@ size_t ZSTD_HcFindBestMatch( if (currentMl > ml) { ml = currentMl; assert(curr > matchIndex + dmsIndexDelta); @@ -44199,15 +23090,16 @@ index 0298a01a7504..83727cd46f91 100644 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } -@@ -757,7 +760,6 @@ size_t ZSTD_HcFindBestMatch( +@@ -756,8 +764,6 @@ size_t ZSTD_HcFindBestMatch( + * (SIMD) Row-based matchfinder ***********************************/ /* Constants for row-based hash */ - #define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +-#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ -#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ -@@ -769,29 +771,8 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr +@@ -769,64 +775,19 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr * Starting from the LSB, returns the idx of the next non-zero bit. * Basically counting the nb of trailing zeroes. */ @@ -44234,12 +23126,122 @@ index 0298a01a7504..83727cd46f91 100644 - val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); - return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); -# endif +-} +- +-/* ZSTD_rotateRight_*(): +- * Rotates a bitfield to the right by "count" bits. 
+- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts +- */ +-FORCE_INLINE_TEMPLATE +-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { +- assert(count < 64); +- count &= 0x3F; /* for fickle pattern recognition */ +- return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { +- assert(count < 32); +- count &= 0x1F; /* for fickle pattern recognition */ +- return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { +- assert(count < 16); +- count &= 0x0F; /* for fickle pattern recognition */ +- return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); +MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { + return ZSTD_countTrailingZeros64(val); } - /* ZSTD_rotateRight_*(): -@@ -971,7 +952,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { + /* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" +- * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) ++ * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) + */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { +- U32 const next = (*tagRow - 1) & rowMask; +- *tagRow = (BYTE)next; +- return next; ++ U32 next = (*tagRow-1) & rowMask; ++ next += (next == 0) ? rowMask : 0; /* skip first position */ ++ *tagRow = (BYTE)next; ++ return next; + } + + /* ZSTD_isAligned(): +@@ -840,7 +801,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + /* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { ++FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); +@@ -864,13 +825,13 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B + U32 idx, const BYTE* const iLimit) + { + U32 const* const hashTable = ms->hashTable; +- U16 const* const tagTable = ms->tagTable; ++ BYTE const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { +- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; +@@ -886,11 +847,12 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. 
+ */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, +- U16 const* tagTable, BYTE const* base, ++ BYTE const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, +- U32 const rowLog, U32 const mls) ++ U32 const rowLog, U32 const mls, ++ U64 const hashSalt) + { +- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; +@@ -908,22 +870,21 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, + U32 const rowMask, U32 const useCache) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { +- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) +- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) ++ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; +- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. +- Explicit cast allows us to get exact desired position within each row */ ++ BYTE* tagRow = tagTable + relRow; + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + +- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); +- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; ++ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); ++ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = updateStartIdx; + } + } +@@ -971,7 +932,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); @@ -44276,7 +23278,7 @@ index 0298a01a7504..83727cd46f91 100644 } #if defined(ZSTD_ARCH_X86_SSE2) -@@ -994,71 +1003,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U +@@ -994,71 +983,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U } #endif @@ -44285,8 +23287,7 @@ index 0298a01a7504..83727cd46f91 100644 - * Each row is a circular buffer beginning at the value of "head". 
So we must rotate the "matches" bitfield - * to match up with the actual layout of the entries within the hashTable */ +#if defined(ZSTD_ARCH_ARM_NEON) - FORCE_INLINE_TEMPLATE ZSTD_VecMask --ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) ++FORCE_INLINE_TEMPLATE ZSTD_VecMask +ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) +{ + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); @@ -44339,10 +23340,12 @@ index 0298a01a7504..83727cd46f91 100644 + * Each row is a circular buffer beginning at the value of "headGrouped". So we + * must rotate the "matches" bitfield to match up with the actual layout of the + * entries within the hashTable */ -+FORCE_INLINE_TEMPLATE ZSTD_VecMask + FORCE_INLINE_TEMPLATE ZSTD_VecMask +-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) +ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) { - const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; +- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; ++ const BYTE* const src = tagRow; assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); + assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); @@ -44407,7 +23410,7 @@ index 0298a01a7504..83727cd46f91 100644 const size_t shiftAmount = ((chunkSize * 8) - chunkSize); const size_t xFF = ~((size_t)0); const size_t x01 = xFF / 0xFF; -@@ -1091,11 +1111,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, +@@ -1091,11 +1091,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, } matches = ~matches; if (rowEntries == 16) { @@ -44422,15 +23425,56 @@ index 0298a01a7504..83727cd46f91 100644 } } #endif -@@ -1143,6 +1163,7 @@ size_t ZSTD_RowFindBestMatch( +@@ -1125,7 +1125,7 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowLog) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -1143,8 +1143,11 @@ size_t ZSTD_RowFindBestMatch( const U32 rowEntries = (1U << rowLog); const U32 rowMask = rowEntries - 1; const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ + const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); ++ const U64 hashSalt = ms->hashSalt; U32 nbAttempts = 1U << cappedSearchLog; size_t ml=4-1; ++ U32 hash; -@@ -1185,15 +1206,15 @@ size_t ZSTD_RowFindBestMatch( + /* DMS/DDS variables that may be referenced laster */ + const ZSTD_matchState_t* const dms = ms->dictMatchState; +@@ -1168,7 +1171,7 @@ size_t ZSTD_RowFindBestMatch( + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; +- U16* const dmsTagTable = dms->tagTable; ++ BYTE* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; +@@ -1178,23 +1181,34 @@ size_t ZSTD_RowFindBestMatch( + } + + /* Update the hashTable and tagTable up to (but not including) ip */ +- 
ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ if (!ms->lazySkipping) { ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); ++ } else { ++ /* Stop inserting every position when in the lazy skipping mode. ++ * The hash cache is also not kept up to date in this mode. ++ */ ++ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); ++ ms->nextToUpdate = curr; ++ } ++ ms->hashSaltEntropy += hash; /* collect salt entropy */ ++ + { /* Get the hash for ip, compute the appropriate row */ +- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; U32* const row = hashTable + relRow; BYTE* tagRow = (BYTE*)(tagTable + relRow); @@ -44443,13 +23487,32 @@ index 0298a01a7504..83727cd46f91 100644 + ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); /* Cycle through the matches and prefetch */ - for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { - U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { + U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; U32 const matchIndex = row[matchPos]; ++ if(matchPos == 0) continue; assert(numMatches < rowEntries); if (matchIndex < lowLimit) -@@ -1224,7 +1245,8 @@ size_t ZSTD_RowFindBestMatch( + break; +@@ -1204,13 +1218,14 @@ size_t ZSTD_RowFindBestMatch( + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. 
*/ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); +- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; ++ tagRow[pos] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + +@@ -1224,7 +1239,8 @@ size_t ZSTD_RowFindBestMatch( if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { const BYTE* const match = base + matchIndex; assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ @@ -44459,7 +23522,7 @@ index 0298a01a7504..83727cd46f91 100644 currentMl = ZSTD_count(ip, match, iLimit); } else { const BYTE* const match = dictBase + matchIndex; -@@ -1236,7 +1258,7 @@ size_t ZSTD_RowFindBestMatch( +@@ -1236,7 +1252,7 @@ size_t ZSTD_RowFindBestMatch( /* Save best solution */ if (currentMl > ml) { ml = currentMl; @@ -44468,7 +23531,7 @@ index 0298a01a7504..83727cd46f91 100644 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } } -@@ -1254,14 +1276,14 @@ size_t ZSTD_RowFindBestMatch( +@@ -1254,19 +1270,21 @@ size_t ZSTD_RowFindBestMatch( const U32 dmsSize = (U32)(dmsEnd - dmsBase); const U32 dmsIndexDelta = dictLimit - dmsSize; @@ -44480,13 +23543,21 @@ index 0298a01a7504..83727cd46f91 100644 - ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); + ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); - for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { - U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { + U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; U32 const matchIndex = dmsRow[matchPos]; ++ if(matchPos == 0) continue; if (matchIndex < dmsLowestIndex) break; -@@ -1285,7 +1307,7 @@ size_t ZSTD_RowFindBestMatch( + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Return the longest match */ +@@ -1285,7 +1303,7 @@ size_t ZSTD_RowFindBestMatch( if (currentMl > ml) { ml = currentMl; assert(curr > matchIndex + dmsIndexDelta); @@ -44495,7 +23566,7 @@ index 0298a01a7504..83727cd46f91 100644 if (ip+currentMl == iLimit) break; } } -@@ -1491,7 +1513,8 @@ ZSTD_compressBlock_lazy_generic( +@@ -1491,7 +1509,8 @@ ZSTD_compressBlock_lazy_generic( const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); @@ -44505,7 +23576,7 @@ index 0298a01a7504..83727cd46f91 100644 const int isDMS = dictMode == ZSTD_dictMatchState; const int isDDS = dictMode == ZSTD_dedicatedDictSearch; -@@ -1512,8 +1535,8 @@ ZSTD_compressBlock_lazy_generic( +@@ -1512,8 +1531,8 @@ ZSTD_compressBlock_lazy_generic( U32 const curr = (U32)(ip - base); U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); U32 const maxRep = curr - windowLow; @@ -44516,7 +23587,22 @@ index 0298a01a7504..83727cd46f91 100644 } if (isDxS) { /* dictMatchState repCode checks don't currently handle repCode == 0 -@@ -1537,7 +1560,7 @@ ZSTD_compressBlock_lazy_generic( +@@ -1522,10 +1541,11 @@ ZSTD_compressBlock_lazy_generic( + assert(offset_2 <= dictAndPrefixLength); + } + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ 
ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1537,7 +1557,7 @@ ZSTD_compressBlock_lazy_generic( #endif while (ip < ilimit) { size_t matchLength=0; @@ -44525,7 +23611,7 @@ index 0298a01a7504..83727cd46f91 100644 const BYTE* start=ip+1; DEBUGLOG(7, "search baseline (depth 0)"); -@@ -1562,10 +1585,10 @@ ZSTD_compressBlock_lazy_generic( +@@ -1562,14 +1582,23 @@ ZSTD_compressBlock_lazy_generic( } /* first search (depth 0) */ @@ -44539,7 +23625,21 @@ index 0298a01a7504..83727cd46f91 100644 } if (matchLength < 4) { -@@ -1579,12 +1602,12 @@ ZSTD_compressBlock_lazy_generic( +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */; ++ ip += step; ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1579,12 +1608,12 @@ ZSTD_compressBlock_lazy_generic( DEBUGLOG(7, "search depth 1"); ip ++; if ( (dictMode == ZSTD_noDict) @@ -44555,7 +23655,7 @@ index 0298a01a7504..83727cd46f91 100644 } if (isDxS) { const U32 repIndex = (U32)(ip - base) - offset_1; -@@ -1596,17 +1619,17 @@ ZSTD_compressBlock_lazy_generic( +@@ -1596,17 +1625,17 @@ ZSTD_compressBlock_lazy_generic( const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 = (int)(mlRep * 3); @@ -44580,7 +23680,7 @@ index 0298a01a7504..83727cd46f91 100644 continue; /* search a better one */ } } -@@ -1615,12 +1638,12 @@ ZSTD_compressBlock_lazy_generic( +@@ -1615,12 +1644,12 @@ ZSTD_compressBlock_lazy_generic( DEBUGLOG(7, "search depth 2"); ip ++; if ( (dictMode == ZSTD_noDict) @@ -44596,7 +23696,7 @@ index 0298a01a7504..83727cd46f91 100644 } if (isDxS) { const U32 repIndex = (U32)(ip - base) - offset_1; -@@ -1632,17 +1655,17 @@ ZSTD_compressBlock_lazy_generic( +@@ -1632,17 +1661,17 @@ ZSTD_compressBlock_lazy_generic( const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 = (int)(mlRep * 4); @@ -44621,7 +23721,7 @@ index 0298a01a7504..83727cd46f91 100644 continue; } } } break; /* nothing found : store previous solution */ -@@ -1653,24 +1676,24 @@ ZSTD_compressBlock_lazy_generic( +@@ -1653,26 +1682,33 @@ ZSTD_compressBlock_lazy_generic( * notably if `value` is unsigned, resulting in a large positive `-value`. */ /* catch up */ @@ -44651,8 +23751,17 @@ index 0298a01a7504..83727cd46f91 100644 + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); anchor = ip = start + matchLength; } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. 
*/ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } -@@ -1686,8 +1709,8 @@ ZSTD_compressBlock_lazy_generic( + /* check immediate repcode */ + if (isDxS) { +@@ -1686,8 +1722,8 @@ ZSTD_compressBlock_lazy_generic( && (MEM_read32(repMatch) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; @@ -44663,7 +23772,7 @@ index 0298a01a7504..83727cd46f91 100644 ip += matchLength; anchor = ip; continue; -@@ -1701,16 +1724,20 @@ ZSTD_compressBlock_lazy_generic( +@@ -1701,16 +1737,20 @@ ZSTD_compressBlock_lazy_generic( && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { /* store sequence */ matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; @@ -44689,7 +23798,24 @@ index 0298a01a7504..83727cd46f91 100644 /* Return the last literals size */ return (size_t)(iend - anchor); -@@ -1903,7 +1930,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( +@@ -1886,12 +1926,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + /* init */ + ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1903,7 +1944,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( #endif while (ip < ilimit) { size_t matchLength=0; @@ -44698,7 +23824,7 @@ index 0298a01a7504..83727cd46f91 100644 const BYTE* start=ip+1; U32 curr = (U32)(ip-base); -@@ -1922,10 +1949,10 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( +@@ -1922,14 +1963,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( } } /* first search (depth 0) */ @@ -44712,7 +23838,21 @@ index 0298a01a7504..83727cd46f91 100644 } if (matchLength < 4) { -@@ -1939,7 +1966,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); ++ ip += step + 1; /* jump faster over incompressible sections */ ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1939,7 +1989,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( ip ++; curr++; /* check repCode */ @@ -44721,7 +23861,7 @@ index 0298a01a7504..83727cd46f91 100644 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); const U32 repIndex = (U32)(curr - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; -@@ -1951,18 +1978,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( +@@ -1951,18 +2001,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 = (int)(repLength * 3); @@ -44747,7 +23887,7 @@ index 0298a01a7504..83727cd46f91 100644 continue; /* search a better one */ } } -@@ -1971,7 +1998,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( +@@ -1971,7 +2021,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( ip ++; curr++; /* check repCode */ @@ -44756,7 +23896,7 @@ index 0298a01a7504..83727cd46f91 100644 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); const U32 repIndex = (U32)(curr - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; -@@ -1983,36 +2010,36 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( +@@ -1983,38 +2033,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 = (int)(repLength * 4); @@ -44803,8 +23943,17 @@ index 0298a01a7504..83727cd46f91 100644 + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); anchor = ip = start + matchLength; } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } -@@ -2029,8 +2056,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* check immediate repcode */ + while (ip <= ilimit) { +@@ -2029,8 +2086,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( /* repcode detected we should take it */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; @@ -44815,7 +23964,7 @@ index 0298a01a7504..83727cd46f91 100644 ip += matchLength; anchor = ip; continue; /* faster when present ... (?) */ -@@ -2096,7 +2123,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row( +@@ -2096,7 +2153,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row( size_t ZSTD_compressBlock_lazy2_extDict_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) @@ -44922,7 +24071,7 @@ index 647f865be290..cfccfc46f6f7 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c -index fd82acfda62f..a6bf7f856437 100644 +index fd82acfda62f..1e41cb04f482 100644 --- a/lib/zstd/compress/zstd_opt.c +++ b/lib/zstd/compress/zstd_opt.c @@ -1,5 +1,6 @@ @@ -45281,7 +24430,16 @@ index fd82acfda62f..a6bf7f856437 100644 (*nbMatches)++; } } -@@ -1098,14 +1123,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, +@@ -1062,6 +1087,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + ZSTD_optimal_t lastSequence; + ZSTD_optLdm_t optLdm; + ++ ZSTD_memset(&lastSequence, 0, sizeof(ZSTD_optimal_t)); ++ + optLdm.seqStore = ms->ldmSeqStore ? 
*ms->ldmSeqStore : kNullRawSeqStore; + optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; + ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); +@@ -1098,14 +1125,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, /* large match -> immediate encoding */ { U32 const maxML = matches[nbMatches-1].len; @@ -45300,7 +24458,7 @@ index fd82acfda62f..a6bf7f856437 100644 DEBUGLOG(6, "large match (%u>%u), immediate encoding", maxML, sufficient_len); cur = 0; -@@ -1122,15 +1147,15 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, +@@ -1122,15 +1149,15 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ } for (matchNb = 0; matchNb < nbMatches; matchNb++) { @@ -45320,7 +24478,7 @@ index fd82acfda62f..a6bf7f856437 100644 opt[pos].litlen = litlen; opt[pos].price = (int)sequencePrice; } } -@@ -1230,7 +1255,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, +@@ -1230,7 +1257,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch; U32 mlen; @@ -45329,7 +24487,7 @@ index fd82acfda62f..a6bf7f856437 100644 matchNb, matches[matchNb].off, lastML, litlen); for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ -@@ -1296,7 +1321,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, +@@ -1296,7 +1323,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, for (storePos=storeStart; storePos <= storeEnd; storePos++) { U32 const llen = opt[storePos].litlen; U32 const mlen = opt[storePos].mlen; @@ -45338,7 +24496,7 @@ index fd82acfda62f..a6bf7f856437 100644 U32 const advance = llen + mlen; DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", anchor - istart, (unsigned)llen, (unsigned)mlen); -@@ -1308,8 +1333,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, +@@ -1308,8 +1335,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, } assert(anchor + llen <= iend); @@ -45349,7 +24507,7 @@ index fd82acfda62f..a6bf7f856437 100644 anchor += advance; ip = anchor; } } -@@ -1349,7 +1374,7 @@ size_t ZSTD_compressBlock_btopt( +@@ -1349,7 +1376,7 @@ size_t ZSTD_compressBlock_btopt( /* ZSTD_initStats_ultra(): * make a first compression pass, just to seed stats with more accurate starting values. * only works on first block, with no dictionary and no ldm. 
@@ -45358,7 +24516,7 @@ index fd82acfda62f..a6bf7f856437 100644 */ static void ZSTD_initStats_ultra(ZSTD_matchState_t* ms, -@@ -1368,7 +1393,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, +@@ -1368,7 +1395,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ @@ -45367,7 +24525,7 @@ index fd82acfda62f..a6bf7f856437 100644 ZSTD_resetSeqStore(seqStore); ms->window.base -= srcSize; ms->window.dictLimit += (U32)srcSize; -@@ -1392,20 +1417,20 @@ size_t ZSTD_compressBlock_btultra2( +@@ -1392,20 +1419,20 @@ size_t ZSTD_compressBlock_btultra2( U32 const curr = (U32)((const BYTE*)src - ms->window.base); DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); @@ -45407,7 +24565,7 @@ index 22b862858ba7..faa73ff4b03d 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c -index 60958afebc41..28a036f7543b 100644 +index 60958afebc41..d172e35fbd9a 100644 --- a/lib/zstd/decompress/huf_decompress.c +++ b/lib/zstd/decompress/huf_decompress.c @@ -1,7 +1,8 @@ @@ -45779,7 +24937,7 @@ index 60958afebc41..28a036f7543b 100644 + + /* Copy the arguments to local variables */ + ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); -+ ZSTD_memcpy(&ip, &args->ip, sizeof(ip)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); + ZSTD_memcpy(&op, &args->op, sizeof(op)); + + assert(MEM_isLittleEndian()); @@ -45863,7 +25021,7 @@ index 60958afebc41..28a036f7543b 100644 -static HUF_ASM_X86_64_BMI2_ATTRS + /* Save the final values of each of the state variables back to args. */ + ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); -+ ZSTD_memcpy(&args->ip, &ip, sizeof(ip)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); + ZSTD_memcpy(&args->op, &op, sizeof(op)); +} + @@ -46128,7 +25286,7 @@ index 60958afebc41..28a036f7543b 100644 + + /* Copy the arguments to local registers. */ + ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); -+ ZSTD_memcpy(&ip, &args->ip, sizeof(ip)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); + ZSTD_memcpy(&op, &args->op, sizeof(op)); + + oend[0] = op[1]; @@ -46251,7 +25409,7 @@ index 60958afebc41..28a036f7543b 100644 + + /* Save the final values of each of the state variables back to args. */ + ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); -+ ZSTD_memcpy(&args->ip, &ip, sizeof(ip)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); + ZSTD_memcpy(&args->op, &op, sizeof(op)); +} + @@ -46598,7 +25756,7 @@ index 60958afebc41..28a036f7543b 100644 } - diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c -index dbbc7919de53..4f801e0dd564 100644 +index dbbc7919de53..30ef65e1ab5c 100644 --- a/lib/zstd/decompress/zstd_ddict.c +++ b/lib/zstd/decompress/zstd_ddict.c @@ -1,5 +1,6 @@ @@ -46609,7 +25767,13 @@ index dbbc7919de53..4f801e0dd564 100644 * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the -@@ -19,7 +20,6 @@ +@@ -14,12 +15,12 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ + #include "../common/cpu.h" /* bmi2 */ #include "../common/mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "../common/fse.h" @@ -46617,7 +25781,7 @@ index dbbc7919de53..4f801e0dd564 100644 #include "../common/huf.h" #include "zstd_decompress_internal.h" #include "zstd_ddict.h" -@@ -131,7 +131,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, +@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, ZSTD_memcpy(internalBuffer, dict, dictSize); } ddict->dictSize = dictSize; @@ -46626,7 +25790,7 @@ index dbbc7919de53..4f801e0dd564 100644 /* parse dictionary content */ FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); -@@ -237,5 +237,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) +@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) { if (ddict==NULL) return 0; @@ -46646,7 +25810,7 @@ index 8c1a79d666f8..de459a0dacd1 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c -index b9b935a9f5c0..d7eebb17a2c5 100644 +index 6b3177c94711..5e2a3ef03732 100644 --- a/lib/zstd/decompress/zstd_decompress.c +++ b/lib/zstd/decompress/zstd_decompress.c @@ -1,5 +1,6 @@ @@ -46657,7 +25821,12 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the -@@ -56,13 +57,13 @@ +@@ -52,17 +53,18 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ #include "../common/mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "../common/fse.h" @@ -46672,7 +25841,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 -@@ -72,11 +73,11 @@ +@@ -72,11 +74,11 @@ *************************************/ #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 @@ -46689,7 +25858,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 #define DDICT_HASHSET_TABLE_BASE_SIZE 64 #define DDICT_HASHSET_RESIZE_FACTOR 2 -@@ -237,6 +238,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) +@@ -237,6 +239,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) dctx->outBufferMode = ZSTD_bm_buffered; dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; @@ -46697,7 +25866,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 } static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) -@@ -421,16 +423,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) +@@ -421,16 +424,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless * @return : 0, `zfhPtr` is correctly filled, * >0, `srcSize` is too small, value is wanted `srcSize` amount, @@ -46742,7 +25911,119 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 if ( (format != ZSTD_f_zstd1_magicless) && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { -@@ -730,10 +756,11 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize +@@ -540,49 +567,52 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); +- { +- size_t const skippableSize = skippableHeaderSize + sizeU32; ++ { size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } + } + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves content of a skippable frame, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * +- * Returns an error if destination buffer is not large enough, or if the frame is not skippable. ++ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. + * + * @return : number of bytes written or a ZSTD error. 
+ */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize) ++size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, /* optional, can be NULL */ ++ const void* src, size_t srcSize) + { +- U32 const magicNumber = MEM_readLE32(src); +- size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); +- size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; +- +- /* check input validity */ +- RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); +- RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); +- RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + +- /* deliver payload */ +- if (skippableContentSize > 0 && dst != NULL) +- ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); +- if (magicVariant != NULL) +- *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; +- return skippableContentSize; ++ { U32 const magicNumber = MEM_readLE32(src); ++ size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); ++ size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; ++ ++ /* check input validity */ ++ RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); ++ RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); ++ RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ ++ /* deliver payload */ ++ if (skippableContentSize > 0 && dst != NULL) ++ ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); ++ if (magicVariant != NULL) ++ *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; ++ return skippableContentSize; ++ } + } + + /* ZSTD_findDecompressedSize() : +- * compatible with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames +- * @return : decompressed size of the frames contained */ ++ * note: compatible with legacy mode ++ * @return : decompressed size of the frames contained */ + unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + { + unsigned long long totalDstSize = 0; +@@ -592,9 +622,7 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- if (ZSTD_isError(skippableSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; +@@ -602,17 +630,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + continue; + } + +- { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); +- if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; ++ { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); ++ if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; + +- /* check for overflow */ +- if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; +- totalDstSize += ret; ++ if (totalDstSize + fcs < totalDstSize) ++ return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ ++ totalDstSize += fcs; + } ++ /* skip 
to next frame */ + { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); +- if (ZSTD_isError(frameSrcSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; ++ assert(frameSrcSize <= srcSize); + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; +@@ -730,10 +758,11 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize ip += 4; } @@ -46755,7 +26036,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 return frameSizeInfo; } } -@@ -773,6 +800,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) +@@ -773,6 +802,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) return bound; } @@ -46804,61 +26085,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 /*-************************************************************* * Frame decoding -@@ -798,7 +867,7 @@ static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity, - if (srcSize == 0) return 0; - RETURN_ERROR(dstBuffer_null, ""); - } -- ZSTD_memcpy(dst, src, srcSize); -+ ZSTD_memmove(dst, src, srcSize); - return srcSize; - } - -@@ -858,6 +927,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, - - /* Loop on each block */ - while (1) { -+ BYTE* oBlockEnd = oend; - size_t decodedSize; - blockProperties_t blockProperties; - size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSrcSize, &blockProperties); -@@ -867,16 +937,34 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, - remainingSrcSize -= ZSTD_blockHeaderSize; - RETURN_ERROR_IF(cBlockSize > remainingSrcSize, srcSize_wrong, ""); - -+ if (ip >= op && ip < oBlockEnd) { -+ /* We are decompressing in-place. Limit the output pointer so that we -+ * don't overwrite the block that we are currently reading. This will -+ * fail decompression if the input & output pointers aren't spaced -+ * far enough apart. -+ * -+ * This is important to set, even when the pointers are far enough -+ * apart, because ZSTD_decompressBlock_internal() can decide to store -+ * literals in the output buffer, after the block it is decompressing. -+ * Since we don't want anything to overwrite our input, we have to tell -+ * ZSTD_decompressBlock_internal to never write past ip. -+ * -+ * See ZSTD_allocateLiteralsBuffer() for reference. -+ */ -+ oBlockEnd = op + (ip - op); -+ } -+ - switch(blockProperties.blockType) - { - case bt_compressed: -- decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oend-op), ip, cBlockSize, /* frame */ 1, not_streaming); -+ decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming); - break; - case bt_raw : -+ /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. 
*/ - decodedSize = ZSTD_copyRawBlock(op, (size_t)(oend-op), ip, cBlockSize); - break; - case bt_rle : -- decodedSize = ZSTD_setRleBlock(op, (size_t)(oend-op), *ip, blockProperties.origSize); -+ decodedSize = ZSTD_setRleBlock(op, (size_t)(oBlockEnd-op), *ip, blockProperties.origSize); - break; - case bt_reserved : - default: -@@ -911,6 +999,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, +@@ -930,6 +1001,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, } ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); /* Allow caller to get size read */ @@ -46866,7 +26093,31 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 *srcPtr = ip; *srcSizePtr = remainingSrcSize; return (size_t)(op-ostart); -@@ -1042,8 +1131,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr +@@ -955,17 +1027,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + + +- { U32 const magicNumber = MEM_readLE32(src); +- DEBUGLOG(4, "reading magic number %08X (expecting %08X)", +- (unsigned)magicNumber, ZSTD_MAGICNUMBER); ++ if (srcSize >= 4) { ++ U32 const magicNumber = MEM_readLE32(src); ++ DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { ++ /* skippable frame detected : skip it */ + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); ++ FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; +- continue; ++ continue; /* check next frame */ + } } + + if (ddict) { +@@ -1061,8 +1134,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } /* @@ -46877,7 +26128,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 * be streamed. * * For blocks that can be streamed, this allows us to reduce the latency until we produce -@@ -1243,7 +1332,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c +@@ -1262,7 +1335,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c default: assert(0); /* impossible */ @@ -46886,7 +26137,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 } } -@@ -1284,11 +1373,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, +@@ -1303,11 +1376,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, /* in minimal huffman, we always use X1 variants */ size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, dictPtr, dictEnd - dictPtr, @@ -46900,7 +26151,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 #endif RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); dictPtr += hSize; -@@ -1384,7 +1473,7 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) +@@ -1403,7 +1476,7 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) dctx->prefixStart = NULL; dctx->virtualStart = NULL; dctx->dictEnd = NULL; @@ -46909,7 +26160,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 dctx->litEntropy = dctx->fseEntropy = 0; dctx->dictID = 0; dctx->bType = bt_reserved; -@@ -1446,7 +1535,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) +@@ -1465,7 +1538,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) * This could for one of the following reasons : * - The frame does not require a dictionary (most common case). 
* - The frame was built with dictID intentionally removed. @@ -46918,7 +26169,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 * Note : this use case also happens when using a non-conformant dictionary. * - `srcSize` is too small, and as a result, frame header could not be decoded. * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. -@@ -1455,7 +1544,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) +@@ -1474,7 +1547,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) * ZSTD_getFrameHeader(), which will provide a more precise error code. */ unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) { @@ -46927,7 +26178,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); if (ZSTD_isError(hError)) return 0; return zfp.dictID; -@@ -1562,7 +1651,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di +@@ -1581,7 +1654,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di size_t ZSTD_initDStream(ZSTD_DStream* zds) { DEBUGLOG(4, "ZSTD_initDStream"); @@ -46938,7 +26189,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 } /* ZSTD_initDStream_usingDDict() : -@@ -1570,20 +1661,12 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) +@@ -1589,20 +1664,12 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) * this function cannot fail */ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) { @@ -46960,7 +26211,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) { -@@ -1651,6 +1734,11 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) +@@ -1670,6 +1737,11 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; return bounds; @@ -46972,7 +26223,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 default:; } bounds.error = ERROR(parameter_unsupported); -@@ -1691,6 +1779,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value +@@ -1710,6 +1782,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value case ZSTD_d_refMultipleDDicts: *value = (int)dctx->refMultipleDDicts; return 0; @@ -46982,7 +26233,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 default:; } RETURN_ERROR(parameter_unsupported, ""); -@@ -1724,6 +1815,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value +@@ -1743,6 +1818,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value } dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; return 0; @@ -46993,7 +26244,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 default:; } RETURN_ERROR(parameter_unsupported, ""); -@@ -1899,7 +1994,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -1918,7 +1997,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB if (zds->refMultipleDDicts && zds->ddictSet) { ZSTD_DCtx_selectFrameDDict(zds); } @@ -47001,7 +26252,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 if (ZSTD_isError(hSize)) { return hSize; /* error */ } -@@ -1913,6 +2007,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -1932,6 +2010,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB zds->lhSize += remainingInput; } input->pos = input->size; @@ -47013,7 +26264,7 @@ index 
b9b935a9f5c0..d7eebb17a2c5 100644 return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ } assert(ip != NULL); -@@ -1930,8 +2029,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -1949,8 +2032,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); if (ZSTD_isError(decompressedSize)) return decompressedSize; DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") @@ -47024,7 +26275,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 zds->expected = 0; zds->streamStage = zdss_init; someMoreWork = 0; -@@ -2015,6 +2115,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -2034,6 +2118,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB } if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); @@ -47032,7 +26283,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 ip += neededInSize; /* Function modifies the stage so we must break */ break; -@@ -2029,7 +2130,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -2048,7 +2133,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB int const isSkipFrame = ZSTD_isSkipFrame(zds); size_t loadedSize; /* At this point we shouldn't be decompressing a block that we can stream. */ @@ -47041,7 +26292,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 if (isSkipFrame) { loadedSize = MIN(toLoad, (size_t)(iend-ip)); } else { -@@ -2038,8 +2139,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -2057,8 +2142,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB "should never happen"); loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); } @@ -47055,7 +26306,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ /* decode loaded input */ -@@ -2049,14 +2153,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -2068,14 +2156,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB break; } case zdss_flush: @@ -47076,7 +26327,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", (int)(zds->outBuffSize - zds->outStart), (U32)zds->fParams.blockSizeMax); -@@ -2070,7 +2177,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -2089,7 +2180,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB default: assert(0); /* impossible */ @@ -47085,7 +26336,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 } } /* result */ -@@ -2083,8 +2190,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -2102,8 +2193,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB if ((ip==istart) && (op==ostart)) { /* no forward progress */ zds->noForwardProgress ++; if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { @@ -47096,7 +26347,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 assert(0); } } else { -@@ -2121,11 +2228,17 @@ 
size_t ZSTD_decompressStream_simpleArgs ( +@@ -2140,11 +2231,17 @@ size_t ZSTD_decompressStream_simpleArgs ( void* dst, size_t dstCapacity, size_t* dstPos, const void* src, size_t srcSize, size_t* srcPos) { @@ -47122,7 +26373,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 + } } diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c -index c1913b8e7c89..ffbe53ba0346 100644 +index c1913b8e7c89..9f5577e5bc19 100644 --- a/lib/zstd/decompress/zstd_decompress_block.c +++ b/lib/zstd/decompress/zstd_decompress_block.c @@ -1,5 +1,6 @@ @@ -47337,7 +26588,7 @@ index c1913b8e7c89..ffbe53ba0346 100644 seq.matchLength = mlDInfo->baseValue; seq.litLength = llDInfo->baseValue; { U32 const ofBase = ofDInfo->baseValue; -@@ -1186,9 +1221,13 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) +@@ -1186,28 +1221,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) U32 const llnbBits = llDInfo->nbBits; U32 const mlnbBits = mlDInfo->nbBits; U32 const ofnbBits = ofDInfo->nbBits; @@ -47352,8 +26603,13 @@ index c1913b8e7c89..ffbe53ba0346 100644 * performance. */ -@@ -1201,13 +1240,16 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) - #endif + /* sequence */ + { size_t offset; +- #if defined(__clang__) +- if (LIKELY(ofBits > 1)) { +- #else + if (ofBits > 1) { +- #endif ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); - assert(ofBits <= MaxOff); @@ -47373,7 +26629,31 @@ index c1913b8e7c89..ffbe53ba0346 100644 } else { offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); -@@ -1552,7 +1594,7 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, +@@ -1232,11 +1270,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + seq.offset = offset; + } + +- #if defined(__clang__) +- if (UNLIKELY(mlBits > 0)) +- #else + if (mlBits > 0) +- #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) +@@ -1246,11 +1280,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + +- #if defined(__clang__) +- if (UNLIKELY(llBits > 0)) +- #else + if (llBits > 0) +- #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) +@@ -1552,7 +1582,7 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); @@ -47382,12 +26662,12 @@ index c1913b8e7c89..ffbe53ba0346 100644 (void)frame; /* Regen sequences */ -@@ -1945,34 +1987,79 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, +@@ -1945,34 +1975,79 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ +/* -+ * @returns The total size of the history referencable by zstd, including ++ * @returns The total size of the history referenceable by zstd, including + * both the prefix and the extDict. At @p op any offset larger than this + * is invalid. 
+ */ @@ -47395,15 +26675,15 @@ index c1913b8e7c89..ffbe53ba0346 100644 +{ + return (size_t)(op - virtualStart); +} - --#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ -- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) --/* ZSTD_getLongOffsetsShare() : ++ +typedef struct { + unsigned longOffsetShare; + unsigned maxNbAdditionalBits; +} ZSTD_OffsetInfo; -+ + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +-/* ZSTD_getLongOffsetsShare() : +/* ZSTD_getOffsetInfo() : * condition : offTable must be valid * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) @@ -47482,7 +26762,7 @@ index c1913b8e7c89..ffbe53ba0346 100644 size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, -@@ -1980,20 +2067,21 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, +@@ -1980,20 +2055,21 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, const void* src, size_t srcSize, const int frame, const streaming_operation streaming) { /* blockType == blockCompressed */ const BYTE* ip = (const BYTE*)src; @@ -47513,7 +26793,7 @@ index c1913b8e7c89..ffbe53ba0346 100644 if (ZSTD_isError(litCSize)) return litCSize; ip += litCSize; srcSize -= litCSize; -@@ -2001,6 +2089,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, +@@ -2001,6 +2077,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, /* Build Decoding Tables */ { @@ -47537,7 +26817,7 @@ index c1913b8e7c89..ffbe53ba0346 100644 /* These macros control at build-time which decompressor implementation * we use. If neither is defined, we do some inspection and dispatch at * runtime. -@@ -2008,6 +2113,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, +@@ -2008,6 +2101,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) int usePrefetchDecoder = dctx->ddictIsCold; @@ -47549,9 +26829,14 @@ index c1913b8e7c89..ffbe53ba0346 100644 #endif int nbSeq; size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); -@@ -2017,26 +2127,38 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, +@@ -2015,28 +2113,42 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + ip += seqHSize; + srcSize -= seqHSize; - RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, ++ "invalid dst"); -#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ - !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) @@ -47598,8 +26883,34 @@ index c1913b8e7c89..ffbe53ba0346 100644 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG /* else */ +@@ -2060,9 +2172,9 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + } + + +-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t dSize; + ZSTD_checkContinuity(dctx, dst, dstCapacity); +@@ -2070,3 +2182,12 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; + } ++ ++ ++/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ ++size_t ZSTD_decompressBlock(ZSTD_DCtx* 
dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); ++} diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h -index 3d2d57a5d25a..e372f048d186 100644 +index 3d2d57a5d25a..5888e6cc788b 100644 --- a/lib/zstd/decompress/zstd_decompress_block.h +++ b/lib/zstd/decompress/zstd_decompress_block.h @@ -1,5 +1,6 @@ @@ -47610,6 +26921,17 @@ index 3d2d57a5d25a..e372f048d186 100644 * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the +@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + unsigned tableLog, void* wksp, size_t wkspSize, + int bmi2); + ++/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */ ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ + + #endif /* ZSTD_DEC_BLOCK_H */ diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h index 98102edb6a83..32f79fb2873d 100644 --- a/lib/zstd/decompress/zstd_decompress_internal.h @@ -47658,7 +26980,7 @@ index a06ca187aab5..8a47eb2a4514 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c -index 22686e367e6f..2fead39eb743 100644 +index 22686e367e6f..466828e35752 100644 --- a/lib/zstd/zstd_common_module.c +++ b/lib/zstd/zstd_common_module.c @@ -1,6 +1,6 @@ @@ -47669,6 +26991,16 @@ index 22686e367e6f..2fead39eb743 100644 * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the +@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp); + EXPORT_SYMBOL_GPL(ZSTD_isError); + EXPORT_SYMBOL_GPL(ZSTD_getErrorName); + EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); +-EXPORT_SYMBOL_GPL(ZSTD_customMalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customCalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customFree); + + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("Zstd Common"); diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c index 04e1b5c01d9b..8ecf43226af2 100644 --- a/lib/zstd/zstd_compress_module.c @@ -47703,3518 +27035,4 @@ index f4ed952ed485..7d31518e9d5a 100644 EXPORT_SYMBOL(zstd_reset_dstream); -- -2.40.0.rc2 - -From 16b77e5461b5cc96bf4476bde0fee2ecc25aca83 Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Fri, 10 Mar 2023 19:28:54 +0100 -Subject: [PATCH 16/16] v4l2-core: add v4l2loopback - -Signed-off-by: Peter Jung ---- - drivers/media/v4l2-core/Kconfig | 5 + - drivers/media/v4l2-core/Makefile | 2 + - drivers/media/v4l2-core/v4l2loopback.c | 2906 +++++++++++++++++ - drivers/media/v4l2-core/v4l2loopback.h | 96 + - .../media/v4l2-core/v4l2loopback_formats.h | 445 +++ - 5 files changed, 3454 insertions(+) - create mode 100644 drivers/media/v4l2-core/v4l2loopback.c - create mode 100644 drivers/media/v4l2-core/v4l2loopback.h - create mode 100644 drivers/media/v4l2-core/v4l2loopback_formats.h - -diff --git a/drivers/media/v4l2-core/Kconfig b/drivers/media/v4l2-core/Kconfig -index 348559bc2468..32a46fcc751f 100644 ---- a/drivers/media/v4l2-core/Kconfig -+++ b/drivers/media/v4l2-core/Kconfig -@@ -40,6 +40,11 @@ config VIDEO_TUNER - config V4L2_JPEG_HELPER - tristate - -+config V4L2_LOOPBACK -+ tristate "V4L2 loopback device" -+ help -+ V4L2 loopback device -+ - # Used by drivers that need v4l2-h264.ko - config V4L2_H264 - tristate -diff 
--git a/drivers/media/v4l2-core/Makefile b/drivers/media/v4l2-core/Makefile -index 41d91bd10cf2..4de37a844f95 100644 ---- a/drivers/media/v4l2-core/Makefile -+++ b/drivers/media/v4l2-core/Makefile -@@ -32,6 +32,8 @@ obj-$(CONFIG_V4L2_JPEG_HELPER) += v4l2-jpeg.o - obj-$(CONFIG_V4L2_MEM2MEM_DEV) += v4l2-mem2mem.o - obj-$(CONFIG_V4L2_VP9) += v4l2-vp9.o - -+obj-$(CONFIG_V4L2_LOOPBACK) += v4l2loopback.o -+ - obj-$(CONFIG_VIDEOBUF_DMA_CONTIG) += videobuf-dma-contig.o - obj-$(CONFIG_VIDEOBUF_DMA_SG) += videobuf-dma-sg.o - obj-$(CONFIG_VIDEOBUF_GEN) += videobuf-core.o -diff --git a/drivers/media/v4l2-core/v4l2loopback.c b/drivers/media/v4l2-core/v4l2loopback.c -new file mode 100644 -index 000000000000..2ab1f760cfb5 ---- /dev/null -+++ b/drivers/media/v4l2-core/v4l2loopback.c -@@ -0,0 +1,2906 @@ -+/* -*- c-file-style: "linux" -*- */ -+/* -+ * v4l2loopback.c -- video4linux2 loopback driver -+ * -+ * Copyright (C) 2005-2009 Vasily Levin (vasaka@gmail.com) -+ * Copyright (C) 2010-2019 IOhannes m zmoelnig (zmoelnig@iem.at) -+ * Copyright (C) 2011 Stefan Diewald (stefan.diewald@mytum.de) -+ * Copyright (C) 2012 Anton Novikov (random.plant@gmail.com) -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include "v4l2loopback.h" -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 6, 1) -+#define kstrtoul strict_strtoul -+#endif -+ -+#if defined(timer_setup) && defined(from_timer) -+#define HAVE_TIMER_SETUP -+#endif -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) -+#define VFL_TYPE_VIDEO VFL_TYPE_GRABBER -+#endif -+ -+#define V4L2LOOPBACK_VERSION_CODE \ -+ KERNEL_VERSION(V4L2LOOPBACK_VERSION_MAJOR, V4L2LOOPBACK_VERSION_MINOR, \ -+ V4L2LOOPBACK_VERSION_BUGFIX) -+ -+MODULE_DESCRIPTION("V4L2 loopback video device"); -+MODULE_AUTHOR("Vasily Levin, " -+ "IOhannes m zmoelnig ," -+ "Stefan Diewald," -+ "Anton Novikov" -+ "et al."); -+#ifdef SNAPSHOT_VERSION -+MODULE_VERSION(__stringify(SNAPSHOT_VERSION)); -+#else -+MODULE_VERSION("" __stringify(V4L2LOOPBACK_VERSION_MAJOR) "." __stringify( -+ V4L2LOOPBACK_VERSION_MINOR) "." __stringify(V4L2LOOPBACK_VERSION_BUGFIX)); -+#endif -+MODULE_LICENSE("GPL"); -+ -+/* -+ * helpers -+ */ -+#define dprintk(fmt, args...) \ -+ do { \ -+ if (debug > 0) { \ -+ printk(KERN_INFO "v4l2-loopback[" __stringify( \ -+ __LINE__) "], pid(%d): " fmt, \ -+ task_pid_nr(current), ##args); \ -+ } \ -+ } while (0) -+ -+#define MARK() \ -+ do { \ -+ if (debug > 1) { \ -+ printk(KERN_INFO "%s:%d[%s], pid(%d)\n", __FILE__, \ -+ __LINE__, __func__, task_pid_nr(current)); \ -+ } \ -+ } while (0) -+ -+#define dprintkrw(fmt, args...) \ -+ do { \ -+ if (debug > 2) { \ -+ printk(KERN_INFO "v4l2-loopback[" __stringify( \ -+ __LINE__) "], pid(%d): " fmt, \ -+ task_pid_nr(current), ##args); \ -+ } \ -+ } while (0) -+ -+/* TODO: Make sure that function is never interrupted. 
*/ -+static inline int mod_inc(int *number, int mod) -+{ -+ int result; -+ result = (*number + 1) % mod; -+ if (unlikely(result < 0)) -+ result += mod; -+ *number = result; -+ return result; -+} -+ -+static inline void v4l2l_get_timestamp(struct v4l2_buffer *b) -+{ -+ /* ktime_get_ts is considered deprecated, so use ktime_get_ts64 if possible */ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 17, 0) -+ struct timespec ts; -+ ktime_get_ts(&ts); -+#else -+ struct timespec64 ts; -+ ktime_get_ts64(&ts); -+#endif -+ -+ b->timestamp.tv_sec = ts.tv_sec; -+ b->timestamp.tv_usec = (ts.tv_nsec / NSEC_PER_USEC); -+} -+ -+#if !defined(__poll_t) -+typedef unsigned __poll_t; -+#endif -+ -+/* module constants -+ * can be overridden during he build process using something like -+ * make KCPPFLAGS="-DMAX_DEVICES=100" -+ */ -+ -+/* maximum number of v4l2loopback devices that can be created */ -+#ifndef MAX_DEVICES -+#define MAX_DEVICES 8 -+#endif -+ -+/* whether the default is to announce capabilities exclusively or not */ -+#ifndef V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS -+#define V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS 0 -+#endif -+ -+/* when a producer is considered to have gone stale */ -+#ifndef MAX_TIMEOUT -+#define MAX_TIMEOUT (100 * 1000) /* in msecs */ -+#endif -+ -+/* max buffers that can be mapped, actually they -+ * are all mapped to max_buffers buffers */ -+#ifndef MAX_BUFFERS -+#define MAX_BUFFERS 32 -+#endif -+ -+/* module parameters */ -+static int debug = 0; -+module_param(debug, int, S_IRUGO | S_IWUSR); -+MODULE_PARM_DESC(debug, "debugging level (higher values == more verbose)"); -+ -+#define V4L2LOOPBACK_DEFAULT_MAX_BUFFERS 2 -+static int max_buffers = V4L2LOOPBACK_DEFAULT_MAX_BUFFERS; -+module_param(max_buffers, int, S_IRUGO); -+MODULE_PARM_DESC(max_buffers, -+ "how many buffers should be allocated [DEFAULT: " __stringify( -+ V4L2LOOPBACK_DEFAULT_MAX_BUFFERS) "]"); -+ -+/* how many times a device can be opened -+ * the per-module default value can be overridden on a per-device basis using -+ * the /sys/devices interface -+ * -+ * note that max_openers should be at least 2 in order to get a working system: -+ * one opener for the producer and one opener for the consumer -+ * however, we leave that to the user -+ */ -+#define V4L2LOOPBACK_DEFAULT_MAX_OPENERS 10 -+static int max_openers = V4L2LOOPBACK_DEFAULT_MAX_OPENERS; -+module_param(max_openers, int, S_IRUGO | S_IWUSR); -+MODULE_PARM_DESC( -+ max_openers, -+ "how many users can open the loopback device [DEFAULT: " __stringify( -+ V4L2LOOPBACK_DEFAULT_MAX_OPENERS) "]"); -+ -+static int devices = -1; -+module_param(devices, int, 0); -+MODULE_PARM_DESC(devices, "how many devices should be created"); -+ -+static int video_nr[MAX_DEVICES] = { [0 ...(MAX_DEVICES - 1)] = -1 }; -+module_param_array(video_nr, int, NULL, 0444); -+MODULE_PARM_DESC(video_nr, -+ "video device numbers (-1=auto, 0=/dev/video0, etc.)"); -+ -+static char *card_label[MAX_DEVICES]; -+module_param_array(card_label, charp, NULL, 0000); -+MODULE_PARM_DESC(card_label, "card labels for each device"); -+ -+static bool exclusive_caps[MAX_DEVICES] = { -+ [0 ...(MAX_DEVICES - 1)] = V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS -+}; -+module_param_array(exclusive_caps, bool, NULL, 0444); -+/* FIXXME: wording */ -+MODULE_PARM_DESC( -+ exclusive_caps, -+ "whether to announce OUTPUT/CAPTURE capabilities exclusively or not [DEFAULT: " __stringify( -+ V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS) "]"); -+ -+/* format specifications */ -+#define V4L2LOOPBACK_SIZE_MIN_WIDTH 48 -+#define V4L2LOOPBACK_SIZE_MIN_HEIGHT 32 
-+#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH 8192 -+#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT 8192 -+ -+#define V4L2LOOPBACK_SIZE_DEFAULT_WIDTH 640 -+#define V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT 480 -+ -+static int max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH; -+module_param(max_width, int, S_IRUGO); -+MODULE_PARM_DESC(max_width, -+ "maximum allowed frame width [DEFAULT: " __stringify( -+ V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH) "]"); -+static int max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT; -+module_param(max_height, int, S_IRUGO); -+MODULE_PARM_DESC(max_height, -+ "maximum allowed frame height [DEFAULT: " __stringify( -+ V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT) "]"); -+ -+static DEFINE_IDR(v4l2loopback_index_idr); -+static DEFINE_MUTEX(v4l2loopback_ctl_mutex); -+ -+/* frame intervals */ -+#define V4L2LOOPBACK_FPS_MIN 0 -+#define V4L2LOOPBACK_FPS_MAX 1000 -+ -+/* control IDs */ -+#define V4L2LOOPBACK_CID_BASE (V4L2_CID_USER_BASE | 0xf000) -+#define CID_KEEP_FORMAT (V4L2LOOPBACK_CID_BASE + 0) -+#define CID_SUSTAIN_FRAMERATE (V4L2LOOPBACK_CID_BASE + 1) -+#define CID_TIMEOUT (V4L2LOOPBACK_CID_BASE + 2) -+#define CID_TIMEOUT_IMAGE_IO (V4L2LOOPBACK_CID_BASE + 3) -+ -+static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl); -+static const struct v4l2_ctrl_ops v4l2loopback_ctrl_ops = { -+ .s_ctrl = v4l2loopback_s_ctrl, -+}; -+static const struct v4l2_ctrl_config v4l2loopback_ctrl_keepformat = { -+ // clang-format off -+ .ops = &v4l2loopback_ctrl_ops, -+ .id = CID_KEEP_FORMAT, -+ .name = "keep_format", -+ .type = V4L2_CTRL_TYPE_BOOLEAN, -+ .min = 0, -+ .max = 1, -+ .step = 1, -+ .def = 0, -+ // clang-format on -+}; -+static const struct v4l2_ctrl_config v4l2loopback_ctrl_sustainframerate = { -+ // clang-format off -+ .ops = &v4l2loopback_ctrl_ops, -+ .id = CID_SUSTAIN_FRAMERATE, -+ .name = "sustain_framerate", -+ .type = V4L2_CTRL_TYPE_BOOLEAN, -+ .min = 0, -+ .max = 1, -+ .step = 1, -+ .def = 0, -+ // clang-format on -+}; -+static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeout = { -+ // clang-format off -+ .ops = &v4l2loopback_ctrl_ops, -+ .id = CID_TIMEOUT, -+ .name = "timeout", -+ .type = V4L2_CTRL_TYPE_INTEGER, -+ .min = 0, -+ .max = MAX_TIMEOUT, -+ .step = 1, -+ .def = 0, -+ // clang-format on -+}; -+static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeoutimageio = { -+ // clang-format off -+ .ops = &v4l2loopback_ctrl_ops, -+ .id = CID_TIMEOUT_IMAGE_IO, -+ .name = "timeout_image_io", -+ .type = V4L2_CTRL_TYPE_BOOLEAN, -+ .min = 0, -+ .max = 1, -+ .step = 1, -+ .def = 0, -+ // clang-format on -+}; -+ -+/* module structures */ -+struct v4l2loopback_private { -+ int device_nr; -+}; -+ -+/* TODO(vasaka) use typenames which are common to kernel, but first find out if -+ * it is needed */ -+/* struct keeping state and settings of loopback device */ -+ -+struct v4l2l_buffer { -+ struct v4l2_buffer buffer; -+ struct list_head list_head; -+ int use_count; -+}; -+ -+struct v4l2_loopback_device { -+ struct v4l2_device v4l2_dev; -+ struct v4l2_ctrl_handler ctrl_handler; -+ struct video_device *vdev; -+ /* pixel and stream format */ -+ struct v4l2_pix_format pix_format; -+ struct v4l2_captureparm capture_param; -+ unsigned long frame_jiffies; -+ -+ /* ctrls */ -+ int keep_format; /* CID_KEEP_FORMAT; stay ready_for_capture even when all -+ openers close() the device */ -+ int sustain_framerate; /* CID_SUSTAIN_FRAMERATE; duplicate frames to maintain -+ (close to) nominal framerate */ -+ -+ /* buffers stuff */ -+ u8 *image; /* pointer to actual buffers data */ -+ unsigned long int imagesize; 
/* size of buffers data */ -+ int buffers_number; /* should not be big, 4 is a good choice */ -+ struct v4l2l_buffer buffers[MAX_BUFFERS]; /* inner driver buffers */ -+ int used_buffers; /* number of the actually used buffers */ -+ int max_openers; /* how many times can this device be opened */ -+ -+ int write_position; /* number of last written frame + 1 */ -+ struct list_head outbufs_list; /* buffers in output DQBUF order */ -+ int bufpos2index -+ [MAX_BUFFERS]; /* mapping of (read/write_position % used_buffers) -+ * to inner buffer index */ -+ long buffer_size; -+ -+ /* sustain_framerate stuff */ -+ struct timer_list sustain_timer; -+ unsigned int reread_count; -+ -+ /* timeout stuff */ -+ unsigned long timeout_jiffies; /* CID_TIMEOUT; 0 means disabled */ -+ int timeout_image_io; /* CID_TIMEOUT_IMAGE_IO; next opener will -+ * read/write to timeout_image */ -+ u8 *timeout_image; /* copy of it will be captured when timeout passes */ -+ struct v4l2l_buffer timeout_image_buffer; -+ struct timer_list timeout_timer; -+ int timeout_happened; -+ -+ /* sync stuff */ -+ atomic_t open_count; -+ -+ int ready_for_capture; /* set to the number of writers that opened the -+ * device and negotiated format. */ -+ int ready_for_output; /* set to true when no writer is currently attached -+ * this differs slightly from !ready_for_capture, -+ * e.g. when using fallback images */ -+ int active_readers; /* increase if any reader starts streaming */ -+ int announce_all_caps; /* set to false, if device caps (OUTPUT/CAPTURE) -+ * should only be announced if the resp. "ready" -+ * flag is set; default=TRUE */ -+ -+ int max_width; -+ int max_height; -+ -+ char card_label[32]; -+ -+ wait_queue_head_t read_event; -+ spinlock_t lock; -+}; -+ -+/* types of opener shows what opener wants to do with loopback */ -+enum opener_type { -+ // clang-format off -+ UNNEGOTIATED = 0, -+ READER = 1, -+ WRITER = 2, -+ // clang-format on -+}; -+ -+/* struct keeping state and type of opener */ -+struct v4l2_loopback_opener { -+ enum opener_type type; -+ int read_position; /* number of last processed frame + 1 or -+ * write_position - 1 if reader went out of sync */ -+ unsigned int reread_count; -+ struct v4l2_buffer *buffers; -+ int buffers_number; /* should not be big, 4 is a good choice */ -+ int timeout_image_io; -+ -+ struct v4l2_fh fh; -+}; -+ -+#define fh_to_opener(ptr) container_of((ptr), struct v4l2_loopback_opener, fh) -+ -+/* this is heavily inspired by the bttv driver found in the linux kernel */ -+struct v4l2l_format { -+ char *name; -+ int fourcc; /* video4linux 2 */ -+ int depth; /* bit/pixel */ -+ int flags; -+}; -+/* set the v4l2l_format.flags to PLANAR for non-packed formats */ -+#define FORMAT_FLAGS_PLANAR 0x01 -+#define FORMAT_FLAGS_COMPRESSED 0x02 -+ -+#include "v4l2loopback_formats.h" -+ -+static const unsigned int FORMATS = ARRAY_SIZE(formats); -+ -+static char *fourcc2str(unsigned int fourcc, char buf[4]) -+{ -+ buf[0] = (fourcc >> 0) & 0xFF; -+ buf[1] = (fourcc >> 8) & 0xFF; -+ buf[2] = (fourcc >> 16) & 0xFF; -+ buf[3] = (fourcc >> 24) & 0xFF; -+ -+ return buf; -+} -+ -+static const struct v4l2l_format *format_by_fourcc(int fourcc) -+{ -+ unsigned int i; -+ -+ for (i = 0; i < FORMATS; i++) { -+ if (formats[i].fourcc == fourcc) -+ return formats + i; -+ } -+ -+ dprintk("unsupported format '%c%c%c%c'\n", (fourcc >> 0) & 0xFF, -+ (fourcc >> 8) & 0xFF, (fourcc >> 16) & 0xFF, -+ (fourcc >> 24) & 0xFF); -+ return NULL; -+} -+ -+static void pix_format_set_size(struct v4l2_pix_format *f, -+ const struct v4l2l_format 
*fmt, -+ unsigned int width, unsigned int height) -+{ -+ f->width = width; -+ f->height = height; -+ -+ if (fmt->flags & FORMAT_FLAGS_PLANAR) { -+ f->bytesperline = width; /* Y plane */ -+ f->sizeimage = (width * height * fmt->depth) >> 3; -+ } else if (fmt->flags & FORMAT_FLAGS_COMPRESSED) { -+ /* doesn't make sense for compressed formats */ -+ f->bytesperline = 0; -+ f->sizeimage = (width * height * fmt->depth) >> 3; -+ } else { -+ f->bytesperline = (width * fmt->depth) >> 3; -+ f->sizeimage = height * f->bytesperline; -+ } -+} -+ -+static int set_timeperframe(struct v4l2_loopback_device *dev, -+ struct v4l2_fract *tpf) -+{ -+ if ((tpf->denominator < 1) || (tpf->numerator < 1)) { -+ return -EINVAL; -+ } -+ dev->capture_param.timeperframe = *tpf; -+ dev->frame_jiffies = max(1UL, msecs_to_jiffies(1000) * tpf->numerator / -+ tpf->denominator); -+ return 0; -+} -+ -+static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd); -+ -+/* device attributes */ -+/* available via sysfs: /sys/devices/virtual/video4linux/video* */ -+ -+static ssize_t attr_show_format(struct device *cd, -+ struct device_attribute *attr, char *buf) -+{ -+ /* gets the current format as "FOURCC:WxH@f/s", e.g. "YUYV:320x240@1000/30" */ -+ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); -+ const struct v4l2_fract *tpf; -+ char buf4cc[5], buf_fps[32]; -+ -+ if (!dev || !dev->ready_for_capture) -+ return 0; -+ tpf = &dev->capture_param.timeperframe; -+ -+ fourcc2str(dev->pix_format.pixelformat, buf4cc); -+ buf4cc[4] = 0; -+ if (tpf->numerator == 1) -+ snprintf(buf_fps, sizeof(buf_fps), "%d", tpf->denominator); -+ else -+ snprintf(buf_fps, sizeof(buf_fps), "%d/%d", tpf->denominator, -+ tpf->numerator); -+ return sprintf(buf, "%4s:%dx%d@%s\n", buf4cc, dev->pix_format.width, -+ dev->pix_format.height, buf_fps); -+} -+ -+static ssize_t attr_store_format(struct device *cd, -+ struct device_attribute *attr, const char *buf, -+ size_t len) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); -+ int fps_num = 0, fps_den = 1; -+ -+ if (!dev) -+ return -ENODEV; -+ -+ /* only fps changing is supported */ -+ if (sscanf(buf, "@%d/%d", &fps_num, &fps_den) > 0) { -+ struct v4l2_fract f = { .numerator = fps_den, -+ .denominator = fps_num }; -+ int err = 0; -+ if ((err = set_timeperframe(dev, &f)) < 0) -+ return err; -+ return len; -+ } -+ return -EINVAL; -+} -+ -+static DEVICE_ATTR(format, S_IRUGO | S_IWUSR, attr_show_format, -+ attr_store_format); -+ -+static ssize_t attr_show_buffers(struct device *cd, -+ struct device_attribute *attr, char *buf) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); -+ -+ if (!dev) -+ return -ENODEV; -+ -+ return sprintf(buf, "%d\n", dev->used_buffers); -+} -+ -+static DEVICE_ATTR(buffers, S_IRUGO, attr_show_buffers, NULL); -+ -+static ssize_t attr_show_maxopeners(struct device *cd, -+ struct device_attribute *attr, char *buf) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); -+ -+ if (!dev) -+ return -ENODEV; -+ -+ return sprintf(buf, "%d\n", dev->max_openers); -+} -+ -+static ssize_t attr_store_maxopeners(struct device *cd, -+ struct device_attribute *attr, -+ const char *buf, size_t len) -+{ -+ struct v4l2_loopback_device *dev = NULL; -+ unsigned long curr = 0; -+ -+ if (kstrtoul(buf, 0, &curr)) -+ return -EINVAL; -+ -+ dev = v4l2loopback_cd2dev(cd); -+ if (!dev) -+ return -ENODEV; -+ -+ if (dev->max_openers == curr) -+ return len; -+ -+ if (curr > __INT_MAX__ || dev->open_count.counter > curr) { -+ /* request to limit to less 
openers as are currently attached to us */ -+ return -EINVAL; -+ } -+ -+ dev->max_openers = (int)curr; -+ -+ return len; -+} -+ -+static DEVICE_ATTR(max_openers, S_IRUGO | S_IWUSR, attr_show_maxopeners, -+ attr_store_maxopeners); -+ -+static void v4l2loopback_remove_sysfs(struct video_device *vdev) -+{ -+#define V4L2_SYSFS_DESTROY(x) device_remove_file(&vdev->dev, &dev_attr_##x) -+ -+ if (vdev) { -+ V4L2_SYSFS_DESTROY(format); -+ V4L2_SYSFS_DESTROY(buffers); -+ V4L2_SYSFS_DESTROY(max_openers); -+ /* ... */ -+ } -+} -+ -+static void v4l2loopback_create_sysfs(struct video_device *vdev) -+{ -+ int res = 0; -+ -+#define V4L2_SYSFS_CREATE(x) \ -+ res = device_create_file(&vdev->dev, &dev_attr_##x); \ -+ if (res < 0) \ -+ break -+ if (!vdev) -+ return; -+ do { -+ V4L2_SYSFS_CREATE(format); -+ V4L2_SYSFS_CREATE(buffers); -+ V4L2_SYSFS_CREATE(max_openers); -+ /* ... */ -+ } while (0); -+ -+ if (res >= 0) -+ return; -+ dev_err(&vdev->dev, "%s error: %d\n", __func__, res); -+} -+ -+/* Event APIs */ -+ -+#define V4L2LOOPBACK_EVENT_BASE (V4L2_EVENT_PRIVATE_START) -+#define V4L2LOOPBACK_EVENT_OFFSET 0x08E00000 -+#define V4L2_EVENT_PRI_CLIENT_USAGE \ -+ (V4L2LOOPBACK_EVENT_BASE + V4L2LOOPBACK_EVENT_OFFSET + 1) -+ -+struct v4l2_event_client_usage { -+ __u32 count; -+}; -+ -+/* global module data */ -+/* find a device based on it's device-number (e.g. '3' for /dev/video3) */ -+struct v4l2loopback_lookup_cb_data { -+ int device_nr; -+ struct v4l2_loopback_device *device; -+}; -+static int v4l2loopback_lookup_cb(int id, void *ptr, void *data) -+{ -+ struct v4l2_loopback_device *device = ptr; -+ struct v4l2loopback_lookup_cb_data *cbdata = data; -+ if (cbdata && device && device->vdev) { -+ if (device->vdev->num == cbdata->device_nr) { -+ cbdata->device = device; -+ cbdata->device_nr = id; -+ return 1; -+ } -+ } -+ return 0; -+} -+static int v4l2loopback_lookup(int device_nr, -+ struct v4l2_loopback_device **device) -+{ -+ struct v4l2loopback_lookup_cb_data data = { -+ .device_nr = device_nr, -+ .device = NULL, -+ }; -+ int err = idr_for_each(&v4l2loopback_index_idr, &v4l2loopback_lookup_cb, -+ &data); -+ if (1 == err) { -+ if (device) -+ *device = data.device; -+ return data.device_nr; -+ } -+ return -ENODEV; -+} -+static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd) -+{ -+ struct video_device *loopdev = to_video_device(cd); -+ struct v4l2loopback_private *ptr = -+ (struct v4l2loopback_private *)video_get_drvdata(loopdev); -+ int nr = ptr->device_nr; -+ -+ return idr_find(&v4l2loopback_index_idr, nr); -+} -+ -+static struct v4l2_loopback_device *v4l2loopback_getdevice(struct file *f) -+{ -+ struct v4l2loopback_private *ptr = video_drvdata(f); -+ int nr = ptr->device_nr; -+ -+ return idr_find(&v4l2loopback_index_idr, nr); -+} -+ -+/* forward declarations */ -+static void client_usage_queue_event(struct video_device *vdev); -+static void init_buffers(struct v4l2_loopback_device *dev); -+static int allocate_buffers(struct v4l2_loopback_device *dev); -+static void free_buffers(struct v4l2_loopback_device *dev); -+static void try_free_buffers(struct v4l2_loopback_device *dev); -+static int allocate_timeout_image(struct v4l2_loopback_device *dev); -+static void check_timers(struct v4l2_loopback_device *dev); -+static const struct v4l2_file_operations v4l2_loopback_fops; -+static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops; -+ -+/* Queue helpers */ -+/* next functions sets buffer flags and adjusts counters accordingly */ -+static inline void set_done(struct v4l2l_buffer *buffer) -+{ 
-+ buffer->buffer.flags &= ~V4L2_BUF_FLAG_QUEUED; -+ buffer->buffer.flags |= V4L2_BUF_FLAG_DONE; -+} -+ -+static inline void set_queued(struct v4l2l_buffer *buffer) -+{ -+ buffer->buffer.flags &= ~V4L2_BUF_FLAG_DONE; -+ buffer->buffer.flags |= V4L2_BUF_FLAG_QUEUED; -+} -+ -+static inline void unset_flags(struct v4l2l_buffer *buffer) -+{ -+ buffer->buffer.flags &= ~V4L2_BUF_FLAG_QUEUED; -+ buffer->buffer.flags &= ~V4L2_BUF_FLAG_DONE; -+} -+ -+/* V4L2 ioctl caps and params calls */ -+/* returns device capabilities -+ * called on VIDIOC_QUERYCAP -+ */ -+static int vidioc_querycap(struct file *file, void *priv, -+ struct v4l2_capability *cap) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ int device_nr = -+ ((struct v4l2loopback_private *)video_get_drvdata(dev->vdev)) -+ ->device_nr; -+ __u32 capabilities = V4L2_CAP_STREAMING | V4L2_CAP_READWRITE; -+ -+ strlcpy(cap->driver, "v4l2 loopback", sizeof(cap->driver)); -+ snprintf(cap->card, sizeof(cap->card), "%s", dev->card_label); -+ snprintf(cap->bus_info, sizeof(cap->bus_info), -+ "platform:v4l2loopback-%03d", device_nr); -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 1, 0) -+ /* since 3.1.0, the v4l2-core system is supposed to set the version */ -+ cap->version = V4L2LOOPBACK_VERSION_CODE; -+#endif -+ -+ if (dev->announce_all_caps) { -+ capabilities |= V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT; -+ } else { -+ if (dev->ready_for_capture) { -+ capabilities |= V4L2_CAP_VIDEO_CAPTURE; -+ } -+ if (dev->ready_for_output) { -+ capabilities |= V4L2_CAP_VIDEO_OUTPUT; -+ } -+ } -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) -+ dev->vdev->device_caps = -+#endif /* >=linux-4.7.0 */ -+ cap->device_caps = cap->capabilities = capabilities; -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 3, 0) -+ cap->capabilities |= V4L2_CAP_DEVICE_CAPS; -+#endif -+ -+ memset(cap->reserved, 0, sizeof(cap->reserved)); -+ return 0; -+} -+ -+static int vidioc_enum_framesizes(struct file *file, void *fh, -+ struct v4l2_frmsizeenum *argp) -+{ -+ struct v4l2_loopback_device *dev; -+ -+ /* there can be only one... */ -+ if (argp->index) -+ return -EINVAL; -+ -+ dev = v4l2loopback_getdevice(file); -+ if (dev->ready_for_capture) { -+ /* format has already been negotiated -+ * cannot change during runtime -+ */ -+ if (argp->pixel_format != dev->pix_format.pixelformat) -+ return -EINVAL; -+ -+ argp->type = V4L2_FRMSIZE_TYPE_DISCRETE; -+ -+ argp->discrete.width = dev->pix_format.width; -+ argp->discrete.height = dev->pix_format.height; -+ } else { -+ /* if the format has not been negotiated yet, we accept anything -+ */ -+ if (NULL == format_by_fourcc(argp->pixel_format)) -+ return -EINVAL; -+ -+ argp->type = V4L2_FRMSIZE_TYPE_CONTINUOUS; -+ -+ argp->stepwise.min_width = V4L2LOOPBACK_SIZE_MIN_WIDTH; -+ argp->stepwise.min_height = V4L2LOOPBACK_SIZE_MIN_HEIGHT; -+ -+ argp->stepwise.max_width = dev->max_width; -+ argp->stepwise.max_height = dev->max_height; -+ -+ argp->stepwise.step_width = 1; -+ argp->stepwise.step_height = 1; -+ } -+ return 0; -+} -+ -+/* returns frameinterval (fps) for the set resolution -+ * called on VIDIOC_ENUM_FRAMEINTERVALS -+ */ -+static int vidioc_enum_frameintervals(struct file *file, void *fh, -+ struct v4l2_frmivalenum *argp) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ -+ /* there can be only one... 
*/ -+ if (argp->index) -+ return -EINVAL; -+ -+ if (dev->ready_for_capture) { -+ if (argp->width != dev->pix_format.width || -+ argp->height != dev->pix_format.height || -+ argp->pixel_format != dev->pix_format.pixelformat) -+ return -EINVAL; -+ -+ argp->type = V4L2_FRMIVAL_TYPE_DISCRETE; -+ argp->discrete = dev->capture_param.timeperframe; -+ } else { -+ if (argp->width < V4L2LOOPBACK_SIZE_MIN_WIDTH || -+ argp->width > max_width || -+ argp->height < V4L2LOOPBACK_SIZE_MIN_HEIGHT || -+ argp->height > max_height || -+ NULL == format_by_fourcc(argp->pixel_format)) -+ return -EINVAL; -+ -+ argp->type = V4L2_FRMIVAL_TYPE_CONTINUOUS; -+ argp->stepwise.min.numerator = 1; -+ argp->stepwise.min.denominator = V4L2LOOPBACK_FPS_MAX; -+ argp->stepwise.max.numerator = 1; -+ argp->stepwise.max.denominator = V4L2LOOPBACK_FPS_MIN; -+ argp->stepwise.step.numerator = 1; -+ argp->stepwise.step.denominator = 1; -+ } -+ -+ return 0; -+} -+ -+/* ------------------ CAPTURE ----------------------- */ -+ -+/* returns device formats -+ * called on VIDIOC_ENUM_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE -+ */ -+static int vidioc_enum_fmt_cap(struct file *file, void *fh, -+ struct v4l2_fmtdesc *f) -+{ -+ struct v4l2_loopback_device *dev; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ if (f->index) -+ return -EINVAL; -+ if (dev->ready_for_capture) { -+ const __u32 format = dev->pix_format.pixelformat; -+ -+ snprintf(f->description, sizeof(f->description), "[%c%c%c%c]", -+ (format >> 0) & 0xFF, (format >> 8) & 0xFF, -+ (format >> 16) & 0xFF, (format >> 24) & 0xFF); -+ -+ f->pixelformat = dev->pix_format.pixelformat; -+ } else { -+ return -EINVAL; -+ } -+ f->flags = 0; -+ MARK(); -+ return 0; -+} -+ -+/* returns current video format -+ * called on VIDIOC_G_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE -+ */ -+static int vidioc_g_fmt_cap(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ struct v4l2_loopback_device *dev; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ if (!dev->ready_for_capture) -+ return -EINVAL; -+ -+ fmt->fmt.pix = dev->pix_format; -+ MARK(); -+ return 0; -+} -+ -+/* checks if it is OK to change to format fmt; -+ * actual check is done by inner_try_fmt_cap -+ * just checking that pixelformat is OK and set other parameters, app should -+ * obey this decision -+ * called on VIDIOC_TRY_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE -+ */ -+static int vidioc_try_fmt_cap(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ struct v4l2_loopback_device *dev; -+ char buf[5]; -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ if (0 == dev->ready_for_capture) { -+ dprintk("setting fmt_cap not possible yet\n"); -+ return -EBUSY; -+ } -+ -+ if (fmt->fmt.pix.pixelformat != dev->pix_format.pixelformat) -+ return -EINVAL; -+ -+ fmt->fmt.pix = dev->pix_format; -+ -+ buf[4] = 0; -+ dprintk("capFOURCC=%s\n", fourcc2str(dev->pix_format.pixelformat, buf)); -+ return 0; -+} -+ -+/* sets new output format, if possible -+ * actually format is set by input and we even do not check it, just return -+ * current one, but it is possible to set subregions of input TODO(vasaka) -+ * called on VIDIOC_S_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE -+ */ -+static int vidioc_s_fmt_cap(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ return vidioc_try_fmt_cap(file, priv, fmt); -+} -+ -+/* ------------------ OUTPUT ----------------------- */ -+ -+/* returns device formats; -+ * LATER: allow all formats -+ * called on 
VIDIOC_ENUM_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT -+ */ -+static int vidioc_enum_fmt_out(struct file *file, void *fh, -+ struct v4l2_fmtdesc *f) -+{ -+ struct v4l2_loopback_device *dev; -+ const struct v4l2l_format *fmt; -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ if (dev->ready_for_capture) { -+ const __u32 format = dev->pix_format.pixelformat; -+ -+ /* format has been fixed by the writer, so only one single format is supported */ -+ if (f->index) -+ return -EINVAL; -+ -+ fmt = format_by_fourcc(format); -+ if (NULL == fmt) -+ return -EINVAL; -+ -+ /* f->flags = ??; */ -+ snprintf(f->description, sizeof(f->description), "%s", -+ fmt->name); -+ -+ f->pixelformat = dev->pix_format.pixelformat; -+ } else { -+ /* fill in a dummy format */ -+ /* coverity[unsigned_compare] */ -+ if (f->index < 0 || f->index >= FORMATS) -+ return -EINVAL; -+ -+ fmt = &formats[f->index]; -+ -+ f->pixelformat = fmt->fourcc; -+ snprintf(f->description, sizeof(f->description), "%s", -+ fmt->name); -+ } -+ f->flags = 0; -+ -+ return 0; -+} -+ -+/* returns current video format format fmt */ -+/* NOTE: this is called from the producer -+ * so if format has not been negotiated yet, -+ * it should return ALL of available formats, -+ * called on VIDIOC_G_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT -+ */ -+static int vidioc_g_fmt_out(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ struct v4l2_loopback_device *dev; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ /* -+ * LATER: this should return the currently valid format -+ * gstreamer doesn't like it, if this returns -EINVAL, as it -+ * then concludes that there is _no_ valid format -+ * CHECK whether this assumption is wrong, -+ * or whether we have to always provide a valid format -+ */ -+ -+ fmt->fmt.pix = dev->pix_format; -+ return 0; -+} -+ -+/* checks if it is OK to change to format fmt; -+ * if format is negotiated do not change it -+ * called on VIDIOC_TRY_FMT with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT -+ */ -+static int vidioc_try_fmt_out(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ struct v4l2_loopback_device *dev; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ /* TODO(vasaka) loopback does not care about formats writer want to set, -+ * maybe it is a good idea to restrict format somehow */ -+ if (dev->ready_for_capture) { -+ fmt->fmt.pix = dev->pix_format; -+ } else { -+ __u32 w = fmt->fmt.pix.width; -+ __u32 h = fmt->fmt.pix.height; -+ __u32 pixfmt = fmt->fmt.pix.pixelformat; -+ const struct v4l2l_format *format = format_by_fourcc(pixfmt); -+ -+ if (w > dev->max_width) -+ w = dev->max_width; -+ if (h > dev->max_height) -+ h = dev->max_height; -+ -+ dprintk("trying image %dx%d\n", w, h); -+ -+ if (w < 1) -+ w = V4L2LOOPBACK_SIZE_DEFAULT_WIDTH; -+ -+ if (h < 1) -+ h = V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT; -+ -+ if (NULL == format) -+ format = &formats[0]; -+ -+ pix_format_set_size(&fmt->fmt.pix, format, w, h); -+ -+ fmt->fmt.pix.pixelformat = format->fourcc; -+ -+ if ((fmt->fmt.pix.colorspace == V4L2_COLORSPACE_DEFAULT) || -+ (fmt->fmt.pix.colorspace > V4L2_COLORSPACE_DCI_P3)) -+ fmt->fmt.pix.colorspace = V4L2_COLORSPACE_SRGB; -+ -+ if (V4L2_FIELD_ANY == fmt->fmt.pix.field) -+ fmt->fmt.pix.field = V4L2_FIELD_NONE; -+ -+ /* FIXXME: try_fmt should never modify the device-state */ -+ dev->pix_format = fmt->fmt.pix; -+ } -+ return 0; -+} -+ -+/* sets new output format, if possible; -+ * allocate data here because we do not know if it will be streaming or -+ * 
read/write IO -+ * called on VIDIOC_S_FMT with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT -+ */ -+static int vidioc_s_fmt_out(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ struct v4l2_loopback_device *dev; -+ char buf[5]; -+ int ret; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ ret = vidioc_try_fmt_out(file, priv, fmt); -+ -+ dprintk("s_fmt_out(%d) %d...%d\n", ret, dev->ready_for_capture, -+ dev->pix_format.sizeimage); -+ -+ buf[4] = 0; -+ dprintk("outFOURCC=%s\n", fourcc2str(dev->pix_format.pixelformat, buf)); -+ -+ if (ret < 0) -+ return ret; -+ -+ if (!dev->ready_for_capture) { -+ dev->buffer_size = PAGE_ALIGN(dev->pix_format.sizeimage); -+ fmt->fmt.pix.sizeimage = dev->buffer_size; -+ ret = allocate_buffers(dev); -+ } -+ return ret; -+} -+ -+// #define V4L2L_OVERLAY -+#ifdef V4L2L_OVERLAY -+/* ------------------ OVERLAY ----------------------- */ -+/* currently unsupported */ -+/* GSTreamer's v4l2sink is buggy, as it requires the overlay to work -+ * while it should only require it, if overlay is requested -+ * once the gstreamer element is fixed, remove the overlay dummies -+ */ -+#warning OVERLAY dummies -+static int vidioc_g_fmt_overlay(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ return 0; -+} -+ -+static int vidioc_s_fmt_overlay(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ return 0; -+} -+#endif /* V4L2L_OVERLAY */ -+ -+/* ------------------ PARAMs ----------------------- */ -+ -+/* get some data flow parameters, only capability, fps and readbuffers has -+ * effect on this driver -+ * called on VIDIOC_G_PARM -+ */ -+static int vidioc_g_parm(struct file *file, void *priv, -+ struct v4l2_streamparm *parm) -+{ -+ /* do not care about type of opener, hope these enums would always be -+ * compatible */ -+ struct v4l2_loopback_device *dev; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ parm->parm.capture = dev->capture_param; -+ return 0; -+} -+ -+/* get some data flow parameters, only capability, fps and readbuffers has -+ * effect on this driver -+ * called on VIDIOC_S_PARM -+ */ -+static int vidioc_s_parm(struct file *file, void *priv, -+ struct v4l2_streamparm *parm) -+{ -+ struct v4l2_loopback_device *dev; -+ int err = 0; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ dprintk("vidioc_s_parm called frate=%d/%d\n", -+ parm->parm.capture.timeperframe.numerator, -+ parm->parm.capture.timeperframe.denominator); -+ -+ switch (parm->type) { -+ case V4L2_BUF_TYPE_VIDEO_CAPTURE: -+ if ((err = set_timeperframe( -+ dev, &parm->parm.capture.timeperframe)) < 0) -+ return err; -+ break; -+ case V4L2_BUF_TYPE_VIDEO_OUTPUT: -+ if ((err = set_timeperframe( -+ dev, &parm->parm.capture.timeperframe)) < 0) -+ return err; -+ break; -+ default: -+ return -1; -+ } -+ -+ parm->parm.capture = dev->capture_param; -+ return 0; -+} -+ -+#ifdef V4L2LOOPBACK_WITH_STD -+/* sets a tv standard, actually we do not need to handle this any special way -+ * added to support effecttv -+ * called on VIDIOC_S_STD -+ */ -+static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id *_std) -+{ -+ v4l2_std_id req_std = 0, supported_std = 0; -+ const v4l2_std_id all_std = V4L2_STD_ALL, no_std = 0; -+ -+ if (_std) { -+ req_std = *_std; -+ *_std = all_std; -+ } -+ -+ /* we support everything in V4L2_STD_ALL, but not more... 
*/ -+ supported_std = (all_std & req_std); -+ if (no_std == supported_std) -+ return -EINVAL; -+ -+ return 0; -+} -+ -+/* gets a fake video standard -+ * called on VIDIOC_G_STD -+ */ -+static int vidioc_g_std(struct file *file, void *fh, v4l2_std_id *norm) -+{ -+ if (norm) -+ *norm = V4L2_STD_ALL; -+ return 0; -+} -+/* gets a fake video standard -+ * called on VIDIOC_QUERYSTD -+ */ -+static int vidioc_querystd(struct file *file, void *fh, v4l2_std_id *norm) -+{ -+ if (norm) -+ *norm = V4L2_STD_ALL; -+ return 0; -+} -+#endif /* V4L2LOOPBACK_WITH_STD */ -+ -+static int v4l2loopback_set_ctrl(struct v4l2_loopback_device *dev, u32 id, -+ s64 val) -+{ -+ switch (id) { -+ case CID_KEEP_FORMAT: -+ if (val < 0 || val > 1) -+ return -EINVAL; -+ dev->keep_format = val; -+ try_free_buffers( -+ dev); /* will only free buffers if !keep_format */ -+ break; -+ case CID_SUSTAIN_FRAMERATE: -+ if (val < 0 || val > 1) -+ return -EINVAL; -+ spin_lock_bh(&dev->lock); -+ dev->sustain_framerate = val; -+ check_timers(dev); -+ spin_unlock_bh(&dev->lock); -+ break; -+ case CID_TIMEOUT: -+ if (val < 0 || val > MAX_TIMEOUT) -+ return -EINVAL; -+ spin_lock_bh(&dev->lock); -+ dev->timeout_jiffies = msecs_to_jiffies(val); -+ check_timers(dev); -+ spin_unlock_bh(&dev->lock); -+ allocate_timeout_image(dev); -+ break; -+ case CID_TIMEOUT_IMAGE_IO: -+ if (val < 0 || val > 1) -+ return -EINVAL; -+ dev->timeout_image_io = val; -+ break; -+ default: -+ return -EINVAL; -+ } -+ return 0; -+} -+ -+static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl) -+{ -+ struct v4l2_loopback_device *dev = container_of( -+ ctrl->handler, struct v4l2_loopback_device, ctrl_handler); -+ return v4l2loopback_set_ctrl(dev, ctrl->id, ctrl->val); -+} -+ -+/* returns set of device outputs, in our case there is only one -+ * called on VIDIOC_ENUMOUTPUT -+ */ -+static int vidioc_enum_output(struct file *file, void *fh, -+ struct v4l2_output *outp) -+{ -+ __u32 index = outp->index; -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ MARK(); -+ -+ if (!dev->announce_all_caps && !dev->ready_for_output) -+ return -ENOTTY; -+ -+ if (0 != index) -+ return -EINVAL; -+ -+ /* clear all data (including the reserved fields) */ -+ memset(outp, 0, sizeof(*outp)); -+ -+ outp->index = index; -+ strlcpy(outp->name, "loopback in", sizeof(outp->name)); -+ outp->type = V4L2_OUTPUT_TYPE_ANALOG; -+ outp->audioset = 0; -+ outp->modulator = 0; -+#ifdef V4L2LOOPBACK_WITH_STD -+ outp->std = V4L2_STD_ALL; -+#ifdef V4L2_OUT_CAP_STD -+ outp->capabilities |= V4L2_OUT_CAP_STD; -+#endif /* V4L2_OUT_CAP_STD */ -+#endif /* V4L2LOOPBACK_WITH_STD */ -+ -+ return 0; -+} -+ -+/* which output is currently active, -+ * called on VIDIOC_G_OUTPUT -+ */ -+static int vidioc_g_output(struct file *file, void *fh, unsigned int *i) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ if (!dev->announce_all_caps && !dev->ready_for_output) -+ return -ENOTTY; -+ if (i) -+ *i = 0; -+ return 0; -+} -+ -+/* set output, can make sense if we have more than one video src, -+ * called on VIDIOC_S_OUTPUT -+ */ -+static int vidioc_s_output(struct file *file, void *fh, unsigned int i) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ if (!dev->announce_all_caps && !dev->ready_for_output) -+ return -ENOTTY; -+ -+ if (i) -+ return -EINVAL; -+ -+ return 0; -+} -+ -+/* returns set of device inputs, in our case there is only one, -+ * but later I may add more -+ * called on VIDIOC_ENUMINPUT -+ */ -+static int vidioc_enum_input(struct file *file, void 
*fh, -+ struct v4l2_input *inp) -+{ -+ __u32 index = inp->index; -+ MARK(); -+ -+ if (0 != index) -+ return -EINVAL; -+ -+ /* clear all data (including the reserved fields) */ -+ memset(inp, 0, sizeof(*inp)); -+ -+ inp->index = index; -+ strlcpy(inp->name, "loopback", sizeof(inp->name)); -+ inp->type = V4L2_INPUT_TYPE_CAMERA; -+ inp->audioset = 0; -+ inp->tuner = 0; -+ inp->status = 0; -+ -+#ifdef V4L2LOOPBACK_WITH_STD -+ inp->std = V4L2_STD_ALL; -+#ifdef V4L2_IN_CAP_STD -+ inp->capabilities |= V4L2_IN_CAP_STD; -+#endif -+#endif /* V4L2LOOPBACK_WITH_STD */ -+ -+ return 0; -+} -+ -+/* which input is currently active, -+ * called on VIDIOC_G_INPUT -+ */ -+static int vidioc_g_input(struct file *file, void *fh, unsigned int *i) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ if (!dev->announce_all_caps && !dev->ready_for_capture) -+ return -ENOTTY; -+ if (i) -+ *i = 0; -+ return 0; -+} -+ -+/* set input, can make sense if we have more than one video src, -+ * called on VIDIOC_S_INPUT -+ */ -+static int vidioc_s_input(struct file *file, void *fh, unsigned int i) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ if (!dev->announce_all_caps && !dev->ready_for_capture) -+ return -ENOTTY; -+ if (i == 0) -+ return 0; -+ return -EINVAL; -+} -+ -+/* --------------- V4L2 ioctl buffer related calls ----------------- */ -+ -+/* negotiate buffer type -+ * only mmap streaming supported -+ * called on VIDIOC_REQBUFS -+ */ -+static int vidioc_reqbufs(struct file *file, void *fh, -+ struct v4l2_requestbuffers *b) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ int i; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ -+ dprintk("reqbufs: %d\t%d=%d\n", b->memory, b->count, -+ dev->buffers_number); -+ if (opener->timeout_image_io) { -+ if (b->memory != V4L2_MEMORY_MMAP) -+ return -EINVAL; -+ b->count = 1; -+ return 0; -+ } -+ -+ init_buffers(dev); -+ switch (b->memory) { -+ case V4L2_MEMORY_MMAP: -+ /* do nothing here, buffers are always allocated */ -+ if (b->count < 1 || dev->buffers_number < 1) -+ return 0; -+ -+ if (b->count > dev->buffers_number) -+ b->count = dev->buffers_number; -+ -+ /* make sure that outbufs_list contains buffers from 0 to used_buffers-1 -+ * actually, it will have been already populated via v4l2_loopback_init() -+ * at this point */ -+ if (list_empty(&dev->outbufs_list)) { -+ for (i = 0; i < dev->used_buffers; ++i) -+ list_add_tail(&dev->buffers[i].list_head, -+ &dev->outbufs_list); -+ } -+ -+ /* also, if dev->used_buffers is going to be decreased, we should remove -+ * out-of-range buffers from outbufs_list, and fix bufpos2index mapping */ -+ if (b->count < dev->used_buffers) { -+ struct v4l2l_buffer *pos, *n; -+ -+ list_for_each_entry_safe(pos, n, &dev->outbufs_list, -+ list_head) { -+ if (pos->buffer.index >= b->count) -+ list_del(&pos->list_head); -+ } -+ -+ /* after we update dev->used_buffers, buffers in outbufs_list will -+ * correspond to dev->write_position + [0;b->count-1] range */ -+ i = dev->write_position; -+ list_for_each_entry(pos, &dev->outbufs_list, -+ list_head) { -+ dev->bufpos2index[mod_inc(&i, b->count)] = -+ pos->buffer.index; -+ } -+ } -+ -+ opener->buffers_number = b->count; -+ if (opener->buffers_number < dev->used_buffers) -+ dev->used_buffers = opener->buffers_number; -+ return 0; -+ default: -+ return -EINVAL; -+ } -+} -+ -+/* returns buffer asked for; -+ * give app as many buffers as it wants, if it less than MAX, -+ * but map them in 
our inner buffers -+ * called on VIDIOC_QUERYBUF -+ */ -+static int vidioc_querybuf(struct file *file, void *fh, struct v4l2_buffer *b) -+{ -+ enum v4l2_buf_type type; -+ int index; -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ -+ MARK(); -+ -+ type = b->type; -+ index = b->index; -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ -+ if ((b->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) && -+ (b->type != V4L2_BUF_TYPE_VIDEO_OUTPUT)) { -+ return -EINVAL; -+ } -+ if (b->index > max_buffers) -+ return -EINVAL; -+ -+ if (opener->timeout_image_io) -+ *b = dev->timeout_image_buffer.buffer; -+ else -+ *b = dev->buffers[b->index % dev->used_buffers].buffer; -+ -+ b->type = type; -+ b->index = index; -+ dprintkrw("buffer type: %d (of %d with size=%ld)\n", b->memory, -+ dev->buffers_number, dev->buffer_size); -+ -+ /* Hopefully fix 'DQBUF return bad index if queue bigger then 2 for capture' -+ https://github.com/umlaeute/v4l2loopback/issues/60 */ -+ b->flags &= ~V4L2_BUF_FLAG_DONE; -+ b->flags |= V4L2_BUF_FLAG_QUEUED; -+ -+ return 0; -+} -+ -+static void buffer_written(struct v4l2_loopback_device *dev, -+ struct v4l2l_buffer *buf) -+{ -+ del_timer_sync(&dev->sustain_timer); -+ del_timer_sync(&dev->timeout_timer); -+ spin_lock_bh(&dev->lock); -+ -+ dev->bufpos2index[mod_inc(&dev->write_position, dev->used_buffers)] = -+ buf->buffer.index; -+ list_move_tail(&buf->list_head, &dev->outbufs_list); -+ dev->reread_count = 0; -+ -+ check_timers(dev); -+ spin_unlock_bh(&dev->lock); -+} -+ -+/* put buffer to queue -+ * called on VIDIOC_QBUF -+ */ -+static int vidioc_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ struct v4l2l_buffer *b; -+ int index; -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ -+ if (buf->index > max_buffers) -+ return -EINVAL; -+ if (opener->timeout_image_io) -+ return 0; -+ -+ index = buf->index % dev->used_buffers; -+ b = &dev->buffers[index]; -+ -+ switch (buf->type) { -+ case V4L2_BUF_TYPE_VIDEO_CAPTURE: -+ dprintkrw("capture QBUF index: %d\n", index); -+ set_queued(b); -+ return 0; -+ case V4L2_BUF_TYPE_VIDEO_OUTPUT: -+ dprintkrw("output QBUF pos: %d index: %d\n", -+ dev->write_position, index); -+ if (buf->timestamp.tv_sec == 0 && buf->timestamp.tv_usec == 0) -+ v4l2l_get_timestamp(&b->buffer); -+ else -+ b->buffer.timestamp = buf->timestamp; -+ b->buffer.bytesused = buf->bytesused; -+ set_done(b); -+ buffer_written(dev, b); -+ -+ /* Hopefully fix 'DQBUF return bad index if queue bigger then 2 for capture' -+ https://github.com/umlaeute/v4l2loopback/issues/60 */ -+ buf->flags &= ~V4L2_BUF_FLAG_DONE; -+ buf->flags |= V4L2_BUF_FLAG_QUEUED; -+ -+ wake_up_all(&dev->read_event); -+ return 0; -+ default: -+ return -EINVAL; -+ } -+} -+ -+static int can_read(struct v4l2_loopback_device *dev, -+ struct v4l2_loopback_opener *opener) -+{ -+ int ret; -+ -+ spin_lock_bh(&dev->lock); -+ check_timers(dev); -+ ret = dev->write_position > opener->read_position || -+ dev->reread_count > opener->reread_count || dev->timeout_happened; -+ spin_unlock_bh(&dev->lock); -+ return ret; -+} -+ -+static int get_capture_buffer(struct file *file) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ struct v4l2_loopback_opener *opener = fh_to_opener(file->private_data); -+ int pos, ret; -+ int timeout_happened; -+ -+ if ((file->f_flags & O_NONBLOCK) && -+ (dev->write_position <= opener->read_position && -+ dev->reread_count 
<= opener->reread_count && -+ !dev->timeout_happened)) -+ return -EAGAIN; -+ wait_event_interruptible(dev->read_event, can_read(dev, opener)); -+ -+ spin_lock_bh(&dev->lock); -+ if (dev->write_position == opener->read_position) { -+ if (dev->reread_count > opener->reread_count + 2) -+ opener->reread_count = dev->reread_count - 1; -+ ++opener->reread_count; -+ pos = (opener->read_position + dev->used_buffers - 1) % -+ dev->used_buffers; -+ } else { -+ opener->reread_count = 0; -+ if (dev->write_position > -+ opener->read_position + dev->used_buffers) -+ opener->read_position = dev->write_position - 1; -+ pos = mod_inc(&opener->read_position, dev->used_buffers); -+ } -+ timeout_happened = dev->timeout_happened; -+ dev->timeout_happened = 0; -+ spin_unlock_bh(&dev->lock); -+ -+ ret = dev->bufpos2index[pos]; -+ if (timeout_happened) { -+ if (ret < 0) { -+ dprintk("trying to return not mapped buf[%d]\n", ret); -+ return -EFAULT; -+ } -+ /* although allocated on-demand, timeout_image is freed only -+ * in free_buffers(), so we don't need to worry about it being -+ * deallocated suddenly */ -+ memcpy(dev->image + dev->buffers[ret].buffer.m.offset, -+ dev->timeout_image, dev->buffer_size); -+ } -+ return ret; -+} -+ -+/* put buffer to dequeue -+ * called on VIDIOC_DQBUF -+ */ -+static int vidioc_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ int index; -+ struct v4l2l_buffer *b; -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ if (opener->timeout_image_io) { -+ *buf = dev->timeout_image_buffer.buffer; -+ return 0; -+ } -+ -+ switch (buf->type) { -+ case V4L2_BUF_TYPE_VIDEO_CAPTURE: -+ index = get_capture_buffer(file); -+ if (index < 0) -+ return index; -+ dprintkrw("capture DQBUF pos: %d index: %d\n", -+ opener->read_position - 1, index); -+ if (!(dev->buffers[index].buffer.flags & -+ V4L2_BUF_FLAG_MAPPED)) { -+ dprintk("trying to return not mapped buf[%d]\n", index); -+ return -EINVAL; -+ } -+ unset_flags(&dev->buffers[index]); -+ *buf = dev->buffers[index].buffer; -+ return 0; -+ case V4L2_BUF_TYPE_VIDEO_OUTPUT: -+ b = list_entry(dev->outbufs_list.prev, struct v4l2l_buffer, -+ list_head); -+ list_move_tail(&b->list_head, &dev->outbufs_list); -+ dprintkrw("output DQBUF index: %d\n", b->buffer.index); -+ unset_flags(b); -+ *buf = b->buffer; -+ buf->type = V4L2_BUF_TYPE_VIDEO_OUTPUT; -+ return 0; -+ default: -+ return -EINVAL; -+ } -+} -+ -+/* ------------- STREAMING ------------------- */ -+ -+/* start streaming -+ * called on VIDIOC_STREAMON -+ */ -+static int vidioc_streamon(struct file *file, void *fh, enum v4l2_buf_type type) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ -+ switch (type) { -+ case V4L2_BUF_TYPE_VIDEO_OUTPUT: -+ if (!dev->ready_for_capture) { -+ int ret = allocate_buffers(dev); -+ if (ret < 0) -+ return ret; -+ } -+ opener->type = WRITER; -+ dev->ready_for_output = 0; -+ dev->ready_for_capture++; -+ return 0; -+ case V4L2_BUF_TYPE_VIDEO_CAPTURE: -+ if (!dev->ready_for_capture) -+ return -EIO; -+ opener->type = READER; -+ dev->active_readers++; -+ client_usage_queue_event(dev->vdev); -+ return 0; -+ default: -+ return -EINVAL; -+ } -+ return -EINVAL; -+} -+ -+/* stop streaming -+ * called on VIDIOC_STREAMOFF -+ */ -+static int vidioc_streamoff(struct file *file, void *fh, -+ enum v4l2_buf_type type) -+{ -+ struct v4l2_loopback_device *dev; 
-+ struct v4l2_loopback_opener *opener; -+ -+ MARK(); -+ dprintk("%d\n", type); -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ switch (type) { -+ case V4L2_BUF_TYPE_VIDEO_OUTPUT: -+ if (dev->ready_for_capture > 0) -+ dev->ready_for_capture--; -+ return 0; -+ case V4L2_BUF_TYPE_VIDEO_CAPTURE: -+ if (opener->type == READER) { -+ opener->type = 0; -+ dev->active_readers--; -+ client_usage_queue_event(dev->vdev); -+ } -+ return 0; -+ default: -+ return -EINVAL; -+ } -+ return -EINVAL; -+} -+ -+#ifdef CONFIG_VIDEO_V4L1_COMPAT -+static int vidiocgmbuf(struct file *file, void *fh, struct video_mbuf *p) -+{ -+ struct v4l2_loopback_device *dev; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ p->frames = dev->buffers_number; -+ p->offsets[0] = 0; -+ p->offsets[1] = 0; -+ p->size = dev->buffer_size; -+ return 0; -+} -+#endif -+ -+static void client_usage_queue_event(struct video_device *vdev) -+{ -+ struct v4l2_event ev; -+ struct v4l2_loopback_device *dev; -+ -+ dev = container_of(vdev->v4l2_dev, struct v4l2_loopback_device, -+ v4l2_dev); -+ -+ memset(&ev, 0, sizeof(ev)); -+ ev.type = V4L2_EVENT_PRI_CLIENT_USAGE; -+ ((struct v4l2_event_client_usage *)&ev.u)->count = dev->active_readers; -+ -+ v4l2_event_queue(vdev, &ev); -+} -+ -+static int client_usage_ops_add(struct v4l2_subscribed_event *sev, -+ unsigned elems) -+{ -+ if (!(sev->flags & V4L2_EVENT_SUB_FL_SEND_INITIAL)) -+ return 0; -+ -+ client_usage_queue_event(sev->fh->vdev); -+ return 0; -+} -+ -+static void client_usage_ops_replace(struct v4l2_event *old, -+ const struct v4l2_event *new) -+{ -+ *((struct v4l2_event_client_usage *)&old->u) = -+ *((struct v4l2_event_client_usage *)&new->u); -+} -+ -+static void client_usage_ops_merge(const struct v4l2_event *old, -+ struct v4l2_event *new) -+{ -+ *((struct v4l2_event_client_usage *)&new->u) = -+ *((struct v4l2_event_client_usage *)&old->u); -+} -+ -+const struct v4l2_subscribed_event_ops client_usage_ops = { -+ .add = client_usage_ops_add, -+ .replace = client_usage_ops_replace, -+ .merge = client_usage_ops_merge, -+}; -+ -+static int vidioc_subscribe_event(struct v4l2_fh *fh, -+ const struct v4l2_event_subscription *sub) -+{ -+ switch (sub->type) { -+ case V4L2_EVENT_CTRL: -+ return v4l2_ctrl_subscribe_event(fh, sub); -+ case V4L2_EVENT_PRI_CLIENT_USAGE: -+ return v4l2_event_subscribe(fh, sub, 0, &client_usage_ops); -+ } -+ -+ return -EINVAL; -+} -+ -+/* file operations */ -+static void vm_open(struct vm_area_struct *vma) -+{ -+ struct v4l2l_buffer *buf; -+ MARK(); -+ -+ buf = vma->vm_private_data; -+ buf->use_count++; -+ -+ buf->buffer.flags |= V4L2_BUF_FLAG_MAPPED; -+} -+ -+static void vm_close(struct vm_area_struct *vma) -+{ -+ struct v4l2l_buffer *buf; -+ MARK(); -+ -+ buf = vma->vm_private_data; -+ buf->use_count--; -+ -+ if (buf->use_count <= 0) -+ buf->buffer.flags &= ~V4L2_BUF_FLAG_MAPPED; -+} -+ -+static struct vm_operations_struct vm_ops = { -+ .open = vm_open, -+ .close = vm_close, -+}; -+ -+static int v4l2_loopback_mmap(struct file *file, struct vm_area_struct *vma) -+{ -+ u8 *addr; -+ unsigned long start; -+ unsigned long size; -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ struct v4l2l_buffer *buffer = NULL; -+ MARK(); -+ -+ start = (unsigned long)vma->vm_start; -+ size = (unsigned long)(vma->vm_end - vma->vm_start); -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(file->private_data); -+ -+ if (size > dev->buffer_size) { -+ dprintk("userspace tries to mmap too much, fail\n"); -+ return 
-EINVAL; -+ } -+ if (opener->timeout_image_io) { -+ /* we are going to map the timeout_image_buffer */ -+ if ((vma->vm_pgoff << PAGE_SHIFT) != -+ dev->buffer_size * MAX_BUFFERS) { -+ dprintk("invalid mmap offset for timeout_image_io mode\n"); -+ return -EINVAL; -+ } -+ } else if ((vma->vm_pgoff << PAGE_SHIFT) > -+ dev->buffer_size * (dev->buffers_number - 1)) { -+ dprintk("userspace tries to mmap too far, fail\n"); -+ return -EINVAL; -+ } -+ -+ /* FIXXXXXME: allocation should not happen here! */ -+ if (NULL == dev->image) -+ if (allocate_buffers(dev) < 0) -+ return -EINVAL; -+ -+ if (opener->timeout_image_io) { -+ buffer = &dev->timeout_image_buffer; -+ addr = dev->timeout_image; -+ } else { -+ int i; -+ for (i = 0; i < dev->buffers_number; ++i) { -+ buffer = &dev->buffers[i]; -+ if ((buffer->buffer.m.offset >> PAGE_SHIFT) == -+ vma->vm_pgoff) -+ break; -+ } -+ -+ if (i >= dev->buffers_number) -+ return -EINVAL; -+ -+ addr = dev->image + (vma->vm_pgoff << PAGE_SHIFT); -+ } -+ -+ while (size > 0) { -+ struct page *page; -+ -+ page = vmalloc_to_page(addr); -+ -+ if (vm_insert_page(vma, start, page) < 0) -+ return -EAGAIN; -+ -+ start += PAGE_SIZE; -+ addr += PAGE_SIZE; -+ size -= PAGE_SIZE; -+ } -+ -+ vma->vm_ops = &vm_ops; -+ vma->vm_private_data = buffer; -+ -+ vm_open(vma); -+ -+ MARK(); -+ return 0; -+} -+ -+static unsigned int v4l2_loopback_poll(struct file *file, -+ struct poll_table_struct *pts) -+{ -+ struct v4l2_loopback_opener *opener; -+ struct v4l2_loopback_device *dev; -+ __poll_t req_events = poll_requested_events(pts); -+ int ret_mask = 0; -+ MARK(); -+ -+ opener = fh_to_opener(file->private_data); -+ dev = v4l2loopback_getdevice(file); -+ -+ if (req_events & POLLPRI) { -+ if (!v4l2_event_pending(&opener->fh)) -+ poll_wait(file, &opener->fh.wait, pts); -+ if (v4l2_event_pending(&opener->fh)) { -+ ret_mask |= POLLPRI; -+ if (!(req_events & DEFAULT_POLLMASK)) -+ return ret_mask; -+ } -+ } -+ -+ switch (opener->type) { -+ case WRITER: -+ ret_mask |= POLLOUT | POLLWRNORM; -+ break; -+ case READER: -+ if (!can_read(dev, opener)) { -+ if (ret_mask) -+ return ret_mask; -+ poll_wait(file, &dev->read_event, pts); -+ } -+ if (can_read(dev, opener)) -+ ret_mask |= POLLIN | POLLRDNORM; -+ if (v4l2_event_pending(&opener->fh)) -+ ret_mask |= POLLPRI; -+ break; -+ default: -+ break; -+ } -+ -+ MARK(); -+ return ret_mask; -+} -+ -+/* do not want to limit device opens, it can be as many readers as user want, -+ * writers are limited by means of setting writer field */ -+static int v4l2_loopback_open(struct file *file) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ MARK(); -+ dev = v4l2loopback_getdevice(file); -+ if (dev->open_count.counter >= dev->max_openers) -+ return -EBUSY; -+ /* kfree on close */ -+ opener = kzalloc(sizeof(*opener), GFP_KERNEL); -+ if (opener == NULL) -+ return -ENOMEM; -+ -+ atomic_inc(&dev->open_count); -+ -+ opener->timeout_image_io = dev->timeout_image_io; -+ if (opener->timeout_image_io) { -+ int r = allocate_timeout_image(dev); -+ -+ if (r < 0) { -+ dprintk("timeout image allocation failed\n"); -+ -+ atomic_dec(&dev->open_count); -+ -+ kfree(opener); -+ return r; -+ } -+ } -+ -+ dev->timeout_image_io = 0; -+ -+ v4l2_fh_init(&opener->fh, video_devdata(file)); -+ file->private_data = &opener->fh; -+ -+ v4l2_fh_add(&opener->fh); -+ dprintk("opened dev:%p with image:%p\n", dev, dev ? 
dev->image : NULL); -+ MARK(); -+ return 0; -+} -+ -+static int v4l2_loopback_close(struct file *file) -+{ -+ struct v4l2_loopback_opener *opener; -+ struct v4l2_loopback_device *dev; -+ int is_writer = 0, is_reader = 0; -+ MARK(); -+ -+ opener = fh_to_opener(file->private_data); -+ dev = v4l2loopback_getdevice(file); -+ -+ if (WRITER == opener->type) -+ is_writer = 1; -+ if (READER == opener->type) -+ is_reader = 1; -+ -+ atomic_dec(&dev->open_count); -+ if (dev->open_count.counter == 0) { -+ del_timer_sync(&dev->sustain_timer); -+ del_timer_sync(&dev->timeout_timer); -+ } -+ try_free_buffers(dev); -+ -+ v4l2_fh_del(&opener->fh); -+ v4l2_fh_exit(&opener->fh); -+ -+ kfree(opener); -+ if (is_writer) -+ dev->ready_for_output = 1; -+ if (is_reader) { -+ dev->active_readers--; -+ client_usage_queue_event(dev->vdev); -+ } -+ MARK(); -+ return 0; -+} -+ -+static ssize_t v4l2_loopback_read(struct file *file, char __user *buf, -+ size_t count, loff_t *ppos) -+{ -+ int read_index; -+ struct v4l2_loopback_device *dev; -+ struct v4l2_buffer *b; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ read_index = get_capture_buffer(file); -+ if (read_index < 0) -+ return read_index; -+ if (count > dev->buffer_size) -+ count = dev->buffer_size; -+ b = &dev->buffers[read_index].buffer; -+ if (count > b->bytesused) -+ count = b->bytesused; -+ if (copy_to_user((void *)buf, (void *)(dev->image + b->m.offset), -+ count)) { -+ printk(KERN_ERR -+ "v4l2-loopback: failed copy_to_user() in read buf\n"); -+ return -EFAULT; -+ } -+ dprintkrw("leave v4l2_loopback_read()\n"); -+ return count; -+} -+ -+static ssize_t v4l2_loopback_write(struct file *file, const char __user *buf, -+ size_t count, loff_t *ppos) -+{ -+ struct v4l2_loopback_opener *opener; -+ struct v4l2_loopback_device *dev; -+ int write_index; -+ struct v4l2_buffer *b; -+ int err = 0; -+ -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(file->private_data); -+ -+ if (UNNEGOTIATED == opener->type) { -+ spin_lock(&dev->lock); -+ -+ if (dev->ready_for_output) { -+ err = vidioc_streamon(file, file->private_data, -+ V4L2_BUF_TYPE_VIDEO_OUTPUT); -+ } -+ -+ spin_unlock(&dev->lock); -+ -+ if (err < 0) -+ return err; -+ } -+ -+ if (WRITER != opener->type) -+ return -EINVAL; -+ -+ if (!dev->ready_for_capture) { -+ int ret = allocate_buffers(dev); -+ if (ret < 0) -+ return ret; -+ dev->ready_for_capture = 1; -+ } -+ dprintkrw("v4l2_loopback_write() trying to write %zu bytes\n", count); -+ if (count > dev->buffer_size) -+ count = dev->buffer_size; -+ -+ write_index = dev->write_position % dev->used_buffers; -+ b = &dev->buffers[write_index].buffer; -+ -+ if (copy_from_user((void *)(dev->image + b->m.offset), (void *)buf, -+ count)) { -+ printk(KERN_ERR -+ "v4l2-loopback: failed copy_from_user() in write buf, could not write %zu\n", -+ count); -+ return -EFAULT; -+ } -+ v4l2l_get_timestamp(b); -+ b->bytesused = count; -+ b->sequence = dev->write_position; -+ buffer_written(dev, &dev->buffers[write_index]); -+ wake_up_all(&dev->read_event); -+ dprintkrw("leave v4l2_loopback_write()\n"); -+ return count; -+} -+ -+/* init functions */ -+/* frees buffers, if already allocated */ -+static void free_buffers(struct v4l2_loopback_device *dev) -+{ -+ MARK(); -+ dprintk("freeing image@%p for dev:%p\n", dev ? 
dev->image : NULL, dev); -+ if (!dev) -+ return; -+ if (dev->image) { -+ vfree(dev->image); -+ dev->image = NULL; -+ } -+ if (dev->timeout_image) { -+ vfree(dev->timeout_image); -+ dev->timeout_image = NULL; -+ } -+ dev->imagesize = 0; -+} -+/* frees buffers, if they are no longer needed */ -+static void try_free_buffers(struct v4l2_loopback_device *dev) -+{ -+ MARK(); -+ if (0 == dev->open_count.counter && !dev->keep_format) { -+ free_buffers(dev); -+ dev->ready_for_capture = 0; -+ dev->buffer_size = 0; -+ dev->write_position = 0; -+ } -+} -+/* allocates buffers, if buffer_size is set */ -+static int allocate_buffers(struct v4l2_loopback_device *dev) -+{ -+ int err; -+ -+ MARK(); -+ /* vfree on close file operation in case no open handles left */ -+ -+ if (dev->buffer_size < 1 || dev->buffers_number < 1) -+ return -EINVAL; -+ -+ if ((__LONG_MAX__ / dev->buffer_size) < dev->buffers_number) -+ return -ENOSPC; -+ -+ if (dev->image) { -+ dprintk("allocating buffers again: %ld %ld\n", -+ dev->buffer_size * dev->buffers_number, dev->imagesize); -+ /* FIXME: prevent double allocation more intelligently! */ -+ if (dev->buffer_size * dev->buffers_number == dev->imagesize) -+ return 0; -+ -+ /* if there is only one writer, no problem should occur */ -+ if (dev->open_count.counter == 1) -+ free_buffers(dev); -+ else -+ return -EINVAL; -+ } -+ -+ dev->imagesize = (unsigned long)dev->buffer_size * -+ (unsigned long)dev->buffers_number; -+ -+ dprintk("allocating %ld = %ldx%d\n", dev->imagesize, dev->buffer_size, -+ dev->buffers_number); -+ err = -ENOMEM; -+ -+ if (dev->timeout_jiffies > 0) { -+ err = allocate_timeout_image(dev); -+ if (err < 0) -+ goto error; -+ } -+ -+ dev->image = vmalloc(dev->imagesize); -+ if (dev->image == NULL) -+ goto error; -+ -+ dprintk("vmallocated %ld bytes\n", dev->imagesize); -+ MARK(); -+ -+ init_buffers(dev); -+ return 0; -+ -+error: -+ free_buffers(dev); -+ return err; -+} -+ -+/* init inner buffers, they are capture mode and flags are set as -+ * for capture mod buffers */ -+static void init_buffers(struct v4l2_loopback_device *dev) -+{ -+ int i; -+ int buffer_size; -+ int bytesused; -+ MARK(); -+ -+ buffer_size = dev->buffer_size; -+ bytesused = dev->pix_format.sizeimage; -+ -+ for (i = 0; i < dev->buffers_number; ++i) { -+ struct v4l2_buffer *b = &dev->buffers[i].buffer; -+ b->index = i; -+ b->bytesused = bytesused; -+ b->length = buffer_size; -+ b->field = V4L2_FIELD_NONE; -+ b->flags = 0; -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 6, 1) -+ b->input = 0; -+#endif -+ b->m.offset = i * buffer_size; -+ b->memory = V4L2_MEMORY_MMAP; -+ b->sequence = 0; -+ b->timestamp.tv_sec = 0; -+ b->timestamp.tv_usec = 0; -+ b->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -+ -+ v4l2l_get_timestamp(b); -+ } -+ dev->timeout_image_buffer = dev->buffers[0]; -+ dev->timeout_image_buffer.buffer.m.offset = MAX_BUFFERS * buffer_size; -+ MARK(); -+} -+ -+static int allocate_timeout_image(struct v4l2_loopback_device *dev) -+{ -+ MARK(); -+ if (dev->buffer_size <= 0) -+ return -EINVAL; -+ -+ if (dev->timeout_image == NULL) { -+ dev->timeout_image = vzalloc(dev->buffer_size); -+ if (dev->timeout_image == NULL) -+ return -ENOMEM; -+ } -+ return 0; -+} -+ -+/* fills and register video device */ -+static void init_vdev(struct video_device *vdev, int nr) -+{ -+ MARK(); -+ -+#ifdef V4L2LOOPBACK_WITH_STD -+ vdev->tvnorms = V4L2_STD_ALL; -+#endif /* V4L2LOOPBACK_WITH_STD */ -+ -+ vdev->vfl_type = VFL_TYPE_VIDEO; -+ vdev->fops = &v4l2_loopback_fops; -+ vdev->ioctl_ops = &v4l2_loopback_ioctl_ops; -+ 
vdev->release = &video_device_release; -+ vdev->minor = -1; -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) -+ vdev->device_caps = V4L2_CAP_DEVICE_CAPS | V4L2_CAP_VIDEO_CAPTURE | -+ V4L2_CAP_VIDEO_OUTPUT | V4L2_CAP_READWRITE | -+ V4L2_CAP_STREAMING; -+#endif -+ -+ if (debug > 1) -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 20, 0) -+ vdev->debug = V4L2_DEBUG_IOCTL | V4L2_DEBUG_IOCTL_ARG; -+#else -+ vdev->dev_debug = V4L2_DEV_DEBUG_IOCTL | -+ V4L2_DEV_DEBUG_IOCTL_ARG; -+#endif -+ -+ /* since kernel-3.7, there is a new field 'vfl_dir' that has to be -+ * set to VFL_DIR_M2M for bidirectional devices */ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) -+ vdev->vfl_dir = VFL_DIR_M2M; -+#endif -+ -+ MARK(); -+} -+ -+/* init default capture parameters, only fps may be changed in future */ -+static void init_capture_param(struct v4l2_captureparm *capture_param) -+{ -+ MARK(); -+ capture_param->capability = 0; -+ capture_param->capturemode = 0; -+ capture_param->extendedmode = 0; -+ capture_param->readbuffers = max_buffers; -+ capture_param->timeperframe.numerator = 1; -+ capture_param->timeperframe.denominator = 30; -+} -+ -+static void check_timers(struct v4l2_loopback_device *dev) -+{ -+ if (!dev->ready_for_capture) -+ return; -+ -+ if (dev->timeout_jiffies > 0 && !timer_pending(&dev->timeout_timer)) -+ mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies); -+ if (dev->sustain_framerate && !timer_pending(&dev->sustain_timer)) -+ mod_timer(&dev->sustain_timer, -+ jiffies + dev->frame_jiffies * 3 / 2); -+} -+#ifdef HAVE_TIMER_SETUP -+static void sustain_timer_clb(struct timer_list *t) -+{ -+ struct v4l2_loopback_device *dev = from_timer(dev, t, sustain_timer); -+#else -+static void sustain_timer_clb(unsigned long nr) -+{ -+ struct v4l2_loopback_device *dev = -+ idr_find(&v4l2loopback_index_idr, nr); -+#endif -+ spin_lock(&dev->lock); -+ if (dev->sustain_framerate) { -+ dev->reread_count++; -+ dprintkrw("reread: %d %d\n", dev->write_position, -+ dev->reread_count); -+ if (dev->reread_count == 1) -+ mod_timer(&dev->sustain_timer, -+ jiffies + max(1UL, dev->frame_jiffies / 2)); -+ else -+ mod_timer(&dev->sustain_timer, -+ jiffies + dev->frame_jiffies); -+ wake_up_all(&dev->read_event); -+ } -+ spin_unlock(&dev->lock); -+} -+#ifdef HAVE_TIMER_SETUP -+static void timeout_timer_clb(struct timer_list *t) -+{ -+ struct v4l2_loopback_device *dev = from_timer(dev, t, timeout_timer); -+#else -+static void timeout_timer_clb(unsigned long nr) -+{ -+ struct v4l2_loopback_device *dev = -+ idr_find(&v4l2loopback_index_idr, nr); -+#endif -+ spin_lock(&dev->lock); -+ if (dev->timeout_jiffies > 0) { -+ dev->timeout_happened = 1; -+ mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies); -+ wake_up_all(&dev->read_event); -+ } -+ spin_unlock(&dev->lock); -+} -+ -+/* init loopback main structure */ -+#define DEFAULT_FROM_CONF(confmember, default_condition, default_value) \ -+ ((conf) ? \ -+ ((conf->confmember default_condition) ? (default_value) : \ -+ (conf->confmember)) : \ -+ default_value) -+ -+static int v4l2_loopback_add(struct v4l2_loopback_config *conf, int *ret_nr) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_ctrl_handler *hdl; -+ struct v4l2loopback_private *vdev_priv = NULL; -+ -+ int err = -ENOMEM; -+ -+ int _max_width = DEFAULT_FROM_CONF( -+ max_width, < V4L2LOOPBACK_SIZE_MIN_WIDTH, max_width); -+ int _max_height = DEFAULT_FROM_CONF( -+ max_height, < V4L2LOOPBACK_SIZE_MIN_HEIGHT, max_height); -+ bool _announce_all_caps = (conf && conf->announce_all_caps >= 0) ? 
-+ (conf->announce_all_caps) : -+ V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS; -+ int _max_buffers = DEFAULT_FROM_CONF(max_buffers, <= 0, max_buffers); -+ int _max_openers = DEFAULT_FROM_CONF(max_openers, <= 0, max_openers); -+ -+ int nr = -1; -+ -+ _announce_all_caps = (!!_announce_all_caps); -+ -+ if (conf) { -+ if (conf->capture_nr >= 0 && -+ conf->output_nr == conf->capture_nr) { -+ nr = conf->capture_nr; -+ } else if (conf->capture_nr < 0 && conf->output_nr < 0) { -+ nr = -1; -+ } else if (conf->capture_nr < 0) { -+ nr = conf->output_nr; -+ } else if (conf->output_nr < 0) { -+ nr = conf->capture_nr; -+ } else { -+ printk(KERN_ERR -+ "split OUTPUT and CAPTURE devices not yet supported."); -+ printk(KERN_INFO -+ "both devices must have the same number (%d != %d).", -+ conf->output_nr, conf->capture_nr); -+ return -EINVAL; -+ } -+ } -+ -+ if (idr_find(&v4l2loopback_index_idr, nr)) -+ return -EEXIST; -+ -+ dprintk("creating v4l2loopback-device #%d\n", nr); -+ dev = kzalloc(sizeof(*dev), GFP_KERNEL); -+ if (!dev) -+ return -ENOMEM; -+ -+ /* allocate id, if @id >= 0, we're requesting that specific id */ -+ if (nr >= 0) { -+ err = idr_alloc(&v4l2loopback_index_idr, dev, nr, nr + 1, -+ GFP_KERNEL); -+ if (err == -ENOSPC) -+ err = -EEXIST; -+ } else { -+ err = idr_alloc(&v4l2loopback_index_idr, dev, 0, 0, GFP_KERNEL); -+ } -+ if (err < 0) -+ goto out_free_dev; -+ nr = err; -+ err = -ENOMEM; -+ -+ if (conf && conf->card_label[0]) { -+ snprintf(dev->card_label, sizeof(dev->card_label), "%s", -+ conf->card_label); -+ } else { -+ snprintf(dev->card_label, sizeof(dev->card_label), -+ "Dummy video device (0x%04X)", nr); -+ } -+ snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name), -+ "v4l2loopback-%03d", nr); -+ -+ err = v4l2_device_register(NULL, &dev->v4l2_dev); -+ if (err) -+ goto out_free_idr; -+ MARK(); -+ -+ dev->vdev = video_device_alloc(); -+ if (dev->vdev == NULL) { -+ err = -ENOMEM; -+ goto out_unregister; -+ } -+ -+ vdev_priv = kzalloc(sizeof(struct v4l2loopback_private), GFP_KERNEL); -+ if (vdev_priv == NULL) { -+ err = -ENOMEM; -+ goto out_unregister; -+ } -+ -+ video_set_drvdata(dev->vdev, vdev_priv); -+ if (video_get_drvdata(dev->vdev) == NULL) { -+ err = -ENOMEM; -+ goto out_unregister; -+ } -+ -+ MARK(); -+ snprintf(dev->vdev->name, sizeof(dev->vdev->name), "%s", -+ dev->card_label); -+ -+ vdev_priv->device_nr = nr; -+ -+ init_vdev(dev->vdev, nr); -+ dev->vdev->v4l2_dev = &dev->v4l2_dev; -+ init_capture_param(&dev->capture_param); -+ err = set_timeperframe(dev, &dev->capture_param.timeperframe); -+ if (err) -+ goto out_unregister; -+ dev->keep_format = 0; -+ dev->sustain_framerate = 0; -+ -+ dev->announce_all_caps = _announce_all_caps; -+ dev->max_width = _max_width; -+ dev->max_height = _max_height; -+ dev->max_openers = _max_openers; -+ dev->buffers_number = dev->used_buffers = _max_buffers; -+ -+ dev->write_position = 0; -+ -+ MARK(); -+ spin_lock_init(&dev->lock); -+ INIT_LIST_HEAD(&dev->outbufs_list); -+ if (list_empty(&dev->outbufs_list)) { -+ int i; -+ -+ for (i = 0; i < dev->used_buffers; ++i) -+ list_add_tail(&dev->buffers[i].list_head, -+ &dev->outbufs_list); -+ } -+ memset(dev->bufpos2index, 0, sizeof(dev->bufpos2index)); -+ atomic_set(&dev->open_count, 0); -+ dev->ready_for_capture = 0; -+ dev->ready_for_output = 1; -+ -+ dev->buffer_size = 0; -+ dev->image = NULL; -+ dev->imagesize = 0; -+#ifdef HAVE_TIMER_SETUP -+ timer_setup(&dev->sustain_timer, sustain_timer_clb, 0); -+ timer_setup(&dev->timeout_timer, timeout_timer_clb, 0); -+#else -+ setup_timer(&dev->sustain_timer, 
sustain_timer_clb, nr); -+ setup_timer(&dev->timeout_timer, timeout_timer_clb, nr); -+#endif -+ dev->reread_count = 0; -+ dev->timeout_jiffies = 0; -+ dev->timeout_image = NULL; -+ dev->timeout_happened = 0; -+ -+ hdl = &dev->ctrl_handler; -+ err = v4l2_ctrl_handler_init(hdl, 4); -+ if (err) -+ goto out_unregister; -+ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_keepformat, NULL); -+ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_sustainframerate, NULL); -+ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeout, NULL); -+ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeoutimageio, NULL); -+ if (hdl->error) { -+ err = hdl->error; -+ goto out_free_handler; -+ } -+ dev->v4l2_dev.ctrl_handler = hdl; -+ -+ err = v4l2_ctrl_handler_setup(hdl); -+ if (err) -+ goto out_free_handler; -+ -+ /* FIXME set buffers to 0 */ -+ -+ /* Set initial format */ -+ dev->pix_format.width = 0; /* V4L2LOOPBACK_SIZE_DEFAULT_WIDTH; */ -+ dev->pix_format.height = 0; /* V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT; */ -+ dev->pix_format.pixelformat = formats[0].fourcc; -+ dev->pix_format.colorspace = -+ V4L2_COLORSPACE_SRGB; /* do we need to set this ? */ -+ dev->pix_format.field = V4L2_FIELD_NONE; -+ -+ dev->buffer_size = PAGE_ALIGN(dev->pix_format.sizeimage); -+ dprintk("buffer_size = %ld (=%d)\n", dev->buffer_size, -+ dev->pix_format.sizeimage); -+ -+ if (dev->buffer_size && ((err = allocate_buffers(dev)) < 0)) -+ goto out_free_handler; -+ -+ init_waitqueue_head(&dev->read_event); -+ -+ /* register the device -> it creates /dev/video* */ -+ if (video_register_device(dev->vdev, VFL_TYPE_VIDEO, nr) < 0) { -+ printk(KERN_ERR -+ "v4l2loopback: failed video_register_device()\n"); -+ err = -EFAULT; -+ goto out_free_device; -+ } -+ v4l2loopback_create_sysfs(dev->vdev); -+ -+ MARK(); -+ if (ret_nr) -+ *ret_nr = dev->vdev->num; -+ return 0; -+ -+out_free_device: -+ video_device_release(dev->vdev); -+out_free_handler: -+ v4l2_ctrl_handler_free(&dev->ctrl_handler); -+out_unregister: -+ video_set_drvdata(dev->vdev, NULL); -+ if (vdev_priv != NULL) -+ kfree(vdev_priv); -+ v4l2_device_unregister(&dev->v4l2_dev); -+out_free_idr: -+ idr_remove(&v4l2loopback_index_idr, nr); -+out_free_dev: -+ kfree(dev); -+ return err; -+} -+ -+static void v4l2_loopback_remove(struct v4l2_loopback_device *dev) -+{ -+ free_buffers(dev); -+ v4l2loopback_remove_sysfs(dev->vdev); -+ kfree(video_get_drvdata(dev->vdev)); -+ video_unregister_device(dev->vdev); -+ v4l2_device_unregister(&dev->v4l2_dev); -+ v4l2_ctrl_handler_free(&dev->ctrl_handler); -+ kfree(dev); -+} -+ -+static long v4l2loopback_control_ioctl(struct file *file, unsigned int cmd, -+ unsigned long parm) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_config conf; -+ struct v4l2_loopback_config *confptr = &conf; -+ int device_nr; -+ int ret; -+ -+ ret = mutex_lock_killable(&v4l2loopback_ctl_mutex); -+ if (ret) -+ return ret; -+ -+ ret = -EINVAL; -+ switch (cmd) { -+ default: -+ ret = -ENOSYS; -+ break; -+ /* add a v4l2loopback device (pair), based on the user-provided specs */ -+ case V4L2LOOPBACK_CTL_ADD: -+ if (parm) { -+ if ((ret = copy_from_user(&conf, (void *)parm, -+ sizeof(conf))) < 0) -+ break; -+ } else -+ confptr = NULL; -+ ret = v4l2_loopback_add(confptr, &device_nr); -+ if (ret >= 0) -+ ret = device_nr; -+ break; -+ /* remove a v4l2loopback device (both capture and output) */ -+ case V4L2LOOPBACK_CTL_REMOVE: -+ ret = v4l2loopback_lookup((int)parm, &dev); -+ if (ret >= 0 && dev) { -+ int nr = ret; -+ ret = -EBUSY; -+ if (dev->open_count.counter > 0) -+ break; -+ 
idr_remove(&v4l2loopback_index_idr, nr); -+ v4l2_loopback_remove(dev); -+ ret = 0; -+ }; -+ break; -+ /* get information for a loopback device. -+ * this is mostly about limits (which cannot be queried directly with VIDIOC_G_FMT and friends -+ */ -+ case V4L2LOOPBACK_CTL_QUERY: -+ if (!parm) -+ break; -+ if ((ret = copy_from_user(&conf, (void *)parm, sizeof(conf))) < -+ 0) -+ break; -+ device_nr = (conf.output_nr < 0) ? conf.capture_nr : -+ conf.output_nr; -+ MARK(); -+ /* get the device from either capture_nr or output_nr (whatever is valid) */ -+ if ((ret = v4l2loopback_lookup(device_nr, &dev)) < 0) -+ break; -+ MARK(); -+ /* if we got the device from output_nr and there is a valid capture_nr, -+ * make sure that both refer to the same device (or bail out) -+ */ -+ if ((device_nr != conf.capture_nr) && (conf.capture_nr >= 0) && -+ (ret != v4l2loopback_lookup(conf.capture_nr, 0))) -+ break; -+ MARK(); -+ /* if otoh, we got the device from capture_nr and there is a valid output_nr, -+ * make sure that both refer to the same device (or bail out) -+ */ -+ if ((device_nr != conf.output_nr) && (conf.output_nr >= 0) && -+ (ret != v4l2loopback_lookup(conf.output_nr, 0))) -+ break; -+ MARK(); -+ -+ /* v4l2_loopback_config identified a single device, so fetch the data */ -+ snprintf(conf.card_label, sizeof(conf.card_label), "%s", -+ dev->card_label); -+ MARK(); -+ conf.output_nr = conf.capture_nr = dev->vdev->num; -+ conf.max_width = dev->max_width; -+ conf.max_height = dev->max_height; -+ conf.announce_all_caps = dev->announce_all_caps; -+ conf.max_buffers = dev->buffers_number; -+ conf.max_openers = dev->max_openers; -+ conf.debug = debug; -+ MARK(); -+ if (copy_to_user((void *)parm, &conf, sizeof(conf))) { -+ ret = -EFAULT; -+ break; -+ } -+ MARK(); -+ ret = 0; -+ ; -+ break; -+ } -+ -+ MARK(); -+ mutex_unlock(&v4l2loopback_ctl_mutex); -+ MARK(); -+ return ret; -+} -+ -+/* LINUX KERNEL */ -+ -+static const struct file_operations v4l2loopback_ctl_fops = { -+ // clang-format off -+ .owner = THIS_MODULE, -+ .open = nonseekable_open, -+ .unlocked_ioctl = v4l2loopback_control_ioctl, -+ .compat_ioctl = v4l2loopback_control_ioctl, -+ .llseek = noop_llseek, -+ // clang-format on -+}; -+ -+static struct miscdevice v4l2loopback_misc = { -+ // clang-format off -+ .minor = MISC_DYNAMIC_MINOR, -+ .name = "v4l2loopback", -+ .fops = &v4l2loopback_ctl_fops, -+ // clang-format on -+}; -+ -+static const struct v4l2_file_operations v4l2_loopback_fops = { -+ // clang-format off -+ .owner = THIS_MODULE, -+ .open = v4l2_loopback_open, -+ .release = v4l2_loopback_close, -+ .read = v4l2_loopback_read, -+ .write = v4l2_loopback_write, -+ .poll = v4l2_loopback_poll, -+ .mmap = v4l2_loopback_mmap, -+ .unlocked_ioctl = video_ioctl2, -+ // clang-format on -+}; -+ -+static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops = { -+ // clang-format off -+ .vidioc_querycap = &vidioc_querycap, -+ .vidioc_enum_framesizes = &vidioc_enum_framesizes, -+ .vidioc_enum_frameintervals = &vidioc_enum_frameintervals, -+ -+ .vidioc_enum_output = &vidioc_enum_output, -+ .vidioc_g_output = &vidioc_g_output, -+ .vidioc_s_output = &vidioc_s_output, -+ -+ .vidioc_enum_input = &vidioc_enum_input, -+ .vidioc_g_input = &vidioc_g_input, -+ .vidioc_s_input = &vidioc_s_input, -+ -+ .vidioc_enum_fmt_vid_cap = &vidioc_enum_fmt_cap, -+ .vidioc_g_fmt_vid_cap = &vidioc_g_fmt_cap, -+ .vidioc_s_fmt_vid_cap = &vidioc_s_fmt_cap, -+ .vidioc_try_fmt_vid_cap = &vidioc_try_fmt_cap, -+ -+ .vidioc_enum_fmt_vid_out = &vidioc_enum_fmt_out, -+ .vidioc_s_fmt_vid_out = 
&vidioc_s_fmt_out, -+ .vidioc_g_fmt_vid_out = &vidioc_g_fmt_out, -+ .vidioc_try_fmt_vid_out = &vidioc_try_fmt_out, -+ -+#ifdef V4L2L_OVERLAY -+ .vidioc_s_fmt_vid_overlay = &vidioc_s_fmt_overlay, -+ .vidioc_g_fmt_vid_overlay = &vidioc_g_fmt_overlay, -+#endif -+ -+#ifdef V4L2LOOPBACK_WITH_STD -+ .vidioc_s_std = &vidioc_s_std, -+ .vidioc_g_std = &vidioc_g_std, -+ .vidioc_querystd = &vidioc_querystd, -+#endif /* V4L2LOOPBACK_WITH_STD */ -+ -+ .vidioc_g_parm = &vidioc_g_parm, -+ .vidioc_s_parm = &vidioc_s_parm, -+ -+ .vidioc_reqbufs = &vidioc_reqbufs, -+ .vidioc_querybuf = &vidioc_querybuf, -+ .vidioc_qbuf = &vidioc_qbuf, -+ .vidioc_dqbuf = &vidioc_dqbuf, -+ -+ .vidioc_streamon = &vidioc_streamon, -+ .vidioc_streamoff = &vidioc_streamoff, -+ -+#ifdef CONFIG_VIDEO_V4L1_COMPAT -+ .vidiocgmbuf = &vidiocgmbuf, -+#endif -+ -+ .vidioc_subscribe_event = &vidioc_subscribe_event, -+ .vidioc_unsubscribe_event = &v4l2_event_unsubscribe, -+ // clang-format on -+}; -+ -+static int free_device_cb(int id, void *ptr, void *data) -+{ -+ struct v4l2_loopback_device *dev = ptr; -+ v4l2_loopback_remove(dev); -+ return 0; -+} -+static void free_devices(void) -+{ -+ idr_for_each(&v4l2loopback_index_idr, &free_device_cb, NULL); -+ idr_destroy(&v4l2loopback_index_idr); -+} -+ -+static int __init v4l2loopback_init_module(void) -+{ -+ int err; -+ int i; -+ MARK(); -+ -+ err = misc_register(&v4l2loopback_misc); -+ if (err < 0) -+ return err; -+ -+ if (devices < 0) { -+ devices = 1; -+ -+ /* try guessing the devices from the "video_nr" parameter */ -+ for (i = MAX_DEVICES - 1; i >= 0; i--) { -+ if (video_nr[i] >= 0) { -+ devices = i + 1; -+ break; -+ } -+ } -+ } -+ -+ if (devices > MAX_DEVICES) { -+ devices = MAX_DEVICES; -+ printk(KERN_INFO -+ "v4l2loopback: number of initial devices is limited to: %d\n", -+ MAX_DEVICES); -+ } -+ -+ if (max_buffers > MAX_BUFFERS) { -+ max_buffers = MAX_BUFFERS; -+ printk(KERN_INFO -+ "v4l2loopback: number of buffers is limited to: %d\n", -+ MAX_BUFFERS); -+ } -+ -+ if (max_openers < 0) { -+ printk(KERN_INFO -+ "v4l2loopback: allowing %d openers rather than %d\n", -+ 2, max_openers); -+ max_openers = 2; -+ } -+ -+ if (max_width < V4L2LOOPBACK_SIZE_MIN_WIDTH) { -+ max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH; -+ printk(KERN_INFO "v4l2loopback: using max_width %d\n", -+ max_width); -+ } -+ if (max_height < V4L2LOOPBACK_SIZE_MIN_HEIGHT) { -+ max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT; -+ printk(KERN_INFO "v4l2loopback: using max_height %d\n", -+ max_height); -+ } -+ -+ /* kfree on module release */ -+ for (i = 0; i < devices; i++) { -+ struct v4l2_loopback_config cfg = { -+ // clang-format off -+ .output_nr = video_nr[i], -+ .capture_nr = video_nr[i], -+ .max_width = max_width, -+ .max_height = max_height, -+ .announce_all_caps = (!exclusive_caps[i]), -+ .max_buffers = max_buffers, -+ .max_openers = max_openers, -+ .debug = debug, -+ // clang-format on -+ }; -+ cfg.card_label[0] = 0; -+ if (card_label[i]) -+ snprintf(cfg.card_label, sizeof(cfg.card_label), "%s", -+ card_label[i]); -+ err = v4l2_loopback_add(&cfg, 0); -+ if (err) { -+ free_devices(); -+ goto error; -+ } -+ } -+ -+ dprintk("module installed\n"); -+ -+ printk(KERN_INFO "v4l2loopback driver version %d.%d.%d%s loaded\n", -+ // clang-format off -+ (V4L2LOOPBACK_VERSION_CODE >> 16) & 0xff, -+ (V4L2LOOPBACK_VERSION_CODE >> 8) & 0xff, -+ (V4L2LOOPBACK_VERSION_CODE ) & 0xff, -+#ifdef SNAPSHOT_VERSION -+ " (" __stringify(SNAPSHOT_VERSION) ")" -+#else -+ "" -+#endif -+ ); -+ // clang-format on -+ -+ return 0; -+error: -+ 
misc_deregister(&v4l2loopback_misc); -+ return err; -+} -+ -+static void v4l2loopback_cleanup_module(void) -+{ -+ MARK(); -+ /* unregister the device -> it deletes /dev/video* */ -+ free_devices(); -+ /* and get rid of /dev/v4l2loopback */ -+ misc_deregister(&v4l2loopback_misc); -+ dprintk("module removed\n"); -+} -+ -+MODULE_ALIAS_MISCDEV(MISC_DYNAMIC_MINOR); -+ -+module_init(v4l2loopback_init_module); -+module_exit(v4l2loopback_cleanup_module); -diff --git a/drivers/media/v4l2-core/v4l2loopback.h b/drivers/media/v4l2-core/v4l2loopback.h -new file mode 100644 -index 000000000000..10f8e662d37a ---- /dev/null -+++ b/drivers/media/v4l2-core/v4l2loopback.h -@@ -0,0 +1,96 @@ -+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ -+/* -+ * v4l2loopback.h -+ * -+ * Written by IOhannes m zmölnig, 7/1/20. -+ * -+ * Copyright 2020 by IOhannes m zmölnig. Redistribution of this file is -+ * permitted under the GNU General Public License. -+ */ -+#ifndef _V4L2LOOPBACK_H -+#define _V4L2LOOPBACK_H -+ -+#define V4L2LOOPBACK_VERSION_MAJOR 0 -+#define V4L2LOOPBACK_VERSION_MINOR 12 -+#define V4L2LOOPBACK_VERSION_BUGFIX 7 -+ -+/* /dev/v4l2loopback interface */ -+ -+struct v4l2_loopback_config { -+ /** -+ * the device-number (/dev/video) -+ * V4L2LOOPBACK_CTL_ADD: -+ * setting this to a value<0, will allocate an available one -+ * if nr>=0 and the device already exists, the ioctl will EEXIST -+ * if output_nr and capture_nr are the same, only a single device will be created -+ * NOTE: currently split-devices (where output_nr and capture_nr differ) -+ * are not implemented yet. -+ * until then, requesting different device-IDs will result in EINVAL. -+ * -+ * V4L2LOOPBACK_CTL_QUERY: -+ * either both output_nr and capture_nr must refer to the same loopback, -+ * or one (and only one) of them must be -1 -+ * -+ */ -+ int output_nr; -+ int capture_nr; -+ -+ /** -+ * a nice name for your device -+ * if (*card_label)==0, an automatic name is assigned -+ */ -+ char card_label[32]; -+ -+ /** -+ * maximum allowed frame size -+ * if too low, default values are used -+ */ -+ int max_width; -+ int max_height; -+ -+ /** -+ * number of buffers to allocate for the queue -+ * if set to <=0, default values are used -+ */ -+ int max_buffers; -+ -+ /** -+ * how many consumers are allowed to open this device concurrently -+ * if set to <=0, default values are used -+ */ -+ int max_openers; -+ -+ /** -+ * set the debugging level for this device -+ */ -+ int debug; -+ -+ /** -+ * whether to announce OUTPUT/CAPTURE capabilities exclusively -+ * for this device or not -+ * (!exclusive_caps) -+ * NOTE: this is going to be removed once separate output/capture -+ * devices are implemented -+ */ -+ int announce_all_caps; -+}; -+ -+/* a pointer to a (struct v4l2_loopback_config) that has all values you wish to impose on the -+ * to-be-created device set. -+ * if the ptr is NULL, a new device is created with default values at the driver's discretion. 
-+ * -+ * returns the device_nr of the OUTPUT device (which can be used with V4L2LOOPBACK_CTL_QUERY, -+ * to get more information on the device) -+ */ -+#define V4L2LOOPBACK_CTL_ADD 0x4C80 -+ -+/* a pointer to a (struct v4l2_loopback_config) that has output_nr and/or capture_nr set -+ * (the two values must either refer to video-devices associated with the same loopback device -+ * or exactly one of them must be <0 -+ */ -+#define V4L2LOOPBACK_CTL_QUERY 0x4C82 -+ -+/* the device-number (either CAPTURE or OUTPUT) associated with the loopback-device */ -+#define V4L2LOOPBACK_CTL_REMOVE 0x4C81 -+ -+#endif /* _V4L2LOOPBACK_H */ -diff --git a/drivers/media/v4l2-core/v4l2loopback_formats.h b/drivers/media/v4l2-core/v4l2loopback_formats.h -new file mode 100644 -index 000000000000..d855a3796554 ---- /dev/null -+++ b/drivers/media/v4l2-core/v4l2loopback_formats.h -@@ -0,0 +1,445 @@ -+static const struct v4l2l_format formats[] = { -+#ifndef V4L2_PIX_FMT_VP9 -+#define V4L2_PIX_FMT_VP9 v4l2_fourcc('V', 'P', '9', '0') -+#endif -+#ifndef V4L2_PIX_FMT_HEVC -+#define V4L2_PIX_FMT_HEVC v4l2_fourcc('H', 'E', 'V', 'C') -+#endif -+ -+ /* here come the packed formats */ -+ { -+ .name = "32 bpp RGB, le", -+ .fourcc = V4L2_PIX_FMT_BGR32, -+ .depth = 32, -+ .flags = 0, -+ }, -+ { -+ .name = "32 bpp RGB, be", -+ .fourcc = V4L2_PIX_FMT_RGB32, -+ .depth = 32, -+ .flags = 0, -+ }, -+ { -+ .name = "24 bpp RGB, le", -+ .fourcc = V4L2_PIX_FMT_BGR24, -+ .depth = 24, -+ .flags = 0, -+ }, -+ { -+ .name = "24 bpp RGB, be", -+ .fourcc = V4L2_PIX_FMT_RGB24, -+ .depth = 24, -+ .flags = 0, -+ }, -+#ifdef V4L2_PIX_FMT_ABGR32 -+ { -+ .name = "32 bpp RGBA, le", -+ .fourcc = V4L2_PIX_FMT_ABGR32, -+ .depth = 32, -+ .flags = 0, -+ }, -+#endif -+#ifdef V4L2_PIX_FMT_RGBA32 -+ { -+ .name = "32 bpp RGBA", -+ .fourcc = V4L2_PIX_FMT_RGBA32, -+ .depth = 32, -+ .flags = 0, -+ }, -+#endif -+#ifdef V4L2_PIX_FMT_RGB332 -+ { -+ .name = "8 bpp RGB-3-3-2", -+ .fourcc = V4L2_PIX_FMT_RGB332, -+ .depth = 8, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB332 */ -+#ifdef V4L2_PIX_FMT_RGB444 -+ { -+ .name = "16 bpp RGB (xxxxrrrr ggggbbbb)", -+ .fourcc = V4L2_PIX_FMT_RGB444, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB444 */ -+#ifdef V4L2_PIX_FMT_RGB555 -+ { -+ .name = "16 bpp RGB-5-5-5", -+ .fourcc = V4L2_PIX_FMT_RGB555, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB555 */ -+#ifdef V4L2_PIX_FMT_RGB565 -+ { -+ .name = "16 bpp RGB-5-6-5", -+ .fourcc = V4L2_PIX_FMT_RGB565, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB565 */ -+#ifdef V4L2_PIX_FMT_RGB555X -+ { -+ .name = "16 bpp RGB-5-5-5 BE", -+ .fourcc = V4L2_PIX_FMT_RGB555X, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB555X */ -+#ifdef V4L2_PIX_FMT_RGB565X -+ { -+ .name = "16 bpp RGB-5-6-5 BE", -+ .fourcc = V4L2_PIX_FMT_RGB565X, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB565X */ -+#ifdef V4L2_PIX_FMT_BGR666 -+ { -+ .name = "18 bpp BGR-6-6-6", -+ .fourcc = V4L2_PIX_FMT_BGR666, -+ .depth = 18, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_BGR666 */ -+ { -+ .name = "4:2:2, packed, YUYV", -+ .fourcc = V4L2_PIX_FMT_YUYV, -+ .depth = 16, -+ .flags = 0, -+ }, -+ { -+ .name = "4:2:2, packed, UYVY", -+ .fourcc = V4L2_PIX_FMT_UYVY, -+ .depth = 16, -+ .flags = 0, -+ }, -+#ifdef V4L2_PIX_FMT_YVYU -+ { -+ .name = "4:2:2, packed YVYU", -+ .fourcc = V4L2_PIX_FMT_YVYU, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif -+#ifdef V4L2_PIX_FMT_VYUY -+ { -+ .name = "4:2:2, packed VYUY", -+ .fourcc = V4L2_PIX_FMT_VYUY, -+ .depth 
= 16, -+ .flags = 0, -+ }, -+#endif -+ { -+ .name = "4:2:2, packed YYUV", -+ .fourcc = V4L2_PIX_FMT_YYUV, -+ .depth = 16, -+ .flags = 0, -+ }, -+ { -+ .name = "YUV-8-8-8-8", -+ .fourcc = V4L2_PIX_FMT_YUV32, -+ .depth = 32, -+ .flags = 0, -+ }, -+ { -+ .name = "8 bpp, Greyscale", -+ .fourcc = V4L2_PIX_FMT_GREY, -+ .depth = 8, -+ .flags = 0, -+ }, -+#ifdef V4L2_PIX_FMT_Y4 -+ { -+ .name = "4 bpp Greyscale", -+ .fourcc = V4L2_PIX_FMT_Y4, -+ .depth = 4, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_Y4 */ -+#ifdef V4L2_PIX_FMT_Y6 -+ { -+ .name = "6 bpp Greyscale", -+ .fourcc = V4L2_PIX_FMT_Y6, -+ .depth = 6, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_Y6 */ -+#ifdef V4L2_PIX_FMT_Y10 -+ { -+ .name = "10 bpp Greyscale", -+ .fourcc = V4L2_PIX_FMT_Y10, -+ .depth = 10, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_Y10 */ -+#ifdef V4L2_PIX_FMT_Y12 -+ { -+ .name = "12 bpp Greyscale", -+ .fourcc = V4L2_PIX_FMT_Y12, -+ .depth = 12, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_Y12 */ -+ { -+ .name = "16 bpp, Greyscale", -+ .fourcc = V4L2_PIX_FMT_Y16, -+ .depth = 16, -+ .flags = 0, -+ }, -+#ifdef V4L2_PIX_FMT_YUV444 -+ { -+ .name = "16 bpp xxxxyyyy uuuuvvvv", -+ .fourcc = V4L2_PIX_FMT_YUV444, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_YUV444 */ -+#ifdef V4L2_PIX_FMT_YUV555 -+ { -+ .name = "16 bpp YUV-5-5-5", -+ .fourcc = V4L2_PIX_FMT_YUV555, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_YUV555 */ -+#ifdef V4L2_PIX_FMT_YUV565 -+ { -+ .name = "16 bpp YUV-5-6-5", -+ .fourcc = V4L2_PIX_FMT_YUV565, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_YUV565 */ -+ -+/* bayer formats */ -+#ifdef V4L2_PIX_FMT_SRGGB8 -+ { -+ .name = "Bayer RGGB 8bit", -+ .fourcc = V4L2_PIX_FMT_SRGGB8, -+ .depth = 8, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_SRGGB8 */ -+#ifdef V4L2_PIX_FMT_SGRBG8 -+ { -+ .name = "Bayer GRBG 8bit", -+ .fourcc = V4L2_PIX_FMT_SGRBG8, -+ .depth = 8, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_SGRBG8 */ -+#ifdef V4L2_PIX_FMT_SGBRG8 -+ { -+ .name = "Bayer GBRG 8bit", -+ .fourcc = V4L2_PIX_FMT_SGBRG8, -+ .depth = 8, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_SGBRG8 */ -+#ifdef V4L2_PIX_FMT_SBGGR8 -+ { -+ .name = "Bayer BA81 8bit", -+ .fourcc = V4L2_PIX_FMT_SBGGR8, -+ .depth = 8, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_SBGGR8 */ -+ -+ /* here come the planar formats */ -+ { -+ .name = "4:1:0, planar, Y-Cr-Cb", -+ .fourcc = V4L2_PIX_FMT_YVU410, -+ .depth = 9, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+ { -+ .name = "4:2:0, planar, Y-Cr-Cb", -+ .fourcc = V4L2_PIX_FMT_YVU420, -+ .depth = 12, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+ { -+ .name = "4:1:0, planar, Y-Cb-Cr", -+ .fourcc = V4L2_PIX_FMT_YUV410, -+ .depth = 9, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+ { -+ .name = "4:2:0, planar, Y-Cb-Cr", -+ .fourcc = V4L2_PIX_FMT_YUV420, -+ .depth = 12, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+#ifdef V4L2_PIX_FMT_YUV422P -+ { -+ .name = "16 bpp YVU422 planar", -+ .fourcc = V4L2_PIX_FMT_YUV422P, -+ .depth = 16, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+#endif /* V4L2_PIX_FMT_YUV422P */ -+#ifdef V4L2_PIX_FMT_YUV411P -+ { -+ .name = "16 bpp YVU411 planar", -+ .fourcc = V4L2_PIX_FMT_YUV411P, -+ .depth = 16, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+#endif /* V4L2_PIX_FMT_YUV411P */ -+#ifdef V4L2_PIX_FMT_Y41P -+ { -+ .name = "12 bpp YUV 4:1:1", -+ .fourcc = V4L2_PIX_FMT_Y41P, -+ .depth = 12, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+#endif /* V4L2_PIX_FMT_Y41P */ -+#ifdef V4L2_PIX_FMT_NV12 -+ { -+ .name = "12 bpp Y/CbCr 4:2:0 ", -+ .fourcc = V4L2_PIX_FMT_NV12, -+ 
.depth = 12, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+#endif /* V4L2_PIX_FMT_NV12 */ -+ -+/* here come the compressed formats */ -+ -+#ifdef V4L2_PIX_FMT_MJPEG -+ { -+ .name = "Motion-JPEG", -+ .fourcc = V4L2_PIX_FMT_MJPEG, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_MJPEG */ -+#ifdef V4L2_PIX_FMT_JPEG -+ { -+ .name = "JFIF JPEG", -+ .fourcc = V4L2_PIX_FMT_JPEG, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_JPEG */ -+#ifdef V4L2_PIX_FMT_DV -+ { -+ .name = "DV1394", -+ .fourcc = V4L2_PIX_FMT_DV, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_DV */ -+#ifdef V4L2_PIX_FMT_MPEG -+ { -+ .name = "MPEG-1/2/4 Multiplexed", -+ .fourcc = V4L2_PIX_FMT_MPEG, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_MPEG */ -+#ifdef V4L2_PIX_FMT_H264 -+ { -+ .name = "H264 with start codes", -+ .fourcc = V4L2_PIX_FMT_H264, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_H264 */ -+#ifdef V4L2_PIX_FMT_H264_NO_SC -+ { -+ .name = "H264 without start codes", -+ .fourcc = V4L2_PIX_FMT_H264_NO_SC, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_H264_NO_SC */ -+#ifdef V4L2_PIX_FMT_H264_MVC -+ { -+ .name = "H264 MVC", -+ .fourcc = V4L2_PIX_FMT_H264_MVC, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_H264_MVC */ -+#ifdef V4L2_PIX_FMT_H263 -+ { -+ .name = "H263", -+ .fourcc = V4L2_PIX_FMT_H263, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_H263 */ -+#ifdef V4L2_PIX_FMT_MPEG1 -+ { -+ .name = "MPEG-1 ES", -+ .fourcc = V4L2_PIX_FMT_MPEG1, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_MPEG1 */ -+#ifdef V4L2_PIX_FMT_MPEG2 -+ { -+ .name = "MPEG-2 ES", -+ .fourcc = V4L2_PIX_FMT_MPEG2, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_MPEG2 */ -+#ifdef V4L2_PIX_FMT_MPEG4 -+ { -+ .name = "MPEG-4 part 2 ES", -+ .fourcc = V4L2_PIX_FMT_MPEG4, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_MPEG4 */ -+#ifdef V4L2_PIX_FMT_XVID -+ { -+ .name = "Xvid", -+ .fourcc = V4L2_PIX_FMT_XVID, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_XVID */ -+#ifdef V4L2_PIX_FMT_VC1_ANNEX_G -+ { -+ .name = "SMPTE 421M Annex G compliant stream", -+ .fourcc = V4L2_PIX_FMT_VC1_ANNEX_G, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_VC1_ANNEX_G */ -+#ifdef V4L2_PIX_FMT_VC1_ANNEX_L -+ { -+ .name = "SMPTE 421M Annex L compliant stream", -+ .fourcc = V4L2_PIX_FMT_VC1_ANNEX_L, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_VC1_ANNEX_L */ -+#ifdef V4L2_PIX_FMT_VP8 -+ { -+ .name = "VP8", -+ .fourcc = V4L2_PIX_FMT_VP8, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_VP8 */ -+#ifdef V4L2_PIX_FMT_VP9 -+ { -+ .name = "VP9", -+ .fourcc = V4L2_PIX_FMT_VP9, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_VP9 */ -+#ifdef V4L2_PIX_FMT_HEVC -+ { -+ .name = "HEVC", -+ .fourcc = V4L2_PIX_FMT_HEVC, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_HEVC */ -+}; --- -2.40.0.rc2 +2.40.0 diff --git a/patches/0002-eevdf.patch b/patches/0002-eevdf.patch index 11213cb..0df5c7b 100644 --- a/patches/0002-eevdf.patch +++ b/patches/0002-eevdf.patch @@ -1,80 +1,29 @@ -From 
b6d3ec3be2639fe928a09b558e979c36b41ea63b Mon Sep 17 00:00:00 2001 +From 40a2f9f3e7e56936385c5a97957cd43fbb85fd32 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sat, 11 Mar 2023 18:42:39 +0100 +Date: Sun, 9 Apr 2023 21:35:07 +0200 Subject: [PATCH] EEVDF -Ever since looking at the latency-nice patches, I've wondered if EEVDF would -not make more sense, and I did point Vincent at some older patches I had for -that (which is here his augmented rbtree thing comes from). - -Also, since I really dislike the dual tree, I also figured we could dynamically -switch between an augmented tree and not (and while I have code for that, -that's not included in this posting because with the current results I don't -think we actually need this). - -Anyway, since I'm somewhat under the weather, I spend last week desperately -trying to connect a small cluster of neurons in defiance of the snot overlord -and bring back the EEVDF patches from the dark crypts where they'd been -gathering cobwebs for the past 13 odd years. - -By friday they worked well enough, and this morning (because obviously I forgot -the weekend is ideal to run benchmarks) I ran a bunch of hackbenck, netperf, -tbench and sysbench -- there's a bunch of wins and losses, but nothing that -indicates a total fail. - -( in fact, some of the schbench results seem to indicate EEVDF schedules a lot - more consistent than CFS and has a bunch of latency wins ) - -( hackbench also doesn't show the augmented tree and generally more expensive - pick to be a loss, in fact it shows a slight win here ) - - hackbech load + cyclictest --policy other results: - - EEVDF CFS - - # Min Latencies: 00053 - LNICE(19) # Avg Latencies: 04350 - # Max Latencies: 76019 - - # Min Latencies: 00052 00053 - LNICE(0) # Avg Latencies: 00690 00687 - # Max Latencies: 14145 13913 - - # Min Latencies: 00019 - LNICE(-19) # Avg Latencies: 00261 - # Max Latencies: 05642 - -The nice -19 numbers aren't as pretty as Vincent's, but at the end I was going -cross-eyed from staring at tree prints and I just couldn't figure out where it -was going side-ways. - -There's definitely more benchmarking/tweaking to be done (0-day already -reported a stress-ng loss), but if we can pull this off we can delete a whole -much of icky heuristics code. EEVDF is a much better defined policy than what -we currently have. 
- Signed-off-by: Peter Jung --- - Documentation/admin-guide/cgroup-v2.rst | 10 + - include/linux/rbtree_augmented.h | 26 ++ - include/linux/sched.h | 8 + - include/linux/sched/prio.h | 27 ++ - include/uapi/linux/sched.h | 4 +- - include/uapi/linux/sched/types.h | 19 + - init/init_task.c | 1 + - kernel/sched/core.c | 66 ++++ - kernel/sched/debug.c | 39 +- - kernel/sched/fair.c | 486 ++++++++++++++++++++---- - kernel/sched/features.h | 10 +- - kernel/sched/sched.h | 12 + - tools/include/uapi/linux/sched.h | 4 +- - 13 files changed, 614 insertions(+), 98 deletions(-) + Documentation/admin-guide/cgroup-v2.rst | 10 + + include/linux/rbtree_augmented.h | 26 + + include/linux/sched.h | 9 +- + include/uapi/linux/sched.h | 4 +- + include/uapi/linux/sched/types.h | 19 + + init/init_task.c | 3 +- + kernel/sched/core.c | 67 +- + kernel/sched/debug.c | 50 +- + kernel/sched/fair.c | 1171 ++++++++++------------- + kernel/sched/features.h | 28 +- + kernel/sched/sched.h | 23 +- + tools/include/uapi/linux/sched.h | 4 +- + 12 files changed, 697 insertions(+), 717 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst -index 74cec76be9f2..2e511d4a4c6a 100644 +index f67c0829350b..a39dfda3d032 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst -@@ -1118,6 +1118,16 @@ All time durations are in microseconds. +@@ -1121,6 +1121,16 @@ All time durations are in microseconds. values similar to the sched_setattr(2). This maximum utilization value is used to clamp the task specific maximum utilization clamp. @@ -129,7 +78,7 @@ index d1c53e9d8c75..a78e692a9ff5 100644 * Template for declaring augmented rbtree callbacks (generic case) * diff --git a/include/linux/sched.h b/include/linux/sched.h -index 28ce1be0ba47..764df627c243 100644 +index 6d398b337b0d..6a719374f688 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -548,6 +548,9 @@ struct sched_entity { @@ -142,25 +91,21 @@ index 28ce1be0ba47..764df627c243 100644 struct list_head group_node; unsigned int on_rq; -@@ -555,6 +558,8 @@ struct sched_entity { +@@ -555,11 +558,10 @@ struct sched_entity { u64 sum_exec_runtime; u64 vruntime; u64 prev_sum_exec_runtime; -+ s64 lag; ++ s64 vlag; + u64 slice; u64 nr_migrations; - u64 prev_sleep_sum_runtime; -@@ -571,6 +576,8 @@ struct sched_entity { - /* cached value of my_q->h_nr_running */ - unsigned long runnable_weight; - #endif -+ /* preemption offset in ns */ -+ long latency_offset; +- u64 prev_sleep_sum_runtime; +- /* average duration of a task */ +- u64 dur_avg; - #ifdef CONFIG_SMP - /* -@@ -787,6 +794,7 @@ struct task_struct { + #ifdef CONFIG_FAIR_GROUP_SCHED + int depth; +@@ -787,6 +789,7 @@ struct task_struct { int static_prio; int normal_prio; unsigned int rt_priority; @@ -168,42 +113,6 @@ index 28ce1be0ba47..764df627c243 100644 struct sched_entity se; struct sched_rt_entity rt; -diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h -index ab83d85e1183..be79503d86af 100644 ---- a/include/linux/sched/prio.h -+++ b/include/linux/sched/prio.h -@@ -42,4 +42,31 @@ static inline long rlimit_to_nice(long prio) - return (MAX_NICE - prio + 1); - } - -+/* -+ * Latency nice is meant to provide scheduler hints about the relative -+ * latency requirements of a task with respect to other tasks. -+ * Thus a task with latency_nice == 19 can be hinted as the task with no -+ * latency requirements, in contrast to the task with latency_nice == -20 -+ * which should be given priority in terms of lower latency. 
-+ */ -+#define MAX_LATENCY_NICE 19 -+#define MIN_LATENCY_NICE -20 -+ -+#define LATENCY_NICE_WIDTH \ -+ (MAX_LATENCY_NICE - MIN_LATENCY_NICE + 1) -+ -+/* -+ * Default tasks should be treated as a task with latency_nice = 0. -+ */ -+#define DEFAULT_LATENCY_NICE 0 -+#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2) -+ -+/* -+ * Convert user-nice values [ -20 ... 0 ... 19 ] -+ * to static latency [ 0..39 ], -+ * and back. -+ */ -+#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO) -+#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO) -+ - #endif /* _LINUX_SCHED_PRIO_H */ diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 3bac0a8ceab2..b2e932c25be6 100644 --- a/include/uapi/linux/sched.h @@ -270,96 +179,113 @@ index f2c4589d4dbf..db1e8199e8c8 100644 #endif /* _UAPI_LINUX_SCHED_TYPES_H */ diff --git a/init/init_task.c b/init/init_task.c -index ff6c4b9bfe6b..071deff8dbd1 100644 +index ff6c4b9bfe6b..511cbcf3510d 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -78,6 +78,7 @@ struct task_struct init_task .prio = MAX_PRIO - 20, .static_prio = MAX_PRIO - 20, .normal_prio = MAX_PRIO - 20, -+ .latency_prio = DEFAULT_LATENCY_PRIO, ++ .latency_prio = DEFAULT_PRIO, .policy = SCHED_NORMAL, .cpus_ptr = &init_task.cpus_mask, .user_cpus_ptr = NULL, +@@ -89,7 +90,7 @@ struct task_struct init_task + .fn = do_no_restart_syscall, + }, + .se = { +- .group_node = LIST_HEAD_INIT(init_task.se.group_node), ++ .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, + .rt = { + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 5237639786b7..9db5f9ec9022 100644 +index 17bb9637f314..fbc08605b068 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -1283,6 +1283,11 @@ static void set_load_weight(struct task_struct *p, bool update_load) +@@ -1285,6 +1285,12 @@ static void set_load_weight(struct task_struct *p, bool update_load) } } -+static void set_latency_offset(struct task_struct *p) ++static inline void set_latency_prio(struct task_struct *p, int prio) +{ -+ p->se.latency_offset = calc_latency_offset(p->latency_prio); ++ p->latency_prio = prio; ++ set_latency_fair(&p->se, prio - MAX_RT_PRIO); +} + #ifdef CONFIG_UCLAMP_TASK /* * Serializes updates of utilization clamp values -@@ -4431,8 +4436,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4434,10 +4440,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; p->se.vruntime = 0; - p->se.dur_avg = 0; - p->se.prev_sleep_sum_runtime = 0; -+ p->se.lag = 0; +- p->se.dur_avg = 0; +- p->se.prev_sleep_sum_runtime = 0; ++ p->se.vlag = 0; INIT_LIST_HEAD(&p->se.group_node); -+ set_latency_offset(p); ++ set_latency_prio(p, p->latency_prio); + #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; #endif -@@ -4684,6 +4692,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4688,6 +4695,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + p->prio = p->normal_prio = p->static_prio; set_load_weight(p, false); ++ set_latency_prio(p, NICE_TO_PRIO(0)); -+ p->latency_prio = NICE_TO_LATENCY(0); -+ set_latency_offset(p); -+ /* * We don't need the reset flag anymore after the fork. 
It has - * fulfilled its duty: -@@ -7446,6 +7457,15 @@ static void __setscheduler_params(struct task_struct *p, +@@ -7433,7 +7441,7 @@ static struct task_struct *find_process_by_pid(pid_t pid) + #define SETPARAM_POLICY -1 + + static void __setscheduler_params(struct task_struct *p, +- const struct sched_attr *attr) ++ const struct sched_attr *attr) + { + int policy = attr->sched_policy; + +@@ -7457,6 +7465,13 @@ static void __setscheduler_params(struct task_struct *p, set_load_weight(p, true); } +static void __setscheduler_latency(struct task_struct *p, -+ const struct sched_attr *attr) ++ const struct sched_attr *attr) +{ -+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { -+ p->latency_prio = NICE_TO_LATENCY(attr->sched_latency_nice); -+ set_latency_offset(p); -+ } ++ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) ++ set_latency_prio(p, NICE_TO_PRIO(attr->sched_latency_nice)); +} + /* * Check the target process has a UID that matches the current process's: */ -@@ -7586,6 +7606,13 @@ static int __sched_setscheduler(struct task_struct *p, +@@ -7597,6 +7612,13 @@ static int __sched_setscheduler(struct task_struct *p, return retval; } + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { -+ if (attr->sched_latency_nice > MAX_LATENCY_NICE) ++ if (attr->sched_latency_nice > MAX_NICE) + return -EINVAL; -+ if (attr->sched_latency_nice < MIN_LATENCY_NICE) ++ if (attr->sched_latency_nice < MIN_NICE) + return -EINVAL; + } + if (pi) cpuset_read_lock(); -@@ -7620,6 +7647,9 @@ static int __sched_setscheduler(struct task_struct *p, +@@ -7631,6 +7653,9 @@ static int __sched_setscheduler(struct task_struct *p, goto change; if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) goto change; + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE && -+ attr->sched_latency_nice != LATENCY_TO_NICE(p->latency_prio)) ++ attr->sched_latency_nice != PRIO_TO_NICE(p->latency_prio)) + goto change; p->sched_reset_on_fork = reset_on_fork; retval = 0; -@@ -7708,6 +7738,7 @@ static int __sched_setscheduler(struct task_struct *p, +@@ -7719,6 +7744,7 @@ static int __sched_setscheduler(struct task_struct *p, __setscheduler_params(p, attr); __setscheduler_prio(p, newprio); } @@ -367,7 +293,7 @@ index 5237639786b7..9db5f9ec9022 100644 __setscheduler_uclamp(p, attr); if (queued) { -@@ -7918,6 +7949,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a +@@ -7929,6 +7955,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a size < SCHED_ATTR_SIZE_VER1) return -EINVAL; @@ -377,16 +303,16 @@ index 5237639786b7..9db5f9ec9022 100644 /* * XXX: Do we want to be lenient like existing syscalls; or do we want * to be strict and return an error on out-of-bounds values? 
-@@ -8155,6 +8189,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, +@@ -8166,6 +8195,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, get_params(p, &kattr); kattr.sched_flags &= SCHED_FLAG_ALL; -+ kattr.sched_latency_nice = LATENCY_TO_NICE(p->latency_prio); ++ kattr.sched_latency_nice = PRIO_TO_NICE(p->latency_prio); + #ifdef CONFIG_UCLAMP_TASK /* * This could race with another potential updater, but this is fine -@@ -11027,6 +11063,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, +@@ -11038,6 +11069,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, { return sched_group_set_idle(css_tg(css), idle); } @@ -394,7 +320,7 @@ index 5237639786b7..9db5f9ec9022 100644 +static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ -+ return LATENCY_TO_NICE(css_tg(css)->latency_prio); ++ return PRIO_TO_NICE(css_tg(css)->latency_prio); +} + +static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css, @@ -402,17 +328,17 @@ index 5237639786b7..9db5f9ec9022 100644 +{ + int prio; + -+ if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE) ++ if (nice < MIN_NICE || nice > MAX_NICE) + return -ERANGE; + -+ prio = NICE_TO_LATENCY(nice); ++ prio = NICE_TO_PRIO(nice); + + return sched_group_set_latency(css_tg(css), prio); +} #endif static struct cftype cpu_legacy_files[] = { -@@ -11041,6 +11096,11 @@ static struct cftype cpu_legacy_files[] = { +@@ -11052,6 +11102,11 @@ static struct cftype cpu_legacy_files[] = { .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, }, @@ -424,7 +350,7 @@ index 5237639786b7..9db5f9ec9022 100644 #endif #ifdef CONFIG_CFS_BANDWIDTH { -@@ -11258,6 +11318,12 @@ static struct cftype cpu_files[] = { +@@ -11269,6 +11324,12 @@ static struct cftype cpu_files[] = { .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, }, @@ -438,10 +364,22 @@ index 5237639786b7..9db5f9ec9022 100644 #ifdef CONFIG_CFS_BANDWIDTH { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 8d64fba16cfe..fe9edfa43f65 100644 +index 8d64fba16cfe..e0d10ac21016 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c -@@ -535,9 +535,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +@@ -308,10 +308,7 @@ static __init int sched_init_debug(void) + debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); + #endif + +- debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency); +- debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity); +- debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity); +- debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity); ++ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); + + debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); + debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); +@@ -535,9 +532,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " %c", task_state_to_char(p)); @@ -456,7 +394,7 @@ index 8d64fba16cfe..fe9edfa43f65 100644 (long long)(p->nvcsw + p->nivcsw), p->prio); -@@ -580,10 +584,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) +@@ -580,10 +581,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) 
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { @@ -469,7 +407,7 @@ index 8d64fba16cfe..fe9edfa43f65 100644 unsigned long flags; #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -597,26 +600,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) +@@ -597,26 +597,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_rq_lock_irqsave(rq, flags); @@ -509,7 +447,27 @@ index 8d64fba16cfe..fe9edfa43f65 100644 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); -@@ -1044,6 +1046,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, +@@ -817,10 +816,7 @@ static void sched_debug_header(struct seq_file *m) + SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) + #define PN(x) \ + SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) +- PN(sysctl_sched_latency); +- PN(sysctl_sched_min_granularity); +- PN(sysctl_sched_idle_min_granularity); +- PN(sysctl_sched_wakeup_granularity); ++ PN(sysctl_sched_base_slice); + P(sysctl_sched_child_runs_first); + P(sysctl_sched_features); + #undef PN +@@ -1024,7 +1020,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + __PS("nr_involuntary_switches", p->nivcsw); + + P(se.load.weight); +- P(se.dur_avg); + #ifdef CONFIG_SMP + P(se.avg.load_sum); + P(se.avg.runnable_sum); +@@ -1044,6 +1039,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, #endif P(policy); P(prio); @@ -518,7 +476,7 @@ index 8d64fba16cfe..fe9edfa43f65 100644 P(dl.runtime); P(dl.deadline); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 84254f52c56a..c40b775452bc 100644 +index 115be8a965f2..76bd212ee5bd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -47,6 +47,7 @@ @@ -529,7 +487,128 @@ index 84254f52c56a..c40b775452bc 100644 #include -@@ -619,13 +620,134 @@ static inline bool entity_before(struct sched_entity *a, +@@ -56,26 +57,6 @@ + #include "stats.h" + #include "autogroup.h" + +-/* +- * Targeted preemption latency for CPU-bound tasks: +- * +- * NOTE: this latency value is not the same as the concept of +- * 'timeslice length' - timeslices in CFS are of variable length +- * and have no persistent notion like in traditional, time-slice +- * based scheduling concepts. +- * +- * (to see the precise effective timeslice length of your workload, +- * run vmstat and monitor the context-switches (cs) field) +- * +- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) +- */ +-#ifdef CONFIG_CACHY +-unsigned int sysctl_sched_latency = 3000000ULL; +-static unsigned int normalized_sysctl_sched_latency = 3000000ULL; +-#else +-unsigned int sysctl_sched_latency = 6000000ULL; +-static unsigned int normalized_sysctl_sched_latency = 6000000ULL; +-#endif + /* + * The initial- and re-scaling of tunables is configurable + * +@@ -94,26 +75,8 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; + * + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ +-#ifdef CONFIG_CACHY +-unsigned int sysctl_sched_min_granularity = 400000ULL; +-static unsigned int normalized_sysctl_sched_min_granularity = 400000ULL; +-#else +-unsigned int sysctl_sched_min_granularity = 750000ULL; +-static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +-#endif +- +-/* +- * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks. +- * Applies only when SCHED_IDLE tasks compete with normal tasks. 
+- * +- * (default: 0.75 msec) +- */ +-unsigned int sysctl_sched_idle_min_granularity = 750000ULL; +- +-/* +- * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity +- */ +-static unsigned int sched_nr_latency = 8; ++unsigned int sysctl_sched_base_slice = 750000ULL; ++static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; + + /* + * After fork, child runs first. If set to 0 (default) then +@@ -121,23 +84,6 @@ static unsigned int sched_nr_latency = 8; + */ + unsigned int sysctl_sched_child_runs_first __read_mostly; + +-/* +- * SCHED_OTHER wake-up granularity. +- * +- * This option delays the preemption effects of decoupled workloads +- * and reduces their over-scheduling. Synchronous workloads will still +- * have immediate wakeup/sleep latencies. +- * +- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) +- */ +-#ifdef CONFIG_CACHY +-unsigned int sysctl_sched_wakeup_granularity = 500000UL; +-static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; +-#else +-unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +-static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +-#endif +- + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + + int sched_thermal_decay_shift; +@@ -189,12 +135,8 @@ int __weak arch_asym_cpu_priority(int cpu) + * + * (default: 5 msec, units: microseconds) + */ +-#ifdef CONFIG_CACHY +-static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; +-#else + static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; + #endif +-#endif + + #ifdef CONFIG_NUMA_BALANCING + /* Restrict the NUMA promotion throughput (MB/s) for each target node. */ +@@ -295,9 +237,7 @@ static void update_sysctl(void) + + #define SET_SYSCTL(name) \ + (sysctl_##name = (factor) * normalized_sysctl_##name) +- SET_SYSCTL(sched_min_granularity); +- SET_SYSCTL(sched_latency); +- SET_SYSCTL(sched_wakeup_granularity); ++ SET_SYSCTL(sched_base_slice); + #undef SET_SYSCTL + } + +@@ -365,6 +305,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight + return mul_u64_u32_shr(delta_exec, fact, shift); + } + ++/* ++ * delta /= w ++ */ ++static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) ++{ ++ if (unlikely(se->load.weight != NICE_0_LOAD)) ++ delta = __calc_delta(delta, NICE_0_LOAD, &se->load); ++ ++ return delta; ++} + + const struct sched_class fair_sched_class; + +@@ -619,35 +569,203 @@ static inline bool entity_before(const struct sched_entity *a, return (s64)(a->vruntime - b->vruntime) < 0; } @@ -541,6 +620,7 @@ index 84254f52c56a..c40b775452bc 100644 #define __node_2_se(node) \ rb_entry((node), struct sched_entity, run_node) +-static void update_min_vruntime(struct cfs_rq *cfs_rq) +/* + * Compute virtual time from the per-task service numbers: + * @@ -576,17 +656,23 @@ index 84254f52c56a..c40b775452bc 100644 +static void +avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ ++ unsigned long weight = scale_load_down(se->load.weight); + s64 key = entity_key(cfs_rq, se); -+ cfs_rq->avg_vruntime += key * se->load.weight; -+ cfs_rq->avg_load += se->load.weight; ++ ++ cfs_rq->avg_vruntime += key * weight; ++ cfs_rq->avg_slice += se->slice * weight; ++ cfs_rq->avg_load += weight; +} + +static void +avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ ++ unsigned long weight = scale_load_down(se->load.weight); + s64 key = entity_key(cfs_rq, se); -+ cfs_rq->avg_vruntime -= key * se->load.weight; -+ cfs_rq->avg_load -= se->load.weight; ++ 
++ cfs_rq->avg_vruntime -= key * weight; ++ cfs_rq->avg_slice -= se->slice * weight; ++ cfs_rq->avg_load -= weight; +} + +static inline @@ -599,27 +685,69 @@ index 84254f52c56a..c40b775452bc 100644 +} + +u64 avg_vruntime(struct cfs_rq *cfs_rq) -+{ -+ struct sched_entity *curr = cfs_rq->curr; -+ s64 lag = cfs_rq->avg_vruntime; + { + struct sched_entity *curr = cfs_rq->curr; +- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); ++ s64 avg = cfs_rq->avg_vruntime; + long load = cfs_rq->avg_load; -+ + +- u64 vruntime = cfs_rq->min_vruntime; + if (curr && curr->on_rq) { -+ lag += entity_key(cfs_rq, curr) * curr->load.weight; -+ load += curr->load.weight; -+ } -+ ++ unsigned long weight = scale_load_down(curr->load.weight); + +- if (curr) { +- if (curr->on_rq) +- vruntime = curr->vruntime; +- else +- curr = NULL; ++ avg += entity_key(cfs_rq, curr) * weight; ++ load += weight; + } + +- if (leftmost) { /* non-empty tree */ +- struct sched_entity *se = __node_2_se(leftmost); + if (load) -+ lag = div_s64(lag, load); ++ avg = div_s64(avg, load); + +- if (!curr) +- vruntime = se->vruntime; +- else +- vruntime = min_vruntime(vruntime, se->vruntime); ++ return cfs_rq->min_vruntime + avg; ++} + -+ return cfs_rq->min_vruntime + lag; ++/* ++ * lag_i = S - s_i = w_i * (V - v_i) ++ * ++ * However, since V is approximated by the weighted average of all entities it ++ * is possible -- by addition/removal/reweight to the tree -- to move V around ++ * and end up with a larger lag than we started with. ++ * ++ * Limit this to either double the slice length with a minimum of TICK_NSEC ++ * since that is the timing granularity. ++ * ++ * EEVDF gives the following limit for a steady state system: ++ * ++ * -r_max < lag < max(r_max, q) ++ * ++ * XXX could add max_slice to the augmented data to track this. ++ */ ++void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) ++{ ++ s64 lag, limit; ++ ++ SCHED_WARN_ON(!se->on_rq); ++ lag = avg_vruntime(cfs_rq) - se->vruntime; ++ ++ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); ++ se->vlag = clamp(lag, -limit, limit); +} + +/* + * Entity is eligible once it received less service than it ought to have, + * eg. lag >= 0. + * -+ * lag_i = S - s_i = w_i*(V - w_i) ++ * lag_i = S - s_i = w_i*(V - v_i) + * + * lag_i >= 0 -> V >= v_i + * @@ -628,19 +756,27 @@ index 84254f52c56a..c40b775452bc 100644 + * \Sum w_i + * + * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) ++ * ++ * Note: using 'avg_vruntime() > se->vruntime' is inacurate due ++ * to the loss in precision caused by the division. + */ +int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct sched_entity *curr = cfs_rq->curr; -+ s64 avg_vruntime = cfs_rq->avg_vruntime; -+ long avg_load = cfs_rq->avg_load; ++ s64 avg = cfs_rq->avg_vruntime; ++ long load = cfs_rq->avg_load; + + if (curr && curr->on_rq) { -+ avg_vruntime += entity_key(cfs_rq, curr) * curr->load.weight; -+ avg_load += curr->load.weight; -+ } ++ unsigned long weight = scale_load_down(curr->load.weight); + -+ return avg_vruntime >= entity_key(cfs_rq, se) * avg_load; ++ avg += entity_key(cfs_rq, curr) * weight; ++ load += weight; + } + +- /* ensure we never gain time by being placed backwards. 
*/ +- u64_u32_store(cfs_rq->min_vruntime, +- max_vruntime(cfs_rq->min_vruntime, vruntime)); ++ return avg >= entity_key(cfs_rq, se) * load; +} + +static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) @@ -657,35 +793,43 @@ index 84254f52c56a..c40b775452bc 100644 + return min_vruntime; +} + - static void update_min_vruntime(struct cfs_rq *cfs_rq) - { -+ struct sched_entity *se = __pick_first_entity(cfs_rq); - struct sched_entity *curr = cfs_rq->curr; -- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); - - u64 vruntime = cfs_rq->min_vruntime; - -@@ -636,9 +758,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) - curr = NULL; - } - -- if (leftmost) { /* non-empty tree */ -- struct sched_entity *se = __node_2_se(leftmost); -- -+ if (se) { - if (!curr) - vruntime = se->vruntime; - else -@@ -647,7 +767,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) - - /* ensure we never gain time by being placed backwards. */ - u64_u32_store(cfs_rq->min_vruntime, -- max_vruntime(cfs_rq->min_vruntime, vruntime)); -+ __update_min_vruntime(cfs_rq, vruntime)); ++static void update_min_vruntime(struct cfs_rq *cfs_rq) ++{ ++ if (sched_feat(MINIMAL_VA)) { ++ u64 vruntime = avg_vruntime(cfs_rq); ++ s64 delta = (s64)(vruntime - cfs_rq->min_vruntime); ++ ++ avg_vruntime_update(cfs_rq, delta); ++ ++ u64_u32_store(cfs_rq->min_vruntime, vruntime); ++ } else { ++ struct sched_entity *se = __pick_first_entity(cfs_rq); ++ struct sched_entity *curr = cfs_rq->curr; ++ ++ u64 vruntime = cfs_rq->min_vruntime; ++ ++ if (curr) { ++ if (curr->on_rq) ++ vruntime = curr->vruntime; ++ else ++ curr = NULL; ++ } ++ ++ if (se) { ++ if (!curr) ++ vruntime = se->vruntime; ++ else ++ vruntime = min_vruntime(vruntime, se->vruntime); ++ } ++ ++ /* ensure we never gain time by being placed backwards. */ ++ u64_u32_store(cfs_rq->min_vruntime, ++ __update_min_vruntime(cfs_rq, vruntime)); ++ } } static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) -@@ -655,17 +775,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +@@ -655,17 +773,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) return entity_before(__node_2_se(a), __node_2_se(b)); } @@ -739,24 +883,11 @@ index 84254f52c56a..c40b775452bc 100644 } struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) -@@ -688,6 +842,101 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) - return __node_2_se(next); +@@ -678,14 +830,81 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) + return __node_2_se(left); } -+static struct sched_entity *pick_cfs(struct cfs_rq *cfs_rq, struct sched_entity *curr) -+{ -+ struct sched_entity *left = __pick_first_entity(cfs_rq); -+ -+ /* -+ * If curr is set we have to see if its left of the leftmost entity -+ * still in the tree, provided there was anything in the tree at all. -+ */ -+ if (!left || (curr && entity_before(curr, left))) -+ left = curr; -+ -+ return left; -+} -+ +-static struct sched_entity *__pick_next_entity(struct sched_entity *se) +/* + * Earliest Eligible Virtual Deadline First + * @@ -777,11 +908,14 @@ index 84254f52c56a..c40b775452bc 100644 + * Which allows an EDF like search on (sub)trees. 
+ */ +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) -+{ + { +- struct rb_node *next = rb_next(&se->run_node); + struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; + struct sched_entity *curr = cfs_rq->curr; + struct sched_entity *best = NULL; -+ + +- if (!next) +- return NULL; + if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) + curr = NULL; + @@ -790,10 +924,6 @@ index 84254f52c56a..c40b775452bc 100644 + + /* + * If this entity is not eligible, try the left subtree. -+ * -+ * XXX: would it be worth it to do the single division for -+ * avg_vruntime() once, instead of the multiplication -+ * in entity_eligible() O(log n) times? + */ + if (!entity_eligible(cfs_rq, se)) { + node = node->rb_left; @@ -834,81 +964,153 @@ index 84254f52c56a..c40b775452bc 100644 + return left; + } + } -+ + +- return __node_2_se(next); + return best; -+} -+ + } + #ifdef CONFIG_SCHED_DEBUG - struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +@@ -707,104 +926,43 @@ int sched_update_scaling(void) { -@@ -721,6 +970,14 @@ int sched_update_scaling(void) + unsigned int factor = get_update_sysctl_factor(); + +- sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, +- sysctl_sched_min_granularity); +- + #define WRT_SYSCTL(name) \ + (normalized_sysctl_##name = sysctl_##name / (factor)) +- WRT_SYSCTL(sched_min_granularity); +- WRT_SYSCTL(sched_latency); +- WRT_SYSCTL(sched_wakeup_granularity); ++ WRT_SYSCTL(sched_base_slice); + #undef WRT_SYSCTL + + return 0; } #endif -+long calc_latency_offset(int prio) -+{ +-/* +- * delta /= w +- */ +-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) ++void set_latency_fair(struct sched_entity *se, int prio) + { +- if (unlikely(se->load.weight != NICE_0_LOAD)) +- delta = __calc_delta(delta, NICE_0_LOAD, &se->load); + u32 weight = sched_prio_to_weight[prio]; -+ u64 base = sysctl_sched_min_granularity; -+ -+ return div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight); -+} -+ - /* - * delta /= w - */ -@@ -797,14 +1054,30 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) - return slice; ++ u64 base = sysctl_sched_base_slice; + +- return delta; +-} +- +-/* +- * The idea is to set a period in which each task runs once. +- * +- * When there are too many tasks (sched_nr_latency) we have to stretch +- * this period because otherwise the slices get too small. +- * +- * p = (nr <= nl) ? l : l*nr/nl +- */ +-static u64 __sched_period(unsigned long nr_running) +-{ +- if (unlikely(nr_running > sched_nr_latency)) +- return nr_running * sysctl_sched_min_granularity; +- else +- return sysctl_sched_latency; ++ /* ++ * For EEVDF the virtual time slope is determined by w_i (iow. ++ * nice) while the request time r_i is determined by ++ * latency-nice. ++ * ++ * Smaller request gets better latency. ++ */ ++ se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight); } +-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq); +- + /* +- * We calculate the wall-time slice from the period by taking a part +- * proportional to the weight. +- * +- * s = p*P[w/rw] ++ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i ++ * this is probably good enough. 
+ */ +-static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) ++static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) + { +- unsigned int nr_running = cfs_rq->nr_running; +- struct sched_entity *init_se = se; +- unsigned int min_gran; +- u64 slice; +- +- if (sched_feat(ALT_PERIOD)) +- nr_running = rq_of(cfs_rq)->cfs.h_nr_running; +- +- slice = __sched_period(nr_running + !se->on_rq); +- +- for_each_sched_entity(se) { +- struct load_weight *load; +- struct load_weight lw; +- struct cfs_rq *qcfs_rq; +- +- qcfs_rq = cfs_rq_of(se); +- load = &qcfs_rq->load; +- +- if (unlikely(!se->on_rq)) { +- lw = qcfs_rq->load; +- +- update_load_add(&lw, se->load.weight); +- load = &lw; +- } +- slice = __calc_delta(slice, se->load.weight, load); +- } +- +- if (sched_feat(BASE_SLICE)) { +- if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq)) +- min_gran = sysctl_sched_idle_min_granularity; +- else +- min_gran = sysctl_sched_min_granularity; +- +- slice = max_t(u64, slice, min_gran); +- } +- +- return slice; +-} ++ if ((s64)(se->vruntime - se->deadline) < 0) ++ return; + -/* - * We calculate the vruntime slice of a to-be-inserted task. - * - * vs = s/w - */ -static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) -+static void set_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) - { +-{ - return calc_delta_fair(sched_slice(cfs_rq, se), se); -+ if (sched_feat(EEVDF)) { -+ /* -+ * For EEVDF the virtual time slope is determined by w_i (iow. -+ * nice) while the request time r_i is determined by -+ * latency-nice. -+ */ -+ se->slice = se->latency_offset; -+ } else { -+ /* -+ * When many tasks blow up the sched_period; it is possible -+ * that sched_slice() reports unusually large results (when -+ * many tasks are very light for example). Therefore impose a -+ * maximum. -+ */ -+ se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency); -+ } -+ + /* -+ * vd_i = ve_i + r_i / w_i ++ * EEVDF: vd_i = ve_i + r_i / w_i + */ + se->deadline = se->vruntime + calc_delta_fair(se->slice, se); -+ se->min_deadline = se->deadline; } #include "pelt.h" -@@ -939,6 +1212,13 @@ static void update_curr(struct cfs_rq *cfs_rq) +@@ -939,6 +1097,7 @@ static void update_curr(struct cfs_rq *cfs_rq) schedstat_add(cfs_rq->exec_clock, delta_exec); curr->vruntime += calc_delta_fair(delta_exec, curr); -+ /* -+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i -+ * this is probably good enough. -+ */ -+ if ((s64)(curr->vruntime - curr->deadline) > 0) -+ set_slice(cfs_rq, curr); -+ ++ update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { -@@ -3340,6 +3620,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, +@@ -3336,16 +3495,28 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) + { ++ unsigned long old_weight = se->load.weight; ++ + if (se->on_rq) { /* commit outstanding execution time */ if (cfs_rq->curr == se) update_curr(cfs_rq); @@ -917,7 +1119,21 @@ index 84254f52c56a..c40b775452bc 100644 update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); -@@ -3355,9 +3637,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + + update_load_set(&se->load, weight); + ++ if (!se->on_rq) { ++ /* ++ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v), ++ * we need to scale se->vlag when w_i changes. 
++ */ ++ se->vlag = div_s64(se->vlag * old_weight, weight); ++ } ++ + #ifdef CONFIG_SMP + do { + u32 divider = get_pelt_divider(&se->avg); +@@ -3355,9 +3526,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, #endif enqueue_load_avg(cfs_rq, se); @@ -931,170 +1147,484 @@ index 84254f52c56a..c40b775452bc 100644 } void reweight_task(struct task_struct *p, int prio) -@@ -4669,49 +4953,49 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) - static void - place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) - { -- u64 vruntime = cfs_rq->min_vruntime; -- u64 sleep_time; -+ u64 vruntime = avg_vruntime(cfs_rq); +@@ -4653,158 +4826,135 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} -- /* + #endif /* CONFIG_SMP */ + +-static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) +-{ +-#ifdef CONFIG_SCHED_DEBUG +- s64 d = se->vruntime - cfs_rq->min_vruntime; +- +- if (d < 0) +- d = -d; +- +- if (d > 3*sysctl_sched_latency) +- schedstat_inc(cfs_rq->nr_spread_over); +-#endif +-} +- +-static inline bool entity_is_long_sleeper(struct sched_entity *se) ++static void ++place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +- struct cfs_rq *cfs_rq; +- u64 sleep_time; ++ u64 vslice = calc_delta_fair(se->slice, se); ++ u64 vruntime = avg_vruntime(cfs_rq); ++ s64 lag = 0; + +- if (se->exec_start == 0) +- return false; ++ /* ++ * Due to how V is constructed as the weighted average of entities, ++ * adding tasks with positive lag, or removing tasks with negative lag ++ * will move 'time' backwards, this can screw around with the lag of ++ * other tasks. ++ * ++ * EEVDF: placement strategy #1 / #2 ++ */ ++ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { ++ struct sched_entity *curr = cfs_rq->curr; ++ unsigned long load; + +- cfs_rq = cfs_rq_of(se); ++ lag = se->vlag; + +- sleep_time = rq_clock_task(rq_of(cfs_rq)); ++ /* ++ * For latency sensitive tasks; those that have a shorter than ++ * average slice and do not fully consume the slice, transition ++ * to EEVDF placement strategy #2. 
++ */ ++ if (sched_feat(PLACE_FUDGE) && ++ cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) { ++ lag += vslice; ++ if (lag > 0) ++ lag = 0; ++ } + +- /* Happen while migrating because of clock task divergence */ +- if (sleep_time <= se->exec_start) +- return false; ++ /* ++ * If we want to place a task and preserve lag, we have to ++ * consider the effect of the new entity on the weighted ++ * average and compensate for this, otherwise lag can quickly ++ * evaporate: ++ * ++ * l_i = V - v_i <=> v_i = V - l_i ++ * ++ * V = v_avg = W*v_avg / W ++ * ++ * V' = (W*v_avg + w_i*v_i) / (W + w_i) ++ * = (W*v_avg + w_i(v_avg - l_i)) / (W + w_i) ++ * = v_avg + w_i*l_i/(W + w_i) ++ * ++ * l_i' = V' - v_i = v_avg + w_i*l_i/(W + w_i) - (v_avg - l) ++ * = l_i - w_i*l_i/(W + w_i) ++ * ++ * l_i = (W + w_i) * l_i' / W ++ */ ++ load = cfs_rq->avg_load; ++ if (curr && curr->on_rq) ++ load += scale_load_down(curr->load.weight); + +- sleep_time -= se->exec_start; +- if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD))) +- return true; ++ lag *= load + scale_load_down(se->load.weight); ++ if (WARN_ON_ONCE(!load)) ++ load = 1; ++ lag = div_s64(lag, load); + +- return false; +-} ++ vruntime -= lag; ++ } + +-static void +-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) +-{ +- u64 vruntime = cfs_rq->min_vruntime; ++ /* ++ * Base the deadline on the 'normal' EEVDF placement policy in an ++ * attempt to not let the bonus crud below wreck things completely. ++ */ ++ se->deadline = vruntime; + + /* - * The 'current' period is already promised to the current tasks, - * however the extra weight of the new task will slow them down a - * little, place the new task so that it fits in the slot that - * stays open at the end. -- */ ++ * The whole 'sleeper' bonus hack... :-/ This is strictly unfair. ++ * ++ * By giving a sleeping task a little boost, it becomes possible for a ++ * 50% task to compete equally with a 100% task. That is, strictly fair ++ * that setup would result in a 67% / 33% split. Sleeper bonus will ++ * change that to 50% / 50%. ++ * ++ * This thing hurts my brain, because tasks leaving with negative lag ++ * will move 'time' backward, so comparing against a historical ++ * se->vruntime is dodgy as heck. + */ - if (initial && sched_feat(START_DEBIT)) - vruntime += sched_vslice(cfs_rq, se); -+ if (sched_feat(PRESERVE_LAG)) -+ vruntime -= se->lag; ++ if (sched_feat(PLACE_BONUS) && ++ (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)) { ++ /* ++ * If se->vruntime is ahead of vruntime, something dodgy ++ * happened and we cannot give bonus due to not having valid ++ * history. ++ */ ++ if ((s64)(se->vruntime - vruntime) < 0) { ++ vruntime -= se->slice/2; ++ vruntime = max_vruntime(se->vruntime, vruntime); ++ } ++ } - /* sleeps up to a single latency don't count. */ - if (!initial) { - unsigned long thresh; -+ if (sched_feat(FAIR_SLEEPERS)) { -+// u64 sleep_time; ++ se->vruntime = vruntime; - if (se_is_idle(se)) - thresh = sysctl_sched_min_granularity; - else - thresh = sysctl_sched_latency; -+ /* sleeps up to a single latency don't count. 
*/ -+ if (!initial) { -+ unsigned long thresh = TICK_NSEC; -+ -+ if (!sched_feat(EEVDF)) { -+ if (se_is_idle(se)) -+ thresh = sysctl_sched_min_granularity; -+ else -+ thresh = sysctl_sched_latency; -+ } -+ -+ /* -+ * Halve their sleep time's effect, to allow -+ * for a gentler effect of sleepers: -+ */ -+ if (sched_feat(GENTLE_FAIR_SLEEPERS)) -+ thresh >>= 1; -+ -+ vruntime -= calc_delta_fair(thresh, se); -+ } ++ /* ++ * When joining the competition; the exisiting tasks will be, ++ * on average, halfway through their slice, as such start tasks ++ * off with half a slice to ease into the competition. ++ */ ++ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) ++ vslice /= 2; - /* +- /* - * Halve their sleep time's effect, to allow - * for a gentler effect of sleepers: -+ * Pull vruntime of the entity being placed to the base level of -+ * cfs_rq, to prevent boosting it if placed backwards. If the entity -+ * slept for a long time, don't even try to compare its vruntime with -+ * the base as it may be too far off and the comparison may get -+ * inversed due to s64 overflow. -+ sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; -+ if ((s64)sleep_time < 60LL * NSEC_PER_SEC) - */ +- */ - if (sched_feat(GENTLE_FAIR_SLEEPERS)) - thresh >>= 1; - - vruntime -= thresh; -+ vruntime = max_vruntime(se->vruntime, vruntime); - } - +- } +- - /* - * Pull vruntime of the entity being placed to the base level of -- * cfs_rq, to prevent boosting it if placed backwards. If the entity -- * slept for a long time, don't even try to compare its vruntime with -- * the base as it may be too far off and the comparison may get -- * inversed due to s64 overflow. +- * cfs_rq, to prevent boosting it if placed backwards. +- * However, min_vruntime can advance much faster than real time, with +- * the extreme being when an entity with the minimal weight always runs +- * on the cfs_rq. If the waking entity slept for a long time, its +- * vruntime difference from min_vruntime may overflow s64 and their +- * comparison may get inversed, so ignore the entity's original +- * vruntime in that case. +- * The maximal vruntime speedup is given by the ratio of normal to +- * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES. +- * When placing a migrated waking entity, its exec_start has been set +- * from a different rq. In order to take into account a possible +- * divergence between new and prev rq's clocks task because of irq and +- * stolen time, we take an additional margin. +- * So, cutting off on the sleep time of +- * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days +- * should be safe. - */ -- sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; -- if ((s64)sleep_time > 60LL * NSEC_PER_SEC) +- if (entity_is_long_sleeper(se)) - se->vruntime = vruntime; - else - se->vruntime = max_vruntime(se->vruntime, vruntime); -+ se->vruntime = vruntime; -+ set_slice(cfs_rq, se); ++ /* ++ * EEVDF: vd_i = ve_i + r_i/w_i ++ */ ++ se->deadline += vslice; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); -@@ -4879,6 +5163,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + + static inline bool cfs_bandwidth_used(void); + +-/* +- * MIGRATION +- * +- * dequeue +- * update_curr() +- * update_min_vruntime() +- * vruntime -= min_vruntime +- * +- * enqueue +- * update_curr() +- * update_min_vruntime() +- * vruntime += min_vruntime +- * +- * this way the vruntime transition between RQs is done when both +- * min_vruntime are up-to-date. 
+- * +- * WAKEUP (remote) +- * +- * ->migrate_task_rq_fair() (p->state == TASK_WAKING) +- * vruntime -= min_vruntime +- * +- * enqueue +- * update_curr() +- * update_min_vruntime() +- * vruntime += min_vruntime +- * +- * this way we don't have the most up-to-date min_vruntime on the originating +- * CPU and an up-to-date min_vruntime on the destination CPU. +- */ +- + static void + enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +- bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); + bool curr = cfs_rq->curr == se; + + /* + * If we're the current task, we must renormalise before calling + * update_curr(). + */ +- if (renorm && curr) +- se->vruntime += cfs_rq->min_vruntime; ++ if (curr) ++ place_entity(cfs_rq, se, flags); + + update_curr(cfs_rq); + +- /* +- * Otherwise, renormalise after, such that we're placed at the current +- * moment in time, instead of some random moment in the past. Being +- * placed in the past could significantly boost this task to the +- * fairness detriment of existing tasks. +- */ +- if (renorm && !curr) +- se->vruntime += cfs_rq->min_vruntime; +- + /* + * When enqueuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. +@@ -4816,18 +4966,28 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + */ + update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); + se_update_runnable(se); ++ /* ++ * XXX update_load_avg() above will have attached us to the pelt sum; ++ * but update_cfs_group() here will re-adjust the weight and have to ++ * undo/redo all that. Seems wasteful. ++ */ + update_cfs_group(se); ++ ++ /* ++ * XXX now that the entity has been re-weighted, and it's lag adjusted, ++ * we can place the entity. ++ */ ++ if (!curr) ++ place_entity(cfs_rq, se, flags); ++ + account_entity_enqueue(cfs_rq, se); + +- if (flags & ENQUEUE_WAKEUP) +- place_entity(cfs_rq, se, 0); + /* Entity has migrated, no longer consider this task hot */ + if (flags & ENQUEUE_MIGRATED) + se->exec_start = 0; + + check_schedstat_required(); + update_stats_enqueue_fair(cfs_rq, se, flags); +- check_spread(cfs_rq, se); + if (!curr) + __enqueue_entity(cfs_rq, se); + se->on_rq = 1; +@@ -4839,17 +4999,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + } + } + +-static void __clear_buddies_last(struct sched_entity *se) +-{ +- for_each_sched_entity(se) { +- struct cfs_rq *cfs_rq = cfs_rq_of(se); +- if (cfs_rq->last != se) +- break; +- +- cfs_rq->last = NULL; +- } +-} +- + static void __clear_buddies_next(struct sched_entity *se) + { + for_each_sched_entity(se) { +@@ -4861,27 +5010,10 @@ static void __clear_buddies_next(struct sched_entity *se) + } + } + +-static void __clear_buddies_skip(struct sched_entity *se) +-{ +- for_each_sched_entity(se) { +- struct cfs_rq *cfs_rq = cfs_rq_of(se); +- if (cfs_rq->skip != se) +- break; +- +- cfs_rq->skip = NULL; +- } +-} +- + static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) + { +- if (cfs_rq->last == se) +- __clear_buddies_last(se); +- + if (cfs_rq->next == se) + __clear_buddies_next(se); +- +- if (cfs_rq->skip == se) +- __clear_buddies_skip(se); + } + + static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); +@@ -4915,20 +5047,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) clear_buddies(cfs_rq, se); -+ if (sched_feat(PRESERVE_LAG) && (flags & DEQUEUE_SLEEP)) -+ se->lag = avg_vruntime(cfs_rq) - se->vruntime; -+ ++ update_entity_lag(cfs_rq, se); if 
(se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; -@@ -4917,19 +5204,20 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + account_entity_dequeue(cfs_rq, se); + +- /* +- * Normalize after update_curr(); which will also have moved +- * min_vruntime if @se is the one holding it back. But before doing +- * update_min_vruntime() again, which will discount @se's position and +- * can move min_vruntime forward still more. +- */ +- if (!(flags & DEQUEUE_SLEEP)) +- se->vruntime -= cfs_rq->min_vruntime; +- + /* return excess runtime on last dequeue */ + return_cfs_rq_runtime(cfs_rq); + +@@ -4953,44 +5077,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) static void check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - unsigned long ideal_runtime, delta_exec; -+ unsigned long delta_exec; - struct sched_entity *se; - s64 delta; - +- struct sched_entity *se; +- s64 delta; +- - /* - * When many tasks blow up the sched_period; it is possible that - * sched_slice() reports unusually large results (when many tasks are - * very light for example). Therefore impose a maximum. - */ - ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); -+ if (sched_feat(EEVDF)) { -+ if (pick_eevdf(cfs_rq) != curr) -+ goto preempt; -+ -+ return; -+ } - - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; +- +- delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) { -+ if (delta_exec > curr->slice) { -+preempt: ++ if (pick_eevdf(cfs_rq) != curr) { resched_curr(rq_of(cfs_rq)); /* * The current task ran long enough, ensure it doesn't get -@@ -4953,7 +5241,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) - if (delta < 0) - return; - + * re-elected due to buddy favours. + */ + clear_buddies(cfs_rq, curr); +- return; + } +- +- /* +- * Ensure that a task that missed wakeup preemption by a +- * narrow margin doesn't have to wait for a full slice. +- * This also mitigates buddy induced latencies under load. +- */ +- if (delta_exec < sysctl_sched_min_granularity) +- return; +- +- se = __pick_first_entity(cfs_rq); +- delta = curr->vruntime - se->vruntime; +- +- if (delta < 0) +- return; +- - if (delta > ideal_runtime) -+ if (delta > curr->slice) - resched_curr(rq_of(cfs_rq)); +- resched_curr(rq_of(cfs_rq)); } -@@ -5008,17 +5296,20 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); + static void +@@ -5031,9 +5125,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + se->prev_sum_exec_runtime = se->sum_exec_runtime; + } + +-static int +-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); +- + /* + * Pick the next process, keeping these things in mind, in this order: + * 1) keep things fair between processes/task groups +@@ -5044,50 +5135,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *left = __pick_first_entity(cfs_rq); - struct sched_entity *se; -+ struct sched_entity *left, *se; - -- /* +- + /* - * If curr is set we have to see if its left of the leftmost entity - * still in the tree, provided there was anything in the tree at all. -- */ ++ * Enabling NEXT_BUDDY will affect latency but not fairness. 
+ */ - if (!left || (curr && entity_before(curr, left))) - left = curr; -+ if (sched_feat(EEVDF)) { -+ /* -+ * Enabling NEXT_BUDDY will affect latency but not fairness. -+ */ -+ if (sched_feat(NEXT_BUDDY) && -+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) -+ return cfs_rq->next; - +- - se = left; /* ideally we run the leftmost entity */ -+ return pick_eevdf(cfs_rq); -+ } -+ -+ se = left = pick_cfs(cfs_rq, curr); ++ if (sched_feat(NEXT_BUDDY) && ++ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) ++ return cfs_rq->next; - /* - * Avoid running the skip buddy, if running something else can -@@ -6113,13 +6404,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} +- /* +- * Avoid running the skip buddy, if running something else can +- * be done without getting too unfair. +- */ +- if (cfs_rq->skip && cfs_rq->skip == se) { +- struct sched_entity *second; +- +- if (se == curr) { +- second = __pick_first_entity(cfs_rq); +- } else { +- second = __pick_next_entity(se); +- if (!second || (curr && entity_before(curr, second))) +- second = curr; +- } +- +- if (second && wakeup_preempt_entity(second, left) < 1) +- se = second; +- } +- +- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { +- /* +- * Someone really wants this to run. If it's not unfair, run it. +- */ +- se = cfs_rq->next; +- } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) { +- /* +- * Prefer last buddy, try to return the CPU to a preempted task. +- */ +- se = cfs_rq->last; +- } +- +- return se; ++ return pick_eevdf(cfs_rq); + } + + static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +@@ -5104,8 +5159,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) + /* throttle cfs_rqs exceeding runtime */ + check_cfs_rq_runtime(cfs_rq); + +- check_spread(cfs_rq, prev); +- + if (prev->on_rq) { + update_stats_wait_start_fair(cfs_rq, prev); + /* Put 'current' back into the tree. */ +@@ -6149,13 +6202,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; @@ -1109,46 +1639,331 @@ index 84254f52c56a..c40b775452bc 100644 s64 delta = slice - ran; if (delta < 0) { -@@ -7891,7 +8181,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -6179,8 +6231,7 @@ static void hrtick_update(struct rq *rq) + if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) + return; + +- if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) +- hrtick_start_fair(rq, curr); ++ hrtick_start_fair(rq, curr); + } + #else /* !CONFIG_SCHED_HRTICK */ + static inline void +@@ -6221,17 +6272,6 @@ static int sched_idle_rq(struct rq *rq) + rq->nr_running); + } + +-/* +- * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use +- * of idle_nr_running, which does not consider idle descendants of normal +- * entities. 
+- */ +-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq) +-{ +- return cfs_rq->nr_running && +- cfs_rq->nr_running == cfs_rq->idle_nr_running; +-} +- + #ifdef CONFIG_SMP + static int sched_idle_cpu(int cpu) + { +@@ -6333,18 +6373,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + + static void set_next_buddy(struct sched_entity *se); + +-static inline void dur_avg_update(struct task_struct *p, bool task_sleep) +-{ +- u64 dur; +- +- if (!task_sleep) +- return; +- +- dur = p->se.sum_exec_runtime - p->se.prev_sleep_sum_runtime; +- p->se.prev_sleep_sum_runtime = p->se.sum_exec_runtime; +- update_avg(&p->se.dur_avg, dur); +-} +- + /* + * The dequeue_task method is called before nr_running is + * decreased. We remove the task from the rbtree and +@@ -6417,7 +6445,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + + dequeue_throttle: + util_est_update(&rq->cfs, p, task_sleep); +- dur_avg_update(p, task_sleep); + hrtick_update(rq); + } + +@@ -6551,23 +6578,6 @@ static int wake_wide(struct task_struct *p) + return 1; + } + +-/* +- * If a task switches in and then voluntarily relinquishes the +- * CPU quickly, it is regarded as a short duration task. +- * +- * SIS_SHORT tries to wake up the short wakee on current CPU. This +- * aims to avoid race condition among CPUs due to frequent context +- * switch. Besides, the candidate short task should not be the one +- * that wakes up more than one tasks, otherwise SIS_SHORT might +- * stack too many tasks on current CPU. +- */ +-static inline int is_short_task(struct task_struct *p) +-{ +- return sched_feat(SIS_SHORT) && !p->wakee_flips && +- p->se.dur_avg && +- ((p->se.dur_avg * 8) < sysctl_sched_min_granularity); +-} +- + /* + * The purpose of wake_affine() is to quickly determine on which CPU we can run + * soonest. For the purpose of speed we only consider the waking and previous +@@ -6604,11 +6614,6 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) + if (available_idle_cpu(prev_cpu)) + return prev_cpu; + +- /* The only running task is a short duration one. */ +- if (cpu_rq(this_cpu)->nr_running == 1 && +- is_short_task(rcu_dereference(cpu_curr(this_cpu)))) +- return this_cpu; +- + return nr_cpumask_bits; + } + +@@ -6983,20 +6988,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool + /* overloaded LLC is unlikely to have idle cpu/core */ + if (nr == 1) + return -1; +- +- /* +- * If the scan number suggested by SIS_UTIL is smaller +- * than 60% of llc_weight, it indicates a util_avg% higher +- * than 50%. System busier than this could lower its bar to +- * choose a compromised "idle" CPU. This co-exists with +- * !has_idle_core to not stack too many tasks on one CPU. +- */ +- if (!has_idle_core && this == target && +- (5 * nr < 3 * sd->span_weight) && +- cpu_rq(target)->nr_running <= 1 && +- is_short_task(p) && +- is_short_task(rcu_dereference(cpu_curr(target)))) +- return target; + } + } + +@@ -7729,18 +7720,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) + { + struct sched_entity *se = &p->se; + +- /* +- * As blocked tasks retain absolute vruntime the migration needs to +- * deal with this by subtracting the old and adding the new +- * min_vruntime -- the latter is done by enqueue_entity() when placing +- * the task on the new runqueue. 
+- */ +- if (READ_ONCE(p->__state) == TASK_WAKING) { +- struct cfs_rq *cfs_rq = cfs_rq_of(se); +- +- se->vruntime -= u64_u32_load(cfs_rq->min_vruntime); +- } +- + if (!task_on_rq_migrating(p)) { + remove_entity_load_avg(se); + +@@ -7778,66 +7757,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + } + #endif /* CONFIG_SMP */ + +-static unsigned long wakeup_gran(struct sched_entity *se) +-{ +- unsigned long gran = sysctl_sched_wakeup_granularity; +- +- /* +- * Since its curr running now, convert the gran from real-time +- * to virtual-time in his units. +- * +- * By using 'se' instead of 'curr' we penalize light tasks, so +- * they get preempted easier. That is, if 'se' < 'curr' then +- * the resulting gran will be larger, therefore penalizing the +- * lighter, if otoh 'se' > 'curr' then the resulting gran will +- * be smaller, again penalizing the lighter task. +- * +- * This is especially important for buddies when the leftmost +- * task is higher priority than the buddy. +- */ +- return calc_delta_fair(gran, se); +-} +- +-/* +- * Should 'se' preempt 'curr'. +- * +- * |s1 +- * |s2 +- * |s3 +- * g +- * |<--->|c +- * +- * w(c, s1) = -1 +- * w(c, s2) = 0 +- * w(c, s3) = 1 +- * +- */ +-static int +-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) +-{ +- s64 gran, vdiff = curr->vruntime - se->vruntime; +- +- if (vdiff <= 0) +- return -1; +- +- gran = wakeup_gran(se); +- if (vdiff > gran) +- return 1; +- +- return 0; +-} +- +-static void set_last_buddy(struct sched_entity *se) +-{ +- for_each_sched_entity(se) { +- if (SCHED_WARN_ON(!se->on_rq)) +- return; +- if (se_is_idle(se)) +- return; +- cfs_rq_of(se)->last = se; +- } +-} +- + static void set_next_buddy(struct sched_entity *se) + { + for_each_sched_entity(se) { +@@ -7849,12 +7768,6 @@ static void set_next_buddy(struct sched_entity *se) + } + } + +-static void set_skip_buddy(struct sched_entity *se) +-{ +- for_each_sched_entity(se) +- cfs_rq_of(se)->skip = se; +-} +- + /* + * Preempt the current task with a newly woken task if needed: + */ +@@ -7863,7 +7776,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + struct task_struct *curr = rq->curr; + struct sched_entity *se = &curr->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); +- int scale = cfs_rq->nr_running >= sched_nr_latency; + int next_buddy_marked = 0; + int cse_is_idle, pse_is_idle; + +@@ -7879,7 +7791,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) + return; + +- if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { ++ if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) { + set_next_buddy(pse); + next_buddy_marked = 1; + } +@@ -7924,35 +7836,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (cse_is_idle != pse_is_idle) return; - update_curr(cfs_rq_of(se)); +- if (wakeup_preempt_entity(se, pse) == 1) { +- /* +- * Bias pick_next to pick the sched entity that is +- * triggering this preemption. +- */ +- if (!next_buddy_marked) +- set_next_buddy(pse); + cfs_rq = cfs_rq_of(se); + update_curr(cfs_rq); + -+ if (sched_feat(EEVDF)) { -+ /* -+ * XXX pick_eevdf(cfs_rq) != se ? 
-+ */ -+ if (pick_eevdf(cfs_rq) == pse) -+ goto preempt; -+ -+ return; -+ } -+ - if (wakeup_preempt_entity(se, pse) == 1) { - /* - * Bias pick_next to pick the sched entity that is -@@ -8137,7 +8439,7 @@ static void yield_task_fair(struct rq *rq) ++ /* ++ * XXX pick_eevdf(cfs_rq) != se ? ++ */ ++ if (pick_eevdf(cfs_rq) == pse) + goto preempt; +- } + + return; + + preempt: + resched_curr(rq); +- /* +- * Only set the backward buddy when the current task is still +- * on the rq. This can happen when a wakeup gets interleaved +- * with schedule on the ->pre_schedule() or idle_balance() +- * point, either of which can * drop the rq lock. +- * +- * Also, during early boot the idle thread is in the fair class, +- * for obvious reasons its a bad idea to schedule back to it. +- */ +- if (unlikely(!se->on_rq || curr == rq->idle)) +- return; +- +- if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) +- set_last_buddy(se); + } + + #ifdef CONFIG_SMP +@@ -8153,8 +8049,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) + + /* + * sched_yield() is very simple +- * +- * The magic of dealing with the ->skip buddy is in pick_next_entity. + */ + static void yield_task_fair(struct rq *rq) + { +@@ -8170,21 +8064,19 @@ static void yield_task_fair(struct rq *rq) clear_buddies(cfs_rq, se); - if (curr->policy != SCHED_BATCH) { -+ if (sched_feat(EEVDF) || curr->policy != SCHED_BATCH) { - update_rq_clock(rq); - /* - * Update run-time statistics of the 'current'. -@@ -8150,6 +8452,8 @@ static void yield_task_fair(struct rq *rq) - */ - rq_clock_skip_update(rq); - } -+ if (sched_feat(EEVDF)) -+ se->deadline += calc_delta_fair(se->slice, se); +- update_rq_clock(rq); +- /* +- * Update run-time statistics of the 'current'. +- */ +- update_curr(cfs_rq); +- /* +- * Tell update_rq_clock() that we've just updated, +- * so we don't do microscopic update in schedule() +- * and double the fastpath cost. +- */ +- rq_clock_skip_update(rq); +- } ++ update_rq_clock(rq); ++ /* ++ * Update run-time statistics of the 'current'. ++ */ ++ update_curr(cfs_rq); ++ /* ++ * Tell update_rq_clock() that we've just updated, ++ * so we don't do microscopic update in schedule() ++ * and double the fastpath cost. 
++ */ ++ rq_clock_skip_update(rq); - set_skip_buddy(se); +- set_skip_buddy(se); ++ se->deadline += calc_delta_fair(se->slice, se); } -@@ -11902,8 +12206,8 @@ static void rq_offline_fair(struct rq *rq) + + static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) +@@ -8427,8 +8319,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + * Buddy candidates are cache hot: + */ + if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && +- (&p->se == cfs_rq_of(&p->se)->next || +- &p->se == cfs_rq_of(&p->se)->last)) ++ (&p->se == cfs_rq_of(&p->se)->next)) + return 1; + + if (sysctl_sched_migration_cost == -1) +@@ -11932,8 +11823,8 @@ static void rq_offline_fair(struct rq *rq) static inline bool __entity_slice_used(struct sched_entity *se, int min_nr_tasks) { @@ -1158,31 +1973,130 @@ index 84254f52c56a..c40b775452bc 100644 return (rtime * min_nr_tasks > slice); } -@@ -12330,6 +12634,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +@@ -12077,8 +11968,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + */ + static void task_fork_fair(struct task_struct *p) + { +- struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se, *curr; ++ struct cfs_rq *cfs_rq; + struct rq *rq = this_rq(); + struct rq_flags rf; + +@@ -12087,22 +11978,9 @@ static void task_fork_fair(struct task_struct *p) + + cfs_rq = task_cfs_rq(current); + curr = cfs_rq->curr; +- if (curr) { ++ if (curr) + update_curr(cfs_rq); +- se->vruntime = curr->vruntime; +- } +- place_entity(cfs_rq, se, 1); +- +- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { +- /* +- * Upon rescheduling, sched_class::put_prev_task() will place +- * 'current' within the tree based on its new key value. +- */ +- swap(curr->vruntime, se->vruntime); +- resched_curr(rq); +- } +- +- se->vruntime -= cfs_rq->min_vruntime; ++ place_entity(cfs_rq, se, ENQUEUE_INITIAL); + rq_unlock(rq, &rf); + } + +@@ -12131,34 +12009,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) + check_preempt_curr(rq, p, 0); + } + +-static inline bool vruntime_normalized(struct task_struct *p) +-{ +- struct sched_entity *se = &p->se; +- +- /* +- * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, +- * the dequeue_entity(.flags=0) will already have normalized the +- * vruntime. +- */ +- if (p->on_rq) +- return true; +- +- /* +- * When !on_rq, vruntime of the task has usually NOT been normalized. +- * But there are some cases where it has already been normalized: +- * +- * - A forked child which is waiting for being woken up by +- * wake_up_new_task(). +- * - A task which has been woken up by try_to_wake_up() and +- * waiting for actually being woken up by sched_ttwu_pending(). +- */ +- if (!se->sum_exec_runtime || +- (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup)) +- return true; +- +- return false; +-} +- + #ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Propagate the changes of the sched_entity across the tg tree to make it +@@ -12229,16 +12079,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) + static void detach_task_cfs_rq(struct task_struct *p) + { + struct sched_entity *se = &p->se; +- struct cfs_rq *cfs_rq = cfs_rq_of(se); +- +- if (!vruntime_normalized(p)) { +- /* +- * Fix up our vruntime so that the current sleep doesn't +- * cause 'unlimited' sleep bonus. 
+- */ +- place_entity(cfs_rq, se, 0); +- se->vruntime -= cfs_rq->min_vruntime; +- } + + detach_entity_cfs_rq(se); + } +@@ -12246,12 +12086,8 @@ static void detach_task_cfs_rq(struct task_struct *p) + static void attach_task_cfs_rq(struct task_struct *p) + { + struct sched_entity *se = &p->se; +- struct cfs_rq *cfs_rq = cfs_rq_of(se); + + attach_entity_cfs_rq(se); +- +- if (!vruntime_normalized(p)) +- se->vruntime += cfs_rq->min_vruntime; + } + + static void switched_from_fair(struct rq *rq, struct task_struct *p) +@@ -12362,6 +12198,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) goto err; tg->shares = NICE_0_LOAD; -+ tg->latency_prio = DEFAULT_LATENCY_PRIO; ++ tg->latency_prio = DEFAULT_PRIO; init_cfs_bandwidth(tg_cfs_bandwidth(tg)); -@@ -12428,6 +12733,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, +@@ -12460,6 +12297,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, } se->my_q = cfs_rq; + -+ se->latency_offset = calc_latency_offset(tg->latency_prio); ++ set_latency_fair(se, tg->latency_prio - MAX_RT_PRIO); + /* guarantee group entities always have weight */ update_load_set(&se->load, NICE_0_LOAD); se->parent = parent; -@@ -12558,6 +12866,34 @@ int sched_group_set_idle(struct task_group *tg, long idle) +@@ -12590,6 +12430,29 @@ int sched_group_set_idle(struct task_group *tg, long idle) return 0; } +int sched_group_set_latency(struct task_group *tg, int prio) +{ -+ long latency_offset; + int i; + + if (tg == &root_task_group) @@ -1196,13 +2110,9 @@ index 84254f52c56a..c40b775452bc 100644 + } + + tg->latency_prio = prio; -+ latency_offset = calc_latency_offset(prio); + -+ for_each_possible_cpu(i) { -+ struct sched_entity *se = tg->se[i]; -+ -+ WRITE_ONCE(se->latency_offset, latency_offset); -+ } ++ for_each_possible_cpu(i) ++ set_latency_fair(tg->se[i], prio - MAX_RT_PRIO); + + mutex_unlock(&shares_mutex); + return 0; @@ -1211,7 +2121,7 @@ index 84254f52c56a..c40b775452bc 100644 #else /* CONFIG_FAIR_GROUP_SCHED */ void free_fair_sched_group(struct task_group *tg) { } -@@ -12584,7 +12920,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task +@@ -12616,7 +12479,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * idle runqueue: */ if (rq->cfs.load.weight) @@ -1221,39 +2131,65 @@ index 84254f52c56a..c40b775452bc 100644 return rr_interval; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index efdc29c42161..49c7e6fa4c71 100644 +index efdc29c42161..d4b7d3f7c044 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h -@@ -1,16 +1,18 @@ +@@ -1,16 +1,15 @@ /* SPDX-License-Identifier: GPL-2.0 */ -+ - /* - * Only give sleepers 50% of their service deficit. This allows - * them to run sooner, but does not allow tons of sleepers to - * rip the spread apart. - */ -+SCHED_FEAT(FAIR_SLEEPERS, false) - SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) +-/* +- * Only give sleepers 50% of their service deficit. This allows +- * them to run sooner, but does not allow tons of sleepers to +- * rip the spread apart. +- */ +-SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) /* - * Place new tasks ahead so that they do not starve already running - * tasks -+ * Using the avg_vruntime, do the right thing and preserve lag -+ * across sleep+wake cycles. ++ * Using the avg_vruntime, do the right thing and preserve lag across ++ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. 
*/ -SCHED_FEAT(START_DEBIT, true) -+SCHED_FEAT(PRESERVE_LAG, true) ++SCHED_FEAT(PLACE_LAG, true) ++SCHED_FEAT(PLACE_FUDGE, true) ++SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) ++SCHED_FEAT(PLACE_BONUS, false) ++ ++SCHED_FEAT(MINIMAL_VA, false) /* * Prefer to schedule the task we woke last (assuming it failed -@@ -102,3 +104,5 @@ SCHED_FEAT(LATENCY_WARN, false) +@@ -19,13 +18,6 @@ SCHED_FEAT(START_DEBIT, true) + */ + SCHED_FEAT(NEXT_BUDDY, false) - SCHED_FEAT(ALT_PERIOD, true) - SCHED_FEAT(BASE_SLICE, true) -+ -+SCHED_FEAT(EEVDF, true) +-/* +- * Prefer to schedule the task that ran last (when we did +- * wake-preempt) as that likely will touch the same data, increases +- * cache locality. +- */ +-SCHED_FEAT(LAST_BUDDY, true) +- + /* + * Consider buddies to be cache hot, decreases the likeliness of a + * cache buddy being migrated away, increases cache locality. +@@ -62,7 +54,6 @@ SCHED_FEAT(TTWU_QUEUE, true) + */ + SCHED_FEAT(SIS_PROP, false) + SCHED_FEAT(SIS_UTIL, true) +-SCHED_FEAT(SIS_SHORT, true) + + /* + * Issue a WARN when we do multiple update_rq_clock() calls +@@ -99,6 +90,3 @@ SCHED_FEAT(UTIL_EST, true) + SCHED_FEAT(UTIL_EST_FASTUP, true) + + SCHED_FEAT(LATENCY_WARN, false) +- +-SCHED_FEAT(ALT_PERIOD, true) +-SCHED_FEAT(BASE_SLICE, true) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 9e8bb6278604..fe5af7aaa931 100644 +index 7331d436ebc4..bfce45b21441 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -378,6 +378,8 @@ struct task_group { @@ -1274,28 +2210,67 @@ index 9e8bb6278604..fe5af7aaa931 100644 #ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); -@@ -554,6 +558,9 @@ struct cfs_rq { +@@ -554,6 +558,10 @@ struct cfs_rq { unsigned int idle_nr_running; /* SCHED_IDLE */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ + s64 avg_vruntime; ++ u64 avg_slice; + u64 avg_load; + u64 exec_clock; u64 min_vruntime; #ifdef CONFIG_SCHED_CORE -@@ -2478,6 +2485,8 @@ extern unsigned int sysctl_numa_balancing_scan_size; +@@ -573,8 +581,6 @@ struct cfs_rq { + */ + struct sched_entity *curr; + struct sched_entity *next; +- struct sched_entity *last; +- struct sched_entity *skip; + + #ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +@@ -2154,7 +2160,7 @@ extern const u32 sched_prio_to_wmult[40]; + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_MIGRATED - the task was migrated during wakeup +- * ++ * ENQUEUE_INITIAL - place a new task (fork/clone) + */ + + #define DEQUEUE_SLEEP 0x01 +@@ -2174,6 +2180,7 @@ extern const u32 sched_prio_to_wmult[40]; + #else + #define ENQUEUE_MIGRATED 0x00 + #endif ++#define ENQUEUE_INITIAL 0x80 + + #define RETRY_TASK ((void *)-1UL) + +@@ -2476,10 +2483,7 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; + extern const_debug unsigned int sysctl_sched_migration_cost; + + #ifdef CONFIG_SCHED_DEBUG +-extern unsigned int sysctl_sched_latency; +-extern unsigned int sysctl_sched_min_granularity; +-extern unsigned int sysctl_sched_idle_min_granularity; +-extern unsigned int sysctl_sched_wakeup_granularity; ++extern unsigned int sysctl_sched_base_slice; + extern int sysctl_resched_latency_warn_ms; + extern int sysctl_resched_latency_warn_once; + +@@ -2492,6 +2496,8 @@ extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_hot_threshold; #endif -+extern long calc_latency_offset(int prio); ++extern void set_latency_fair(struct 
sched_entity *se, int prio); + #ifdef CONFIG_SCHED_HRTICK /* -@@ -3251,4 +3260,7 @@ static inline void update_current_exec_runtime(struct task_struct *curr, - cgroup_account_cputime(curr, delta_exec); - } +@@ -3323,4 +3329,7 @@ static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *n + static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { } + #endif +extern u64 avg_vruntime(struct cfs_rq *cfs_rq); +extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1323,4 +2298,4 @@ index 3bac0a8ceab2..b2e932c25be6 100644 #endif /* _UAPI_LINUX_SCHED_H */ -- -2.40.0.rc2 +2.40.0 diff --git a/patches/0003-bore.patch b/patches/0003-bore.patch index 2ba8854..09b4f45 100644 --- a/patches/0003-bore.patch +++ b/patches/0003-bore.patch @@ -1,38 +1,40 @@ -From e016cce088886f56617becc8fcc598a0114e4faa Mon Sep 17 00:00:00 2001 +From d1d05832308e210422f7c52d052b026deb9fabf1 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sat, 11 Mar 2023 18:44:19 +0100 -Subject: [PATCH] bore-eevdf +Date: Thu, 6 Apr 2023 19:12:01 +0200 +Subject: [PATCH] bore Signed-off-by: Peter Jung --- - include/linux/sched.h | 5 ++ - init/Kconfig | 20 +++++++ - kernel/sched/core.c | 29 ++++++++++ - kernel/sched/debug.c | 3 + - kernel/sched/fair.c | 124 +++++++++++++++++++++++++++++++++++++++++- - 5 files changed, 180 insertions(+), 1 deletion(-) + include/linux/sched.h | 6 ++ + init/Kconfig | 20 ++++++ + kernel/sched/core.c | 30 ++++++++ + kernel/sched/debug.c | 3 + + kernel/sched/fair.c | 149 +++++++++++++++++++++++++++++++++++++++- + kernel/sched/features.h | 8 +++ + 6 files changed, 213 insertions(+), 3 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h -index 764df627c243..f912da35db34 100644 +index 63d242164b1a..39a046d6cf90 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h -@@ -558,6 +558,11 @@ struct sched_entity { +@@ -555,6 +555,12 @@ struct sched_entity { u64 sum_exec_runtime; u64 vruntime; u64 prev_sum_exec_runtime; +#ifdef CONFIG_SCHED_BORE + u64 prev_burst_time; + u64 burst_time; -+ u8 burst_score; ++ u64 max_burst_time; ++ u8 penalty_score; +#endif // CONFIG_SCHED_BORE - s64 lag; - u64 slice; + + u64 nr_migrations; diff --git a/init/Kconfig b/init/Kconfig -index 748a9491ca12..d10f1e6257cd 100644 +index 1fb5f313d18f..6595e5ed2416 100644 --- a/init/Kconfig +++ b/init/Kconfig -@@ -1318,6 +1318,26 @@ config CHECKPOINT_RESTORE +@@ -1285,6 +1285,26 @@ config CHECKPOINT_RESTORE If unsure, say N here. 
@@ -60,89 +62,90 @@ index 748a9491ca12..d10f1e6257cd 100644 bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 9db5f9ec9022..1f1e1f586407 100644 +index 0d18c3969f90..34db768f6ba8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -4418,6 +4418,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) +@@ -4418,6 +4418,22 @@ int wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } +#ifdef CONFIG_SCHED_BORE -+static inline void sched_fork_update_prev_burst(struct task_struct *p) ++static inline void adjust_prev_burst(struct task_struct *p) +{ -+ struct task_struct *sib; + u32 cnt = 0; + u64 sum = 0, avg = 0; ++ struct task_struct *sib; + list_for_each_entry(sib, &p->sibling, sibling) { + cnt++; -+ sum += sib->se.prev_burst_time >> 8; ++ sum += sib->se.max_burst_time >> 8; + } + if (cnt) avg = div_u64(sum, cnt) << 8; + if (p->se.prev_burst_time < avg) p->se.prev_burst_time = avg; ++ p->se.max_burst_time = p->se.prev_burst_time; +} +#endif // CONFIG_SCHED_BORE + /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. -@@ -4434,6 +4449,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4434,6 +4450,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; +#ifdef CONFIG_SCHED_BORE + p->se.burst_time = 0; +#endif // CONFIG_SCHED_BORE - p->se.dur_avg = 0; - p->se.prev_sleep_sum_runtime = 0; - p->se.lag = 0; -@@ -4664,6 +4682,10 @@ late_initcall(sched_core_sysctl_init); + INIT_LIST_HEAD(&p->se.group_node); + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -4659,6 +4678,9 @@ late_initcall(sched_core_sysctl_init); int sched_fork(unsigned long clone_flags, struct task_struct *p) { __sched_fork(clone_flags, p); +#ifdef CONFIG_SCHED_BORE -+ sched_fork_update_prev_burst(p); -+ p->se.burst_time = 0; ++ adjust_prev_burst(p); +#endif // CONFIG_SCHED_BORE /* * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external -@@ -9153,6 +9175,9 @@ void __init init_idle(struct task_struct *idle, int cpu) +@@ -9126,6 +9148,10 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->__state = TASK_RUNNING; idle->se.exec_start = sched_clock(); +#ifdef CONFIG_SCHED_BORE + idle->se.prev_burst_time = 0; ++ idle->se.max_burst_time = 0; +#endif //CONFIG_SCHED_BORE /* * PF_KTHREAD should already be set at this point; regardless, make it * look like a proper per-CPU kthread. 
-@@ -9820,6 +9845,10 @@ void __init sched_init(void) +@@ -9793,6 +9819,10 @@ void __init sched_init(void) BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif +#ifdef CONFIG_SCHED_BORE -+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification for 1.7-eevdf2 by Masahito Suzuki"); ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 2.1.1 by Masahito Suzuki"); +#endif // CONFIG_SCHED_BORE + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index fe9edfa43f65..3672df7c1f6a 100644 +index 1637b65ba07a..752c43a9ff13 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c -@@ -551,6 +551,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +@@ -547,6 +547,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); +#ifdef CONFIG_SCHED_BORE -+ SEQ_printf(m, " %2d", p->se.burst_score); ++ SEQ_printf(m, " %2d", p->se.penalty_score); +#endif #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index c40b775452bc..1e4ca5419a11 100644 +index 6986ea31c984..ee461e4586ca 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -19,6 +19,9 @@ @@ -151,19 +154,19 @@ index c40b775452bc..1e4ca5419a11 100644 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * Burst-Oriented Response Enhancer (BORE) CPU Scheduler -+ * Copyright (C) 2021 Masahito Suzuki ++ * Copyright (C) 2021-2023 Masahito Suzuki */ #include #include -@@ -141,6 +144,16 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +@@ -126,6 +129,16 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#ifdef CONFIG_SCHED_BORE -+unsigned int __read_mostly sched_bore = 1; -+unsigned int __read_mostly sched_burst_penalty_scale = 1280; -+unsigned int __read_mostly sched_burst_granularity = 6; -+unsigned int __read_mostly sched_burst_smoothness = 2; ++unsigned int __read_mostly sched_bore = 3; ++unsigned int __read_mostly sched_burst_penalty_offset = 12; ++unsigned int __read_mostly sched_burst_penalty_scale = 1292; ++unsigned int __read_mostly sched_burst_smoothness = 1; +static int three = 3; +static int sixty_four = 64; +static int maxval_12_bits = 4095; @@ -172,7 +175,7 @@ index c40b775452bc..1e4ca5419a11 100644 int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { -@@ -204,6 +217,44 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; +@@ -185,6 +198,44 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { @@ -184,7 +187,16 @@ index c40b775452bc..1e4ca5419a11 100644 + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_ONE, ++ .extra2 = &three, ++ }, ++ { ++ .procname = "sched_burst_penalty_offset", ++ .data = &sched_burst_penalty_offset, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &sixty_four, + }, + { + .procname = "sched_burst_penalty_scale", @@ -196,15 +208,6 @@ index c40b775452bc..1e4ca5419a11 100644 + .extra2 = &maxval_12_bits, + }, + { -+ .procname = 
"sched_burst_granularity", -+ .data = &sched_burst_granularity, -+ .maxlen = sizeof(unsigned int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = &sixty_four, -+ }, -+ { + .procname = "sched_burst_smoothness", + .data = &sched_burst_smoothness, + .maxlen = sizeof(unsigned int), @@ -217,61 +220,70 @@ index c40b775452bc..1e4ca5419a11 100644 { .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, -@@ -1182,6 +1233,39 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) +@@ -891,6 +942,47 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) } #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_BORE -+static inline void update_burst_score(struct sched_entity *se) { -+ u64 burst_time; -+ s32 bits; -+ u32 intgr, fdigs, dec10; -+ -+ burst_time = max(se->burst_time, se->prev_burst_time); -+ bits = fls64(burst_time); -+ intgr = max((u32)bits, sched_burst_granularity) - sched_burst_granularity; -+ fdigs = max(bits - 1, (s32)sched_burst_granularity); -+ dec10 = (intgr << 10) | (burst_time << (64 - fdigs) >> 54); -+ se->burst_score = min((u32)39, dec10 * sched_burst_penalty_scale >> 20); ++static inline u32 __calc_bits10(u64 burst_time) { ++ u32 bits = fls64(burst_time); ++ u32 fdigs = likely(bits) ? bits - 1 : 0; ++ return (bits << 10) | (burst_time << (64 - fdigs) >> 54); +} + -+static u64 burst_scale(u64 delta, struct sched_entity *se) { -+ return mul_u64_u32_shr(delta, sched_prio_to_wmult[se->burst_score], 22); ++static inline u32 __calc_burst_score(u32 bits10, u32 offset) { ++ u32 val10 = max((s32)0, (s32)bits10 - (s32)(offset << 10)); ++ return min((u32)39, val10 * sched_burst_penalty_scale >> 20); +} + -+static u64 calc_delta_fair_bscale(u64 delta, struct sched_entity *se) { -+ return burst_scale(calc_delta_fair(delta, se), se); ++static void update_burst_score(struct sched_entity *se) { ++ u32 bits10 = __calc_bits10(se->max_burst_time); ++ se->penalty_score = __calc_burst_score(bits10, sched_burst_penalty_offset); ++} ++ ++static inline u64 penalty_scale(u64 delta, struct sched_entity *se) { ++ return mul_u64_u32_shr(delta, sched_prio_to_wmult[se->penalty_score], 22); ++} ++ ++static inline u64 preempt_scale( ++ u64 delta, struct sched_entity *curr, struct sched_entity *se) { ++ ++ u32 score = max(0, (s32)se->penalty_score - (s32)curr->penalty_score) >> 1; ++ return mul_u64_u32_shr(delta, sched_prio_to_wmult[min(39, 20 + score)], 22); +} + +static inline u64 binary_smooth(u64 old, u64 new, unsigned int smoothness) { + return (new + old * ((1 << smoothness) - 1)) >> smoothness; +} + -+static inline void reset_burst(struct sched_entity *se) { ++static void reset_burst(struct sched_entity *se) { + se->prev_burst_time = binary_smooth( + se->prev_burst_time, se->burst_time, sched_burst_smoothness); + se->burst_time = 0; ++ ++ se->max_burst_time = se->prev_burst_time; +} +#endif // CONFIG_SCHED_BORE + /* * Update the current task's runtime statistics. 
*/ -@@ -1211,6 +1295,13 @@ static void update_curr(struct cfs_rq *cfs_rq) +@@ -920,6 +1012,14 @@ static void update_curr(struct cfs_rq *cfs_rq) curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq->exec_clock, delta_exec); +#ifdef CONFIG_SCHED_BORE + curr->burst_time += delta_exec; ++ curr->max_burst_time = max(curr->max_burst_time, curr->burst_time); + update_burst_score(curr); -+ if (sched_bore) -+ curr->vruntime += calc_delta_fair_bscale(delta_exec, curr); ++ if (sched_bore & 1) ++ curr->vruntime += penalty_scale(calc_delta_fair(delta_exec, curr), curr); + else +#endif // CONFIG_SCHED_BORE curr->vruntime += calc_delta_fair(delta_exec, curr); - /* - * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i -@@ -5283,6 +5374,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + update_min_vruntime(cfs_rq); + +@@ -5013,8 +5113,14 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -279,18 +291,30 @@ index c40b775452bc..1e4ca5419a11 100644 +static int +wakeup_preempt_entity_bscale(struct sched_entity *curr, + struct sched_entity *se, bool do_scale); -+#endif // CONFIG_SCHED_BORE ++#else // CONFIG_SCHED_BORE static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); ++#endif // CONFIG_SCHED_BORE -@@ -5330,7 +5426,13 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) + /* + * Pick the next process, keeping these things in mind, in this order: +@@ -5053,16 +5159,34 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) + second = curr; + } + ++#ifdef CONFIG_SCHED_BORE ++ if (second && wakeup_preempt_entity_bscale( ++ second, left, sched_bore & 2) < 1) ++#else // CONFIG_SCHED_BORE + if (second && wakeup_preempt_entity(second, left) < 1) ++#endif // CONFIG_SCHED_BORE se = second; } - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { +#ifdef CONFIG_SCHED_BORE + if (cfs_rq->next && wakeup_preempt_entity_bscale( -+ cfs_rq->next, left, sched_bore & 2) < 1) ++ cfs_rq->next, left, sched_bore & 2) < 1) +#else // CONFIG_SCHED_BORE + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) +#endif // CONFIG_SCHED_BORE @@ -298,7 +322,20 @@ index c40b775452bc..1e4ca5419a11 100644 /* * Someone really wants this to run. If it's not unfair, run it. */ -@@ -6615,6 +6717,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + se = cfs_rq->next; +- } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) { ++ } ++#ifdef CONFIG_SCHED_BORE ++ else if (cfs_rq->last && wakeup_preempt_entity_bscale( ++ cfs_rq->last, left, sched_bore & 2) < 1) ++#else // CONFIG_SCHED_BORE ++ else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) ++#endif // CONFIG_SCHED_BORE ++ { + /* + * Prefer last buddy, try to return the CPU to a preempted task. 
+ */ +@@ -6331,6 +6455,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_dequeue(&rq->cfs, p); for_each_sched_entity(se) { @@ -308,7 +345,7 @@ index c40b775452bc..1e4ca5419a11 100644 cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); -@@ -8070,7 +8175,12 @@ static unsigned long wakeup_gran(struct sched_entity *se) +@@ -7746,7 +7873,12 @@ static unsigned long wakeup_gran(struct sched_entity *se) * */ static int @@ -321,28 +358,31 @@ index c40b775452bc..1e4ca5419a11 100644 { s64 gran, vdiff = curr->vruntime - se->vruntime; -@@ -8078,11 +8188,20 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) +@@ -7754,6 +7886,9 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) return -1; gran = wakeup_gran(se); +#ifdef CONFIG_SCHED_BORE -+ if (do_scale) gran = burst_scale(gran, se); ++ if (do_scale) gran = preempt_scale(gran, curr, se); +#endif // CONFIG_SCHED_BORE if (vdiff > gran) return 1; - return 0; - } -+#ifdef CONFIG_SCHED_BORE -+static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) -+{ -+ return wakeup_preempt_entity_bscale(curr, se, false); -+} -+#endif // CONFIG_SCHED_BORE +@@ -7858,7 +7993,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + return; - static void set_last_buddy(struct sched_entity *se) - { -@@ -8430,6 +8549,9 @@ static void yield_task_fair(struct rq *rq) + update_curr(cfs_rq_of(se)); +- if (wakeup_preempt_entity(se, pse) == 1) { ++#ifdef CONFIG_SCHED_BORE ++ if (wakeup_preempt_entity_bscale(se, pse, sched_bore & 2) == 1) ++#else // CONFIG_SCHED_BORE ++ if (wakeup_preempt_entity(se, pse) == 1) ++#endif // CONFIG_SCHED_BORE ++ { + /* + * Bias pick_next to pick the sched entity that is + * triggering this preemption. +@@ -8094,6 +8234,9 @@ static void yield_task_fair(struct rq *rq) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se; @@ -352,5 +392,33 @@ index c40b775452bc..1e4ca5419a11 100644 /* * Are we the only task in the tree? +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index ee7f23c76bd3..3115bde98211 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -4,7 +4,11 @@ + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ ++#ifdef CONFIG_SCHED_BORE ++SCHED_FEAT(GENTLE_FAIR_SLEEPERS, false) ++#else // CONFIG_SCHED_BORE + SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) ++#endif // CONFIG_SCHED_BORE + + /* + * Place new tasks ahead so that they do not starve already running +@@ -17,7 +21,11 @@ SCHED_FEAT(START_DEBIT, true) + * wakeup-preemption), since its likely going to consume data we + * touched, increases cache locality. 
+ */ ++#ifdef CONFIG_SCHED_BORE ++SCHED_FEAT(NEXT_BUDDY, true) ++#else // CONFIG_SCHED_BORE + SCHED_FEAT(NEXT_BUDDY, false) ++#endif // CONFIG_SCHED_BORE + + /* + * Prefer to schedule the task that ran last (when we did -- -2.40.0.rc2 +2.40.0 diff --git a/patches/0006-Nintendo-controller-one.patch b/patches/0006-Nintendo-controller-one.patch new file mode 100644 index 0000000..a27203e --- /dev/null +++ b/patches/0006-Nintendo-controller-one.patch @@ -0,0 +1,46 @@ +diff --git a/drivers/hid/hid-nintendo.c b/drivers/hid/hid-nintendo.c +index 5bfc0c4504608..2b781cc9082b4 100644 +--- a/drivers/hid/hid-nintendo.c ++++ b/drivers/hid/hid-nintendo.c +@@ -1527,6 +1527,7 @@ static int joycon_set_rumble(struct joycon_ctlr *ctlr, u16 amp_r, u16 amp_l, + u16 freq_l_low; + u16 freq_l_high; + unsigned long flags; ++ int next_rq_head; + + spin_lock_irqsave(&ctlr->lock, flags); + freq_r_low = ctlr->rumble_rl_freq; +@@ -1547,8 +1548,21 @@ static int joycon_set_rumble(struct joycon_ctlr *ctlr, u16 amp_r, u16 amp_l, + joycon_encode_rumble(data, freq_l_low, freq_l_high, amp); + + spin_lock_irqsave(&ctlr->lock, flags); +- if (++ctlr->rumble_queue_head >= JC_RUMBLE_QUEUE_SIZE) +- ctlr->rumble_queue_head = 0; ++ ++ next_rq_head = ctlr->rumble_queue_head + 1; ++ if (next_rq_head >= JC_RUMBLE_QUEUE_SIZE) ++ next_rq_head = 0; ++ ++ /* Did we overrun the circular buffer? ++ * If so, be sure we keep the latest intended rumble state. ++ */ ++ if (next_rq_head == ctlr->rumble_queue_tail) { ++ hid_dbg(ctlr->hdev, "rumble queue is full"); ++ /* overwrite the prior value at the end of the circular buf */ ++ next_rq_head = ctlr->rumble_queue_head; ++ } ++ ++ ctlr->rumble_queue_head = next_rq_head; + memcpy(ctlr->rumble_data[ctlr->rumble_queue_head], data, + JC_RUMBLE_DATA_SIZE); + +@@ -2128,7 +2142,7 @@ static int nintendo_hid_probe(struct hid_device *hdev, + + ctlr->hdev = hdev; + ctlr->ctlr_state = JOYCON_CTLR_STATE_INIT; +- ctlr->rumble_queue_head = JC_RUMBLE_QUEUE_SIZE - 1; ++ ctlr->rumble_queue_head = 0; + ctlr->rumble_queue_tail = 0; + hid_set_drvdata(hdev, ctlr); + mutex_init(&ctlr->output_mutex); +generated by cgit (git 2.34.1) at 2023-04-10 17:10:33 +0000 \ No newline at end of file diff --git a/patches/0006-Nintendo-controller-two.patch b/patches/0006-Nintendo-controller-two.patch new file mode 100644 index 0000000..a61397f --- /dev/null +++ b/patches/0006-Nintendo-controller-two.patch @@ -0,0 +1,116 @@ + +diff --git a/drivers/hid/hid-nintendo.c b/drivers/hid/hid-nintendo.c +index 2b781cc9082b4..250f5d2f888ab 100644 +--- a/drivers/hid/hid-nintendo.c ++++ b/drivers/hid/hid-nintendo.c +@@ -433,7 +433,9 @@ struct joycon_ctlr { + u8 usb_ack_match; + u8 subcmd_ack_match; + bool received_input_report; ++ unsigned int last_input_report_msecs; + unsigned int last_subcmd_sent_msecs; ++ unsigned int consecutive_valid_report_deltas; + + /* factory calibration data */ + struct joycon_stick_cal left_stick_cal_x; +@@ -543,19 +545,54 @@ static void joycon_wait_for_input_report(struct joycon_ctlr *ctlr) + * Sending subcommands and/or rumble data at too high a rate can cause bluetooth + * controller disconnections. + */ ++#define JC_INPUT_REPORT_MIN_DELTA 8 ++#define JC_INPUT_REPORT_MAX_DELTA 17 ++#define JC_SUBCMD_TX_OFFSET_MS 4 ++#define JC_SUBCMD_VALID_DELTA_REQ 3 ++#define JC_SUBCMD_RATE_MAX_ATTEMPTS 500 ++#define JC_SUBCMD_RATE_LIMITER_USB_MS 20 ++#define JC_SUBCMD_RATE_LIMITER_BT_MS 60 ++#define JC_SUBCMD_RATE_LIMITER_MS(ctlr) ((ctlr)->hdev->bus == BUS_USB ? 
JC_SUBCMD_RATE_LIMITER_USB_MS : JC_SUBCMD_RATE_LIMITER_BT_MS) + static void joycon_enforce_subcmd_rate(struct joycon_ctlr *ctlr) + { +- static const unsigned int max_subcmd_rate_ms = 25; +- unsigned int current_ms = jiffies_to_msecs(jiffies); +- unsigned int delta_ms = current_ms - ctlr->last_subcmd_sent_msecs; ++ unsigned int current_ms; ++ unsigned long subcmd_delta; ++ int consecutive_valid_deltas = 0; ++ int attempts = 0; ++ unsigned long flags; ++ ++ if (unlikely(ctlr->ctlr_state != JOYCON_CTLR_STATE_READ)) ++ return; + +- while (delta_ms < max_subcmd_rate_ms && +- ctlr->ctlr_state == JOYCON_CTLR_STATE_READ) { ++ do { + joycon_wait_for_input_report(ctlr); + current_ms = jiffies_to_msecs(jiffies); +- delta_ms = current_ms - ctlr->last_subcmd_sent_msecs; ++ subcmd_delta = current_ms - ctlr->last_subcmd_sent_msecs; ++ ++ spin_lock_irqsave(&ctlr->lock, flags); ++ consecutive_valid_deltas = ctlr->consecutive_valid_report_deltas; ++ spin_unlock_irqrestore(&ctlr->lock, flags); ++ ++ attempts++; ++ } while ((consecutive_valid_deltas < JC_SUBCMD_VALID_DELTA_REQ || ++ subcmd_delta < JC_SUBCMD_RATE_LIMITER_MS(ctlr)) && ++ ctlr->ctlr_state == JOYCON_CTLR_STATE_READ && ++ attempts < JC_SUBCMD_RATE_MAX_ATTEMPTS); ++ ++ if (attempts >= JC_SUBCMD_RATE_MAX_ATTEMPTS) { ++ hid_warn(ctlr->hdev, "%s: exceeded max attempts", __func__); ++ return; + } ++ + ctlr->last_subcmd_sent_msecs = current_ms; ++ ++ /* ++ * Wait a short time after receiving an input report before ++ * transmitting. This should reduce odds of a TX coinciding with an RX. ++ * Minimizing concurrent BT traffic with the controller seems to lower ++ * the rate of disconnections. ++ */ ++ msleep(JC_SUBCMD_TX_OFFSET_MS); + } + + static int joycon_hid_send_sync(struct joycon_ctlr *ctlr, u8 *data, size_t len, +@@ -1223,6 +1260,7 @@ static void joycon_parse_report(struct joycon_ctlr *ctlr, + u8 tmp; + u32 btns; + unsigned long msecs = jiffies_to_msecs(jiffies); ++ unsigned long report_delta_ms = msecs - ctlr->last_input_report_msecs; + + spin_lock_irqsave(&ctlr->lock, flags); + if (IS_ENABLED(CONFIG_NINTENDO_FF) && rep->vibrator_report && +@@ -1364,6 +1402,31 @@ static void joycon_parse_report(struct joycon_ctlr *ctlr, + + input_sync(dev); + ++ spin_lock_irqsave(&ctlr->lock, flags); ++ ctlr->last_input_report_msecs = msecs; ++ /* ++ * Was this input report a reasonable time delta compared to the prior ++ * report? We use this information to decide when a safe time is to send ++ * rumble packets or subcommand packets. ++ */ ++ if (report_delta_ms >= JC_INPUT_REPORT_MIN_DELTA && ++ report_delta_ms <= JC_INPUT_REPORT_MAX_DELTA) { ++ if (ctlr->consecutive_valid_report_deltas < JC_SUBCMD_VALID_DELTA_REQ) ++ ctlr->consecutive_valid_report_deltas++; ++ } else { ++ ctlr->consecutive_valid_report_deltas = 0; ++ } ++ /* ++ * Our consecutive valid report tracking is only relevant for ++ * bluetooth-connected controllers. For USB devices, we're beholden to ++ * USB's underlying polling rate anyway. Always set to the consecutive ++ * delta requirement. ++ */ ++ if (ctlr->hdev->bus == BUS_USB) ++ ctlr->consecutive_valid_report_deltas = JC_SUBCMD_VALID_DELTA_REQ; ++ ++ spin_unlock_irqrestore(&ctlr->lock, flags); ++ + /* + * Immediately after receiving a report is the most reliable time to + * send a subcommand to the controller. 
Wake any subcommand senders \ No newline at end of file diff --git a/scripts/patch.sh b/scripts/patch.sh index 5b711eb..04f493e 100755 --- a/scripts/patch.sh +++ b/scripts/patch.sh @@ -13,6 +13,10 @@ patch -Np1 < "../patches/0003-bore.patch" patch -Np1 < "../patches/0004-hdr.patch" # AMD GPU USB C fix patch patch -Np1 < "../patches/0005-amd-usbc-fix.patch" +# Nintendo controller rumble patch +patch -Np1 < "../patches/0006-Nintendo-controller-one.patch" +# Nintendo controller BT patch +patch -Np1 < "../patches/0006-Nintendo-controller-two.patch" # Nobara patches are here: https://github.com/sammilucia/nobara-kernel-fork # Extra Leigon laptop goodies patch -Np1 < "../patches/0001-Add-legion-laptop-v0.1.patch" diff --git a/scripts/source.sh b/scripts/source.sh index 3a486a5..4c457c0 100755 --- a/scripts/source.sh +++ b/scripts/source.sh @@ -2,7 +2,7 @@ echo "Pika Kernel - Getting source" -wget -nv https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.2.6.tar.gz -tar -zxf ./linux-6.2.6.tar.gz +wget -nv https://git.kernel.org/torvalds/t/linux-6.3-rc6.tar.gz +tar -zxf ./linux-6.3-rc6.tar.gz -cd linux-6.2.6 +cd linux-6.3-rc6
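
For reference, the net effect of the two script changes above (scripts/source.sh now fetching the 6.3-rc6 snapshot, scripts/patch.sh additionally applying the two new Nintendo controller patches) can be reproduced by hand roughly as follows. This is a minimal sketch, not part of the repository: it assumes the layout implied by the hunks above (a patches/ directory that is a sibling of the unpacked kernel tree, so ../patches/ resolves from inside it) and that wget, tar and patch are available; only the URL and the patch file names are taken from the diffs, everything else is illustrative.

    #!/usr/bin/env bash
    set -euo pipefail

    # Fetch and unpack the kernel snapshot, as the updated scripts/source.sh does.
    wget -nv https://git.kernel.org/torvalds/t/linux-6.3-rc6.tar.gz
    tar -zxf ./linux-6.3-rc6.tar.gz
    cd linux-6.3-rc6

    # Apply the two new controller patches in the order they were added to
    # scripts/patch.sh. (The real script applies the full patch series; only
    # the newly added entries are shown here.)
    patch -Np1 < ../patches/0006-Nintendo-controller-one.patch
    patch -Np1 < ../patches/0006-Nintendo-controller-two.patch

The order matters: the index line of 0006-Nintendo-controller-two.patch (2b781cc9082b4..250f5d2f888ab) starts from the blob that 0006-Nintendo-controller-one.patch produces (5bfc0c4504608..2b781cc9082b4), so the second patch only applies cleanly on top of the first, which is the order scripts/patch.sh uses.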