From f464089613dcec33876d917084e17ae20fd30941 Mon Sep 17 00:00:00 2001 From: ferrreo Date: Mon, 10 Apr 2023 18:42:41 +0100 Subject: [PATCH] Linux 6.3 + nintendo controller fixes --- config | 6 +- patches/0001-cachy-all.patch | 34544 +++---------------- patches/0002-eevdf.patch | 1859 +- patches/0003-bore.patch | 256 +- patches/0006-Nintendo-controller-one.patch | 46 + patches/0006-Nintendo-controller-two.patch | 116 + scripts/patch.sh | 4 + scripts/source.sh | 6 +- 8 files changed, 6932 insertions(+), 29905 deletions(-) create mode 100644 patches/0006-Nintendo-controller-one.patch create mode 100644 patches/0006-Nintendo-controller-two.patch diff --git a/config b/config index 746172b..32d8691 100644 --- a/config +++ b/config @@ -2,7 +2,7 @@ # Automatically generated file; DO NOT EDIT. # Linux/x86 6.2.1 Kernel Configuration # -CONFIG_CC_VERSION_TEXT="gcc (GCC) 12.2.1 20230216" +CONFIG_CC_VERSION_TEXT="gcc (GCC) 12.2.1 20230311" CONFIG_CC_IS_GCC=y CONFIG_GCC_VERSION=120201 CONFIG_CLANG_VERSION=0 @@ -534,7 +534,7 @@ CONFIG_X86_PAT=y CONFIG_ARCH_USES_PG_UNCACHED=y CONFIG_X86_UMIP=y CONFIG_CC_HAS_IBT=y -# CONFIG_X86_KERNEL_IBT is not set +CONFIG_X86_KERNEL_IBT=y CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y # CONFIG_X86_INTEL_TSX_MODE_OFF is not set # CONFIG_X86_INTEL_TSX_MODE_ON is not set @@ -6938,7 +6938,7 @@ CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER=y # CONFIG_LOGO is not set # end of Graphics support -# CONFIG_DRM_ACCEL is not set +CONFIG_DRM_ACCEL=y CONFIG_SOUND=m CONFIG_SOUND_OSS_CORE=y # CONFIG_SOUND_OSS_CORE_PRECLAIM is not set diff --git a/patches/0001-cachy-all.patch b/patches/0001-cachy-all.patch index c1c40d6..ffe879d 100644 --- a/patches/0001-cachy-all.patch +++ b/patches/0001-cachy-all.patch @@ -1,7 +1,7 @@ -From d7322fe0d4d120555d7dd3c2a6167f7f726b8738 Mon Sep 17 00:00:00 2001 +From c2fc7486fbb316ab576a741ae264255a4cc4de44 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 10 Mar 2023 17:59:47 +0100 -Subject: [PATCH 01/16] bbr2 +Date: Mon, 6 Mar 2023 18:43:03 +0100 +Subject: [PATCH 01/10] bbr2 Signed-off-by: Peter Jung --- @@ -256,10 +256,10 @@ index 2dfb12230f08..b6bec331a82e 100644 config TCP_MD5SIG diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile -index af7d2cf490fb..e7a86a50838a 100644 +index 880277c9fd07..ef1da49d20a6 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile -@@ -46,6 +46,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o +@@ -47,6 +47,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o @@ -268,10 +268,10 @@ index af7d2cf490fb..e7a86a50838a 100644 obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index 33f559f491c8..e9e8040d6491 100644 +index 288693981b00..1d530667b172 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c -@@ -3191,6 +3191,7 @@ int tcp_disconnect(struct sock *sk, int flags) +@@ -3192,6 +3192,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; @@ -280,7 +280,7 @@ index 33f559f491c8..e9e8040d6491 100644 /* Clean up fastopen related fields */ diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c -index d2c470524e58..af08fb3cb139 100644 +index 146792cd26fe..16038f6ee52a 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -294,26 +294,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) @@ -288,7 +288,7 @@ index d2c470524e58..af08fb3cb139 100644 } -/* override 
sysctl_tcp_min_tso_segs */ - static u32 bbr_min_tso_segs(struct sock *sk) + __bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) { return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; } @@ -3025,7 +3025,7 @@ index 000000000000..85f8052144d1 +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c -index d3cae40749e8..0f268f2ff2e9 100644 +index db8b4b488c31..0d6d1a949e11 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -189,6 +189,7 @@ void tcp_init_congestion_control(struct sock *sk) @@ -3123,7 +3123,7 @@ index cc072d2cfcd8..754e0212c951 100644 tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 71d01cf3c13e..0da3da9e56db 100644 +index ba839e441450..5ffec885e66f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -375,7 +375,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, @@ -3281,1451 +3281,31 @@ index cb79127f45c3..70e4de876a7f 100644 event = icsk->icsk_pending; -- -2.40.0.rc2 +2.40.0 -From 87439b08ac56036539528efb6da691914f41ca76 Mon Sep 17 00:00:00 2001 +From 8f60626e149f4437570d56968a2cab10a822fcd4 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 10 Mar 2023 18:00:04 +0100 -Subject: [PATCH 02/16] bfq +Date: Sun, 9 Apr 2023 21:21:39 +0200 +Subject: [PATCH 02/10] bfq Signed-off-by: Peter Jung --- - block/bfq-cgroup.c | 101 ++++--- - block/bfq-iosched.c | 637 ++++++++++++++++++++++++++++-------------- - block/bfq-iosched.h | 144 ++++++++-- - block/bfq-wf2q.c | 2 +- - block/blk-cgroup.c | 122 ++++---- - block/blk-cgroup.h | 10 +- - block/blk-iocost.c | 58 ++-- - block/blk-iolatency.c | 39 ++- - block/blk-rq-qos.h | 2 +- - block/blk-throttle.c | 16 +- - block/blk.h | 6 - - 11 files changed, 747 insertions(+), 390 deletions(-) + block/bfq-iosched.c | 6 ++++++ + 1 file changed, 6 insertions(+) -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c -index 0fbde0fc0628..59929dfd559b 100644 ---- a/block/bfq-cgroup.c -+++ b/block/bfq-cgroup.c -@@ -706,12 +706,52 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_activate_bfqq(bfqd, bfqq); - } - -- if (!bfqd->in_service_queue && !bfqd->rq_in_driver) -+ if (!bfqd->in_service_queue && !bfqd->tot_rq_in_driver) - bfq_schedule_dispatch(bfqd); - /* release extra ref taken above, bfqq may happen to be freed now */ - bfq_put_queue(bfqq); - } - -+static void bfq_sync_bfqq_move(struct bfq_data *bfqd, -+ struct bfq_queue *sync_bfqq, -+ struct bfq_io_cq *bic, -+ struct bfq_group *bfqg, -+ unsigned int act_idx) -+{ -+ struct bfq_queue *bfqq; -+ -+ if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { -+ /* We are the only user of this bfqq, just move it */ -+ if (sync_bfqq->entity.sched_data != &bfqg->sched_data) -+ bfq_bfqq_move(bfqd, sync_bfqq, bfqg); -+ return; -+ } -+ -+ /* -+ * The queue was merged to a different queue. Check -+ * that the merge chain still belongs to the same -+ * cgroup. -+ */ -+ for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) -+ if (bfqq->entity.sched_data != &bfqg->sched_data) -+ break; -+ if (bfqq) { -+ /* -+ * Some queue changed cgroup so the merge is not valid -+ * anymore. We cannot easily just cancel the merge (by -+ * clearing new_bfqq) as there may be other processes -+ * using this queue and holding refs to all queues -+ * below sync_bfqq->new_bfqq. 
Similarly if the merge -+ * already happened, we need to detach from bfqq now -+ * so that we cannot merge bio to a request from the -+ * old cgroup. -+ */ -+ bfq_put_cooperator(sync_bfqq); -+ bic_set_bfqq(bic, NULL, true, act_idx); -+ bfq_release_process_ref(bfqd, sync_bfqq); -+ } -+} -+ - /** - * __bfq_bic_change_cgroup - move @bic to @bfqg. - * @bfqd: the queue descriptor. -@@ -726,53 +766,20 @@ static void __bfq_bic_change_cgroup(struct bfq_data *bfqd, - struct bfq_io_cq *bic, - struct bfq_group *bfqg) - { -- struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false); -- struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true); -- struct bfq_entity *entity; -+ unsigned int act_idx; - -- if (async_bfqq) { -- entity = &async_bfqq->entity; -+ for (act_idx = 0; act_idx < bfqd->num_actuators; act_idx++) { -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false, act_idx); -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true, act_idx); - -- if (entity->sched_data != &bfqg->sched_data) { -- bic_set_bfqq(bic, NULL, false); -+ if (async_bfqq && -+ async_bfqq->entity.sched_data != &bfqg->sched_data) { -+ bic_set_bfqq(bic, NULL, false, act_idx); - bfq_release_process_ref(bfqd, async_bfqq); - } -- } - -- if (sync_bfqq) { -- if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { -- /* We are the only user of this bfqq, just move it */ -- if (sync_bfqq->entity.sched_data != &bfqg->sched_data) -- bfq_bfqq_move(bfqd, sync_bfqq, bfqg); -- } else { -- struct bfq_queue *bfqq; -- -- /* -- * The queue was merged to a different queue. Check -- * that the merge chain still belongs to the same -- * cgroup. -- */ -- for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) -- if (bfqq->entity.sched_data != -- &bfqg->sched_data) -- break; -- if (bfqq) { -- /* -- * Some queue changed cgroup so the merge is -- * not valid anymore. We cannot easily just -- * cancel the merge (by clearing new_bfqq) as -- * there may be other processes using this -- * queue and holding refs to all queues below -- * sync_bfqq->new_bfqq. Similarly if the merge -- * already happened, we need to detach from -- * bfqq now so that we cannot merge bio to a -- * request from the old cgroup. 
-- */ -- bfq_put_cooperator(sync_bfqq); -- bic_set_bfqq(bic, NULL, true); -- bfq_release_process_ref(bfqd, sync_bfqq); -- } -- } -+ if (sync_bfqq) -+ bfq_sync_bfqq_move(bfqd, sync_bfqq, bic, bfqg, act_idx); - } - } - -@@ -1106,9 +1113,11 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, - struct bfq_group *bfqg; - u64 v; - -- ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, buf, &ctx); -+ blkg_conf_init(&ctx, buf); -+ -+ ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, &ctx); - if (ret) -- return ret; -+ goto out; - - if (sscanf(ctx.body, "%llu", &v) == 1) { - /* require "default" on dfl */ -@@ -1130,7 +1139,7 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, - ret = 0; - } - out: -- blkg_conf_finish(&ctx); -+ blkg_conf_exit(&ctx); - return ret ?: nbytes; - } - diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -index 380e9bda2e57..aa644973d260 100644 +index d9ed3108c17a..f32b177a36e5 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c -@@ -377,20 +377,23 @@ static const unsigned long bfq_late_stable_merging = 600; - #define RQ_BIC(rq) ((struct bfq_io_cq *)((rq)->elv.priv[0])) - #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) - --struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) -+struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, -+ unsigned int actuator_idx) - { -- return bic->bfqq[is_sync]; -+ if (is_sync) -+ return bic->bfqq[1][actuator_idx]; -+ -+ return bic->bfqq[0][actuator_idx]; - } - - static void bfq_put_stable_ref(struct bfq_queue *bfqq); - --void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) -+void bic_set_bfqq(struct bfq_io_cq *bic, -+ struct bfq_queue *bfqq, -+ bool is_sync, -+ unsigned int actuator_idx) - { -- struct bfq_queue *old_bfqq = bic->bfqq[is_sync]; -- -- /* Clear bic pointer if bfqq is detached from this bic */ -- if (old_bfqq && old_bfqq->bic == bic) -- old_bfqq->bic = NULL; -+ struct bfq_queue *old_bfqq = bic->bfqq[is_sync][actuator_idx]; - - /* - * If bfqq != NULL, then a non-stable queue merge between -@@ -405,9 +408,18 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) - * we cancel the stable merge if - * bic->stable_merge_bfqq == bfqq. - */ -- bic->bfqq[is_sync] = bfqq; -+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[actuator_idx]; -+ -+ /* Clear bic pointer if bfqq is detached from this bic */ -+ if (old_bfqq && old_bfqq->bic == bic) -+ old_bfqq->bic = NULL; - -- if (bfqq && bic->stable_merge_bfqq == bfqq) { -+ if (is_sync) -+ bic->bfqq[1][actuator_idx] = bfqq; -+ else -+ bic->bfqq[0][actuator_idx] = bfqq; -+ -+ if (bfqq && bfqq_data->stable_merge_bfqq == bfqq) { - /* - * Actually, these same instructions are executed also - * in bfq_setup_cooperator, in case of abort or actual -@@ -416,9 +428,9 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) - * did so, we would nest even more complexity in this - * function. - */ -- bfq_put_stable_ref(bic->stable_merge_bfqq); -+ bfq_put_stable_ref(bfqq_data->stable_merge_bfqq); - -- bic->stable_merge_bfqq = NULL; -+ bfqq_data->stable_merge_bfqq = NULL; - } - } - -@@ -678,9 +690,9 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) - { - struct bfq_data *bfqd = data->q->elevator->elevator_data; - struct bfq_io_cq *bic = bfq_bic_lookup(data->q); -- struct bfq_queue *bfqq = bic ? 
bic_to_bfqq(bic, op_is_sync(opf)) : NULL; - int depth; - unsigned limit = data->q->nr_requests; -+ unsigned int act_idx; - - /* Sync reads have full depth available */ - if (op_is_sync(opf) && !op_is_write(opf)) { -@@ -690,14 +702,21 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) - limit = (limit * depth) >> bfqd->full_depth_shift; - } - -- /* -- * Does queue (or any parent entity) exceed number of requests that -- * should be available to it? Heavily limit depth so that it cannot -- * consume more available requests and thus starve other entities. -- */ -- if (bfqq && bfqq_request_over_limit(bfqq, limit)) -- depth = 1; -+ for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { -+ struct bfq_queue *bfqq = -+ bic_to_bfqq(bic, op_is_sync(opf), act_idx); - -+ /* -+ * Does queue (or any parent entity) exceed number of -+ * requests that should be available to it? Heavily -+ * limit depth so that it cannot consume more -+ * available requests and thus starve other entities. -+ */ -+ if (bfqq && bfqq_request_over_limit(bfqq, limit)) { -+ depth = 1; -+ break; -+ } -+ } - bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", - __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth); - if (depth) -@@ -1074,9 +1093,6 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) - { - u64 dur; - -- if (bfqd->bfq_wr_max_time > 0) -- return bfqd->bfq_wr_max_time; -- - dur = bfqd->rate_dur_prod; - do_div(dur, bfqd->peak_rate); - -@@ -1118,36 +1134,39 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - { - unsigned int old_wr_coeff = 1; - bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); -+ unsigned int a_idx = bfqq->actuator_idx; -+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; - -- if (bic->saved_has_short_ttime) -+ if (bfqq_data->saved_has_short_ttime) - bfq_mark_bfqq_has_short_ttime(bfqq); - else - bfq_clear_bfqq_has_short_ttime(bfqq); - -- if (bic->saved_IO_bound) -+ if (bfqq_data->saved_IO_bound) - bfq_mark_bfqq_IO_bound(bfqq); - else - bfq_clear_bfqq_IO_bound(bfqq); - -- bfqq->last_serv_time_ns = bic->saved_last_serv_time_ns; -- bfqq->inject_limit = bic->saved_inject_limit; -- bfqq->decrease_time_jif = bic->saved_decrease_time_jif; -+ bfqq->last_serv_time_ns = bfqq_data->saved_last_serv_time_ns; -+ bfqq->inject_limit = bfqq_data->saved_inject_limit; -+ bfqq->decrease_time_jif = bfqq_data->saved_decrease_time_jif; - -- bfqq->entity.new_weight = bic->saved_weight; -- bfqq->ttime = bic->saved_ttime; -- bfqq->io_start_time = bic->saved_io_start_time; -- bfqq->tot_idle_time = bic->saved_tot_idle_time; -+ bfqq->entity.new_weight = bfqq_data->saved_weight; -+ bfqq->ttime = bfqq_data->saved_ttime; -+ bfqq->io_start_time = bfqq_data->saved_io_start_time; -+ bfqq->tot_idle_time = bfqq_data->saved_tot_idle_time; - /* - * Restore weight coefficient only if low_latency is on - */ - if (bfqd->low_latency) { - old_wr_coeff = bfqq->wr_coeff; -- bfqq->wr_coeff = bic->saved_wr_coeff; -+ bfqq->wr_coeff = bfqq_data->saved_wr_coeff; - } -- bfqq->service_from_wr = bic->saved_service_from_wr; -- bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; -- bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; -- bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; -+ bfqq->service_from_wr = bfqq_data->saved_service_from_wr; -+ bfqq->wr_start_at_switch_to_srt = -+ bfqq_data->saved_wr_start_at_switch_to_srt; -+ bfqq->last_wr_start_finish = bfqq_data->saved_last_wr_start_finish; -+ bfqq->wr_cur_max_time = 
bfqq_data->saved_wr_cur_max_time; - - if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || - time_is_before_jiffies(bfqq->last_wr_start_finish + -@@ -1766,6 +1785,33 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, - return bfqq_weight > in_serv_weight; - } - -+/* -+ * Get the index of the actuator that will serve bio. -+ */ -+static unsigned int bfq_actuator_index(struct bfq_data *bfqd, struct bio *bio) -+{ -+ unsigned int i; -+ sector_t end; -+ -+ /* no search needed if one or zero ranges present */ -+ if (bfqd->num_actuators == 1) -+ return 0; -+ -+ /* bio_end_sector(bio) gives the sector after the last one */ -+ end = bio_end_sector(bio) - 1; -+ -+ for (i = 0; i < bfqd->num_actuators; i++) { -+ if (end >= bfqd->sector[i] && -+ end < bfqd->sector[i] + bfqd->nr_sectors[i]) -+ return i; -+ } -+ -+ WARN_ONCE(true, -+ "bfq_actuator_index: bio sector out of ranges: end=%llu\n", -+ end); -+ return 0; -+} -+ - static bool bfq_better_to_idle(struct bfq_queue *bfqq); - - static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, -@@ -1785,7 +1831,9 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, - arrived_in_time = ktime_get_ns() <= - bfqq->ttime.last_end_request + - bfqd->bfq_slice_idle * 3; -- -+ unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio); -+ bool bfqq_non_merged_or_stably_merged = -+ bfqq->bic || RQ_BIC(rq)->bfqq_data[act_idx].stably_merged; - - /* - * bfqq deserves to be weight-raised if: -@@ -1819,9 +1867,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, - */ - wr_or_deserves_wr = bfqd->low_latency && - (bfqq->wr_coeff > 1 || -- (bfq_bfqq_sync(bfqq) && -- (bfqq->bic || RQ_BIC(rq)->stably_merged) && -- (*interactive || soft_rt))); -+ (bfq_bfqq_sync(bfqq) && bfqq_non_merged_or_stably_merged && -+ (*interactive || soft_rt))); - - /* - * Using the last flag, update budget and check whether bfqq -@@ -2098,7 +2145,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * We reset waker detection logic also if too much time has passed - * since the first detection. If wakeups are rare, pointless idling - * doesn't hurt throughput that much. The condition below makes sure -- * we do not uselessly idle blocking waker in more than 1/64 cases. -+ * we do not uselessly idle blocking waker in more than 1/64 cases. - */ - if (bfqd->last_completed_rq_bfqq != - bfqq->tentative_waker_bfqq || -@@ -2209,9 +2256,9 @@ static void bfq_add_request(struct request *rq) - * elapsed. - */ - if (bfqq == bfqd->in_service_queue && -- (bfqd->rq_in_driver == 0 || -+ (bfqd->tot_rq_in_driver == 0 || - (bfqq->last_serv_time_ns > 0 && -- bfqd->rqs_injected && bfqd->rq_in_driver > 0)) && -+ bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) && - time_is_before_eq_jiffies(bfqq->decrease_time_jif + - msecs_to_jiffies(10))) { - bfqd->last_empty_occupied_ns = ktime_get_ns(); -@@ -2235,7 +2282,7 @@ static void bfq_add_request(struct request *rq) - * will be set in case injection is performed - * on bfqq before rq is completed). 
- */ -- if (bfqd->rq_in_driver == 0) -+ if (bfqd->tot_rq_in_driver == 0) - bfqd->rqs_injected = false; - } - } -@@ -2418,7 +2465,8 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, - */ - bfq_bic_update_cgroup(bic, bio); - -- bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); -+ bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf), -+ bfq_actuator_index(bfqd, bio)); - } else { - bfqd->bio_bfqq = NULL; - } -@@ -2584,24 +2632,29 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) - void bfq_end_wr_async_queues(struct bfq_data *bfqd, - struct bfq_group *bfqg) - { -- int i, j; -+ int i, j, k; - -- for (i = 0; i < 2; i++) -- for (j = 0; j < IOPRIO_NR_LEVELS; j++) -- if (bfqg->async_bfqq[i][j]) -- bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); -- if (bfqg->async_idle_bfqq) -- bfq_bfqq_end_wr(bfqg->async_idle_bfqq); -+ for (k = 0; k < bfqd->num_actuators; k++) { -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_NR_LEVELS; j++) -+ if (bfqg->async_bfqq[i][j][k]) -+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j][k]); -+ if (bfqg->async_idle_bfqq[k]) -+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq[k]); -+ } - } - - static void bfq_end_wr(struct bfq_data *bfqd) - { - struct bfq_queue *bfqq; -+ int i; - - spin_lock_irq(&bfqd->lock); - -- list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -- bfq_bfqq_end_wr(bfqq); -+ for (i = 0; i < bfqd->num_actuators; i++) { -+ list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ } - list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) - bfq_bfqq_end_wr(bfqq); - bfq_end_wr_async(bfqd); -@@ -2794,6 +2847,40 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, - struct bfq_queue *bfqq); - -+static struct bfq_queue * -+bfq_setup_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_queue *stable_merge_bfqq, -+ struct bfq_iocq_bfqq_data *bfqq_data) -+{ -+ int proc_ref = min(bfqq_process_refs(bfqq), -+ bfqq_process_refs(stable_merge_bfqq)); -+ struct bfq_queue *new_bfqq = NULL; -+ -+ bfqq_data->stable_merge_bfqq = NULL; -+ if (idling_boosts_thr_without_issues(bfqd, bfqq) || proc_ref == 0) -+ goto out; -+ -+ /* next function will take at least one ref */ -+ new_bfqq = bfq_setup_merge(bfqq, stable_merge_bfqq); -+ -+ if (new_bfqq) { -+ bfqq_data->stably_merged = true; -+ if (new_bfqq->bic) { -+ unsigned int new_a_idx = new_bfqq->actuator_idx; -+ struct bfq_iocq_bfqq_data *new_bfqq_data = -+ &new_bfqq->bic->bfqq_data[new_a_idx]; -+ -+ new_bfqq_data->stably_merged = true; -+ } -+ } -+ -+out: -+ /* deschedule stable merge, because done or aborted here */ -+ bfq_put_stable_ref(stable_merge_bfqq); -+ -+ return new_bfqq; -+} -+ - /* - * Attempt to schedule a merge of bfqq with the currently in-service - * queue or with a close queue among the scheduled queues. 
Return -@@ -2819,6 +2906,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - void *io_struct, bool request, struct bfq_io_cq *bic) - { - struct bfq_queue *in_service_bfqq, *new_bfqq; -+ unsigned int a_idx = bfqq->actuator_idx; -+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; - - /* if a merge has already been setup, then proceed with that first */ - if (bfqq->new_bfqq) -@@ -2840,37 +2929,18 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * stable merging) also if bic is associated with a - * sync queue, but this bfqq is async - */ -- if (bfq_bfqq_sync(bfqq) && bic->stable_merge_bfqq && -+ if (bfq_bfqq_sync(bfqq) && bfqq_data->stable_merge_bfqq && - !bfq_bfqq_just_created(bfqq) && - time_is_before_jiffies(bfqq->split_time + - msecs_to_jiffies(bfq_late_stable_merging)) && - time_is_before_jiffies(bfqq->creation_time + - msecs_to_jiffies(bfq_late_stable_merging))) { - struct bfq_queue *stable_merge_bfqq = -- bic->stable_merge_bfqq; -- int proc_ref = min(bfqq_process_refs(bfqq), -- bfqq_process_refs(stable_merge_bfqq)); -- -- /* deschedule stable merge, because done or aborted here */ -- bfq_put_stable_ref(stable_merge_bfqq); -- -- bic->stable_merge_bfqq = NULL; -- -- if (!idling_boosts_thr_without_issues(bfqd, bfqq) && -- proc_ref > 0) { -- /* next function will take at least one ref */ -- struct bfq_queue *new_bfqq = -- bfq_setup_merge(bfqq, stable_merge_bfqq); -- -- if (new_bfqq) { -- bic->stably_merged = true; -- if (new_bfqq->bic) -- new_bfqq->bic->stably_merged = -- true; -- } -- return new_bfqq; -- } else -- return NULL; -+ bfqq_data->stable_merge_bfqq; -+ -+ return bfq_setup_stable_merge(bfqd, bfqq, -+ stable_merge_bfqq, -+ bfqq_data); - } - } - -@@ -2965,6 +3035,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - { - struct bfq_io_cq *bic = bfqq->bic; -+ unsigned int a_idx = bfqq->actuator_idx; -+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; - - /* - * If !bfqq->bic, the queue is already shared or its requests -@@ -2974,18 +3046,21 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - if (!bic) - return; - -- bic->saved_last_serv_time_ns = bfqq->last_serv_time_ns; -- bic->saved_inject_limit = bfqq->inject_limit; -- bic->saved_decrease_time_jif = bfqq->decrease_time_jif; -- -- bic->saved_weight = bfqq->entity.orig_weight; -- bic->saved_ttime = bfqq->ttime; -- bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); -- bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -- bic->saved_io_start_time = bfqq->io_start_time; -- bic->saved_tot_idle_time = bfqq->tot_idle_time; -- bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -- bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -+ bfqq_data->saved_last_serv_time_ns = bfqq->last_serv_time_ns; -+ bfqq_data->saved_inject_limit = bfqq->inject_limit; -+ bfqq_data->saved_decrease_time_jif = bfqq->decrease_time_jif; -+ -+ bfqq_data->saved_weight = bfqq->entity.orig_weight; -+ bfqq_data->saved_ttime = bfqq->ttime; -+ bfqq_data->saved_has_short_ttime = -+ bfq_bfqq_has_short_ttime(bfqq); -+ bfqq_data->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -+ bfqq_data->saved_io_start_time = bfqq->io_start_time; -+ bfqq_data->saved_tot_idle_time = bfqq->tot_idle_time; -+ bfqq_data->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -+ bfqq_data->was_in_burst_list = -+ !hlist_unhashed(&bfqq->burst_list_node); -+ - if (unlikely(bfq_bfqq_just_created(bfqq) && - 
!bfq_bfqq_in_large_burst(bfqq) && - bfqq->bfqd->low_latency)) { -@@ -2998,17 +3073,21 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - * to bfqq, so that to avoid that bfqq unjustly fails - * to enjoy weight raising if split soon. - */ -- bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; -- bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now(); -- bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); -- bic->saved_last_wr_start_finish = jiffies; -+ bfqq_data->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; -+ bfqq_data->saved_wr_start_at_switch_to_srt = -+ bfq_smallest_from_now(); -+ bfqq_data->saved_wr_cur_max_time = -+ bfq_wr_duration(bfqq->bfqd); -+ bfqq_data->saved_last_wr_start_finish = jiffies; - } else { -- bic->saved_wr_coeff = bfqq->wr_coeff; -- bic->saved_wr_start_at_switch_to_srt = -+ bfqq_data->saved_wr_coeff = bfqq->wr_coeff; -+ bfqq_data->saved_wr_start_at_switch_to_srt = - bfqq->wr_start_at_switch_to_srt; -- bic->saved_service_from_wr = bfqq->service_from_wr; -- bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -- bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ bfqq_data->saved_service_from_wr = -+ bfqq->service_from_wr; -+ bfqq_data->saved_last_wr_start_finish = -+ bfqq->last_wr_start_finish; -+ bfqq_data->saved_wr_cur_max_time = bfqq->wr_cur_max_time; - } - } - -@@ -3114,7 +3193,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - /* - * Merge queues (that is, let bic redirect its requests to new_bfqq) - */ -- bic_set_bfqq(bic, new_bfqq, true); -+ bic_set_bfqq(bic, new_bfqq, true, bfqq->actuator_idx); - bfq_mark_bfqq_coop(new_bfqq); - /* - * new_bfqq now belongs to at least two bics (it is a shared queue): -@@ -3532,13 +3611,13 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - * - start a new observation interval with this dispatch - */ - if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && -- bfqd->rq_in_driver == 0) -+ bfqd->tot_rq_in_driver == 0) - goto update_rate_and_reset; - - /* Update sampling information */ - bfqd->peak_rate_samples++; - -- if ((bfqd->rq_in_driver > 0 || -+ if ((bfqd->tot_rq_in_driver > 0 || - now_ns - bfqd->last_completion < BFQ_MIN_TT) - && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) - bfqd->sequential_samples++; -@@ -3803,10 +3882,8 @@ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, - return false; - - return (bfqq->wr_coeff > 1 && -- (bfqd->wr_busy_queues < -- tot_busy_queues || -- bfqd->rq_in_driver >= -- bfqq->dispatched + 4)) || -+ (bfqd->wr_busy_queues < tot_busy_queues || -+ bfqd->tot_rq_in_driver >= bfqq->dispatched + 4)) || - bfq_asymmetric_scenario(bfqd, bfqq) || - tot_busy_queues == 1; - } -@@ -4072,8 +4149,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - * function to evaluate the I/O speed of a process. - */ - static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- bool compensate, enum bfqq_expiration reason, -- unsigned long *delta_ms) -+ bool compensate, unsigned long *delta_ms) - { - ktime_t delta_ktime; - u32 delta_usecs; -@@ -4269,7 +4345,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, - /* - * Check whether the process is slow (see bfq_bfqq_is_slow). 
- */ -- slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); -+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, &delta); - - /* - * As above explained, charge slow (typically seeky) and -@@ -4577,6 +4653,8 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) - { - struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue; - unsigned int limit = in_serv_bfqq->inject_limit; -+ int i; -+ - /* - * If - * - bfqq is not weight-raised and therefore does not carry -@@ -4608,7 +4686,7 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) - ) - limit = 1; - -- if (bfqd->rq_in_driver >= limit) -+ if (bfqd->tot_rq_in_driver >= limit) - return NULL; - - /* -@@ -4623,11 +4701,12 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) - * (and re-added only if it gets new requests, but then it - * is assigned again enough budget for its new backlog). - */ -- list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -- if (!RB_EMPTY_ROOT(&bfqq->sort_list) && -- (in_serv_always_inject || bfqq->wr_coeff > 1) && -- bfq_serv_to_charge(bfqq->next_rq, bfqq) <= -- bfq_bfqq_budget_left(bfqq)) { -+ for (i = 0; i < bfqd->num_actuators; i++) { -+ list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list) -+ if (!RB_EMPTY_ROOT(&bfqq->sort_list) && -+ (in_serv_always_inject || bfqq->wr_coeff > 1) && -+ bfq_serv_to_charge(bfqq->next_rq, bfqq) <= -+ bfq_bfqq_budget_left(bfqq)) { - /* - * Allow for only one large in-flight request - * on non-rotational devices, for the -@@ -4647,27 +4726,80 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) - */ - if (blk_queue_nonrot(bfqd->queue) && - blk_rq_sectors(bfqq->next_rq) >= -- BFQQ_SECT_THR_NONROT) -- limit = min_t(unsigned int, 1, limit); -- else -- limit = in_serv_bfqq->inject_limit; -- -- if (bfqd->rq_in_driver < limit) { -+ BFQQ_SECT_THR_NONROT && -+ bfqd->tot_rq_in_driver >= 1) -+ continue; -+ else { - bfqd->rqs_injected = true; - return bfqq; - } - } -+ } - - return NULL; - } - -+static struct bfq_queue * -+bfq_find_active_bfqq_for_actuator(struct bfq_data *bfqd, int idx) -+{ -+ struct bfq_queue *bfqq; -+ -+ if (bfqd->in_service_queue && -+ bfqd->in_service_queue->actuator_idx == idx) -+ return bfqd->in_service_queue; -+ -+ list_for_each_entry(bfqq, &bfqd->active_list[idx], bfqq_list) { -+ if (!RB_EMPTY_ROOT(&bfqq->sort_list) && -+ bfq_serv_to_charge(bfqq->next_rq, bfqq) <= -+ bfq_bfqq_budget_left(bfqq)) { -+ return bfqq; -+ } -+ } -+ -+ return NULL; -+} -+ -+/* -+ * Perform a linear scan of each actuator, until an actuator is found -+ * for which the following three conditions hold: the load of the -+ * actuator is below the threshold (see comments on -+ * actuator_load_threshold for details) and lower than that of the -+ * next actuator (comments on this extra condition below), and there -+ * is a queue that contains I/O for that actuator. On success, return -+ * that queue. -+ * -+ * Performing a plain linear scan entails a prioritization among -+ * actuators. The extra condition above breaks this prioritization and -+ * tends to distribute injection uniformly across actuators. 
-+ */ -+static struct bfq_queue * -+bfq_find_bfqq_for_underused_actuator(struct bfq_data *bfqd) -+{ -+ int i; -+ -+ for (i = 0 ; i < bfqd->num_actuators; i++) { -+ if (bfqd->rq_in_driver[i] < bfqd->actuator_load_threshold && -+ (i == bfqd->num_actuators - 1 || -+ bfqd->rq_in_driver[i] < bfqd->rq_in_driver[i+1])) { -+ struct bfq_queue *bfqq = -+ bfq_find_active_bfqq_for_actuator(bfqd, i); -+ -+ if (bfqq) -+ return bfqq; -+ } -+ } -+ -+ return NULL; -+} -+ -+ - /* - * Select a queue for service. If we have a current queue in service, - * check whether to continue servicing it, or retrieve and set a new one. - */ - static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - { -- struct bfq_queue *bfqq; -+ struct bfq_queue *bfqq, *inject_bfqq; - struct request *next_rq; - enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT; - -@@ -4689,6 +4821,15 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - goto expire; - - check_queue: -+ /* -+ * If some actuator is underutilized, but the in-service -+ * queue does not contain I/O for that actuator, then try to -+ * inject I/O for that actuator. -+ */ -+ inject_bfqq = bfq_find_bfqq_for_underused_actuator(bfqd); -+ if (inject_bfqq && inject_bfqq != bfqq) -+ return inject_bfqq; -+ - /* - * This loop is rarely executed more than once. Even when it - * happens, it is much more convenient to re-execute this loop -@@ -4748,11 +4889,8 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - */ - if (bfq_bfqq_wait_request(bfqq) || - (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { -- struct bfq_queue *async_bfqq = -- bfqq->bic && bfqq->bic->bfqq[0] && -- bfq_bfqq_busy(bfqq->bic->bfqq[0]) && -- bfqq->bic->bfqq[0]->next_rq ? -- bfqq->bic->bfqq[0] : NULL; -+ unsigned int act_idx = bfqq->actuator_idx; -+ struct bfq_queue *async_bfqq = NULL; - struct bfq_queue *blocked_bfqq = - !hlist_empty(&bfqq->woken_list) ? - container_of(bfqq->woken_list.first, -@@ -4760,6 +4898,10 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - woken_list_node) - : NULL; - -+ if (bfqq->bic && bfqq->bic->bfqq[0][act_idx] && -+ bfq_bfqq_busy(bfqq->bic->bfqq[0][act_idx]) && -+ bfqq->bic->bfqq[0][act_idx]->next_rq) -+ async_bfqq = bfqq->bic->bfqq[0][act_idx]; - /* - * The next four mutually-exclusive ifs decide - * whether to try injection, and choose the queue to -@@ -4844,7 +4986,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic && - bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= - bfq_bfqq_budget_left(async_bfqq)) -- bfqq = bfqq->bic->bfqq[0]; -+ bfqq = async_bfqq; - else if (bfqq->waker_bfqq && - bfq_bfqq_busy(bfqq->waker_bfqq) && - bfqq->waker_bfqq->next_rq && -@@ -4975,7 +5117,7 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, - bfq_dispatch_remove(bfqd->queue, rq); - - if (bfqq != bfqd->in_service_queue) -- goto return_rq; -+ return rq; - - /* - * If weight raising has to terminate for bfqq, then next -@@ -4995,12 +5137,9 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, - * belongs to CLASS_IDLE and other queues are waiting for - * service. 
- */ -- if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq))) -- goto return_rq; -- -- bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); -+ if (bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)) -+ bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); - --return_rq: - return rq; - } - -@@ -5043,11 +5182,11 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - - /* - * We exploit the bfq_finish_requeue_request hook to -- * decrement rq_in_driver, but -+ * decrement tot_rq_in_driver, but - * bfq_finish_requeue_request will not be invoked on - * this request. So, to avoid unbalance, just start -- * this request, without incrementing rq_in_driver. As -- * a negative consequence, rq_in_driver is deceptively -+ * this request, without incrementing tot_rq_in_driver. As -+ * a negative consequence, tot_rq_in_driver is deceptively - * lower than it should be while this request is in - * service. This may cause bfq_schedule_dispatch to be - * invoked uselessly. -@@ -5056,7 +5195,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * bfq_finish_requeue_request hook, if defined, is - * probably invoked also on this request. So, by - * exploiting this hook, we could 1) increment -- * rq_in_driver here, and 2) decrement it in -+ * tot_rq_in_driver here, and 2) decrement it in - * bfq_finish_requeue_request. Such a solution would - * let the value of the counter be always accurate, - * but it would entail using an extra interface -@@ -5085,7 +5224,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * Of course, serving one request at a time may cause loss of - * throughput. - */ -- if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) -+ if (bfqd->strict_guarantees && bfqd->tot_rq_in_driver > 0) - goto exit; - - bfqq = bfq_select_queue(bfqd); -@@ -5096,7 +5235,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - - if (rq) { - inc_in_driver_start_rq: -- bfqd->rq_in_driver++; -+ bfqd->rq_in_driver[bfqq->actuator_idx]++; -+ bfqd->tot_rq_in_driver++; - start_rq: - rq->rq_flags |= RQF_STARTED; - } -@@ -5283,8 +5423,6 @@ void bfq_put_cooperator(struct bfq_queue *bfqq) - */ - __bfqq = bfqq->new_bfqq; - while (__bfqq) { -- if (__bfqq == bfqq) -- break; - next = __bfqq->new_bfqq; - bfq_put_queue(__bfqq); - __bfqq = next; -@@ -5305,48 +5443,55 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_release_process_ref(bfqd, bfqq); - } - --static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) -+static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, -+ unsigned int actuator_idx) - { -- struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); -+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, actuator_idx); - struct bfq_data *bfqd; - - if (bfqq) - bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ - - if (bfqq && bfqd) { -- unsigned long flags; -- -- spin_lock_irqsave(&bfqd->lock, flags); -- bic_set_bfqq(bic, NULL, is_sync); -+ bic_set_bfqq(bic, NULL, is_sync, actuator_idx); - bfq_exit_bfqq(bfqd, bfqq); -- spin_unlock_irqrestore(&bfqd->lock, flags); - } - } - - static void bfq_exit_icq(struct io_cq *icq) - { - struct bfq_io_cq *bic = icq_to_bic(icq); -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ unsigned long flags; -+ unsigned int act_idx; -+ /* -+ * If bfqd and thus bfqd->num_actuators is not available any -+ * longer, then cycle over all possible per-actuator bfqqs in -+ * next loop. 
We rely on bic being zeroed on creation, and -+ * therefore on its unused per-actuator fields being NULL. -+ */ -+ unsigned int num_actuators = BFQ_MAX_ACTUATORS; -+ struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data; - -- if (bic->stable_merge_bfqq) { -- struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd; -+ /* -+ * bfqd is NULL if scheduler already exited, and in that case -+ * this is the last time these queues are accessed. -+ */ -+ if (bfqd) { -+ spin_lock_irqsave(&bfqd->lock, flags); -+ num_actuators = bfqd->num_actuators; -+ } - -- /* -- * bfqd is NULL if scheduler already exited, and in -- * that case this is the last time bfqq is accessed. -- */ -- if (bfqd) { -- unsigned long flags; -+ for (act_idx = 0; act_idx < num_actuators; act_idx++) { -+ if (bfqq_data[act_idx].stable_merge_bfqq) -+ bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq); - -- spin_lock_irqsave(&bfqd->lock, flags); -- bfq_put_stable_ref(bic->stable_merge_bfqq); -- spin_unlock_irqrestore(&bfqd->lock, flags); -- } else { -- bfq_put_stable_ref(bic->stable_merge_bfqq); -- } -+ bfq_exit_icq_bfqq(bic, true, act_idx); -+ bfq_exit_icq_bfqq(bic, false, act_idx); - } - -- bfq_exit_icq_bfqq(bic, true); -- bfq_exit_icq_bfqq(bic, false); -+ if (bfqd) -+ spin_unlock_irqrestore(&bfqd->lock, flags); - } - - /* -@@ -5423,25 +5568,27 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) - - bic->ioprio = ioprio; - -- bfqq = bic_to_bfqq(bic, false); -+ bfqq = bic_to_bfqq(bic, false, bfq_actuator_index(bfqd, bio)); - if (bfqq) { - struct bfq_queue *old_bfqq = bfqq; - - bfqq = bfq_get_queue(bfqd, bio, false, bic, true); -- bic_set_bfqq(bic, bfqq, false); -+ bic_set_bfqq(bic, bfqq, false, bfq_actuator_index(bfqd, bio)); - bfq_release_process_ref(bfqd, old_bfqq); - } - -- bfqq = bic_to_bfqq(bic, true); -+ bfqq = bic_to_bfqq(bic, true, bfq_actuator_index(bfqd, bio)); - if (bfqq) - bfq_set_next_ioprio_data(bfqq, bic); - } - - static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- struct bfq_io_cq *bic, pid_t pid, int is_sync) -+ struct bfq_io_cq *bic, pid_t pid, int is_sync, -+ unsigned int act_idx) - { - u64 now_ns = ktime_get_ns(); - -+ bfqq->actuator_idx = act_idx; - RB_CLEAR_NODE(&bfqq->entity.rb_node); - INIT_LIST_HEAD(&bfqq->fifo); - INIT_HLIST_NODE(&bfqq->burst_list_node); -@@ -5501,22 +5648,24 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - - /* first request is almost certainly seeky */ - bfqq->seek_history = 1; -+ -+ bfqq->decrease_time_jif = jiffies; - } - - static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, - struct bfq_group *bfqg, -- int ioprio_class, int ioprio) -+ int ioprio_class, int ioprio, int act_idx) - { - switch (ioprio_class) { - case IOPRIO_CLASS_RT: -- return &bfqg->async_bfqq[0][ioprio]; -+ return &bfqg->async_bfqq[0][ioprio][act_idx]; - case IOPRIO_CLASS_NONE: - ioprio = IOPRIO_BE_NORM; - fallthrough; - case IOPRIO_CLASS_BE: -- return &bfqg->async_bfqq[1][ioprio]; -+ return &bfqg->async_bfqq[1][ioprio][act_idx]; - case IOPRIO_CLASS_IDLE: -- return &bfqg->async_idle_bfqq; -+ return &bfqg->async_idle_bfqq[act_idx]; - default: - return NULL; - } -@@ -5527,6 +5676,7 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_io_cq *bic, - struct bfq_queue *last_bfqq_created) - { -+ unsigned int a_idx = last_bfqq_created->actuator_idx; - struct bfq_queue *new_bfqq = - bfq_setup_merge(bfqq, last_bfqq_created); - -@@ -5534,8 +5684,8 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, 
struct bfq_queue *bfqq, - return bfqq; - - if (new_bfqq->bic) -- new_bfqq->bic->stably_merged = true; -- bic->stably_merged = true; -+ new_bfqq->bic->bfqq_data[a_idx].stably_merged = true; -+ bic->bfqq_data[a_idx].stably_merged = true; - - /* - * Reusing merge functions. This implies that -@@ -5610,9 +5760,13 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, - * it has been set already, but too long ago, then move it - * forward to bfqq. Finally, move also if bfqq belongs to a - * different group than last_bfqq_created, or if bfqq has a -- * different ioprio or ioprio_class. If none of these -- * conditions holds true, then try an early stable merge or -- * schedule a delayed stable merge. -+ * different ioprio, ioprio_class or actuator_idx. If none of -+ * these conditions holds true, then try an early stable merge -+ * or schedule a delayed stable merge. As for the condition on -+ * actuator_idx, the reason is that, if queues associated with -+ * different actuators are merged, then control is lost on -+ * each actuator. Therefore some actuator may be -+ * underutilized, and throughput may decrease. - * - * A delayed merge is scheduled (instead of performing an - * early merge), in case bfqq might soon prove to be more -@@ -5630,7 +5784,8 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, - bfqq->creation_time) || - bfqq->entity.parent != last_bfqq_created->entity.parent || - bfqq->ioprio != last_bfqq_created->ioprio || -- bfqq->ioprio_class != last_bfqq_created->ioprio_class) -+ bfqq->ioprio_class != last_bfqq_created->ioprio_class || -+ bfqq->actuator_idx != last_bfqq_created->actuator_idx) - *source_bfqq = bfqq; - else if (time_after_eq(last_bfqq_created->creation_time + - bfqd->bfq_burst_interval, -@@ -5660,7 +5815,8 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, - /* - * Record the bfqq to merge to. - */ -- bic->stable_merge_bfqq = last_bfqq_created; -+ bic->bfqq_data[last_bfqq_created->actuator_idx].stable_merge_bfqq = -+ last_bfqq_created; - } - } - -@@ -5682,7 +5838,8 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - bfqg = bfq_bio_bfqg(bfqd, bio); - if (!is_sync) { - async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, -- ioprio); -+ ioprio, -+ bfq_actuator_index(bfqd, bio)); - bfqq = *async_bfqq; - if (bfqq) - goto out; -@@ -5694,7 +5851,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - - if (bfqq) { - bfq_init_bfqq(bfqd, bfqq, bic, current->pid, -- is_sync); -+ is_sync, bfq_actuator_index(bfqd, bio)); - bfq_init_entity(&bfqq->entity, bfqg); - bfq_log_bfqq(bfqd, bfqq, "allocated"); - } else { -@@ -6009,7 +6166,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - * then complete the merge and redirect it to - * new_bfqq. - */ -- if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -+ if (bic_to_bfqq(RQ_BIC(rq), true, -+ bfq_actuator_index(bfqd, rq->bio)) == bfqq) - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), - bfqq, new_bfqq); - -@@ -6147,7 +6305,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) - struct bfq_queue *bfqq = bfqd->in_service_queue; - - bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, -- bfqd->rq_in_driver); -+ bfqd->tot_rq_in_driver); - - if (bfqd->hw_tag == 1) - return; -@@ -6158,7 +6316,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) - * sum is not exact, as it's not taking into account deactivated - * requests. 
- */ -- if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) -+ if (bfqd->tot_rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) - return; - - /* -@@ -6169,7 +6327,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) - if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && - bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < - BFQ_HW_QUEUE_THRESHOLD && -- bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) -+ bfqd->tot_rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) - return; - - if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) -@@ -6190,7 +6348,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - - bfq_update_hw_tag(bfqd); - -- bfqd->rq_in_driver--; -+ bfqd->rq_in_driver[bfqq->actuator_idx]--; -+ bfqd->tot_rq_in_driver--; - bfqq->dispatched--; - - if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { -@@ -6310,7 +6469,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - BFQQE_NO_MORE_REQUESTS); - } - -- if (!bfqd->rq_in_driver) -+ if (!bfqd->tot_rq_in_driver) - bfq_schedule_dispatch(bfqd); - } - -@@ -6441,13 +6600,13 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, - * conditions to do it, or we can lower the last base value - * computed. - * -- * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O -+ * NOTE: (bfqd->tot_rq_in_driver == 1) means that there is no I/O - * request in flight, because this function is in the code - * path that handles the completion of a request of bfqq, and, - * in particular, this function is executed before -- * bfqd->rq_in_driver is decremented in such a code path. -+ * bfqd->tot_rq_in_driver is decremented in such a code path. - */ -- if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) || -+ if ((bfqq->last_serv_time_ns == 0 && bfqd->tot_rq_in_driver == 1) || - tot_time_ns < bfqq->last_serv_time_ns) { - if (bfqq->last_serv_time_ns == 0) { - /* -@@ -6457,7 +6616,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, - bfqq->inject_limit = max_t(unsigned int, 1, old_limit); - } - bfqq->last_serv_time_ns = tot_time_ns; -- } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1) -+ } else if (!bfqd->rqs_injected && bfqd->tot_rq_in_driver == 1) - /* - * No I/O injected and no request still in service in - * the drive: these are the exact conditions for -@@ -6564,7 +6723,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) - return bfqq; - } - -- bic_set_bfqq(bic, NULL, true); -+ bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx); - - bfq_put_cooperator(bfqq); - -@@ -6578,7 +6737,9 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, - bool split, bool is_sync, - bool *new_queue) - { -- struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); -+ unsigned int act_idx = bfq_actuator_index(bfqd, bio); -+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx); -+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[act_idx]; - - if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) - return bfqq; -@@ -6590,14 +6751,14 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, - bfq_put_queue(bfqq); - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split); - -- bic_set_bfqq(bic, bfqq, is_sync); -+ bic_set_bfqq(bic, bfqq, is_sync, act_idx); - if (split && is_sync) { -- if ((bic->was_in_burst_list && bfqd->large_burst) || -- bic->saved_in_large_burst) -+ if ((bfqq_data->was_in_burst_list && bfqd->large_burst) || -+ bfqq_data->saved_in_large_burst) - bfq_mark_bfqq_in_large_burst(bfqq); - else { - 
bfq_clear_bfqq_in_large_burst(bfqq); -- if (bic->was_in_burst_list) -+ if (bfqq_data->was_in_burst_list) - /* - * If bfqq was in the current - * burst list before being -@@ -6686,19 +6847,20 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) - struct bfq_queue *bfqq; - bool new_queue = false; - bool bfqq_already_existing = false, split = false; -+ unsigned int a_idx = bfq_actuator_index(bfqd, bio); - - if (unlikely(!rq->elv.icq)) - return NULL; - - /* -- * Assuming that elv.priv[1] is set only if everything is set -+ * Assuming that RQ_BFQQ(rq) is set only if everything is set - * for this rq. This holds true, because this function is - * invoked only for insertion or merging, and, after such - * events, a request cannot be manipulated any longer before - * being removed from bfq. - */ -- if (rq->elv.priv[1]) -- return rq->elv.priv[1]; -+ if (RQ_BFQQ(rq)) -+ return RQ_BFQQ(rq); - - bic = icq_to_bic(rq->elv.icq); - -@@ -6712,12 +6874,13 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) - if (likely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) && -- !bic->stably_merged) { -+ !bic->bfqq_data[a_idx].stably_merged) { - struct bfq_queue *old_bfqq = bfqq; - - /* Update bic before losing reference to bfqq */ - if (bfq_bfqq_in_large_burst(bfqq)) -- bic->saved_in_large_burst = true; -+ bic->bfqq_data[a_idx].saved_in_large_burst = -+ true; - - bfqq = bfq_split_bfqq(bic, bfqq); - split = true; -@@ -6900,13 +7063,15 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, - */ - void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) - { -- int i, j; -+ int i, j, k; - -- for (i = 0; i < 2; i++) -- for (j = 0; j < IOPRIO_NR_LEVELS; j++) -- __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); -+ for (k = 0; k < bfqd->num_actuators; k++) { -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_NR_LEVELS; j++) -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j][k]); - -- __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq[k]); -+ } - } - - /* -@@ -7018,6 +7183,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - { - struct bfq_data *bfqd; - struct elevator_queue *eq; -+ unsigned int i; -+ struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges; - - eq = elevator_alloc(q, e); - if (!eq) -@@ -7038,8 +7205,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. - * Grab a permanent reference to it, so that the normal code flow - * will not attempt to free it. -+ * Set zero as actuator index: we will pretend that -+ * all I/O requests are for the same actuator. - */ -- bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0, 0); - bfqd->oom_bfqq.ref++; - bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; - bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; -@@ -7058,6 +7227,39 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - - bfqd->queue = q; - -+ bfqd->num_actuators = 1; -+ /* -+ * If the disk supports multiple actuators, copy independent -+ * access ranges from the request queue structure. -+ */ -+ spin_lock_irq(&q->queue_lock); -+ if (ia_ranges) { -+ /* -+ * Check if the disk ia_ranges size exceeds the current bfq -+ * actuator limit. 
-+ */ -+ if (ia_ranges->nr_ia_ranges > BFQ_MAX_ACTUATORS) { -+ pr_crit("nr_ia_ranges higher than act limit: iars=%d, max=%d.\n", -+ ia_ranges->nr_ia_ranges, BFQ_MAX_ACTUATORS); -+ pr_crit("Falling back to single actuator mode.\n"); -+ } else { -+ bfqd->num_actuators = ia_ranges->nr_ia_ranges; -+ -+ for (i = 0; i < bfqd->num_actuators; i++) { -+ bfqd->sector[i] = ia_ranges->ia_range[i].sector; -+ bfqd->nr_sectors[i] = -+ ia_ranges->ia_range[i].nr_sectors; -+ } -+ } -+ } -+ -+ /* Otherwise use single-actuator dev info */ -+ if (bfqd->num_actuators == 1) { -+ bfqd->sector[0] = 0; -+ bfqd->nr_sectors[0] = get_capacity(q->disk); -+ } -+ spin_unlock_irq(&q->queue_lock); -+ - INIT_LIST_HEAD(&bfqd->dispatch); - - hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, -@@ -7069,7 +7271,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->num_groups_with_pending_reqs = 0; - #endif - -- INIT_LIST_HEAD(&bfqd->active_list); -+ INIT_LIST_HEAD(&bfqd->active_list[0]); -+ INIT_LIST_HEAD(&bfqd->active_list[1]); - INIT_LIST_HEAD(&bfqd->idle_list); - INIT_HLIST_HEAD(&bfqd->burst_list); - -@@ -7095,7 +7298,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - */ - bfqd->bfq_wr_coeff = 30; - bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); -- bfqd->bfq_wr_max_time = 0; - bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); - bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); - bfqd->bfq_wr_max_softrt_rate = 7000; /* -@@ -7114,6 +7316,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; - bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; - -+ /* see comments on the definition of next field inside bfq_data */ -+ bfqd->actuator_load_threshold = 4; -+ - spin_lock_init(&bfqd->lock); - - /* -@@ -7412,6 +7617,7 @@ MODULE_ALIAS("bfq-iosched"); +@@ -7617,6 +7617,7 @@ MODULE_ALIAS("bfq-iosched"); static int __init bfq_init(void) { int ret; -+ char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.2"; ++ char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.3"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); -@@ -7443,6 +7649,11 @@ static int __init bfq_init(void) +@@ -7648,6 +7649,11 @@ static int __init bfq_init(void) if (ret) goto slab_kill; @@ -4737,2196 +3317,18 @@ index 380e9bda2e57..aa644973d260 100644 return 0; slab_kill: -diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h -index 466e4865ace6..75cc6a324267 100644 ---- a/block/bfq-iosched.h -+++ b/block/bfq-iosched.h -@@ -33,6 +33,14 @@ - */ - #define BFQ_SOFTRT_WEIGHT_FACTOR 100 - -+/* -+ * Maximum number of actuators supported. This constant is used simply -+ * to define the size of the static array that will contain -+ * per-actuator data. The current value is hopefully a good upper -+ * bound to the possible number of actuators of any actual drive. -+ */ -+#define BFQ_MAX_ACTUATORS 8 -+ - struct bfq_entity; - - /** -@@ -227,12 +235,14 @@ struct bfq_ttime { - * struct bfq_queue - leaf schedulable entity. - * - * A bfq_queue is a leaf request queue; it can be associated with an -- * io_context or more, if it is async or shared between cooperating -- * processes. @cgroup holds a reference to the cgroup, to be sure that it -- * does not disappear while a bfqq still references it (mostly to avoid -- * races between request issuing and task migration followed by cgroup -- * destruction). -- * All the fields are protected by the queue lock of the containing bfqd. 
-+ * io_context or more, if it is async or shared between cooperating -+ * processes. Besides, it contains I/O requests for only one actuator -+ * (an io_context is associated with a different bfq_queue for each -+ * actuator it generates I/O for). @cgroup holds a reference to the -+ * cgroup, to be sure that it does not disappear while a bfqq still -+ * references it (mostly to avoid races between request issuing and -+ * task migration followed by cgroup destruction). All the fields are -+ * protected by the queue lock of the containing bfqd. - */ - struct bfq_queue { - /* reference counter */ -@@ -397,24 +407,18 @@ struct bfq_queue { - * the woken queues when this queue exits. - */ - struct hlist_head woken_list; -+ -+ /* index of the actuator this queue is associated with */ -+ unsigned int actuator_idx; - }; - - /** -- * struct bfq_io_cq - per (request_queue, io_context) structure. -- */ --struct bfq_io_cq { -- /* associated io_cq structure */ -- struct io_cq icq; /* must be the first member */ -- /* array of two process queues, the sync and the async */ -- struct bfq_queue *bfqq[2]; -- /* per (request_queue, blkcg) ioprio */ -- int ioprio; --#ifdef CONFIG_BFQ_GROUP_IOSCHED -- uint64_t blkcg_serial_nr; /* the current blkcg serial */ --#endif -+* struct bfq_data - bfqq data unique and persistent for associated bfq_io_cq -+*/ -+struct bfq_iocq_bfqq_data { - /* - * Snapshot of the has_short_time flag before merging; taken -- * to remember its value while the queue is merged, so as to -+ * to remember its values while the queue is merged, so as to - * be able to restore it in case of split. - */ - bool saved_has_short_ttime; -@@ -428,7 +432,7 @@ struct bfq_io_cq { - u64 saved_tot_idle_time; - - /* -- * Same purpose as the previous fields for the value of the -+ * Same purpose as the previous fields for the values of the - * field keeping the queue's belonging to a large burst - */ - bool saved_in_large_burst; -@@ -466,6 +470,38 @@ struct bfq_io_cq { - struct bfq_queue *stable_merge_bfqq; - - bool stably_merged; /* non splittable if true */ -+}; -+ -+/** -+ * struct bfq_io_cq - per (request_queue, io_context) structure. -+ */ -+struct bfq_io_cq { -+ /* associated io_cq structure */ -+ struct io_cq icq; /* must be the first member */ -+ /* -+ * Matrix of associated process queues: first row for async -+ * queues, second row sync queues. Each row contains one -+ * column for each actuator. An I/O request generated by the -+ * process is inserted into the queue pointed by bfqq[i][j] if -+ * the request is to be served by the j-th actuator of the -+ * drive, where i==0 or i==1, depending on whether the request -+ * is async or sync. So there is a distinct queue for each -+ * actuator. -+ */ -+ struct bfq_queue *bfqq[2][BFQ_MAX_ACTUATORS]; -+ /* per (request_queue, blkcg) ioprio */ -+ int ioprio; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ uint64_t blkcg_serial_nr; /* the current blkcg serial */ -+#endif -+ -+ /* -+ * Persistent data for associated synchronous process queues -+ * (one queue per actuator, see field bfqq above). In -+ * particular, each of these queues may undergo a merge. 
-+ */ -+ struct bfq_iocq_bfqq_data bfqq_data[BFQ_MAX_ACTUATORS]; -+ - unsigned int requests; /* Number of requests this process has in flight */ - }; - -@@ -554,7 +590,12 @@ struct bfq_data { - /* number of queued requests */ - int queued; - /* number of requests dispatched and waiting for completion */ -- int rq_in_driver; -+ int tot_rq_in_driver; -+ /* -+ * number of requests dispatched and waiting for completion -+ * for each actuator -+ */ -+ int rq_in_driver[BFQ_MAX_ACTUATORS]; - - /* true if the device is non rotational and performs queueing */ - bool nonrot_with_queueing; -@@ -648,8 +689,13 @@ struct bfq_data { - /* maximum budget allotted to a bfq_queue before rescheduling */ - int bfq_max_budget; - -- /* list of all the bfq_queues active on the device */ -- struct list_head active_list; -+ /* -+ * List of all the bfq_queues active for a specific actuator -+ * on the device. Keeping active queues separate on a -+ * per-actuator basis helps implementing per-actuator -+ * injection more efficiently. -+ */ -+ struct list_head active_list[BFQ_MAX_ACTUATORS]; - /* list of all the bfq_queues idle on the device */ - struct list_head idle_list; - -@@ -723,8 +769,6 @@ struct bfq_data { - * is multiplied. - */ - unsigned int bfq_wr_coeff; -- /* maximum duration of a weight-raising period (jiffies) */ -- unsigned int bfq_wr_max_time; - - /* Maximum weight-raising duration for soft real-time processes */ - unsigned int bfq_wr_rt_max_time; -@@ -772,6 +816,42 @@ struct bfq_data { - */ - unsigned int word_depths[2][2]; - unsigned int full_depth_shift; -+ -+ /* -+ * Number of independent actuators. This is equal to 1 in -+ * case of single-actuator drives. -+ */ -+ unsigned int num_actuators; -+ /* -+ * Disk independent access ranges for each actuator -+ * in this device. -+ */ -+ sector_t sector[BFQ_MAX_ACTUATORS]; -+ sector_t nr_sectors[BFQ_MAX_ACTUATORS]; -+ struct blk_independent_access_range ia_ranges[BFQ_MAX_ACTUATORS]; -+ -+ /* -+ * If the number of I/O requests queued in the device for a -+ * given actuator is below next threshold, then the actuator -+ * is deemed as underutilized. If this condition is found to -+ * hold for some actuator upon a dispatch, but (i) the -+ * in-service queue does not contain I/O for that actuator, -+ * while (ii) some other queue does contain I/O for that -+ * actuator, then the head I/O request of the latter queue is -+ * returned (injected), instead of the head request of the -+ * currently in-service queue. -+ * -+ * We set the threshold, empirically, to the minimum possible -+ * value for which an actuator is fully utilized, or close to -+ * be fully utilized. By doing so, injected I/O 'steals' as -+ * few drive-queue slots as possibile to the in-service -+ * queue. This reduces as much as possible the probability -+ * that the service of I/O from the in-service bfq_queue gets -+ * delayed because of slot exhaustion, i.e., because all the -+ * slots of the drive queue are filled with I/O injected from -+ * other queues (NCQ provides for 32 slots). 
-+ */ -+ unsigned int actuator_load_threshold; - }; - - enum bfqq_state_flags { -@@ -937,8 +1017,8 @@ struct bfq_group { - - struct bfq_data *bfqd; - -- struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; -- struct bfq_queue *async_idle_bfqq; -+ struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; -+ struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; - - struct bfq_entity *my_entity; - -@@ -955,8 +1035,8 @@ struct bfq_group { - struct bfq_entity entity; - struct bfq_sched_data sched_data; - -- struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; -- struct bfq_queue *async_idle_bfqq; -+ struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; -+ struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; - - struct rb_root rq_pos_tree; - }; -@@ -969,8 +1049,10 @@ struct bfq_group { - - extern const int bfq_timeout; - --struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync); --void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync); -+struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, -+ unsigned int actuator_idx); -+void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync, -+ unsigned int actuator_idx); - struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); - void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); - void bfq_weights_tree_add(struct bfq_queue *bfqq); -diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c -index ea4c3d757fdd..7941b6f07391 100644 ---- a/block/bfq-wf2q.c -+++ b/block/bfq-wf2q.c -@@ -493,7 +493,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, - bfq_update_active_tree(node); - - if (bfqq) -- list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list[bfqq->actuator_idx]); - - bfq_inc_active_entities(entity); - } -diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c -index 9ac1efb053e0..4272599a3f08 100644 ---- a/block/blk-cgroup.c -+++ b/block/blk-cgroup.c -@@ -33,7 +33,6 @@ - #include "blk-cgroup.h" - #include "blk-ioprio.h" - #include "blk-throttle.h" --#include "blk-rq-qos.h" - - /* - * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. -@@ -626,69 +625,93 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) - EXPORT_SYMBOL_GPL(__blkg_prfill_u64); - - /** -- * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update -- * @inputp: input string pointer -+ * blkg_conf_init - initialize a blkg_conf_ctx -+ * @ctx: blkg_conf_ctx to initialize -+ * @input: input string -+ * -+ * Initialize @ctx which can be used to parse blkg config input string @input. -+ * Once initialized, @ctx can be used with blkg_conf_open_bdev() and -+ * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit(). -+ */ -+void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input) -+{ -+ *ctx = (struct blkg_conf_ctx){ .input = input }; -+} -+EXPORT_SYMBOL_GPL(blkg_conf_init); -+ -+/** -+ * blkg_conf_open_bdev - parse and open bdev for per-blkg config update -+ * @ctx: blkg_conf_ctx initialized with blkg_conf_init() - * -- * Parse the device node prefix part, MAJ:MIN, of per-blkg config update -- * from @input and get and return the matching bdev. *@inputp is -- * updated to point past the device node prefix. Returns an ERR_PTR() -- * value on error. -+ * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from -+ * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is -+ * set to point past the device node prefix. 
- * -- * Use this function iff blkg_conf_prep() can't be used for some reason. -+ * This function may be called multiple times on @ctx and the extra calls become -+ * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function -+ * explicitly if bdev access is needed without resolving the blkcg / policy part -+ * of @ctx->input. Returns -errno on error. - */ --struct block_device *blkcg_conf_open_bdev(char **inputp) -+int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) - { -- char *input = *inputp; -+ char *input = ctx->input; - unsigned int major, minor; - struct block_device *bdev; - int key_len; - -+ if (ctx->bdev) -+ return 0; -+ - if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) -- return ERR_PTR(-EINVAL); -+ return -EINVAL; - - input += key_len; - if (!isspace(*input)) -- return ERR_PTR(-EINVAL); -+ return -EINVAL; - input = skip_spaces(input); - - bdev = blkdev_get_no_open(MKDEV(major, minor)); - if (!bdev) -- return ERR_PTR(-ENODEV); -+ return -ENODEV; - if (bdev_is_partition(bdev)) { - blkdev_put_no_open(bdev); -- return ERR_PTR(-ENODEV); -+ return -ENODEV; - } - -- *inputp = input; -- return bdev; -+ ctx->body = input; -+ ctx->bdev = bdev; -+ return 0; - } - - /** - * blkg_conf_prep - parse and prepare for per-blkg config update - * @blkcg: target block cgroup - * @pol: target policy -- * @input: input string -- * @ctx: blkg_conf_ctx to be filled -+ * @ctx: blkg_conf_ctx initialized with blkg_conf_init() -+ * -+ * Parse per-blkg config update from @ctx->input and initialize @ctx -+ * accordingly. On success, @ctx->body points to the part of @ctx->input -+ * following MAJ:MIN, @ctx->bdev points to the target block device and -+ * @ctx->blkg to the blkg being configured. - * -- * Parse per-blkg config update from @input and initialize @ctx with the -- * result. @ctx->blkg points to the blkg to be updated and @ctx->body the -- * part of @input following MAJ:MIN. This function returns with RCU read -- * lock and queue lock held and must be paired with blkg_conf_finish(). -+ * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this -+ * function returns with queue lock held and must be followed by -+ * blkg_conf_exit(). - */ - int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, -- char *input, struct blkg_conf_ctx *ctx) -- __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock) -+ struct blkg_conf_ctx *ctx) -+ __acquires(&bdev->bd_queue->queue_lock) - { -- struct block_device *bdev; - struct gendisk *disk; - struct request_queue *q; - struct blkcg_gq *blkg; - int ret; - -- bdev = blkcg_conf_open_bdev(&input); -- if (IS_ERR(bdev)) -- return PTR_ERR(bdev); -- disk = bdev->bd_disk; -+ ret = blkg_conf_open_bdev(ctx); -+ if (ret) -+ return ret; -+ -+ disk = ctx->bdev->bd_disk; - q = disk->queue; - - /* -@@ -699,7 +722,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - if (ret) - goto fail; - -- rcu_read_lock(); - spin_lock_irq(&q->queue_lock); - - if (!blkcg_policy_enabled(q, pol)) { -@@ -728,7 +750,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - - /* Drop locks to do new blkg allocation with GFP_KERNEL. 
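
/*
 * Illustrative sketch (not kernel code): blkg_conf_open_bdev() above
 * strips a "MAJ:MIN" prefix with sscanf("%u:%u%n"), requires a space,
 * and leaves the body pointer at the policy-specific rest of the line.
 * parse_blkg_prefix() below is a made-up name mirroring that logic in
 * plain user-space C.
 */
#include <ctype.h>
#include <stdio.h>

static int parse_blkg_prefix(const char *input, unsigned int *major,
                             unsigned int *minor, const char **body)
{
    int key_len;

    if (sscanf(input, "%u:%u%n", major, minor, &key_len) != 2)
        return -1;                       /* no MAJ:MIN prefix */
    input += key_len;
    if (!isspace((unsigned char)*input))
        return -1;                       /* prefix must be followed by a space */
    while (isspace((unsigned char)*input))
        input++;
    *body = input;                       /* policy-specific part of the line */
    return 0;
}

int main(void)
{
    unsigned int major, minor;
    const char *body;

    if (!parse_blkg_prefix("8:16 rbps=1048576", &major, &minor, &body))
        printf("dev %u:%u, body: \"%s\"\n", major, minor, body);
    return 0;
}
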
*/ - spin_unlock_irq(&q->queue_lock); -- rcu_read_unlock(); - - new_blkg = blkg_alloc(pos, disk, GFP_KERNEL); - if (unlikely(!new_blkg)) { -@@ -742,7 +763,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - goto fail_exit_queue; - } - -- rcu_read_lock(); - spin_lock_irq(&q->queue_lock); - - if (!blkcg_policy_enabled(q, pol)) { -@@ -769,20 +789,16 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - } - success: - blk_queue_exit(q); -- ctx->bdev = bdev; - ctx->blkg = blkg; -- ctx->body = input; - return 0; - - fail_preloaded: - radix_tree_preload_end(); - fail_unlock: - spin_unlock_irq(&q->queue_lock); -- rcu_read_unlock(); - fail_exit_queue: - blk_queue_exit(q); - fail: -- blkdev_put_no_open(bdev); - /* - * If queue was bypassing, we should retry. Do so after a - * short msleep(). It isn't strictly necessary but queue -@@ -798,20 +814,27 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - EXPORT_SYMBOL_GPL(blkg_conf_prep); - - /** -- * blkg_conf_finish - finish up per-blkg config update -- * @ctx: blkg_conf_ctx initialized by blkg_conf_prep() -+ * blkg_conf_exit - clean up per-blkg config update -+ * @ctx: blkg_conf_ctx initialized with blkg_conf_init() - * -- * Finish up after per-blkg config update. This function must be paired -- * with blkg_conf_prep(). -+ * Clean up after per-blkg config update. This function must be called on all -+ * blkg_conf_ctx's initialized with blkg_conf_init(). - */ --void blkg_conf_finish(struct blkg_conf_ctx *ctx) -- __releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu) -+void blkg_conf_exit(struct blkg_conf_ctx *ctx) -+ __releases(&ctx->bdev->bd_queue->queue_lock) - { -- spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); -- rcu_read_unlock(); -- blkdev_put_no_open(ctx->bdev); -+ if (ctx->blkg) { -+ spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); -+ ctx->blkg = NULL; -+ } -+ -+ if (ctx->bdev) { -+ blkdev_put_no_open(ctx->bdev); -+ ctx->body = NULL; -+ ctx->bdev = NULL; -+ } - } --EXPORT_SYMBOL_GPL(blkg_conf_finish); -+EXPORT_SYMBOL_GPL(blkg_conf_exit); - - static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) - { -@@ -1300,14 +1323,8 @@ int blkcg_init_disk(struct gendisk *disk) - if (ret) - goto err_ioprio_exit; - -- ret = blk_iolatency_init(disk); -- if (ret) -- goto err_throtl_exit; -- - return 0; - --err_throtl_exit: -- blk_throtl_exit(disk); - err_ioprio_exit: - blk_ioprio_exit(disk); - err_destroy_all: -@@ -1323,7 +1340,6 @@ int blkcg_init_disk(struct gendisk *disk) - void blkcg_exit_disk(struct gendisk *disk) - { - blkg_destroy_all(disk); -- rq_qos_exit(disk->queue); - blk_throtl_exit(disk); - } - -diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h -index 1e94e404eaa8..fe09e8b4c2a8 100644 ---- a/block/blk-cgroup.h -+++ b/block/blk-cgroup.h -@@ -208,15 +208,17 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, - u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); - - struct blkg_conf_ctx { -+ char *input; -+ char *body; - struct block_device *bdev; - struct blkcg_gq *blkg; -- char *body; - }; - --struct block_device *blkcg_conf_open_bdev(char **inputp); -+void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); -+int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); - int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, -- char *input, struct blkg_conf_ctx *ctx); --void blkg_conf_finish(struct blkg_conf_ctx *ctx); -+ struct blkg_conf_ctx *ctx); -+void 
blkg_conf_exit(struct blkg_conf_ctx *ctx); - - /** - * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg -diff --git a/block/blk-iocost.c b/block/blk-iocost.c -index ec7219caea16..c31d57e29bf8 100644 ---- a/block/blk-iocost.c -+++ b/block/blk-iocost.c -@@ -3096,9 +3096,11 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, - return nbytes; - } - -- ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx); -+ blkg_conf_init(&ctx, buf); -+ -+ ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx); - if (ret) -- return ret; -+ goto err; - - iocg = blkg_to_iocg(ctx.blkg); - -@@ -3117,12 +3119,14 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, - weight_updated(iocg, &now); - spin_unlock(&iocg->ioc->lock); - -- blkg_conf_finish(&ctx); -+ blkg_conf_exit(&ctx); - return nbytes; - - einval: -- blkg_conf_finish(&ctx); -- return -EINVAL; -+ ret = -EINVAL; -+err: -+ blkg_conf_exit(&ctx); -+ return ret; - } - - static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, -@@ -3177,19 +3181,22 @@ static const match_table_t qos_tokens = { - static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, - size_t nbytes, loff_t off) - { -- struct block_device *bdev; -+ struct blkg_conf_ctx ctx; - struct gendisk *disk; - struct ioc *ioc; - u32 qos[NR_QOS_PARAMS]; - bool enable, user; -- char *p; -+ char *body, *p; - int ret; - -- bdev = blkcg_conf_open_bdev(&input); -- if (IS_ERR(bdev)) -- return PTR_ERR(bdev); -+ blkg_conf_init(&ctx, input); - -- disk = bdev->bd_disk; -+ ret = blkg_conf_open_bdev(&ctx); -+ if (ret) -+ goto err; -+ -+ body = ctx.body; -+ disk = ctx.bdev->bd_disk; - ioc = q_to_ioc(disk->queue); - if (!ioc) { - ret = blk_iocost_init(disk); -@@ -3206,7 +3213,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, - enable = ioc->enabled; - user = ioc->user_qos_params; - -- while ((p = strsep(&input, " \t\n"))) { -+ while ((p = strsep(&body, " \t\n"))) { - substring_t args[MAX_OPT_ARGS]; - char buf[32]; - int tok; -@@ -3295,7 +3302,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, - blk_mq_unquiesce_queue(disk->queue); - blk_mq_unfreeze_queue(disk->queue); - -- blkdev_put_no_open(bdev); -+ blkg_conf_exit(&ctx); - return nbytes; - einval: - spin_unlock_irq(&ioc->lock); -@@ -3305,7 +3312,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, - - ret = -EINVAL; - err: -- blkdev_put_no_open(bdev); -+ blkg_conf_exit(&ctx); - return ret; - } - -@@ -3356,22 +3363,25 @@ static const match_table_t i_lcoef_tokens = { - static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, - size_t nbytes, loff_t off) - { -- struct block_device *bdev; -+ struct blkg_conf_ctx ctx; - struct request_queue *q; - struct ioc *ioc; - u64 u[NR_I_LCOEFS]; - bool user; -- char *p; -+ char *body, *p; - int ret; - -- bdev = blkcg_conf_open_bdev(&input); -- if (IS_ERR(bdev)) -- return PTR_ERR(bdev); -+ blkg_conf_init(&ctx, input); -+ -+ ret = blkg_conf_open_bdev(&ctx); -+ if (ret) -+ goto err; - -- q = bdev_get_queue(bdev); -+ body = ctx.body; -+ q = bdev_get_queue(ctx.bdev); - ioc = q_to_ioc(q); - if (!ioc) { -- ret = blk_iocost_init(bdev->bd_disk); -+ ret = blk_iocost_init(ctx.bdev->bd_disk); - if (ret) - goto err; - ioc = q_to_ioc(q); -@@ -3384,7 +3394,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, - memcpy(u, ioc->params.i_lcoefs, sizeof(u)); - user = ioc->user_cost_model; - -- while ((p = strsep(&input, " 
\t\n"))) { -+ while ((p = strsep(&body, " \t\n"))) { - substring_t args[MAX_OPT_ARGS]; - char buf[32]; - int tok; -@@ -3431,7 +3441,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, - blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); - -- blkdev_put_no_open(bdev); -+ blkg_conf_exit(&ctx); - return nbytes; - - einval: -@@ -3442,7 +3452,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, - - ret = -EINVAL; - err: -- blkdev_put_no_open(bdev); -+ blkg_conf_exit(&ctx); - return ret; - } - -diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c -index ecdc10741836..3484393dbc4a 100644 ---- a/block/blk-iolatency.c -+++ b/block/blk-iolatency.c -@@ -755,7 +755,7 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) - } - } - --int blk_iolatency_init(struct gendisk *disk) -+static int blk_iolatency_init(struct gendisk *disk) - { - struct request_queue *q = disk->queue; - struct blk_iolatency *blkiolat; -@@ -830,6 +830,29 @@ static void iolatency_clear_scaling(struct blkcg_gq *blkg) - } - } - -+static int blk_iolatency_try_init(struct blkg_conf_ctx *ctx) -+{ -+ static DEFINE_MUTEX(init_mutex); -+ int ret; -+ -+ ret = blkg_conf_open_bdev(ctx); -+ if (ret) -+ return ret; -+ -+ /* -+ * blk_iolatency_init() may fail after rq_qos_add() succeeds which can -+ * confuse iolat_rq_qos() test. Make the test and init atomic. -+ */ -+ mutex_lock(&init_mutex); -+ -+ if (!iolat_rq_qos(ctx->bdev->bd_queue)) -+ ret = blk_iolatency_init(ctx->bdev->bd_disk); -+ -+ mutex_unlock(&init_mutex); -+ -+ return ret; -+} -+ - static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) - { -@@ -842,9 +865,15 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, - u64 oldval; - int ret; - -- ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx); -+ blkg_conf_init(&ctx, buf); -+ -+ ret = blk_iolatency_try_init(&ctx); - if (ret) -- return ret; -+ goto out; -+ -+ ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, &ctx); -+ if (ret) -+ goto out; - - iolat = blkg_to_lat(ctx.blkg); - p = ctx.body; -@@ -880,7 +909,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, - iolatency_clear_scaling(blkg); - ret = 0; - out: -- blkg_conf_finish(&ctx); -+ blkg_conf_exit(&ctx); - return ret ?: nbytes; - } - -@@ -974,7 +1003,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) - { - struct iolatency_grp *iolat = pd_to_lat(pd); - struct blkcg_gq *blkg = lat_to_blkg(iolat); -- struct rq_qos *rqos = blkcg_rq_qos(blkg->q); -+ struct rq_qos *rqos = iolat_rq_qos(blkg->q); - struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); - u64 now = ktime_to_ns(ktime_get()); - int cpu; -diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h -index 1ef1f7d4bc3c..27f004fae66b 100644 ---- a/block/blk-rq-qos.h -+++ b/block/blk-rq-qos.h -@@ -74,7 +74,7 @@ static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) - return rq_qos_id(q, RQ_QOS_WBT); - } - --static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) -+static inline struct rq_qos *iolat_rq_qos(struct request_queue *q) - { - return rq_qos_id(q, RQ_QOS_LATENCY); - } -diff --git a/block/blk-throttle.c b/block/blk-throttle.c -index 6fb5a2f9e1ee..75841d1d9bf4 100644 ---- a/block/blk-throttle.c -+++ b/block/blk-throttle.c -@@ -1369,9 +1369,11 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, - int ret; - u64 v; - -- ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); -+ 
blkg_conf_init(&ctx, buf); -+ -+ ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); - if (ret) -- return ret; -+ goto out_finish; - - ret = -EINVAL; - if (sscanf(ctx.body, "%llu", &v) != 1) -@@ -1390,7 +1392,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, - tg_conf_updated(tg, false); - ret = 0; - out_finish: -- blkg_conf_finish(&ctx); -+ blkg_conf_exit(&ctx); - return ret ?: nbytes; - } - -@@ -1562,9 +1564,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, - int ret; - int index = of_cft(of)->private; - -- ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); -+ blkg_conf_init(&ctx, buf); -+ -+ ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); - if (ret) -- return ret; -+ goto out_finish; - - tg = blkg_to_tg(ctx.blkg); - tg_update_carryover(tg); -@@ -1663,7 +1667,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, - tg->td->limit_valid[LIMIT_LOW]); - ret = 0; - out_finish: -- blkg_conf_finish(&ctx); -+ blkg_conf_exit(&ctx); - return ret ?: nbytes; - } - -diff --git a/block/blk.h b/block/blk.h -index 4c3b3325219a..78f1706cddca 100644 ---- a/block/blk.h -+++ b/block/blk.h -@@ -392,12 +392,6 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, - return bio; - } - --#ifdef CONFIG_BLK_CGROUP_IOLATENCY --int blk_iolatency_init(struct gendisk *disk); --#else --static inline int blk_iolatency_init(struct gendisk *disk) { return 0; }; --#endif -- - #ifdef CONFIG_BLK_DEV_ZONED - void disk_free_zone_bitmaps(struct gendisk *disk); - void disk_clear_zone_settings(struct gendisk *disk); -- -2.40.0.rc2 +2.40.0 -From e44295cea72d5cefc97900011495f89f000873ac Mon Sep 17 00:00:00 2001 +From 7a2801ac4761f911a6b2e7a8532b9fedc5382bc5 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 13 Feb 2023 11:26:20 +0100 -Subject: [PATCH 03/16] bitmap - -Signed-off-by: Peter Jung ---- - include/linux/bitmap.h | 46 ++++++------- - include/linux/cpumask.h | 144 +++++++++++++++++++-------------------- - include/linux/find.h | 40 +++++------ - include/linux/nodemask.h | 86 +++++++++++------------ - 4 files changed, 158 insertions(+), 158 deletions(-) - -diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h -index 7d6d73b78147..40e53a2ecc0d 100644 ---- a/include/linux/bitmap.h -+++ b/include/linux/bitmap.h -@@ -189,7 +189,7 @@ unsigned long bitmap_find_next_zero_area_off(unsigned long *map, - * the bit offset of all zero areas this function finds is multiples of that - * power of 2. A @align_mask of 0 means no alignment is required. 
- */ --static inline unsigned long -+static __always_inline unsigned long - bitmap_find_next_zero_area(unsigned long *map, - unsigned long size, - unsigned long start, -@@ -237,7 +237,7 @@ extern int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp, - #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) - #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) - --static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) -+static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits) - { - unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); - -@@ -247,7 +247,7 @@ static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) - memset(dst, 0, len); - } - --static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) -+static __always_inline void bitmap_fill(unsigned long *dst, unsigned int nbits) - { - unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); - -@@ -257,7 +257,7 @@ static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) - memset(dst, 0xff, len); - } - --static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, -+static __always_inline void bitmap_copy(unsigned long *dst, const unsigned long *src, - unsigned int nbits) - { - unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); -@@ -271,7 +271,7 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, - /* - * Copy bitmap and clear tail bits in last word. - */ --static inline void bitmap_copy_clear_tail(unsigned long *dst, -+static __always_inline void bitmap_copy_clear_tail(unsigned long *dst, - const unsigned long *src, unsigned int nbits) - { - bitmap_copy(dst, src, nbits); -@@ -317,7 +317,7 @@ void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits); - bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits)) - #endif - --static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, -+static __always_inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -325,7 +325,7 @@ static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, - return __bitmap_and(dst, src1, src2, nbits); - } - --static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, -+static __always_inline void bitmap_or(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -334,7 +334,7 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, - __bitmap_or(dst, src1, src2, nbits); - } - --static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, -+static __always_inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -343,7 +343,7 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, - __bitmap_xor(dst, src1, src2, nbits); - } - --static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, -+static __always_inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -351,7 +351,7 @@ static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, - return __bitmap_andnot(dst, 
src1, src2, nbits); - } - --static inline void bitmap_complement(unsigned long *dst, const unsigned long *src, -+static __always_inline void bitmap_complement(unsigned long *dst, const unsigned long *src, - unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -367,7 +367,7 @@ static inline void bitmap_complement(unsigned long *dst, const unsigned long *sr - #endif - #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1) - --static inline bool bitmap_equal(const unsigned long *src1, -+static __always_inline bool bitmap_equal(const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -387,7 +387,7 @@ static inline bool bitmap_equal(const unsigned long *src1, - * - * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise - */ --static inline bool bitmap_or_equal(const unsigned long *src1, -+static __always_inline bool bitmap_or_equal(const unsigned long *src1, - const unsigned long *src2, - const unsigned long *src3, - unsigned int nbits) -@@ -398,7 +398,7 @@ static inline bool bitmap_or_equal(const unsigned long *src1, - return !(((*src1 | *src2) ^ *src3) & BITMAP_LAST_WORD_MASK(nbits)); - } - --static inline bool bitmap_intersects(const unsigned long *src1, -+static __always_inline bool bitmap_intersects(const unsigned long *src1, - const unsigned long *src2, - unsigned int nbits) - { -@@ -408,7 +408,7 @@ static inline bool bitmap_intersects(const unsigned long *src1, - return __bitmap_intersects(src1, src2, nbits); - } - --static inline bool bitmap_subset(const unsigned long *src1, -+static __always_inline bool bitmap_subset(const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -417,7 +417,7 @@ static inline bool bitmap_subset(const unsigned long *src1, - return __bitmap_subset(src1, src2, nbits); - } - --static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) -+static __always_inline bool bitmap_empty(const unsigned long *src, unsigned int nbits) - { - if (small_const_nbits(nbits)) - return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); -@@ -425,7 +425,7 @@ static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) - return find_first_bit(src, nbits) == nbits; - } - --static inline bool bitmap_full(const unsigned long *src, unsigned int nbits) -+static __always_inline bool bitmap_full(const unsigned long *src, unsigned int nbits) - { - if (small_const_nbits(nbits)) - return ! 
(~(*src) & BITMAP_LAST_WORD_MASK(nbits)); -@@ -482,7 +482,7 @@ static __always_inline void bitmap_clear(unsigned long *map, unsigned int start, - __bitmap_clear(map, start, nbits); - } - --static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, -+static __always_inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, - unsigned int shift, unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -491,7 +491,7 @@ static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *s - __bitmap_shift_right(dst, src, shift, nbits); - } - --static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, -+static __always_inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, - unsigned int shift, unsigned int nbits) - { - if (small_const_nbits(nbits)) -@@ -500,7 +500,7 @@ static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *sr - __bitmap_shift_left(dst, src, shift, nbits); - } - --static inline void bitmap_replace(unsigned long *dst, -+static __always_inline void bitmap_replace(unsigned long *dst, - const unsigned long *old, - const unsigned long *new, - const unsigned long *mask, -@@ -512,7 +512,7 @@ static inline void bitmap_replace(unsigned long *dst, - __bitmap_replace(dst, old, new, mask, nbits); - } - --static inline void bitmap_next_set_region(unsigned long *bitmap, -+static __always_inline void bitmap_next_set_region(unsigned long *bitmap, - unsigned int *rs, unsigned int *re, - unsigned int end) - { -@@ -563,7 +563,7 @@ static inline void bitmap_next_set_region(unsigned long *bitmap, - * That is ``(u32 *)(&val)[0]`` gets the upper 32 bits, - * but we expect the lower 32-bits of u64. - */ --static inline void bitmap_from_u64(unsigned long *dst, u64 mask) -+static __always_inline void bitmap_from_u64(unsigned long *dst, u64 mask) - { - bitmap_from_arr64(dst, &mask, 64); - } -@@ -576,7 +576,7 @@ static inline void bitmap_from_u64(unsigned long *dst, u64 mask) - * Returns the 8-bit value located at the @start bit offset within the @src - * memory region. - */ --static inline unsigned long bitmap_get_value8(const unsigned long *map, -+static __always_inline unsigned long bitmap_get_value8(const unsigned long *map, - unsigned long start) - { - const size_t index = BIT_WORD(start); -@@ -591,7 +591,7 @@ static inline unsigned long bitmap_get_value8(const unsigned long *map, - * @value: the 8-bit value; values wider than 8 bits may clobber bitmap - * @start: bit offset of the 8-bit value; must be a multiple of 8 - */ --static inline void bitmap_set_value8(unsigned long *map, unsigned long value, -+static __always_inline void bitmap_set_value8(unsigned long *map, unsigned long value, - unsigned long start) - { - const size_t index = BIT_WORD(start); -diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h -index c2aa0aa26b45..9543b22d6dc2 100644 ---- a/include/linux/cpumask.h -+++ b/include/linux/cpumask.h -@@ -41,7 +41,7 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; - extern unsigned int nr_cpu_ids; - #endif - --static inline void set_nr_cpu_ids(unsigned int nr) -+static __always_inline void set_nr_cpu_ids(unsigned int nr) - { - #if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) - WARN_ON(nr != nr_cpu_ids); -@@ -124,7 +124,7 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu) - * - * Returns >= nr_cpu_ids if no cpus set. 
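
/*
 * Illustrative sketch (not kernel code): the point of converting these
 * bitmap/cpumask helpers from inline to __always_inline is that their
 * small_const_nbits() fast path (see bitmap_empty() earlier in this
 * hunk) only folds to a single-word operation when the call really is
 * inlined with a compile-time-constant nbits. my_bitmap_empty() below
 * is a made-up user-space analogue of that pattern.
 */
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_LONG         (CHAR_BIT * sizeof(long))
#define LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
#define small_const_nbits(nbits) \
    (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)

static inline __attribute__((always_inline))
bool my_bitmap_empty(const unsigned long *src, unsigned int nbits)
{
    if (small_const_nbits(nbits))
        /* with a constant nbits this whole call folds to one AND + test */
        return !(*src & LAST_WORD_MASK(nbits));

    /* generic path: scan word by word */
    for (unsigned int i = 0; i < nbits / BITS_PER_LONG; i++)
        if (src[i])
            return false;
    if (nbits % BITS_PER_LONG)
        return !(src[nbits / BITS_PER_LONG] & LAST_WORD_MASK(nbits));
    return true;
}

int main(void)
{
    unsigned long map[2] = { 0, 1UL << 3 };

    printf("%d %d\n", my_bitmap_empty(map, 16), my_bitmap_empty(map, 128));
    return 0;
}
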
- */ --static inline unsigned int cpumask_first(const struct cpumask *srcp) -+static __always_inline unsigned int cpumask_first(const struct cpumask *srcp) - { - return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); - } -@@ -135,7 +135,7 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) - * - * Returns >= nr_cpu_ids if all cpus are set. - */ --static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) -+static __always_inline unsigned int cpumask_first_zero(const struct cpumask *srcp) - { - return find_first_zero_bit(cpumask_bits(srcp), nr_cpumask_bits); - } -@@ -147,7 +147,7 @@ static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) - * - * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). - */ --static inline -+static __always_inline - unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) - { - return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); -@@ -159,7 +159,7 @@ unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask - * - * Returns >= nr_cpumask_bits if no CPUs set. - */ --static inline unsigned int cpumask_last(const struct cpumask *srcp) -+static __always_inline unsigned int cpumask_last(const struct cpumask *srcp) - { - return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits); - } -@@ -171,7 +171,7 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp) - * - * Returns >= nr_cpu_ids if no further cpus set. - */ --static inline -+static __always_inline - unsigned int cpumask_next(int n, const struct cpumask *srcp) - { - /* -1 is a legal arg here. */ -@@ -187,7 +187,7 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp) - * - * Returns >= nr_cpu_ids if no further cpus unset. - */ --static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) -+static __always_inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) - { - /* -1 is a legal arg here. */ - if (n != -1) -@@ -197,18 +197,18 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) - - #if NR_CPUS == 1 - /* Uniprocessor: there is only one valid CPU */ --static inline unsigned int cpumask_local_spread(unsigned int i, int node) -+static __always_inline unsigned int cpumask_local_spread(unsigned int i, int node) - { - return 0; - } - --static inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, -+static __always_inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, - const struct cpumask *src2p) - { - return cpumask_first_and(src1p, src2p); - } - --static inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) -+static __always_inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) - { - return cpumask_first(srcp); - } -@@ -227,7 +227,7 @@ unsigned int cpumask_any_distribute(const struct cpumask *srcp); - * - * Returns >= nr_cpu_ids if no further cpus set in both. 
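
/*
 * Illustrative sketch (not kernel code): cpumask_next_and() documented
 * above is "next bit set in (src1 & src2) after n". For a single-word
 * mask that is just an AND plus a count-trailing-zeros; next_and_bit()
 * below is a made-up name for this example.
 */
#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG (CHAR_BIT * sizeof(long))

/* Lowest bit >= start that is set in (a & b), or BITS_PER_LONG if none. */
static unsigned int next_and_bit(unsigned long a, unsigned long b,
                                 unsigned int start)
{
    unsigned long word;

    if (start >= BITS_PER_LONG)
        return BITS_PER_LONG;
    word = (a & b) >> start;
    return word ? start + __builtin_ctzl(word) : (unsigned int)BITS_PER_LONG;
}

int main(void)
{
    unsigned long online = 0xf0f0, allowed = 0xff00;

    /* visits bits 12..15: the only ones set in both masks */
    for (unsigned int cpu = next_and_bit(online, allowed, 0);
         cpu < BITS_PER_LONG;
         cpu = next_and_bit(online, allowed, cpu + 1))
        printf("cpu %u\n", cpu);
    return 0;
}
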
- */ --static inline -+static __always_inline - unsigned int cpumask_next_and(int n, const struct cpumask *src1p, - const struct cpumask *src2p) - { -@@ -259,7 +259,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, - for_each_clear_bit(cpu, cpumask_bits(mask), nr_cpumask_bits) - - #if NR_CPUS == 1 --static inline -+static __always_inline - unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) - { - cpumask_check(start); -@@ -335,7 +335,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta - * Often used to find any cpu but smp_processor_id() in a mask. - * Returns >= nr_cpu_ids if no cpus set. - */ --static inline -+static __always_inline - unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) - { - unsigned int i; -@@ -354,7 +354,7 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) - * - * Returns >= nr_cpu_ids if such cpu doesn't exist. - */ --static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) -+static __always_inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) - { - return find_nth_bit(cpumask_bits(srcp), nr_cpumask_bits, cpumask_check(cpu)); - } -@@ -367,7 +367,7 @@ static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *s - * - * Returns >= nr_cpu_ids if such cpu doesn't exist. - */ --static inline -+static __always_inline - unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, - const struct cpumask *srcp2) - { -@@ -383,7 +383,7 @@ unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, - * - * Returns >= nr_cpu_ids if such cpu doesn't exist. - */ --static inline -+static __always_inline - unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1, - const struct cpumask *srcp2) - { -@@ -476,7 +476,7 @@ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask * - * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask - * @dstp: the cpumask pointer - */ --static inline void cpumask_setall(struct cpumask *dstp) -+static __always_inline void cpumask_setall(struct cpumask *dstp) - { - bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); - } -@@ -485,7 +485,7 @@ static inline void cpumask_setall(struct cpumask *dstp) - * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask - * @dstp: the cpumask pointer - */ --static inline void cpumask_clear(struct cpumask *dstp) -+static __always_inline void cpumask_clear(struct cpumask *dstp) - { - bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits); - } -@@ -498,7 +498,7 @@ static inline void cpumask_clear(struct cpumask *dstp) - * - * If *@dstp is empty, returns false, else returns true - */ --static inline bool cpumask_and(struct cpumask *dstp, -+static __always_inline bool cpumask_and(struct cpumask *dstp, - const struct cpumask *src1p, - const struct cpumask *src2p) - { -@@ -512,7 +512,7 @@ static inline bool cpumask_and(struct cpumask *dstp, - * @src1p: the first input - * @src2p: the second input - */ --static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, -+static __always_inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, - const struct cpumask *src2p) - { - bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), -@@ -525,7 +525,7 @@ static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, - * @src1p: the first input - * @src2p: the second input - */ --static inline void 
cpumask_xor(struct cpumask *dstp, -+static __always_inline void cpumask_xor(struct cpumask *dstp, - const struct cpumask *src1p, - const struct cpumask *src2p) - { -@@ -541,7 +541,7 @@ static inline void cpumask_xor(struct cpumask *dstp, - * - * If *@dstp is empty, returns false, else returns true - */ --static inline bool cpumask_andnot(struct cpumask *dstp, -+static __always_inline bool cpumask_andnot(struct cpumask *dstp, - const struct cpumask *src1p, - const struct cpumask *src2p) - { -@@ -554,7 +554,7 @@ static inline bool cpumask_andnot(struct cpumask *dstp, - * @dstp: the cpumask result - * @srcp: the input to invert - */ --static inline void cpumask_complement(struct cpumask *dstp, -+static __always_inline void cpumask_complement(struct cpumask *dstp, - const struct cpumask *srcp) - { - bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp), -@@ -566,7 +566,7 @@ static inline void cpumask_complement(struct cpumask *dstp, - * @src1p: the first input - * @src2p: the second input - */ --static inline bool cpumask_equal(const struct cpumask *src1p, -+static __always_inline bool cpumask_equal(const struct cpumask *src1p, - const struct cpumask *src2p) - { - return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), -@@ -579,7 +579,7 @@ static inline bool cpumask_equal(const struct cpumask *src1p, - * @src2p: the second input - * @src3p: the third input - */ --static inline bool cpumask_or_equal(const struct cpumask *src1p, -+static __always_inline bool cpumask_or_equal(const struct cpumask *src1p, - const struct cpumask *src2p, - const struct cpumask *src3p) - { -@@ -592,7 +592,7 @@ static inline bool cpumask_or_equal(const struct cpumask *src1p, - * @src1p: the first input - * @src2p: the second input - */ --static inline bool cpumask_intersects(const struct cpumask *src1p, -+static __always_inline bool cpumask_intersects(const struct cpumask *src1p, - const struct cpumask *src2p) - { - return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p), -@@ -606,7 +606,7 @@ static inline bool cpumask_intersects(const struct cpumask *src1p, - * - * Returns true if *@src1p is a subset of *@src2p, else returns false - */ --static inline bool cpumask_subset(const struct cpumask *src1p, -+static __always_inline bool cpumask_subset(const struct cpumask *src1p, - const struct cpumask *src2p) - { - return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), -@@ -617,7 +617,7 @@ static inline bool cpumask_subset(const struct cpumask *src1p, - * cpumask_empty - *srcp == 0 - * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear. - */ --static inline bool cpumask_empty(const struct cpumask *srcp) -+static __always_inline bool cpumask_empty(const struct cpumask *srcp) - { - return bitmap_empty(cpumask_bits(srcp), nr_cpumask_bits); - } -@@ -626,7 +626,7 @@ static inline bool cpumask_empty(const struct cpumask *srcp) - * cpumask_full - *srcp == 0xFFFFFFFF... - * @srcp: the cpumask to that all cpus < nr_cpu_ids are set. - */ --static inline bool cpumask_full(const struct cpumask *srcp) -+static __always_inline bool cpumask_full(const struct cpumask *srcp) - { - return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits); - } -@@ -635,7 +635,7 @@ static inline bool cpumask_full(const struct cpumask *srcp) - * cpumask_weight - Count of bits in *srcp - * @srcp: the cpumask to count bits (< nr_cpu_ids) in. 
- */ --static inline unsigned int cpumask_weight(const struct cpumask *srcp) -+static __always_inline unsigned int cpumask_weight(const struct cpumask *srcp) - { - return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits); - } -@@ -645,7 +645,7 @@ static inline unsigned int cpumask_weight(const struct cpumask *srcp) - * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. - * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. - */ --static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, -+static __always_inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, - const struct cpumask *srcp2) - { - return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); -@@ -657,7 +657,7 @@ static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, - * @srcp: the input to shift - * @n: the number of bits to shift by - */ --static inline void cpumask_shift_right(struct cpumask *dstp, -+static __always_inline void cpumask_shift_right(struct cpumask *dstp, - const struct cpumask *srcp, int n) - { - bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, -@@ -670,7 +670,7 @@ static inline void cpumask_shift_right(struct cpumask *dstp, - * @srcp: the input to shift - * @n: the number of bits to shift by - */ --static inline void cpumask_shift_left(struct cpumask *dstp, -+static __always_inline void cpumask_shift_left(struct cpumask *dstp, - const struct cpumask *srcp, int n) - { - bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, -@@ -682,7 +682,7 @@ static inline void cpumask_shift_left(struct cpumask *dstp, - * @dstp: the result - * @srcp: the input cpumask - */ --static inline void cpumask_copy(struct cpumask *dstp, -+static __always_inline void cpumask_copy(struct cpumask *dstp, - const struct cpumask *srcp) - { - bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits); -@@ -719,7 +719,7 @@ static inline void cpumask_copy(struct cpumask *dstp, - * - * Returns -errno, or 0 for success. - */ --static inline int cpumask_parse_user(const char __user *buf, int len, -+static __always_inline int cpumask_parse_user(const char __user *buf, int len, - struct cpumask *dstp) - { - return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); -@@ -733,7 +733,7 @@ static inline int cpumask_parse_user(const char __user *buf, int len, - * - * Returns -errno, or 0 for success. - */ --static inline int cpumask_parselist_user(const char __user *buf, int len, -+static __always_inline int cpumask_parselist_user(const char __user *buf, int len, - struct cpumask *dstp) - { - return bitmap_parselist_user(buf, len, cpumask_bits(dstp), -@@ -747,7 +747,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, - * - * Returns -errno, or 0 for success. - */ --static inline int cpumask_parse(const char *buf, struct cpumask *dstp) -+static __always_inline int cpumask_parse(const char *buf, struct cpumask *dstp) - { - return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits); - } -@@ -759,7 +759,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) - * - * Returns -errno, or 0 for success. 
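
/*
 * Illustrative sketch (not kernel code): cpulist_parse() above accepts
 * the usual "0-3,8,10-11" list syntax. parse_cpulist() below is a
 * made-up minimal parser for masks of up to 64 CPUs, with only basic
 * error checking.
 */
#include <stdio.h>
#include <stdlib.h>

static int parse_cpulist(const char *s, unsigned long long *mask)
{
    *mask = 0;
    while (*s) {
        char *end;
        unsigned long a = strtoul(s, &end, 10), b = a;

        if (end == s)
            return -1;            /* expected a CPU number */
        if (*end == '-') {
            s = end + 1;
            b = strtoul(s, &end, 10);
            if (end == s || b < a)
                return -1;        /* malformed range */
        }
        if (b >= 64)
            return -1;            /* out of range for this sketch */
        for (unsigned long cpu = a; cpu <= b; cpu++)
            *mask |= 1ULL << cpu;
        s = end;
        if (*s == ',')
            s++;
        else if (*s)
            return -1;            /* junk after an entry */
    }
    return 0;
}

int main(void)
{
    unsigned long long mask;

    if (!parse_cpulist("0-3,8,10-11", &mask))
        printf("mask = 0x%llx\n", mask); /* prints 0xd0f */
    return 0;
}
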
- */ --static inline int cpulist_parse(const char *buf, struct cpumask *dstp) -+static __always_inline int cpulist_parse(const char *buf, struct cpumask *dstp) - { - return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); - } -@@ -767,7 +767,7 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) - /** - * cpumask_size - size to allocate for a 'struct cpumask' in bytes - */ --static inline unsigned int cpumask_size(void) -+static __always_inline unsigned int cpumask_size(void) - { - return BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long); - } -@@ -820,7 +820,7 @@ typedef struct cpumask *cpumask_var_t; - - bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); - --static inline -+static __always_inline - bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) - { - return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node); -@@ -836,13 +836,13 @@ bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) - * - * See alloc_cpumask_var_node. - */ --static inline -+static __always_inline - bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) - { - return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE); - } - --static inline -+static __always_inline - bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) - { - return alloc_cpumask_var(mask, flags | __GFP_ZERO); -@@ -852,7 +852,7 @@ void alloc_bootmem_cpumask_var(cpumask_var_t *mask); - void free_cpumask_var(cpumask_var_t mask); - void free_bootmem_cpumask_var(cpumask_var_t mask); - --static inline bool cpumask_available(cpumask_var_t mask) -+static __always_inline bool cpumask_available(cpumask_var_t mask) - { - return mask != NULL; - } -@@ -863,43 +863,43 @@ typedef struct cpumask cpumask_var_t[1]; - #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) - #define __cpumask_var_read_mostly - --static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) -+static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) - { - return true; - } - --static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, -+static __always_inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, - int node) - { - return true; - } - --static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) -+static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) - { - cpumask_clear(*mask); - return true; - } - --static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, -+static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, - int node) - { - cpumask_clear(*mask); - return true; - } - --static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) -+static __always_inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) - { - } - --static inline void free_cpumask_var(cpumask_var_t mask) -+static __always_inline void free_cpumask_var(cpumask_var_t mask) - { - } - --static inline void free_bootmem_cpumask_var(cpumask_var_t mask) -+static __always_inline void free_bootmem_cpumask_var(cpumask_var_t mask) - { - } - --static inline bool cpumask_available(cpumask_var_t mask) -+static __always_inline bool cpumask_available(cpumask_var_t mask) - { - return true; - } -@@ -929,12 +929,12 @@ void init_cpu_present(const struct cpumask *src); - void init_cpu_possible(const struct cpumask *src); - void init_cpu_online(const struct cpumask *src); - --static inline void reset_cpu_possible_mask(void) -+static __always_inline void reset_cpu_possible_mask(void) - 
{ - bitmap_zero(cpumask_bits(&__cpu_possible_mask), NR_CPUS); - } - --static inline void -+static __always_inline void - set_cpu_possible(unsigned int cpu, bool possible) - { - if (possible) -@@ -943,7 +943,7 @@ set_cpu_possible(unsigned int cpu, bool possible) - cpumask_clear_cpu(cpu, &__cpu_possible_mask); - } - --static inline void -+static __always_inline void - set_cpu_present(unsigned int cpu, bool present) - { - if (present) -@@ -954,7 +954,7 @@ set_cpu_present(unsigned int cpu, bool present) - - void set_cpu_online(unsigned int cpu, bool online); - --static inline void -+static __always_inline void - set_cpu_active(unsigned int cpu, bool active) - { - if (active) -@@ -963,7 +963,7 @@ set_cpu_active(unsigned int cpu, bool active) - cpumask_clear_cpu(cpu, &__cpu_active_mask); - } - --static inline void -+static __always_inline void - set_cpu_dying(unsigned int cpu, bool dying) - { - if (dying) -@@ -986,7 +986,7 @@ set_cpu_dying(unsigned int cpu, bool dying) - ((struct cpumask *)(1 ? (bitmap) \ - : (void *)sizeof(__check_is_bitmap(bitmap)))) - --static inline int __check_is_bitmap(const unsigned long *bitmap) -+static __always_inline int __check_is_bitmap(const unsigned long *bitmap) - { - return 1; - } -@@ -1001,7 +1001,7 @@ static inline int __check_is_bitmap(const unsigned long *bitmap) - extern const unsigned long - cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; - --static inline const struct cpumask *get_cpu_mask(unsigned int cpu) -+static __always_inline const struct cpumask *get_cpu_mask(unsigned int cpu) - { - const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; - p -= cpu / BITS_PER_LONG; -@@ -1017,7 +1017,7 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu) - * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held - * region. 
- */ --static inline unsigned int num_online_cpus(void) -+static __always_inline unsigned int num_online_cpus(void) - { - return atomic_read(&__num_online_cpus); - } -@@ -1025,27 +1025,27 @@ static inline unsigned int num_online_cpus(void) - #define num_present_cpus() cpumask_weight(cpu_present_mask) - #define num_active_cpus() cpumask_weight(cpu_active_mask) - --static inline bool cpu_online(unsigned int cpu) -+static __always_inline bool cpu_online(unsigned int cpu) - { - return cpumask_test_cpu(cpu, cpu_online_mask); - } - --static inline bool cpu_possible(unsigned int cpu) -+static __always_inline bool cpu_possible(unsigned int cpu) - { - return cpumask_test_cpu(cpu, cpu_possible_mask); - } - --static inline bool cpu_present(unsigned int cpu) -+static __always_inline bool cpu_present(unsigned int cpu) - { - return cpumask_test_cpu(cpu, cpu_present_mask); - } - --static inline bool cpu_active(unsigned int cpu) -+static __always_inline bool cpu_active(unsigned int cpu) - { - return cpumask_test_cpu(cpu, cpu_active_mask); - } - --static inline bool cpu_dying(unsigned int cpu) -+static __always_inline bool cpu_dying(unsigned int cpu) - { - return cpumask_test_cpu(cpu, cpu_dying_mask); - } -@@ -1057,27 +1057,27 @@ static inline bool cpu_dying(unsigned int cpu) - #define num_present_cpus() 1U - #define num_active_cpus() 1U - --static inline bool cpu_online(unsigned int cpu) -+static __always_inline bool cpu_online(unsigned int cpu) - { - return cpu == 0; - } - --static inline bool cpu_possible(unsigned int cpu) -+static __always_inline bool cpu_possible(unsigned int cpu) - { - return cpu == 0; - } - --static inline bool cpu_present(unsigned int cpu) -+static __always_inline bool cpu_present(unsigned int cpu) - { - return cpu == 0; - } - --static inline bool cpu_active(unsigned int cpu) -+static __always_inline bool cpu_active(unsigned int cpu) - { - return cpu == 0; - } - --static inline bool cpu_dying(unsigned int cpu) -+static __always_inline bool cpu_dying(unsigned int cpu) - { - return false; - } -@@ -1111,7 +1111,7 @@ static inline bool cpu_dying(unsigned int cpu) - * Returns the length of the (null-terminated) @buf string, zero if - * nothing is copied. - */ --static inline ssize_t -+static __always_inline ssize_t - cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) - { - return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask), -@@ -1134,7 +1134,7 @@ cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) - * Returns the length of how many bytes have been copied, excluding - * terminating '\0'. - */ --static inline ssize_t -+static __always_inline ssize_t - cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, - loff_t off, size_t count) - { -@@ -1149,7 +1149,7 @@ cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, - * Everything is same with the above cpumap_print_bitmask_to_buf() - * except the print format. - */ --static inline ssize_t -+static __always_inline ssize_t - cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, - loff_t off, size_t count) - { -diff --git a/include/linux/find.h b/include/linux/find.h -index ccaf61a0f5fd..db2f2851601d 100644 ---- a/include/linux/find.h -+++ b/include/linux/find.h -@@ -45,7 +45,7 @@ unsigned long _find_next_bit_le(const unsigned long *addr, unsigned - * Returns the bit number for the next set bit - * If no bits are set, returns @size. 
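
/*
 * Illustrative sketch (not kernel code): the find_next_bit() family
 * above shares one shape: mask off the bits below @offset in the first
 * word, then scan word by word. my_find_next_bit() below is a made-up
 * user-space version of that idea.
 */
#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG (CHAR_BIT * sizeof(long))

static unsigned long my_find_next_bit(const unsigned long *addr,
                                      unsigned long size, unsigned long offset)
{
    unsigned long nwords = (size + BITS_PER_LONG - 1) / BITS_PER_LONG;
    unsigned long i, word, bit;

    if (offset >= size)
        return size;

    i = offset / BITS_PER_LONG;
    /* drop the bits below @offset in the first word we look at */
    word = addr[i] & (~0UL << (offset % BITS_PER_LONG));

    while (!word) {
        if (++i >= nwords)
            return size;
        word = addr[i];
    }

    bit = i * BITS_PER_LONG + __builtin_ctzl(word);
    return bit < size ? bit : size;
}

int main(void)
{
    unsigned long map[2] = { 1UL << 5, 1UL << 2 }; /* bit 5 of word 0, bit 2 of word 1 */

    for (unsigned long b = my_find_next_bit(map, 128, 0);
         b < 128;
         b = my_find_next_bit(map, 128, b + 1))
        printf("bit %lu\n", b);
    return 0;
}
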
- */ --static inline -+static __always_inline - unsigned long find_next_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) - { -@@ -74,7 +74,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size, - * Returns the bit number for the next set bit - * If no bits are set, returns @size. - */ --static inline -+static __always_inline - unsigned long find_next_and_bit(const unsigned long *addr1, - const unsigned long *addr2, unsigned long size, - unsigned long offset) -@@ -105,7 +105,7 @@ unsigned long find_next_and_bit(const unsigned long *addr1, - * Returns the bit number for the next set bit - * If no bits are set, returns @size. - */ --static inline -+static __always_inline - unsigned long find_next_andnot_bit(const unsigned long *addr1, - const unsigned long *addr2, unsigned long size, - unsigned long offset) -@@ -134,7 +134,7 @@ unsigned long find_next_andnot_bit(const unsigned long *addr1, - * Returns the bit number of the next zero bit - * If no bits are zero, returns @size. - */ --static inline -+static __always_inline - unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) - { -@@ -161,7 +161,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, - * Returns the bit number of the first set bit. - * If no bits are set, returns @size. - */ --static inline -+static __always_inline - unsigned long find_first_bit(const unsigned long *addr, unsigned long size) - { - if (small_const_nbits(size)) { -@@ -187,7 +187,7 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) - * Returns the bit number of the N'th set bit. - * If no such, returns @size. - */ --static inline -+static __always_inline - unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) - { - if (n >= size) -@@ -212,7 +212,7 @@ unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsign - * Returns the bit number of the N'th set bit. - * If no such, returns @size. - */ --static inline -+static __always_inline - unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, - unsigned long size, unsigned long n) - { -@@ -239,7 +239,7 @@ unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long * - * Returns the bit number of the N'th set bit. - * If no such, returns @size. - */ --static inline -+static __always_inline - unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, - unsigned long size, unsigned long n) - { -@@ -265,7 +265,7 @@ unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned lon - * Returns the bit number for the next set bit - * If no bits are set, returns @size. - */ --static inline -+static __always_inline - unsigned long find_first_and_bit(const unsigned long *addr1, - const unsigned long *addr2, - unsigned long size) -@@ -289,7 +289,7 @@ unsigned long find_first_and_bit(const unsigned long *addr1, - * Returns the bit number of the first cleared bit. - * If no bits are zero, returns @size. - */ --static inline -+static __always_inline - unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) - { - if (small_const_nbits(size)) { -@@ -310,7 +310,7 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) - * - * Returns the bit number of the last set bit, or size. 
- */ --static inline -+static __always_inline - unsigned long find_last_bit(const unsigned long *addr, unsigned long size) - { - if (small_const_nbits(size)) { -@@ -333,7 +333,7 @@ unsigned long find_last_bit(const unsigned long *addr, unsigned long size) - * Returns the bit number for the next set bit, or first set bit up to @offset - * If no bits are set, returns @size. - */ --static inline -+static __always_inline - unsigned long find_next_and_bit_wrap(const unsigned long *addr1, - const unsigned long *addr2, - unsigned long size, unsigned long offset) -@@ -356,7 +356,7 @@ unsigned long find_next_and_bit_wrap(const unsigned long *addr1, - * Returns the bit number for the next set bit, or first set bit up to @offset - * If no bits are set, returns @size. - */ --static inline -+static __always_inline - unsigned long find_next_bit_wrap(const unsigned long *addr, - unsigned long size, unsigned long offset) - { -@@ -373,7 +373,7 @@ unsigned long find_next_bit_wrap(const unsigned long *addr, - * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing - * before using it alone. - */ --static inline -+static __always_inline - unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, - unsigned long start, unsigned long n) - { -@@ -414,19 +414,19 @@ extern unsigned long find_next_clump8(unsigned long *clump, - - #if defined(__LITTLE_ENDIAN) - --static inline unsigned long find_next_zero_bit_le(const void *addr, -+static __always_inline unsigned long find_next_zero_bit_le(const void *addr, - unsigned long size, unsigned long offset) - { - return find_next_zero_bit(addr, size, offset); - } - --static inline unsigned long find_next_bit_le(const void *addr, -+static __always_inline unsigned long find_next_bit_le(const void *addr, - unsigned long size, unsigned long offset) - { - return find_next_bit(addr, size, offset); - } - --static inline unsigned long find_first_zero_bit_le(const void *addr, -+static __always_inline unsigned long find_first_zero_bit_le(const void *addr, - unsigned long size) - { - return find_first_zero_bit(addr, size); -@@ -435,7 +435,7 @@ static inline unsigned long find_first_zero_bit_le(const void *addr, - #elif defined(__BIG_ENDIAN) - - #ifndef find_next_zero_bit_le --static inline -+static __always_inline - unsigned long find_next_zero_bit_le(const void *addr, unsigned - long size, unsigned long offset) - { -@@ -454,7 +454,7 @@ unsigned long find_next_zero_bit_le(const void *addr, unsigned - #endif - - #ifndef find_first_zero_bit_le --static inline -+static __always_inline - unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) - { - if (small_const_nbits(size)) { -@@ -468,7 +468,7 @@ unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) - #endif - - #ifndef find_next_bit_le --static inline -+static __always_inline - unsigned long find_next_bit_le(const void *addr, unsigned - long size, unsigned long offset) - { -diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h -index bb0ee80526b2..8c04254c5284 100644 ---- a/include/linux/nodemask.h -+++ b/include/linux/nodemask.h -@@ -107,11 +107,11 @@ extern nodemask_t _unused_nodemask_arg_; - */ - #define nodemask_pr_args(maskp) __nodemask_pr_numnodes(maskp), \ - __nodemask_pr_bits(maskp) --static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) -+static __always_inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) - { - return m ? 
MAX_NUMNODES : 0; - } --static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) -+static __always_inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) - { - return m ? m->bits : NULL; - } -@@ -132,19 +132,19 @@ static __always_inline void __node_set(int node, volatile nodemask_t *dstp) - } - - #define node_clear(node, dst) __node_clear((node), &(dst)) --static inline void __node_clear(int node, volatile nodemask_t *dstp) -+static __always_inline void __node_clear(int node, volatile nodemask_t *dstp) - { - clear_bit(node, dstp->bits); - } - - #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) --static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) -+static __always_inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) - { - bitmap_fill(dstp->bits, nbits); - } - - #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) --static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) -+static __always_inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) - { - bitmap_zero(dstp->bits, nbits); - } -@@ -154,14 +154,14 @@ static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) - - #define node_test_and_set(node, nodemask) \ - __node_test_and_set((node), &(nodemask)) --static inline bool __node_test_and_set(int node, nodemask_t *addr) -+static __always_inline bool __node_test_and_set(int node, nodemask_t *addr) - { - return test_and_set_bit(node, addr->bits); - } - - #define nodes_and(dst, src1, src2) \ - __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) --static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, -+static __always_inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, unsigned int nbits) - { - bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); -@@ -169,7 +169,7 @@ static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, - - #define nodes_or(dst, src1, src2) \ - __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) --static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, -+static __always_inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, unsigned int nbits) - { - bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); -@@ -177,7 +177,7 @@ static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, - - #define nodes_xor(dst, src1, src2) \ - __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) --static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, -+static __always_inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, unsigned int nbits) - { - bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); -@@ -185,7 +185,7 @@ static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, - - #define nodes_andnot(dst, src1, src2) \ - __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) --static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, -+static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, unsigned int nbits) - { - bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); -@@ -193,7 +193,7 @@ static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, - - #define nodes_complement(dst, src) \ - __nodes_complement(&(dst), &(src), MAX_NUMNODES) --static inline void __nodes_complement(nodemask_t *dstp, -+static __always_inline void 
__nodes_complement(nodemask_t *dstp, - const nodemask_t *srcp, unsigned int nbits) - { - bitmap_complement(dstp->bits, srcp->bits, nbits); -@@ -201,7 +201,7 @@ static inline void __nodes_complement(nodemask_t *dstp, - - #define nodes_equal(src1, src2) \ - __nodes_equal(&(src1), &(src2), MAX_NUMNODES) --static inline bool __nodes_equal(const nodemask_t *src1p, -+static __always_inline bool __nodes_equal(const nodemask_t *src1p, - const nodemask_t *src2p, unsigned int nbits) - { - return bitmap_equal(src1p->bits, src2p->bits, nbits); -@@ -209,7 +209,7 @@ static inline bool __nodes_equal(const nodemask_t *src1p, - - #define nodes_intersects(src1, src2) \ - __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) --static inline bool __nodes_intersects(const nodemask_t *src1p, -+static __always_inline bool __nodes_intersects(const nodemask_t *src1p, - const nodemask_t *src2p, unsigned int nbits) - { - return bitmap_intersects(src1p->bits, src2p->bits, nbits); -@@ -217,33 +217,33 @@ static inline bool __nodes_intersects(const nodemask_t *src1p, - - #define nodes_subset(src1, src2) \ - __nodes_subset(&(src1), &(src2), MAX_NUMNODES) --static inline bool __nodes_subset(const nodemask_t *src1p, -+static __always_inline bool __nodes_subset(const nodemask_t *src1p, - const nodemask_t *src2p, unsigned int nbits) - { - return bitmap_subset(src1p->bits, src2p->bits, nbits); - } - - #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) --static inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) -+static __always_inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) - { - return bitmap_empty(srcp->bits, nbits); - } - - #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) --static inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) -+static __always_inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) - { - return bitmap_full(srcp->bits, nbits); - } - - #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) --static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) -+static __always_inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) - { - return bitmap_weight(srcp->bits, nbits); - } - - #define nodes_shift_right(dst, src, n) \ - __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES) --static inline void __nodes_shift_right(nodemask_t *dstp, -+static __always_inline void __nodes_shift_right(nodemask_t *dstp, - const nodemask_t *srcp, int n, int nbits) - { - bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); -@@ -251,7 +251,7 @@ static inline void __nodes_shift_right(nodemask_t *dstp, - - #define nodes_shift_left(dst, src, n) \ - __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES) --static inline void __nodes_shift_left(nodemask_t *dstp, -+static __always_inline void __nodes_shift_left(nodemask_t *dstp, - const nodemask_t *srcp, int n, int nbits) - { - bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); -@@ -261,13 +261,13 @@ static inline void __nodes_shift_left(nodemask_t *dstp, - > MAX_NUMNODES, then the silly min_ts could be dropped. 
*/ - - #define first_node(src) __first_node(&(src)) --static inline unsigned int __first_node(const nodemask_t *srcp) -+static __always_inline unsigned int __first_node(const nodemask_t *srcp) - { - return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); - } - - #define next_node(n, src) __next_node((n), &(src)) --static inline unsigned int __next_node(int n, const nodemask_t *srcp) -+static __always_inline unsigned int __next_node(int n, const nodemask_t *srcp) - { - return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); - } -@@ -277,7 +277,7 @@ static inline unsigned int __next_node(int n, const nodemask_t *srcp) - * the first node in src if needed. Returns MAX_NUMNODES if src is empty. - */ - #define next_node_in(n, src) __next_node_in((n), &(src)) --static inline unsigned int __next_node_in(int node, const nodemask_t *srcp) -+static __always_inline unsigned int __next_node_in(int node, const nodemask_t *srcp) - { - unsigned int ret = __next_node(node, srcp); - -@@ -286,7 +286,7 @@ static inline unsigned int __next_node_in(int node, const nodemask_t *srcp) - return ret; - } - --static inline void init_nodemask_of_node(nodemask_t *mask, int node) -+static __always_inline void init_nodemask_of_node(nodemask_t *mask, int node) - { - nodes_clear(*mask); - node_set(node, *mask); -@@ -304,7 +304,7 @@ static inline void init_nodemask_of_node(nodemask_t *mask, int node) - }) - - #define first_unset_node(mask) __first_unset_node(&(mask)) --static inline unsigned int __first_unset_node(const nodemask_t *maskp) -+static __always_inline unsigned int __first_unset_node(const nodemask_t *maskp) - { - return min_t(unsigned int, MAX_NUMNODES, - find_first_zero_bit(maskp->bits, MAX_NUMNODES)); -@@ -338,21 +338,21 @@ static inline unsigned int __first_unset_node(const nodemask_t *maskp) - - #define nodemask_parse_user(ubuf, ulen, dst) \ - __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES) --static inline int __nodemask_parse_user(const char __user *buf, int len, -+static __always_inline int __nodemask_parse_user(const char __user *buf, int len, - nodemask_t *dstp, int nbits) - { - return bitmap_parse_user(buf, len, dstp->bits, nbits); - } - - #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES) --static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) -+static __always_inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) - { - return bitmap_parselist(buf, dstp->bits, nbits); - } - - #define node_remap(oldbit, old, new) \ - __node_remap((oldbit), &(old), &(new), MAX_NUMNODES) --static inline int __node_remap(int oldbit, -+static __always_inline int __node_remap(int oldbit, - const nodemask_t *oldp, const nodemask_t *newp, int nbits) - { - return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits); -@@ -360,7 +360,7 @@ static inline int __node_remap(int oldbit, - - #define nodes_remap(dst, src, old, new) \ - __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES) --static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, -+static __always_inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, - const nodemask_t *oldp, const nodemask_t *newp, int nbits) - { - bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits); -@@ -368,7 +368,7 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, - - #define nodes_onto(dst, orig, relmap) \ - __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES) --static 
inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, -+static __always_inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, - const nodemask_t *relmapp, int nbits) - { - bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits); -@@ -376,7 +376,7 @@ static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, - - #define nodes_fold(dst, orig, sz) \ - __nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES) --static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, -+static __always_inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, - int sz, int nbits) - { - bitmap_fold(dstp->bits, origp->bits, sz, nbits); -@@ -418,22 +418,22 @@ enum node_states { - extern nodemask_t node_states[NR_NODE_STATES]; - - #if MAX_NUMNODES > 1 --static inline int node_state(int node, enum node_states state) -+static __always_inline int node_state(int node, enum node_states state) - { - return node_isset(node, node_states[state]); - } - --static inline void node_set_state(int node, enum node_states state) -+static __always_inline void node_set_state(int node, enum node_states state) - { - __node_set(node, &node_states[state]); - } - --static inline void node_clear_state(int node, enum node_states state) -+static __always_inline void node_clear_state(int node, enum node_states state) - { - __node_clear(node, &node_states[state]); - } - --static inline int num_node_state(enum node_states state) -+static __always_inline int num_node_state(enum node_states state) - { - return nodes_weight(node_states[state]); - } -@@ -443,11 +443,11 @@ static inline int num_node_state(enum node_states state) - - #define first_online_node first_node(node_states[N_ONLINE]) - #define first_memory_node first_node(node_states[N_MEMORY]) --static inline unsigned int next_online_node(int nid) -+static __always_inline unsigned int next_online_node(int nid) - { - return next_node(nid, node_states[N_ONLINE]); - } --static inline unsigned int next_memory_node(int nid) -+static __always_inline unsigned int next_memory_node(int nid) - { - return next_node(nid, node_states[N_MEMORY]); - } -@@ -455,13 +455,13 @@ static inline unsigned int next_memory_node(int nid) - extern unsigned int nr_node_ids; - extern unsigned int nr_online_nodes; - --static inline void node_set_online(int nid) -+static __always_inline void node_set_online(int nid) - { - node_set_state(nid, N_ONLINE); - nr_online_nodes = num_node_state(N_ONLINE); - } - --static inline void node_set_offline(int nid) -+static __always_inline void node_set_offline(int nid) - { - node_clear_state(nid, N_ONLINE); - nr_online_nodes = num_node_state(N_ONLINE); -@@ -469,20 +469,20 @@ static inline void node_set_offline(int nid) - - #else - --static inline int node_state(int node, enum node_states state) -+static __always_inline int node_state(int node, enum node_states state) - { - return node == 0; - } - --static inline void node_set_state(int node, enum node_states state) -+static __always_inline void node_set_state(int node, enum node_states state) - { - } - --static inline void node_clear_state(int node, enum node_states state) -+static __always_inline void node_clear_state(int node, enum node_states state) - { - } - --static inline int num_node_state(enum node_states state) -+static __always_inline int num_node_state(enum node_states state) - { - return 1; - } -@@ -502,7 +502,7 @@ static inline int num_node_state(enum node_states state) - - #endif - --static inline int node_random(const nodemask_t *maskp) -+static 
__always_inline int node_random(const nodemask_t *maskp) - { - #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) - int w, bit; --- -2.40.0.rc2 - -From 5d1ae6ec70d7e64ac75501503e3dcf229e0942fb Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Sat, 11 Mar 2023 14:42:34 +0100 -Subject: [PATCH 04/16] cachy +Date: Sun, 9 Apr 2023 21:21:58 +0200 +Subject: [PATCH 03/10] cachy Signed-off-by: Peter Jung --- .gitignore | 1 + - .../admin-guide/kernel-parameters.txt | 11 +- + .../admin-guide/kernel-parameters.txt | 9 + Documentation/dontdiff | 1 + Makefile | 8 +- arch/arc/configs/axs101_defconfig | 1 + @@ -6954,7 +3356,7 @@ Signed-off-by: Peter Jung drivers/i2c/busses/i2c-nct6775.c | 647 ++++++++++++++++++ drivers/i2c/busses/i2c-piix4.c | 4 +- drivers/md/dm-crypt.c | 5 + - drivers/pci/quirks.c | 103 ++- + drivers/pci/quirks.c | 101 +++ drivers/platform/x86/Kconfig | 14 + drivers/platform/x86/Makefile | 3 + drivers/platform/x86/steamdeck.c | 523 ++++++++++++++ @@ -6966,15 +3368,9 @@ Signed-off-by: Peter Jung kernel/Kconfig.hz | 24 + kernel/fork.c | 14 + kernel/module/Kconfig | 25 + - kernel/rcu/Kconfig | 4 +- - kernel/rcu/rcutorture.c | 2 +- - kernel/rcu/tree.c | 6 +- - kernel/rcu/tree_nocb.h | 4 +- - kernel/rcu/tree_plugin.h | 4 +- kernel/sched/fair.c | 20 +- kernel/sysctl.c | 12 + kernel/user_namespace.c | 7 + - lib/string.c | 62 +- mm/Kconfig | 2 +- mm/compaction.c | 4 + mm/page-writeback.c | 8 + @@ -6986,16 +3382,16 @@ Signed-off-by: Peter Jung net/ipv4/tcp_ipv4.c | 2 + scripts/Makefile.lib | 13 +- scripts/Makefile.modinst | 7 +- - 61 files changed, 2200 insertions(+), 76 deletions(-) + 55 files changed, 2144 insertions(+), 46 deletions(-) create mode 100644 arch/x86/Makefile.postlink create mode 100644 drivers/i2c/busses/i2c-nct6775.c create mode 100644 drivers/platform/x86/steamdeck.c diff --git a/.gitignore b/.gitignore -index 20dce5c3b9e0..466c23de56ce 100644 +index 70ec6037fa7a..9bafd3c6bb5f 100644 --- a/.gitignore +++ b/.gitignore -@@ -63,6 +63,7 @@ modules.order +@@ -65,6 +65,7 @@ modules.order /vmlinux /vmlinux.32 /vmlinux.map @@ -7004,10 +3400,10 @@ index 20dce5c3b9e0..466c23de56ce 100644 /vmlinux-gdb.py /vmlinuz diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 6cfa6e3996cf..9595abf34974 100644 +index 6221a1d057dd..4f6761a93715 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -4178,6 +4178,15 @@ +@@ -4190,6 +4190,15 @@ nomsi [MSI] If the PCI_MSI kernel config parameter is enabled, this kernel boot option can be used to disable the use of MSI interrupts system-wide. @@ -7023,20 +3419,11 @@ index 6cfa6e3996cf..9595abf34974 100644 noioapicquirk [APIC] Disable all boot interrupt quirks. Safety option to keep boot IRQs enabled. This should never be necessary. -@@ -4751,7 +4760,7 @@ - overwritten. - - rcutree.kthread_prio= [KNL,BOOT] -- Set the SCHED_FIFO priority of the RCU per-CPU -+ Set the SCHED_RR priority of the RCU per-CPU - kthreads (rcuc/N). 
This value is also used for - the priority of the RCU boost threads (rcub/N) - and for the RCU grace-period kthreads (rcu_bh, diff --git a/Documentation/dontdiff b/Documentation/dontdiff -index 352ff53a2306..7c210744d84c 100644 +index 3c399f132e2d..a62ad01e6d11 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff -@@ -255,6 +255,7 @@ vmlinux.aout +@@ -254,6 +254,7 @@ vmlinux.aout vmlinux.bin.all vmlinux.lds vmlinux.map @@ -7045,10 +3432,10 @@ index 352ff53a2306..7c210744d84c 100644 vmlinuz voffset.h diff --git a/Makefile b/Makefile -index 1a1d63f2a9ed..9caed88238ab 100644 +index 5aeea3d98fc0..c6249845f6a1 100644 --- a/Makefile +++ b/Makefile -@@ -834,6 +834,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) +@@ -818,6 +818,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE KBUILD_CFLAGS += -O2 KBUILD_RUSTFLAGS += -Copt-level=2 @@ -7058,7 +3445,7 @@ index 1a1d63f2a9ed..9caed88238ab 100644 else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os KBUILD_RUSTFLAGS += -Copt-level=s -@@ -1075,11 +1078,6 @@ KBUILD_CFLAGS += -fno-strict-overflow +@@ -1060,11 +1063,6 @@ KBUILD_CFLAGS += -fno-strict-overflow # Make sure -fstack-check isn't enabled (like gentoo apparently did) KBUILD_CFLAGS += -fno-stack-check @@ -7749,7 +4136,7 @@ index 542377cd419d..08d887d1220d 100644 config IA32_FEAT_CTL def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile -index 73ed982d4100..cb4c6620b34a 100644 +index b39975977c03..00d94852490b 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -67,7 +67,7 @@ export BITS @@ -7813,7 +4200,7 @@ index 73ed982d4100..cb4c6620b34a 100644 diff --git a/arch/x86/Makefile.postlink b/arch/x86/Makefile.postlink new file mode 100644 -index 000000000000..b38ffa4defb3 +index 000000000000..195af937aa4d --- /dev/null +++ b/arch/x86/Makefile.postlink @@ -0,0 +1,41 @@ @@ -7829,7 +4216,7 @@ index 000000000000..b38ffa4defb3 +__archpost: + +-include include/config/auto.conf -+include scripts/Kbuild.include ++include $(srctree)/scripts/Kbuild.include + +CMD_RELOCS = arch/x86/tools/relocs +quiet_cmd_relocs = RELOCS $@.relocs @@ -7871,7 +4258,7 @@ index 25805199a506..b2968175fc27 100644 mkpiggy piggy.S diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile -index 1acff356d97a..d995595394bb 100644 +index 6b6cfe607bdb..19d1fb601796 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -121,14 +121,12 @@ $(obj)/vmlinux.bin: vmlinux FORCE @@ -7984,10 +4371,10 @@ index 75884d2cdec3..18021e8c0c28 100644 #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE diff --git a/drivers/Makefile b/drivers/Makefile -index bdf1c66141c9..1e1a0832fb48 100644 +index 20b118dca999..c19dee206e53 100644 --- a/drivers/Makefile +++ b/drivers/Makefile -@@ -59,15 +59,8 @@ obj-y += char/ +@@ -64,15 +64,8 @@ obj-y += char/ # iommu/ comes before gpu as gpu are using iommu controllers obj-y += iommu/ @@ -8003,7 +4390,7 @@ index bdf1c66141c9..1e1a0832fb48 100644 obj-$(CONFIG_PARPORT) += parport/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ -@@ -79,6 +72,14 @@ obj-y += macintosh/ +@@ -84,6 +77,14 @@ obj-y += macintosh/ obj-y += scsi/ obj-y += nvme/ obj-$(CONFIG_ATA) += ata/ @@ -8019,7 +4406,7 @@ index bdf1c66141c9..1e1a0832fb48 100644 obj-$(CONFIG_MTD) += mtd/ obj-$(CONFIG_SPI) += spi/ diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig -index a7bfddf08fa7..c9a5fa597950 100644 +index 
25eb4e8fd22f..2f95d74ad0b4 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -229,6 +229,15 @@ config I2C_CHT_WC @@ -8039,7 +4426,7 @@ index a7bfddf08fa7..c9a5fa597950 100644 tristate "Nvidia nForce2, nForce3 and nForce4" depends on PCI diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile -index e73cdb1d2b5a..052ccd05c13c 100644 +index af56fe2c75c0..76be74584719 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_I2C_CHT_WC) += i2c-cht-wc.o @@ -8722,10 +5109,10 @@ index 809fbd014cd6..d54b35b147ee 100644 /* If the SMBus is still busy, we give up */ if (timeout == MAX_TIMEOUT) { diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c -index 2653516bcdef..973fe8f80051 100644 +index 3ba53dc3cc3f..0fde1b3ced78 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c -@@ -3207,6 +3207,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) +@@ -3213,6 +3213,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } @@ -8738,7 +5125,7 @@ index 2653516bcdef..973fe8f80051 100644 if (ret < 0) goto bad; diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c -index 494fa46f5767..bcdfc072cbfb 100644 +index 44cab813bf95..25edf55de985 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3612,6 +3612,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) @@ -8848,18 +5235,16 @@ index 494fa46f5767..bcdfc072cbfb 100644 /* * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be * prevented for those affected devices. -@@ -5000,8 +5100,7 @@ static const struct pci_dev_acs_enabled { - { PCI_VENDOR_ID_NXP, 0x8d9b, pci_quirk_nxp_rp_acs }, - /* Zhaoxin Root/Downstream Ports */ +@@ -5002,6 +5102,7 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, -- /* Wangxun nics */ -- { PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs }, + /* Wangxun nics */ + { PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs }, + { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, { 0 } }; diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig -index 5692385e2d26..badc4f642ad2 100644 +index 4a01b315e0a9..e9ddf76b8b57 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1099,6 +1099,20 @@ config WINMATE_FM07_KEYS @@ -9424,10 +5809,10 @@ index 000000000000..77a6677ec19e +MODULE_DESCRIPTION("Steam Deck ACPI platform driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h -index 29e1f9e76eb6..a7852e22101f 100644 +index 0acb8e1fb7af..b0b49c8653b0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h -@@ -1178,7 +1178,7 @@ struct readahead_control { +@@ -1182,7 +1182,7 @@ struct readahead_control { ._index = i, \ } @@ -9489,7 +5874,7 @@ index 901b440238d5..7026df84a0f6 100644 TP_PROTO(const struct sock *sk, const struct request_sock *req), diff --git a/init/Kconfig b/init/Kconfig -index 44e90b28a30f..748a9491ca12 100644 +index 1fb5f313d18f..9b298860cfed 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -123,6 +123,10 @@ config THREAD_INFO_IN_TASK @@ -9546,7 +5931,7 @@ index 44e90b28a30f..748a9491ca12 100644 config PID_NS bool "PID Namespaces" default y -@@ -1420,6 +1453,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE +@@ -1433,6 +1466,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE with the "-O2" compiler flag for best performance and most helpful compile-time warnings. 
@@ -9602,7 +5987,7 @@ index 38ef6d06888e..0f78364efd4f 100644 config SCHED_HRTICK diff --git a/kernel/fork.c b/kernel/fork.c -index 9f7fe3541897..068062cdf5a3 100644 +index 0c92f224c68c..49c173e367d2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -98,6 +98,10 @@ @@ -9616,7 +6001,7 @@ index 9f7fe3541897..068062cdf5a3 100644 #include #include #include -@@ -2030,6 +2034,10 @@ static __latent_entropy struct task_struct *copy_process( +@@ -2031,6 +2035,10 @@ static __latent_entropy struct task_struct *copy_process( if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -9627,7 +6012,7 @@ index 9f7fe3541897..068062cdf5a3 100644 /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. -@@ -3180,6 +3188,12 @@ int ksys_unshare(unsigned long unshare_flags) +@@ -3181,6 +3189,12 @@ int ksys_unshare(unsigned long unshare_flags) if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; @@ -9676,105 +6061,8 @@ index 424b3bc58f3f..ecf2798c5ccf 100644 config MODULE_DECOMPRESS bool "Support in-kernel module decompression" depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD -diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig -index ab62074174c3..f1f909bdc30d 100644 ---- a/kernel/rcu/Kconfig -+++ b/kernel/rcu/Kconfig -@@ -280,9 +280,9 @@ config RCU_NOCB_CPU_CB_BOOST - depends on RCU_NOCB_CPU && RCU_BOOST - default y if PREEMPT_RT - help -- Use this option to invoke offloaded callbacks as SCHED_FIFO -+ Use this option to invoke offloaded callbacks as SCHED_RR - to avoid starvation by heavy SCHED_OTHER background load. -- Of course, running as SCHED_FIFO during callback floods will -+ Of course, running as SCHED_RR during callback floods will - cause the rcuo[ps] kthreads to monopolize the CPU for hundreds - of milliseconds or more. Therefore, when enabling this option, - it is your responsibility to ensure that latency-sensitive -diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c -index 634df26a2c27..8c54871cc0a0 100644 ---- a/kernel/rcu/rcutorture.c -+++ b/kernel/rcu/rcutorture.c -@@ -2406,7 +2406,7 @@ static int rcutorture_booster_init(unsigned int cpu) - t = per_cpu(ksoftirqd, cpu); - WARN_ON_ONCE(!t); - sp.sched_priority = 2; -- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); -+ sched_setscheduler_nocheck(t, SCHED_RR, &sp); - } - - /* Don't allow time recalculation while creating a new task. 
*/ -diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c -index cf34a961821a..80cf9824d461 100644 ---- a/kernel/rcu/tree.c -+++ b/kernel/rcu/tree.c -@@ -4443,8 +4443,8 @@ static void __init rcu_start_exp_gp_kworkers(void) - return; - } - -- sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); -- sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, -+ sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_RR, ¶m); -+ sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_RR, - ¶m); - } - -@@ -4482,7 +4482,7 @@ static int __init rcu_spawn_gp_kthread(void) - return 0; - if (kthread_prio) { - sp.sched_priority = kthread_prio; -- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); -+ sched_setscheduler_nocheck(t, SCHED_RR, &sp); - } - rnp = rcu_get_root(); - raw_spin_lock_irqsave_rcu_node(rnp, flags); -diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h -index 9e1c8caec5ce..dd39c50ae099 100644 ---- a/kernel/rcu/tree_nocb.h -+++ b/kernel/rcu/tree_nocb.h -@@ -1465,7 +1465,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) - } - WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); - if (kthread_prio) -- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); -+ sched_setscheduler_nocheck(t, SCHED_RR, &sp); - } - mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); - -@@ -1476,7 +1476,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) - goto end; - - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio) -- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); -+ sched_setscheduler_nocheck(t, SCHED_RR, &sp); - - WRITE_ONCE(rdp->nocb_cb_kthread, t); - WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); -diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h -index 7b0fe741a088..77ad9e033358 100644 ---- a/kernel/rcu/tree_plugin.h -+++ b/kernel/rcu/tree_plugin.h -@@ -1007,7 +1007,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) - struct sched_param sp; - - sp.sched_priority = kthread_prio; -- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); -+ sched_setscheduler_nocheck(current, SCHED_RR, &sp); - #endif /* #ifdef CONFIG_RCU_BOOST */ - - WRITE_ONCE(rdp->rcuc_activity, jiffies); -@@ -1206,7 +1206,7 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) - rnp->boost_kthread_task = t; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - sp.sched_priority = kthread_prio; -- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); -+ sched_setscheduler_nocheck(t, SCHED_RR, &sp); - wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. 
*/ - - out: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 0f8736991427..86a988c830ef 100644 +index 6986ea31c984..dcdd8422de72 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -69,9 +69,13 @@ @@ -9864,7 +6152,7 @@ index 1c240d2c99bc..98e1a7472fd2 100644 { .procname = "tainted", diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index 54211dbd516c..16ca0c151629 100644 +index 1d8e47bed3f1..fec01d016a35 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -22,6 +22,13 @@ @@ -9881,102 +6169,11 @@ index 54211dbd516c..16ca0c151629 100644 static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); -diff --git a/lib/string.c b/lib/string.c -index 4fb566ea610f..4746a98b153e 100644 ---- a/lib/string.c -+++ b/lib/string.c -@@ -792,24 +792,61 @@ char *strnstr(const char *s1, const char *s2, size_t len) - EXPORT_SYMBOL(strnstr); - #endif - -+#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 -+ -+#define MEMCHR_MASK_GEN(mask) (mask *= 0x0101010101010101ULL) -+ -+#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) -+ -+#define MEMCHR_MASK_GEN(mask) \ -+ do { \ -+ mask *= 0x01010101; \ -+ mask |= mask << 32; \ -+ } while (0) -+ -+#else -+ -+#define MEMCHR_MASK_GEN(mask) \ -+ do { \ -+ mask |= mask << 8; \ -+ mask |= mask << 16; \ -+ mask |= mask << 32; \ -+ } while (0) -+ -+#endif -+ - #ifndef __HAVE_ARCH_MEMCHR - /** - * memchr - Find a character in an area of memory. -- * @s: The memory area -+ * @p: The memory area - * @c: The byte to search for -- * @n: The size of the area. -+ * @length: The size of the area. - * - * returns the address of the first occurrence of @c, or %NULL - * if @c is not found - */ --void *memchr(const void *s, int c, size_t n) -+void *memchr(const void *p, int c, unsigned long length) - { -- const unsigned char *p = s; -- while (n-- != 0) { -- if ((unsigned char)c == *p++) { -- return (void *)(p - 1); -+ u64 mask, val; -+ const void *end = p + length; -+ -+ c &= 0xff; -+ if (p <= end - 8) { -+ mask = c; -+ MEMCHR_MASK_GEN(mask); -+ -+ for (; p <= end - 8; p += 8) { -+ val = *(u64 *)p ^ mask; -+ if ((val + 0xfefefefefefefeffu) & -+ (~val & 0x8080808080808080u)) -+ break; - } - } -+ -+ for (; p < end; p++) -+ if (*(unsigned char *)p == c) -+ return (void *)p; -+ - return NULL; - } - EXPORT_SYMBOL(memchr); -@@ -845,16 +882,7 @@ void *memchr_inv(const void *start, int c, size_t bytes) - return check_bytes8(start, value, bytes); - - value64 = value; --#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 -- value64 *= 0x0101010101010101ULL; --#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) -- value64 *= 0x01010101; -- value64 |= value64 << 32; --#else -- value64 |= value64 << 8; -- value64 |= value64 << 16; -- value64 |= value64 << 32; --#endif -+ MEMCHR_MASK_GEN(value64); - - prefix = (unsigned long)start % 8; - if (prefix) { diff --git a/mm/Kconfig b/mm/Kconfig -index ff7b209dec05..bf317c39ed2d 100644 +index 4751031f3f05..cf2e47030fe8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig -@@ -602,7 +602,7 @@ config COMPACTION +@@ -621,7 +621,7 @@ config COMPACTION config COMPACT_UNEVICTABLE_DEFAULT int depends on COMPACTION @@ -9986,10 +6183,10 @@ index ff7b209dec05..bf317c39ed2d 100644 # diff --git a/mm/compaction.c b/mm/compaction.c -index 8238e83385a7..d0b16a5b30f7 100644 +index 5a9501e0ae01..4d8c63b9cdca 100644 --- a/mm/compaction.c +++ b/mm/compaction.c -@@ -2717,7 +2717,11 @@ static void compact_nodes(void) +@@ -2735,7 +2735,11 @@ static void compact_nodes(void) * 
aggressively the kernel should compact memory in the * background. It takes values in the range [0, 100]. */ @@ -10002,7 +6199,7 @@ index 8238e83385a7..d0b16a5b30f7 100644 int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) diff --git a/mm/page-writeback.c b/mm/page-writeback.c -index ad608ef2a243..178cfd5490b1 100644 +index 516b1aa247e8..78fb31d27ed7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -71,7 +71,11 @@ static long ratelimit_pages = 32; @@ -10030,10 +6227,10 @@ index ad608ef2a243..178cfd5490b1 100644 EXPORT_SYMBOL_GPL(dirty_writeback_interval); diff --git a/mm/swap.c b/mm/swap.c -index 70e2063ef43a..79ab9b1c3910 100644 +index 57cb01b042f6..3a7bec75480f 100644 --- a/mm/swap.c +++ b/mm/swap.c -@@ -1134,6 +1134,10 @@ EXPORT_SYMBOL(pagevec_lookup_range_tag); +@@ -1090,6 +1090,10 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) */ void __init swap_setup(void) { @@ -10044,7 +6241,7 @@ index 70e2063ef43a..79ab9b1c3910 100644 unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ -@@ -1145,4 +1149,5 @@ void __init swap_setup(void) +@@ -1101,4 +1105,5 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ @@ -10067,10 +6264,10 @@ index b52644771cc4..11a4b0e3b583 100644 /* diff --git a/mm/vmscan.c b/mm/vmscan.c -index 5b7b8d4f5297..160acbbdf111 100644 +index 9c1c5e8b24b8..71a7f4517e5a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c -@@ -190,7 +190,11 @@ struct scan_control { +@@ -186,7 +186,11 @@ struct scan_control { /* * From 0 .. 200. Higher means more swappy. */ @@ -10082,7 +6279,7 @@ index 5b7b8d4f5297..160acbbdf111 100644 static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) -@@ -4559,7 +4563,11 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned +@@ -4536,7 +4540,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc } /* to protect the working set of the last N jiffies */ @@ -10174,10 +6371,10 @@ index 754e0212c951..b6d7faeb737a 100644 * drop receive data on the floor. It will get retransmitted * and hopefully then we'll have sufficient space. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c -index 8320d0ecb13a..37a09cd767a1 100644 +index ea370afa70ed..b869b6c1b226 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c -@@ -3274,6 +3274,8 @@ static int __net_init tcp_sk_init(struct net *net) +@@ -3275,6 +3275,8 @@ static int __net_init tcp_sk_init(struct net *net) else net->ipv4.tcp_congestion_control = &tcp_reno; @@ -10187,10 +6384,10 @@ index 8320d0ecb13a..37a09cd767a1 100644 } diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib -index 4a4a5f67c1a6..993e4578c0f2 100644 +index 100a386fcd71..a3ec7265fb57 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib -@@ -557,14 +557,21 @@ quiet_cmd_xzmisc = XZMISC $@ +@@ -542,14 +542,21 @@ quiet_cmd_xzmisc = XZMISC $@ # decompression is used, like initramfs decompression, zstd22 should likely not # be used because it would require zstd to allocate a 128 MB buffer. 
@@ -10216,7 +6413,7 @@ index 4a4a5f67c1a6..993e4578c0f2 100644 # ASM offsets # --------------------------------------------------------------------------- diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst -index 4815a8e32227..6a3c36713045 100644 +index ab0c5bd1a60f..f4989f706d7f 100644 --- a/scripts/Makefile.modinst +++ b/scripts/Makefile.modinst @@ -100,8 +100,13 @@ quiet_cmd_gzip = GZIP $@ @@ -10235,1350 +6432,12 @@ index 4815a8e32227..6a3c36713045 100644 $(dst)/%.ko.gz: $(dst)/%.ko FORCE $(call cmd,gzip) -- -2.40.0.rc2 +2.40.0 -From 0e45a02aaaa398cc0465a407331459f28cdb1ae9 Mon Sep 17 00:00:00 2001 +From 9b77615274a43646ad38d250d0c63be888c15bda Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 10 Mar 2023 18:00:48 +0100 -Subject: [PATCH 05/16] clr - -Signed-off-by: Peter Jung ---- - arch/x86/kernel/tsc.c | 3 + - arch/x86/mm/fault.c | 4 +- - drivers/cpufreq/intel_pstate.c | 7 + - drivers/idle/intel_idle.c | 50 ++-- - drivers/input/serio/i8042.c | 10 +- - drivers/net/dummy.c | 2 +- - drivers/pci/pci.c | 2 +- - drivers/powercap/intel_rapl_common.c | 2 +- - drivers/thermal/intel/intel_powerclamp.c | 10 + - fs/xattr.c | 15 +- - include/linux/jbd2.h | 2 +- - include/linux/rcuref.h | 89 +++++++ - include/linux/types.h | 6 + - include/linux/wait.h | 2 + - include/net/dst.h | 21 +- - include/net/sock.h | 2 +- - include/uapi/linux/if_bonding.h | 2 +- - init/do_mounts.c | 16 +- - kernel/locking/rwsem.c | 4 +- - kernel/sched/wait.c | 24 ++ - kernel/watchdog.c | 2 +- - lib/Makefile | 2 +- - lib/raid6/algos.c | 4 +- - lib/rcuref.c | 311 +++++++++++++++++++++++ - mm/ksm.c | 11 +- - net/bridge/br_nf_core.c | 2 +- - net/core/dst.c | 26 +- - net/core/rtnetlink.c | 2 +- - net/ipv4/inet_connection_sock.c | 2 +- - net/ipv4/tcp.c | 4 +- - net/ipv6/route.c | 6 +- - net/netfilter/ipvs/ip_vs_xmit.c | 4 +- - 32 files changed, 559 insertions(+), 90 deletions(-) - create mode 100644 include/linux/rcuref.h - create mode 100644 lib/rcuref.c - -diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c -index a78e73da4a74..bab8a98080cf 100644 ---- a/arch/x86/kernel/tsc.c -+++ b/arch/x86/kernel/tsc.c -@@ -1569,6 +1569,9 @@ unsigned long calibrate_delay_is_known(void) - if (!constant_tsc || !mask) - return 0; - -+ if (cpu != 0) -+ return cpu_data(0).loops_per_jiffy; -+ - sibling = cpumask_any_but(mask, cpu); - if (sibling < nr_cpu_ids) - return cpu_data(sibling).loops_per_jiffy; -diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c -index 7b0d4ab894c8..1a14f52added 100644 ---- a/arch/x86/mm/fault.c -+++ b/arch/x86/mm/fault.c -@@ -799,9 +799,9 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, - if (!printk_ratelimit()) - return; - -- printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", -+ printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx cpu %i", - loglvl, tsk->comm, task_pid_nr(tsk), address, -- (void *)regs->ip, (void *)regs->sp, error_code); -+ (void *)regs->ip, (void *)regs->sp, error_code, raw_smp_processor_id()); - - print_vma_addr(KERN_CONT " in ", regs->ip); - -diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c -index fd73d6d2b808..0c0071ab3966 100644 ---- a/drivers/cpufreq/intel_pstate.c -+++ b/drivers/cpufreq/intel_pstate.c -@@ -366,6 +366,13 @@ static void intel_pstate_set_itmt_prio(int cpu) - * update them at any time after it has been called. - */ - sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu); -+ /* -+ * On some systems with overclocking enabled, CPPC.highest_perf is hardcoded to 0xff. 
-+ * In this case we can't use CPPC.highest_perf to enable ITMT. -+ * In this case we can look at MSR_HWP_CAPABILITIES bits [8:0] to decide. -+ */ -+ if (cppc_perf.highest_perf == 0xff) -+ cppc_perf.highest_perf = HWP_HIGHEST_PERF(READ_ONCE(all_cpu_data[cpu]->hwp_cap_cached)); - - if (max_highest_perf <= min_highest_perf) { - if (cppc_perf.highest_perf > max_highest_perf) -diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c -index f060ac7376e6..1cd277c8f77f 100644 ---- a/drivers/idle/intel_idle.c -+++ b/drivers/idle/intel_idle.c -@@ -572,7 +572,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -580,7 +580,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 33, -- .target_residency = 100, -+ .target_residency = 900, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -588,7 +588,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 133, -- .target_residency = 400, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -596,7 +596,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x32", - .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 166, -- .target_residency = 500, -+ .target_residency = 1500, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -604,7 +604,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 300, -- .target_residency = 900, -+ .target_residency = 2000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -612,7 +612,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 600, -- .target_residency = 1800, -+ .target_residency = 5000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -620,7 +620,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 2600, -- .target_residency = 7700, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -640,7 +640,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -648,7 +648,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 40, -- .target_residency = 100, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -656,7 +656,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 133, -- .target_residency = 400, -+ .target_residency = 1000, - .enter = &intel_idle, 
- .enter_s2idle = intel_idle_s2idle, }, - { -@@ -664,7 +664,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x32", - .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 166, -- .target_residency = 500, -+ .target_residency = 2000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -672,7 +672,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 300, -- .target_residency = 900, -+ .target_residency = 4000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -680,7 +680,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 600, -- .target_residency = 1800, -+ .target_residency = 7000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -688,7 +688,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 2600, -- .target_residency = 7700, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -709,7 +709,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -717,7 +717,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 70, -- .target_residency = 100, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -725,7 +725,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, - .exit_latency = 85, -- .target_residency = 200, -+ .target_residency = 600, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -733,7 +733,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x33", - .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, - .exit_latency = 124, -- .target_residency = 800, -+ .target_residency = 3000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -741,7 +741,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, - .exit_latency = 200, -- .target_residency = 800, -+ .target_residency = 3200, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -749,7 +749,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, - .exit_latency = 480, -- .target_residency = 5000, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -757,7 +757,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, - .exit_latency = 890, -- .target_residency = 5000, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -778,7 +778,7 @@ static struct cpuidle_state 
skx_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 300, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -807,7 +807,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 4, -- .target_residency = 4, -+ .target_residency = 40, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -815,7 +815,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 170, -- .target_residency = 600, -+ .target_residency = 900, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -981,7 +981,7 @@ static struct cpuidle_state spr_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 2, -- .target_residency = 4, -+ .target_residency = 40, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c -index 6dac7c1853a5..fab04cd8a7a0 100644 ---- a/drivers/input/serio/i8042.c -+++ b/drivers/input/serio/i8042.c -@@ -621,7 +621,7 @@ static int i8042_enable_kbd_port(void) - if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { - i8042_ctr &= ~I8042_CTR_KBDINT; - i8042_ctr |= I8042_CTR_KBDDIS; -- pr_err("Failed to enable KBD port\n"); -+ pr_info("Failed to enable KBD port\n"); - return -EIO; - } - -@@ -640,7 +640,7 @@ static int i8042_enable_aux_port(void) - if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { - i8042_ctr &= ~I8042_CTR_AUXINT; - i8042_ctr |= I8042_CTR_AUXDIS; -- pr_err("Failed to enable AUX port\n"); -+ pr_info("Failed to enable AUX port\n"); - return -EIO; - } - -@@ -732,7 +732,7 @@ static int i8042_check_mux(void) - i8042_ctr &= ~I8042_CTR_AUXINT; - - if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { -- pr_err("Failed to disable AUX port, can't use MUX\n"); -+ pr_info("Failed to disable AUX port, can't use MUX\n"); - return -EIO; - } - -@@ -955,7 +955,7 @@ static int i8042_controller_selftest(void) - do { - - if (i8042_command(¶m, I8042_CMD_CTL_TEST)) { -- pr_err("i8042 controller selftest timeout\n"); -+ pr_info("i8042 controller selftest timeout\n"); - return -ENODEV; - } - -@@ -977,7 +977,7 @@ static int i8042_controller_selftest(void) - pr_info("giving up on controller selftest, continuing anyway...\n"); - return 0; - #else -- pr_err("i8042 controller selftest failed\n"); -+ pr_info("i8042 controller selftest failed\n"); - return -EIO; - #endif - } -diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c -index c4b1b0aa438a..06b00f7a8eab 100644 ---- a/drivers/net/dummy.c -+++ b/drivers/net/dummy.c -@@ -43,7 +43,7 @@ - - #define DRV_NAME "dummy" - --static int numdummies = 1; -+static int numdummies = 0; - - /* fake multicast ability */ - static void set_multicast_list(struct net_device *dev) -diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c -index 7a67611dc5f4..48b350fe09d8 100644 ---- a/drivers/pci/pci.c -+++ b/drivers/pci/pci.c -@@ -62,7 +62,7 @@ struct pci_pme_device { - struct pci_dev *dev; - }; - --#define PME_TIMEOUT 1000 /* How long between PME checks */ -+#define PME_TIMEOUT 4000 /* How long between PME checks */ - - static void pci_dev_d3_sleep(struct pci_dev *dev) - { -diff --git a/drivers/powercap/intel_rapl_common.c 
b/drivers/powercap/intel_rapl_common.c -index 26d00b1853b4..3e239d6548b5 100644 ---- a/drivers/powercap/intel_rapl_common.c -+++ b/drivers/powercap/intel_rapl_common.c -@@ -1518,7 +1518,7 @@ static int __init rapl_init(void) - - id = x86_match_cpu(rapl_ids); - if (!id) { -- pr_err("driver does not support CPU family %d model %d\n", -+ pr_info("driver does not support CPU family %d model %d\n", - boot_cpu_data.x86, boot_cpu_data.x86_model); - - return -ENODEV; -diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c -index 2f4cbfdf26a0..2d297a1cfa34 100644 ---- a/drivers/thermal/intel/intel_powerclamp.c -+++ b/drivers/thermal/intel/intel_powerclamp.c -@@ -636,6 +636,11 @@ static const struct thermal_cooling_device_ops powerclamp_cooling_ops = { - .set_cur_state = powerclamp_set_cur_state, - }; - -+static const struct x86_cpu_id amd_cpu[] = { -+ { X86_VENDOR_AMD }, -+ {}, -+}; -+ - static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = { - X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL), - {} -@@ -645,6 +650,11 @@ MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids); - static int __init powerclamp_probe(void) - { - -+ if (x86_match_cpu(amd_cpu)){ -+ pr_info("Intel PowerClamp does not support AMD CPUs\n"); -+ return -ENODEV; -+ } -+ - if (!x86_match_cpu(intel_powerclamp_ids)) { - pr_err("CPU does not support MWAIT\n"); - return -ENODEV; -diff --git a/fs/xattr.c b/fs/xattr.c -index adab9a70b536..4ada829a3b1b 100644 ---- a/fs/xattr.c -+++ b/fs/xattr.c -@@ -139,16 +139,17 @@ xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, - } - - /* -- * In the user.* namespace, only regular files and directories can have -- * extended attributes. For sticky directories, only the owner and -- * privileged users can write attributes. -+ * In the user.* namespace, only regular files, symbolic links, and -+ * directories can have extended attributes. For symbolic links and -+ * sticky directories, only the owner and privileged users can write -+ * attributes. - */ - if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { -- if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) -+ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode)) - return (mask & MAY_WRITE) ? -EPERM : -ENODATA; -- if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && -- (mask & MAY_WRITE) && -- !inode_owner_or_capable(mnt_userns, inode)) -+ if (((S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX)) -+ || S_ISLNK(inode->i_mode)) && (mask & MAY_WRITE) -+ && !inode_owner_or_capable(mnt_userns, inode)) - return -EPERM; - } - -diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h -index 2170e0cc279d..e8fa79f5bb34 100644 ---- a/include/linux/jbd2.h -+++ b/include/linux/jbd2.h -@@ -45,7 +45,7 @@ - /* - * The default maximum commit age, in seconds. 
- */ --#define JBD2_DEFAULT_MAX_COMMIT_AGE 5 -+#define JBD2_DEFAULT_MAX_COMMIT_AGE 30 - - #ifdef CONFIG_JBD2_DEBUG - /* -diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h -new file mode 100644 -index 000000000000..57ffb3c02ace ---- /dev/null -+++ b/include/linux/rcuref.h -@@ -0,0 +1,89 @@ -+/* SPDX-License-Identifier: GPL-2.0-only */ -+#ifndef _LINUX_RCUREF_H -+#define _LINUX_RCUREF_H -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#define RCUREF_NOREF 0x00000000 -+#define RCUREF_ONEREF 0x00000001 -+#define RCUREF_MAXREF 0x7FFFFFFF -+#define RCUREF_SATURATED 0xA0000000 -+#define RCUREF_RELEASED 0xC0000000 -+#define RCUREF_DEAD 0xE0000000 -+ -+/** -+ * rcuref_init - Initialize a rcuref reference count with the given reference count -+ * @ref: Pointer to the reference count -+ * @cnt: The initial reference count typically '1' -+ */ -+static inline void rcuref_init(rcuref_t *ref, unsigned int cnt) -+{ -+ atomic_set(&ref->refcnt, cnt); -+} -+ -+/** -+ * rcuref_read - Read the number of held reference counts of a rcuref -+ * @ref: Pointer to the reference count -+ * -+ * Return: The number of held references (0 ... N) -+ */ -+static inline unsigned int rcuref_read(rcuref_t *ref) -+{ -+ unsigned int c = atomic_read(&ref->refcnt); -+ -+ /* Return 0 if within the DEAD zone. */ -+ return c >= RCUREF_RELEASED ? 0 : c; -+} -+ -+extern __must_check bool rcuref_get_slowpath(rcuref_t *ref, unsigned int new); -+ -+/** -+ * rcuref_get - Acquire one reference on a rcuref reference count -+ * @ref: Pointer to the reference count -+ * -+ * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF. -+ * -+ * Provides no memory ordering, it is assumed the caller has guaranteed the -+ * object memory to be stable (RCU, etc.). It does provide a control dependency -+ * and thereby orders future stores. See documentation in lib/rcuref.c -+ * -+ * Return: -+ * False if the attempt to acquire a reference failed. This happens -+ * when the last reference has been put already -+ * -+ * True if a reference was successfully acquired -+ */ -+static inline __must_check bool rcuref_get(rcuref_t *ref) -+{ -+ /* -+ * Unconditionally increase the reference count. The saturation and -+ * dead zones provide enough tolerance for this. -+ */ -+ unsigned int old = atomic_fetch_add_relaxed(1, &ref->refcnt); -+ -+ /* -+ * If the old value is less than RCUREF_MAXREF, this is a valid -+ * reference. -+ * -+ * In case the original value was RCUREF_NOREF the above -+ * unconditional increment raced with a concurrent put() operation -+ * dropping the last reference. That racing put() operation -+ * subsequently fails to mark the reference count dead because the -+ * count is now elevated again and the concurrent caller is -+ * therefore not allowed to deconstruct the object. 
-+ */ -+ if (likely(old < RCUREF_MAXREF)) -+ return true; -+ -+ /* Handle the cases inside the saturation and dead zones */ -+ return rcuref_get_slowpath(ref, old); -+} -+ -+extern __must_check bool rcuref_put(rcuref_t *ref); -+ -+#endif -diff --git a/include/linux/types.h b/include/linux/types.h -index ea8cf60a8a79..419baa980529 100644 ---- a/include/linux/types.h -+++ b/include/linux/types.h -@@ -175,6 +175,12 @@ typedef struct { - } atomic64_t; - #endif - -+typedef struct { -+ atomic_t refcnt; -+} rcuref_t; -+ -+#define RCUREF_INIT(i) { .refcnt = ATOMIC_INIT(i) } -+ - struct list_head { - struct list_head *next, *prev; - }; -diff --git a/include/linux/wait.h b/include/linux/wait.h -index a0307b516b09..edc21128f387 100644 ---- a/include/linux/wait.h -+++ b/include/linux/wait.h -@@ -165,6 +165,7 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) - - extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); - extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); -+extern void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); - extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); - extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); - -@@ -1192,6 +1193,7 @@ do { \ - */ - void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); - bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); -+void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); - long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); - void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); - long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); -diff --git a/include/net/dst.h b/include/net/dst.h -index d67fda89cd0f..0909a3306902 100644 ---- a/include/net/dst.h -+++ b/include/net/dst.h -@@ -16,6 +16,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -65,19 +66,29 @@ struct dst_entry { - * input/output/ops or performance tanks badly - */ - #ifdef CONFIG_64BIT -- atomic_t __refcnt; /* 64-bit offset 64 */ -+ rcuref_t __refcnt; /* 64-bit offset 64 */ - #endif - int __use; - unsigned long lastuse; -- struct lwtunnel_state *lwtstate; - struct rcu_head rcu_head; - short error; - short __pad; - __u32 tclassid; - #ifndef CONFIG_64BIT -- atomic_t __refcnt; /* 32-bit offset 64 */ -+ struct lwtunnel_state *lwtstate; -+ rcuref_t __refcnt; /* 32-bit offset 64 */ - #endif - netdevice_tracker dev_tracker; -+#ifdef CONFIG_64BIT -+ /* -+ * Ensure that lwtstate is not in the same cache line as __refcnt, -+ * because that would lead to false sharing under high contention -+ * of __refcnt. This also ensures that rtable::rt_genid is not -+ * sharing the same cache-line. 
-+ */ -+ int pad2[6]; -+ struct lwtunnel_state *lwtstate; -+#endif - }; - - struct dst_metrics { -@@ -228,7 +239,7 @@ static inline void dst_hold(struct dst_entry *dst) - * the placement of __refcnt in struct dst_entry - */ - BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); -- WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); -+ WARN_ON(!rcuref_get(&dst->__refcnt)); - } - - static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) -@@ -292,7 +303,7 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb - */ - static inline bool dst_hold_safe(struct dst_entry *dst) - { -- return atomic_inc_not_zero(&dst->__refcnt); -+ return rcuref_get(&dst->__refcnt); - } - - /** -diff --git a/include/net/sock.h b/include/net/sock.h -index c6584a352463..dbf85161c0c7 100644 ---- a/include/net/sock.h -+++ b/include/net/sock.h -@@ -2159,7 +2159,7 @@ sk_dst_get(struct sock *sk) - - rcu_read_lock(); - dst = rcu_dereference(sk->sk_dst_cache); -- if (dst && !atomic_inc_not_zero(&dst->__refcnt)) -+ if (dst && !rcuref_get(&dst->__refcnt)) - dst = NULL; - rcu_read_unlock(); - return dst; -diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h -index d174914a837d..bf8e2af101a3 100644 ---- a/include/uapi/linux/if_bonding.h -+++ b/include/uapi/linux/if_bonding.h -@@ -82,7 +82,7 @@ - #define BOND_STATE_ACTIVE 0 /* link is active */ - #define BOND_STATE_BACKUP 1 /* link is backup */ - --#define BOND_DEFAULT_MAX_BONDS 1 /* Default maximum number of devices to support */ -+#define BOND_DEFAULT_MAX_BONDS 0 /* Default maximum number of devices to support */ - - #define BOND_DEFAULT_TX_QUEUES 16 /* Default number of tx queues per device */ - -diff --git a/init/do_mounts.c b/init/do_mounts.c -index 811e94daf0a8..06fef7f97c02 100644 ---- a/init/do_mounts.c -+++ b/init/do_mounts.c -@@ -283,8 +283,18 @@ dev_t name_to_dev_t(const char *name) - if (strcmp(name, "/dev/ram") == 0) - return Root_RAM0; - #ifdef CONFIG_BLOCK -- if (strncmp(name, "PARTUUID=", 9) == 0) -- return devt_from_partuuid(name + 9); -+ if (strncmp(name, "PARTUUID=", 9) == 0) { -+ dev_t res; -+ int needtowait = 40<<1; -+ res = devt_from_partuuid(name + 9); -+ while (!res && needtowait) { -+ /* waiting 0.5 sec */ -+ msleep(500); -+ res = devt_from_partuuid(name + 9); -+ needtowait--; -+ } -+ return res; -+ } - if (strncmp(name, "PARTLABEL=", 10) == 0) - return devt_from_partlabel(name + 10); - if (strncmp(name, "/dev/", 5) == 0) -@@ -612,7 +622,9 @@ void __init prepare_namespace(void) - * For example, it is not atypical to wait 5 seconds here - * for the touchpad of a laptop to initialize. 
- */ -+ async_synchronize_full(); - wait_for_device_probe(); -+ async_synchronize_full(); - - md_run_setup(); - -diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c -index 84d5b649b95f..e341ca8731f7 100644 ---- a/kernel/locking/rwsem.c -+++ b/kernel/locking/rwsem.c -@@ -754,6 +754,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) - struct task_struct *new, *owner; - unsigned long flags, new_flags; - enum owner_state state; -+ int i = 0; - - lockdep_assert_preemption_disabled(); - -@@ -790,7 +791,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) - break; - } - -- cpu_relax(); -+ if (i++ > 1000) -+ cpu_relax(); - } - - return state; -diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c -index 133b74730738..1647fb8662eb 100644 ---- a/kernel/sched/wait.c -+++ b/kernel/sched/wait.c -@@ -47,6 +47,17 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_ - } - EXPORT_SYMBOL_GPL(add_wait_queue_priority); - -+void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) -+{ -+ unsigned long flags; -+ -+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE; -+ spin_lock_irqsave(&wq_head->lock, flags); -+ __add_wait_queue(wq_head, wq_entry); -+ spin_unlock_irqrestore(&wq_head->lock, flags); -+} -+EXPORT_SYMBOL(add_wait_queue_exclusive_lifo); -+ - void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) - { - unsigned long flags; -@@ -293,6 +304,19 @@ prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_ent - } - EXPORT_SYMBOL(prepare_to_wait_exclusive); - -+void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) -+{ -+ unsigned long flags; -+ -+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE; -+ spin_lock_irqsave(&wq_head->lock, flags); -+ if (list_empty(&wq_entry->entry)) -+ __add_wait_queue(wq_head, wq_entry); -+ set_current_state(state); -+ spin_unlock_irqrestore(&wq_head->lock, flags); -+} -+EXPORT_SYMBOL(prepare_to_wait_exclusive_lifo); -+ - void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) - { - wq_entry->flags = flags; -diff --git a/kernel/watchdog.c b/kernel/watchdog.c -index 8e61f21e7e33..be1439d38f26 100644 ---- a/kernel/watchdog.c -+++ b/kernel/watchdog.c -@@ -41,7 +41,7 @@ unsigned long __read_mostly watchdog_enabled; - int __read_mostly watchdog_user_enabled = 1; - int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; - int __read_mostly soft_watchdog_user_enabled = 1; --int __read_mostly watchdog_thresh = 10; -+int __read_mostly watchdog_thresh = 40; - static int __read_mostly nmi_watchdog_available; - - struct cpumask watchdog_cpumask __read_mostly; -diff --git a/lib/Makefile b/lib/Makefile -index 4d9461bfea42..71c9627153b8 100644 ---- a/lib/Makefile -+++ b/lib/Makefile -@@ -47,7 +47,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \ - list_sort.o uuid.o iov_iter.o clz_ctz.o \ - bsearch.o find_bit.o llist.o memweight.o kfifo.o \ - percpu-refcount.o rhashtable.o base64.o \ -- once.o refcount.o usercopy.o errseq.o bucket_locks.o \ -+ once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \ - generic-radix-tree.o - obj-$(CONFIG_STRING_SELFTEST) += test_string.o - obj-y += string_helpers.o -diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c -index a22a05c9af8a..a70bcbbd1673 100644 ---- a/lib/raid6/algos.c -+++ b/lib/raid6/algos.c -@@ -126,8 +126,10 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void) - - for (best = NULL, algo = raid6_recov_algos; 
*algo; algo++) - if (!best || (*algo)->priority > best->priority) -- if (!(*algo)->valid || (*algo)->valid()) -+ if (!(*algo)->valid || (*algo)->valid()) { - best = *algo; -+ break; -+ } - - if (best) { - raid6_2data_recov = best->data2; -diff --git a/lib/rcuref.c b/lib/rcuref.c -new file mode 100644 -index 000000000000..34fa40618fca ---- /dev/null -+++ b/lib/rcuref.c -@@ -0,0 +1,311 @@ -+// SPDX-License-Identifier: GPL-2.0-only -+ -+/* -+ * rcuref - A scalable reference count implementation for RCU managed objects -+ * -+ * rcuref is provided to replace open coded reference count implementations -+ * based on atomic_t. It protects explicitely RCU managed objects which can -+ * be visible even after the last reference has been dropped and the object -+ * is heading towards destruction. -+ * -+ * A common usage pattern is: -+ * -+ * get() -+ * rcu_read_lock(); -+ * p = get_ptr(); -+ * if (p && !atomic_inc_not_zero(&p->refcnt)) -+ * p = NULL; -+ * rcu_read_unlock(); -+ * return p; -+ * -+ * put() -+ * if (!atomic_dec_return(&->refcnt)) { -+ * remove_ptr(p); -+ * kfree_rcu((p, rcu); -+ * } -+ * -+ * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has -+ * O(N^2) behaviour under contention with N concurrent operations. -+ * -+ * rcuref uses atomic_fetch_add_relaxed() and atomic_fetch_sub_release() -+ * for the fast path, which scale better under contention. -+ * -+ * Why not refcount? -+ * ================= -+ * -+ * In principle it should be possible to make refcount use the rcuref -+ * scheme, but the destruction race described below cannot be prevented -+ * unless the protected object is RCU managed. -+ * -+ * Theory of operation -+ * =================== -+ * -+ * rcuref uses an unsigned integer reference counter. As long as the -+ * counter value is greater than or equal to RCUREF_ONEREF and not larger -+ * than RCUREF_MAXREF the reference is alive: -+ * -+ * NOREF ONEREF MAXREF SATURATED RELEASED DEAD -+ * 0 1 0x7FFFFFFF 0x8000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF -+ * <---valid ------------> <-------saturation zone-------> <-----------dead zone----------> -+ * -+ * The get() and put() operations do unconditional increments and -+ * decrements. The result is checked after the operation. This optimizes -+ * for the fast path. -+ * -+ * If the reference count is saturated or dead, then the increments and -+ * decrements are not harmful as the reference count still stays in the -+ * respective zones and is always set back to STATURATED resp. DEAD. The -+ * zones have room for 2^28 racing operations in each direction, which -+ * makes it practically impossible to escape the zones. -+ * -+ * Once the last reference is dropped the reference count becomes -+ * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The -+ * slowpath then tries to set the reference count from RCUREF_NOREF to -+ * RCUREF_DEAD via a cmpxchg(). This opens a small window where a -+ * concurrent rcuref_get() can acquire the reference count and bring it -+ * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD. -+ * -+ * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in -+ * DEAD + 1, which is inside the dead zone. If that happens the reference -+ * count is put back to DEAD. 
-+ * -+ * The actual race is possible due to the unconditional increment and -+ * decrements in rcuref_get() and rcuref_put(): -+ * -+ * T1 T2 -+ * get() put() -+ * if (atomic_fetch_sub(1, &ref->refcnt) >= 0) -+ * succeeds-> atomic_try_cmpxchg(&ref->refcnt, -1, DEAD); -+ * -+ * old = atomic_fetch_add(1, &ref->refcnt); <- Elevates refcount to DEAD + 1 -+ * -+ * As @old observed by T1 is within the dead zone the T1 get() fails. -+ * -+ * Possible critical states: -+ * -+ * Context Counter References Operation -+ * T1 1 1 init() -+ * T2 2 2 get() -+ * T1 1 1 put() -+ * T2 0 0 put() tries to mark dead -+ * T1 1 1 get() -+ * T2 1 1 put() mark dead fails -+ * T1 0 0 put() tries to mark dead -+ * T1 DEAD 0 put() mark dead succeeds -+ * T2 DEAD+1 0 get() fails and puts it back to DEAD -+ * -+ * Of course there are more complex scenarios, but the above illustrates -+ * the working principle. The rest is left to the imagination of the -+ * reader. -+ * -+ * Deconstruction race -+ * =================== -+ * -+ * The release operation must be protected by prohibiting a grace period in -+ * order to prevent a possible use after free: -+ * -+ * T1 T2 -+ * put() get() -+ * // ref->refcnt = ONEREF -+ * if (atomic_fetch_sub(1, &ref->cnt) > ONEREF) -+ * return false; <- Not taken -+ * -+ * // ref->refcnt == NOREF -+ * --> preemption -+ * // Elevates ref->c to ONEREF -+ * if (!atomic_fetch_add(1, &ref->refcnt) >= NOREF) -+ * return true; <- taken -+ * -+ * if (put(&p->ref)) { <-- Succeeds -+ * remove_pointer(p); -+ * kfree_rcu(p, rcu); -+ * } -+ * -+ * RCU grace period ends, object is freed -+ * -+ * atomic_cmpxchg(&ref->refcnt, NONE, DEAD); <- UAF -+ * -+ * This is prevented by disabling preemption around the put() operation as -+ * that's in most kernel configurations cheaper than a rcu_read_lock() / -+ * rcu_read_unlock() pair and in many cases even a NOOP. In any case it -+ * prevents the grace period which keeps the object alive until all put() -+ * operations complete. -+ * -+ * Saturation protection -+ * ===================== -+ * -+ * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX). -+ * Once this is exceedded the reference count becomes stale by setting it -+ * to RCUREF_SATURATED, which will cause a memory leak, but it prevents -+ * wrap arounds which obviously cause worse problems than a memory -+ * leak. When saturation is reached a warning is emitted. -+ * -+ * Race conditions -+ * =============== -+ * -+ * All reference count increment/decrement operations are unconditional and -+ * only verified after the fact. This optimizes for the good case and takes -+ * the occasional race vs. a dead or already saturated refcount into -+ * account. The saturation and dead zones are large enough to accomodate -+ * for that. -+ * -+ * Memory ordering -+ * =============== -+ * -+ * Memory ordering rules are slightly relaxed wrt regular atomic_t functions -+ * and provide only what is strictly required for refcounts. -+ * -+ * The increments are fully relaxed; these will not provide ordering. The -+ * rationale is that whatever is used to obtain the object to increase the -+ * reference count on will provide the ordering. For locked data -+ * structures, its the lock acquire, for RCU/lockless data structures its -+ * the dependent load. -+ * -+ * rcuref_get() provides a control dependency ordering future stores which -+ * ensures that the object is not modified when acquiring a reference -+ * fails. -+ * -+ * rcuref_put() provides release order, i.e. 
all prior loads and stores -+ * will be issued before. It also provides a control dependency ordering -+ * against the subsequent destruction of the object. -+ * -+ * If rcuref_put() successfully dropped the last reference and marked the -+ * object DEAD it also provides acquire ordering. -+ */ -+ -+#include -+#include -+ -+/** -+ * rcuref_get_slowpath - Slowpath of rcuref_get() -+ * @ref: Pointer to the reference count -+ * @old: The reference count before the unconditional increment -+ * operation in rcuref_get() -+ * -+ * Invoked when the reference count is outside of the valid zone. -+ * -+ * Return: -+ * False if the reference count was already marked dead -+ * -+ * True if the reference count is saturated, which prevents the -+ * object from being deconstructed ever. -+ */ -+bool rcuref_get_slowpath(rcuref_t *ref, unsigned int old) -+{ -+ /* -+ * If the reference count was already marked dead, undo the -+ * increment so it stays in the middle of the dead zone and return -+ * fail. -+ */ -+ if (old >= RCUREF_RELEASED) { -+ atomic_set(&ref->refcnt, RCUREF_DEAD); -+ return false; -+ } -+ -+ /* -+ * If it was saturated, warn and mark it so. In case the increment -+ * was already on a saturated value restore the saturation -+ * marker. This keeps it in the middle of the saturation zone and -+ * prevents the reference count from overflowing. This leaks the -+ * object memory, but prevents the obvious reference count overflow -+ * damage. -+ */ -+ WARN_ONCE(old >= RCUREF_MAXREF, "rcuref saturated - leaking memory"); -+ atomic_set(&ref->refcnt, RCUREF_SATURATED); -+ return true; -+} -+EXPORT_SYMBOL_GPL(rcuref_get_slowpath); -+ -+static __must_check bool __rcuref_put(rcuref_t *ref) -+{ -+ /* -+ * Unconditionally decrement the reference count. The saturation and -+ * dead zones provide enough tolerance for this. -+ */ -+ unsigned int old = atomic_fetch_sub_release(1, &ref->refcnt); -+ -+ /* -+ * If the old value is in the valid range and is greater than -+ * RCUREF_ONEREF, nothing to do. -+ */ -+ if (likely(old > RCUREF_ONEREF && old <= RCUREF_MAXREF)) -+ return false; -+ -+ /* Did this drop the last reference? */ -+ if (likely(old == RCUREF_ONEREF)) { -+ /* -+ * Carefully try to set the reference count to RCUREF_DEAD. -+ * -+ * This can fail if a concurrent get() operation has -+ * elevated it again or the corresponding put() even marked -+ * it dead already. Both are valid situations and do not -+ * require a retry. If this fails the caller is not -+ * allowed to deconstruct the object. -+ */ -+ if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF) -+ return false; -+ -+ /* -+ * The caller can safely schedule the object for -+ * deconstruction. Provide acquire ordering. -+ */ -+ smp_acquire__after_ctrl_dep(); -+ return true; -+ } -+ -+ /* -+ * If the reference count was already in the dead zone, then this -+ * put() operation is imbalanced. Warn, put the reference count back to -+ * DEAD and tell the caller to not deconstruct the object. -+ */ -+ if (WARN_ONCE(old >= RCUREF_RELEASED, "rcuref - imbalanced put()")) { -+ atomic_set(&ref->refcnt, RCUREF_DEAD); -+ return false; -+ } -+ -+ /* -+ * This is a put() operation on a saturated refcount. Restore the -+ * mean saturation value and tell the caller to not deconstruct the -+ * object. 
-+ */ -+ atomic_set(&ref->refcnt, RCUREF_SATURATED); -+ return false; -+} -+ -+/** -+ * rcuref_put -- Release one reference for a rcuref reference count -+ * @ref: Pointer to the reference count -+ * -+ * Can be invoked from any context. -+ * -+ * Provides release memory ordering, such that prior loads and stores are done -+ * before, and provides an acquire ordering on success such that free() -+ * must come after. -+ * -+ * Return: -+ * -+ * True if this was the last reference with no future references -+ * possible. This signals the caller that it can safely schedule the -+ * object, which is protected by the reference counter, for -+ * deconstruction. -+ * -+ * False if there are still active references or the put() raced -+ * with a concurrent get()/put() pair. Caller is not allowed to -+ * deconstruct the protected object. -+ */ -+bool rcuref_put(rcuref_t *ref) -+{ -+ bool released; -+ -+ /* -+ * Protect against a concurrent get()/put() pair which marks the -+ * reference count DEAD and schedules it for RCU free. This -+ * prevents a grace period and is cheaper than -+ * rcu_read_lock()/unlock(). -+ */ -+ preempt_disable(); -+ released = __rcuref_put(ref); -+ preempt_enable(); -+ return released; -+} -+EXPORT_SYMBOL_GPL(rcuref_put); -diff --git a/mm/ksm.c b/mm/ksm.c -index addf490da146..a92c9594a2d3 100644 ---- a/mm/ksm.c -+++ b/mm/ksm.c -@@ -2454,9 +2454,14 @@ static int ksm_scan_thread(void *nothing) - - if (ksmd_should_run()) { - sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); -- wait_event_interruptible_timeout(ksm_iter_wait, -- sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), -- msecs_to_jiffies(sleep_ms)); -+ if (sleep_ms >= 1000) -+ wait_event_interruptible_timeout(ksm_iter_wait, -+ sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), -+ msecs_to_jiffies(round_jiffies_relative(sleep_ms))); -+ else -+ wait_event_interruptible_timeout(ksm_iter_wait, -+ sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), -+ msecs_to_jiffies(sleep_ms)); - } else { - wait_event_freezable(ksm_thread_wait, - ksmd_should_run() || kthread_should_stop()); -diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c -index 8c69f0c95a8e..c2b628e3cc7f 100644 ---- a/net/bridge/br_nf_core.c -+++ b/net/bridge/br_nf_core.c -@@ -73,7 +73,7 @@ void br_netfilter_rtable_init(struct net_bridge *br) - { - struct rtable *rt = &br->fake_rtable; - -- atomic_set(&rt->dst.__refcnt, 1); -+ rcuref_init(&rt->dst.__refcnt, 1); - rt->dst.dev = br->dev; - dst_init_metrics(&rt->dst, br_dst_default_metrics, true); - rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; -diff --git a/net/core/dst.c b/net/core/dst.c -index 6d2dd03dafa8..750440803883 100644 ---- a/net/core/dst.c -+++ b/net/core/dst.c -@@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, - dst->tclassid = 0; - #endif - dst->lwtstate = NULL; -- atomic_set(&dst->__refcnt, initial_ref); -+ rcuref_init(&dst->__refcnt, initial_ref); - dst->__use = 0; - dst->lastuse = jiffies; - dst->flags = flags; -@@ -166,31 +166,15 @@ EXPORT_SYMBOL(dst_dev_put); - - void dst_release(struct dst_entry *dst) - { -- if (dst) { -- int newrefcnt; -- -- newrefcnt = atomic_dec_return(&dst->__refcnt); -- if (WARN_ONCE(newrefcnt < 0, "dst_release underflow")) -- net_warn_ratelimited("%s: dst:%p refcnt:%d\n", -- __func__, dst, newrefcnt); -- if (!newrefcnt) -- call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); -- } -+ if (dst && rcuref_put(&dst->__refcnt)) -+ call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); - } - EXPORT_SYMBOL(dst_release); - - void dst_release_immediate(struct 
dst_entry *dst) - { -- if (dst) { -- int newrefcnt; -- -- newrefcnt = atomic_dec_return(&dst->__refcnt); -- if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow")) -- net_warn_ratelimited("%s: dst:%p refcnt:%d\n", -- __func__, dst, newrefcnt); -- if (!newrefcnt) -- dst_destroy(dst); -- } -+ if (dst && rcuref_put(&dst->__refcnt)) -+ dst_destroy(dst); - } - EXPORT_SYMBOL(dst_release_immediate); - -diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c -index 64289bc98887..228c54bbdecc 100644 ---- a/net/core/rtnetlink.c -+++ b/net/core/rtnetlink.c -@@ -840,7 +840,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, - if (dst) { - ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); - ci.rta_used = dst->__use; -- ci.rta_clntref = atomic_read(&dst->__refcnt); -+ ci.rta_clntref = rcuref_read(&dst->__refcnt); - } - if (expires) { - unsigned long clock; -diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c -index f2c43f67187d..9885bfb429a2 100644 ---- a/net/ipv4/inet_connection_sock.c -+++ b/net/ipv4/inet_connection_sock.c -@@ -606,7 +606,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) - * having to remove and re-insert us on the wait queue. - */ - for (;;) { -- prepare_to_wait_exclusive(sk_sleep(sk), &wait, -+ prepare_to_wait_exclusive_lifo(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); - release_sock(sk); - if (reqsk_queue_empty(&icsk->icsk_accept_queue)) -diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index e9e8040d6491..f9b56123b3b8 100644 ---- a/net/ipv4/tcp.c -+++ b/net/ipv4/tcp.c -@@ -4815,8 +4815,8 @@ void __init tcp_init(void) - tcp_init_mem(); - /* Set per-socket limits to no more than 1/128 the pressure threshold */ - limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); -- max_wshare = min(4UL*1024*1024, limit); -- max_rshare = min(6UL*1024*1024, limit); -+ max_wshare = min(16UL*1024*1024, limit); -+ max_rshare = min(16UL*1024*1024, limit); - - init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; - init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; -diff --git a/net/ipv6/route.c b/net/ipv6/route.c -index a6983a13dd20..8b5e3d57b08d 100644 ---- a/net/ipv6/route.c -+++ b/net/ipv6/route.c -@@ -293,7 +293,7 @@ static const struct fib6_info fib6_null_entry_template = { - - static const struct rt6_info ip6_null_entry_template = { - .dst = { -- .__refcnt = ATOMIC_INIT(1), -+ .__refcnt = RCUREF_INIT(1), - .__use = 1, - .obsolete = DST_OBSOLETE_FORCE_CHK, - .error = -ENETUNREACH, -@@ -307,7 +307,7 @@ static const struct rt6_info ip6_null_entry_template = { - - static const struct rt6_info ip6_prohibit_entry_template = { - .dst = { -- .__refcnt = ATOMIC_INIT(1), -+ .__refcnt = RCUREF_INIT(1), - .__use = 1, - .obsolete = DST_OBSOLETE_FORCE_CHK, - .error = -EACCES, -@@ -319,7 +319,7 @@ static const struct rt6_info ip6_prohibit_entry_template = { - - static const struct rt6_info ip6_blk_hole_entry_template = { - .dst = { -- .__refcnt = ATOMIC_INIT(1), -+ .__refcnt = RCUREF_INIT(1), - .__use = 1, - .obsolete = DST_OBSOLETE_FORCE_CHK, - .error = -EINVAL, -diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c -index 029171379884..bc9dc51828f7 100644 ---- a/net/netfilter/ipvs/ip_vs_xmit.c -+++ b/net/netfilter/ipvs/ip_vs_xmit.c -@@ -339,7 +339,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, - spin_unlock_bh(&dest->dst_lock); - IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", - &dest->addr.ip, &dest_dst->dst_saddr.ip, -- atomic_read(&rt->dst.__refcnt)); -+ 
rcuref_read(&rt->dst.__refcnt)); - } - if (ret_saddr) - *ret_saddr = dest_dst->dst_saddr.ip; -@@ -507,7 +507,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, - spin_unlock_bh(&dest->dst_lock); - IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", - &dest->addr.in6, &dest_dst->dst_saddr.in6, -- atomic_read(&rt->dst.__refcnt)); -+ rcuref_read(&rt->dst.__refcnt)); - } - if (ret_saddr) - *ret_saddr = dest_dst->dst_saddr.in6; --- -2.40.0.rc2 - -From ed2979f1636e3197b42234c8acac4d20f4e2ed8e Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Fri, 10 Mar 2023 18:03:29 +0100 -Subject: [PATCH 06/16] fixes +Date: Sun, 9 Apr 2023 21:22:15 +0200 +Subject: [PATCH 04/10] fixes Signed-off-by: Peter Jung --- @@ -11587,42 +6446,27 @@ Signed-off-by: Peter Jung Documentation/admin-guide/mm/ksm.rst | 7 + Documentation/leds/index.rst | 1 + Documentation/leds/ledtrig-blkdev.rst | 158 +++ - arch/x86/boot/compressed/Makefile | 2 +- - arch/x86/events/rapl.c | 20 +- - arch/x86/kernel/cpu/amd.c | 9 + - arch/x86/mm/tlb.c | 2 +- - arch/x86/net/bpf_jit_comp.c | 5 +- drivers/bluetooth/btusb.c | 2 +- - drivers/char/tpm/tpm-chip.c | 60 +- - drivers/char/tpm/tpm.h | 73 + drivers/leds/trigger/Kconfig | 9 + drivers/leds/trigger/Makefile | 1 + drivers/leds/trigger/ledtrig-blkdev.c | 1221 +++++++++++++++++ - fs/eventpoll.c | 2 +- + fs/eventpoll.c | 188 ++- fs/proc/base.c | 1 + include/linux/mm_types.h | 7 +- include/linux/pageblock-flags.h | 2 +- kernel/kheaders.c | 10 +- - kernel/kthread.c | 5 + kernel/padata.c | 4 +- - lib/string.c | 10 +- - lib/zstd/decompress/huf_decompress.c | 2 +- - mm/compaction.c | 75 +- - mm/internal.h | 6 +- - mm/ksm.c | 196 ++- - mm/page_alloc.c | 22 +- - mm/z3fold.c | 2 - - mm/zsmalloc.c | 3 - + mm/ksm.c | 185 ++- scripts/Makefile.vmlinux_o | 2 +- sound/pci/hda/cs35l41_hda.c | 2 +- - .../selftests/vm/ksm_functional_tests.c | 96 +- - 34 files changed, 1995 insertions(+), 110 deletions(-) + .../selftests/mm/ksm_functional_tests.c | 96 +- + 19 files changed, 1862 insertions(+), 122 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-class-led-trigger-blkdev create mode 100644 Documentation/leds/ledtrig-blkdev.rst create mode 100644 drivers/leds/trigger/ledtrig-blkdev.c diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block -index cd14ecb3c9a5..ad47337ac75a 100644 +index 282de3680367..ac1dd2fbd855 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -101,6 +101,16 @@ Description: @@ -11727,10 +6571,10 @@ index 000000000000..28ce8c814fb7 + may not match the device special file paths written to + link_device and unlink_device.) 
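
As a minimal userspace sketch of the interface documented above (not part of the patch itself): the attribute names `trigger`, `link_device` and `unlink_device` are taken from the ABI text, the trigger name `blkdev` from the rst documentation, while the LED name "example_led", the device node /dev/sda and the exact sysfs paths are placeholders that depend on the system. A helper could enable the trigger and associate a disk roughly like this:

#include <stdio.h>
#include <stdlib.h>

/* Write a single value to a sysfs attribute; returns 0 on success. */
static int write_attr(const char *path, const char *value)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	if (fprintf(f, "%s\n", value) < 0) {
		perror(path);
		fclose(f);
		return -1;
	}
	if (fclose(f) != 0) {
		perror(path);
		return -1;
	}
	return 0;
}

int main(void)
{
	/* Select the blkdev trigger on the LED, then link the block device. */
	if (write_attr("/sys/class/leds/example_led/trigger", "blkdev"))
		return EXIT_FAILURE;
	if (write_attr("/sys/class/leds/example_led/link_device", "/dev/sda"))
		return EXIT_FAILURE;
	return EXIT_SUCCESS;
}

Per the ABI description, writing the same device special file path to unlink_device would remove the association again.
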
diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst -index fb6ba2002a4b..f160f9487a90 100644 +index eed51a910c94..270560fef3b2 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst -@@ -173,6 +173,13 @@ stable_node_chains +@@ -171,6 +171,13 @@ stable_node_chains the number of KSM pages that hit the ``max_page_sharing`` limit stable_node_dups number of duplicated KSM pages @@ -11745,7 +6589,7 @@ index fb6ba2002a4b..f160f9487a90 100644 A high ratio of ``pages_sharing`` to ``pages_shared`` indicates good sharing, but a high ratio of ``pages_unshared`` to ``pages_sharing`` diff --git a/Documentation/leds/index.rst b/Documentation/leds/index.rst -index e5d63b940045..e3c24e468cbc 100644 +index b9ca081fac71..5e37d8e7bd28 100644 --- a/Documentation/leds/index.rst +++ b/Documentation/leds/index.rst @@ -10,6 +10,7 @@ LEDs @@ -11920,129 +6764,8 @@ index 000000000000..9ff5b99de451 +* The ``blkdev`` LED trigger supports many-to-many device/LED associations. + A device can be associated with multiple LEDs, and an LED can be associated + with multiple devices. -diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile -index d995595394bb..19d1fb601796 100644 ---- a/arch/x86/boot/compressed/Makefile -+++ b/arch/x86/boot/compressed/Makefile -@@ -50,7 +50,7 @@ KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) - KBUILD_CFLAGS += -fno-asynchronous-unwind-tables - KBUILD_CFLAGS += -D__DISABLE_EXPORTS - # Disable relocation relaxation in case the link is not PIE. --KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) -+KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no) - KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h - - # sev.c indirectly inludes inat-table.h which is generated during -diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c -index 52e6e7ed4f78..f000cc16d128 100644 ---- a/arch/x86/events/rapl.c -+++ b/arch/x86/events/rapl.c -@@ -343,14 +343,15 @@ static int rapl_pmu_event_init(struct perf_event *event) - if (event->cpu < 0) - return -EINVAL; - -- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; -- - if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) - return -EINVAL; - - cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); - bit = cfg - 1; - -+ if (bit != PERF_RAPL_PP0) -+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; -+ - /* check event supported */ - if (!(rapl_cntr_mask & (1 << bit))) - return -EINVAL; -@@ -363,7 +364,15 @@ static int rapl_pmu_event_init(struct perf_event *event) - pmu = cpu_to_rapl_pmu(event->cpu); - if (!pmu) - return -EINVAL; -- event->cpu = pmu->cpu; -+ -+ /* -+ * FIXME: RAPL PMU considers events are uncore and MSRs can be read from -+ * the first available CPU of the die. But this is not true for energy-cores -+ * event. Therefore as a workaround don't consider pmu->cpu here for PERF_RAPL_PP0. 
-+ */ -+ if (event->event_caps & PERF_EV_CAP_READ_ACTIVE_PKG) -+ event->cpu = pmu->cpu; -+ - event->pmu_private = pmu; - event->hw.event_base = rapl_msrs[bit].msr; - event->hw.config = cfg; -@@ -537,7 +546,7 @@ static struct perf_msr intel_rapl_spr_msrs[] = { - * - want to use same event codes across both architectures - */ - static struct perf_msr amd_rapl_msrs[] = { -- [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, 0, false, 0 }, -+ [PERF_RAPL_PP0] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK }, - [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, - [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, 0, false, 0 }, - [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, 0, false, 0 }, -@@ -764,7 +773,8 @@ static struct rapl_model model_spr = { - }; - - static struct rapl_model model_amd_hygon = { -- .events = BIT(PERF_RAPL_PKG), -+ .events = BIT(PERF_RAPL_PP0) | -+ BIT(PERF_RAPL_PKG), - .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, - .rapl_msrs = amd_rapl_msrs, - }; -diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c -index f769d6d08b43..06f2ede1544f 100644 ---- a/arch/x86/kernel/cpu/amd.c -+++ b/arch/x86/kernel/cpu/amd.c -@@ -880,6 +880,15 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) - } - } - #endif -+ /* -+ * Work around Erratum 1386. The XSAVES instruction malfunctions in -+ * certain circumstances on Zen1/2 uarch, and not all parts have had -+ * updated microcode at the time of writing (March 2023). -+ * -+ * Affected parts all have no supervisor XSAVE states, meaning that -+ * the XSAVEC instruction (which works fine) is equivalent. -+ */ -+ clear_cpu_cap(c, X86_FEATURE_XSAVES); - } - - static void init_amd_zn(struct cpuinfo_x86 *c) -diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c -index c1e31e9a85d7..92d73ccede70 100644 ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -1205,7 +1205,7 @@ void __flush_tlb_all(void) - */ - VM_WARN_ON_ONCE(preemptible()); - -- if (boot_cpu_has(X86_FEATURE_PGE)) { -+ if (cpu_feature_enabled(X86_FEATURE_PGE)) { - __flush_tlb_global(); - } else { - /* -diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c -index b808be77635e..6e696c6b7018 100644 ---- a/arch/x86/net/bpf_jit_comp.c -+++ b/arch/x86/net/bpf_jit_comp.c -@@ -343,9 +343,10 @@ static int emit_call(u8 **pprog, void *func, void *ip) - - static int emit_rsb_call(u8 **pprog, void *func, void *ip) - { -+ void *adjusted_ip; - OPTIMIZER_HIDE_VAR(func); -- x86_call_depth_emit_accounting(pprog, func); -- return emit_patch(pprog, func, ip, 0xE8); -+ adjusted_ip = (u8 *)ip + x86_call_depth_emit_accounting(pprog, func); -+ return emit_patch(pprog, func, adjusted_ip, 0xE8); - } - - static int emit_jump(u8 **pprog, void *func, void *ip) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c -index 18bc94718711..7b9ee86b4609 100644 +index 5c536151ef83..5a80379253a7 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -912,7 +912,7 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev) @@ -12054,168 +6777,6 @@ index 18bc94718711..7b9ee86b4609 100644 gpiod_set_value_cansleep(reset_gpio, 1); return; -diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c -index 741d8f3e8fb3..c467eeae9973 100644 ---- a/drivers/char/tpm/tpm-chip.c -+++ b/drivers/char/tpm/tpm-chip.c -@@ -512,6 +512,63 @@ static int tpm_add_legacy_sysfs(struct tpm_chip *chip) - return 0; - } - -+/* -+ * Some AMD fTPM versions may cause stutter -+ * 
https://www.amd.com/en/support/kb/faq/pa-410 -+ * -+ * Fixes are available in two series of fTPM firmware: -+ * 6.x.y.z series: 6.0.18.6 + -+ * 3.x.y.z series: 3.57.y.5 + -+ */ -+static bool tpm_amd_is_rng_defective(struct tpm_chip *chip) -+{ -+ u32 val1, val2; -+ u64 version; -+ int ret; -+ -+ if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) -+ return false; -+ -+ ret = tpm_request_locality(chip); -+ if (ret) -+ return false; -+ -+ ret = tpm2_get_tpm_pt(chip, TPM2_PT_MANUFACTURER, &val1, NULL); -+ if (ret) -+ goto release; -+ if (val1 != 0x414D4400U /* AMD */) { -+ ret = -ENODEV; -+ goto release; -+ } -+ ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_1, &val1, NULL); -+ if (ret) -+ goto release; -+ ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_2, &val2, NULL); -+ -+release: -+ tpm_relinquish_locality(chip); -+ -+ if (ret) -+ return false; -+ -+ version = ((u64)val1 << 32) | val2; -+ if ((version >> 48) == 6) { -+ if (version >= 0x0006000000180006ULL) -+ return false; -+ } else if ((version >> 48) == 3) { -+ if (version >= 0x0003005700000005ULL) -+ return false; -+ } else { -+ return false; -+ } -+ -+ dev_warn(&chip->dev, -+ "AMD fTPM version 0x%llx causes system stutter; hwrng disabled\n", -+ version); -+ -+ return true; -+} -+ - static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) - { - struct tpm_chip *chip = container_of(rng, struct tpm_chip, hwrng); -@@ -521,7 +578,8 @@ static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) - - static int tpm_add_hwrng(struct tpm_chip *chip) - { -- if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM) || tpm_is_firmware_upgrade(chip)) -+ if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM) || tpm_is_firmware_upgrade(chip) || -+ tpm_amd_is_rng_defective(chip)) - return 0; - - snprintf(chip->hwrng_name, sizeof(chip->hwrng_name), -diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h -index 24ee4e1cc452..830014a26609 100644 ---- a/drivers/char/tpm/tpm.h -+++ b/drivers/char/tpm/tpm.h -@@ -150,6 +150,79 @@ enum tpm_sub_capabilities { - TPM_CAP_PROP_TIS_DURATION = 0x120, - }; - -+enum tpm2_pt_props { -+ TPM2_PT_NONE = 0x00000000, -+ TPM2_PT_GROUP = 0x00000100, -+ TPM2_PT_FIXED = TPM2_PT_GROUP * 1, -+ TPM2_PT_FAMILY_INDICATOR = TPM2_PT_FIXED + 0, -+ TPM2_PT_LEVEL = TPM2_PT_FIXED + 1, -+ TPM2_PT_REVISION = TPM2_PT_FIXED + 2, -+ TPM2_PT_DAY_OF_YEAR = TPM2_PT_FIXED + 3, -+ TPM2_PT_YEAR = TPM2_PT_FIXED + 4, -+ TPM2_PT_MANUFACTURER = TPM2_PT_FIXED + 5, -+ TPM2_PT_VENDOR_STRING_1 = TPM2_PT_FIXED + 6, -+ TPM2_PT_VENDOR_STRING_2 = TPM2_PT_FIXED + 7, -+ TPM2_PT_VENDOR_STRING_3 = TPM2_PT_FIXED + 8, -+ TPM2_PT_VENDOR_STRING_4 = TPM2_PT_FIXED + 9, -+ TPM2_PT_VENDOR_TPM_TYPE = TPM2_PT_FIXED + 10, -+ TPM2_PT_FIRMWARE_VERSION_1 = TPM2_PT_FIXED + 11, -+ TPM2_PT_FIRMWARE_VERSION_2 = TPM2_PT_FIXED + 12, -+ TPM2_PT_INPUT_BUFFER = TPM2_PT_FIXED + 13, -+ TPM2_PT_HR_TRANSIENT_MIN = TPM2_PT_FIXED + 14, -+ TPM2_PT_HR_PERSISTENT_MIN = TPM2_PT_FIXED + 15, -+ TPM2_PT_HR_LOADED_MIN = TPM2_PT_FIXED + 16, -+ TPM2_PT_ACTIVE_SESSIONS_MAX = TPM2_PT_FIXED + 17, -+ TPM2_PT_PCR_COUNT = TPM2_PT_FIXED + 18, -+ TPM2_PT_PCR_SELECT_MIN = TPM2_PT_FIXED + 19, -+ TPM2_PT_CONTEXT_GAP_MAX = TPM2_PT_FIXED + 20, -+ TPM2_PT_NV_COUNTERS_MAX = TPM2_PT_FIXED + 22, -+ TPM2_PT_NV_INDEX_MAX = TPM2_PT_FIXED + 23, -+ TPM2_PT_MEMORY = TPM2_PT_FIXED + 24, -+ TPM2_PT_CLOCK_UPDATE = TPM2_PT_FIXED + 25, -+ TPM2_PT_CONTEXT_HASH = TPM2_PT_FIXED + 26, -+ TPM2_PT_CONTEXT_SYM = TPM2_PT_FIXED + 27, -+ TPM2_PT_CONTEXT_SYM_SIZE = TPM2_PT_FIXED + 28, -+ TPM2_PT_ORDERLY_COUNT = TPM2_PT_FIXED + 
29, -+ TPM2_PT_MAX_COMMAND_SIZE = TPM2_PT_FIXED + 30, -+ TPM2_PT_MAX_RESPONSE_SIZE = TPM2_PT_FIXED + 31, -+ TPM2_PT_MAX_DIGEST = TPM2_PT_FIXED + 32, -+ TPM2_PT_MAX_OBJECT_CONTEXT = TPM2_PT_FIXED + 33, -+ TPM2_PT_MAX_SESSION_CONTEXT = TPM2_PT_FIXED + 34, -+ TPM2_PT_PS_FAMILY_INDICATOR = TPM2_PT_FIXED + 35, -+ TPM2_PT_PS_LEVEL = TPM2_PT_FIXED + 36, -+ TPM2_PT_PS_REVISION = TPM2_PT_FIXED + 37, -+ TPM2_PT_PS_DAY_OF_YEAR = TPM2_PT_FIXED + 38, -+ TPM2_PT_PS_YEAR = TPM2_PT_FIXED + 39, -+ TPM2_PT_SPLIT_MAX = TPM2_PT_FIXED + 40, -+ TPM2_PT_TOTAL_COMMANDS = TPM2_PT_FIXED + 41, -+ TPM2_PT_LIBRARY_COMMANDS = TPM2_PT_FIXED + 42, -+ TPM2_PT_VENDOR_COMMANDS = TPM2_PT_FIXED + 43, -+ TPM2_PT_NV_BUFFER_MAX = TPM2_PT_FIXED + 44, -+ TPM2_PT_MODES = TPM2_PT_FIXED + 45, -+ TPM2_PT_MAX_CAP_BUFFER = TPM2_PT_FIXED + 46, -+ TPM2_PT_VAR = TPM2_PT_GROUP * 2, -+ TPM2_PT_PERMANENT = TPM2_PT_VAR + 0, -+ TPM2_PT_STARTUP_CLEAR = TPM2_PT_VAR + 1, -+ TPM2_PT_HR_NV_INDEX = TPM2_PT_VAR + 2, -+ TPM2_PT_HR_LOADED = TPM2_PT_VAR + 3, -+ TPM2_PT_HR_LOADED_AVAIL = TPM2_PT_VAR + 4, -+ TPM2_PT_HR_ACTIVE = TPM2_PT_VAR + 5, -+ TPM2_PT_HR_ACTIVE_AVAIL = TPM2_PT_VAR + 6, -+ TPM2_PT_HR_TRANSIENT_AVAIL = TPM2_PT_VAR + 7, -+ TPM2_PT_HR_PERSISTENT = TPM2_PT_VAR + 8, -+ TPM2_PT_HR_PERSISTENT_AVAIL = TPM2_PT_VAR + 9, -+ TPM2_PT_NV_COUNTERS = TPM2_PT_VAR + 10, -+ TPM2_PT_NV_COUNTERS_AVAIL = TPM2_PT_VAR + 11, -+ TPM2_PT_ALGORITHM_SET = TPM2_PT_VAR + 12, -+ TPM2_PT_LOADED_CURVES = TPM2_PT_VAR + 13, -+ TPM2_PT_LOCKOUT_COUNTER = TPM2_PT_VAR + 14, -+ TPM2_PT_MAX_AUTH_FAIL = TPM2_PT_VAR + 15, -+ TPM2_PT_LOCKOUT_INTERVAL = TPM2_PT_VAR + 16, -+ TPM2_PT_LOCKOUT_RECOVERY = TPM2_PT_VAR + 17, -+ TPM2_PT_NV_WRITE_RECOVERY = TPM2_PT_VAR + 18, -+ TPM2_PT_AUDIT_COUNTER_0 = TPM2_PT_VAR + 19, -+ TPM2_PT_AUDIT_COUNTER_1 = TPM2_PT_VAR + 20, -+}; - - /* 128 bytes is an arbitrary cap. This could be as large as TPM_BUFSIZE - 18 - * bytes, but 128 is still a relatively large number of random bytes and diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig index dc6816d36d06..bda249068182 100644 --- a/drivers/leds/trigger/Kconfig @@ -13471,10 +8032,323 @@ index 000000000000..067eedb003b5 +MODULE_AUTHOR("Ian Pilcher "); +MODULE_LICENSE("GPL v2"); diff --git a/fs/eventpoll.c b/fs/eventpoll.c -index 64659b110973..8b5ca9f8f4bb 100644 +index 64659b110973..4cad490028ab 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c -@@ -1760,7 +1760,7 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, +@@ -57,13 +57,7 @@ + * we need a lock that will allow us to sleep. This lock is a + * mutex (ep->mtx). It is acquired during the event transfer loop, + * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file(). +- * Then we also need a global mutex to serialize eventpoll_release_file() +- * and ep_free(). +- * This mutex is acquired by ep_free() during the epoll file +- * cleanup path and it is also acquired by eventpoll_release_file() +- * if a file has been pushed inside an epoll set and it is then +- * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL). +- * It is also acquired when inserting an epoll fd onto another epoll ++ * The epmutex is acquired when inserting an epoll fd onto another epoll + * fd. We do this so that we walk the epoll tree and ensure that this + * insertion does not create a cycle of epoll file descriptors, which + * could lead to deadlock. 
We need a global mutex to prevent two +@@ -153,6 +147,13 @@ struct epitem { + /* The file descriptor information this item refers to */ + struct epoll_filefd ffd; + ++ /* ++ * Protected by file->f_lock, true for to-be-released epitem already ++ * removed from the "struct file" items list; together with ++ * eventpoll->refcount orchestrates "struct eventpoll" disposal ++ */ ++ bool dying; ++ + /* List containing poll wait queues */ + struct eppoll_entry *pwqlist; + +@@ -217,6 +218,12 @@ struct eventpoll { + u64 gen; + struct hlist_head refs; + ++ /* ++ * usage count, used together with epitem->dying to ++ * orchestrate the disposal of this struct ++ */ ++ refcount_t refcount; ++ + #ifdef CONFIG_NET_RX_BUSY_POLL + /* used to track busy poll napi_id */ + unsigned int napi_id; +@@ -240,9 +247,7 @@ struct ep_pqueue { + /* Maximum number of epoll watched descriptors, per user */ + static long max_user_watches __read_mostly; + +-/* +- * This mutex is used to serialize ep_free() and eventpoll_release_file(). +- */ ++/* Used for cycles detection */ + static DEFINE_MUTEX(epmutex); + + static u64 loop_check_gen = 0; +@@ -557,8 +562,7 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq) + + /* + * This function unregisters poll callbacks from the associated file +- * descriptor. Must be called with "mtx" held (or "epmutex" if called from +- * ep_free). ++ * descriptor. Must be called with "mtx" held. + */ + static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) + { +@@ -681,11 +685,40 @@ static void epi_rcu_free(struct rcu_head *head) + kmem_cache_free(epi_cache, epi); + } + ++static void ep_get(struct eventpoll *ep) ++{ ++ refcount_inc(&ep->refcount); ++} ++ ++/* ++ * Returns true if the event poll can be disposed ++ */ ++static bool ep_refcount_dec_and_test(struct eventpoll *ep) ++{ ++ if (!refcount_dec_and_test(&ep->refcount)) ++ return false; ++ ++ WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root)); ++ return true; ++} ++ ++static void ep_free(struct eventpoll *ep) ++{ ++ mutex_destroy(&ep->mtx); ++ free_uid(ep->user); ++ wakeup_source_unregister(ep->ws); ++ kfree(ep); ++} ++ + /* + * Removes a "struct epitem" from the eventpoll RB tree and deallocates + * all the associated resources. Must be called with "mtx" held. ++ * If the dying flag is set, do the removal only if force is true. ++ * This prevents ep_clear_and_put() from dropping all the ep references ++ * while running concurrently with eventpoll_release_file(). ++ * Returns true if the eventpoll can be disposed. 
+ */ +-static int ep_remove(struct eventpoll *ep, struct epitem *epi) ++static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) + { + struct file *file = epi->ffd.file; + struct epitems_head *to_free; +@@ -700,6 +733,11 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) + + /* Remove the current item from the list of epoll hooks */ + spin_lock(&file->f_lock); ++ if (epi->dying && !force) { ++ spin_unlock(&file->f_lock); ++ return false; ++ } ++ + to_free = NULL; + head = file->f_ep; + if (head->first == &epi->fllink && !epi->fllink.next) { +@@ -733,28 +771,28 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) + call_rcu(&epi->rcu, epi_rcu_free); + + percpu_counter_dec(&ep->user->epoll_watches); ++ return ep_refcount_dec_and_test(ep); ++} + +- return 0; ++/* ++ * ep_remove variant for callers owing an additional reference to the ep ++ */ ++static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) ++{ ++ WARN_ON_ONCE(__ep_remove(ep, epi, false)); + } + +-static void ep_free(struct eventpoll *ep) ++static void ep_clear_and_put(struct eventpoll *ep) + { + struct rb_node *rbp; + struct epitem *epi; ++ bool dispose; + + /* We need to release all tasks waiting for these file */ + if (waitqueue_active(&ep->poll_wait)) + ep_poll_safewake(ep, NULL, 0); + +- /* +- * We need to lock this because we could be hit by +- * eventpoll_release_file() while we're freeing the "struct eventpoll". +- * We do not need to hold "ep->mtx" here because the epoll file +- * is on the way to be removed and no one has references to it +- * anymore. The only hit might come from eventpoll_release_file() but +- * holding "epmutex" is sufficient here. +- */ +- mutex_lock(&epmutex); ++ mutex_lock(&ep->mtx); + + /* + * Walks through the whole tree by unregistering poll callbacks. +@@ -768,25 +806,21 @@ static void ep_free(struct eventpoll *ep) + + /* + * Walks through the whole tree by freeing each "struct epitem". At this +- * point we are sure no poll callbacks will be lingering around, and also by +- * holding "epmutex" we can be sure that no file cleanup code will hit +- * us during this operation. So we can avoid the lock on "ep->lock". +- * We do not need to lock ep->mtx, either, we only do it to prevent +- * a lockdep warning. ++ * point we are sure no poll callbacks will be lingering around. ++ * Since we still own a reference to the eventpoll struct, the loop can't ++ * dispose it. + */ +- mutex_lock(&ep->mtx); + while ((rbp = rb_first_cached(&ep->rbr)) != NULL) { + epi = rb_entry(rbp, struct epitem, rbn); +- ep_remove(ep, epi); ++ ep_remove_safe(ep, epi); + cond_resched(); + } ++ ++ dispose = ep_refcount_dec_and_test(ep); + mutex_unlock(&ep->mtx); + +- mutex_unlock(&epmutex); +- mutex_destroy(&ep->mtx); +- free_uid(ep->user); +- wakeup_source_unregister(ep->ws); +- kfree(ep); ++ if (dispose) ++ ep_free(ep); + } + + static int ep_eventpoll_release(struct inode *inode, struct file *file) +@@ -794,7 +828,7 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) + struct eventpoll *ep = file->private_data; + + if (ep) +- ep_free(ep); ++ ep_clear_and_put(ep); + + return 0; + } +@@ -906,33 +940,34 @@ void eventpoll_release_file(struct file *file) + { + struct eventpoll *ep; + struct epitem *epi; +- struct hlist_node *next; ++ bool dispose; + + /* +- * We don't want to get "file->f_lock" because it is not +- * necessary. 
It is not necessary because we're in the "struct file" +- * cleanup path, and this means that no one is using this file anymore. +- * So, for example, epoll_ctl() cannot hit here since if we reach this +- * point, the file counter already went to zero and fget() would fail. +- * The only hit might come from ep_free() but by holding the mutex +- * will correctly serialize the operation. We do need to acquire +- * "ep->mtx" after "epmutex" because ep_remove() requires it when called +- * from anywhere but ep_free(). +- * +- * Besides, ep_remove() acquires the lock, so we can't hold it here. ++ * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from ++ * touching the epitems list before eventpoll_release_file() can access ++ * the ep->mtx. + */ +- mutex_lock(&epmutex); +- if (unlikely(!file->f_ep)) { +- mutex_unlock(&epmutex); +- return; +- } +- hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) { ++again: ++ spin_lock(&file->f_lock); ++ if (file->f_ep && file->f_ep->first) { ++ epi = hlist_entry(file->f_ep->first, struct epitem, fllink); ++ epi->dying = true; ++ spin_unlock(&file->f_lock); ++ ++ /* ++ * ep access is safe as we still own a reference to the ep ++ * struct ++ */ + ep = epi->ep; +- mutex_lock_nested(&ep->mtx, 0); +- ep_remove(ep, epi); ++ mutex_lock(&ep->mtx); ++ dispose = __ep_remove(ep, epi, true); + mutex_unlock(&ep->mtx); ++ ++ if (dispose) ++ ep_free(ep); ++ goto again; + } +- mutex_unlock(&epmutex); ++ spin_unlock(&file->f_lock); + } + + static int ep_alloc(struct eventpoll **pep) +@@ -955,6 +990,7 @@ static int ep_alloc(struct eventpoll **pep) + ep->rbr = RB_ROOT_CACHED; + ep->ovflist = EP_UNACTIVE_PTR; + ep->user = user; ++ refcount_set(&ep->refcount, 1); + + *pep = ep; + +@@ -1223,10 +1259,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v + */ + list_del_init(&wait->entry); + /* +- * ->whead != NULL protects us from the race with ep_free() +- * or ep_remove(), ep_remove_wait_queue() takes whead->lock +- * held by the caller. Once we nullify it, nothing protects +- * ep/epi or even wait. ++ * ->whead != NULL protects us from the race with ++ * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue() ++ * takes whead->lock held by the caller. Once we nullify it, ++ * nothing protects ep/epi or even wait. + */ + smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); + } +@@ -1496,16 +1532,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, + if (tep) + mutex_unlock(&tep->mtx); + ++ /* ++ * ep_remove_safe() calls in the later error paths can't lead to ++ * ep_free() as the ep file itself still holds an ep reference. ++ */ ++ ep_get(ep); ++ + /* now check if we've created too many backpaths */ + if (unlikely(full_check && reverse_path_check())) { +- ep_remove(ep, epi); ++ ep_remove_safe(ep, epi); + return -EINVAL; + } + + if (epi->event.events & EPOLLWAKEUP) { + error = ep_create_wakeup_source(epi); + if (error) { +- ep_remove(ep, epi); ++ ep_remove_safe(ep, epi); + return error; + } + } +@@ -1529,7 +1571,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, + * high memory pressure. 
+ */ + if (unlikely(!epq.epi)) { +- ep_remove(ep, epi); ++ ep_remove_safe(ep, epi); + return -ENOMEM; + } + +@@ -1760,7 +1802,7 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, { int ret = default_wake_function(wq_entry, mode, sync, key); @@ -13483,8 +8357,37 @@ index 64659b110973..8b5ca9f8f4bb 100644 return ret; } +@@ -2025,7 +2067,7 @@ static int do_epoll_create(int flags) + out_free_fd: + put_unused_fd(fd); + out_free_ep: +- ep_free(ep); ++ ep_clear_and_put(ep); + return error; + } + +@@ -2167,10 +2209,16 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, + error = -EEXIST; + break; + case EPOLL_CTL_DEL: +- if (epi) +- error = ep_remove(ep, epi); +- else ++ if (epi) { ++ /* ++ * The eventpoll itself is still alive: the refcount ++ * can't go to zero here. ++ */ ++ ep_remove_safe(ep, epi); ++ error = 0; ++ } else { + error = -ENOENT; ++ } + break; + case EPOLL_CTL_MOD: + if (epi) { diff --git a/fs/proc/base.c b/fs/proc/base.c -index 9e479d7d202b..ac9ebe972be0 100644 +index 5e0e0ccd47aa..07463ad4a70a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3207,6 +3207,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, @@ -13496,10 +8399,10 @@ index 9e479d7d202b..ac9ebe972be0 100644 } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index 9757067c3053..d853e1c8a581 100644 +index a57e6ae78e65..22b2ac82bffd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h -@@ -776,7 +776,7 @@ struct mm_struct { +@@ -740,7 +740,7 @@ struct mm_struct { #ifdef CONFIG_KSM /* * Represent how many pages of this process are involved in KSM @@ -13508,7 +8411,7 @@ index 9757067c3053..d853e1c8a581 100644 */ unsigned long ksm_merging_pages; /* -@@ -784,6 +784,11 @@ struct mm_struct { +@@ -748,6 +748,11 @@ struct mm_struct { * including merged and not merged. */ unsigned long ksm_rmap_items; @@ -13567,29 +8470,6 @@ index 8f69772af77b..42163c9e94e5 100644 return sysfs_create_bin_file(kernel_kobj, &kheaders_attr); } -diff --git a/kernel/kthread.c b/kernel/kthread.c -index f97fd01a2932..7e6751b29101 100644 ---- a/kernel/kthread.c -+++ b/kernel/kthread.c -@@ -1382,6 +1382,10 @@ EXPORT_SYMBOL_GPL(kthread_flush_worker); - * Flush and destroy @worker. The simple flush is enough because the kthread - * worker API is used only in trivial scenarios. There are no multi-step state - * machines needed. -+ * -+ * Note that this function is not responsible for handling delayed work, so -+ * caller should be responsible for queuing or canceling all delayed work items -+ * before invoke this function. 
- */ - void kthread_destroy_worker(struct kthread_worker *worker) - { -@@ -1393,6 +1397,7 @@ void kthread_destroy_worker(struct kthread_worker *worker) - - kthread_flush_worker(worker); - kthread_stop(task); -+ WARN_ON(!list_empty(&worker->delayed_work_list)); - WARN_ON(!list_empty(&worker->work_list)); - kfree(worker); - } diff --git a/kernel/padata.c b/kernel/padata.c index e007b8a4b738..7c80301ab084 100644 --- a/kernel/padata.c @@ -13612,214 +8492,8 @@ index e007b8a4b738..7c80301ab084 100644 { struct padata_work *pw = container_of(w, struct padata_work, pw_work); struct padata_mt_job_state *ps = pw->pw_data; -diff --git a/lib/string.c b/lib/string.c -index 4746a98b153e..6b7cf32b4e54 100644 ---- a/lib/string.c -+++ b/lib/string.c -@@ -480,13 +480,11 @@ EXPORT_SYMBOL(strcspn); - */ - char *strpbrk(const char *cs, const char *ct) - { -- const char *sc1, *sc2; -+ const char *sc; - -- for (sc1 = cs; *sc1 != '\0'; ++sc1) { -- for (sc2 = ct; *sc2 != '\0'; ++sc2) { -- if (*sc1 == *sc2) -- return (char *)sc1; -- } -+ for (sc = cs; *sc != '\0'; ++sc) { -+ if (strchr(ct, *sc)) -+ return (char *)sc; - } - return NULL; - } -diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c -index 89b269a641c7..60958afebc41 100644 ---- a/lib/zstd/decompress/huf_decompress.c -+++ b/lib/zstd/decompress/huf_decompress.c -@@ -985,7 +985,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 - - static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, - const sortedSymbol_t* sortedList, -- const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight, -+ const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, - const U32 nbBitsBaseline) - { - U32* const rankVal = rankValOrigin[0]; -diff --git a/mm/compaction.c b/mm/compaction.c -index d0b16a5b30f7..3613d7f174dc 100644 ---- a/mm/compaction.c -+++ b/mm/compaction.c -@@ -122,7 +122,6 @@ bool PageMovable(struct page *page) - - return false; - } --EXPORT_SYMBOL(PageMovable); - - void __SetPageMovable(struct page *page, const struct movable_operations *mops) - { -@@ -1102,12 +1101,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, - - /* - * Avoid isolating too much unless this block is being -- * rescanned (e.g. dirty/writeback pages, parallel allocation) -+ * fully scanned (e.g. dirty/writeback pages, parallel allocation) - * or a lock is contended. For contention, isolate quickly to - * potentially remove one source of contention. - */ - if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX && -- !cc->rescan && !cc->contended) { -+ !cc->finish_pageblock && !cc->contended) { - ++low_pfn; - break; - } -@@ -1172,14 +1171,14 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, - } - - /* -- * Updated the cached scanner pfn once the pageblock has been scanned -+ * Update the cached scanner pfn once the pageblock has been scanned. - * Pages will either be migrated in which case there is no point - * scanning in the near future or migration failed in which case the - * failure reason may persist. The block is marked for skipping if - * there were no pages isolated in the block or if the block is - * rescanned twice in a row. 
- */ -- if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) { -+ if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) { - if (valid_page && !skip_updated) - set_pageblock_skip(valid_page); - update_cached_migrate(cc, low_pfn); -@@ -1762,6 +1761,13 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) - if (cc->ignore_skip_hint) - return pfn; - -+ /* -+ * If the pageblock should be finished then do not select a different -+ * pageblock. -+ */ -+ if (cc->finish_pageblock) -+ return pfn; -+ - /* - * If the migrate_pfn is not at the start of a zone or the start - * of a pageblock then assume this is a continuation of a previous -@@ -1839,7 +1845,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) - pfn = cc->zone->zone_start_pfn; - cc->fast_search_fail = 0; - found_block = true; -- set_pageblock_skip(freepage); - break; - } - } -@@ -2375,19 +2380,20 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) - unsigned long iteration_start_pfn = cc->migrate_pfn; - - /* -- * Avoid multiple rescans which can happen if a page cannot be -- * isolated (dirty/writeback in async mode) or if the migrated -- * pages are being allocated before the pageblock is cleared. -- * The first rescan will capture the entire pageblock for -- * migration. If it fails, it'll be marked skip and scanning -- * will proceed as normal. -+ * Avoid multiple rescans of the same pageblock which can -+ * happen if a page cannot be isolated (dirty/writeback in -+ * async mode) or if the migrated pages are being allocated -+ * before the pageblock is cleared. The first rescan will -+ * capture the entire pageblock for migration. If it fails, -+ * it'll be marked skip and scanning will proceed as normal. - */ -- cc->rescan = false; -+ cc->finish_pageblock = false; - if (pageblock_start_pfn(last_migrated_pfn) == - pageblock_start_pfn(iteration_start_pfn)) { -- cc->rescan = true; -+ cc->finish_pageblock = true; - } - -+rescan: - switch (isolate_migratepages(cc)) { - case ISOLATE_ABORT: - ret = COMPACT_CONTENDED; -@@ -2430,18 +2436,37 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) - goto out; - } - /* -- * We failed to migrate at least one page in the current -- * order-aligned block, so skip the rest of it. -+ * If an ASYNC or SYNC_LIGHT fails to migrate a page -+ * within the current order-aligned block, scan the -+ * remainder of the pageblock. This will mark the -+ * pageblock "skip" to avoid rescanning in the near -+ * future. This will isolate more pages than necessary -+ * for the request but avoid loops due to -+ * fast_find_migrateblock revisiting blocks that were -+ * recently partially scanned. - */ -- if (cc->direct_compaction && -- (cc->mode == MIGRATE_ASYNC)) { -- cc->migrate_pfn = block_end_pfn( -- cc->migrate_pfn - 1, cc->order); -- /* Draining pcplists is useless in this case */ -- last_migrated_pfn = 0; -+ if (cc->direct_compaction && !cc->finish_pageblock && -+ (cc->mode < MIGRATE_SYNC)) { -+ cc->finish_pageblock = true; -+ -+ /* -+ * Draining pcplists does not help THP if -+ * any page failed to migrate. Even after -+ * drain, the pageblock will not be free. 
-+ */ -+ if (cc->order == COMPACTION_HPAGE_ORDER) -+ last_migrated_pfn = 0; -+ -+ goto rescan; - } - } - -+ /* Stop if a page has been captured */ -+ if (capc && capc->page) { -+ ret = COMPACT_SUCCESS; -+ break; -+ } -+ - check_drain: - /* - * Has the migration scanner moved away from the previous -@@ -2460,12 +2485,6 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) - last_migrated_pfn = 0; - } - } -- -- /* Stop if a page has been captured */ -- if (capc && capc->page) { -- ret = COMPACT_SUCCESS; -- break; -- } - } - - out: -diff --git a/mm/internal.h b/mm/internal.h -index bcf75a8b032d..21466d0ab22f 100644 ---- a/mm/internal.h -+++ b/mm/internal.h -@@ -422,7 +422,11 @@ struct compact_control { - bool proactive_compaction; /* kcompactd proactive compaction */ - bool whole_zone; /* Whole zone should/has been scanned */ - bool contended; /* Signal lock contention */ -- bool rescan; /* Rescanning the same pageblock */ -+ bool finish_pageblock; /* Scan the remainder of a pageblock. Used -+ * when there are potentially transient -+ * isolation or migration failures to -+ * ensure forward progress. -+ */ - bool alloc_contig; /* alloc_contig_range allocation */ - }; - diff --git a/mm/ksm.c b/mm/ksm.c -index a92c9594a2d3..ee60890cf9b1 100644 +index 2b8d30068cbb..82029f1d454b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -214,6 +214,7 @@ struct ksm_rmap_item { @@ -13974,33 +8648,7 @@ index a92c9594a2d3..ee60890cf9b1 100644 } return err; } -@@ -988,9 +1038,15 @@ static int unmerge_and_remove_all_rmap_items(void) - - mm = mm_slot->slot.mm; - mmap_read_lock(mm); -+ -+ /* -+ * Exit right away if mm is exiting to avoid lockdep issue in -+ * the maple tree -+ */ -+ if (ksm_test_exit(mm)) -+ goto mm_exiting; -+ - for_each_vma(vmi, vma) { -- if (ksm_test_exit(mm)) -- break; - if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) - continue; - err = unmerge_ksm_pages(vma, -@@ -999,6 +1055,7 @@ static int unmerge_and_remove_all_rmap_items(void) - goto error; - } - -+mm_exiting: - remove_trailing_rmap_items(&mm_slot->rmap_list); - mmap_read_unlock(mm); - -@@ -2044,6 +2101,42 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, +@@ -2050,6 +2100,42 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, rmap_item->mm->ksm_merging_pages++; } @@ -14043,7 +8691,7 @@ index a92c9594a2d3..ee60890cf9b1 100644 /* * cmp_and_merge_page - first see if page can be merged into the stable tree; * if not, compare checksum to previous and if it's the same, see if page can -@@ -2055,7 +2148,6 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, +@@ -2061,7 +2147,6 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, */ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { @@ -14051,7 +8699,7 @@ index a92c9594a2d3..ee60890cf9b1 100644 struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct ksm_stable_node *stable_node; -@@ -2092,6 +2184,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite +@@ -2098,6 +2183,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite } remove_rmap_item_from_tree(rmap_item); @@ -14059,7 +8707,7 @@ index a92c9594a2d3..ee60890cf9b1 100644 if (kpage) { if (PTR_ERR(kpage) == -EBUSY) -@@ -2128,29 +2221,16 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite +@@ -2134,29 +2220,16 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite * Same checksum as an empty 
page. We attempt to merge it with the * appropriate zero page if the user enabled this via sysfs. */ @@ -14096,7 +8744,7 @@ index a92c9594a2d3..ee60890cf9b1 100644 tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { -@@ -2214,23 +2294,39 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite +@@ -2220,23 +2293,39 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite } } @@ -14142,7 +8790,7 @@ index a92c9594a2d3..ee60890cf9b1 100644 rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ -@@ -2337,6 +2433,22 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) +@@ -2343,6 +2432,22 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) } if (is_zone_device_page(*page)) goto next_page; @@ -14165,7 +8813,7 @@ index a92c9594a2d3..ee60890cf9b1 100644 if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); -@@ -3138,6 +3250,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj, +@@ -3139,6 +3244,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); @@ -14179,7 +8827,7 @@ index a92c9594a2d3..ee60890cf9b1 100644 static ssize_t stable_node_dups_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { -@@ -3193,6 +3312,7 @@ static struct attribute *ksm_attrs[] = { +@@ -3194,6 +3306,7 @@ static struct attribute *ksm_attrs[] = { &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, @@ -14187,108 +8835,6 @@ index a92c9594a2d3..ee60890cf9b1 100644 &full_scans_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, -diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 3bb3484563ed..3aec9a6a9cb7 100644 ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -3119,6 +3119,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, - { - unsigned long flags; - int i, allocated = 0; -+ struct list_head *prev_tail = list->prev; -+ struct page *pos, *n; - - spin_lock_irqsave(&zone->lock, flags); - for (i = 0; i < count; ++i) { -@@ -3127,9 +3129,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, - if (unlikely(page == NULL)) - break; - -- if (unlikely(check_pcp_refill(page, order))) -- continue; -- - /* - * Split buddy pages returned by expand() are received here in - * physical page order. The page is added to the tail of -@@ -3141,7 +3140,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, - * pages are ordered properly. - */ - list_add_tail(&page->pcp_list, list); -- allocated++; - if (is_migrate_cma(get_pcppage_migratetype(page))) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, - -(1 << order)); -@@ -3155,6 +3153,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, - */ - __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); - spin_unlock_irqrestore(&zone->lock, flags); -+ -+ /* -+ * Pages are appended to the pcp list without checking to reduce the -+ * time holding the zone lock. Checking the appended pages happens right -+ * after the critical section while still holding the pcp lock. 
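The rmqueue_bulk() change above moves page validation out of the zone->lock critical section: pages are appended blindly while the contended lock is held and the batch is checked afterwards, starting from the remembered tail. A schematic userspace sketch of that pattern (plain integers and a fake check_item() stand in for pages and check_pcp_refill(); the lock boundaries are only marked in comments):

#include <stdbool.h>
#include <stdio.h>

static bool check_item(int v)
{
	return v >= 0;		/* pretend negative values are bad pages */
}

int main(void)
{
	int list[16];
	int count = 0;
	int candidates[] = { 4, -1, 7, 9, -3, 2 };
	int ncand = (int)(sizeof(candidates) / sizeof(candidates[0]));
	int batch_start, i, allocated = 0;

	batch_start = count;	/* remember where the new batch begins */

	/* --- would run under the contended zone lock: append only --- */
	for (i = 0; i < ncand; i++)
		list[count++] = candidates[i];
	/* --- zone lock dropped here; the per-cpu list lock is still held --- */

	/* Now validate just the appended batch and drop the bad entries. */
	for (i = batch_start; i < count; i++) {
		if (!check_item(list[i]))
			continue;
		list[batch_start + allocated++] = list[i];
	}
	count = batch_start + allocated;

	printf("kept %d of %d candidates\n", allocated, ncand);
	return 0;
}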
-+ */ -+ pos = list_first_entry(prev_tail, struct page, pcp_list); -+ list_for_each_entry_safe_from(pos, n, list, pcp_list) { -+ if (unlikely(check_pcp_refill(pos, order))) { -+ list_del(&pos->pcp_list); -+ continue; -+ } -+ -+ allocated++; -+ } -+ - return allocated; - } - -diff --git a/mm/z3fold.c b/mm/z3fold.c -index a4de0c317ac7..0cef845d397b 100644 ---- a/mm/z3fold.c -+++ b/mm/z3fold.c -@@ -1450,7 +1450,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) - struct z3fold_header *zhdr; - struct z3fold_pool *pool; - -- VM_BUG_ON_PAGE(!PageMovable(page), page); - VM_BUG_ON_PAGE(PageIsolated(page), page); - - if (test_bit(PAGE_HEADLESS, &page->private)) -@@ -1490,7 +1489,6 @@ static int z3fold_page_migrate(struct page *newpage, struct page *page, - struct z3fold_header *zhdr, *new_zhdr; - struct z3fold_pool *pool; - -- VM_BUG_ON_PAGE(!PageMovable(page), page); - VM_BUG_ON_PAGE(!PageIsolated(page), page); - VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page); - VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); -diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c -index 702bc3fd687a..9d27d9b00bce 100644 ---- a/mm/zsmalloc.c -+++ b/mm/zsmalloc.c -@@ -2056,7 +2056,6 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) - * Page is locked so zspage couldn't be destroyed. For detail, look at - * lock_zspage in free_zspage. - */ -- VM_BUG_ON_PAGE(!PageMovable(page), page); - VM_BUG_ON_PAGE(PageIsolated(page), page); - - zspage = get_zspage(page); -@@ -2088,7 +2087,6 @@ static int zs_page_migrate(struct page *newpage, struct page *page, - if (mode == MIGRATE_SYNC_NO_COPY) - return -EINVAL; - -- VM_BUG_ON_PAGE(!PageMovable(page), page); - VM_BUG_ON_PAGE(!PageIsolated(page), page); - - /* The page is locked, so this pointer must remain valid */ -@@ -2153,7 +2151,6 @@ static void zs_page_putback(struct page *page) - { - struct zspage *zspage; - -- VM_BUG_ON_PAGE(!PageMovable(page), page); - VM_BUG_ON_PAGE(!PageIsolated(page), page); - - zspage = get_zspage(page); diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o index 0edfdb40364b..ae52d3b3f063 100644 --- a/scripts/Makefile.vmlinux_o @@ -14303,10 +8849,10 @@ index 0edfdb40364b..ae52d3b3f063 100644 targets := .tmp_initcalls.lds diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c -index f7815ee24f83..e94b0a6b96df 100644 +index 75020edd39e7..e4455220e9fd 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c -@@ -1240,7 +1240,7 @@ static int cs35l41_no_acpi_dsd(struct cs35l41_hda *cs35l41, struct device *physd +@@ -1239,7 +1239,7 @@ static int cs35l41_no_acpi_dsd(struct cs35l41_hda *cs35l41, struct device *physd if (strncmp(hid, "CLSA0100", 8) == 0) { hw_cfg->bst_type = CS35L41_EXT_BOOST_NO_VSPK_SWITCH; @@ -14315,10 +8861,10 @@ index f7815ee24f83..e94b0a6b96df 100644 hw_cfg->bst_type = CS35L41_EXT_BOOST; hw_cfg->gpio1.func = CS35l41_VSPK_SWITCH; hw_cfg->gpio1.valid = true; -diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/vm/ksm_functional_tests.c -index b11b7e5115dc..3033cd6ed3b4 100644 ---- a/tools/testing/selftests/vm/ksm_functional_tests.c -+++ b/tools/testing/selftests/vm/ksm_functional_tests.c +diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c +index d8b5b4930412..05048ebc24d8 100644 +--- a/tools/testing/selftests/mm/ksm_functional_tests.c ++++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -24,9 +24,12 @@ #define KiB 1024u @@ -14468,12223 
+9014,25 @@ index b11b7e5115dc..3033cd6ed3b4 100644 #ifdef __NR_userfaultfd test_unmerge_uffd_wp(); -- -2.40.0.rc2 +2.40.0 -From 50de9c32a97f479390ff525d679f224e1ceb8e3b Mon Sep 17 00:00:00 2001 +From 57f8b594e6808d5ecc244928f704f66249dd9bba Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 3 Mar 2023 16:59:32 +0100 -Subject: [PATCH 07/16] fs-patches +Date: Mon, 6 Mar 2023 18:45:25 +0100 +Subject: [PATCH 05/10] Implement amd-pstate guided driver Signed-off-by: Peter Jung --- - Documentation/admin-guide/xfs.rst | 2 +- - block/blk-merge.c | 3 +- - fs/btrfs/Makefile | 6 +- - fs/btrfs/backref.c | 33 +- - fs/btrfs/bio.c | 557 ++++++++++++++++++++--- - fs/btrfs/bio.h | 67 +-- - fs/btrfs/block-group.c | 276 ++++++++++-- - fs/btrfs/block-group.h | 24 +- - fs/btrfs/btrfs_inode.h | 23 +- - fs/btrfs/compression.c | 276 ++---------- - fs/btrfs/compression.h | 3 - - fs/btrfs/ctree.c | 62 ++- - fs/btrfs/ctree.h | 15 + - fs/btrfs/defrag.c | 4 +- - fs/btrfs/delayed-ref.c | 24 +- - fs/btrfs/delayed-ref.h | 2 +- - fs/btrfs/disk-io.c | 222 +--------- - fs/btrfs/disk-io.h | 14 +- - fs/btrfs/extent-io-tree.c | 10 +- - fs/btrfs/extent-io-tree.h | 1 - - fs/btrfs/extent-tree.c | 181 +++----- - fs/btrfs/extent-tree.h | 81 ++++ - fs/btrfs/extent_io.c | 582 +++--------------------- - fs/btrfs/extent_io.h | 36 +- - fs/btrfs/file-item.c | 72 ++- - fs/btrfs/file-item.h | 8 +- - fs/btrfs/file.c | 13 +- - fs/btrfs/free-space-tree.c | 2 +- - fs/btrfs/fs.h | 5 +- - fs/btrfs/inode.c | 715 ++++++------------------------ - fs/btrfs/ioctl.c | 2 +- - fs/btrfs/lru_cache.c | 166 +++++++ - fs/btrfs/lru_cache.h | 80 ++++ - fs/btrfs/lzo.c | 2 +- - fs/btrfs/messages.c | 30 -- - fs/btrfs/messages.h | 34 -- - fs/btrfs/ordered-data.c | 71 ++- - fs/btrfs/ordered-data.h | 10 +- - fs/btrfs/qgroup.c | 2 +- - fs/btrfs/raid56.c | 334 +++++--------- - fs/btrfs/raid56.h | 4 +- - fs/btrfs/relocation.c | 2 +- - fs/btrfs/scrub.c | 2 +- - fs/btrfs/send.c | 684 ++++++++++++++-------------- - fs/btrfs/super.c | 3 +- - fs/btrfs/sysfs.c | 12 +- - fs/btrfs/tests/extent-map-tests.c | 2 +- - fs/btrfs/transaction.c | 29 ++ - fs/btrfs/transaction.h | 31 ++ - fs/btrfs/tree-log.c | 87 ++-- - fs/btrfs/tree-log.h | 9 +- - fs/btrfs/volumes.c | 116 ++--- - fs/btrfs/volumes.h | 18 - - fs/btrfs/zoned.c | 146 +++--- - fs/btrfs/zoned.h | 20 +- - fs/ext4/extents.c | 2 +- - fs/ext4/file.c | 34 +- - fs/ext4/inode.c | 429 ++++++------------ - fs/ext4/ioctl.c | 3 - - fs/ext4/namei.c | 11 +- - fs/ext4/page-io.c | 10 +- - fs/ext4/super.c | 26 +- - fs/ext4/xattr.c | 137 ++++-- - fs/gfs2/bmap.c | 38 +- - fs/iomap/buffered-io.c | 91 ++-- - fs/iomap/direct-io.c | 10 +- - fs/xfs/libxfs/xfs_alloc.c | 32 +- - fs/xfs/libxfs/xfs_bmap.c | 32 +- - fs/xfs/libxfs/xfs_bmap.h | 5 +- - fs/xfs/libxfs/xfs_btree.c | 18 +- - fs/xfs/libxfs/xfs_refcount.c | 96 ++-- - fs/xfs/libxfs/xfs_refcount.h | 4 +- - fs/xfs/libxfs/xfs_rmap.c | 50 +-- - fs/xfs/libxfs/xfs_rmap.h | 6 +- - fs/xfs/xfs_bmap_item.c | 137 +++--- - fs/xfs/xfs_error.c | 2 +- - fs/xfs/xfs_error.h | 12 +- - fs/xfs/xfs_extfree_item.c | 99 +++-- - fs/xfs/xfs_fsmap.c | 1 + - fs/xfs/xfs_globals.c | 3 +- - fs/xfs/xfs_iomap.c | 4 +- - fs/xfs/xfs_refcount_item.c | 110 +++-- - fs/xfs/xfs_rmap_item.c | 142 +++--- - fs/xfs/xfs_sysfs.c | 12 +- - fs/xfs/xfs_sysfs.h | 10 +- - fs/xfs/xfs_trace.h | 15 +- - include/linux/bio.h | 4 + - include/linux/iomap.h | 30 +- - include/trace/events/btrfs.h | 127 +++++- - include/trace/events/ext4.h | 7 - - 90 files changed, 3213 insertions(+), 3751 deletions(-) - create mode 100644 fs/btrfs/lru_cache.c - 
create mode 100644 fs/btrfs/lru_cache.h - -diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst -index 8de008c0c5ad..e2561416391c 100644 ---- a/Documentation/admin-guide/xfs.rst -+++ b/Documentation/admin-guide/xfs.rst -@@ -296,7 +296,7 @@ The following sysctls are available for the XFS filesystem: - XFS_ERRLEVEL_LOW: 1 - XFS_ERRLEVEL_HIGH: 5 - -- fs.xfs.panic_mask (Min: 0 Default: 0 Max: 256) -+ fs.xfs.panic_mask (Min: 0 Default: 0 Max: 511) - Causes certain error conditions to call BUG(). Value is a bitmask; - OR together the tags which represent errors which should cause panics: - -diff --git a/block/blk-merge.c b/block/blk-merge.c -index 808b58129d3e..1ac782fdc55c 100644 ---- a/block/blk-merge.c -+++ b/block/blk-merge.c -@@ -276,7 +276,7 @@ static bool bvec_split_segs(const struct queue_limits *lim, - * responsible for ensuring that @bs is only destroyed after processing of the - * split bio has finished. - */ --static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, -+struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, struct bio_set *bs, unsigned max_bytes) - { - struct bio_vec bv, bvprv, *bvprvp = NULL; -@@ -336,6 +336,7 @@ static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, - bio_clear_polled(bio); - return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs); - } -+EXPORT_SYMBOL_GPL(bio_split_rw); - - /** - * __bio_split_to_limits - split a bio to fit the queue limits -diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile -index 555c962fdad6..90d53209755b 100644 ---- a/fs/btrfs/Makefile -+++ b/fs/btrfs/Makefile -@@ -11,7 +11,8 @@ condflags := \ - $(call cc-option, -Wunused-but-set-variable) \ - $(call cc-option, -Wunused-const-variable) \ - $(call cc-option, -Wpacked-not-aligned) \ -- $(call cc-option, -Wstringop-truncation) -+ $(call cc-option, -Wstringop-truncation) \ -+ $(call cc-option, -Wmaybe-uninitialized) - subdir-ccflags-y += $(condflags) - # The following turn off the warnings enabled by -Wextra - subdir-ccflags-y += -Wno-missing-field-initializers -@@ -31,7 +32,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ - backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ - block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ -- subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o -+ subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \ -+ lru_cache.o - - btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o - btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o -diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c -index 46851511b661..90e40d5ceccd 100644 ---- a/fs/btrfs/backref.c -+++ b/fs/btrfs/backref.c -@@ -1252,8 +1252,12 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct - struct btrfs_root *root, - u64 bytenr, int level, bool *is_shared) - { -+ const struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_backref_shared_cache_entry *entry; - -+ if (!current->journal_info) -+ lockdep_assert_held(&fs_info->commit_root_sem); -+ - if (!ctx->use_path_cache) - return false; - -@@ -1288,7 +1292,7 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct - * could be a snapshot sharing this extent buffer. 
- */ - if (entry->is_shared && -- entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) -+ entry->gen != btrfs_get_last_root_drop_gen(fs_info)) - return false; - - *is_shared = entry->is_shared; -@@ -1318,9 +1322,13 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx - struct btrfs_root *root, - u64 bytenr, int level, bool is_shared) - { -+ const struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_backref_shared_cache_entry *entry; - u64 gen; - -+ if (!current->journal_info) -+ lockdep_assert_held(&fs_info->commit_root_sem); -+ - if (!ctx->use_path_cache) - return; - -@@ -1336,7 +1344,7 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx - ASSERT(level >= 0); - - if (is_shared) -- gen = btrfs_get_last_root_drop_gen(root->fs_info); -+ gen = btrfs_get_last_root_drop_gen(fs_info); - else - gen = btrfs_root_last_snapshot(&root->root_item); - -@@ -1864,6 +1872,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, - .have_delayed_delete_refs = false, - }; - int level; -+ bool leaf_cached; -+ bool leaf_is_shared; - - for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) { - if (ctx->prev_extents_cache[i].bytenr == bytenr) -@@ -1885,6 +1895,23 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, - walk_ctx.time_seq = elem.seq; - } - -+ ctx->use_path_cache = true; -+ -+ /* -+ * We may have previously determined that the current leaf is shared. -+ * If it is, then we have a data extent that is shared due to a shared -+ * subtree (caused by snapshotting) and we don't need to check for data -+ * backrefs. If the leaf is not shared, then we must do backref walking -+ * to determine if the data extent is shared through reflinks. -+ */ -+ leaf_cached = lookup_backref_shared_cache(ctx, root, -+ ctx->curr_leaf_bytenr, 0, -+ &leaf_is_shared); -+ if (leaf_cached && leaf_is_shared) { -+ ret = 1; -+ goto out_trans; -+ } -+ - walk_ctx.ignore_extent_item_pos = true; - walk_ctx.trans = trans; - walk_ctx.fs_info = fs_info; -@@ -1893,7 +1920,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, - /* -1 means we are in the bytenr of the data extent. */ - level = -1; - ULIST_ITER_INIT(&uiter); -- ctx->use_path_cache = true; - while (1) { - bool is_shared; - bool cached; -@@ -1964,6 +1990,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, - ctx->prev_extents_cache_slot = slot; - } - -+out_trans: - if (trans) { - btrfs_put_tree_mod_seq(fs_info, &elem); - btrfs_end_transaction(trans); -diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c -index 8affc88b0e0a..d8b90f95b157 100644 ---- a/fs/btrfs/bio.c -+++ b/fs/btrfs/bio.c -@@ -14,19 +14,31 @@ - #include "dev-replace.h" - #include "rcu-string.h" - #include "zoned.h" -+#include "file-item.h" - - static struct bio_set btrfs_bioset; -+static struct bio_set btrfs_clone_bioset; -+static struct bio_set btrfs_repair_bioset; -+static mempool_t btrfs_failed_bio_pool; -+ -+struct btrfs_failed_bio { -+ struct btrfs_bio *bbio; -+ int num_copies; -+ atomic_t repair_count; -+}; - - /* - * Initialize a btrfs_bio structure. This skips the embedded bio itself as it - * is already initialized by the block layer. 
- */ --static inline void btrfs_bio_init(struct btrfs_bio *bbio, -- btrfs_bio_end_io_t end_io, void *private) -+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, -+ btrfs_bio_end_io_t end_io, void *private) - { - memset(bbio, 0, offsetof(struct btrfs_bio, bio)); -+ bbio->inode = inode; - bbio->end_io = end_io; - bbio->private = private; -+ atomic_set(&bbio->pending_ios, 1); - } - - /* -@@ -37,32 +49,235 @@ static inline void btrfs_bio_init(struct btrfs_bio *bbio, - * a mempool. - */ - struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, -+ struct btrfs_inode *inode, - btrfs_bio_end_io_t end_io, void *private) - { - struct bio *bio; - - bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); -- btrfs_bio_init(btrfs_bio(bio), end_io, private); -+ btrfs_bio_init(btrfs_bio(bio), inode, end_io, private); - return bio; - } - --struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, -- btrfs_bio_end_io_t end_io, void *private) -+static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, -+ struct bio *orig, u64 map_length, -+ bool use_append) - { -+ struct btrfs_bio *orig_bbio = btrfs_bio(orig); - struct bio *bio; -- struct btrfs_bio *bbio; - -- ASSERT(offset <= UINT_MAX && size <= UINT_MAX); -+ if (use_append) { -+ unsigned int nr_segs; -+ -+ bio = bio_split_rw(orig, &fs_info->limits, &nr_segs, -+ &btrfs_clone_bioset, map_length); -+ } else { -+ bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS, -+ &btrfs_clone_bioset); -+ } -+ btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio); - -- bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); -- bbio = btrfs_bio(bio); -- btrfs_bio_init(bbio, end_io, private); -+ btrfs_bio(bio)->file_offset = orig_bbio->file_offset; -+ if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED)) -+ orig_bbio->file_offset += map_length; - -- bio_trim(bio, offset >> 9, size >> 9); -- bbio->iter = bio->bi_iter; -+ atomic_inc(&orig_bbio->pending_ios); - return bio; - } - -+static void btrfs_orig_write_end_io(struct bio *bio); -+ -+static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, -+ struct btrfs_bio *orig_bbio) -+{ -+ /* -+ * For writes we tolerate nr_mirrors - 1 write failures, so we can't -+ * just blindly propagate a write failure here. Instead increment the -+ * error count in the original I/O context so that it is guaranteed to -+ * be larger than the error tolerance. 
-+ */ -+ if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { -+ struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; -+ struct btrfs_io_context *orig_bioc = orig_stripe->bioc; -+ -+ atomic_add(orig_bioc->max_errors, &orig_bioc->error); -+ } else { -+ orig_bbio->bio.bi_status = bbio->bio.bi_status; -+ } -+} -+ -+static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) -+{ -+ if (bbio->bio.bi_pool == &btrfs_clone_bioset) { -+ struct btrfs_bio *orig_bbio = bbio->private; -+ -+ if (bbio->bio.bi_status) -+ btrfs_bbio_propagate_error(bbio, orig_bbio); -+ bio_put(&bbio->bio); -+ bbio = orig_bbio; -+ } -+ -+ if (atomic_dec_and_test(&bbio->pending_ios)) -+ bbio->end_io(bbio); -+} -+ -+static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) -+{ -+ if (cur_mirror == fbio->num_copies) -+ return cur_mirror + 1 - fbio->num_copies; -+ return cur_mirror + 1; -+} -+ -+static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) -+{ -+ if (cur_mirror == 1) -+ return fbio->num_copies; -+ return cur_mirror - 1; -+} -+ -+static void btrfs_repair_done(struct btrfs_failed_bio *fbio) -+{ -+ if (atomic_dec_and_test(&fbio->repair_count)) { -+ btrfs_orig_bbio_end_io(fbio->bbio); -+ mempool_free(fbio, &btrfs_failed_bio_pool); -+ } -+} -+ -+static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, -+ struct btrfs_device *dev) -+{ -+ struct btrfs_failed_bio *fbio = repair_bbio->private; -+ struct btrfs_inode *inode = repair_bbio->inode; -+ struct btrfs_fs_info *fs_info = inode->root->fs_info; -+ struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); -+ int mirror = repair_bbio->mirror_num; -+ -+ if (repair_bbio->bio.bi_status || -+ !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { -+ bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); -+ repair_bbio->bio.bi_iter = repair_bbio->saved_iter; -+ -+ mirror = next_repair_mirror(fbio, mirror); -+ if (mirror == fbio->bbio->mirror_num) { -+ btrfs_debug(fs_info, "no mirror left"); -+ fbio->bbio->bio.bi_status = BLK_STS_IOERR; -+ goto done; -+ } -+ -+ btrfs_submit_bio(&repair_bbio->bio, mirror); -+ return; -+ } -+ -+ do { -+ mirror = prev_repair_mirror(fbio, mirror); -+ btrfs_repair_io_failure(fs_info, btrfs_ino(inode), -+ repair_bbio->file_offset, fs_info->sectorsize, -+ repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, -+ bv->bv_page, bv->bv_offset, mirror); -+ } while (mirror != fbio->bbio->mirror_num); -+ -+done: -+ btrfs_repair_done(fbio); -+ bio_put(&repair_bbio->bio); -+} -+ -+/* -+ * Try to kick off a repair read to the next available mirror for a bad sector. -+ * -+ * This primarily tries to recover good data to serve the actual read request, -+ * but also tries to write the good data back to the bad mirror(s) when a -+ * read succeeded to restore the redundancy. 
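The repair path above walks the available mirrors with next_repair_mirror()/prev_repair_mirror(). The standalone sketch below reuses that 1-based wrap-around arithmetic to show, for an invented three-copy setup, which mirrors the repair reads try and which mirrors the recovered data is written back to (the mirror numbers are made up for the demo):

#include <stdio.h>

/* Same 1-based rotation as next_repair_mirror()/prev_repair_mirror() above. */
static int next_mirror(int num_copies, int cur)
{
	if (cur == num_copies)
		return cur + 1 - num_copies;	/* wrap around to mirror 1 */
	return cur + 1;
}

static int prev_mirror(int num_copies, int cur)
{
	if (cur == 1)
		return num_copies;
	return cur - 1;
}

int main(void)
{
	int num_copies = 3;	/* three copies: mirrors 1, 2 and 3 */
	int failed = 2;		/* mirror the original read came from */
	int good = 1;		/* assume mirror 3 also failed, mirror 1 was good */
	int m;

	printf("repair reads are tried in this order:");
	for (m = next_mirror(num_copies, failed); m != failed;
	     m = next_mirror(num_copies, m))
		printf(" %d", m);
	printf("\n");

	/*
	 * Once a good copy is found it is written back to every mirror tried
	 * before it, walking backwards until the original mirror has been
	 * rewritten (the same do/while shape as btrfs_end_repair_bio()).
	 */
	printf("good data from mirror %d is written back to mirrors:", good);
	m = good;
	do {
		m = prev_mirror(num_copies, m);
		printf(" %d", m);
	} while (m != failed);
	printf("\n");
	return 0;
}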
-+ */ -+static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, -+ u32 bio_offset, -+ struct bio_vec *bv, -+ struct btrfs_failed_bio *fbio) -+{ -+ struct btrfs_inode *inode = failed_bbio->inode; -+ struct btrfs_fs_info *fs_info = inode->root->fs_info; -+ const u32 sectorsize = fs_info->sectorsize; -+ const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); -+ struct btrfs_bio *repair_bbio; -+ struct bio *repair_bio; -+ int num_copies; -+ int mirror; -+ -+ btrfs_debug(fs_info, "repair read error: read error at %llu", -+ failed_bbio->file_offset + bio_offset); -+ -+ num_copies = btrfs_num_copies(fs_info, logical, sectorsize); -+ if (num_copies == 1) { -+ btrfs_debug(fs_info, "no copy to repair from"); -+ failed_bbio->bio.bi_status = BLK_STS_IOERR; -+ return fbio; -+ } -+ -+ if (!fbio) { -+ fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS); -+ fbio->bbio = failed_bbio; -+ fbio->num_copies = num_copies; -+ atomic_set(&fbio->repair_count, 1); -+ } -+ -+ atomic_inc(&fbio->repair_count); -+ -+ repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, -+ &btrfs_repair_bioset); -+ repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; -+ bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); -+ -+ repair_bbio = btrfs_bio(repair_bio); -+ btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio); -+ repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; -+ -+ mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); -+ btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); -+ btrfs_submit_bio(repair_bio, mirror); -+ return fbio; -+} -+ -+static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev) -+{ -+ struct btrfs_inode *inode = bbio->inode; -+ struct btrfs_fs_info *fs_info = inode->root->fs_info; -+ u32 sectorsize = fs_info->sectorsize; -+ struct bvec_iter *iter = &bbio->saved_iter; -+ blk_status_t status = bbio->bio.bi_status; -+ struct btrfs_failed_bio *fbio = NULL; -+ u32 offset = 0; -+ -+ /* -+ * Hand off repair bios to the repair code as there is no upper level -+ * submitter for them. -+ */ -+ if (bbio->bio.bi_pool == &btrfs_repair_bioset) { -+ btrfs_end_repair_bio(bbio, dev); -+ return; -+ } -+ -+ /* Clear the I/O error. A failed repair will reset it. */ -+ bbio->bio.bi_status = BLK_STS_OK; -+ -+ while (iter->bi_size) { -+ struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter); -+ -+ bv.bv_len = min(bv.bv_len, sectorsize); -+ if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv)) -+ fbio = repair_one_sector(bbio, offset, &bv, fbio); -+ -+ bio_advance_iter_single(&bbio->bio, iter, sectorsize); -+ offset += sectorsize; -+ } -+ -+ if (bbio->csum != bbio->csum_inline) -+ kfree(bbio->csum); -+ -+ if (fbio) -+ btrfs_repair_done(fbio); -+ else -+ btrfs_orig_bbio_end_io(bbio); -+} -+ - static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) - { - if (!dev || !dev->bdev) -@@ -90,24 +305,31 @@ static void btrfs_end_bio_work(struct work_struct *work) - { - struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); - -- bbio->end_io(bbio); -+ /* Metadata reads are checked and repaired by the submitter. 
*/ -+ if (bbio->bio.bi_opf & REQ_META) -+ bbio->end_io(bbio); -+ else -+ btrfs_check_read_bio(bbio, bbio->bio.bi_private); - } - - static void btrfs_simple_end_io(struct bio *bio) - { -- struct btrfs_fs_info *fs_info = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); -+ struct btrfs_device *dev = bio->bi_private; -+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; - - btrfs_bio_counter_dec(fs_info); - - if (bio->bi_status) -- btrfs_log_dev_io_error(bio, bbio->device); -+ btrfs_log_dev_io_error(bio, dev); - - if (bio_op(bio) == REQ_OP_READ) { - INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); - queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); - } else { -- bbio->end_io(bbio); -+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) -+ btrfs_record_physical_zoned(bbio); -+ btrfs_orig_bbio_end_io(bbio); - } - } - -@@ -118,7 +340,10 @@ static void btrfs_raid56_end_io(struct bio *bio) - - btrfs_bio_counter_dec(bioc->fs_info); - bbio->mirror_num = bioc->mirror_num; -- bbio->end_io(bbio); -+ if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META)) -+ btrfs_check_read_bio(bbio, NULL); -+ else -+ btrfs_orig_bbio_end_io(bbio); - - btrfs_put_bioc(bioc); - } -@@ -145,7 +370,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) - else - bio->bi_status = BLK_STS_OK; - -- bbio->end_io(bbio); -+ btrfs_orig_bbio_end_io(bbio); - btrfs_put_bioc(bioc); - } - -@@ -181,16 +406,10 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) - */ - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; -+ u64 zone_start = round_down(physical, dev->fs_info->zone_size); - -- if (btrfs_dev_is_sequential(dev, physical)) { -- u64 zone_start = round_down(physical, -- dev->fs_info->zone_size); -- -- bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; -- } else { -- bio->bi_opf &= ~REQ_OP_ZONE_APPEND; -- bio->bi_opf |= REQ_OP_WRITE; -- } -+ ASSERT(btrfs_dev_is_sequential(dev, physical)); -+ bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; - } - btrfs_debug_in_rcu(dev->fs_info, - "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", -@@ -224,41 +443,21 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) - btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); - } - --void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) -+static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, -+ struct btrfs_io_stripe *smap, int mirror_num) - { -- u64 logical = bio->bi_iter.bi_sector << 9; -- u64 length = bio->bi_iter.bi_size; -- u64 map_length = length; -- struct btrfs_io_context *bioc = NULL; -- struct btrfs_io_stripe smap; -- int ret; -- -- btrfs_bio_counter_inc_blocked(fs_info); -- ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, -- &bioc, &smap, &mirror_num, 1); -- if (ret) { -- btrfs_bio_counter_dec(fs_info); -- btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); -- return; -- } -- -- if (map_length < length) { -- btrfs_crit(fs_info, -- "mapping failed logical %llu bio len %llu len %llu", -- logical, length, map_length); -- BUG(); -- } -+ /* Do not leak our private flag into the block layer. */ -+ bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED; - - if (!bioc) { -- /* Single mirror read/write fast path */ -+ /* Single mirror read/write fast path. 
*/ - btrfs_bio(bio)->mirror_num = mirror_num; -- btrfs_bio(bio)->device = smap.dev; -- bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; -- bio->bi_private = fs_info; -+ bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; -+ bio->bi_private = smap->dev; - bio->bi_end_io = btrfs_simple_end_io; -- btrfs_submit_dev_bio(smap.dev, bio); -+ btrfs_submit_dev_bio(smap->dev, bio); - } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { -- /* Parity RAID write or read recovery */ -+ /* Parity RAID write or read recovery. */ - bio->bi_private = bioc; - bio->bi_end_io = btrfs_raid56_end_io; - if (bio_op(bio) == REQ_OP_READ) -@@ -266,16 +465,233 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror - else - raid56_parity_write(bio, bioc); - } else { -- /* Write to multiple mirrors */ -+ /* Write to multiple mirrors. */ - int total_devs = bioc->num_stripes; -- int dev_nr; - - bioc->orig_bio = bio; -- for (dev_nr = 0; dev_nr < total_devs; dev_nr++) -+ for (int dev_nr = 0; dev_nr < total_devs; dev_nr++) - btrfs_submit_mirrored_bio(bioc, dev_nr); - } - } - -+static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) -+{ -+ if (bbio->bio.bi_opf & REQ_META) -+ return btree_csum_one_bio(bbio); -+ return btrfs_csum_one_bio(bbio); -+} -+ -+/* -+ * Async submit bios are used to offload expensive checksumming onto the worker -+ * threads. -+ */ -+struct async_submit_bio { -+ struct btrfs_bio *bbio; -+ struct btrfs_io_context *bioc; -+ struct btrfs_io_stripe smap; -+ int mirror_num; -+ struct btrfs_work work; -+}; -+ -+/* -+ * In order to insert checksums into the metadata in large chunks, we wait -+ * until bio submission time. All the pages in the bio are checksummed and -+ * sums are attached onto the ordered extent record. -+ * -+ * At IO completion time the csums attached on the ordered extent record are -+ * inserted into the btree. -+ */ -+static void run_one_async_start(struct btrfs_work *work) -+{ -+ struct async_submit_bio *async = -+ container_of(work, struct async_submit_bio, work); -+ blk_status_t ret; -+ -+ ret = btrfs_bio_csum(async->bbio); -+ if (ret) -+ async->bbio->bio.bi_status = ret; -+} -+ -+/* -+ * In order to insert checksums into the metadata in large chunks, we wait -+ * until bio submission time. All the pages in the bio are checksummed and -+ * sums are attached onto the ordered extent record. -+ * -+ * At IO completion time the csums attached on the ordered extent record are -+ * inserted into the tree. -+ */ -+static void run_one_async_done(struct btrfs_work *work) -+{ -+ struct async_submit_bio *async = -+ container_of(work, struct async_submit_bio, work); -+ struct bio *bio = &async->bbio->bio; -+ -+ /* If an error occurred we just want to clean up the bio and move on. */ -+ if (bio->bi_status) { -+ btrfs_orig_bbio_end_io(async->bbio); -+ return; -+ } -+ -+ /* -+ * All of the bios that pass through here are from async helpers. -+ * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. -+ * This changes nothing when cgroups aren't in use. -+ */ -+ bio->bi_opf |= REQ_CGROUP_PUNT; -+ __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); -+} -+ -+static void run_one_async_free(struct btrfs_work *work) -+{ -+ kfree(container_of(work, struct async_submit_bio, work)); -+} -+ -+static bool should_async_write(struct btrfs_bio *bbio) -+{ -+ /* -+ * If the I/O is not issued by fsync and friends, (->sync_writers != 0), -+ * then try to defer the submission to a workqueue to parallelize the -+ * checksum calculation. 
-+ */ -+ if (atomic_read(&bbio->inode->sync_writers)) -+ return false; -+ -+ /* -+ * Submit metadata writes synchronously if the checksum implementation -+ * is fast, or we are on a zoned device that wants I/O to be submitted -+ * in order. -+ */ -+ if (bbio->bio.bi_opf & REQ_META) { -+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; -+ -+ if (btrfs_is_zoned(fs_info)) -+ return false; -+ if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) -+ return false; -+ } -+ -+ return true; -+} -+ -+/* -+ * Submit bio to an async queue. -+ * -+ * Return true if the work has been succesfuly submitted, else false. -+ */ -+static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, -+ struct btrfs_io_context *bioc, -+ struct btrfs_io_stripe *smap, int mirror_num) -+{ -+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; -+ struct async_submit_bio *async; -+ -+ async = kmalloc(sizeof(*async), GFP_NOFS); -+ if (!async) -+ return false; -+ -+ async->bbio = bbio; -+ async->bioc = bioc; -+ async->smap = *smap; -+ async->mirror_num = mirror_num; -+ -+ btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, -+ run_one_async_free); -+ if (op_is_sync(bbio->bio.bi_opf)) -+ btrfs_queue_work(fs_info->hipri_workers, &async->work); -+ else -+ btrfs_queue_work(fs_info->workers, &async->work); -+ return true; -+} -+ -+static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) -+{ -+ struct btrfs_bio *bbio = btrfs_bio(bio); -+ struct btrfs_inode *inode = bbio->inode; -+ struct btrfs_fs_info *fs_info = inode->root->fs_info; -+ struct btrfs_bio *orig_bbio = bbio; -+ u64 logical = bio->bi_iter.bi_sector << 9; -+ u64 length = bio->bi_iter.bi_size; -+ u64 map_length = length; -+ bool use_append = btrfs_use_zone_append(bbio); -+ struct btrfs_io_context *bioc = NULL; -+ struct btrfs_io_stripe smap; -+ blk_status_t ret; -+ int error; -+ -+ btrfs_bio_counter_inc_blocked(fs_info); -+ error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, -+ &bioc, &smap, &mirror_num, 1); -+ if (error) { -+ ret = errno_to_blk_status(error); -+ goto fail; -+ } -+ -+ map_length = min(map_length, length); -+ if (use_append) -+ map_length = min(map_length, fs_info->max_zone_append_size); -+ -+ if (map_length < length) { -+ bio = btrfs_split_bio(fs_info, bio, map_length, use_append); -+ bbio = btrfs_bio(bio); -+ } -+ -+ /* -+ * Save the iter for the end_io handler and preload the checksums for -+ * data reads. -+ */ -+ if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) { -+ bbio->saved_iter = bio->bi_iter; -+ ret = btrfs_lookup_bio_sums(bbio); -+ if (ret) -+ goto fail_put_bio; -+ } -+ -+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) { -+ if (use_append) { -+ bio->bi_opf &= ~REQ_OP_WRITE; -+ bio->bi_opf |= REQ_OP_ZONE_APPEND; -+ ret = btrfs_extract_ordered_extent(btrfs_bio(bio)); -+ if (ret) -+ goto fail_put_bio; -+ } -+ -+ /* -+ * Csum items for reloc roots have already been cloned at this -+ * point, so they are handled as part of the no-checksum case. 
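should_async_write() above encodes a small policy: writes issued by fsync-style callers stay synchronous, and metadata writes also stay synchronous on zoned filesystems or when the checksum implementation is fast. The sketch below restates that policy outside the kernel, collapsing the zoned and fast-checksum checks into a single flag (all names and inputs here are illustrative, not kernel API):

#include <stdbool.h>
#include <stdio.h>

/*
 * Restatement of the should_async_write() policy above; the three flags stand
 * in for ->sync_writers, REQ_META and the zoned/fast-checksum checks.
 */
static bool defer_to_worker(bool sync_writers, bool is_meta,
			    bool zoned_or_fast_csum)
{
	/* fsync and friends want their I/O submitted right away */
	if (sync_writers)
		return false;
	/* metadata stays synchronous on zoned devices or fast csum code */
	if (is_meta && zoned_or_fast_csum)
		return false;
	return true;
}

static const char *str(bool defer)
{
	return defer ? "checksum on worker" : "checksum inline";
}

int main(void)
{
	printf("plain data write:             %s\n", str(defer_to_worker(false, false, false)));
	printf("data write from fsync path:   %s\n", str(defer_to_worker(true,  false, false)));
	printf("metadata, fast csum or zoned: %s\n", str(defer_to_worker(false, true,  true)));
	printf("metadata, slow csum, no zone: %s\n", str(defer_to_worker(false, true,  false)));
	return 0;
}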
-+ */ -+ if (!(inode->flags & BTRFS_INODE_NODATASUM) && -+ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && -+ !btrfs_is_data_reloc_root(inode->root)) { -+ if (should_async_write(bbio) && -+ btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) -+ goto done; -+ -+ ret = btrfs_bio_csum(bbio); -+ if (ret) -+ goto fail_put_bio; -+ } -+ } -+ -+ __btrfs_submit_bio(bio, bioc, &smap, mirror_num); -+done: -+ return map_length == length; -+ -+fail_put_bio: -+ if (map_length < length) -+ bio_put(bio); -+fail: -+ btrfs_bio_counter_dec(fs_info); -+ btrfs_bio_end_io(orig_bbio, ret); -+ /* Do not submit another chunk */ -+ return true; -+} -+ -+void btrfs_submit_bio(struct bio *bio, int mirror_num) -+{ -+ while (!btrfs_submit_chunk(bio, mirror_num)) -+ ; -+} -+ - /* - * Submit a repair write. - * -@@ -283,7 +699,7 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror - * RAID setup. Here we only want to write the one bad copy, so we do the - * mapping ourselves and submit the bio directly. - * -- * The I/O is issued sychronously to block the repair read completion from -+ * The I/O is issued synchronously to block the repair read completion from - * freeing the bio. - */ - int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, -@@ -381,10 +797,31 @@ int __init btrfs_bioset_init(void) - offsetof(struct btrfs_bio, bio), - BIOSET_NEED_BVECS)) - return -ENOMEM; -+ if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, -+ offsetof(struct btrfs_bio, bio), 0)) -+ goto out_free_bioset; -+ if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, -+ offsetof(struct btrfs_bio, bio), -+ BIOSET_NEED_BVECS)) -+ goto out_free_clone_bioset; -+ if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, -+ sizeof(struct btrfs_failed_bio))) -+ goto out_free_repair_bioset; - return 0; -+ -+out_free_repair_bioset: -+ bioset_exit(&btrfs_repair_bioset); -+out_free_clone_bioset: -+ bioset_exit(&btrfs_clone_bioset); -+out_free_bioset: -+ bioset_exit(&btrfs_bioset); -+ return -ENOMEM; - } - - void __cold btrfs_bioset_exit(void) - { -+ mempool_exit(&btrfs_failed_bio_pool); -+ bioset_exit(&btrfs_repair_bioset); -+ bioset_exit(&btrfs_clone_bioset); - bioset_exit(&btrfs_bioset); - } -diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h -index b12f84b3b341..873ff85817f0 100644 ---- a/fs/btrfs/bio.h -+++ b/fs/btrfs/bio.h -@@ -26,32 +26,23 @@ struct btrfs_fs_info; - typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); - - /* -- * Additional info to pass along bio. -- * -- * Mostly for btrfs specific features like csum and mirror_num. -+ * Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and -+ * passed to btrfs_submit_bio for mapping to the physical devices. - */ - struct btrfs_bio { -- unsigned int mirror_num:7; -- -- /* -- * Extra indicator for metadata bios. -- * For some btrfs bios they use pages without a mapping, thus -- * we can not rely on page->mapping->host to determine if -- * it's a metadata bio. -- */ -- unsigned int is_metadata:1; -- struct bvec_iter iter; -- -- /* for direct I/O */ -+ /* Inode and offset into it that this I/O operates on. */ -+ struct btrfs_inode *inode; - u64 file_offset; - -- /* @device is for stripe IO submission. */ -- struct btrfs_device *device; - union { -- /* For data checksum verification. */ -+ /* -+ * Data checksumming and original I/O information for internal -+ * use in the btrfs_submit_bio machinery. 
-+ */ - struct { - u8 *csum; - u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; -+ struct bvec_iter saved_iter; - }; - - /* For metadata parentness verification. */ -@@ -62,7 +53,9 @@ struct btrfs_bio { - btrfs_bio_end_io_t end_io; - void *private; - -- /* For read end I/O handling */ -+ /* For internal use in read end I/O handling */ -+ unsigned int mirror_num; -+ atomic_t pending_ios; - struct work_struct end_io_work; - - /* -@@ -80,11 +73,11 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) - int __init btrfs_bioset_init(void); - void __cold btrfs_bioset_exit(void); - -+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, -+ btrfs_bio_end_io_t end_io, void *private); - struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, -+ struct btrfs_inode *inode, - btrfs_bio_end_io_t end_io, void *private); --struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, -- btrfs_bio_end_io_t end_io, void *private); -- - - static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) - { -@@ -92,34 +85,10 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) - bbio->end_io(bbio); - } - --static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) --{ -- if (bbio->is_metadata) -- return; -- if (bbio->csum != bbio->csum_inline) { -- kfree(bbio->csum); -- bbio->csum = NULL; -- } --} -+/* Bio only refers to one ordered extent. */ -+#define REQ_BTRFS_ONE_ORDERED REQ_DRV - --/* -- * Iterate through a btrfs_bio (@bbio) on a per-sector basis. -- * -- * bvl - struct bio_vec -- * bbio - struct btrfs_bio -- * iters - struct bvec_iter -- * bio_offset - unsigned int -- */ --#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ -- for ((iter) = (bbio)->iter, (bio_offset) = 0; \ -- (iter).bi_size && \ -- (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ -- (bio_offset) += fs_info->sectorsize, \ -- bio_advance_iter_single(&(bbio)->bio, &(iter), \ -- (fs_info)->sectorsize)) -- --void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, -- int mirror_num); -+void btrfs_submit_bio(struct bio *bio, int mirror_num); - int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num); -diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c -index 708d843daa72..80c73137e322 100644 ---- a/fs/btrfs/block-group.c -+++ b/fs/btrfs/block-group.c -@@ -1,5 +1,6 @@ - // SPDX-License-Identifier: GPL-2.0 - -+#include - #include - #include "misc.h" - #include "ctree.h" -@@ -539,6 +540,153 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end - return total_added; - } - -+/* -+ * Get an arbitrary extent item index / max_index through the block group -+ * -+ * @block_group the block group to sample from -+ * @index: the integral step through the block group to grab from -+ * @max_index: the granularity of the sampling -+ * @key: return value parameter for the item we find -+ * -+ * Pre-conditions on indices: -+ * 0 <= index <= max_index -+ * 0 < max_index -+ * -+ * Returns: 0 on success, 1 if the search didn't yield a useful item, negative -+ * error code on error. 
-+ */ -+static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, -+ struct btrfs_block_group *block_group, -+ int index, int max_index, -+ struct btrfs_key *key) -+{ -+ struct btrfs_fs_info *fs_info = block_group->fs_info; -+ struct btrfs_root *extent_root; -+ int ret = 0; -+ u64 search_offset; -+ u64 search_end = block_group->start + block_group->length; -+ struct btrfs_path *path; -+ -+ ASSERT(index >= 0); -+ ASSERT(index <= max_index); -+ ASSERT(max_index > 0); -+ lockdep_assert_held(&caching_ctl->mutex); -+ lockdep_assert_held_read(&fs_info->commit_root_sem); -+ -+ path = btrfs_alloc_path(); -+ if (!path) -+ return -ENOMEM; -+ -+ extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, -+ BTRFS_SUPER_INFO_OFFSET)); -+ -+ path->skip_locking = 1; -+ path->search_commit_root = 1; -+ path->reada = READA_FORWARD; -+ -+ search_offset = index * div_u64(block_group->length, max_index); -+ key->objectid = block_group->start + search_offset; -+ key->type = BTRFS_EXTENT_ITEM_KEY; -+ key->offset = 0; -+ -+ while (1) { -+ ret = btrfs_search_forward(extent_root, key, path, 0); -+ if (ret != 0) -+ goto out; -+ /* Success; sampled an extent item in the block group */ -+ if (key->type == BTRFS_EXTENT_ITEM_KEY && -+ key->objectid >= block_group->start && -+ key->objectid + key->offset <= search_end) -+ goto out; -+ -+ /* We can't possibly find a valid extent item anymore */ -+ if (key->objectid >= search_end) { -+ ret = 1; -+ break; -+ } -+ if (key->type < BTRFS_EXTENT_ITEM_KEY) -+ key->type = BTRFS_EXTENT_ITEM_KEY; -+ else -+ key->objectid++; -+ btrfs_release_path(path); -+ up_read(&fs_info->commit_root_sem); -+ mutex_unlock(&caching_ctl->mutex); -+ cond_resched(); -+ mutex_lock(&caching_ctl->mutex); -+ down_read(&fs_info->commit_root_sem); -+ } -+out: -+ lockdep_assert_held(&caching_ctl->mutex); -+ lockdep_assert_held_read(&fs_info->commit_root_sem); -+ btrfs_free_path(path); -+ return ret; -+} -+ -+/* -+ * Best effort attempt to compute a block group's size class while caching it. -+ * -+ * @block_group: the block group we are caching -+ * -+ * We cannot infer the size class while adding free space extents, because that -+ * logic doesn't care about contiguous file extents (it doesn't differentiate -+ * between a 100M extent and 100 contiguous 1M extents). So we need to read the -+ * file extent items. Reading all of them is quite wasteful, because usually -+ * only a handful are enough to give a good answer. Therefore, we just grab 5 of -+ * them at even steps through the block group and pick the smallest size class -+ * we see. Since size class is best effort, and not guaranteed in general, -+ * inaccuracy is acceptable. -+ * -+ * To be more explicit about why this algorithm makes sense: -+ * -+ * If we are caching in a block group from disk, then there are three major cases -+ * to consider: -+ * 1. the block group is well behaved and all extents in it are the same size -+ * class. -+ * 2. the block group is mostly one size class with rare exceptions for last -+ * ditch allocations -+ * 3. the block group was populated before size classes and can have a totally -+ * arbitrary mix of size classes. -+ * -+ * In case 1, looking at any extent in the block group will yield the correct -+ * result. For the mixed cases, taking the minimum size class seems like a good -+ * approximation, since gaps from frees will be usable to the size class. For -+ * 2., a small handful of file extents is likely to yield the right answer. 
For -+ * 3, we can either read every file extent, or admit that this is best effort -+ * anyway and try to stay fast. -+ * -+ * Returns: 0 on success, negative error code on error. -+ */ -+static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl, -+ struct btrfs_block_group *block_group) -+{ -+ struct btrfs_key key; -+ int i; -+ u64 min_size = block_group->length; -+ enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; -+ int ret; -+ -+ if (!btrfs_block_group_should_use_size_class(block_group)) -+ return 0; -+ -+ for (i = 0; i < 5; ++i) { -+ ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); -+ if (ret < 0) -+ goto out; -+ if (ret > 0) -+ continue; -+ min_size = min_t(u64, min_size, key.offset); -+ size_class = btrfs_calc_block_group_size_class(min_size); -+ } -+ if (size_class != BTRFS_BG_SZ_NONE) { -+ spin_lock(&block_group->lock); -+ block_group->size_class = size_class; -+ spin_unlock(&block_group->lock); -+ } -+ -+out: -+ return ret; -+} -+ - static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) - { - struct btrfs_block_group *block_group = caching_ctl->block_group; -@@ -683,6 +831,7 @@ static noinline void caching_thread(struct btrfs_work *work) - mutex_lock(&caching_ctl->mutex); - down_read(&fs_info->commit_root_sem); - -+ load_block_group_size_class(caching_ctl, block_group); - if (btrfs_test_opt(fs_info, SPACE_CACHE)) { - ret = load_free_space_cache(block_group); - if (ret == 1) { -@@ -1687,7 +1836,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) - - btrfs_info(fs_info, - "reclaiming chunk %llu with %llu%% used %llu%% unusable", -- bg->start, div_u64(bg->used * 100, bg->length), -+ bg->start, -+ div64_u64(bg->used * 100, bg->length), - div64_u64(zone_unusable * 100, bg->length)); - trace_btrfs_reclaim_block_group(bg); - ret = btrfs_relocate_chunk(fs_info, bg->start); -@@ -1816,7 +1966,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) - * - * @fs_info: the filesystem - * @chunk_start: logical address of block group -- * @bdev: physical device to resolve, can be NULL to indicate any device - * @physical: physical address to map to logical addresses - * @logical: return array of logical addresses which map to @physical - * @naddrs: length of @logical -@@ -1827,8 +1976,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) - * block copies. 
- */ - int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, -- struct block_device *bdev, u64 physical, u64 **logical, -- int *naddrs, int *stripe_len) -+ u64 physical, u64 **logical, int *naddrs, int *stripe_len) - { - struct extent_map *em; - struct map_lookup *map; -@@ -1868,9 +2016,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - data_stripe_length)) - continue; - -- if (bdev && map->stripes[i].dev->bdev != bdev) -- continue; -- - stripe_nr = physical - map->stripes[i].physical; - stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); - -@@ -1927,7 +2072,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) - - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); -- ret = btrfs_rmap_block(fs_info, cache->start, NULL, -+ ret = btrfs_rmap_block(fs_info, cache->start, - bytenr, &logical, &nr, &stripe_len); - if (ret) - return ret; -@@ -3330,7 +3475,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, - spin_unlock(&info->delalloc_root_lock); - - while (total) { -- bool reclaim; -+ bool reclaim = false; - - cache = btrfs_lookup_block_group(info, bytenr); - if (!cache) { -@@ -3379,6 +3524,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, - cache->space_info->disk_used -= num_bytes * factor; - - reclaim = should_reclaim_block_group(cache, num_bytes); -+ - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - -@@ -3433,32 +3579,42 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, - * reservation and return -EAGAIN, otherwise this function always succeeds. - */ - int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, -- u64 ram_bytes, u64 num_bytes, int delalloc) -+ u64 ram_bytes, u64 num_bytes, int delalloc, -+ bool force_wrong_size_class) - { - struct btrfs_space_info *space_info = cache->space_info; -+ enum btrfs_block_group_size_class size_class; - int ret = 0; - - spin_lock(&space_info->lock); - spin_lock(&cache->lock); - if (cache->ro) { - ret = -EAGAIN; -- } else { -- cache->reserved += num_bytes; -- space_info->bytes_reserved += num_bytes; -- trace_btrfs_space_reservation(cache->fs_info, "space_info", -- space_info->flags, num_bytes, 1); -- btrfs_space_info_update_bytes_may_use(cache->fs_info, -- space_info, -ram_bytes); -- if (delalloc) -- cache->delalloc_bytes += num_bytes; -+ goto out; -+ } - -- /* -- * Compression can use less space than we reserved, so wake -- * tickets if that happens -- */ -- if (num_bytes < ram_bytes) -- btrfs_try_granting_tickets(cache->fs_info, space_info); -+ if (btrfs_block_group_should_use_size_class(cache)) { -+ size_class = btrfs_calc_block_group_size_class(num_bytes); -+ ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); -+ if (ret) -+ goto out; - } -+ cache->reserved += num_bytes; -+ space_info->bytes_reserved += num_bytes; -+ trace_btrfs_space_reservation(cache->fs_info, "space_info", -+ space_info->flags, num_bytes, 1); -+ btrfs_space_info_update_bytes_may_use(cache->fs_info, -+ space_info, -ram_bytes); -+ if (delalloc) -+ cache->delalloc_bytes += num_bytes; -+ -+ /* -+ * Compression can use less space than we reserved, so wake tickets if -+ * that happens. 
-+ */ -+ if (num_bytes < ram_bytes) -+ btrfs_try_granting_tickets(cache->fs_info, space_info); -+out: - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); - return ret; -@@ -4218,3 +4374,73 @@ void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount - bg->swap_extents -= amount; - spin_unlock(&bg->lock); - } -+ -+enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size) -+{ -+ if (size <= SZ_128K) -+ return BTRFS_BG_SZ_SMALL; -+ if (size <= SZ_8M) -+ return BTRFS_BG_SZ_MEDIUM; -+ return BTRFS_BG_SZ_LARGE; -+} -+ -+/* -+ * Handle a block group allocating an extent in a size class -+ * -+ * @bg: The block group we allocated in. -+ * @size_class: The size class of the allocation. -+ * @force_wrong_size_class: Whether we are desperate enough to allow -+ * mismatched size classes. -+ * -+ * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the -+ * case of a race that leads to the wrong size class without -+ * force_wrong_size_class set. -+ * -+ * find_free_extent will skip block groups with a mismatched size class until -+ * it really needs to avoid ENOSPC. In that case it will set -+ * force_wrong_size_class. However, if a block group is newly allocated and -+ * doesn't yet have a size class, then it is possible for two allocations of -+ * different sizes to race and both try to use it. The loser is caught here and -+ * has to retry. -+ */ -+int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, -+ enum btrfs_block_group_size_class size_class, -+ bool force_wrong_size_class) -+{ -+ ASSERT(size_class != BTRFS_BG_SZ_NONE); -+ -+ /* The new allocation is in the right size class, do nothing */ -+ if (bg->size_class == size_class) -+ return 0; -+ /* -+ * The new allocation is in a mismatched size class. -+ * This means one of two things: -+ * -+ * 1. Two tasks in find_free_extent for different size_classes raced -+ * and hit the same empty block_group. Make the loser try again. -+ * 2. A call to find_free_extent got desperate enough to set -+ * 'force_wrong_slab'. Don't change the size_class, but allow the -+ * allocation. -+ */ -+ if (bg->size_class != BTRFS_BG_SZ_NONE) { -+ if (force_wrong_size_class) -+ return 0; -+ return -EAGAIN; -+ } -+ /* -+ * The happy new block group case: the new allocation is the first -+ * one in the block_group so we set size_class. -+ */ -+ bg->size_class = size_class; -+ -+ return 0; -+} -+ -+bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) -+{ -+ if (btrfs_is_zoned(bg->fs_info)) -+ return false; -+ if (!btrfs_is_block_group_data_only(bg)) -+ return false; -+ return true; -+} -diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h -index a02ea76fd6cf..6e4a0b429ac3 100644 ---- a/fs/btrfs/block-group.h -+++ b/fs/btrfs/block-group.h -@@ -12,6 +12,17 @@ enum btrfs_disk_cache_state { - BTRFS_DC_SETUP, - }; - -+enum btrfs_block_group_size_class { -+ /* Unset */ -+ BTRFS_BG_SZ_NONE, -+ /* 0 < size <= 128K */ -+ BTRFS_BG_SZ_SMALL, -+ /* 128K < size <= 8M */ -+ BTRFS_BG_SZ_MEDIUM, -+ /* 8M < size < BG_LENGTH */ -+ BTRFS_BG_SZ_LARGE, -+}; -+ - /* - * This describes the state of the block_group for async discard. 
This is due - * to the two pass nature of it where extent discarding is prioritized over -@@ -233,6 +244,7 @@ struct btrfs_block_group { - struct list_head active_bg_list; - struct work_struct zone_finish_work; - struct extent_buffer *last_eb; -+ enum btrfs_block_group_size_class size_class; - }; - - static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) -@@ -302,7 +314,8 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); - int btrfs_update_block_group(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, bool alloc); - int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, -- u64 ram_bytes, u64 num_bytes, int delalloc); -+ u64 ram_bytes, u64 num_bytes, int delalloc, -+ bool force_wrong_size_class); - void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, - u64 num_bytes, int delalloc); - int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, -@@ -315,8 +328,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); - void btrfs_put_block_group_cache(struct btrfs_fs_info *info); - int btrfs_free_block_groups(struct btrfs_fs_info *info); - int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, -- struct block_device *bdev, u64 physical, u64 **logical, -- int *naddrs, int *stripe_len); -+ u64 physical, u64 **logical, int *naddrs, int *stripe_len); - - static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) - { -@@ -346,4 +358,10 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); - bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg); - void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount); - -+enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); -+int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, -+ enum btrfs_block_group_size_class size_class, -+ bool force_wrong_size_class); -+bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg); -+ - #endif /* BTRFS_BLOCK_GROUP_H */ -diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h -index 195c09e20609..87020aa58121 100644 ---- a/fs/btrfs/btrfs_inode.h -+++ b/fs/btrfs/btrfs_inode.h -@@ -93,12 +93,6 @@ struct btrfs_inode { - /* the io_tree does range state (DIRTY, LOCKED etc) */ - struct extent_io_tree io_tree; - -- /* special utility tree used to record which mirrors have already been -- * tried when checksums fail for a given block -- */ -- struct rb_root io_failure_tree; -- spinlock_t io_failure_lock; -- - /* - * Keep track of where the inode has extent items mapped in order to - * make sure the i_size adjustments are accurate -@@ -411,21 +405,11 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, - #define CSUM_FMT "0x%*phN" - #define CSUM_FMT_VALUE(size, bytes) size, bytes - --void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); --void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, -- int mirror_num, enum btrfs_compression_type compress_type); --void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); --blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); --blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, -- struct bio *bio, -- u64 dio_file_offset); - int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, - u32 pgoff, u8 *csum, const u8 * const csum_expected); --int btrfs_check_data_csum(struct 
btrfs_inode *inode, struct btrfs_bio *bbio, -- u32 bio_offset, struct page *page, u32 pgoff); --unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, -- u32 bio_offset, struct page *page, -- u64 start, u64 end); -+blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio); -+bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, -+ u32 bio_offset, struct bio_vec *bv); - noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, - u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool nowait, bool strict); -@@ -532,6 +516,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, - ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before); - struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, -+ struct btrfs_ordered_extent **ordered_extent, - size_t done_before); - - extern const struct dentry_operations btrfs_dentry_operations; -diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c -index 5122ca79f7ea..f42f31f22d13 100644 ---- a/fs/btrfs/compression.c -+++ b/fs/btrfs/compression.c -@@ -141,12 +141,15 @@ static int compression_decompress(int type, struct list_head *ws, - - static int btrfs_decompress_bio(struct compressed_bio *cb); - --static void finish_compressed_bio_read(struct compressed_bio *cb) -+static void end_compressed_bio_read(struct btrfs_bio *bbio) - { -+ struct compressed_bio *cb = bbio->private; - unsigned int index; - struct page *page; - -- if (cb->status == BLK_STS_OK) -+ if (bbio->bio.bi_status) -+ cb->status = bbio->bio.bi_status; -+ else - cb->status = errno_to_blk_status(btrfs_decompress_bio(cb)); - - /* Release the compressed pages */ -@@ -162,54 +165,6 @@ static void finish_compressed_bio_read(struct compressed_bio *cb) - /* Finally free the cb struct */ - kfree(cb->compressed_pages); - kfree(cb); --} -- --/* -- * Verify the checksums and kick off repair if needed on the uncompressed data -- * before decompressing it into the original bio and freeing the uncompressed -- * pages. 
-- */ --static void end_compressed_bio_read(struct btrfs_bio *bbio) --{ -- struct compressed_bio *cb = bbio->private; -- struct inode *inode = cb->inode; -- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -- struct btrfs_inode *bi = BTRFS_I(inode); -- bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) && -- !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); -- blk_status_t status = bbio->bio.bi_status; -- struct bvec_iter iter; -- struct bio_vec bv; -- u32 offset; -- -- btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { -- u64 start = bbio->file_offset + offset; -- -- if (!status && -- (!csum || !btrfs_check_data_csum(bi, bbio, offset, -- bv.bv_page, bv.bv_offset))) { -- btrfs_clean_io_failure(bi, start, bv.bv_page, -- bv.bv_offset); -- } else { -- int ret; -- -- refcount_inc(&cb->pending_ios); -- ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, -- bv.bv_page, bv.bv_offset, -- true); -- if (ret) { -- refcount_dec(&cb->pending_ios); -- status = errno_to_blk_status(ret); -- } -- } -- } -- -- if (status) -- cb->status = status; -- -- if (refcount_dec_and_test(&cb->pending_ios)) -- finish_compressed_bio_read(cb); -- btrfs_bio_free_csum(bbio); - bio_put(&bbio->bio); - } - -@@ -303,68 +258,12 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) - static void end_compressed_bio_write(struct btrfs_bio *bbio) - { - struct compressed_bio *cb = bbio->private; -- -- if (bbio->bio.bi_status) -- cb->status = bbio->bio.bi_status; -- -- if (refcount_dec_and_test(&cb->pending_ios)) { -- struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); -- -- btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio); -- queue_work(fs_info->compressed_write_workers, &cb->write_end_work); -- } -- bio_put(&bbio->bio); --} -- --/* -- * Allocate a compressed_bio, which will be used to read/write on-disk -- * (aka, compressed) * data. -- * -- * @cb: The compressed_bio structure, which records all the needed -- * information to bind the compressed data to the uncompressed -- * page cache. -- * @disk_byten: The logical bytenr where the compressed data will be read -- * from or written to. -- * @endio_func: The endio function to call after the IO for compressed data -- * is finished. -- * @next_stripe_start: Return value of logical bytenr of where next stripe starts. -- * Let the caller know to only fill the bio up to the stripe -- * boundary. 
-- */ -- -- --static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, -- blk_opf_t opf, -- btrfs_bio_end_io_t endio_func, -- u64 *next_stripe_start) --{ - struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); -- struct btrfs_io_geometry geom; -- struct extent_map *em; -- struct bio *bio; -- int ret; - -- bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb); -- bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; -+ cb->status = bbio->bio.bi_status; -+ queue_work(fs_info->compressed_write_workers, &cb->write_end_work); - -- em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize); -- if (IS_ERR(em)) { -- bio_put(bio); -- return ERR_CAST(em); -- } -- -- if (bio_op(bio) == REQ_OP_ZONE_APPEND) -- bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev); -- -- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom); -- free_extent_map(em); -- if (ret < 0) { -- bio_put(bio); -- return ERR_PTR(ret); -- } -- *next_stripe_start = disk_bytenr + geom.len; -- refcount_inc(&cb->pending_ios); -- return bio; -+ bio_put(&bbio->bio); - } - - /* -@@ -389,18 +288,13 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - struct bio *bio = NULL; - struct compressed_bio *cb; - u64 cur_disk_bytenr = disk_start; -- u64 next_stripe_start; - blk_status_t ret = BLK_STS_OK; -- int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; -- const bool use_append = btrfs_use_zone_append(inode, disk_start); -- const enum req_op bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE; - - ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && - IS_ALIGNED(len, fs_info->sectorsize)); - cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); - if (!cb) - return BLK_STS_RESOURCE; -- refcount_set(&cb->pending_ios, 1); - cb->status = BLK_STS_OK; - cb->inode = &inode->vfs_inode; - cb->start = start; -@@ -411,8 +305,16 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); - cb->nr_pages = nr_pages; - -- if (blkcg_css) -+ if (blkcg_css) { - kthread_associate_blkcg(blkcg_css); -+ write_flags |= REQ_CGROUP_PUNT; -+ } -+ -+ write_flags |= REQ_BTRFS_ONE_ORDERED; -+ bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags, -+ BTRFS_I(cb->inode), end_compressed_bio_write, cb); -+ bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT; -+ btrfs_bio(bio)->file_offset = start; - - while (cur_disk_bytenr < disk_start + compressed_len) { - u64 offset = cur_disk_bytenr - disk_start; -@@ -420,77 +322,30 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - unsigned int real_size; - unsigned int added; - struct page *page = compressed_pages[index]; -- bool submit = false; -- -- /* Allocate new bio if submitted or not yet allocated */ -- if (!bio) { -- bio = alloc_compressed_bio(cb, cur_disk_bytenr, -- bio_op | write_flags, end_compressed_bio_write, -- &next_stripe_start); -- if (IS_ERR(bio)) { -- ret = errno_to_blk_status(PTR_ERR(bio)); -- break; -- } -- if (blkcg_css) -- bio->bi_opf |= REQ_CGROUP_PUNT; -- } -- /* -- * We should never reach next_stripe_start start as we will -- * submit comp_bio when reach the boundary immediately. 
-- */ -- ASSERT(cur_disk_bytenr != next_stripe_start); - - /* - * We have various limits on the real read size: -- * - stripe boundary - * - page boundary - * - compressed length boundary - */ -- real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr); -- real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); -+ real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); - real_size = min_t(u64, real_size, compressed_len - offset); - ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); - -- if (use_append) -- added = bio_add_zone_append_page(bio, page, real_size, -- offset_in_page(offset)); -- else -- added = bio_add_page(bio, page, real_size, -- offset_in_page(offset)); -- /* Reached zoned boundary */ -- if (added == 0) -- submit = true; -- -+ added = bio_add_page(bio, page, real_size, offset_in_page(offset)); -+ /* -+ * Maximum compressed extent is smaller than bio size limit, -+ * thus bio_add_page() should always success. -+ */ -+ ASSERT(added == real_size); - cur_disk_bytenr += added; -- /* Reached stripe boundary */ -- if (cur_disk_bytenr == next_stripe_start) -- submit = true; -- -- /* Finished the range */ -- if (cur_disk_bytenr == disk_start + compressed_len) -- submit = true; -- -- if (submit) { -- if (!skip_sum) { -- ret = btrfs_csum_one_bio(inode, bio, start, true); -- if (ret) { -- btrfs_bio_end_io(btrfs_bio(bio), ret); -- break; -- } -- } -- -- ASSERT(bio->bi_iter.bi_size); -- btrfs_submit_bio(fs_info, bio, 0); -- bio = NULL; -- } -- cond_resched(); - } - -+ /* Finished the range. */ -+ ASSERT(bio->bi_iter.bi_size); -+ btrfs_submit_bio(bio, 0); - if (blkcg_css) - kthread_associate_blkcg(NULL); -- -- if (refcount_dec_and_test(&cb->pending_ios)) -- finish_compressed_bio_write(cb); - return ret; - } - -@@ -667,10 +522,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - struct extent_map_tree *em_tree; - struct compressed_bio *cb; - unsigned int compressed_len; -- struct bio *comp_bio = NULL; -+ struct bio *comp_bio; - const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; - u64 cur_disk_byte = disk_bytenr; -- u64 next_stripe_start; - u64 file_offset; - u64 em_len; - u64 em_start; -@@ -703,7 +557,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - goto out; - } - -- refcount_set(&cb->pending_ios, 1); - cb->status = BLK_STS_OK; - cb->inode = inode; - -@@ -737,37 +590,23 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - /* include any pages we added in add_ra-bio_pages */ - cb->len = bio->bi_iter.bi_size; - -+ comp_bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, BTRFS_I(cb->inode), -+ end_compressed_bio_read, cb); -+ comp_bio->bi_iter.bi_sector = (cur_disk_byte >> SECTOR_SHIFT); -+ - while (cur_disk_byte < disk_bytenr + compressed_len) { - u64 offset = cur_disk_byte - disk_bytenr; - unsigned int index = offset >> PAGE_SHIFT; - unsigned int real_size; - unsigned int added; - struct page *page = cb->compressed_pages[index]; -- bool submit = false; -- -- /* Allocate new bio if submitted or not yet allocated */ -- if (!comp_bio) { -- comp_bio = alloc_compressed_bio(cb, cur_disk_byte, -- REQ_OP_READ, end_compressed_bio_read, -- &next_stripe_start); -- if (IS_ERR(comp_bio)) { -- cb->status = errno_to_blk_status(PTR_ERR(comp_bio)); -- break; -- } -- } -- /* -- * We should never reach next_stripe_start start as we will -- * submit comp_bio when reach the boundary immediately. 
-- */ -- ASSERT(cur_disk_byte != next_stripe_start); -+ - /* - * We have various limit on the real read size: -- * - stripe boundary - * - page boundary - * - compressed length boundary - */ -- real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte); -- real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); -+ real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); - real_size = min_t(u64, real_size, compressed_len - offset); - ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); - -@@ -778,45 +617,20 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - */ - ASSERT(added == real_size); - cur_disk_byte += added; -- -- /* Reached stripe boundary, need to submit */ -- if (cur_disk_byte == next_stripe_start) -- submit = true; -- -- /* Has finished the range, need to submit */ -- if (cur_disk_byte == disk_bytenr + compressed_len) -- submit = true; -- -- if (submit) { -- /* Save the original iter for read repair */ -- if (bio_op(comp_bio) == REQ_OP_READ) -- btrfs_bio(comp_bio)->iter = comp_bio->bi_iter; -- -- /* -- * Save the initial offset of this chunk, as there -- * is no direct correlation between compressed pages and -- * the original file offset. The field is only used for -- * priting error messages. -- */ -- btrfs_bio(comp_bio)->file_offset = file_offset; -- -- ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL); -- if (ret) { -- btrfs_bio_end_io(btrfs_bio(comp_bio), ret); -- break; -- } -- -- ASSERT(comp_bio->bi_iter.bi_size); -- btrfs_submit_bio(fs_info, comp_bio, mirror_num); -- comp_bio = NULL; -- } - } - - if (memstall) - psi_memstall_leave(&pflags); - -- if (refcount_dec_and_test(&cb->pending_ios)) -- finish_compressed_bio_read(cb); -+ /* -+ * Stash the initial offset of this chunk, as there is no direct -+ * correlation between compressed pages and the original file offset. -+ * The field is only used for printing error messages anyway. -+ */ -+ btrfs_bio(comp_bio)->file_offset = file_offset; -+ -+ ASSERT(comp_bio->bi_iter.bi_size); -+ btrfs_submit_bio(comp_bio, mirror_num); - return; - - fail: -@@ -1609,7 +1423,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, - index_end = end >> PAGE_SHIFT; - - /* Don't miss unaligned end */ -- if (!IS_ALIGNED(end, PAGE_SIZE)) -+ if (!PAGE_ALIGNED(end)) - index_end++; - - curr_sample_pos = 0; -@@ -1642,7 +1456,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, - * - * For now is's a naive and optimistic 'return true', we'll extend the logic to - * quickly (compared to direct compression) detect data characteristics -- * (compressible/uncompressible) to avoid wasting CPU time on uncompressible -+ * (compressible/incompressible) to avoid wasting CPU time on incompressible - * data. 
- * - * The following types of analysis can be performed: -diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h -index 6209d40a1e08..a5e3377db9ad 100644 ---- a/fs/btrfs/compression.h -+++ b/fs/btrfs/compression.h -@@ -31,9 +31,6 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); - #define BTRFS_ZLIB_DEFAULT_LEVEL 3 - - struct compressed_bio { -- /* Number of outstanding bios */ -- refcount_t pending_ios; -- - /* Number of compressed pages in the array */ - unsigned int nr_pages; - -diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c -index 4754c9101a4c..a5b6bb54545f 100644 ---- a/fs/btrfs/ctree.c -+++ b/fs/btrfs/ctree.c -@@ -484,7 +484,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, - if (ret) - return ret; - } -- btrfs_clean_tree_block(buf); -+ btrfs_clear_buffer_dirty(trans, buf); - *last_ref = 1; - } - return 0; -@@ -853,8 +853,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, - /* - * Search for a key in the given extent_buffer. - * -- * The lower boundary for the search is specified by the slot number @low. Use a -- * value of 0 to search over the whole extent buffer. -+ * The lower boundary for the search is specified by the slot number @first_slot. -+ * Use a value of 0 to search over the whole extent buffer. - * - * The slot in the extent buffer is returned via @slot. If the key exists in the - * extent buffer, then @slot will point to the slot where the key is, otherwise -@@ -863,18 +863,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, - * Slot may point to the total number of items (i.e. one position beyond the last - * key) if the key is bigger than the last key in the extent buffer. - */ --static noinline int generic_bin_search(struct extent_buffer *eb, int low, -- const struct btrfs_key *key, int *slot) -+int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, -+ const struct btrfs_key *key, int *slot) - { - unsigned long p; - int item_size; -- int high = btrfs_header_nritems(eb); -+ /* -+ * Use unsigned types for the low and high slots, so that we get a more -+ * efficient division in the search loop below. -+ */ -+ u32 low = first_slot; -+ u32 high = btrfs_header_nritems(eb); - int ret; - const int key_size = sizeof(struct btrfs_disk_key); - -- if (low > high) { -+ if (unlikely(low > high)) { - btrfs_err(eb->fs_info, -- "%s: low (%d) > high (%d) eb %llu owner %llu level %d", -+ "%s: low (%u) > high (%u) eb %llu owner %llu level %d", - __func__, low, high, eb->start, - btrfs_header_owner(eb), btrfs_header_level(eb)); - return -EINVAL; -@@ -925,16 +930,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb, int low, - return 1; - } - --/* -- * Simple binary search on an extent buffer. Works for both leaves and nodes, and -- * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). 
-- */ --int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, -- int *slot) --{ -- return generic_bin_search(eb, 0, key, slot); --} -- - static void root_add_used(struct btrfs_root *root, u32 size) - { - spin_lock(&root->accounting_lock); -@@ -1054,7 +1049,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, - - path->locks[level] = 0; - path->nodes[level] = NULL; -- btrfs_clean_tree_block(mid); -+ btrfs_clear_buffer_dirty(trans, mid); - btrfs_tree_unlock(mid); - /* once for the path */ - free_extent_buffer(mid); -@@ -1115,7 +1110,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, - if (wret < 0 && wret != -ENOSPC) - ret = wret; - if (btrfs_header_nritems(right) == 0) { -- btrfs_clean_tree_block(right); -+ btrfs_clear_buffer_dirty(trans, right); - btrfs_tree_unlock(right); - del_ptr(root, path, level + 1, pslot + 1); - root_sub_used(root, right->len); -@@ -1161,7 +1156,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, - BUG_ON(wret == 1); - } - if (btrfs_header_nritems(mid) == 0) { -- btrfs_clean_tree_block(mid); -+ btrfs_clear_buffer_dirty(trans, mid); - btrfs_tree_unlock(mid); - del_ptr(root, path, level + 1, pslot); - root_sub_used(root, mid->len); -@@ -1869,7 +1864,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb, - return 0; - } - -- return generic_bin_search(eb, search_low_slot, key, slot); -+ return btrfs_generic_bin_search(eb, search_low_slot, key, slot); - } - - static int search_leaf(struct btrfs_trans_handle *trans, -@@ -3041,7 +3036,8 @@ noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) - * min slot controls the lowest index we're willing to push to the - * right. We'll push up to and including min_slot, but no lower - */ --static noinline int __push_leaf_right(struct btrfs_path *path, -+static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, -+ struct btrfs_path *path, - int data_size, int empty, - struct extent_buffer *right, - int free_space, u32 left_nritems, -@@ -3139,7 +3135,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, - if (left_nritems) - btrfs_mark_buffer_dirty(left); - else -- btrfs_clean_tree_block(left); -+ btrfs_clear_buffer_dirty(trans, left); - - btrfs_mark_buffer_dirty(right); - -@@ -3151,7 +3147,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, - if (path->slots[0] >= left_nritems) { - path->slots[0] -= left_nritems; - if (btrfs_header_nritems(path->nodes[0]) == 0) -- btrfs_clean_tree_block(path->nodes[0]); -+ btrfs_clear_buffer_dirty(trans, path->nodes[0]); - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; -@@ -3243,8 +3239,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root - return 0; - } - -- return __push_leaf_right(path, min_data_size, empty, -- right, free_space, left_nritems, min_slot); -+ return __push_leaf_right(trans, path, min_data_size, empty, right, -+ free_space, left_nritems, min_slot); - out_unlock: - btrfs_tree_unlock(right); - free_extent_buffer(right); -@@ -3259,7 +3255,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root - * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us do all the - * items - */ --static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, -+static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, -+ struct btrfs_path *path, int data_size, - int empty, struct extent_buffer *left, - int free_space, u32 right_nritems, - u32 max_slot) -@@ -3363,7 +3360,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, - if (right_nritems) - btrfs_mark_buffer_dirty(right); - else -- btrfs_clean_tree_block(right); -+ btrfs_clear_buffer_dirty(trans, right); - - btrfs_item_key(right, &disk_key, 0); - fixup_low_keys(path, &disk_key, 1); -@@ -3449,9 +3446,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root - ret = -EUCLEAN; - goto out; - } -- return __push_leaf_left(path, min_data_size, -- empty, left, free_space, right_nritems, -- max_slot); -+ return __push_leaf_left(trans, path, min_data_size, empty, left, -+ free_space, right_nritems, max_slot); - out: - btrfs_tree_unlock(left); - free_extent_buffer(left); -@@ -4400,7 +4396,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, - if (leaf == root->node) { - btrfs_set_header_level(leaf, 0); - } else { -- btrfs_clean_tree_block(leaf); -+ btrfs_clear_buffer_dirty(trans, leaf); - btrfs_del_leaf(trans, root, path, leaf); - } - } else { -diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h -index 6965703a81b6..97897107fab5 100644 ---- a/fs/btrfs/ctree.h -+++ b/fs/btrfs/ctree.h -@@ -507,6 +507,21 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); - /* ctree.c */ - int __init btrfs_ctree_init(void); - void __cold btrfs_ctree_exit(void); -+ -+int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, -+ const struct btrfs_key *key, int *slot); -+ -+/* -+ * Simple binary search on an extent buffer. Works for both leaves and nodes, and -+ * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). -+ */ -+static inline int btrfs_bin_search(struct extent_buffer *eb, -+ const struct btrfs_key *key, -+ int *slot) -+{ -+ return btrfs_generic_bin_search(eb, 0, key, slot); -+} -+ - int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, - int *slot); - int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); -diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c -index d81b764a7644..8065341d831a 100644 ---- a/fs/btrfs/defrag.c -+++ b/fs/btrfs/defrag.c -@@ -765,7 +765,7 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t i - break; - - unlock_page(page); -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - lock_page(page); - /* -@@ -999,7 +999,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, - } - - #define CLUSTER_SIZE (SZ_256K) --static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); -+static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); - - /* - * Defrag one contiguous target range. 
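[Editor's note, not part of the patch] The ctree.h hunk above turns btrfs_bin_search() into a static inline wrapper around btrfs_generic_bin_search(), which takes a first_slot lower bound and reports the match (or the insertion position) through *slot, with low/high kept as u32 so the midpoint division stays cheap. The stand-alone sketch below shows the same search shape on a plain sorted array of u64 keys; the function name and the direct key comparison are simplifications of the extent-buffer version, which compares struct btrfs_disk_key values read out of the buffer.

/*
 * Illustrative sketch only: lower-bounded binary search over a sorted
 * array, mirroring the first_slot idea of btrfs_generic_bin_search().
 * Returns 0 and sets *slot when the key is found, returns 1 and sets
 * *slot to the insertion position otherwise.
 */
#include <stdint.h>
#include <stdio.h>

static int bin_search_from(const uint64_t *keys, uint32_t nritems,
			   uint32_t first_slot, uint64_t key, uint32_t *slot)
{
	/* Unsigned indices keep (low + high) / 2 a simple shift. */
	uint32_t low = first_slot;
	uint32_t high = nritems;

	while (low < high) {
		uint32_t mid = (low + high) / 2;

		if (keys[mid] < key)
			low = mid + 1;
		else if (keys[mid] > key)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

int main(void)
{
	const uint64_t keys[] = { 2, 4, 8, 16, 32 };
	uint32_t slot;
	int ret = bin_search_from(keys, 5, 1, 16, &slot);

	printf("ret=%d slot=%u\n", ret, (unsigned)slot); /* ret=0 slot=3 */
	return 0;
}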
-diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c -index 573ebab886e2..886ffb232eac 100644 ---- a/fs/btrfs/delayed-ref.c -+++ b/fs/btrfs/delayed-ref.c -@@ -437,8 +437,7 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, - return 0; - } - --static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, -- struct btrfs_delayed_ref_root *delayed_refs, -+static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head, - struct btrfs_delayed_ref_node *ref) - { -@@ -452,8 +451,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, - atomic_dec(&delayed_refs->num_entries); - } - --static bool merge_ref(struct btrfs_trans_handle *trans, -- struct btrfs_delayed_ref_root *delayed_refs, -+static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head, - struct btrfs_delayed_ref_node *ref, - u64 seq) -@@ -482,10 +480,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, - mod = -next->ref_mod; - } - -- drop_delayed_ref(trans, delayed_refs, head, next); -+ drop_delayed_ref(delayed_refs, head, next); - ref->ref_mod += mod; - if (ref->ref_mod == 0) { -- drop_delayed_ref(trans, delayed_refs, head, ref); -+ drop_delayed_ref(delayed_refs, head, ref); - done = true; - } else { - /* -@@ -499,11 +497,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, - return done; - } - --void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, -+void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) - { -- struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_ref_node *ref; - struct rb_node *node; - u64 seq = 0; -@@ -524,7 +521,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, - ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); - if (seq && ref->seq >= seq) - continue; -- if (merge_ref(trans, delayed_refs, head, ref, seq)) -+ if (merge_ref(delayed_refs, head, ref, seq)) - goto again; - } - } -@@ -601,8 +598,7 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, - * Return 0 for insert. - * Return >0 for merge. 
- */ --static int insert_delayed_ref(struct btrfs_trans_handle *trans, -- struct btrfs_delayed_ref_root *root, -+static int insert_delayed_ref(struct btrfs_delayed_ref_root *root, - struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *ref) - { -@@ -641,7 +637,7 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans, - - /* remove existing tail if its ref_mod is zero */ - if (exist->ref_mod == 0) -- drop_delayed_ref(trans, root, href, exist); -+ drop_delayed_ref(root, href, exist); - spin_unlock(&href->lock); - return ret; - inserted: -@@ -978,7 +974,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, - head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted); - -- ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); -+ ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); - spin_unlock(&delayed_refs->lock); - - /* -@@ -1070,7 +1066,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, - head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted); - -- ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); -+ ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); - spin_unlock(&delayed_refs->lock); - - /* -diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h -index d6304b690ec4..2eb34abf700f 100644 ---- a/fs/btrfs/delayed-ref.h -+++ b/fs/btrfs/delayed-ref.h -@@ -357,7 +357,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, - int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, - struct btrfs_delayed_extent_op *extent_op); --void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, -+void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head); - -diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c -index fde40112a259..b53f0e30ce2b 100644 ---- a/fs/btrfs/disk-io.c -+++ b/fs/btrfs/disk-io.c -@@ -78,23 +78,6 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) - crypto_free_shash(fs_info->csum_shash); - } - --/* -- * async submit bios are used to offload expensive checksumming -- * onto the worker threads. They checksum file and metadata bios -- * just before they are sent down the IO stack. -- */ --struct async_submit_bio { -- struct btrfs_inode *inode; -- struct bio *bio; -- enum btrfs_wq_submit_cmd submit_cmd; -- int mirror_num; -- -- /* Optional parameter for used by direct io */ -- u64 dio_file_offset; -- struct btrfs_work work; -- blk_status_t status; --}; -- - /* - * Compute the csum of a btree block and store the result to provided buffer. 
- */ -@@ -455,6 +438,22 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec - return csum_one_extent_buffer(eb); - } - -+blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) -+{ -+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; -+ struct bvec_iter iter; -+ struct bio_vec bv; -+ int ret = 0; -+ -+ bio_for_each_segment(bv, &bbio->bio, iter) { -+ ret = csum_dirty_buffer(fs_info, &bv); -+ if (ret) -+ break; -+ } -+ -+ return errno_to_blk_status(ret); -+} -+ - static int check_tree_block_fsid(struct extent_buffer *eb) - { - struct btrfs_fs_info *fs_info = eb->fs_info; -@@ -700,172 +699,6 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, - return ret; - } - --static void run_one_async_start(struct btrfs_work *work) --{ -- struct async_submit_bio *async; -- blk_status_t ret; -- -- async = container_of(work, struct async_submit_bio, work); -- switch (async->submit_cmd) { -- case WQ_SUBMIT_METADATA: -- ret = btree_submit_bio_start(async->bio); -- break; -- case WQ_SUBMIT_DATA: -- ret = btrfs_submit_bio_start(async->inode, async->bio); -- break; -- case WQ_SUBMIT_DATA_DIO: -- ret = btrfs_submit_bio_start_direct_io(async->inode, -- async->bio, async->dio_file_offset); -- break; -- } -- if (ret) -- async->status = ret; --} -- --/* -- * In order to insert checksums into the metadata in large chunks, we wait -- * until bio submission time. All the pages in the bio are checksummed and -- * sums are attached onto the ordered extent record. -- * -- * At IO completion time the csums attached on the ordered extent record are -- * inserted into the tree. -- */ --static void run_one_async_done(struct btrfs_work *work) --{ -- struct async_submit_bio *async = -- container_of(work, struct async_submit_bio, work); -- struct btrfs_inode *inode = async->inode; -- struct btrfs_bio *bbio = btrfs_bio(async->bio); -- -- /* If an error occurred we just want to clean up the bio and move on */ -- if (async->status) { -- btrfs_bio_end_io(bbio, async->status); -- return; -- } -- -- /* -- * All of the bios that pass through here are from async helpers. -- * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. -- * This changes nothing when cgroups aren't in use. -- */ -- async->bio->bi_opf |= REQ_CGROUP_PUNT; -- btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num); --} -- --static void run_one_async_free(struct btrfs_work *work) --{ -- struct async_submit_bio *async; -- -- async = container_of(work, struct async_submit_bio, work); -- kfree(async); --} -- --/* -- * Submit bio to an async queue. 
-- * -- * Retrun: -- * - true if the work has been succesfuly submitted -- * - false in case of error -- */ --bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, -- u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) --{ -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- struct async_submit_bio *async; -- -- async = kmalloc(sizeof(*async), GFP_NOFS); -- if (!async) -- return false; -- -- async->inode = inode; -- async->bio = bio; -- async->mirror_num = mirror_num; -- async->submit_cmd = cmd; -- -- btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, -- run_one_async_free); -- -- async->dio_file_offset = dio_file_offset; -- -- async->status = 0; -- -- if (op_is_sync(bio->bi_opf)) -- btrfs_queue_work(fs_info->hipri_workers, &async->work); -- else -- btrfs_queue_work(fs_info->workers, &async->work); -- return true; --} -- --static blk_status_t btree_csum_one_bio(struct bio *bio) --{ -- struct bio_vec *bvec; -- struct btrfs_root *root; -- int ret = 0; -- struct bvec_iter_all iter_all; -- -- ASSERT(!bio_flagged(bio, BIO_CLONED)); -- bio_for_each_segment_all(bvec, bio, iter_all) { -- root = BTRFS_I(bvec->bv_page->mapping->host)->root; -- ret = csum_dirty_buffer(root->fs_info, bvec); -- if (ret) -- break; -- } -- -- return errno_to_blk_status(ret); --} -- --blk_status_t btree_submit_bio_start(struct bio *bio) --{ -- /* -- * when we're called for a write, we're already in the async -- * submission context. Just jump into btrfs_submit_bio. -- */ -- return btree_csum_one_bio(bio); --} -- --static bool should_async_write(struct btrfs_fs_info *fs_info, -- struct btrfs_inode *bi) --{ -- if (btrfs_is_zoned(fs_info)) -- return false; -- if (atomic_read(&bi->sync_writers)) -- return false; -- if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) -- return false; -- return true; --} -- --void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) --{ -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- struct btrfs_bio *bbio = btrfs_bio(bio); -- blk_status_t ret; -- -- bio->bi_opf |= REQ_META; -- bbio->is_metadata = 1; -- -- if (btrfs_op(bio) != BTRFS_MAP_WRITE) { -- btrfs_submit_bio(fs_info, bio, mirror_num); -- return; -- } -- -- /* -- * Kthread helpers are used to submit writes so that checksumming can -- * happen in parallel across all CPUs. 
-- */ -- if (should_async_write(fs_info, inode) && -- btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) -- return; -- -- ret = btree_csum_one_bio(bio); -- if (ret) { -- btrfs_bio_end_io(bbio, ret); -- return; -- } -- -- btrfs_submit_bio(fs_info, bio, mirror_num); --} -- - #ifdef CONFIG_MIGRATION - static int btree_migrate_folio(struct address_space *mapping, - struct folio *dst, struct folio *src, enum migrate_mode mode) -@@ -1035,22 +868,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - - } - --void btrfs_clean_tree_block(struct extent_buffer *buf) --{ -- struct btrfs_fs_info *fs_info = buf->fs_info; -- if (btrfs_header_generation(buf) == -- fs_info->running_transaction->transid) { -- btrfs_assert_tree_write_locked(buf); -- -- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { -- percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -- -buf->len, -- fs_info->dirty_metadata_batch); -- clear_extent_buffer_dirty(buf); -- } -- } --} -- - static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, - u64 objectid) - { -@@ -5162,11 +4979,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, - start += fs_info->nodesize; - if (!eb) - continue; -+ -+ btrfs_tree_lock(eb); - wait_on_extent_buffer_writeback(eb); -+ btrfs_clear_buffer_dirty(NULL, eb); -+ btrfs_tree_unlock(eb); - -- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, -- &eb->bflags)) -- clear_extent_buffer_dirty(eb); - free_extent_buffer_stale(eb); - } - } -diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h -index f2f295eb6103..4d5772330110 100644 ---- a/fs/btrfs/disk-io.h -+++ b/fs/btrfs/disk-io.h -@@ -39,7 +39,8 @@ struct extent_buffer *btrfs_find_create_tree_block( - struct btrfs_fs_info *fs_info, - u64 bytenr, u64 owner_root, - int level); --void btrfs_clean_tree_block(struct extent_buffer *buf); -+void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, -+ struct extent_buffer *buf); - void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); - int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); - int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, -@@ -86,7 +87,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, - int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, - struct page *page, u64 start, u64 end, - int mirror); --void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); - #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); - #endif -@@ -114,15 +114,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, - int btrfs_read_extent_buffer(struct extent_buffer *buf, - struct btrfs_tree_parent_check *check); - --enum btrfs_wq_submit_cmd { -- WQ_SUBMIT_METADATA, -- WQ_SUBMIT_DATA, -- WQ_SUBMIT_DATA_DIO, --}; -- --bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, -- u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); --blk_status_t btree_submit_bio_start(struct bio *bio); -+blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); - int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, - struct btrfs_root *root); - int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, -diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c -index 3c7766dfaa69..29a225836e28 100644 ---- a/fs/btrfs/extent-io-tree.c -+++ b/fs/btrfs/extent-io-tree.c -@@ -972,8 +972,8 @@ static int __set_extent_bit(struct 
extent_io_tree *tree, u64 start, u64 end, - { - struct extent_state *state; - struct extent_state *prealloc = NULL; -- struct rb_node **p; -- struct rb_node *parent; -+ struct rb_node **p = NULL; -+ struct rb_node *parent = NULL; - int err = 0; - u64 last_start; - u64 last_end; -@@ -1218,8 +1218,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - { - struct extent_state *state; - struct extent_state *prealloc = NULL; -- struct rb_node **p; -- struct rb_node *parent; -+ struct rb_node **p = NULL; -+ struct rb_node *parent = NULL; - int err = 0; - u64 last_start; - u64 last_end; -@@ -1625,7 +1625,7 @@ u64 count_range_bits(struct extent_io_tree *tree, - } - - /* -- * Searche a range in the state tree for a given mask. If 'filled' == 1, this -+ * Search a range in the state tree for a given mask. If 'filled' == 1, this - * returns 1 only if every extent in the tree has the bits set. Otherwise, 1 - * is returned if any bit in the range is found set. - */ -diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h -index e3eeec380844..21766e49ec02 100644 ---- a/fs/btrfs/extent-io-tree.h -+++ b/fs/btrfs/extent-io-tree.h -@@ -6,7 +6,6 @@ - #include "misc.h" - - struct extent_changeset; --struct io_failure_record; - - /* Bits for the extent state */ - enum { -diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c -index 72ba13b027a9..824c657f59e8 100644 ---- a/fs/btrfs/extent-tree.c -+++ b/fs/btrfs/extent-tree.c -@@ -16,7 +16,8 @@ - #include - #include - #include --#include "misc.h" -+#include "ctree.h" -+#include "extent-tree.h" - #include "tree-log.h" - #include "disk-io.h" - #include "print-tree.h" -@@ -31,14 +32,12 @@ - #include "space-info.h" - #include "block-rsv.h" - #include "delalloc-space.h" --#include "block-group.h" - #include "discard.h" - #include "rcu-string.h" - #include "zoned.h" - #include "dev-replace.h" - #include "fs.h" - #include "accessors.h" --#include "extent-tree.h" - #include "root-tree.h" - #include "file-item.h" - #include "orphan.h" -@@ -1966,7 +1965,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, - cond_resched(); - - spin_lock(&locked_ref->lock); -- btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); -+ btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); - } - - return 0; -@@ -2013,7 +2012,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - * insert_inline_extent_backref()). - */ - spin_lock(&locked_ref->lock); -- btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); -+ btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); - - ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, - &actual_count); -@@ -3385,7 +3384,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) - enum btrfs_loop_type { - LOOP_CACHING_NOWAIT, - LOOP_CACHING_WAIT, -+ LOOP_UNSET_SIZE_CLASS, - LOOP_ALLOC_CHUNK, -+ LOOP_WRONG_SIZE_CLASS, - LOOP_NO_EMPTY_SIZE, - }; - -@@ -3453,81 +3454,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, - btrfs_put_block_group(cache); - } - --enum btrfs_extent_allocation_policy { -- BTRFS_EXTENT_ALLOC_CLUSTERED, -- BTRFS_EXTENT_ALLOC_ZONED, --}; -- --/* -- * Structure used internally for find_free_extent() function. Wraps needed -- * parameters. 
-- */ --struct find_free_extent_ctl { -- /* Basic allocation info */ -- u64 ram_bytes; -- u64 num_bytes; -- u64 min_alloc_size; -- u64 empty_size; -- u64 flags; -- int delalloc; -- -- /* Where to start the search inside the bg */ -- u64 search_start; -- -- /* For clustered allocation */ -- u64 empty_cluster; -- struct btrfs_free_cluster *last_ptr; -- bool use_cluster; -- -- bool have_caching_bg; -- bool orig_have_caching_bg; -- -- /* Allocation is called for tree-log */ -- bool for_treelog; -- -- /* Allocation is called for data relocation */ -- bool for_data_reloc; -- -- /* RAID index, converted from flags */ -- int index; -- -- /* -- * Current loop number, check find_free_extent_update_loop() for details -- */ -- int loop; -- -- /* -- * Whether we're refilling a cluster, if true we need to re-search -- * current block group but don't try to refill the cluster again. -- */ -- bool retry_clustered; -- -- /* -- * Whether we're updating free space cache, if true we need to re-search -- * current block group but don't try updating free space cache again. -- */ -- bool retry_unclustered; -- -- /* If current block group is cached */ -- int cached; -- -- /* Max contiguous hole found */ -- u64 max_extent_size; -- -- /* Total free space from free space cache, not always contiguous */ -- u64 total_free_space; -- -- /* Found result */ -- u64 found_offset; -- -- /* Hint where to start looking for an empty space */ -- u64 hint_byte; -- -- /* Allocation policy */ -- enum btrfs_extent_allocation_policy policy; --}; -- -- - /* - * Helper function for find_free_extent(). - * -@@ -3559,8 +3485,7 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, - if (offset) { - /* We have a block, we're done */ - spin_unlock(&last_ptr->refill_lock); -- trace_btrfs_reserve_extent_cluster(cluster_bg, -- ffe_ctl->search_start, ffe_ctl->num_bytes); -+ trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); - *cluster_bg_ret = cluster_bg; - ffe_ctl->found_offset = offset; - return 0; -@@ -3610,10 +3535,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, - if (offset) { - /* We found one, proceed */ - spin_unlock(&last_ptr->refill_lock); -- trace_btrfs_reserve_extent_cluster(bg, -- ffe_ctl->search_start, -- ffe_ctl->num_bytes); - ffe_ctl->found_offset = offset; -+ trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); - return 0; - } - } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && -@@ -4028,24 +3951,6 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, - } - } - --static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) --{ -- switch (ffe_ctl->policy) { -- case BTRFS_EXTENT_ALLOC_CLUSTERED: -- /* -- * If we can't allocate a new chunk we've already looped through -- * at least once, move on to the NO_EMPTY_SIZE case. -- */ -- ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; -- return 0; -- case BTRFS_EXTENT_ALLOC_ZONED: -- /* Give up here */ -- return -ENOSPC; -- default: -- BUG(); -- } --} -- - /* - * Return >0 means caller needs to re-search for free extent - * Return 0 means we have the needed free extent. 
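[Editor's note, not part of the patch] The extent-tree.c changes above add the LOOP_UNSET_SIZE_CLASS and LOOP_WRONG_SIZE_CLASS stages, and the hunk that follows gates block group selection on find_free_extent_check_size_class(). The stand-alone sketch below shows how that acceptance rule relaxes as the allocator loop escalates; types are simplified and the zoned / should-use-size-class early exits of the real helper are omitted.

/*
 * Illustrative sketch only: how the size-class matching rule loosens as
 * find_free_extent() moves through its loop stages.
 */
#include <stdbool.h>
#include <stdio.h>

enum loop_stage {
	LOOP_CACHING_NOWAIT,
	LOOP_CACHING_WAIT,
	LOOP_UNSET_SIZE_CLASS,	/* new: accept block groups with no class yet */
	LOOP_ALLOC_CHUNK,
	LOOP_WRONG_SIZE_CLASS,	/* new: accept any class rather than ENOSPC */
	LOOP_NO_EMPTY_SIZE,
};

enum size_class { SZ_NONE, SZ_SMALL, SZ_MEDIUM, SZ_LARGE };

/* Mirrors the acceptance logic of find_free_extent_check_size_class(). */
static bool size_class_ok(enum loop_stage loop, enum size_class wanted,
			  enum size_class bg_class)
{
	if (loop >= LOOP_WRONG_SIZE_CLASS)
		return true;			/* desperate: take anything */
	if (loop >= LOOP_UNSET_SIZE_CLASS && bg_class == SZ_NONE)
		return true;			/* fresh block group is fine */
	return wanted == bg_class;		/* early loops: exact match only */
}

int main(void)
{
	for (int loop = LOOP_CACHING_NOWAIT; loop <= LOOP_NO_EMPTY_SIZE; loop++)
		printf("loop %d: unset=%d mismatch=%d match=%d\n", loop,
		       size_class_ok(loop, SZ_SMALL, SZ_NONE),
		       size_class_ok(loop, SZ_SMALL, SZ_LARGE),
		       size_class_ok(loop, SZ_SMALL, SZ_SMALL));
	return 0;
}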
-@@ -4079,31 +3984,28 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, - * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking - * caching kthreads as we move along - * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching -+ * LOOP_UNSET_SIZE_CLASS, allow unset size class - * LOOP_ALLOC_CHUNK, force a chunk allocation and try again - * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try - * again - */ - if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { - ffe_ctl->index = 0; -- if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { -- /* -- * We want to skip the LOOP_CACHING_WAIT step if we -- * don't have any uncached bgs and we've already done a -- * full search through. -- */ -- if (ffe_ctl->orig_have_caching_bg || !full_search) -- ffe_ctl->loop = LOOP_CACHING_WAIT; -- else -- ffe_ctl->loop = LOOP_ALLOC_CHUNK; -- } else { -+ /* -+ * We want to skip the LOOP_CACHING_WAIT step if we don't have -+ * any uncached bgs and we've already done a full search -+ * through. -+ */ -+ if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && -+ (!ffe_ctl->orig_have_caching_bg && full_search)) - ffe_ctl->loop++; -- } -+ ffe_ctl->loop++; - - if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { - struct btrfs_trans_handle *trans; - int exist = 0; - -- /*Check if allocation policy allows to create a new chunk */ -+ /* Check if allocation policy allows to create a new chunk */ - ret = can_allocate_chunk(fs_info, ffe_ctl); - if (ret) - return ret; -@@ -4123,8 +4025,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, - CHUNK_ALLOC_FORCE_FOR_EXTENT); - - /* Do not bail out on ENOSPC since we can do more. */ -- if (ret == -ENOSPC) -- ret = chunk_allocation_failed(ffe_ctl); -+ if (ret == -ENOSPC) { -+ ret = 0; -+ ffe_ctl->loop++; -+ } - else if (ret < 0) - btrfs_abort_transaction(trans, ret); - else -@@ -4154,6 +4058,21 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, - return -ENOSPC; - } - -+static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, -+ struct btrfs_block_group *bg) -+{ -+ if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) -+ return true; -+ if (!btrfs_block_group_should_use_size_class(bg)) -+ return true; -+ if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) -+ return true; -+ if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && -+ bg->size_class == BTRFS_BG_SZ_NONE) -+ return true; -+ return ffe_ctl->size_class == bg->size_class; -+} -+ - static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, - struct find_free_extent_ctl *ffe_ctl, - struct btrfs_space_info *space_info, -@@ -4288,6 +4207,7 @@ static noinline int find_free_extent(struct btrfs_root *root, - ffe_ctl->total_free_space = 0; - ffe_ctl->found_offset = 0; - ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; -+ ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); - - if (btrfs_is_zoned(fs_info)) - ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; -@@ -4296,8 +4216,7 @@ static noinline int find_free_extent(struct btrfs_root *root, - ins->objectid = 0; - ins->offset = 0; - -- trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size, -- ffe_ctl->flags); -+ trace_find_free_extent(root, ffe_ctl); - - space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); - if (!space_info) { -@@ -4340,6 +4259,7 @@ static noinline int find_free_extent(struct btrfs_root *root, - block_group->flags); - btrfs_lock_block_group(block_group, - ffe_ctl->delalloc); -+ ffe_ctl->hinted = true; - goto have_block_group; - } - } else if 
(block_group) { -@@ -4347,6 +4267,7 @@ static noinline int find_free_extent(struct btrfs_root *root, - } - } - search: -+ trace_find_free_extent_search_loop(root, ffe_ctl); - ffe_ctl->have_caching_bg = false; - if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || - ffe_ctl->index == 0) -@@ -4356,6 +4277,7 @@ static noinline int find_free_extent(struct btrfs_root *root, - &space_info->block_groups[ffe_ctl->index], list) { - struct btrfs_block_group *bg_ret; - -+ ffe_ctl->hinted = false; - /* If the block group is read-only, we can skip it entirely. */ - if (unlikely(block_group->ro)) { - if (ffe_ctl->for_treelog) -@@ -4397,6 +4319,7 @@ static noinline int find_free_extent(struct btrfs_root *root, - } - - have_block_group: -+ trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); - ffe_ctl->cached = btrfs_block_group_done(block_group); - if (unlikely(!ffe_ctl->cached)) { - ffe_ctl->have_caching_bg = true; -@@ -4421,6 +4344,9 @@ static noinline int find_free_extent(struct btrfs_root *root, - if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) - goto loop; - -+ if (!find_free_extent_check_size_class(ffe_ctl, block_group)) -+ goto loop; -+ - bg_ret = NULL; - ret = do_allocation(block_group, ffe_ctl, &bg_ret); - if (ret == 0) { -@@ -4455,7 +4381,8 @@ static noinline int find_free_extent(struct btrfs_root *root, - - ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, - ffe_ctl->num_bytes, -- ffe_ctl->delalloc); -+ ffe_ctl->delalloc, -+ ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); - if (ret == -EAGAIN) { - btrfs_add_free_space_unused(block_group, - ffe_ctl->found_offset, -@@ -4468,8 +4395,7 @@ static noinline int find_free_extent(struct btrfs_root *root, - ins->objectid = ffe_ctl->search_start; - ins->offset = ffe_ctl->num_bytes; - -- trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start, -- ffe_ctl->num_bytes); -+ trace_btrfs_reserve_extent(block_group, ffe_ctl); - btrfs_release_block_group(block_group, ffe_ctl->delalloc); - break; - loop: -@@ -4912,7 +4838,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); - - __btrfs_tree_lock(buf, nest); -- btrfs_clean_tree_block(buf); -+ btrfs_clear_buffer_dirty(trans, buf); - clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); - clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); - -@@ -5542,13 +5468,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, - } - } - } -- /* make block locked assertion in btrfs_clean_tree_block happy */ -- if (!path->locks[level] && -- btrfs_header_generation(eb) == trans->transid) { -+ /* Make block locked assertion in btrfs_clear_buffer_dirty happy. 
*/ -+ if (!path->locks[level]) { - btrfs_tree_lock(eb); - path->locks[level] = BTRFS_WRITE_LOCK; - } -- btrfs_clean_tree_block(eb); -+ btrfs_clear_buffer_dirty(trans, eb); - } - - if (eb == root->node) { -diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h -index ae5425253603..0c958fc1b3b8 100644 ---- a/fs/btrfs/extent-tree.h -+++ b/fs/btrfs/extent-tree.h -@@ -3,6 +3,87 @@ - #ifndef BTRFS_EXTENT_TREE_H - #define BTRFS_EXTENT_TREE_H - -+#include "misc.h" -+#include "block-group.h" -+ -+struct btrfs_free_cluster; -+ -+enum btrfs_extent_allocation_policy { -+ BTRFS_EXTENT_ALLOC_CLUSTERED, -+ BTRFS_EXTENT_ALLOC_ZONED, -+}; -+ -+struct find_free_extent_ctl { -+ /* Basic allocation info */ -+ u64 ram_bytes; -+ u64 num_bytes; -+ u64 min_alloc_size; -+ u64 empty_size; -+ u64 flags; -+ int delalloc; -+ -+ /* Where to start the search inside the bg */ -+ u64 search_start; -+ -+ /* For clustered allocation */ -+ u64 empty_cluster; -+ struct btrfs_free_cluster *last_ptr; -+ bool use_cluster; -+ -+ bool have_caching_bg; -+ bool orig_have_caching_bg; -+ -+ /* Allocation is called for tree-log */ -+ bool for_treelog; -+ -+ /* Allocation is called for data relocation */ -+ bool for_data_reloc; -+ -+ /* RAID index, converted from flags */ -+ int index; -+ -+ /* -+ * Current loop number, check find_free_extent_update_loop() for details -+ */ -+ int loop; -+ -+ /* -+ * Whether we're refilling a cluster, if true we need to re-search -+ * current block group but don't try to refill the cluster again. -+ */ -+ bool retry_clustered; -+ -+ /* -+ * Whether we're updating free space cache, if true we need to re-search -+ * current block group but don't try updating free space cache again. -+ */ -+ bool retry_unclustered; -+ -+ /* If current block group is cached */ -+ int cached; -+ -+ /* Max contiguous hole found */ -+ u64 max_extent_size; -+ -+ /* Total free space from free space cache, not always contiguous */ -+ u64 total_free_space; -+ -+ /* Found result */ -+ u64 found_offset; -+ -+ /* Hint where to start looking for an empty space */ -+ u64 hint_byte; -+ -+ /* Allocation policy */ -+ enum btrfs_extent_allocation_policy policy; -+ -+ /* Whether or not the allocator is currently following a hint */ -+ bool hinted; -+ -+ /* Size class of block groups to prefer in early loops */ -+ enum btrfs_block_group_size_class size_class; -+}; -+ - enum btrfs_inline_ref_type { - BTRFS_REF_TYPE_INVALID, - BTRFS_REF_TYPE_BLOCK, -diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c -index 3bbf8703db2a..c25fa74d7615 100644 ---- a/fs/btrfs/extent_io.c -+++ b/fs/btrfs/extent_io.c -@@ -36,6 +36,7 @@ - #include "file.h" - #include "dev-replace.h" - #include "super.h" -+#include "transaction.h" - - static struct kmem_cache *extent_buffer_cache; - -@@ -99,7 +100,6 @@ struct btrfs_bio_ctrl { - struct bio *bio; - int mirror_num; - enum btrfs_compression_type compress_type; -- u32 len_to_stripe_boundary; - u32 len_to_oe_boundary; - btrfs_bio_end_io_t end_io_func; - -@@ -126,7 +126,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) - { - struct bio *bio; - struct bio_vec *bv; -- struct btrfs_inode *inode; -+ struct inode *inode; - int mirror_num; - - if (!bio_ctrl->bio) -@@ -134,15 +134,13 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) - - bio = bio_ctrl->bio; - bv = bio_first_bvec_all(bio); -- inode = BTRFS_I(bv->bv_page->mapping->host); -+ inode = bv->bv_page->mapping->host; - mirror_num = bio_ctrl->mirror_num; - - /* Caller should ensure the bio has at least some range added */ - 
ASSERT(bio->bi_iter.bi_size); - -- btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; -- -- if (!is_data_inode(&inode->vfs_inode)) { -+ if (!is_data_inode(inode)) { - if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - /* - * For metadata read, we should have the parent_check, -@@ -153,14 +151,15 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) - bio_ctrl->parent_check, - sizeof(struct btrfs_tree_parent_check)); - } -- btrfs_submit_metadata_bio(inode, bio, mirror_num); -- } else if (btrfs_op(bio) == BTRFS_MAP_WRITE) { -- btrfs_submit_data_write_bio(inode, bio, mirror_num); -- } else { -- btrfs_submit_data_read_bio(inode, bio, mirror_num, -- bio_ctrl->compress_type); -+ bio->bi_opf |= REQ_META; - } - -+ if (btrfs_op(bio) == BTRFS_MAP_READ && -+ bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) -+ btrfs_submit_compressed_read(inode, bio, mirror_num); -+ else -+ btrfs_submit_bio(bio, mirror_num); -+ - /* The bio is owned by the end_io handler now */ - bio_ctrl->bio = NULL; - } -@@ -515,266 +514,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - start, end, page_ops, NULL); - } - --static int insert_failrec(struct btrfs_inode *inode, -- struct io_failure_record *failrec) --{ -- struct rb_node *exist; -- -- spin_lock(&inode->io_failure_lock); -- exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr, -- &failrec->rb_node); -- spin_unlock(&inode->io_failure_lock); -- -- return (exist == NULL) ? 0 : -EEXIST; --} -- --static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start) --{ -- struct rb_node *node; -- struct io_failure_record *failrec = ERR_PTR(-ENOENT); -- -- spin_lock(&inode->io_failure_lock); -- node = rb_simple_search(&inode->io_failure_tree, start); -- if (node) -- failrec = rb_entry(node, struct io_failure_record, rb_node); -- spin_unlock(&inode->io_failure_lock); -- return failrec; --} -- --static void free_io_failure(struct btrfs_inode *inode, -- struct io_failure_record *rec) --{ -- spin_lock(&inode->io_failure_lock); -- rb_erase(&rec->rb_node, &inode->io_failure_tree); -- spin_unlock(&inode->io_failure_lock); -- -- kfree(rec); --} -- --static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) --{ -- if (cur_mirror == failrec->num_copies) -- return cur_mirror + 1 - failrec->num_copies; -- return cur_mirror + 1; --} -- --static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) --{ -- if (cur_mirror == 1) -- return failrec->num_copies; -- return cur_mirror - 1; --} -- --/* -- * each time an IO finishes, we do a fast check in the IO failure tree -- * to see if we need to process or clean up an io_failure_record -- */ --int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, -- struct page *page, unsigned int pg_offset) --{ -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- struct extent_io_tree *io_tree = &inode->io_tree; -- u64 ino = btrfs_ino(inode); -- u64 locked_start, locked_end; -- struct io_failure_record *failrec; -- int mirror; -- int ret; -- -- failrec = get_failrec(inode, start); -- if (IS_ERR(failrec)) -- return 0; -- -- BUG_ON(!failrec->this_mirror); -- -- if (sb_rdonly(fs_info->sb)) -- goto out; -- -- ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start, -- &locked_end, EXTENT_LOCKED, NULL); -- if (ret || locked_start > failrec->bytenr || -- locked_end < failrec->bytenr + failrec->len - 1) -- goto out; -- -- mirror = failrec->this_mirror; -- do { -- mirror = prev_mirror(failrec, mirror); -- 
btrfs_repair_io_failure(fs_info, ino, start, failrec->len, -- failrec->logical, page, pg_offset, mirror); -- } while (mirror != failrec->failed_mirror); -- --out: -- free_io_failure(inode, failrec); -- return 0; --} -- --/* -- * Can be called when -- * - hold extent lock -- * - under ordered extent -- * - the inode is freeing -- */ --void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) --{ -- struct io_failure_record *failrec; -- struct rb_node *node, *next; -- -- if (RB_EMPTY_ROOT(&inode->io_failure_tree)) -- return; -- -- spin_lock(&inode->io_failure_lock); -- node = rb_simple_search_first(&inode->io_failure_tree, start); -- while (node) { -- failrec = rb_entry(node, struct io_failure_record, rb_node); -- if (failrec->bytenr > end) -- break; -- -- next = rb_next(node); -- rb_erase(&failrec->rb_node, &inode->io_failure_tree); -- kfree(failrec); -- -- node = next; -- } -- spin_unlock(&inode->io_failure_lock); --} -- --static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, -- struct btrfs_bio *bbio, -- unsigned int bio_offset) --{ -- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -- u64 start = bbio->file_offset + bio_offset; -- struct io_failure_record *failrec; -- const u32 sectorsize = fs_info->sectorsize; -- int ret; -- -- failrec = get_failrec(BTRFS_I(inode), start); -- if (!IS_ERR(failrec)) { -- btrfs_debug(fs_info, -- "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", -- failrec->logical, failrec->bytenr, failrec->len); -- /* -- * when data can be on disk more than twice, add to failrec here -- * (e.g. with a list for failed_mirror) to make -- * clean_io_failure() clean all those errors at once. -- */ -- ASSERT(failrec->this_mirror == bbio->mirror_num); -- ASSERT(failrec->len == fs_info->sectorsize); -- return failrec; -- } -- -- failrec = kzalloc(sizeof(*failrec), GFP_NOFS); -- if (!failrec) -- return ERR_PTR(-ENOMEM); -- -- RB_CLEAR_NODE(&failrec->rb_node); -- failrec->bytenr = start; -- failrec->len = sectorsize; -- failrec->failed_mirror = bbio->mirror_num; -- failrec->this_mirror = bbio->mirror_num; -- failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset; -- -- btrfs_debug(fs_info, -- "new io failure record logical %llu start %llu", -- failrec->logical, start); -- -- failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize); -- if (failrec->num_copies == 1) { -- /* -- * We only have a single copy of the data, so don't bother with -- * all the retry and error correction code that follows. No -- * matter what the error is, it is very likely to persist. 
-- */ -- btrfs_debug(fs_info, -- "cannot repair logical %llu num_copies %d", -- failrec->logical, failrec->num_copies); -- kfree(failrec); -- return ERR_PTR(-EIO); -- } -- -- /* Set the bits in the private failure tree */ -- ret = insert_failrec(BTRFS_I(inode), failrec); -- if (ret) { -- kfree(failrec); -- return ERR_PTR(ret); -- } -- -- return failrec; --} -- --int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, -- u32 bio_offset, struct page *page, unsigned int pgoff, -- bool submit_buffered) --{ -- u64 start = failed_bbio->file_offset + bio_offset; -- struct io_failure_record *failrec; -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- struct bio *failed_bio = &failed_bbio->bio; -- const int icsum = bio_offset >> fs_info->sectorsize_bits; -- struct bio *repair_bio; -- struct btrfs_bio *repair_bbio; -- -- btrfs_debug(fs_info, -- "repair read error: read error at %llu", start); -- -- BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); -- -- failrec = btrfs_get_io_failure_record(&inode->vfs_inode, failed_bbio, bio_offset); -- if (IS_ERR(failrec)) -- return PTR_ERR(failrec); -- -- /* -- * There are two premises: -- * a) deliver good data to the caller -- * b) correct the bad sectors on disk -- * -- * Since we're only doing repair for one sector, we only need to get -- * a good copy of the failed sector and if we succeed, we have setup -- * everything for btrfs_repair_io_failure to do the rest for us. -- */ -- failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); -- if (failrec->this_mirror == failrec->failed_mirror) { -- btrfs_debug(fs_info, -- "failed to repair num_copies %d this_mirror %d failed_mirror %d", -- failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); -- free_io_failure(inode, failrec); -- return -EIO; -- } -- -- repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io, -- failed_bbio->private); -- repair_bbio = btrfs_bio(repair_bio); -- repair_bbio->file_offset = start; -- repair_bio->bi_iter.bi_sector = failrec->logical >> 9; -- -- if (failed_bbio->csum) { -- const u32 csum_size = fs_info->csum_size; -- -- repair_bbio->csum = repair_bbio->csum_inline; -- memcpy(repair_bbio->csum, -- failed_bbio->csum + csum_size * icsum, csum_size); -- } -- -- bio_add_page(repair_bio, page, failrec->len, pgoff); -- repair_bbio->iter = repair_bio->bi_iter; -- -- btrfs_debug(fs_info, -- "repair read error: submitting new read to mirror %d", -- failrec->this_mirror); -- -- /* -- * At this point we have a bio, so any errors from bio submission will -- * be handled by the endio on the repair_bio, so we can't return an -- * error here. 
-- */ -- if (submit_buffered) -- btrfs_submit_data_read_bio(inode, repair_bio, -- failrec->this_mirror, 0); -- else -- btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); -- -- return BLK_STS_OK; --} -- - static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) - { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); -@@ -803,79 +542,6 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) - btrfs_subpage_end_reader(fs_info, page, start, len); - } - --static void end_sector_io(struct page *page, u64 offset, bool uptodate) --{ -- struct btrfs_inode *inode = BTRFS_I(page->mapping->host); -- const u32 sectorsize = inode->root->fs_info->sectorsize; -- -- end_page_read(page, uptodate, offset, sectorsize); -- unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL); --} -- --static void submit_data_read_repair(struct inode *inode, -- struct btrfs_bio *failed_bbio, -- u32 bio_offset, const struct bio_vec *bvec, -- unsigned int error_bitmap) --{ -- const unsigned int pgoff = bvec->bv_offset; -- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -- struct page *page = bvec->bv_page; -- const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset; -- const u64 end = start + bvec->bv_len - 1; -- const u32 sectorsize = fs_info->sectorsize; -- const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; -- int i; -- -- BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE); -- -- /* This repair is only for data */ -- ASSERT(is_data_inode(inode)); -- -- /* We're here because we had some read errors or csum mismatch */ -- ASSERT(error_bitmap); -- -- /* -- * We only get called on buffered IO, thus page must be mapped and bio -- * must not be cloned. -- */ -- ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED)); -- -- /* Iterate through all the sectors in the range */ -- for (i = 0; i < nr_bits; i++) { -- const unsigned int offset = i * sectorsize; -- bool uptodate = false; -- int ret; -- -- if (!(error_bitmap & (1U << i))) { -- /* -- * This sector has no error, just end the page read -- * and unlock the range. -- */ -- uptodate = true; -- goto next; -- } -- -- ret = btrfs_repair_one_sector(BTRFS_I(inode), failed_bbio, -- bio_offset + offset, page, pgoff + offset, -- true); -- if (!ret) { -- /* -- * We have submitted the read repair, the page release -- * will be handled by the endio function of the -- * submitted repair bio. -- * Thus we don't need to do any thing here. -- */ -- continue; -- } -- /* -- * Continue on failed repair, otherwise the remaining sectors -- * will not be properly unlocked. 
-- */ --next: -- end_sector_io(page, start + offset, uptodate); -- } --} -- - /* lots and lots of room for performance fixes in the end_bio funcs */ - - void end_extent_writepage(struct page *page, int err, u64 start, u64 end) -@@ -919,7 +585,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) - u64 start; - u64 end; - struct bvec_iter_all iter_all; -- bool first_bvec = true; - - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { -@@ -941,11 +606,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) - start = page_offset(page) + bvec->bv_offset; - end = start + bvec->bv_len - 1; - -- if (first_bvec) { -- btrfs_record_physical_zoned(inode, start, bio); -- first_bvec = false; -- } -- - end_extent_writepage(page, error, start, end); - - btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); -@@ -1093,8 +753,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const u32 sectorsize = fs_info->sectorsize; -- unsigned int error_bitmap = (unsigned int)-1; -- bool repair = false; - u64 start; - u64 end; - u32 len; -@@ -1126,25 +784,14 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) - len = bvec->bv_len; - - mirror = bbio->mirror_num; -- if (likely(uptodate)) { -- if (is_data_inode(inode)) { -- error_bitmap = btrfs_verify_data_csum(bbio, -- bio_offset, page, start, end); -- if (error_bitmap) -- uptodate = false; -- } else { -- if (btrfs_validate_metadata_buffer(bbio, -- page, start, end, mirror)) -- uptodate = false; -- } -- } -+ if (uptodate && !is_data_inode(inode) && -+ btrfs_validate_metadata_buffer(bbio, page, start, end, mirror)) -+ uptodate = false; - - if (likely(uptodate)) { - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; - -- btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0); -- - /* - * Zero out the remaining part if this range straddles - * i_size. -@@ -1161,19 +808,7 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) - zero_user_segment(page, zero_start, - offset_in_page(end) + 1); - } -- } else if (is_data_inode(inode)) { -- /* -- * Only try to repair bios that actually made it to a -- * device. If the bio failed to be submitted mirror -- * is 0 and we need to fail it without retrying. -- * -- * This also includes the high level bios for compressed -- * extents - these never make it to a device and repair -- * is already handled on the lower compressed bio. -- */ -- if (mirror > 0) -- repair = true; -- } else { -+ } else if (!is_data_inode(inode)) { - struct extent_buffer *eb; - - eb = find_extent_buffer_readpage(fs_info, page, start); -@@ -1182,19 +817,10 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) - atomic_dec(&eb->io_pages); - } - -- if (repair) { -- /* -- * submit_data_read_repair() will handle all the good -- * and bad sectors, we just continue to the next bvec. -- */ -- submit_data_read_repair(inode, bbio, bio_offset, bvec, -- error_bitmap); -- } else { -- /* Update page status and unlock */ -- end_page_read(page, uptodate, start, len); -- endio_readpage_release_extent(&processed, BTRFS_I(inode), -- start, end, PageUptodate(page)); -- } -+ /* Update page status and unlock. 
*/ -+ end_page_read(page, uptodate, start, len); -+ endio_readpage_release_extent(&processed, BTRFS_I(inode), -+ start, end, PageUptodate(page)); - - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; -@@ -1202,7 +828,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) - } - /* Release the last extent */ - endio_readpage_release_extent(&processed, NULL, 0, 0, false); -- btrfs_bio_free_csum(bbio); - bio_put(bio); - } - -@@ -1270,11 +895,10 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, - u32 real_size; - const sector_t sector = disk_bytenr >> SECTOR_SHIFT; - bool contig = false; -- int ret; - - ASSERT(bio); - /* The limit should be calculated when bio_ctrl->bio is allocated */ -- ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); -+ ASSERT(bio_ctrl->len_to_oe_boundary); - if (bio_ctrl->compress_type != compress_type) - return 0; - -@@ -1310,9 +934,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, - if (!contig) - return 0; - -- real_size = min(bio_ctrl->len_to_oe_boundary, -- bio_ctrl->len_to_stripe_boundary) - bio_size; -- real_size = min(real_size, size); -+ real_size = min(bio_ctrl->len_to_oe_boundary - bio_size, size); - - /* - * If real_size is 0, never call bio_add_*_page(), as even size is 0, -@@ -1321,82 +943,45 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, - if (real_size == 0) - return 0; - -- if (bio_op(bio) == REQ_OP_ZONE_APPEND) -- ret = bio_add_zone_append_page(bio, page, real_size, pg_offset); -- else -- ret = bio_add_page(bio, page, real_size, pg_offset); -- -- return ret; -+ return bio_add_page(bio, page, real_size, pg_offset); - } - --static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, -- struct btrfs_inode *inode, u64 file_offset) -+static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, -+ struct btrfs_inode *inode, u64 file_offset) - { -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- struct btrfs_io_geometry geom; - struct btrfs_ordered_extent *ordered; -- struct extent_map *em; -- u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT); -- int ret; - - /* -- * Pages for compressed extent are never submitted to disk directly, -- * thus it has no real boundary, just set them to U32_MAX. -- * -- * The split happens for real compressed bio, which happens in -- * btrfs_submit_compressed_read/write(). -+ * Limit the extent to the ordered boundary for Zone Append. -+ * Compressed bios aren't submitted directly, so it doesn't apply to -+ * them. 
- */ -- if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { -- bio_ctrl->len_to_oe_boundary = U32_MAX; -- bio_ctrl->len_to_stripe_boundary = U32_MAX; -- return 0; -- } -- em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); -- if (IS_ERR(em)) -- return PTR_ERR(em); -- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio), -- logical, &geom); -- free_extent_map(em); -- if (ret < 0) { -- return ret; -- } -- if (geom.len > U32_MAX) -- bio_ctrl->len_to_stripe_boundary = U32_MAX; -- else -- bio_ctrl->len_to_stripe_boundary = (u32)geom.len; -- -- if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { -- bio_ctrl->len_to_oe_boundary = U32_MAX; -- return 0; -- } -- -- /* Ordered extent not yet created, so we're good */ -- ordered = btrfs_lookup_ordered_extent(inode, file_offset); -- if (!ordered) { -- bio_ctrl->len_to_oe_boundary = U32_MAX; -- return 0; -+ if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && -+ btrfs_use_zone_append(btrfs_bio(bio_ctrl->bio))) { -+ ordered = btrfs_lookup_ordered_extent(inode, file_offset); -+ if (ordered) { -+ bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, -+ ordered->file_offset + -+ ordered->disk_num_bytes - file_offset); -+ btrfs_put_ordered_extent(ordered); -+ return; -+ } - } - -- bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, -- ordered->disk_bytenr + ordered->disk_num_bytes - logical); -- btrfs_put_ordered_extent(ordered); -- return 0; -+ bio_ctrl->len_to_oe_boundary = U32_MAX; - } - --static int alloc_new_bio(struct btrfs_inode *inode, -- struct btrfs_bio_ctrl *bio_ctrl, -- struct writeback_control *wbc, -- blk_opf_t opf, -- u64 disk_bytenr, u32 offset, u64 file_offset, -- enum btrfs_compression_type compress_type) -+static void alloc_new_bio(struct btrfs_inode *inode, -+ struct btrfs_bio_ctrl *bio_ctrl, -+ struct writeback_control *wbc, blk_opf_t opf, -+ u64 disk_bytenr, u32 offset, u64 file_offset, -+ enum btrfs_compression_type compress_type) - { - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct bio *bio; -- int ret; - -- ASSERT(bio_ctrl->end_io_func); -- -- bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL); -+ bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, inode, bio_ctrl->end_io_func, -+ NULL); - /* - * For compressed page range, its disk_bytenr is always @disk_bytenr - * passed in, no matter if we have added any range into previous bio. -@@ -1405,48 +990,21 @@ static int alloc_new_bio(struct btrfs_inode *inode, - bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; - else - bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; -+ btrfs_bio(bio)->file_offset = file_offset; - bio_ctrl->bio = bio; - bio_ctrl->compress_type = compress_type; -- ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); -- if (ret < 0) -- goto error; -+ calc_bio_boundaries(bio_ctrl, inode, file_offset); - - if (wbc) { - /* -- * For Zone append we need the correct block_device that we are -- * going to write to set in the bio to be able to respect the -- * hardware limitation. Look it up here: -+ * Pick the last added device to support cgroup writeback. For -+ * multi-device file systems this means blk-cgroup policies have -+ * to always be set on the last added/replaced device. -+ * This is a bit odd but has been like that for a long time. 
- */ -- if (bio_op(bio) == REQ_OP_ZONE_APPEND) { -- struct btrfs_device *dev; -- -- dev = btrfs_zoned_get_device(fs_info, disk_bytenr, -- fs_info->sectorsize); -- if (IS_ERR(dev)) { -- ret = PTR_ERR(dev); -- goto error; -- } -- -- bio_set_dev(bio, dev->bdev); -- } else { -- /* -- * Otherwise pick the last added device to support -- * cgroup writeback. For multi-device file systems this -- * means blk-cgroup policies have to always be set on the -- * last added/replaced device. This is a bit odd but has -- * been like that for a long time. -- */ -- bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); -- } -+ bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); - wbc_init_bio(wbc, bio); -- } else { -- ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); - } -- return 0; --error: -- bio_ctrl->bio = NULL; -- btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); -- return ret; - } - - /* -@@ -1472,7 +1030,6 @@ static int submit_extent_page(blk_opf_t opf, - enum btrfs_compression_type compress_type, - bool force_bio_submit) - { -- int ret = 0; - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - unsigned int cur = pg_offset; - -@@ -1492,12 +1049,9 @@ static int submit_extent_page(blk_opf_t opf, - - /* Allocate new bio if needed */ - if (!bio_ctrl->bio) { -- ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, -- disk_bytenr, offset, -- page_offset(page) + cur, -- compress_type); -- if (ret < 0) -- return ret; -+ alloc_new_bio(inode, bio_ctrl, wbc, opf, disk_bytenr, -+ offset, page_offset(page) + cur, -+ compress_type); - } - /* - * We must go through btrfs_bio_add_page() to ensure each -@@ -2054,10 +1608,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, - * find_next_dirty_byte() are all exclusive - */ - iosize = min(min(em_end, end + 1), dirty_range_end) - cur; -- -- if (btrfs_use_zone_append(inode, em->block_start)) -- op = REQ_OP_ZONE_APPEND; -- - free_extent_map(em); - em = NULL; - -@@ -2360,13 +1910,6 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) - */ - mapping_set_error(page->mapping, -EIO); - -- /* -- * If we error out, we should add back the dirty_metadata_bytes -- * to make it consistent. -- */ -- percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -- eb->len, fs_info->dirty_metadata_batch); -- - /* - * If writeback for a btree extent that doesn't belong to a log tree - * failed, increment the counter transaction->eb_write_errors. 
-@@ -4724,12 +4267,25 @@ static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) - WARN_ON(atomic_read(&eb->refs) == 0); - } - --void clear_extent_buffer_dirty(const struct extent_buffer *eb) -+void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, -+ struct extent_buffer *eb) - { -+ struct btrfs_fs_info *fs_info = eb->fs_info; - int i; - int num_pages; - struct page *page; - -+ btrfs_assert_tree_write_locked(eb); -+ -+ if (trans && btrfs_header_generation(eb) != trans->transid) -+ return; -+ -+ if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) -+ return; -+ -+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, -+ fs_info->dirty_metadata_batch); -+ - if (eb->fs_info->nodesize < PAGE_SIZE) - return clear_subpage_extent_buffer_dirty(eb); - -diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h -index a2c82448b2e0..4341ad978fb8 100644 ---- a/fs/btrfs/extent_io.h -+++ b/fs/btrfs/extent_io.h -@@ -11,6 +11,8 @@ - #include "ulist.h" - #include "misc.h" - -+struct btrfs_trans_handle; -+ - enum { - EXTENT_BUFFER_UPTODATE, - EXTENT_BUFFER_DIRTY, -@@ -60,11 +62,9 @@ enum { - #define BITMAP_LAST_BYTE_MASK(nbits) \ - (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) - --struct btrfs_bio; - struct btrfs_root; - struct btrfs_inode; - struct btrfs_fs_info; --struct io_failure_record; - struct extent_io_tree; - struct btrfs_tree_parent_check; - -@@ -262,7 +262,6 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star - void extent_buffer_bitmap_clear(const struct extent_buffer *eb, - unsigned long start, unsigned long pos, - unsigned long len); --void clear_extent_buffer_dirty(const struct extent_buffer *eb); - bool set_extent_buffer_dirty(struct extent_buffer *eb); - void set_extent_buffer_uptodate(struct extent_buffer *eb); - void clear_extent_buffer_uptodate(struct extent_buffer *eb); -@@ -274,40 +273,13 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - u32 bits_to_clear, unsigned long page_ops); - int extent_invalidate_folio(struct extent_io_tree *tree, - struct folio *folio, size_t offset); -+void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, -+ struct extent_buffer *buf); - - int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); - - void end_extent_writepage(struct page *page, int err, u64 start, u64 end); - --/* -- * When IO fails, either with EIO or csum verification fails, we -- * try other mirrors that might have a good copy of the data. This -- * io_failure_record is used to record state as we go through all the -- * mirrors. If another mirror has good data, the sector is set up to date -- * and things continue. If a good mirror can't be found, the original -- * bio end_io callback is called to indicate things have failed. 
-- */ --struct io_failure_record { -- /* Use rb_simple_node for search/insert */ -- struct { -- struct rb_node rb_node; -- u64 bytenr; -- }; -- struct page *page; -- u64 len; -- u64 logical; -- int this_mirror; -- int failed_mirror; -- int num_copies; --}; -- --int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, -- u32 bio_offset, struct page *page, unsigned int pgoff, -- bool submit_buffered); --void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); --int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, -- struct page *page, unsigned int pg_offset); -- - #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - bool find_lock_delalloc_range(struct inode *inode, - struct page *locked_page, u64 *start, -diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c -index 5de73466b2ca..41c77a100853 100644 ---- a/fs/btrfs/file-item.c -+++ b/fs/btrfs/file-item.c -@@ -380,32 +380,25 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, - /* - * Lookup the checksum for the read bio in csum tree. - * -- * @inode: inode that the bio is for. -- * @bio: bio to look up. -- * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return -- * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If -- * NULL, the checksum buffer is allocated and returned in -- * btrfs_bio(bio)->csum instead. -- * - * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. - */ --blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst) -+blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) - { -- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; -- struct btrfs_bio *bbio = NULL; -+ struct btrfs_inode *inode = bbio->inode; -+ struct btrfs_fs_info *fs_info = inode->root->fs_info; -+ struct extent_io_tree *io_tree = &inode->io_tree; -+ struct bio *bio = &bbio->bio; - struct btrfs_path *path; - const u32 sectorsize = fs_info->sectorsize; - const u32 csum_size = fs_info->csum_size; - u32 orig_len = bio->bi_iter.bi_size; - u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; - u64 cur_disk_bytenr; -- u8 *csum; - const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; - int count = 0; - blk_status_t ret = BLK_STS_OK; - -- if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || -+ if ((inode->flags & BTRFS_INODE_NODATASUM) || - test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) - return BLK_STS_OK; - -@@ -426,21 +419,14 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst - if (!path) - return BLK_STS_RESOURCE; - -- if (!dst) { -- bbio = btrfs_bio(bio); -- -- if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { -- bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); -- if (!bbio->csum) { -- btrfs_free_path(path); -- return BLK_STS_RESOURCE; -- } -- } else { -- bbio->csum = bbio->csum_inline; -+ if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { -+ bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); -+ if (!bbio->csum) { -+ btrfs_free_path(path); -+ return BLK_STS_RESOURCE; - } -- csum = bbio->csum; - } else { -- csum = dst; -+ bbio->csum = bbio->csum_inline; - } - - /* -@@ -456,7 +442,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst - * read from the commit root and sidestep a nasty deadlock - * between reading the free space cache and updating the csum tree. 
- */ -- if (btrfs_is_free_space_inode(BTRFS_I(inode))) { -+ if (btrfs_is_free_space_inode(inode)) { - path->search_commit_root = 1; - path->skip_locking = 1; - } -@@ -479,14 +465,15 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst - ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX); - sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >> - fs_info->sectorsize_bits; -- csum_dst = csum + sector_offset * csum_size; -+ csum_dst = bbio->csum + sector_offset * csum_size; - - count = search_csum_tree(fs_info, path, cur_disk_bytenr, - search_len, csum_dst); - if (count < 0) { - ret = errno_to_blk_status(count); -- if (bbio) -- btrfs_bio_free_csum(bbio); -+ if (bbio->csum != bbio->csum_inline) -+ kfree(bbio->csum); -+ bbio->csum = NULL; - break; - } - -@@ -504,12 +491,13 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst - memset(csum_dst, 0, csum_size); - count = 1; - -- if (BTRFS_I(inode)->root->root_key.objectid == -+ if (inode->root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { - u64 file_offset; - int ret; - -- ret = search_file_offset_in_bio(bio, inode, -+ ret = search_file_offset_in_bio(bio, -+ &inode->vfs_inode, - cur_disk_bytenr, &file_offset); - if (ret) - set_extent_bits(io_tree, file_offset, -@@ -784,23 +772,16 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, - - /* - * Calculate checksums of the data contained inside a bio. -- * -- * @inode: Owner of the data inside the bio -- * @bio: Contains the data to be checksummed -- * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the -- * file offsets are determined from the page offsets in the bio. -- * Otherwise, this is the starting file offset of the bio vecs in -- * @bio, which must be contiguous. -- * @one_ordered: If true, @bio only refers to one ordered extent. 
- */ --blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, -- u64 offset, bool one_ordered) -+blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) - { -+ struct btrfs_inode *inode = bbio->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); -+ struct bio *bio = &bbio->bio; -+ u64 offset = bbio->file_offset; - struct btrfs_ordered_sum *sums; - struct btrfs_ordered_extent *ordered = NULL; -- const bool use_page_offsets = (offset == (u64)-1); - char *data; - struct bvec_iter iter; - struct bio_vec bvec; -@@ -828,9 +809,6 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - shash->tfm = fs_info->csum_shash; - - bio_for_each_segment(bvec, bio, iter) { -- if (use_page_offsets) -- offset = page_offset(bvec.bv_page) + bvec.bv_offset; -- - if (!ordered) { - ordered = btrfs_lookup_ordered_extent(inode, offset); - /* -@@ -852,7 +830,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - - 1); - - for (i = 0; i < blockcount; i++) { -- if (!one_ordered && -+ if (!(bio->bi_opf & REQ_BTRFS_ONE_ORDERED) && - !in_range(offset, ordered->file_offset, - ordered->num_bytes)) { - unsigned long bytes_left; -diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h -index 031225668434..cd7f2ae515c0 100644 ---- a/fs/btrfs/file-item.h -+++ b/fs/btrfs/file-item.h -@@ -38,7 +38,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) - - int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, u64 len); --blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); -+blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio); - int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 objectid, u64 pos, - u64 num_bytes); -@@ -49,8 +49,10 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, - int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_ordered_sum *sums); --blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, -- u64 offset, bool one_ordered); -+blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio); -+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, -+ struct list_head *list, int search_commit, -+ bool nowait); - int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait); -diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c -index af046d22300e..ec5c5355906b 100644 ---- a/fs/btrfs/file.c -+++ b/fs/btrfs/file.c -@@ -1017,7 +1017,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, - unlock_page(pages[i]); - put_page(pages[i]); - } -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - return -EAGAIN; - } -@@ -1465,6 +1465,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) - ssize_t err; - unsigned int ilock_flags = 0; - struct iomap_dio *dio; -+ struct btrfs_ordered_extent *ordered_extent = NULL; - - if (iocb->ki_flags & IOCB_NOWAIT) - ilock_flags |= BTRFS_ILOCK_TRY; -@@ -1526,7 +1527,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) - * got -EFAULT, faulting in the pages before the retry. 
- */ - from->nofault = true; -- dio = btrfs_dio_write(iocb, from, written); -+ dio = btrfs_dio_write(iocb, from, &ordered_extent, written); - from->nofault = false; - - /* -@@ -1569,6 +1570,14 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) - goto relock; - } - } -+ /* -+ * We can't loop back to btrfs_dio_write, so we can drop the cached -+ * ordered extent. Typically btrfs_dio_iomap_end will run and put the -+ * ordered_extent, but this is needed to clean up in case of an error -+ * path breaking out of iomap_iter before the final iomap_end call. -+ */ -+ if (ordered_extent) -+ btrfs_put_ordered_extent(ordered_extent); - - /* - * If 'err' is -ENOTBLK or we have not written all data, then it means -diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c -index c667e878ef1a..4d155a48ec59 100644 ---- a/fs/btrfs/free-space-tree.c -+++ b/fs/btrfs/free-space-tree.c -@@ -1283,7 +1283,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) - list_del(&free_space_root->dirty_list); - - btrfs_tree_lock(free_space_root->node); -- btrfs_clean_tree_block(free_space_root->node); -+ btrfs_clear_buffer_dirty(trans, free_space_root->node); - btrfs_tree_unlock(free_space_root->node); - btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), - free_space_root->node, 0, 1); -diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h -index 3d8156fc8523..4c477eae6891 100644 ---- a/fs/btrfs/fs.h -+++ b/fs/btrfs/fs.h -@@ -3,6 +3,7 @@ - #ifndef BTRFS_FS_H - #define BTRFS_FS_H - -+#include - #include - #include - #include -@@ -748,8 +749,10 @@ struct btrfs_fs_info { - */ - u64 zone_size; - -- /* Max size to emit ZONE_APPEND write command */ -+ /* Constraints for ZONE_APPEND commands: */ -+ struct queue_limits limits; - u64 max_zone_append_size; -+ - struct mutex zoned_meta_io_lock; - spinlock_t treelog_bg_lock; - u64 treelog_bg; -diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c -index 98a800b8bd43..6aaa892474be 100644 ---- a/fs/btrfs/inode.c -+++ b/fs/btrfs/inode.c -@@ -81,30 +81,16 @@ struct btrfs_dio_data { - struct extent_changeset *data_reserved; - bool data_space_reserved; - bool nocow_done; -+ struct btrfs_ordered_extent *ordered; - }; - - struct btrfs_dio_private { -- struct btrfs_inode *inode; -- -- /* -- * Since DIO can use anonymous page, we cannot use page_offset() to -- * grab the file offset, thus need a dedicated member for file offset. -- */ -+ /* Range of I/O */ - u64 file_offset; -- /* Used for bio::bi_size */ - u32 bytes; - -- /* -- * References to this structure. There is one reference per in-flight -- * bio plus one while we're still setting up. -- */ -- refcount_t refs; -- -- /* Array of checksums */ -- u8 *csums; -- - /* This must be last */ -- struct bio bio; -+ struct btrfs_bio bbio; - }; - - static struct bio_set btrfs_dio_bioset; -@@ -228,7 +214,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, - { - unsigned long index = offset >> PAGE_SHIFT; - unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; -- u64 page_start, page_end; -+ u64 page_start = 0, page_end = 0; - struct page *page; - - if (locked_page) { -@@ -2535,19 +2521,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, - } - } - --/* -- * in order to insert checksums into the metadata in large chunks, -- * we wait until bio submission time. All the pages in the bio are -- * checksummed and sums are attached onto the ordered extent record. 
-- * -- * At IO completion time the cums attached on the ordered extent record -- * are inserted into the btree -- */ --blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio) --{ -- return btrfs_csum_one_bio(inode, bio, (u64)-1, false); --} -- - /* - * Split an extent_map at [start, start + len] - * -@@ -2663,19 +2636,19 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, - return ret; - } - --static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, -- struct bio *bio, loff_t file_offset) -+blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio) - { -+ u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; -+ u64 len = bbio->bio.bi_iter.bi_size; -+ struct btrfs_inode *inode = bbio->inode; - struct btrfs_ordered_extent *ordered; -- u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; - u64 file_len; -- u64 len = bio->bi_iter.bi_size; - u64 end = start + len; - u64 ordered_end; - u64 pre, post; - int ret = 0; - -- ordered = btrfs_lookup_ordered_extent(inode, file_offset); -+ ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset); - if (WARN_ON_ONCE(!ordered)) - return BLK_STS_IOERR; - -@@ -2715,7 +2688,7 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, - ret = btrfs_split_ordered_extent(ordered, pre, post); - if (ret) - goto out; -- ret = split_zoned_em(inode, file_offset, file_len, pre, post); -+ ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post); - - out: - btrfs_put_ordered_extent(ordered); -@@ -2723,75 +2696,6 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, - return errno_to_blk_status(ret); - } - --void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) --{ -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- blk_status_t ret; -- -- if (bio_op(bio) == REQ_OP_ZONE_APPEND) { -- ret = extract_ordered_extent(inode, bio, -- page_offset(bio_first_bvec_all(bio)->bv_page)); -- if (ret) { -- btrfs_bio_end_io(btrfs_bio(bio), ret); -- return; -- } -- } -- -- /* -- * If we need to checksum, and the I/O is not issued by fsync and -- * friends, that is ->sync_writers != 0, defer the submission to a -- * workqueue to parallelize it. -- * -- * Csum items for reloc roots have already been cloned at this point, -- * so they are handled as part of the no-checksum case. -- */ -- if (!(inode->flags & BTRFS_INODE_NODATASUM) && -- !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && -- !btrfs_is_data_reloc_root(inode->root)) { -- if (!atomic_read(&inode->sync_writers) && -- btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) -- return; -- -- ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false); -- if (ret) { -- btrfs_bio_end_io(btrfs_bio(bio), ret); -- return; -- } -- } -- btrfs_submit_bio(fs_info, bio, mirror_num); --} -- --void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, -- int mirror_num, enum btrfs_compression_type compress_type) --{ -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- blk_status_t ret; -- -- if (compress_type != BTRFS_COMPRESS_NONE) { -- /* -- * btrfs_submit_compressed_read will handle completing the bio -- * if there were any errors, so just return here. 
-- */ -- btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num); -- return; -- } -- -- /* Save the original iter for read repair */ -- btrfs_bio(bio)->iter = bio->bi_iter; -- -- /* -- * Lookup bio sums does extra checks around whether we need to csum or -- * not, which is why we ignore skip_sum here. -- */ -- ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); -- if (ret) { -- btrfs_bio_end_io(btrfs_bio(bio), ret); -- return; -- } -- -- btrfs_submit_bio(fs_info, bio, mirror_num); --} -- - /* - * given a list of ordered sums record them in the inode. This happens - * at IO completion time based on sums calculated at bio submission time. -@@ -2969,7 +2873,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) - unlock_extent(&inode->io_tree, page_start, page_end, - &cached_state); - unlock_page(page); -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - goto again; - } -@@ -3259,15 +3163,13 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) - goto out; - } - -- /* A valid bdev implies a write on a sequential zone */ -- if (ordered_extent->bdev) { -+ /* A valid ->physical implies a write on a sequential zone. */ -+ if (ordered_extent->physical != (u64)-1) { - btrfs_rewrite_logical_zoned(ordered_extent); - btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes); - } - -- btrfs_free_io_failure_record(inode, start, end); -- - if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { - truncated = true; - logical_len = ordered_extent->truncated_len; -@@ -3474,109 +3376,55 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of - } - - /* -- * check_data_csum - verify checksum of one sector of uncompressed data -- * @inode: inode -- * @bbio: btrfs_bio which contains the csum -+ * Verify the checksum of a single data sector. -+ * -+ * @bbio: btrfs_io_bio which contains the csum -+ * @dev: device the sector is on - * @bio_offset: offset to the beginning of the bio (in bytes) -- * @page: page where is the data to be verified -- * @pgoff: offset inside the page -+ * @bv: bio_vec to check - * -- * The length of such check is always one sector size. -+ * Check if the checksum on a data block is valid. When a checksum mismatch is -+ * detected, report the error and fill the corrupted range with zero. - * -- * When csum mismatch is detected, we will also report the error and fill the -- * corrupted range with zero. (Thus it needs the extra parameters) -+ * Return %true if the sector is ok or had no checksum to start with, else %false. 
- */ --int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, -- u32 bio_offset, struct page *page, u32 pgoff) -+bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, -+ u32 bio_offset, struct bio_vec *bv) - { -+ struct btrfs_inode *inode = bbio->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; -- u32 len = fs_info->sectorsize; -+ u64 file_offset = bbio->file_offset + bio_offset; -+ u64 end = file_offset + bv->bv_len - 1; - u8 *csum_expected; - u8 csum[BTRFS_CSUM_SIZE]; - -- ASSERT(pgoff + len <= PAGE_SIZE); -+ ASSERT(bv->bv_len == fs_info->sectorsize); - -- csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); -+ if (!bbio->csum) -+ return true; - -- if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected)) -+ if (btrfs_is_data_reloc_root(inode->root) && -+ test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, -+ 1, NULL)) { -+ /* Skip the range without csum for data reloc inode */ -+ clear_extent_bits(&inode->io_tree, file_offset, end, -+ EXTENT_NODATASUM); -+ return true; -+ } -+ -+ csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); -+ if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, -+ csum_expected)) - goto zeroit; -- return 0; -+ return true; - - zeroit: -- btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset, -- csum, csum_expected, bbio->mirror_num); -- if (bbio->device) -- btrfs_dev_stat_inc_and_print(bbio->device, -- BTRFS_DEV_STAT_CORRUPTION_ERRS); -- memzero_page(page, pgoff, len); -- return -EIO; --} -- --/* -- * When reads are done, we need to check csums to verify the data is correct. -- * if there's a match, we allow the bio to finish. If not, the code in -- * extent_io.c will try to find good copies for us. -- * -- * @bio_offset: offset to the beginning of the bio (in bytes) -- * @start: file offset of the range start -- * @end: file offset of the range end (inclusive) -- * -- * Return a bitmap where bit set means a csum mismatch, and bit not set means -- * csum match. -- */ --unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, -- u32 bio_offset, struct page *page, -- u64 start, u64 end) --{ -- struct btrfs_inode *inode = BTRFS_I(page->mapping->host); -- struct btrfs_root *root = inode->root; -- struct btrfs_fs_info *fs_info = root->fs_info; -- struct extent_io_tree *io_tree = &inode->io_tree; -- const u32 sectorsize = root->fs_info->sectorsize; -- u32 pg_off; -- unsigned int result = 0; -- -- /* -- * This only happens for NODATASUM or compressed read. -- * Normally this should be covered by above check for compressed read -- * or the next check for NODATASUM. Just do a quicker exit here. 
-- */ -- if (bbio->csum == NULL) -- return 0; -- -- if (inode->flags & BTRFS_INODE_NODATASUM) -- return 0; -- -- if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) -- return 0; -- -- ASSERT(page_offset(page) <= start && -- end <= page_offset(page) + PAGE_SIZE - 1); -- for (pg_off = offset_in_page(start); -- pg_off < offset_in_page(end); -- pg_off += sectorsize, bio_offset += sectorsize) { -- u64 file_offset = pg_off + page_offset(page); -- int ret; -- -- if (btrfs_is_data_reloc_root(root) && -- test_range_bit(io_tree, file_offset, -- file_offset + sectorsize - 1, -- EXTENT_NODATASUM, 1, NULL)) { -- /* Skip the range without csum for data reloc inode */ -- clear_extent_bits(io_tree, file_offset, -- file_offset + sectorsize - 1, -- EXTENT_NODATASUM); -- continue; -- } -- ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); -- if (ret < 0) { -- const int nr_bit = (pg_off - offset_in_page(start)) >> -- root->fs_info->sectorsize_bits; -- -- result |= (1U << nr_bit); -- } -- } -- return result; -+ btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected, -+ bbio->mirror_num); -+ if (dev) -+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); -+ memzero_bvec(bv); -+ return false; - } - - /* -@@ -4987,7 +4835,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, - unlock_extent(io_tree, block_start, block_end, &cached_state); - unlock_page(page); - put_page(page); -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - goto again; - } -@@ -5466,8 +5314,6 @@ void btrfs_evict_inode(struct inode *inode) - if (is_bad_inode(inode)) - goto no_delete; - -- btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); -- - if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) - goto no_delete; - -@@ -7131,6 +6977,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - } - - static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, -+ struct btrfs_dio_data *dio_data, - const u64 start, - const u64 len, - const u64 orig_start, -@@ -7141,7 +6988,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, - const int type) - { - struct extent_map *em = NULL; -- int ret; -+ struct btrfs_ordered_extent *ordered; - - if (type != BTRFS_ORDERED_NOCOW) { - em = create_io_em(inode, start, len, orig_start, block_start, -@@ -7151,18 +6998,21 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, - if (IS_ERR(em)) - goto out; - } -- ret = btrfs_add_ordered_extent(inode, start, len, len, block_start, -- block_len, 0, -- (1 << type) | -- (1 << BTRFS_ORDERED_DIRECT), -- BTRFS_COMPRESS_NONE); -- if (ret) { -+ ordered = btrfs_alloc_ordered_extent(inode, start, len, len, -+ block_start, block_len, 0, -+ (1 << type) | -+ (1 << BTRFS_ORDERED_DIRECT), -+ BTRFS_COMPRESS_NONE); -+ if (IS_ERR(ordered)) { - if (em) { - free_extent_map(em); - btrfs_drop_extent_map_range(inode, start, - start + len - 1, false); - } -- em = ERR_PTR(ret); -+ em = ERR_PTR(PTR_ERR(ordered)); -+ } else { -+ ASSERT(!dio_data->ordered); -+ dio_data->ordered = ordered; - } - out: - -@@ -7170,6 +7020,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, - } - - static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, -+ struct btrfs_dio_data *dio_data, - u64 start, u64 len) - { - struct btrfs_root *root = inode->root; -@@ -7185,7 +7036,8 @@ static struct extent_map 
*btrfs_new_extent_direct(struct btrfs_inode *inode, - if (ret) - return ERR_PTR(ret); - -- em = btrfs_create_dio_extent(inode, start, ins.offset, start, -+ em = btrfs_create_dio_extent(inode, dio_data, -+ start, ins.offset, start, - ins.objectid, ins.offset, ins.offset, - ins.offset, BTRFS_ORDERED_REGULAR); - btrfs_dec_block_group_reservations(fs_info, ins.objectid); -@@ -7392,7 +7244,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - */ - if (writing || - test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - else - ret = nowait ? -EAGAIN : -ENOTBLK; - btrfs_put_ordered_extent(ordered); -@@ -7530,7 +7382,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, - } - space_reserved = true; - -- em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, -+ em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len, - orig_start, block_start, - len, orig_block_len, - ram_bytes, type); -@@ -7572,7 +7424,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, - goto out; - space_reserved = true; - -- em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); -+ em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; -@@ -7676,6 +7528,17 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, - } - } - -+ if (dio_data->ordered) { -+ ASSERT(write); -+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, -+ dio_data->ordered->file_offset, -+ dio_data->ordered->bytes_left); -+ if (IS_ERR(em)) { -+ ret = PTR_ERR(em); -+ goto err; -+ } -+ goto map_iomap; -+ } - memset(dio_data, 0, sizeof(*dio_data)); - - /* -@@ -7817,6 +7680,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, - else - free_extent_state(cached_state); - -+map_iomap: - /* - * Translate extent map information to iomap. - * We trim the extents (and move the addr) even though iomap code does -@@ -7833,10 +7697,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, - iomap->offset = start; - iomap->bdev = fs_info->fs_devices->latest_dev->bdev; - iomap->length = len; -- -- if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) -- iomap->flags |= IOMAP_F_ZONE_APPEND; -- - free_extent_map(em); - - return 0; -@@ -7874,13 +7734,25 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, - if (submitted < length) { - pos += submitted; - length -= submitted; -- if (write) -- btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, -- pos, length, false); -- else -+ if (write) { -+ if (submitted == 0) { -+ btrfs_mark_ordered_io_finished(BTRFS_I(inode), -+ NULL, pos, -+ length, false); -+ btrfs_put_ordered_extent(dio_data->ordered); -+ dio_data->ordered = NULL; -+ } -+ } else { - unlock_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); -+ } - ret = -ENOTBLK; -+ } else { -+ /* On the last bio, release our cached ordered_extent. */ -+ if (write) { -+ btrfs_put_ordered_extent(dio_data->ordered); -+ dio_data->ordered = NULL; -+ } - } - - if (write) -@@ -7888,267 +7760,47 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, - return ret; - } - --static void btrfs_dio_private_put(struct btrfs_dio_private *dip) --{ -- /* -- * This implies a barrier so that stores to dio_bio->bi_status before -- * this and loads of dio_bio->bi_status after this are fully ordered. 
-- */ -- if (!refcount_dec_and_test(&dip->refs)) -- return; -- -- if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { -- btrfs_mark_ordered_io_finished(dip->inode, NULL, -- dip->file_offset, dip->bytes, -- !dip->bio.bi_status); -- } else { -- unlock_extent(&dip->inode->io_tree, -- dip->file_offset, -- dip->file_offset + dip->bytes - 1, NULL); -- } -- -- kfree(dip->csums); -- bio_endio(&dip->bio); --} -- --void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -+static void btrfs_dio_end_io(struct btrfs_bio *bbio) - { -- struct btrfs_dio_private *dip = btrfs_bio(bio)->private; -- -- BUG_ON(bio_op(bio) == REQ_OP_WRITE); -- -- refcount_inc(&dip->refs); -- btrfs_submit_bio(inode->root->fs_info, bio, mirror_num); --} -- --static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, -- struct btrfs_bio *bbio, -- const bool uptodate) --{ -- struct inode *inode = &dip->inode->vfs_inode; -- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; -- const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); -- blk_status_t err = BLK_STS_OK; -- struct bvec_iter iter; -- struct bio_vec bv; -- u32 offset; -- -- btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { -- u64 start = bbio->file_offset + offset; -- -- if (uptodate && -- (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset, -- bv.bv_page, bv.bv_offset))) { -- btrfs_clean_io_failure(BTRFS_I(inode), start, -- bv.bv_page, bv.bv_offset); -- } else { -- int ret; -- -- ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, -- bv.bv_page, bv.bv_offset, false); -- if (ret) -- err = errno_to_blk_status(ret); -- } -- } -- -- return err; --} -- --blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, -- struct bio *bio, -- u64 dio_file_offset) --{ -- return btrfs_csum_one_bio(inode, bio, dio_file_offset, false); --} -- --static void btrfs_end_dio_bio(struct btrfs_bio *bbio) --{ -- struct btrfs_dio_private *dip = bbio->private; -+ struct btrfs_dio_private *dip = -+ container_of(bbio, struct btrfs_dio_private, bbio); -+ struct btrfs_inode *inode = bbio->inode; - struct bio *bio = &bbio->bio; -- blk_status_t err = bio->bi_status; -- -- if (err) -- btrfs_warn(dip->inode->root->fs_info, -- "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", -- btrfs_ino(dip->inode), bio_op(bio), -- bio->bi_opf, bio->bi_iter.bi_sector, -- bio->bi_iter.bi_size, err); -- -- if (bio_op(bio) == REQ_OP_READ) -- err = btrfs_check_read_dio_bio(dip, bbio, !err); - -- if (err) -- dip->bio.bi_status = err; -- -- btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio); -- -- bio_put(bio); -- btrfs_dio_private_put(dip); --} -- --static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, -- u64 file_offset, int async_submit) --{ -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- struct btrfs_dio_private *dip = btrfs_bio(bio)->private; -- blk_status_t ret; -- -- /* Save the original iter for read repair */ -- if (btrfs_op(bio) == BTRFS_MAP_READ) -- btrfs_bio(bio)->iter = bio->bi_iter; -- -- if (inode->flags & BTRFS_INODE_NODATASUM) -- goto map; -+ if (bio->bi_status) { -+ btrfs_warn(inode->root->fs_info, -+ "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", -+ btrfs_ino(inode), bio->bi_opf, -+ dip->file_offset, dip->bytes, bio->bi_status); -+ } - -- if (btrfs_op(bio) == BTRFS_MAP_WRITE) { -- /* Check btrfs_submit_data_write_bio() for async submit rules */ -- if (async_submit && !atomic_read(&inode->sync_writers) 
&& -- btrfs_wq_submit_bio(inode, bio, 0, file_offset, -- WQ_SUBMIT_DATA_DIO)) -- return; -+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) -+ btrfs_mark_ordered_io_finished(inode, NULL, dip->file_offset, -+ dip->bytes, !bio->bi_status); -+ else -+ unlock_extent(&inode->io_tree, dip->file_offset, -+ dip->file_offset + dip->bytes - 1, NULL); - -- /* -- * If we aren't doing async submit, calculate the csum of the -- * bio now. -- */ -- ret = btrfs_csum_one_bio(inode, bio, file_offset, false); -- if (ret) { -- btrfs_bio_end_io(btrfs_bio(bio), ret); -- return; -- } -- } else { -- btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, -- file_offset - dip->file_offset); -- } --map: -- btrfs_submit_bio(fs_info, bio, 0); -+ bbio->bio.bi_private = bbio->private; -+ iomap_dio_bio_end_io(bio); - } - --static void btrfs_submit_direct(const struct iomap_iter *iter, -- struct bio *dio_bio, loff_t file_offset) -+static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, -+ loff_t file_offset) - { -+ struct btrfs_bio *bbio = btrfs_bio(bio); - struct btrfs_dio_private *dip = -- container_of(dio_bio, struct btrfs_dio_private, bio); -- struct inode *inode = iter->inode; -- const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); -- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -- const bool raid56 = (btrfs_data_alloc_profile(fs_info) & -- BTRFS_BLOCK_GROUP_RAID56_MASK); -- struct bio *bio; -- u64 start_sector; -- int async_submit = 0; -- u64 submit_len; -- u64 clone_offset = 0; -- u64 clone_len; -- u64 logical; -- int ret; -- blk_status_t status; -- struct btrfs_io_geometry geom; -+ container_of(bbio, struct btrfs_dio_private, bbio); - struct btrfs_dio_data *dio_data = iter->private; -- struct extent_map *em = NULL; -- -- dip->inode = BTRFS_I(inode); -- dip->file_offset = file_offset; -- dip->bytes = dio_bio->bi_iter.bi_size; -- refcount_set(&dip->refs, 1); -- dip->csums = NULL; -- -- if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { -- unsigned int nr_sectors = -- (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits); -- -- /* -- * Load the csums up front to reduce csum tree searches and -- * contention when submitting bios. -- */ -- status = BLK_STS_RESOURCE; -- dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); -- if (!dip->csums) -- goto out_err; -- -- status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); -- if (status != BLK_STS_OK) -- goto out_err; -- } -- -- start_sector = dio_bio->bi_iter.bi_sector; -- submit_len = dio_bio->bi_iter.bi_size; -- -- do { -- logical = start_sector << 9; -- em = btrfs_get_chunk_map(fs_info, logical, submit_len); -- if (IS_ERR(em)) { -- status = errno_to_blk_status(PTR_ERR(em)); -- em = NULL; -- goto out_err_em; -- } -- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), -- logical, &geom); -- if (ret) { -- status = errno_to_blk_status(ret); -- goto out_err_em; -- } -- -- clone_len = min(submit_len, geom.len); -- ASSERT(clone_len <= UINT_MAX); -- -- /* -- * This will never fail as it's passing GPF_NOFS and -- * the allocation is backed by btrfs_bioset. 
-- */ -- bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len, -- btrfs_end_dio_bio, dip); -- btrfs_bio(bio)->file_offset = file_offset; -- -- if (bio_op(bio) == REQ_OP_ZONE_APPEND) { -- status = extract_ordered_extent(BTRFS_I(inode), bio, -- file_offset); -- if (status) { -- bio_put(bio); -- goto out_err; -- } -- } -- -- ASSERT(submit_len >= clone_len); -- submit_len -= clone_len; -- -- /* -- * Increase the count before we submit the bio so we know -- * the end IO handler won't happen before we increase the -- * count. Otherwise, the dip might get freed before we're -- * done setting it up. -- * -- * We transfer the initial reference to the last bio, so we -- * don't need to increment the reference count for the last one. -- */ -- if (submit_len > 0) { -- refcount_inc(&dip->refs); -- /* -- * If we are submitting more than one bio, submit them -- * all asynchronously. The exception is RAID 5 or 6, as -- * asynchronous checksums make it difficult to collect -- * full stripe writes. -- */ -- if (!raid56) -- async_submit = 1; -- } - -- btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); -+ btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); -+ bbio->file_offset = file_offset; - -- dio_data->submitted += clone_len; -- clone_offset += clone_len; -- start_sector += clone_len >> 9; -- file_offset += clone_len; -- -- free_extent_map(em); -- } while (submit_len > 0); -- return; -+ dip->file_offset = file_offset; -+ dip->bytes = bio->bi_iter.bi_size; - --out_err_em: -- free_extent_map(em); --out_err: -- dio_bio->bi_status = status; -- btrfs_dio_private_put(dip); -+ dio_data->submitted += bio->bi_iter.bi_size; -+ btrfs_submit_bio(bio, 0); - } - - static const struct iomap_ops btrfs_dio_iomap_ops = { -@@ -8157,25 +7809,30 @@ static const struct iomap_ops btrfs_dio_iomap_ops = { - }; - - static const struct iomap_dio_ops btrfs_dio_ops = { -- .submit_io = btrfs_submit_direct, -+ .submit_io = btrfs_dio_submit_io, - .bio_set = &btrfs_dio_bioset, - }; - - ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) - { -- struct btrfs_dio_data data; -+ struct btrfs_dio_data data = { 0 }; - - return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); - } - - struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, -+ struct btrfs_ordered_extent **ordered_extent, - size_t done_before) - { -- struct btrfs_dio_data data; -+ struct btrfs_dio_data dio_data = { .ordered = *ordered_extent }; -+ struct iomap_dio *dio; - -- return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, -- IOMAP_DIO_PARTIAL, &data, done_before); -+ dio = __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, -+ IOMAP_DIO_PARTIAL, &dio_data, done_before); -+ if (!IS_ERR_OR_NULL(dio)) -+ *ordered_extent = dio_data.ordered; -+ return dio; - } - - static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, -@@ -8552,7 +8209,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) - unlock_extent(io_tree, page_start, page_end, &cached_state); - unlock_page(page); - up_read(&BTRFS_I(inode)->i_mmap_lock); -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - goto again; - } -@@ -8850,7 +8507,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) - ei->last_log_commit = 0; - - spin_lock_init(&ei->lock); -- spin_lock_init(&ei->io_failure_lock); - ei->outstanding_extents = 0; - if 
(sb->s_magic != BTRFS_TEST_MAGIC) - btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, -@@ -8870,7 +8526,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) - ei->io_tree.inode = ei; - extent_io_tree_init(fs_info, &ei->file_extent_tree, - IO_TREE_INODE_FILE_EXTENT); -- ei->io_failure_tree = RB_ROOT; - atomic_set(&ei->sync_writers, 0); - mutex_init(&ei->log_mutex); - btrfs_ordered_inode_tree_init(&ei->ordered_tree); -@@ -8994,7 +8649,7 @@ int __init btrfs_init_cachep(void) - goto fail; - - if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, -- offsetof(struct btrfs_dio_private, bio), -+ offsetof(struct btrfs_dio_private, bbio.bio), - BIOSET_NEED_BVECS)) - goto fail; - -@@ -10289,65 +9944,13 @@ struct btrfs_encoded_read_private { - wait_queue_head_t wait; - atomic_t pending; - blk_status_t status; -- bool skip_csum; - }; - --static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, -- struct bio *bio, int mirror_num) --{ -- struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private; -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- blk_status_t ret; -- -- if (!priv->skip_csum) { -- ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); -- if (ret) -- return ret; -- } -- -- atomic_inc(&priv->pending); -- btrfs_submit_bio(fs_info, bio, mirror_num); -- return BLK_STS_OK; --} -- --static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) --{ -- const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); -- struct btrfs_encoded_read_private *priv = bbio->private; -- struct btrfs_inode *inode = priv->inode; -- struct btrfs_fs_info *fs_info = inode->root->fs_info; -- u32 sectorsize = fs_info->sectorsize; -- struct bio_vec *bvec; -- struct bvec_iter_all iter_all; -- u32 bio_offset = 0; -- -- if (priv->skip_csum || !uptodate) -- return bbio->bio.bi_status; -- -- bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { -- unsigned int i, nr_sectors, pgoff; -- -- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); -- pgoff = bvec->bv_offset; -- for (i = 0; i < nr_sectors; i++) { -- ASSERT(pgoff < PAGE_SIZE); -- if (btrfs_check_data_csum(inode, bbio, bio_offset, -- bvec->bv_page, pgoff)) -- return BLK_STS_IOERR; -- bio_offset += sectorsize; -- pgoff += sectorsize; -- } -- } -- return BLK_STS_OK; --} -- - static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) - { - struct btrfs_encoded_read_private *priv = bbio->private; -- blk_status_t status; - -- status = btrfs_encoded_read_verify_csum(bbio); -- if (status) { -+ if (bbio->bio.bi_status) { - /* - * The memory barrier implied by the atomic_dec_return() here - * pairs with the memory barrier implied by the -@@ -10356,11 +9959,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) - * write is observed before the load of status in - * btrfs_encoded_read_regular_fill_pages(). 
- */ -- WRITE_ONCE(priv->status, status); -+ WRITE_ONCE(priv->status, bbio->bio.bi_status); - } - if (!atomic_dec_return(&priv->pending)) - wake_up(&priv->wait); -- btrfs_bio_free_csum(bbio); - bio_put(&bbio->bio); - } - -@@ -10368,47 +9970,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, struct page **pages) - { -- struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_encoded_read_private priv = { - .inode = inode, - .file_offset = file_offset, - .pending = ATOMIC_INIT(1), -- .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM), - }; - unsigned long i = 0; - u64 cur = 0; -- int ret; - - init_waitqueue_head(&priv.wait); -- /* -- * Submit bios for the extent, splitting due to bio or stripe limits as -- * necessary. -- */ -+ /* Submit bios for the extent, splitting due to bio limits as necessary. */ - while (cur < disk_io_size) { -- struct extent_map *em; -- struct btrfs_io_geometry geom; - struct bio *bio = NULL; -- u64 remaining; -+ u64 remaining = disk_io_size - cur; - -- em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur, -- disk_io_size - cur); -- if (IS_ERR(em)) { -- ret = PTR_ERR(em); -- } else { -- ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ, -- disk_bytenr + cur, &geom); -- free_extent_map(em); -- } -- if (ret) { -- WRITE_ONCE(priv.status, errno_to_blk_status(ret)); -- break; -- } -- remaining = min(geom.len, disk_io_size - cur); - while (bio || remaining) { - size_t bytes = min_t(u64, remaining, PAGE_SIZE); - - if (!bio) { - bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, -+ inode, - btrfs_encoded_read_endio, - &priv); - bio->bi_iter.bi_sector = -@@ -10417,14 +9998,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - - if (!bytes || - bio_add_page(bio, pages[i], bytes, 0) < bytes) { -- blk_status_t status; -- -- status = submit_encoded_read_bio(inode, bio, 0); -- if (status) { -- WRITE_ONCE(priv.status, status); -- bio_put(bio); -- goto out; -- } -+ atomic_inc(&priv.pending); -+ btrfs_submit_bio(bio, 0); - bio = NULL; - continue; - } -@@ -10435,7 +10010,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - } - } - --out: - if (atomic_dec_return(&priv.pending)) - io_wait_event(priv.wait, !atomic_read(&priv.pending)); - /* See btrfs_encoded_read_endio() for ordering. */ -@@ -10995,9 +10569,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, - return 0; - - max_pages = sis->max - bsi->nr_pages; -- first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; -- next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, -- PAGE_SIZE) >> PAGE_SHIFT; -+ first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; -+ next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; - - if (first_ppage >= next_ppage) - return 0; -diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c -index 7e348bd2ccde..8ea557e22252 100644 ---- a/fs/btrfs/ioctl.c -+++ b/fs/btrfs/ioctl.c -@@ -707,7 +707,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, - * exists). 
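/*
 * Editorial aside, not part of the patch above: the swap-extent, lzo and
 * relocation hunks in this series replace open-coded ALIGN()/ALIGN_DOWN()/
 * IS_ALIGNED() calls with the page-granular helpers from <linux/mm.h>.
 * A minimal sketch of the equivalences being relied on; the function name
 * page_align_sketch() is made up purely for illustration.
 */
static inline void page_align_sketch(unsigned long block_start,
				     unsigned long block_len)
{
	/* PAGE_ALIGN(x) is ALIGN(x, PAGE_SIZE): round up to a page boundary. */
	unsigned long first_ppage = PAGE_ALIGN(block_start) >> PAGE_SHIFT;

	/* PAGE_ALIGN_DOWN(x) is ALIGN_DOWN(x, PAGE_SIZE): round down. */
	unsigned long next_ppage =
		PAGE_ALIGN_DOWN(block_start + block_len) >> PAGE_SHIFT;

	/* PAGE_ALIGNED(x) is IS_ALIGNED((unsigned long)(x), PAGE_SIZE). */
	if (PAGE_ALIGNED(block_start))
		pr_debug("pages %lu..%lu start on a page boundary\n",
			 first_ppage, next_ppage);
}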
- */ - btrfs_tree_lock(leaf); -- btrfs_clean_tree_block(leaf); -+ btrfs_clear_buffer_dirty(trans, leaf); - btrfs_tree_unlock(leaf); - btrfs_free_tree_block(trans, objectid, leaf, 0, 1); - free_extent_buffer(leaf); -diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c -new file mode 100644 -index 000000000000..0fe0ae54ac67 ---- /dev/null -+++ b/fs/btrfs/lru_cache.c -@@ -0,0 +1,166 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include -+#include "lru_cache.h" -+#include "messages.h" -+ -+/* -+ * Initialize a cache object. -+ * -+ * @cache: The cache. -+ * @max_size: Maximum size (number of entries) for the cache. -+ * Use 0 for unlimited size, it's the user's responsability to -+ * trim the cache in that case. -+ */ -+void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size) -+{ -+ INIT_LIST_HEAD(&cache->lru_list); -+ mt_init(&cache->entries); -+ cache->size = 0; -+ cache->max_size = max_size; -+} -+ -+static struct btrfs_lru_cache_entry *match_entry(struct list_head *head, u64 key, -+ u64 gen) -+{ -+ struct btrfs_lru_cache_entry *entry; -+ -+ list_for_each_entry(entry, head, list) { -+ if (entry->key == key && entry->gen == gen) -+ return entry; -+ } -+ -+ return NULL; -+} -+ -+/* -+ * Lookup for an entry in the cache. -+ * -+ * @cache: The cache. -+ * @key: The key of the entry we are looking for. -+ * @gen: Generation associated to the key. -+ * -+ * Returns the entry associated with the key or NULL if none found. -+ */ -+struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, -+ u64 key, u64 gen) -+{ -+ struct list_head *head; -+ struct btrfs_lru_cache_entry *entry; -+ -+ head = mtree_load(&cache->entries, key); -+ if (!head) -+ return NULL; -+ -+ entry = match_entry(head, key, gen); -+ if (entry) -+ list_move_tail(&entry->lru_list, &cache->lru_list); -+ -+ return entry; -+} -+ -+/* -+ * Remove an entry from the cache. -+ * -+ * @cache: The cache to remove from. -+ * @entry: The entry to remove from the cache. -+ * -+ * Note: this also frees the memory used by the entry. -+ */ -+void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, -+ struct btrfs_lru_cache_entry *entry) -+{ -+ struct list_head *prev = entry->list.prev; -+ -+ ASSERT(cache->size > 0); -+ ASSERT(!mtree_empty(&cache->entries)); -+ -+ list_del(&entry->list); -+ list_del(&entry->lru_list); -+ -+ if (list_empty(prev)) { -+ struct list_head *head; -+ -+ /* -+ * If previous element in the list entry->list is now empty, it -+ * means it's a head entry not pointing to any cached entries, -+ * so remove it from the maple tree and free it. -+ */ -+ head = mtree_erase(&cache->entries, entry->key); -+ ASSERT(head == prev); -+ kfree(head); -+ } -+ -+ kfree(entry); -+ cache->size--; -+} -+ -+/* -+ * Store an entry in the cache. -+ * -+ * @cache: The cache. -+ * @entry: The entry to store. -+ * -+ * Returns 0 on success and < 0 on error. 
-+ */ -+int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, -+ struct btrfs_lru_cache_entry *new_entry, -+ gfp_t gfp) -+{ -+ const u64 key = new_entry->key; -+ struct list_head *head; -+ int ret; -+ -+ head = kmalloc(sizeof(*head), gfp); -+ if (!head) -+ return -ENOMEM; -+ -+ ret = mtree_insert(&cache->entries, key, head, gfp); -+ if (ret == 0) { -+ INIT_LIST_HEAD(head); -+ list_add_tail(&new_entry->list, head); -+ } else if (ret == -EEXIST) { -+ kfree(head); -+ head = mtree_load(&cache->entries, key); -+ ASSERT(head != NULL); -+ if (match_entry(head, key, new_entry->gen) != NULL) -+ return -EEXIST; -+ list_add_tail(&new_entry->list, head); -+ } else if (ret < 0) { -+ kfree(head); -+ return ret; -+ } -+ -+ if (cache->max_size > 0 && cache->size == cache->max_size) { -+ struct btrfs_lru_cache_entry *lru_entry; -+ -+ lru_entry = list_first_entry(&cache->lru_list, -+ struct btrfs_lru_cache_entry, -+ lru_list); -+ btrfs_lru_cache_remove(cache, lru_entry); -+ } -+ -+ list_add_tail(&new_entry->lru_list, &cache->lru_list); -+ cache->size++; -+ -+ return 0; -+} -+ -+/* -+ * Empty a cache. -+ * -+ * @cache: The cache to empty. -+ * -+ * Removes all entries from the cache. -+ */ -+void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache) -+{ -+ struct btrfs_lru_cache_entry *entry; -+ struct btrfs_lru_cache_entry *tmp; -+ -+ list_for_each_entry_safe(entry, tmp, &cache->lru_list, lru_list) -+ btrfs_lru_cache_remove(cache, entry); -+ -+ ASSERT(cache->size == 0); -+ ASSERT(mtree_empty(&cache->entries)); -+} -diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h -new file mode 100644 -index 000000000000..de3e18bce24a ---- /dev/null -+++ b/fs/btrfs/lru_cache.h -@@ -0,0 +1,80 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+ -+#ifndef BTRFS_LRU_CACHE_H -+#define BTRFS_LRU_CACHE_H -+ -+#include -+#include -+ -+/* -+ * A cache entry. This is meant to be embedded in a structure of a user of -+ * this module. Similar to how struct list_head and struct rb_node are used. -+ * -+ * Note: it should be embedded as the first element in a struct (offset 0), and -+ * this module assumes it was allocated with kmalloc(), so it calls kfree() when -+ * it needs to free an entry. -+ */ -+struct btrfs_lru_cache_entry { -+ struct list_head lru_list; -+ u64 key; -+ /* -+ * Optional generation associated to a key. Use 0 if not needed/used. -+ * Entries with the same key and different generations are stored in a -+ * linked list, so use this only for cases where there's a small number -+ * of different generations. -+ */ -+ u64 gen; -+ /* -+ * The maple tree uses unsigned long type for the keys, which is 32 bits -+ * on 32 bits systems, and 64 bits on 64 bits systems. So if we want to -+ * use something like inode numbers as keys, which are always a u64, we -+ * have to deal with this in a special way - we store the key in the -+ * entry itself, as a u64, and the values inserted into the maple tree -+ * are linked lists of entries - so in case we are on a 64 bits system, -+ * that list always has a single entry, while on 32 bits systems it -+ * may have more than one, with each entry having the same value for -+ * their lower 32 bits of the u64 key. -+ */ -+ struct list_head list; -+}; -+ -+struct btrfs_lru_cache { -+ struct list_head lru_list; -+ struct maple_tree entries; -+ /* Number of entries stored in the cache. */ -+ unsigned int size; -+ /* Maximum number of entries the cache can have. 
*/ -+ unsigned int max_size; -+}; -+ -+#define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \ -+ list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list) -+ -+static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache) -+{ -+ return cache->size; -+} -+ -+static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache) -+{ -+ return cache->size >= cache->max_size; -+} -+ -+static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( -+ struct btrfs_lru_cache *cache) -+{ -+ return list_first_entry_or_null(&cache->lru_list, -+ struct btrfs_lru_cache_entry, lru_list); -+} -+ -+void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size); -+struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, -+ u64 key, u64 gen); -+int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, -+ struct btrfs_lru_cache_entry *new_entry, -+ gfp_t gfp); -+void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, -+ struct btrfs_lru_cache_entry *entry); -+void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache); -+ -+#endif -diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c -index d5e78cbc8fbc..71f6d8302d50 100644 ---- a/fs/btrfs/lzo.c -+++ b/fs/btrfs/lzo.c -@@ -280,7 +280,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - } - - /* Check if we have reached page boundary */ -- if (IS_ALIGNED(cur_in, PAGE_SIZE)) { -+ if (PAGE_ALIGNED(cur_in)) { - put_page(page_in); - page_in = NULL; - } -diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c -index 625bbbbb2608..fde5aaa6e7c9 100644 ---- a/fs/btrfs/messages.c -+++ b/fs/btrfs/messages.c -@@ -292,36 +292,6 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) - } - #endif - --/* -- * We only mark the transaction aborted and then set the file system read-only. -- * This will prevent new transactions from starting or trying to join this -- * one. -- * -- * This means that error recovery at the call site is limited to freeing -- * any local memory allocations and passing the error code up without -- * further cleanup. The transaction should complete as it normally would -- * in the call path but will return -EIO. -- * -- * We'll complete the cleanup in btrfs_end_transaction and -- * btrfs_commit_transaction. -- */ --__cold --void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, -- const char *function, -- unsigned int line, int errno, bool first_hit) --{ -- struct btrfs_fs_info *fs_info = trans->fs_info; -- -- WRITE_ONCE(trans->aborted, errno); -- WRITE_ONCE(trans->transaction->aborted, errno); -- if (first_hit && errno == -ENOSPC) -- btrfs_dump_space_info_for_trans_abort(fs_info); -- /* Wake up anybody who may be waiting on this transaction */ -- wake_up(&fs_info->transaction_wait); -- wake_up(&fs_info->transaction_blocked_wait); -- __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); --} -- - /* - * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an - * alert, and either panics or BUGs, depending on mount options. -diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h -index 190af1f698d9..8c516ee58ff9 100644 ---- a/fs/btrfs/messages.h -+++ b/fs/btrfs/messages.h -@@ -6,7 +6,6 @@ - #include - - struct btrfs_fs_info; --struct btrfs_trans_handle; - - static inline __printf(2, 3) __cold - void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 
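/*
 * Editorial aside, not part of the patch above: a minimal usage sketch of the
 * btrfs_lru_cache API introduced in fs/btrfs/lru_cache.{c,h}.  The type
 * "demo_entry" and the function "demo_cache_store" are hypothetical names used
 * only for illustration; the cache calls follow the declarations shown in
 * lru_cache.h, and the usual includes (<linux/slab.h>, "lru_cache.h") are
 * assumed.  The caller is expected to have run btrfs_lru_cache_init(cache,
 * max_size) beforehand and to call btrfs_lru_cache_clear(cache) when done.
 */
struct demo_entry {
	/* Must be the first member (offset 0) and allocated with kmalloc(). */
	struct btrfs_lru_cache_entry entry;
	u64 payload;
};

static int demo_cache_store(struct btrfs_lru_cache *cache, u64 ino, u64 gen,
			    u64 payload)
{
	struct demo_entry *de;
	int ret;

	/* A hit also moves the entry to the tail of the cache's LRU list. */
	if (btrfs_lru_cache_lookup(cache, ino, gen))
		return 0;

	de = kmalloc(sizeof(*de), GFP_KERNEL);
	if (!de)
		return -ENOMEM;

	de->entry.key = ino;
	de->entry.gen = gen;
	de->payload = payload;

	/*
	 * On success the cache owns the entry and may later evict and kfree()
	 * it (eviction of the least recently used entry happens when the cache
	 * is full).  On failure (-ENOMEM, or -EEXIST for a duplicate key and
	 * generation) the caller still owns the entry and must free it.
	 */
	ret = btrfs_lru_cache_store(cache, &de->entry, GFP_KERNEL);
	if (ret)
		kfree(de);
	return ret;
}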
-@@ -178,39 +177,6 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function - - const char * __attribute_const__ btrfs_decode_error(int errno); - --__cold --void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, -- const char *function, -- unsigned int line, int errno, bool first_hit); -- --bool __cold abort_should_print_stack(int errno); -- --/* -- * Call btrfs_abort_transaction as early as possible when an error condition is -- * detected, that way the exact stack trace is reported for some errors. -- */ --#define btrfs_abort_transaction(trans, errno) \ --do { \ -- bool first = false; \ -- /* Report first abort since mount */ \ -- if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ -- &((trans)->fs_info->fs_state))) { \ -- first = true; \ -- if (WARN(abort_should_print_stack(errno), \ -- KERN_ERR \ -- "BTRFS: Transaction aborted (error %d)\n", \ -- (errno))) { \ -- /* Stack trace printed. */ \ -- } else { \ -- btrfs_err((trans)->fs_info, \ -- "Transaction aborted (error %d)", \ -- (errno)); \ -- } \ -- } \ -- __btrfs_abort_transaction((trans), __func__, \ -- __LINE__, (errno), first); \ --} while (0) -- - #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args) -diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c -index 57d8c72737e1..1848d0d1a9c4 100644 ---- a/fs/btrfs/ordered-data.c -+++ b/fs/btrfs/ordered-data.c -@@ -160,14 +160,16 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, - * @compress_type: Compression algorithm used for data. - * - * Most of these parameters correspond to &struct btrfs_file_extent_item. The -- * tree is given a single reference on the ordered extent that was inserted. -+ * tree is given a single reference on the ordered extent that was inserted, and -+ * the returned pointer is given a second reference. - * -- * Return: 0 or -ENOMEM. -+ * Return: the new ordered extent or ERR_PTR(-ENOMEM). - */ --int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, -- u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, -- u64 disk_num_bytes, u64 offset, unsigned flags, -- int compress_type) -+struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( -+ struct btrfs_inode *inode, u64 file_offset, -+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, -+ u64 disk_num_bytes, u64 offset, unsigned long flags, -+ int compress_type) - { - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; -@@ -181,7 +183,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - /* For nocow write, we can release the qgroup rsv right now */ - ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); - if (ret < 0) -- return ret; -+ return ERR_PTR(ret); - ret = 0; - } else { - /* -@@ -190,11 +192,11 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - */ - ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); - if (ret < 0) -- return ret; -+ return ERR_PTR(ret); - } - entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); - if (!entry) -- return -ENOMEM; -+ return ERR_PTR(-ENOMEM); - - entry->file_offset = file_offset; - entry->num_bytes = num_bytes; -@@ -256,6 +258,32 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - btrfs_mod_outstanding_extents(inode, 1); - spin_unlock(&inode->lock); - -+ /* One ref for the returned entry to match semantics of lookup. 
*/ -+ refcount_inc(&entry->refs); -+ -+ return entry; -+} -+ -+/* -+ * Add a new btrfs_ordered_extent for the range, but drop the reference instead -+ * of returning it to the caller. -+ */ -+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, -+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, -+ u64 disk_num_bytes, u64 offset, unsigned long flags, -+ int compress_type) -+{ -+ struct btrfs_ordered_extent *ordered; -+ -+ ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes, -+ ram_bytes, disk_bytenr, -+ disk_num_bytes, offset, flags, -+ compress_type); -+ -+ if (IS_ERR(ordered)) -+ return PTR_ERR(ordered); -+ btrfs_put_ordered_extent(ordered); -+ - return 0; - } - -@@ -616,7 +644,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) - struct btrfs_ordered_extent *ordered; - - ordered = container_of(work, struct btrfs_ordered_extent, flush_work); -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - complete(&ordered->completion); - } - -@@ -716,13 +744,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, - } - - /* -- * Used to start IO or wait for a given ordered extent to finish. -+ * Start IO and wait for a given ordered extent to finish. - * -- * If wait is one, this effectively waits on page writeback for all the pages -- * in the extent, and it waits on the io completion code to insert -- * metadata into the btree corresponding to the extent -+ * Wait on page writeback for all the pages in the extent and the IO completion -+ * code to insert metadata into the btree corresponding to the extent. - */ --void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) -+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) - { - u64 start = entry->file_offset; - u64 end = start + entry->num_bytes - 1; -@@ -744,12 +771,10 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) - */ - if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); -- if (wait) { -- if (!freespace_inode) -- btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); -- wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, -- &entry->flags)); -- } -+ -+ if (!freespace_inode) -+ btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); -+ wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); - } - - /* -@@ -800,7 +825,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) - btrfs_put_ordered_extent(ordered); - break; - } -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - end = ordered->file_offset; - /* - * If the ordered extent had an error save the error but don't -@@ -1061,7 +1086,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, - break; - } - unlock_extent(&inode->io_tree, start, end, cachedp); -- btrfs_start_ordered_extent(ordered, 1); -+ btrfs_start_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - } - } -diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h -index 89f82b78f590..18007f9c00ad 100644 ---- a/fs/btrfs/ordered-data.h -+++ b/fs/btrfs/ordered-data.h -@@ -157,7 +157,6 @@ struct btrfs_ordered_extent { - * command in a workqueue context - */ - u64 physical; -- struct block_device *bdev; - }; - - static inline void -@@ -179,15 +178,20 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - bool 
btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size); -+struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( -+ struct btrfs_inode *inode, u64 file_offset, -+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, -+ u64 disk_num_bytes, u64 offset, unsigned long flags, -+ int compress_type); - int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, -- u64 disk_num_bytes, u64 offset, unsigned flags, -+ u64 disk_num_bytes, u64 offset, unsigned long flags, - int compress_type); - void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, - struct btrfs_ordered_sum *sum); - struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, - u64 file_offset); --void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); -+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); - int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); - struct btrfs_ordered_extent * - btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c -index af97413abcf4..52a7d2fa2284 100644 ---- a/fs/btrfs/qgroup.c -+++ b/fs/btrfs/qgroup.c -@@ -1304,7 +1304,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) - list_del("a_root->dirty_list); - - btrfs_tree_lock(quota_root->node); -- btrfs_clean_tree_block(quota_root->node); -+ btrfs_clear_buffer_dirty(trans, quota_root->node); - btrfs_tree_unlock(quota_root->node); - btrfs_free_tree_block(trans, btrfs_root_id(quota_root), - quota_root->node, 0, 1); -diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c -index ff4b1d583788..642828c1b299 100644 ---- a/fs/btrfs/raid56.c -+++ b/fs/btrfs/raid56.c -@@ -998,7 +998,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) - } - - /* -- * Return the total numer of errors found in the vertical stripe of @sector_nr. -+ * Return the total number of errors found in the vertical stripe of @sector_nr. - * - * @faila and @failb will also be updated to the first and second stripe - * number of the errors. -@@ -1183,7 +1183,15 @@ static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, - trace_info->stripe_nr = -1; - } - --/* Generate PQ for one veritical stripe. */ -+static inline void bio_list_put(struct bio_list *bio_list) -+{ -+ struct bio *bio; -+ -+ while ((bio = bio_list_pop(bio_list))) -+ bio_put(bio); -+} -+ -+/* Generate PQ for one vertical stripe. */ - static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) - { - void **pointers = rbio->finish_pointers; -@@ -1228,7 +1236,6 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) - static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) - { -- struct bio *bio; - /* The total sector number inside the full stripe. */ - int total_sector_nr; - int sectornr; -@@ -1317,8 +1324,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, - - return 0; - error: -- while ((bio = bio_list_pop(bio_list))) -- bio_put(bio); -+ bio_list_put(bio_list); - return -EIO; - } - -@@ -1357,7 +1363,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) - } - - /* -- * For subpage case, we can no longer set page Uptodate directly for -+ * For subpage case, we can no longer set page Up-to-date directly for - * stripe_pages[], thus we need to locate the sector. 
- */ - static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, -@@ -1425,10 +1431,9 @@ static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bi - int total_sector_nr = get_bio_sector_nr(rbio, bio); - u32 bio_size = 0; - struct bio_vec *bvec; -- struct bvec_iter_all iter_all; - int i; - -- bio_for_each_segment_all(bvec, bio, iter_all) -+ bio_for_each_bvec_all(bvec, bio, i) - bio_size += bvec->bv_len; - - /* -@@ -1498,7 +1503,7 @@ static void raid_wait_read_end_io(struct bio *bio) - wake_up(&rbio->io_wait); - } - --static void submit_read_bios(struct btrfs_raid_bio *rbio, -+static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list) - { - struct bio *bio; -@@ -1515,41 +1520,8 @@ static void submit_read_bios(struct btrfs_raid_bio *rbio, - } - submit_bio(bio); - } --} -- --static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, -- struct bio_list *bio_list) --{ -- struct bio *bio; -- int total_sector_nr; -- int ret = 0; -- -- ASSERT(bio_list_size(bio_list) == 0); -- -- /* -- * Build a list of bios to read all sectors (including data and P/Q). -- * -- * This behaviro is to compensate the later csum verification and -- * recovery. -- */ -- for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; -- total_sector_nr++) { -- struct sector_ptr *sector; -- int stripe = total_sector_nr / rbio->stripe_nsectors; -- int sectornr = total_sector_nr % rbio->stripe_nsectors; -- -- sector = rbio_stripe_sector(rbio, stripe, sectornr); -- ret = rbio_add_io_sector(rbio, bio_list, sector, -- stripe, sectornr, REQ_OP_READ); -- if (ret) -- goto cleanup; -- } -- return 0; - --cleanup: -- while ((bio = bio_list_pop(bio_list))) -- bio_put(bio); -- return ret; -+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); - } - - static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) -@@ -1668,12 +1640,12 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) - struct btrfs_raid_bio *rbio; - struct btrfs_plug_cb *plug = NULL; - struct blk_plug_cb *cb; -- int ret = 0; - - rbio = alloc_rbio(fs_info, bioc); - if (IS_ERR(rbio)) { -- ret = PTR_ERR(rbio); -- goto fail; -+ bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); -+ bio_endio(bio); -+ return; - } - rbio->operation = BTRFS_RBIO_WRITE; - rbio_add_bio(rbio, bio); -@@ -1682,31 +1654,24 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) - * Don't plug on full rbios, just get them out the door - * as quickly as we can - */ -- if (rbio_is_full(rbio)) -- goto queue_rbio; -- -- cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); -- if (cb) { -- plug = container_of(cb, struct btrfs_plug_cb, cb); -- if (!plug->info) { -- plug->info = fs_info; -- INIT_LIST_HEAD(&plug->rbio_list); -+ if (!rbio_is_full(rbio)) { -+ cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); -+ if (cb) { -+ plug = container_of(cb, struct btrfs_plug_cb, cb); -+ if (!plug->info) { -+ plug->info = fs_info; -+ INIT_LIST_HEAD(&plug->rbio_list); -+ } -+ list_add_tail(&rbio->plug_list, &plug->rbio_list); -+ return; - } -- list_add_tail(&rbio->plug_list, &plug->rbio_list); -- return; - } --queue_rbio: -+ - /* - * Either we don't have any existing plug, or we're doing a full stripe, -- * can queue the rmw work now. -+ * queue the rmw work now. 
- */ - start_async_work(rbio, rmw_rbio_work); -- -- return; -- --fail: -- bio->bi_status = errno_to_blk_status(ret); -- bio_endio(bio); - } - - static int verify_one_sector(struct btrfs_raid_bio *rbio, -@@ -1773,7 +1738,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, - found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, - &failb); - /* -- * No errors in the veritical stripe, skip it. Can happen for recovery -+ * No errors in the vertical stripe, skip it. Can happen for recovery - * which only part of a stripe failed csum check. - */ - if (!found_errors) -@@ -1949,14 +1914,25 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) - return ret; - } - --static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, -- struct bio_list *bio_list) -+static void recover_rbio(struct btrfs_raid_bio *rbio) - { -- struct bio *bio; -+ struct bio_list bio_list = BIO_EMPTY_LIST; - int total_sector_nr; - int ret = 0; - -- ASSERT(bio_list_size(bio_list) == 0); -+ /* -+ * Either we're doing recover for a read failure or degraded write, -+ * caller should have set error bitmap correctly. -+ */ -+ ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); -+ -+ /* For recovery, we need to read all sectors including P/Q. */ -+ ret = alloc_rbio_pages(rbio); -+ if (ret < 0) -+ goto out; -+ -+ index_rbio_pages(rbio); -+ - /* - * Read everything that hasn't failed. However this time we will - * not trust any cached sector. -@@ -1987,78 +1963,32 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, - } - - sector = rbio_stripe_sector(rbio, stripe, sectornr); -- ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, -+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, - sectornr, REQ_OP_READ); -- if (ret < 0) -- goto error; -+ if (ret < 0) { -+ bio_list_put(&bio_list); -+ goto out; -+ } - } -- return 0; --error: -- while ((bio = bio_list_pop(bio_list))) -- bio_put(bio); -- -- return -EIO; --} -- --static int recover_rbio(struct btrfs_raid_bio *rbio) --{ -- struct bio_list bio_list; -- struct bio *bio; -- int ret; -- -- /* -- * Either we're doing recover for a read failure or degraded write, -- * caller should have set error bitmap correctly. -- */ -- ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); -- bio_list_init(&bio_list); -- -- /* For recovery, we need to read all sectors including P/Q. 
*/ -- ret = alloc_rbio_pages(rbio); -- if (ret < 0) -- goto out; -- -- index_rbio_pages(rbio); -- -- ret = recover_assemble_read_bios(rbio, &bio_list); -- if (ret < 0) -- goto out; -- -- submit_read_bios(rbio, &bio_list); -- wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); - -+ submit_read_wait_bio_list(rbio, &bio_list); - ret = recover_sectors(rbio); -- - out: -- while ((bio = bio_list_pop(&bio_list))) -- bio_put(bio); -- -- return ret; -+ rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } - - static void recover_rbio_work(struct work_struct *work) - { - struct btrfs_raid_bio *rbio; -- int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); -- -- ret = lock_stripe_add(rbio); -- if (ret == 0) { -- ret = recover_rbio(rbio); -- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); -- } -+ if (!lock_stripe_add(rbio)) -+ recover_rbio(rbio); - } - - static void recover_rbio_work_locked(struct work_struct *work) - { -- struct btrfs_raid_bio *rbio; -- int ret; -- -- rbio = container_of(work, struct btrfs_raid_bio, work); -- -- ret = recover_rbio(rbio); -- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); -+ recover_rbio(container_of(work, struct btrfs_raid_bio, work)); - } - - static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) -@@ -2204,11 +2134,9 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) - - static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) - { -- struct bio_list bio_list; -- struct bio *bio; -- int ret; -- -- bio_list_init(&bio_list); -+ struct bio_list bio_list = BIO_EMPTY_LIST; -+ int total_sector_nr; -+ int ret = 0; - - /* - * Fill the data csums we need for data verification. We need to fill -@@ -2217,24 +2145,32 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) - */ - fill_data_csums(rbio); - -- ret = rmw_assemble_read_bios(rbio, &bio_list); -- if (ret < 0) -- goto out; -+ /* -+ * Build a list of bios to read all sectors (including data and P/Q). -+ * -+ * This behavior is to compensate the later csum verification and recovery. -+ */ -+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; -+ total_sector_nr++) { -+ struct sector_ptr *sector; -+ int stripe = total_sector_nr / rbio->stripe_nsectors; -+ int sectornr = total_sector_nr % rbio->stripe_nsectors; - -- submit_read_bios(rbio, &bio_list); -- wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); -+ sector = rbio_stripe_sector(rbio, stripe, sectornr); -+ ret = rbio_add_io_sector(rbio, &bio_list, sector, -+ stripe, sectornr, REQ_OP_READ); -+ if (ret) { -+ bio_list_put(&bio_list); -+ return ret; -+ } -+ } - - /* - * We may or may not have any corrupted sectors (including missing dev - * and csum mismatch), just let recover_sectors() to handle them all. 
- */ -- ret = recover_sectors(rbio); -- return ret; --out: -- while ((bio = bio_list_pop(&bio_list))) -- bio_put(bio); -- -- return ret; -+ submit_read_wait_bio_list(rbio, &bio_list); -+ return recover_sectors(rbio); - } - - static void raid_wait_write_end_io(struct bio *bio) -@@ -2290,7 +2226,7 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) - return false; - } - --static int rmw_rbio(struct btrfs_raid_bio *rbio) -+static void rmw_rbio(struct btrfs_raid_bio *rbio) - { - struct bio_list bio_list; - int sectornr; -@@ -2302,30 +2238,28 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) - */ - ret = alloc_rbio_parity_pages(rbio); - if (ret < 0) -- return ret; -+ goto out; - - /* - * Either full stripe write, or we have every data sector already - * cached, can go to write path immediately. - */ -- if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio)) -- goto write; -- -- /* -- * Now we're doing sub-stripe write, also need all data stripes to do -- * the full RMW. -- */ -- ret = alloc_rbio_data_pages(rbio); -- if (ret < 0) -- return ret; -+ if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { -+ /* -+ * Now we're doing sub-stripe write, also need all data stripes -+ * to do the full RMW. -+ */ -+ ret = alloc_rbio_data_pages(rbio); -+ if (ret < 0) -+ goto out; - -- index_rbio_pages(rbio); -+ index_rbio_pages(rbio); - -- ret = rmw_read_wait_recover(rbio); -- if (ret < 0) -- return ret; -+ ret = rmw_read_wait_recover(rbio); -+ if (ret < 0) -+ goto out; -+ } - --write: - /* - * At this stage we're not allowed to add any new bios to the - * bio list any more, anyone else that wants to change this stripe -@@ -2356,7 +2290,7 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) - bio_list_init(&bio_list); - ret = rmw_assemble_write_bios(rbio, &bio_list); - if (ret < 0) -- return ret; -+ goto out; - - /* We should have at least one bio assembled. 
*/ - ASSERT(bio_list_size(&bio_list)); -@@ -2373,32 +2307,22 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) - break; - } - } -- return ret; -+out: -+ rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } - - static void rmw_rbio_work(struct work_struct *work) - { - struct btrfs_raid_bio *rbio; -- int ret; - - rbio = container_of(work, struct btrfs_raid_bio, work); -- -- ret = lock_stripe_add(rbio); -- if (ret == 0) { -- ret = rmw_rbio(rbio); -- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); -- } -+ if (lock_stripe_add(rbio) == 0) -+ rmw_rbio(rbio); - } - - static void rmw_rbio_work_locked(struct work_struct *work) - { -- struct btrfs_raid_bio *rbio; -- int ret; -- -- rbio = container_of(work, struct btrfs_raid_bio, work); -- -- ret = rmw_rbio(rbio); -- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); -+ rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); - } - - /* -@@ -2506,7 +2430,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) - struct sector_ptr p_sector = { 0 }; - struct sector_ptr q_sector = { 0 }; - struct bio_list bio_list; -- struct bio *bio; - int is_replace = 0; - int ret; - -@@ -2637,8 +2560,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) - return 0; - - cleanup: -- while ((bio = bio_list_pop(&bio_list))) -- bio_put(bio); -+ bio_list_put(&bio_list); - return ret; - } - -@@ -2733,15 +2655,12 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) - return ret; - } - --static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, -- struct bio_list *bio_list) -+static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) - { -- struct bio *bio; -+ struct bio_list bio_list = BIO_EMPTY_LIST; - int total_sector_nr; - int ret = 0; - -- ASSERT(bio_list_size(bio_list) == 0); -- - /* Build a list of bios to read all the missing parts. */ - for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; - total_sector_nr++) { -@@ -2770,45 +2689,38 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, - if (sector->uptodate) - continue; - -- ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, -+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, - sectornr, REQ_OP_READ); -- if (ret) -- goto error; -+ if (ret) { -+ bio_list_put(&bio_list); -+ return ret; -+ } - } -+ -+ submit_read_wait_bio_list(rbio, &bio_list); - return 0; --error: -- while ((bio = bio_list_pop(bio_list))) -- bio_put(bio); -- return ret; - } - --static int scrub_rbio(struct btrfs_raid_bio *rbio) -+static void scrub_rbio(struct btrfs_raid_bio *rbio) - { - bool need_check = false; -- struct bio_list bio_list; - int sector_nr; - int ret; -- struct bio *bio; -- -- bio_list_init(&bio_list); - - ret = alloc_rbio_essential_pages(rbio); - if (ret) -- goto cleanup; -+ goto out; - - bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); - -- ret = scrub_assemble_read_bios(rbio, &bio_list); -+ ret = scrub_assemble_read_bios(rbio); - if (ret < 0) -- goto cleanup; -- -- submit_read_bios(rbio, &bio_list); -- wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); -+ goto out; - - /* We may have some failures, recover the failed sectors first. */ - ret = recover_scrub_rbio(rbio); - if (ret < 0) -- goto cleanup; -+ goto out; - - /* - * We have every sector properly prepared. 
Can finish the scrub -@@ -2825,23 +2737,13 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) - break; - } - } -- return ret; -- --cleanup: -- while ((bio = bio_list_pop(&bio_list))) -- bio_put(bio); -- -- return ret; -+out: -+ rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } - - static void scrub_rbio_work_locked(struct work_struct *work) - { -- struct btrfs_raid_bio *rbio; -- int ret; -- -- rbio = container_of(work, struct btrfs_raid_bio, work); -- ret = scrub_rbio(rbio); -- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); -+ scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); - } - - void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) -diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h -index 7c73a443939e..df0e0abdeb1f 100644 ---- a/fs/btrfs/raid56.h -+++ b/fs/btrfs/raid56.h -@@ -65,7 +65,7 @@ struct btrfs_raid_bio { - /* Number of data stripes (no p/q) */ - u8 nr_data; - -- /* Numer of all stripes (including P/Q) */ -+ /* Number of all stripes (including P/Q) */ - u8 real_stripes; - - /* How many pages there are for each stripe */ -@@ -132,7 +132,7 @@ struct btrfs_raid_bio { - - /* - * Checksum buffer if the rbio is for data. The buffer should cover -- * all data sectors (exlcuding P/Q sectors). -+ * all data sectors (excluding P/Q sectors). - */ - u8 *csum_buf; - -diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c -index 31ec4a7658ce..ef13a9d4e370 100644 ---- a/fs/btrfs/relocation.c -+++ b/fs/btrfs/relocation.c -@@ -2825,7 +2825,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( - * - * Here we have to manually invalidate the range (i_size, PAGE_END + 1). - */ -- if (!IS_ALIGNED(i_size, PAGE_SIZE)) { -+ if (!PAGE_ALIGNED(i_size)) { - struct address_space *mapping = inode->vfs_inode.i_mapping; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - const u32 sectorsize = fs_info->sectorsize; -diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c -index a5d026041be4..69c93ae333f6 100644 ---- a/fs/btrfs/scrub.c -+++ b/fs/btrfs/scrub.c -@@ -229,7 +229,7 @@ struct full_stripe_lock { - }; - - #ifndef CONFIG_64BIT --/* This structure is for archtectures whose (void *) is smaller than u64 */ -+/* This structure is for architectures whose (void *) is smaller than u64 */ - struct scrub_page_private { - u64 logical; - }; -diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c -index d50182b6deec..e5c963bb873d 100644 ---- a/fs/btrfs/send.c -+++ b/fs/btrfs/send.c -@@ -32,6 +32,7 @@ - #include "file-item.h" - #include "ioctl.h" - #include "verity.h" -+#include "lru_cache.h" - - /* - * Maximum number of references an extent can have in order for us to attempt to -@@ -80,23 +81,23 @@ struct clone_root { - bool found_ref; - }; - --#define SEND_CTX_MAX_NAME_CACHE_SIZE 128 --#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) -+#define SEND_MAX_NAME_CACHE_SIZE 256 - - /* -- * Limit the root_ids array of struct backref_cache_entry to 12 elements. -- * This makes the size of a cache entry to be exactly 128 bytes on x86_64. -+ * Limit the root_ids array of struct backref_cache_entry to 17 elements. -+ * This makes the size of a cache entry to be exactly 192 bytes on x86_64, which -+ * can be satisfied from the kmalloc-192 slab, without wasting any space. - * The most common case is to have a single root for cloning, which corresponds -- * to the send root. Having the user specify more than 11 clone roots is not -+ * to the send root. 
Having the user specify more than 16 clone roots is not - * common, and in such rare cases we simply don't use caching if the number of -- * cloning roots that lead down to a leaf is more than 12. -+ * cloning roots that lead down to a leaf is more than 17. - */ --#define SEND_MAX_BACKREF_CACHE_ROOTS 12 -+#define SEND_MAX_BACKREF_CACHE_ROOTS 17 - - /* - * Max number of entries in the cache. -- * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding -- * maple tree's internal nodes, is 16K. -+ * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding -+ * maple tree's internal nodes, is 24K. - */ - #define SEND_MAX_BACKREF_CACHE_SIZE 128 - -@@ -107,15 +108,31 @@ struct clone_root { - * x86_64). - */ - struct backref_cache_entry { -- /* List to link to the cache's lru list. */ -- struct list_head list; -- /* The key for this entry in the cache. */ -- u64 key; -+ struct btrfs_lru_cache_entry entry; - u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; - /* Number of valid elements in the root_ids array. */ - int num_roots; - }; - -+/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ -+static_assert(offsetof(struct backref_cache_entry, entry) == 0); -+ -+/* -+ * Max number of entries in the cache that stores directories that were already -+ * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses -+ * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but -+ * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). -+ */ -+#define SEND_MAX_DIR_CREATED_CACHE_SIZE 64 -+ -+/* -+ * Max number of entries in the cache that stores directories that were already -+ * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses -+ * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but -+ * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). -+ */ -+#define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64 -+ - struct send_ctx { - struct file *send_filp; - loff_t send_off; -@@ -174,9 +191,7 @@ struct send_ctx { - struct list_head new_refs; - struct list_head deleted_refs; - -- struct radix_tree_root name_cache; -- struct list_head name_cache_list; -- int name_cache_size; -+ struct btrfs_lru_cache name_cache; - - /* - * The inode we are currently processing. It's not NULL only when we -@@ -285,13 +300,11 @@ struct send_ctx { - struct rb_root rbtree_new_refs; - struct rb_root rbtree_deleted_refs; - -- struct { -- u64 last_reloc_trans; -- struct list_head lru_list; -- struct maple_tree entries; -- /* Number of entries stored in the cache. */ -- int size; -- } backref_cache; -+ struct btrfs_lru_cache backref_cache; -+ u64 backref_cache_last_reloc_trans; -+ -+ struct btrfs_lru_cache dir_created_cache; -+ struct btrfs_lru_cache dir_utimes_cache; - }; - - struct pending_dir_move { -@@ -321,21 +334,15 @@ struct orphan_dir_info { - u64 ino; - u64 gen; - u64 last_dir_index_offset; -+ u64 dir_high_seq_ino; - }; - - struct name_cache_entry { -- struct list_head list; - /* -- * radix_tree has only 32bit entries but we need to handle 64bit inums. -- * We use the lower 32bit of the 64bit inum to store it in the tree. If -- * more then one inum would fall into the same entry, we use radix_list -- * to store the additional entries. radix_list is also used to store -- * entries where two entries have the same inum but different -- * generations. -+ * The key in the entry is an inode number, and the generation matches -+ * the inode's generation. 
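/*
 * Editorial aside, not part of the patch above: why lru_cache.h (and the
 * static_assert()s added to send.c in this series) insist that the embedded
 * btrfs_lru_cache_entry sits at offset 0 of its containing structure.  The
 * cache frees evicted entries with kfree(entry), so the entry pointer must be
 * exactly the pointer kmalloc() returned for the containing object.  A hedged
 * sketch; "demo_ref_entry" and "demo_lookup_payload" are made-up names.
 */
struct demo_ref_entry {
	struct btrfs_lru_cache_entry entry;	/* must stay the first member */
	u64 payload;
};

/* Compile-time guard, mirroring the static_assert()s added to send.c. */
static_assert(offsetof(struct demo_ref_entry, entry) == 0);

static u64 demo_lookup_payload(struct btrfs_lru_cache *cache, u64 ino, u64 gen)
{
	struct btrfs_lru_cache_entry *raw;

	raw = btrfs_lru_cache_lookup(cache, ino, gen);
	if (!raw)
		return 0;

	/*
	 * Recover the containing object.  With the entry at offset 0 this is
	 * effectively a cast, which is what keeps the cache's internal
	 * kfree(entry) on eviction pointing at the original allocation.
	 */
	return container_of(raw, struct demo_ref_entry, entry)->payload;
}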
- */ -- struct list_head radix_list; -- u64 ino; -- u64 gen; -+ struct btrfs_lru_cache_entry entry; - u64 parent_ino; - u64 parent_gen; - int ret; -@@ -344,6 +351,9 @@ struct name_cache_entry { - char name[]; - }; - -+/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ -+static_assert(offsetof(struct name_cache_entry, entry) == 0); -+ - #define ADVANCE 1 - #define ADVANCE_ONLY_NEXT -1 - -@@ -956,14 +966,12 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, - static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) - { - int ret; -- struct btrfs_inode_info info; -+ struct btrfs_inode_info info = { 0 }; - -- if (!gen) -- return -EPERM; -+ ASSERT(gen); - - ret = get_inode_info(root, ino, &info); -- if (!ret) -- *gen = info.gen; -+ *gen = info.gen; - return ret; - } - -@@ -1388,19 +1396,6 @@ static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, - return 0; - } - --static void empty_backref_cache(struct send_ctx *sctx) --{ -- struct backref_cache_entry *entry; -- struct backref_cache_entry *tmp; -- -- list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list) -- kfree(entry); -- -- INIT_LIST_HEAD(&sctx->backref_cache.lru_list); -- mtree_destroy(&sctx->backref_cache.entries); -- sctx->backref_cache.size = 0; --} -- - static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, - const u64 **root_ids_ret, int *root_count_ret) - { -@@ -1408,9 +1403,10 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, - struct send_ctx *sctx = bctx->sctx; - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; - const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; -+ struct btrfs_lru_cache_entry *raw_entry; - struct backref_cache_entry *entry; - -- if (sctx->backref_cache.size == 0) -+ if (btrfs_lru_cache_size(&sctx->backref_cache) == 0) - return false; - - /* -@@ -1424,18 +1420,18 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, - * transaction handle or holding fs_info->commit_root_sem, so no need - * to take any lock here. - */ -- if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) { -- empty_backref_cache(sctx); -+ if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) { -+ btrfs_lru_cache_clear(&sctx->backref_cache); - return false; - } - -- entry = mtree_load(&sctx->backref_cache.entries, key); -- if (!entry) -+ raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0); -+ if (!raw_entry) - return false; - -+ entry = container_of(raw_entry, struct backref_cache_entry, entry); - *root_ids_ret = entry->root_ids; - *root_count_ret = entry->num_roots; -- list_move_tail(&entry->list, &sctx->backref_cache.lru_list); - - return true; - } -@@ -1461,7 +1457,8 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, - if (!new_entry) - return; - -- new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits; -+ new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits; -+ new_entry->entry.gen = 0; - new_entry->num_roots = 0; - ULIST_ITER_INIT(&uiter); - while ((node = ulist_next(root_ids, &uiter)) != NULL) { -@@ -1489,23 +1486,12 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, - * none of the roots is part of the list of roots from which we are - * allowed to clone. Cache the new entry as it's still useful to avoid - * backref walking to determine which roots have a path to the leaf. 
-+ * -+ * Also use GFP_NOFS because we're called while holding a transaction -+ * handle or while holding fs_info->commit_root_sem. - */ -- -- if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) { -- struct backref_cache_entry *lru_entry; -- struct backref_cache_entry *mt_entry; -- -- lru_entry = list_first_entry(&sctx->backref_cache.lru_list, -- struct backref_cache_entry, list); -- mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key); -- ASSERT(mt_entry == lru_entry); -- list_del(&mt_entry->list); -- kfree(mt_entry); -- sctx->backref_cache.size--; -- } -- -- ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key, -- new_entry, GFP_NOFS); -+ ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry, -+ GFP_NOFS); - ASSERT(ret == 0 || ret == -ENOMEM); - if (ret) { - /* Caching is optional, no worries. */ -@@ -1513,17 +1499,13 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, - return; - } - -- list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list); -- - /* - * We are called from iterate_extent_inodes() while either holding a - * transaction handle or holding fs_info->commit_root_sem, so no need - * to take any lock here. - */ -- if (sctx->backref_cache.size == 0) -- sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans; -- -- sctx->backref_cache.size++; -+ if (btrfs_lru_cache_size(&sctx->backref_cache) == 1) -+ sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; - } - - static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, -@@ -1886,7 +1868,8 @@ enum inode_state { - inode_state_did_delete, - }; - --static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) -+static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, -+ u64 *send_gen, u64 *parent_gen) - { - int ret; - int left_ret; -@@ -1900,6 +1883,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) - goto out; - left_ret = (info.nlink == 0) ? -ENOENT : ret; - left_gen = info.gen; -+ if (send_gen) -+ *send_gen = ((left_ret == -ENOENT) ? 0 : info.gen); - - if (!sctx->parent_root) { - right_ret = -ENOENT; -@@ -1909,6 +1894,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) - goto out; - right_ret = (info.nlink == 0) ? -ENOENT : ret; - right_gen = info.gen; -+ if (parent_gen) -+ *parent_gen = ((right_ret == -ENOENT) ? 
0 : info.gen); - } - - if (!left_ret && !right_ret) { -@@ -1953,14 +1940,15 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) - return ret; - } - --static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) -+static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, -+ u64 *send_gen, u64 *parent_gen) - { - int ret; - - if (ino == BTRFS_FIRST_FREE_OBJECTID) - return 1; - -- ret = get_cur_inode_state(sctx, ino, gen); -+ ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); - if (ret < 0) - goto out; - -@@ -2121,43 +2109,36 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, - const char *name, int name_len, - u64 *who_ino, u64 *who_gen, u64 *who_mode) - { -- int ret = 0; -- u64 gen; -+ int ret; -+ u64 parent_root_dir_gen; - u64 other_inode = 0; - struct btrfs_inode_info info; - - if (!sctx->parent_root) -- goto out; -+ return 0; - -- ret = is_inode_existent(sctx, dir, dir_gen); -+ ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen); - if (ret <= 0) -- goto out; -+ return 0; - - /* - * If we have a parent root we need to verify that the parent dir was - * not deleted and then re-created, if it was then we have no overwrite - * and we can just unlink this entry. -+ * -+ * @parent_root_dir_gen was set to 0 if the inode does not exist in the -+ * parent root. - */ -- if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { -- ret = get_inode_gen(sctx->parent_root, dir, &gen); -- if (ret < 0 && ret != -ENOENT) -- goto out; -- if (ret) { -- ret = 0; -- goto out; -- } -- if (gen != dir_gen) -- goto out; -- } -+ if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID && -+ parent_root_dir_gen != dir_gen) -+ return 0; - - ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, - &other_inode); -- if (ret < 0 && ret != -ENOENT) -- goto out; -- if (ret) { -- ret = 0; -- goto out; -- } -+ if (ret == -ENOENT) -+ return 0; -+ else if (ret < 0) -+ return ret; - - /* - * Check if the overwritten ref was already processed. If yes, the ref -@@ -2168,18 +2149,15 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, - is_waiting_for_move(sctx, other_inode)) { - ret = get_inode_info(sctx->parent_root, other_inode, &info); - if (ret < 0) -- goto out; -+ return ret; - -- ret = 1; - *who_ino = other_inode; - *who_gen = info.gen; - *who_mode = info.mode; -- } else { -- ret = 0; -+ return 1; - } - --out: -- return ret; -+ return 0; - } - - /* -@@ -2194,47 +2172,43 @@ static int did_overwrite_ref(struct send_ctx *sctx, - u64 ino, u64 ino_gen, - const char *name, int name_len) - { -- int ret = 0; -- u64 gen; -+ int ret; - u64 ow_inode; -+ u64 ow_gen = 0; -+ u64 send_root_dir_gen; - - if (!sctx->parent_root) -- goto out; -+ return 0; - -- ret = is_inode_existent(sctx, dir, dir_gen); -+ ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL); - if (ret <= 0) -- goto out; -+ return ret; - -- if (dir != BTRFS_FIRST_FREE_OBJECTID) { -- ret = get_inode_gen(sctx->send_root, dir, &gen); -- if (ret < 0 && ret != -ENOENT) -- goto out; -- if (ret) { -- ret = 0; -- goto out; -- } -- if (gen != dir_gen) -- goto out; -- } -+ /* -+ * @send_root_dir_gen was set to 0 if the inode does not exist in the -+ * send root. 
-+ */ -+ if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen) -+ return 0; - - /* check if the ref was overwritten by another ref */ - ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, - &ow_inode); -- if (ret < 0 && ret != -ENOENT) -- goto out; -- if (ret) { -+ if (ret == -ENOENT) { - /* was never and will never be overwritten */ -- ret = 0; -- goto out; -+ return 0; -+ } else if (ret < 0) { -+ return ret; - } - -- ret = get_inode_gen(sctx->send_root, ow_inode, &gen); -- if (ret < 0) -- goto out; -+ if (ow_inode == ino) { -+ ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); -+ if (ret < 0) -+ return ret; - -- if (ow_inode == ino && gen == ino_gen) { -- ret = 0; -- goto out; -+ /* It's the same inode, so no overwrite happened. */ -+ if (ow_gen == ino_gen) -+ return 0; - } - - /* -@@ -2243,15 +2217,20 @@ static int did_overwrite_ref(struct send_ctx *sctx, - * inode 'ino' to be orphanized, therefore check if ow_inode matches - * the current inode being processed. - */ -- if ((ow_inode < sctx->send_progress) || -- (ino != sctx->cur_ino && ow_inode == sctx->cur_ino && -- gen == sctx->cur_inode_gen)) -- ret = 1; -- else -- ret = 0; -+ if (ow_inode < sctx->send_progress) -+ return 1; - --out: -- return ret; -+ if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) { -+ if (ow_gen == 0) { -+ ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); -+ if (ret < 0) -+ return ret; -+ } -+ if (ow_gen == sctx->cur_inode_gen) -+ return 1; -+ } -+ -+ return 0; - } - - /* -@@ -2285,113 +2264,16 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) - return ret; - } - --/* -- * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, -- * so we need to do some special handling in case we have clashes. This function -- * takes care of this with the help of name_cache_entry::radix_list. -- * In case of error, nce is kfreed. 
-- */ --static int name_cache_insert(struct send_ctx *sctx, -- struct name_cache_entry *nce) -+static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx, -+ u64 ino, u64 gen) - { -- int ret = 0; -- struct list_head *nce_head; -- -- nce_head = radix_tree_lookup(&sctx->name_cache, -- (unsigned long)nce->ino); -- if (!nce_head) { -- nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); -- if (!nce_head) { -- kfree(nce); -- return -ENOMEM; -- } -- INIT_LIST_HEAD(nce_head); -- -- ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); -- if (ret < 0) { -- kfree(nce_head); -- kfree(nce); -- return ret; -- } -- } -- list_add_tail(&nce->radix_list, nce_head); -- list_add_tail(&nce->list, &sctx->name_cache_list); -- sctx->name_cache_size++; -- -- return ret; --} -+ struct btrfs_lru_cache_entry *entry; - --static void name_cache_delete(struct send_ctx *sctx, -- struct name_cache_entry *nce) --{ -- struct list_head *nce_head; -- -- nce_head = radix_tree_lookup(&sctx->name_cache, -- (unsigned long)nce->ino); -- if (!nce_head) { -- btrfs_err(sctx->send_root->fs_info, -- "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", -- nce->ino, sctx->name_cache_size); -- } -- -- list_del(&nce->radix_list); -- list_del(&nce->list); -- sctx->name_cache_size--; -- -- /* -- * We may not get to the final release of nce_head if the lookup fails -- */ -- if (nce_head && list_empty(nce_head)) { -- radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); -- kfree(nce_head); -- } --} -- --static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, -- u64 ino, u64 gen) --{ -- struct list_head *nce_head; -- struct name_cache_entry *cur; -- -- nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); -- if (!nce_head) -+ entry = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen); -+ if (!entry) - return NULL; - -- list_for_each_entry(cur, nce_head, radix_list) { -- if (cur->ino == ino && cur->gen == gen) -- return cur; -- } -- return NULL; --} -- --/* -- * Remove some entries from the beginning of name_cache_list. -- */ --static void name_cache_clean_unused(struct send_ctx *sctx) --{ -- struct name_cache_entry *nce; -- -- if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE) -- return; -- -- while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) { -- nce = list_entry(sctx->name_cache_list.next, -- struct name_cache_entry, list); -- name_cache_delete(sctx, nce); -- kfree(nce); -- } --} -- --static void name_cache_free(struct send_ctx *sctx) --{ -- struct name_cache_entry *nce; -- -- while (!list_empty(&sctx->name_cache_list)) { -- nce = list_entry(sctx->name_cache_list.next, -- struct name_cache_entry, list); -- name_cache_delete(sctx, nce); -- kfree(nce); -- } -+ return container_of(entry, struct name_cache_entry, entry); - } - - /* -@@ -2410,7 +2292,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, - { - int ret; - int nce_ret; -- struct name_cache_entry *nce = NULL; -+ struct name_cache_entry *nce; - - /* - * First check if we already did a call to this function with the same -@@ -2420,17 +2302,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, - nce = name_cache_search(sctx, ino, gen); - if (nce) { - if (ino < sctx->send_progress && nce->need_later_update) { -- name_cache_delete(sctx, nce); -- kfree(nce); -+ btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry); - nce = NULL; - } else { -- /* -- * Removes the entry from the list and adds it back to -- * the end. 
This marks the entry as recently used so -- * that name_cache_clean_unused does not remove it. -- */ -- list_move_tail(&nce->list, &sctx->name_cache_list); -- - *parent_ino = nce->parent_ino; - *parent_gen = nce->parent_gen; - ret = fs_path_add(dest, nce->name, nce->name_len); -@@ -2446,7 +2320,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, - * This should only happen for the parent dir that we determine in - * record_new_ref_if_needed(). - */ -- ret = is_inode_existent(sctx, ino, gen); -+ ret = is_inode_existent(sctx, ino, gen, NULL, NULL); - if (ret < 0) - goto out; - -@@ -2497,8 +2371,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, - goto out; - } - -- nce->ino = ino; -- nce->gen = gen; -+ nce->entry.key = ino; -+ nce->entry.gen = gen; - nce->parent_ino = *parent_ino; - nce->parent_gen = *parent_gen; - nce->name_len = fs_path_len(dest); -@@ -2510,10 +2384,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, - else - nce->need_later_update = 1; - -- nce_ret = name_cache_insert(sctx, nce); -- if (nce_ret < 0) -+ nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); -+ if (nce_ret < 0) { -+ kfree(nce); - ret = nce_ret; -- name_cache_clean_unused(sctx); -+ } - - out: - return ret; -@@ -2883,6 +2758,63 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) - return ret; - } - -+/* -+ * If the cache is full, we can't remove entries from it and do a call to -+ * send_utimes() for each respective inode, because we might be finishing -+ * processing an inode that is a directory and it just got renamed, and existing -+ * entries in the cache may refer to inodes that have the directory in their -+ * full path - in which case we would generate outdated paths (pre-rename) -+ * for the inodes that the cache entries point to. Instead of prunning the -+ * cache when inserting, do it after we finish processing each inode at -+ * finish_inode_if_needed(). -+ */ -+static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) -+{ -+ struct btrfs_lru_cache_entry *entry; -+ int ret; -+ -+ entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen); -+ if (entry != NULL) -+ return 0; -+ -+ /* Caching is optional, don't fail if we can't allocate memory. */ -+ entry = kmalloc(sizeof(*entry), GFP_KERNEL); -+ if (!entry) -+ return send_utimes(sctx, dir, gen); -+ -+ entry->key = dir; -+ entry->gen = gen; -+ -+ ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL); -+ ASSERT(ret != -EEXIST); -+ if (ret) { -+ kfree(entry); -+ return send_utimes(sctx, dir, gen); -+ } -+ -+ return 0; -+} -+ -+static int trim_dir_utimes_cache(struct send_ctx *sctx) -+{ -+ while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) > -+ SEND_MAX_DIR_UTIMES_CACHE_SIZE) { -+ struct btrfs_lru_cache_entry *lru; -+ int ret; -+ -+ lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache); -+ ASSERT(lru != NULL); -+ -+ ret = send_utimes(sctx, lru->key, lru->gen); -+ if (ret) -+ return ret; -+ -+ btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru); -+ } -+ -+ return 0; -+} -+ - /* - * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have - * a valid path yet because we did not process the refs yet. So, the inode -@@ -2971,6 +2903,23 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) - return ret; - } - -+static void cache_dir_created(struct send_ctx *sctx, u64 dir) -+{ -+ struct btrfs_lru_cache_entry *entry; -+ int ret; -+ -+ /* Caching is optional, ignore any failures. 
*/ -+ entry = kmalloc(sizeof(*entry), GFP_KERNEL); -+ if (!entry) -+ return; -+ -+ entry->key = dir; -+ entry->gen = 0; -+ ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL); -+ if (ret < 0) -+ kfree(entry); -+} -+ - /* - * We need some special handling for inodes that get processed before the parent - * directory got created. See process_recorded_refs for details. -@@ -2986,6 +2935,9 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) - struct btrfs_key di_key; - struct btrfs_dir_item *di; - -+ if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0)) -+ return 1; -+ - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; -@@ -3009,6 +2961,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) - if (di_key.type != BTRFS_ROOT_ITEM_KEY && - di_key.objectid < sctx->send_progress) { - ret = 1; -+ cache_dir_created(sctx, dir); - break; - } - } -@@ -3038,7 +2991,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx) - return 0; - } - -- return send_create_inode(sctx, sctx->cur_ino); -+ ret = send_create_inode(sctx, sctx->cur_ino); -+ -+ if (ret == 0 && S_ISDIR(sctx->cur_inode_mode)) -+ cache_dir_created(sctx, sctx->cur_ino); -+ -+ return ret; - } - - struct recorded_ref { -@@ -3166,6 +3124,7 @@ static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx, - odi->ino = dir_ino; - odi->gen = dir_gen; - odi->last_dir_index_offset = 0; -+ odi->dir_high_seq_ino = 0; - - rb_link_node(&odi->node, parent, p); - rb_insert_color(&odi->node, &sctx->orphan_dirs); -@@ -3215,8 +3174,7 @@ static void free_orphan_dir_info(struct send_ctx *sctx, - * We check this by iterating all dir items and checking if the inode behind - * the dir item was already processed. - */ --static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, -- u64 send_progress) -+static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen) - { - int ret = 0; - int iter_ret = 0; -@@ -3227,6 +3185,8 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, - struct btrfs_key loc; - struct btrfs_dir_item *di; - struct orphan_dir_info *odi = NULL; -+ u64 dir_high_seq_ino = 0; -+ u64 last_dir_index_offset = 0; - - /* - * Don't try to rmdir the top/root subvolume dir. -@@ -3234,17 +3194,62 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, - if (dir == BTRFS_FIRST_FREE_OBJECTID) - return 0; - -+ odi = get_orphan_dir_info(sctx, dir, dir_gen); -+ if (odi && sctx->cur_ino < odi->dir_high_seq_ino) -+ return 0; -+ - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; - -+ if (!odi) { -+ /* -+ * Find the inode number associated with the last dir index -+ * entry. This is very likely the inode with the highest number -+ * of all inodes that have an entry in the directory. We can -+ * then use it to avoid future calls to can_rmdir(), when -+ * processing inodes with a lower number, from having to search -+ * the parent root b+tree for dir index keys. -+ */ -+ key.objectid = dir; -+ key.type = BTRFS_DIR_INDEX_KEY; -+ key.offset = (u64)-1; -+ -+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); -+ if (ret < 0) { -+ goto out; -+ } else if (ret > 0) { -+ /* Can't happen, the root is never empty. */ -+ ASSERT(path->slots[0] > 0); -+ if (WARN_ON(path->slots[0] == 0)) { -+ ret = -EUCLEAN; -+ goto out; -+ } -+ path->slots[0]--; -+ } -+ -+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); -+ if (key.objectid != dir || key.type != BTRFS_DIR_INDEX_KEY) { -+ /* No index keys, dir can be removed. 
*/ -+ ret = 1; -+ goto out; -+ } -+ -+ di = btrfs_item_ptr(path->nodes[0], path->slots[0], -+ struct btrfs_dir_item); -+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); -+ dir_high_seq_ino = loc.objectid; -+ if (sctx->cur_ino < dir_high_seq_ino) { -+ ret = 0; -+ goto out; -+ } -+ -+ btrfs_release_path(path); -+ } -+ - key.objectid = dir; - key.type = BTRFS_DIR_INDEX_KEY; -- key.offset = 0; -- -- odi = get_orphan_dir_info(sctx, dir, dir_gen); -- if (odi) -- key.offset = odi->last_dir_index_offset; -+ key.offset = (odi ? odi->last_dir_index_offset : 0); - - btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { - struct waiting_dir_move *dm; -@@ -3257,29 +3262,18 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, - struct btrfs_dir_item); - btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); - -+ dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid); -+ last_dir_index_offset = found_key.offset; -+ - dm = get_waiting_dir_move(sctx, loc.objectid); - if (dm) { -- odi = add_orphan_dir_info(sctx, dir, dir_gen); -- if (IS_ERR(odi)) { -- ret = PTR_ERR(odi); -- goto out; -- } -- odi->gen = dir_gen; -- odi->last_dir_index_offset = found_key.offset; - dm->rmdir_ino = dir; - dm->rmdir_gen = dir_gen; - ret = 0; - goto out; - } - -- if (loc.objectid > send_progress) { -- odi = add_orphan_dir_info(sctx, dir, dir_gen); -- if (IS_ERR(odi)) { -- ret = PTR_ERR(odi); -- goto out; -- } -- odi->gen = dir_gen; -- odi->last_dir_index_offset = found_key.offset; -+ if (loc.objectid > sctx->cur_ino) { - ret = 0; - goto out; - } -@@ -3294,7 +3288,22 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, - - out: - btrfs_free_path(path); -- return ret; -+ -+ if (ret) -+ return ret; -+ -+ if (!odi) { -+ odi = add_orphan_dir_info(sctx, dir, dir_gen); -+ if (IS_ERR(odi)) -+ return PTR_ERR(odi); -+ -+ odi->gen = dir_gen; -+ } -+ -+ odi->last_dir_index_offset = last_dir_index_offset; -+ odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino); -+ -+ return 0; - } - - static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) -@@ -3579,7 +3588,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) - } - gen = odi->gen; - -- ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino); -+ ret = can_rmdir(sctx, rmdir_ino, gen); - if (ret < 0) - goto out; - if (!ret) -@@ -3599,7 +3608,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) - } - - finish: -- ret = send_utimes(sctx, pm->ino, pm->gen); -+ ret = cache_dir_utimes(sctx, pm->ino, pm->gen); - if (ret < 0) - goto out; - -@@ -3619,7 +3628,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) - if (ret < 0) - goto out; - -- ret = send_utimes(sctx, cur->dir, cur->dir_gen); -+ ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); - if (ret < 0) - goto out; - } -@@ -4242,7 +4251,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) - * "testdir_2". - */ - list_for_each_entry(cur, &sctx->new_refs, list) { -- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); -+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); - if (ret < 0) - goto out; - if (ret == inode_state_will_create) -@@ -4288,12 +4297,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) - * the source path when performing its rename - * operation. 
- */ -- if (is_waiting_for_move(sctx, ow_inode)) { -- wdm = get_waiting_dir_move(sctx, -- ow_inode); -- ASSERT(wdm); -+ wdm = get_waiting_dir_move(sctx, ow_inode); -+ if (wdm) - wdm->orphanized = true; -- } - - /* - * Make sure we clear our orphanized inode's -@@ -4306,10 +4312,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) - * and get instead the orphan name. - */ - nce = name_cache_search(sctx, ow_inode, ow_gen); -- if (nce) { -- name_cache_delete(sctx, nce); -- kfree(nce); -- } -+ if (nce) -+ btrfs_lru_cache_remove(&sctx->name_cache, -+ &nce->entry); - - /* - * ow_inode might currently be an ancestor of -@@ -4358,7 +4363,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) - * parent directory out of order. But we need to check if this - * did already happen before due to other refs in the same dir. - */ -- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); -+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); - if (ret < 0) - goto out; - if (ret == inode_state_will_create) { -@@ -4388,6 +4393,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) - ret = send_create_inode(sctx, cur->dir); - if (ret < 0) - goto out; -+ cache_dir_created(sctx, cur->dir); - } - } - -@@ -4470,8 +4476,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) - * later, we do this check again and rmdir it then if possible. - * See the use of check_dirs for more details. - */ -- ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, -- sctx->cur_ino); -+ ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen); - if (ret < 0) - goto out; - if (ret) { -@@ -4564,20 +4569,18 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) - if (cur->dir > sctx->cur_ino) - continue; - -- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); -+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); - if (ret < 0) - goto out; - - if (ret == inode_state_did_create || - ret == inode_state_no_change) { -- /* TODO delayed utimes */ -- ret = send_utimes(sctx, cur->dir, cur->dir_gen); -+ ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); - if (ret < 0) - goto out; - } else if (ret == inode_state_did_delete && - cur->dir != last_dir_ino_rm) { -- ret = can_rmdir(sctx, cur->dir, cur->dir_gen, -- sctx->cur_ino); -+ ret = can_rmdir(sctx, cur->dir, cur->dir_gen); - if (ret < 0) - goto out; - if (ret) { -@@ -5635,7 +5638,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, - * boundary in the send buffer. This means that there may be a gap - * between the beginning of the command and the file data. - */ -- data_offset = ALIGN(sctx->send_size, PAGE_SIZE); -+ data_offset = PAGE_ALIGN(sctx->send_size); - if (data_offset > sctx->send_max_size || - sctx->send_max_size - data_offset < disk_num_bytes) { - ret = -EOVERFLOW; -@@ -5759,7 +5762,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, - sent += size; - } - -- if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { -+ if (sctx->clean_page_cache && PAGE_ALIGNED(end)) { - /* - * Always operate only on ranges that are a multiple of the page - * size. This is not only to prevent zeroing parts of a page in -@@ -6754,12 +6757,26 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) - * it's moved/renamed, therefore we don't need to do it here. 
- */ - sctx->send_progress = sctx->cur_ino + 1; -- ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); -+ -+ /* -+ * If the current inode is a non-empty directory, delay issuing -+ * the utimes command for it, as it's very likely we have inodes -+ * with an higher number inside it. We want to issue the utimes -+ * command only after adding all dentries to it. -+ */ -+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0) -+ ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); -+ else -+ ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); -+ - if (ret < 0) - goto out; - } - - out: -+ if (!ret) -+ ret = trim_dir_utimes_cache(sctx); -+ - return ret; - } - -@@ -8044,6 +8061,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) - int clone_sources_to_rollback = 0; - size_t alloc_size; - int sort_clone_roots = 0; -+ struct btrfs_lru_cache_entry *entry; -+ struct btrfs_lru_cache_entry *tmp; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; -@@ -8094,11 +8113,22 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) - - INIT_LIST_HEAD(&sctx->new_refs); - INIT_LIST_HEAD(&sctx->deleted_refs); -- INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); -- INIT_LIST_HEAD(&sctx->name_cache_list); - -- INIT_LIST_HEAD(&sctx->backref_cache.lru_list); -- mt_init(&sctx->backref_cache.entries); -+ btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE); -+ btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE); -+ btrfs_lru_cache_init(&sctx->dir_created_cache, -+ SEND_MAX_DIR_CREATED_CACHE_SIZE); -+ /* -+ * This cache is periodically trimmed to a fixed size elsewhere, see -+ * cache_dir_utimes() and trim_dir_utimes_cache(). -+ */ -+ btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0); -+ -+ sctx->pending_dir_moves = RB_ROOT; -+ sctx->waiting_dir_moves = RB_ROOT; -+ sctx->orphan_dirs = RB_ROOT; -+ sctx->rbtree_new_refs = RB_ROOT; -+ sctx->rbtree_deleted_refs = RB_ROOT; - - sctx->flags = arg->flags; - -@@ -8165,12 +8195,6 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) - goto out; - } - -- sctx->pending_dir_moves = RB_ROOT; -- sctx->waiting_dir_moves = RB_ROOT; -- sctx->orphan_dirs = RB_ROOT; -- sctx->rbtree_new_refs = RB_ROOT; -- sctx->rbtree_deleted_refs = RB_ROOT; -- - sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), - arg->clone_sources_count + 1, - GFP_KERNEL); -@@ -8279,6 +8303,13 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) - if (ret < 0) - goto out; - -+ btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) { -+ ret = send_utimes(sctx, entry->key, entry->gen); -+ if (ret < 0) -+ goto out; -+ btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry); -+ } -+ - if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { - ret = begin_cmd(sctx, BTRFS_SEND_C_END); - if (ret < 0) -@@ -8358,11 +8389,12 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) - kvfree(sctx->send_buf); - kvfree(sctx->verity_descriptor); - -- name_cache_free(sctx); -- - close_current_inode(sctx); - -- empty_backref_cache(sctx); -+ btrfs_lru_cache_clear(&sctx->name_cache); -+ btrfs_lru_cache_clear(&sctx->backref_cache); -+ btrfs_lru_cache_clear(&sctx->dir_created_cache); -+ btrfs_lru_cache_clear(&sctx->dir_utimes_cache); - - kfree(sctx); - } -diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c -index 433ce221dc5c..581845bc206a 100644 ---- a/fs/btrfs/super.c -+++ b/fs/btrfs/super.c -@@ -58,6 +58,7 @@ - #include 
"scrub.h" - #include "verity.h" - #include "super.h" -+#include "extent-tree.h" - #define CREATE_TRACE_POINTS - #include - -@@ -2049,7 +2050,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) - } - - /* -- * Metadata in mixed block goup profiles are accounted in data -+ * Metadata in mixed block group profiles are accounted in data - */ - if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { - if (found->flags & BTRFS_BLOCK_GROUP_DATA) -diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c -index 108aa3876186..8c5efa5813b3 100644 ---- a/fs/btrfs/sysfs.c -+++ b/fs/btrfs/sysfs.c -@@ -702,7 +702,7 @@ static void release_raid_kobj(struct kobject *kobj) - kfree(to_raid_kobj(kobj)); - } - --static struct kobj_type btrfs_raid_ktype = { -+static const struct kobj_type btrfs_raid_ktype = { - .sysfs_ops = &kobj_sysfs_ops, - .release = release_raid_kobj, - .default_groups = raid_groups, -@@ -900,7 +900,7 @@ static void space_info_release(struct kobject *kobj) - kfree(sinfo); - } - --static struct kobj_type space_info_ktype = { -+static const struct kobj_type space_info_ktype = { - .sysfs_ops = &kobj_sysfs_ops, - .release = space_info_release, - .default_groups = space_info_groups, -@@ -1259,7 +1259,7 @@ static void btrfs_release_fsid_kobj(struct kobject *kobj) - complete(&fs_devs->kobj_unregister); - } - --static struct kobj_type btrfs_ktype = { -+static const struct kobj_type btrfs_ktype = { - .sysfs_ops = &kobj_sysfs_ops, - .release = btrfs_release_fsid_kobj, - }; -@@ -1789,7 +1789,7 @@ static void btrfs_release_devid_kobj(struct kobject *kobj) - complete(&device->kobj_unregister); - } - --static struct kobj_type devid_ktype = { -+static const struct kobj_type devid_ktype = { - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = devid_groups, - .release = btrfs_release_devid_kobj, -@@ -2103,7 +2103,7 @@ static void qgroups_release(struct kobject *kobj) - kfree(kobj); - } - --static struct kobj_type qgroups_ktype = { -+static const struct kobj_type qgroups_ktype = { - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = qgroups_groups, - .release = qgroups_release, -@@ -2173,7 +2173,7 @@ static void qgroup_release(struct kobject *kobj) - memset(&qgroup->kobj, 0, sizeof(*kobj)); - } - --static struct kobj_type qgroup_ktype = { -+static const struct kobj_type qgroup_ktype = { - .sysfs_ops = &kobj_sysfs_ops, - .release = qgroup_release, - .default_groups = qgroup_groups, -diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c -index c5b3a631bf4f..f2f2e11dac4c 100644 ---- a/fs/btrfs/tests/extent-map-tests.c -+++ b/fs/btrfs/tests/extent-map-tests.c -@@ -509,7 +509,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, - goto out_free; - } - -- ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1), -+ ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), - &logical, &out_ndaddrs, &out_stripe_len); - if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { - test_err("didn't rmap anything but expected %d", -diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c -index 8f8d0fce6e4a..18329ebcb1cb 100644 ---- a/fs/btrfs/transaction.c -+++ b/fs/btrfs/transaction.c -@@ -2609,6 +2609,35 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) - return (ret < 0) ? 0 : 1; - } - -+/* -+ * We only mark the transaction aborted and then set the file system read-only. -+ * This will prevent new transactions from starting or trying to join this -+ * one. 
-+ * -+ * This means that error recovery at the call site is limited to freeing -+ * any local memory allocations and passing the error code up without -+ * further cleanup. The transaction should complete as it normally would -+ * in the call path but will return -EIO. -+ * -+ * We'll complete the cleanup in btrfs_end_transaction and -+ * btrfs_commit_transaction. -+ */ -+void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, -+ const char *function, -+ unsigned int line, int errno, bool first_hit) -+{ -+ struct btrfs_fs_info *fs_info = trans->fs_info; -+ -+ WRITE_ONCE(trans->aborted, errno); -+ WRITE_ONCE(trans->transaction->aborted, errno); -+ if (first_hit && errno == -ENOSPC) -+ btrfs_dump_space_info_for_trans_abort(fs_info); -+ /* Wake up anybody who may be waiting on this transaction */ -+ wake_up(&fs_info->transaction_wait); -+ wake_up(&fs_info->transaction_blocked_wait); -+ __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); -+} -+ - int __init btrfs_transaction_init(void) - { - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", -diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h -index 97f6c39f59c8..fa728ab80826 100644 ---- a/fs/btrfs/transaction.h -+++ b/fs/btrfs/transaction.h -@@ -202,6 +202,34 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) - delayed_refs->qgroup_to_skip = 0; - } - -+bool __cold abort_should_print_stack(int errno); -+ -+/* -+ * Call btrfs_abort_transaction as early as possible when an error condition is -+ * detected, that way the exact stack trace is reported for some errors. -+ */ -+#define btrfs_abort_transaction(trans, errno) \ -+do { \ -+ bool first = false; \ -+ /* Report first abort since mount */ \ -+ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ -+ &((trans)->fs_info->fs_state))) { \ -+ first = true; \ -+ if (WARN(abort_should_print_stack(errno), \ -+ KERN_ERR \ -+ "BTRFS: Transaction aborted (error %d)\n", \ -+ (errno))) { \ -+ /* Stack trace printed. */ \ -+ } else { \ -+ btrfs_debug((trans)->fs_info, \ -+ "Transaction aborted (error %d)", \ -+ (errno)); \ -+ } \ -+ } \ -+ __btrfs_abort_transaction((trans), __func__, \ -+ __LINE__, (errno), first); \ -+} while (0) -+ - int btrfs_end_transaction(struct btrfs_trans_handle *trans); - struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - unsigned int num_items); -@@ -236,6 +264,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); - void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); - void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); -+void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, -+ const char *function, -+ unsigned int line, int errno, bool first_hit); - - int __init btrfs_transaction_init(void); - void __cold btrfs_transaction_exit(void); -diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c -index 58599189bd18..200cea6e49e5 100644 ---- a/fs/btrfs/tree-log.c -+++ b/fs/btrfs/tree-log.c -@@ -279,12 +279,6 @@ void btrfs_end_log_trans(struct btrfs_root *root) - } - } - --static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) --{ -- filemap_fdatawait_range(buf->pages[0]->mapping, -- buf->start, buf->start + buf->len - 1); --} -- - /* - * the walk control struct is used to pass state down the chain when - * processing the log tree. 
The stage field tells us which part -@@ -2623,11 +2617,12 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, - return ret; - } - -+ btrfs_tree_lock(next); -+ btrfs_clear_buffer_dirty(trans, next); -+ wait_on_extent_buffer_writeback(next); -+ btrfs_tree_unlock(next); -+ - if (trans) { -- btrfs_tree_lock(next); -- btrfs_clean_tree_block(next); -- btrfs_wait_tree_block_writeback(next); -- btrfs_tree_unlock(next); - ret = btrfs_pin_reserved_extent(trans, - bytenr, blocksize); - if (ret) { -@@ -2637,8 +2632,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, - btrfs_redirty_list_add( - trans->transaction, next); - } else { -- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) -- clear_extent_buffer_dirty(next); - unaccount_log_buffer(fs_info, bytenr); - } - } -@@ -2693,11 +2686,12 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, - - next = path->nodes[*level]; - -+ btrfs_tree_lock(next); -+ btrfs_clear_buffer_dirty(trans, next); -+ wait_on_extent_buffer_writeback(next); -+ btrfs_tree_unlock(next); -+ - if (trans) { -- btrfs_tree_lock(next); -- btrfs_clean_tree_block(next); -- btrfs_wait_tree_block_writeback(next); -- btrfs_tree_unlock(next); - ret = btrfs_pin_reserved_extent(trans, - path->nodes[*level]->start, - path->nodes[*level]->len); -@@ -2706,9 +2700,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, - btrfs_redirty_list_add(trans->transaction, - next); - } else { -- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) -- clear_extent_buffer_dirty(next); -- - unaccount_log_buffer(fs_info, - path->nodes[*level]->start); - } -@@ -2776,19 +2767,18 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, - - next = path->nodes[orig_level]; - -+ btrfs_tree_lock(next); -+ btrfs_clear_buffer_dirty(trans, next); -+ wait_on_extent_buffer_writeback(next); -+ btrfs_tree_unlock(next); -+ - if (trans) { -- btrfs_tree_lock(next); -- btrfs_clean_tree_block(next); -- btrfs_wait_tree_block_writeback(next); -- btrfs_tree_unlock(next); - ret = btrfs_pin_reserved_extent(trans, - next->start, next->len); - if (ret) - goto out; - btrfs_redirty_list_add(trans->transaction, next); - } else { -- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) -- clear_extent_buffer_dirty(next); - unaccount_log_buffer(fs_info, next->start); - } - } -@@ -3652,11 +3642,10 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, - - /* - * If for some unexpected reason the last item's index is not greater -- * than the last index we logged, warn and return an error to fallback -- * to a transaction commit. -+ * than the last index we logged, warn and force a transaction commit. 
- */ - if (WARN_ON(last_index <= inode->last_dir_index_offset)) -- ret = -EUCLEAN; -+ ret = BTRFS_LOG_FORCE_COMMIT; - else - inode->last_dir_index_offset = last_index; - out: -@@ -3794,7 +3783,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - struct btrfs_key min_key; - struct btrfs_root *root = inode->root; - struct btrfs_root *log = root->log_root; -- int err = 0; - int ret; - u64 last_old_dentry_offset = min_offset - 1; - u64 last_offset = (u64)-1; -@@ -3835,8 +3823,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - path->slots[0]); - if (tmp.type == BTRFS_DIR_INDEX_KEY) - last_old_dentry_offset = tmp.offset; -- } else if (ret < 0) { -- err = ret; -+ } else if (ret > 0) { -+ ret = 0; - } - - goto done; -@@ -3859,7 +3847,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - if (tmp.type == BTRFS_DIR_INDEX_KEY) - last_old_dentry_offset = tmp.offset; - } else if (ret < 0) { -- err = ret; - goto done; - } - -@@ -3881,12 +3868,15 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - */ - search: - ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); -- if (ret > 0) -+ if (ret > 0) { - ret = btrfs_next_item(root, path); -+ if (ret > 0) { -+ /* There are no more keys in the inode's root. */ -+ ret = 0; -+ goto done; -+ } -+ } - if (ret < 0) -- err = ret; -- /* If ret is 1, there are no more keys in the inode's root. */ -- if (ret != 0) - goto done; - - /* -@@ -3897,8 +3887,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, - &last_old_dentry_offset); - if (ret != 0) { -- if (ret < 0) -- err = ret; -+ if (ret > 0) -+ ret = 0; - goto done; - } - path->slots[0] = btrfs_header_nritems(path->nodes[0]); -@@ -3909,10 +3899,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - */ - ret = btrfs_next_leaf(root, path); - if (ret) { -- if (ret == 1) -+ if (ret == 1) { - last_offset = (u64)-1; -- else -- err = ret; -+ ret = 0; -+ } - goto done; - } - btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); -@@ -3943,7 +3933,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - btrfs_release_path(path); - btrfs_release_path(dst_path); - -- if (err == 0) { -+ if (ret == 0) { - *last_offset_ret = last_offset; - /* - * In case the leaf was changed in the current transaction but -@@ -3954,15 +3944,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - * a range, last_old_dentry_offset is == to last_offset. - */ - ASSERT(last_old_dentry_offset <= last_offset); -- if (last_old_dentry_offset < last_offset) { -+ if (last_old_dentry_offset < last_offset) - ret = insert_dir_log_key(trans, log, path, ino, - last_old_dentry_offset + 1, - last_offset); -- if (ret) -- err = ret; -- } - } -- return err; -+ -+ return ret; - } - - /* -@@ -5604,10 +5592,8 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, - * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction - * commits. - */ -- if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) { -- btrfs_set_log_full_commit(trans); -+ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) - return BTRFS_LOG_FORCE_COMMIT; -- } - - inode = btrfs_iget(root->fs_info->sb, ino, root); - /* -@@ -6466,7 +6452,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, - * result in losing the file after a log replay. 
- */ - if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { -- btrfs_set_log_full_commit(trans); - ret = BTRFS_LOG_FORCE_COMMIT; - goto out_unlock; - } -diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h -index 85cd24cb0540..bdeb5216718f 100644 ---- a/fs/btrfs/tree-log.h -+++ b/fs/btrfs/tree-log.h -@@ -13,8 +13,13 @@ - /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ - #define BTRFS_NO_LOG_SYNC 256 - --/* We can't use the tree log for whatever reason, force a transaction commit */ --#define BTRFS_LOG_FORCE_COMMIT (1) -+/* -+ * We can't use the tree log for whatever reason, force a transaction commit. -+ * We use a negative value because there are functions through the logging code -+ * that need to return an error (< 0 value), false (0) or true (1). Any negative -+ * value will do, as it will cause the log to be marked for a full sync. -+ */ -+#define BTRFS_LOG_FORCE_COMMIT (-(MAX_ERRNO + 1)) - - struct btrfs_log_ctx { - int log_ret; -diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c -index df43093b7a46..7823168c08a6 100644 ---- a/fs/btrfs/volumes.c -+++ b/fs/btrfs/volumes.c -@@ -728,7 +728,7 @@ static struct btrfs_fs_devices *find_fsid_reverted_metadata( - /* - * Handle the case where the scanned device is part of an fs whose last - * metadata UUID change reverted it to the original FSID. At the same -- * time * fs_devices was first created by another constitutent device -+ * time fs_devices was first created by another constituent device - * which didn't fully observe the operation. This results in an - * btrfs_fs_devices created with metadata/fsid different AND - * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the -@@ -6284,91 +6284,42 @@ static bool need_full_stripe(enum btrfs_map_op op) - return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); - } - --/* -- * Calculate the geometry of a particular (address, len) tuple. This -- * information is used to calculate how big a particular bio can get before it -- * straddles a stripe. -- * -- * @fs_info: the filesystem -- * @em: mapping containing the logical extent -- * @op: type of operation - write or read -- * @logical: address that we want to figure out the geometry of -- * @io_geom: pointer used to return values -- * -- * Returns < 0 in case a chunk for the given logical address cannot be found, -- * usually shouldn't happen unless @logical is corrupted, 0 otherwise. -- */ --int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, -- enum btrfs_map_op op, u64 logical, -- struct btrfs_io_geometry *io_geom) -+static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, -+ u64 offset, u64 *stripe_nr, u64 *stripe_offset, -+ u64 *full_stripe_start) - { -- struct map_lookup *map; -- u64 len; -- u64 offset; -- u64 stripe_offset; -- u64 stripe_nr; -- u32 stripe_len; -- u64 raid56_full_stripe_start = (u64)-1; -- int data_stripes; -+ u32 stripe_len = map->stripe_len; - - ASSERT(op != BTRFS_MAP_DISCARD); - -- map = em->map_lookup; -- /* Offset of this logical address in the chunk */ -- offset = logical - em->start; -- /* Len of a stripe in a chunk */ -- stripe_len = map->stripe_len; - /* -- * Stripe_nr is where this block falls in -- * stripe_offset is the offset of this block in its stripe. -+ * Stripe_nr is the stripe where this block falls. stripe_offset is -+ * the offset of this block in its stripe. 
- */ -- stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset); -- ASSERT(stripe_offset < U32_MAX); -+ *stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset); -+ ASSERT(*stripe_offset < U32_MAX); - -- data_stripes = nr_data_stripes(map); -+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { -+ unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); - -- /* Only stripe based profiles needs to check against stripe length. */ -- if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { -- u64 max_len = stripe_len - stripe_offset; -+ *full_stripe_start = -+ div64_u64(offset, full_stripe_len) * full_stripe_len; - - /* -- * In case of raid56, we need to know the stripe aligned start -+ * For writes to RAID56, allow to write a full stripe set, but -+ * no straddling of stripe sets. - */ -- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { -- unsigned long full_stripe_len = stripe_len * data_stripes; -- raid56_full_stripe_start = offset; -- -- /* -- * Allow a write of a full stripe, but make sure we -- * don't allow straddling of stripes -- */ -- raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, -- full_stripe_len); -- raid56_full_stripe_start *= full_stripe_len; -- -- /* -- * For writes to RAID[56], allow a full stripeset across -- * all disks. For other RAID types and for RAID[56] -- * reads, just allow a single stripe (on a single disk). -- */ -- if (op == BTRFS_MAP_WRITE) { -- max_len = stripe_len * data_stripes - -- (offset - raid56_full_stripe_start); -- } -- } -- len = min_t(u64, em->len - offset, max_len); -- } else { -- len = em->len - offset; -+ if (op == BTRFS_MAP_WRITE) -+ return full_stripe_len - (offset - *full_stripe_start); - } - -- io_geom->len = len; -- io_geom->offset = offset; -- io_geom->stripe_len = stripe_len; -- io_geom->stripe_nr = stripe_nr; -- io_geom->stripe_offset = stripe_offset; -- io_geom->raid56_stripe_offset = raid56_full_stripe_start; -- -- return 0; -+ /* -+ * For other RAID types and for RAID56 reads, allow a single stripe (on -+ * a single disk). 
-+ */ -+ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) -+ return stripe_len - *stripe_offset; -+ return U64_MAX; - } - - static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, -@@ -6387,6 +6338,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - { - struct extent_map *em; - struct map_lookup *map; -+ u64 map_offset; - u64 stripe_offset; - u64 stripe_nr; - u64 stripe_len; -@@ -6405,7 +6357,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - int patch_the_first_stripe_for_dev_replace = 0; - u64 physical_to_patch_in_first_stripe = 0; - u64 raid56_full_stripe_start = (u64)-1; -- struct btrfs_io_geometry geom; -+ u64 max_len; - - ASSERT(bioc_ret); - ASSERT(op != BTRFS_MAP_DISCARD); -@@ -6413,18 +6365,14 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - em = btrfs_get_chunk_map(fs_info, logical, *length); - ASSERT(!IS_ERR(em)); - -- ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); -- if (ret < 0) -- return ret; -- - map = em->map_lookup; -- -- *length = geom.len; -- stripe_len = geom.stripe_len; -- stripe_nr = geom.stripe_nr; -- stripe_offset = geom.stripe_offset; -- raid56_full_stripe_start = geom.raid56_stripe_offset; - data_stripes = nr_data_stripes(map); -+ stripe_len = map->stripe_len; -+ -+ map_offset = logical - em->start; -+ max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, -+ &stripe_offset, &raid56_full_stripe_start); -+ *length = min_t(u64, em->len - map_offset, max_len); - - down_read(&dev_replace->rwsem); - dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); -diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h -index 6b7a05f6cf82..7e51f2238f72 100644 ---- a/fs/btrfs/volumes.h -+++ b/fs/btrfs/volumes.h -@@ -53,21 +53,6 @@ enum btrfs_raid_types { - BTRFS_NR_RAID_TYPES - }; - --struct btrfs_io_geometry { -- /* remaining bytes before crossing a stripe */ -- u64 len; -- /* offset of logical address in chunk */ -- u64 offset; -- /* length of single IO stripe */ -- u32 stripe_len; -- /* offset of address in stripe */ -- u32 stripe_offset; -- /* number of stripe where address falls */ -- u64 stripe_nr; -- /* offset of raid56 stripe into the chunk */ -- u64 raid56_stripe_offset; --}; -- - /* - * Use sequence counter to get consistent device stat data on - * 32-bit processors. 
-@@ -545,9 +530,6 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, - u64 logical, u64 *length_ret, - u32 *num_stripes); --int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, -- enum btrfs_map_op op, u64 logical, -- struct btrfs_io_geometry *io_geom); - int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); - int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); - struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, -diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c -index 1f503e8e42d4..f95b2c94d619 100644 ---- a/fs/btrfs/zoned.c -+++ b/fs/btrfs/zoned.c -@@ -17,6 +17,7 @@ - #include "space-info.h" - #include "fs.h" - #include "accessors.h" -+#include "bio.h" - - /* Maximum number of zones to report per blkdev_report_zones() call */ - #define BTRFS_REPORT_NR_ZONES 4096 -@@ -160,7 +161,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, - */ - static inline u32 sb_zone_number(int shift, int mirror) - { -- u64 zone; -+ u64 zone = U64_MAX; - - ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); - switch (mirror) { -@@ -220,7 +221,6 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, - struct blk_zone *zones, unsigned int *nr_zones) - { - struct btrfs_zoned_device_info *zinfo = device->zone_info; -- u32 zno; - int ret; - - if (!*nr_zones) -@@ -235,6 +235,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, - /* Check cache */ - if (zinfo->zone_cache) { - unsigned int i; -+ u32 zno; - - ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); - zno = pos >> zinfo->zone_size_shift; -@@ -274,9 +275,12 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, - return -EIO; - - /* Populate cache */ -- if (zinfo->zone_cache) -+ if (zinfo->zone_cache) { -+ u32 zno = pos >> zinfo->zone_size_shift; -+ - memcpy(zinfo->zone_cache + zno, zones, - sizeof(*zinfo->zone_cache) * *nr_zones); -+ } - - return 0; - } -@@ -417,25 +421,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) - nr_sectors = bdev_nr_sectors(bdev); - zone_info->zone_size_shift = ilog2(zone_info->zone_size); - zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); -- /* -- * We limit max_zone_append_size also by max_segments * -- * PAGE_SIZE. Technically, we can have multiple pages per segment. But, -- * since btrfs adds the pages one by one to a bio, and btrfs cannot -- * increase the metadata reservation even if it increases the number of -- * extents, it is safe to stick with the limit. -- * -- * With the zoned emulation, we can have non-zoned device on the zoned -- * mode. In this case, we don't have a valid max zone append size. So, -- * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size. 
-- */ -- if (bdev_is_zoned(bdev)) { -- zone_info->max_zone_append_size = min_t(u64, -- (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, -- (u64)bdev_max_segments(bdev) << PAGE_SHIFT); -- } else { -- zone_info->max_zone_append_size = -- (u64)bdev_max_segments(bdev) << PAGE_SHIFT; -- } - if (!IS_ALIGNED(nr_sectors, zone_sectors)) - zone_info->nr_zones++; - -@@ -715,9 +700,9 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) - - int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) - { -+ struct queue_limits *lim = &fs_info->limits; - struct btrfs_device *device; - u64 zone_size = 0; -- u64 max_zone_append_size = 0; - int ret; - - /* -@@ -727,6 +712,8 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) - if (!btrfs_fs_incompat(fs_info, ZONED)) - return btrfs_check_for_zoned_device(fs_info); - -+ blk_set_stacking_limits(lim); -+ - list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { - struct btrfs_zoned_device_info *zone_info = device->zone_info; - -@@ -741,10 +728,17 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) - zone_info->zone_size, zone_size); - return -EINVAL; - } -- if (!max_zone_append_size || -- (zone_info->max_zone_append_size && -- zone_info->max_zone_append_size < max_zone_append_size)) -- max_zone_append_size = zone_info->max_zone_append_size; -+ -+ /* -+ * With the zoned emulation, we can have non-zoned device on the -+ * zoned mode. In this case, we don't have a valid max zone -+ * append size. -+ */ -+ if (bdev_is_zoned(device->bdev)) { -+ blk_stack_limits(lim, -+ &bdev_get_queue(device->bdev)->limits, -+ 0); -+ } - } - - /* -@@ -765,8 +759,18 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) - } - - fs_info->zone_size = zone_size; -- fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, -- fs_info->sectorsize); -+ /* -+ * Also limit max_zone_append_size by max_segments * PAGE_SIZE. -+ * Technically, we can have multiple pages per segment. But, since -+ * we add the pages one by one to a bio, and cannot increase the -+ * metadata reservation even if it increases the number of extents, it -+ * is safe to stick with the limit. -+ */ -+ fs_info->max_zone_append_size = ALIGN_DOWN( -+ min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT, -+ (u64)lim->max_sectors << SECTOR_SHIFT, -+ (u64)lim->max_segments << PAGE_SHIFT), -+ fs_info->sectorsize); - fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; - if (fs_info->max_zone_append_size < fs_info->max_extent_size) - fs_info->max_extent_size = fs_info->max_zone_append_size; -@@ -1623,8 +1627,10 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans) - spin_unlock(&trans->releasing_ebs_lock); - } - --bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) -+bool btrfs_use_zone_append(struct btrfs_bio *bbio) - { -+ u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); -+ struct btrfs_inode *inode = bbio->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_block_group *cache; - bool ret = false; -@@ -1635,6 +1641,9 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) - if (!is_data_inode(&inode->vfs_inode)) - return false; - -+ if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) -+ return false; -+ - /* - * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the - * extent layout the relocation code has. 
-@@ -1657,22 +1666,16 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) - return ret; - } - --void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, -- struct bio *bio) -+void btrfs_record_physical_zoned(struct btrfs_bio *bbio) - { -+ const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; - struct btrfs_ordered_extent *ordered; -- const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - -- if (bio_op(bio) != REQ_OP_ZONE_APPEND) -- return; -- -- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset); -+ ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); - if (WARN_ON(!ordered)) - return; - - ordered->physical = physical; -- ordered->bdev = bio->bi_bdev; -- - btrfs_put_ordered_extent(ordered); - } - -@@ -1684,43 +1687,46 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) - struct extent_map *em; - struct btrfs_ordered_sum *sum; - u64 orig_logical = ordered->disk_bytenr; -- u64 *logical = NULL; -- int nr, stripe_len; -+ struct map_lookup *map; -+ u64 physical = ordered->physical; -+ u64 chunk_start_phys; -+ u64 logical; - -- /* Zoned devices should not have partitions. So, we can assume it is 0 */ -- ASSERT(!bdev_is_partition(ordered->bdev)); -- if (WARN_ON(!ordered->bdev)) -+ em = btrfs_get_chunk_map(fs_info, orig_logical, 1); -+ if (IS_ERR(em)) - return; -+ map = em->map_lookup; -+ chunk_start_phys = map->stripes[0].physical; - -- if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev, -- ordered->physical, &logical, &nr, -- &stripe_len))) -- goto out; -- -- WARN_ON(nr != 1); -+ if (WARN_ON_ONCE(map->num_stripes > 1) || -+ WARN_ON_ONCE((map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) || -+ WARN_ON_ONCE(physical < chunk_start_phys) || -+ WARN_ON_ONCE(physical > chunk_start_phys + em->orig_block_len)) { -+ free_extent_map(em); -+ return; -+ } -+ logical = em->start + (physical - map->stripes[0].physical); -+ free_extent_map(em); - -- if (orig_logical == *logical) -- goto out; -+ if (orig_logical == logical) -+ return; - -- ordered->disk_bytenr = *logical; -+ ordered->disk_bytenr = logical; - - em_tree = &inode->extent_tree; - write_lock(&em_tree->lock); - em = search_extent_mapping(em_tree, ordered->file_offset, - ordered->num_bytes); -- em->block_start = *logical; -+ em->block_start = logical; - free_extent_map(em); - write_unlock(&em_tree->lock); - - list_for_each_entry(sum, &ordered->list, list) { -- if (*logical < orig_logical) -- sum->bytenr -= orig_logical - *logical; -+ if (logical < orig_logical) -+ sum->bytenr -= orig_logical - logical; - else -- sum->bytenr += *logical - orig_logical; -+ sum->bytenr += logical - orig_logical; - } -- --out: -- kfree(logical); - } - - bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, -@@ -1845,26 +1851,6 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, - return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); - } - --struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, -- u64 logical, u64 length) --{ -- struct btrfs_device *device; -- struct extent_map *em; -- struct map_lookup *map; -- -- em = btrfs_get_chunk_map(fs_info, logical, length); -- if (IS_ERR(em)) -- return ERR_CAST(em); -- -- map = em->map_lookup; -- /* We only support single profile for now */ -- device = map->stripes[0].dev; -- -- free_extent_map(em); -- -- return device; --} -- - /* - * Activate block group and underlying device zones - * -diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h 
-index f43990985d80..c0570d35fea2 100644 ---- a/fs/btrfs/zoned.h -+++ b/fs/btrfs/zoned.h -@@ -20,7 +20,6 @@ struct btrfs_zoned_device_info { - */ - u64 zone_size; - u8 zone_size_shift; -- u64 max_zone_append_size; - u32 nr_zones; - unsigned int max_active_zones; - atomic_t active_zones_left; -@@ -56,9 +55,8 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); - void btrfs_redirty_list_add(struct btrfs_transaction *trans, - struct extent_buffer *eb); - void btrfs_free_redirty_list(struct btrfs_transaction *trans); --bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start); --void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, -- struct bio *bio); -+bool btrfs_use_zone_append(struct btrfs_bio *bbio); -+void btrfs_record_physical_zoned(struct btrfs_bio *bbio); - void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); - bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, -@@ -68,8 +66,6 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, - int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); - int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, - u64 physical_start, u64 physical_pos); --struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, -- u64 logical, u64 length); - bool btrfs_zone_activate(struct btrfs_block_group *block_group); - int btrfs_zone_finish(struct btrfs_block_group *block_group); - bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); -@@ -185,13 +181,12 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, - struct extent_buffer *eb) { } - static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } - --static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) -+static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio) - { - return false; - } - --static inline void btrfs_record_physical_zoned(struct inode *inode, -- u64 file_offset, struct bio *bio) -+static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) - { - } - -@@ -224,13 +219,6 @@ static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, - return -EOPNOTSUPP; - } - --static inline struct btrfs_device *btrfs_zoned_get_device( -- struct btrfs_fs_info *fs_info, -- u64 logical, u64 length) --{ -- return ERR_PTR(-EOPNOTSUPP); --} -- - static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group) - { - return true; -diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c -index 9de1c9d1a13d..3559ea6b0781 100644 ---- a/fs/ext4/extents.c -+++ b/fs/ext4/extents.c -@@ -3251,7 +3251,7 @@ static int ext4_split_extent_at(handle_t *handle, - ext4_ext_mark_unwritten(ex2); - - err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); -- if (err != -ENOSPC && err != -EDQUOT) -+ if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) - goto out; - - if (EXT4_EXT_MAY_ZEROOUT & split_flag) { -diff --git a/fs/ext4/file.c b/fs/ext4/file.c -index 7ac0a81bd371..6e9f198ecacf 100644 ---- a/fs/ext4/file.c -+++ b/fs/ext4/file.c -@@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len) - return false; - } - --/* Is IO overwriting allocated and initialized blocks? */ --static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) -+/* Is IO overwriting allocated or initialized blocks? 
*/ -+static bool ext4_overwrite_io(struct inode *inode, -+ loff_t pos, loff_t len, bool *unwritten) - { - struct ext4_map_blocks map; - unsigned int blkbits = inode->i_blkbits; -@@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) - blklen = map.m_len; - - err = ext4_map_blocks(NULL, inode, &map, 0); -+ if (err != blklen) -+ return false; - /* - * 'err==len' means that all of the blocks have been preallocated, -- * regardless of whether they have been initialized or not. To exclude -- * unwritten extents, we need to check m_flags. -+ * regardless of whether they have been initialized or not. We need to -+ * check m_flags to distinguish the unwritten extents. - */ -- return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); -+ *unwritten = !(map.m_flags & EXT4_MAP_MAPPED); -+ return true; - } - - static ssize_t ext4_generic_write_checks(struct kiocb *iocb, -@@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { - * - For extending writes case we don't take the shared lock, since it requires - * updating inode i_disksize and/or orphan handling with exclusive lock. - * -- * - shared locking will only be true mostly with overwrites. Otherwise we will -- * switch to exclusive i_rwsem lock. -+ * - shared locking will only be true mostly with overwrites, including -+ * initialized blocks and unwritten blocks. For overwrite unwritten blocks -+ * we protect splitting extents by i_data_sem in ext4_inode_info, so we can -+ * also release exclusive i_rwsem lock. -+ * -+ * - Otherwise we will switch to exclusive i_rwsem lock. - */ - static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, -- bool *ilock_shared, bool *extend) -+ bool *ilock_shared, bool *extend, -+ bool *unwritten) - { - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); -@@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, - * in file_modified(). - */ - if (*ilock_shared && (!IS_NOSEC(inode) || *extend || -- !ext4_overwrite_io(inode, offset, count))) { -+ !ext4_overwrite_io(inode, offset, count, unwritten))) { - if (iocb->ki_flags & IOCB_NOWAIT) { - ret = -EAGAIN; - goto out; -@@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) - loff_t offset = iocb->ki_pos; - size_t count = iov_iter_count(from); - const struct iomap_ops *iomap_ops = &ext4_iomap_ops; -- bool extend = false, unaligned_io = false; -+ bool extend = false, unaligned_io = false, unwritten = false; - bool ilock_shared = true; - - /* -@@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) - return ext4_buffered_write_iter(iocb, from); - } - -- ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend); -+ ret = ext4_dio_write_checks(iocb, from, -+ &ilock_shared, &extend, &unwritten); - if (ret <= 0) - return ret; - -@@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) - ext4_journal_stop(handle); - } - -- if (ilock_shared) -+ if (ilock_shared && !unwritten) - iomap_ops = &ext4_iomap_overwrite_ops; - ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, - (unaligned_io || extend) ? 
IOMAP_DIO_FORCE_WAIT : 0, -diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c -index 9d9f414f99fe..24128f6cd1b0 100644 ---- a/fs/ext4/inode.c -+++ b/fs/ext4/inode.c -@@ -136,7 +136,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, - new_size); - } - --static int __ext4_journalled_writepage(struct page *page, unsigned int len); - static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents); - -@@ -1005,30 +1004,6 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, - return ret; - } - --/* -- * To preserve ordering, it is essential that the hole instantiation and -- * the data write be encapsulated in a single transaction. We cannot -- * close off a transaction and start a new one between the ext4_get_block() -- * and the commit_write(). So doing the jbd2_journal_start at the start of -- * prepare_write() is the right place. -- * -- * Also, this function can nest inside ext4_writepage(). In that case, we -- * *know* that ext4_writepage() has generated enough buffer credits to do the -- * whole page. So we won't block on the journal in that case, which is good, -- * because the caller may be PF_MEMALLOC. -- * -- * By accident, ext4 can be reentered when a transaction is open via -- * quota file writes. If we were to commit the transaction while thus -- * reentered, there can be a deadlock - we would be holding a quota -- * lock, and the commit would never complete if another thread had a -- * transaction open and was blocking on the quota lock - a ranking -- * violation. -- * -- * So what we do is to rely on the fact that jbd2_journal_stop/journal_start -- * will _not_ run commit under these circumstances because handle->h_ref -- * is elevated. We'll still have enough credits for the tiny quotafile -- * write. -- */ - int do_journal_get_write_access(handle_t *handle, struct inode *inode, - struct buffer_head *bh) - { -@@ -1149,6 +1124,13 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, - } - #endif - -+/* -+ * To preserve ordering, it is essential that the hole instantiation and -+ * the data write be encapsulated in a single transaction. We cannot -+ * close off a transaction and start a new one between the ext4_get_block() -+ * and the ext4_write_end(). So doing the jbd2_journal_start at the start of -+ * ext4_write_begin() is the right place. 
-+ */ - static int ext4_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **pagep, void **fsdata) -@@ -1649,12 +1631,6 @@ static void ext4_print_free_blocks(struct inode *inode) - return; - } - --static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode, -- struct buffer_head *bh) --{ -- return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); --} -- - /* - * ext4_insert_delayed_block - adds a delayed block to the extents status - * tree, incrementing the reserved cluster/block -@@ -1887,216 +1863,10 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, - return 0; - } - --static int __ext4_journalled_writepage(struct page *page, -- unsigned int len) -+static void mpage_page_done(struct mpage_da_data *mpd, struct page *page) - { -- struct address_space *mapping = page->mapping; -- struct inode *inode = mapping->host; -- handle_t *handle = NULL; -- int ret = 0, err = 0; -- int inline_data = ext4_has_inline_data(inode); -- struct buffer_head *inode_bh = NULL; -- loff_t size; -- -- ClearPageChecked(page); -- -- if (inline_data) { -- BUG_ON(page->index != 0); -- BUG_ON(len > ext4_get_max_inline_size(inode)); -- inode_bh = ext4_journalled_write_inline_data(inode, len, page); -- if (inode_bh == NULL) -- goto out; -- } -- /* -- * We need to release the page lock before we start the -- * journal, so grab a reference so the page won't disappear -- * out from under us. -- */ -- get_page(page); -- unlock_page(page); -- -- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, -- ext4_writepage_trans_blocks(inode)); -- if (IS_ERR(handle)) { -- ret = PTR_ERR(handle); -- put_page(page); -- goto out_no_pagelock; -- } -- BUG_ON(!ext4_handle_valid(handle)); -- -- lock_page(page); -- put_page(page); -- size = i_size_read(inode); -- if (page->mapping != mapping || page_offset(page) > size) { -- /* The page got truncated from under us */ -- ext4_journal_stop(handle); -- ret = 0; -- goto out; -- } -- -- if (inline_data) { -- ret = ext4_mark_inode_dirty(handle, inode); -- } else { -- struct buffer_head *page_bufs = page_buffers(page); -- -- if (page->index == size >> PAGE_SHIFT) -- len = size & ~PAGE_MASK; -- else -- len = PAGE_SIZE; -- -- ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, -- NULL, do_journal_get_write_access); -- -- err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, -- NULL, write_end_fn); -- } -- if (ret == 0) -- ret = err; -- err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); -- if (ret == 0) -- ret = err; -- EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; -- err = ext4_journal_stop(handle); -- if (!ret) -- ret = err; -- -- ext4_set_inode_state(inode, EXT4_STATE_JDATA); --out: -+ mpd->first_page++; - unlock_page(page); --out_no_pagelock: -- brelse(inode_bh); -- return ret; --} -- --/* -- * Note that we don't need to start a transaction unless we're journaling data -- * because we should have holes filled from ext4_page_mkwrite(). We even don't -- * need to file the inode to the transaction's list in ordered mode because if -- * we are writing back data added by write(), the inode is already there and if -- * we are writing back data modified via mmap(), no one guarantees in which -- * transaction the data will hit the disk. In case we are journaling data, we -- * cannot start transaction directly because transaction start ranks above page -- * lock so we have to do some magic. -- * -- * This function can get called via... 
-- * - ext4_writepages after taking page lock (have journal handle) -- * - journal_submit_inode_data_buffers (no journal handle) -- * - shrink_page_list via the kswapd/direct reclaim (no journal handle) -- * - grab_page_cache when doing write_begin (have journal handle) -- * -- * We don't do any block allocation in this function. If we have page with -- * multiple blocks we need to write those buffer_heads that are mapped. This -- * is important for mmaped based write. So if we do with blocksize 1K -- * truncate(f, 1024); -- * a = mmap(f, 0, 4096); -- * a[0] = 'a'; -- * truncate(f, 4096); -- * we have in the page first buffer_head mapped via page_mkwrite call back -- * but other buffer_heads would be unmapped but dirty (dirty done via the -- * do_wp_page). So writepage should write the first block. If we modify -- * the mmap area beyond 1024 we will again get a page_fault and the -- * page_mkwrite callback will do the block allocation and mark the -- * buffer_heads mapped. -- * -- * We redirty the page if we have any buffer_heads that is either delay or -- * unwritten in the page. -- * -- * We can get recursively called as show below. -- * -- * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> -- * ext4_writepage() -- * -- * But since we don't do any block allocation we should not deadlock. -- * Page also have the dirty flag cleared so we don't get recurive page_lock. -- */ --static int ext4_writepage(struct page *page, -- struct writeback_control *wbc) --{ -- struct folio *folio = page_folio(page); -- int ret = 0; -- loff_t size; -- unsigned int len; -- struct buffer_head *page_bufs = NULL; -- struct inode *inode = page->mapping->host; -- struct ext4_io_submit io_submit; -- -- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { -- folio_invalidate(folio, 0, folio_size(folio)); -- folio_unlock(folio); -- return -EIO; -- } -- -- trace_ext4_writepage(page); -- size = i_size_read(inode); -- if (page->index == size >> PAGE_SHIFT && -- !ext4_verity_in_progress(inode)) -- len = size & ~PAGE_MASK; -- else -- len = PAGE_SIZE; -- -- /* Should never happen but for bugs in other kernel subsystems */ -- if (!page_has_buffers(page)) { -- ext4_warning_inode(inode, -- "page %lu does not have buffers attached", page->index); -- ClearPageDirty(page); -- unlock_page(page); -- return 0; -- } -- -- page_bufs = page_buffers(page); -- /* -- * We cannot do block allocation or other extent handling in this -- * function. If there are buffers needing that, we have to redirty -- * the page. But we may reach here when we do a journal commit via -- * journal_submit_inode_data_buffers() and in that case we must write -- * allocated buffers to achieve data=ordered mode guarantees. -- * -- * Also, if there is only one buffer per page (the fs block -- * size == the page size), if one buffer needs block -- * allocation or needs to modify the extent tree to clear the -- * unwritten flag, we know that the page can't be written at -- * all, so we might as well refuse the write immediately. -- * Unfortunately if the block size != page size, we can't as -- * easily detect this case using ext4_walk_page_buffers(), but -- * for the extremely common case, this is an optimization that -- * skips a useless round trip through ext4_bio_write_page(). 
-- */ -- if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, -- ext4_bh_delay_or_unwritten)) { -- redirty_page_for_writepage(wbc, page); -- if ((current->flags & PF_MEMALLOC) || -- (inode->i_sb->s_blocksize == PAGE_SIZE)) { -- /* -- * For memory cleaning there's no point in writing only -- * some buffers. So just bail out. Warn if we came here -- * from direct reclaim. -- */ -- WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) -- == PF_MEMALLOC); -- unlock_page(page); -- return 0; -- } -- } -- -- if (PageChecked(page) && ext4_should_journal_data(inode)) -- /* -- * It's mmapped pagecache. Add buffers and journal it. There -- * doesn't seem much point in redirtying the page here. -- */ -- return __ext4_journalled_writepage(page, len); -- -- ext4_io_submit_init(&io_submit, wbc); -- io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); -- if (!io_submit.io_end) { -- redirty_page_for_writepage(wbc, page); -- unlock_page(page); -- return -ENOMEM; -- } -- ret = ext4_bio_write_page(&io_submit, page, len); -- ext4_io_submit(&io_submit); -- /* Drop io_end reference we got from init */ -- ext4_put_io_end_defer(io_submit.io_end); -- return ret; - } - - static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) -@@ -2129,7 +1899,6 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) - err = ext4_bio_write_page(&mpd->io_submit, page, len); - if (!err) - mpd->wbc->nr_to_write--; -- mpd->first_page++; - - return err; - } -@@ -2243,6 +2012,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, - err = mpage_submit_page(mpd, head->b_page); - if (err < 0) - return err; -+ mpage_page_done(mpd, head->b_page); - } - if (lblk >= blocks) { - mpd->scanned_until_end = 1; -@@ -2374,6 +2144,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) - err = mpage_submit_page(mpd, page); - if (err < 0) - goto out; -+ mpage_page_done(mpd, page); - } - folio_batch_release(&fbatch); - } -@@ -2572,6 +2343,50 @@ static bool ext4_page_nomap_can_writeout(struct page *page) - return false; - } - -+static int ext4_journal_page_buffers(handle_t *handle, struct page *page, -+ int len) -+{ -+ struct buffer_head *page_bufs = page_buffers(page); -+ struct inode *inode = page->mapping->host; -+ int ret, err; -+ -+ ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, -+ NULL, do_journal_get_write_access); -+ err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, -+ NULL, write_end_fn); -+ if (ret == 0) -+ ret = err; -+ err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); -+ if (ret == 0) -+ ret = err; -+ EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; -+ -+ ext4_set_inode_state(inode, EXT4_STATE_JDATA); -+ -+ return ret; -+} -+ -+static int mpage_journal_page_buffers(handle_t *handle, -+ struct mpage_da_data *mpd, -+ struct page *page) -+{ -+ struct inode *inode = mpd->inode; -+ loff_t size = i_size_read(inode); -+ int len; -+ -+ ClearPageChecked(page); -+ clear_page_dirty_for_io(page); -+ mpd->wbc->nr_to_write--; -+ -+ if (page->index == size >> PAGE_SHIFT && -+ !ext4_verity_in_progress(inode)) -+ len = size & ~PAGE_MASK; -+ else -+ len = PAGE_SIZE; -+ -+ return ext4_journal_page_buffers(handle, page, len); -+} -+ - /* - * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages - * needing mapping, submit mapped pages -@@ -2597,7 +2412,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) - struct address_space *mapping = mpd->inode->i_mapping; - struct pagevec 
pvec; - unsigned int nr_pages; -- long left = mpd->wbc->nr_to_write; - pgoff_t index = mpd->first_page; - pgoff_t end = mpd->last_page; - xa_mark_t tag; -@@ -2605,12 +2419,20 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) - int blkbits = mpd->inode->i_blkbits; - ext4_lblk_t lblk; - struct buffer_head *head; -+ handle_t *handle = NULL; -+ int bpp = ext4_journal_blocks_per_page(mpd->inode); - - if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; - -+ if (ext4_should_journal_data(mpd->inode)) { -+ handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, -+ bpp); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ } - pagevec_init(&pvec); - mpd->map.m_len = 0; - mpd->next_page = index; -@@ -2631,13 +2453,22 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) - * newly appeared dirty pages, but have not synced all - * of the old dirty pages. - */ -- if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) -+ if (mpd->wbc->sync_mode == WB_SYNC_NONE && -+ mpd->wbc->nr_to_write <= -+ mpd->map.m_len >> (PAGE_SHIFT - blkbits)) - goto out; - - /* If we can't merge this page, we are done. */ - if (mpd->map.m_len > 0 && mpd->next_page != page->index) - goto out; - -+ if (handle) { -+ err = ext4_journal_ensure_credits(handle, bpp, -+ 0); -+ if (err < 0) -+ goto out; -+ } -+ - lock_page(page); - /* - * If the page is no longer dirty, or its mapping no -@@ -2677,18 +2508,30 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) - mpd->first_page = page->index; - mpd->next_page = page->index + 1; - /* -- * Writeout for transaction commit where we cannot -- * modify metadata is simple. Just submit the page. -+ * Writeout when we cannot modify metadata is simple. -+ * Just submit the page. For data=journal mode we -+ * first handle writeout of the page for checkpoint and -+ * only after that handle delayed page dirtying. This -+ * is crutial so that forcing a transaction commit and -+ * then calling filemap_write_and_wait() guarantees -+ * current state of data is in its final location. Such -+ * sequence is used for example by insert/collapse -+ * range operations before discarding the page cache. - */ - if (!mpd->can_map) { - if (ext4_page_nomap_can_writeout(page)) { - err = mpage_submit_page(mpd, page); - if (err < 0) - goto out; -- } else { -- unlock_page(page); -- mpd->first_page++; - } -+ /* Pending dirtying of journalled data? 
*/ -+ if (PageChecked(page)) { -+ err = mpage_journal_page_buffers(handle, -+ mpd, page); -+ if (err < 0) -+ goto out; -+ } -+ mpage_page_done(mpd, page); - } else { - /* Add all dirty buffers to mpd */ - lblk = ((ext4_lblk_t)page->index) << -@@ -2700,24 +2543,21 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) - goto out; - err = 0; - } -- left--; - } - pagevec_release(&pvec); - cond_resched(); - } - mpd->scanned_until_end = 1; -+ if (handle) -+ ext4_journal_stop(handle); - return 0; - out: - pagevec_release(&pvec); -+ if (handle) -+ ext4_journal_stop(handle); - return err; - } - --static int ext4_writepage_cb(struct page *page, struct writeback_control *wbc, -- void *data) --{ -- return ext4_writepage(page, wbc); --} -- - static int ext4_do_writepages(struct mpage_da_data *mpd) - { - struct writeback_control *wbc = mpd->wbc; -@@ -2743,13 +2583,6 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) - if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - goto out_writepages; - -- if (ext4_should_journal_data(inode)) { -- blk_start_plug(&plug); -- ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL); -- blk_finish_plug(&plug); -- goto out_writepages; -- } -- - /* - * If the filesystem has aborted, it is read-only, so return - * right away instead of dumping stack traces later on that -@@ -2784,6 +2617,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) - ext4_journal_stop(handle); - } - -+ /* -+ * data=journal mode does not do delalloc so we just need to writeout / -+ * journal already mapped buffers -+ */ -+ if (ext4_should_journal_data(inode)) -+ mpd->can_map = 0; -+ - if (ext4_should_dioread_nolock(inode)) { - /* - * We may need to convert up to one extent per block in -@@ -3160,9 +3000,8 @@ static int ext4_da_write_end(struct file *file, - * i_disksize since writeback will push i_disksize upto i_size - * eventually. If the end of the current write is > i_size and - * inside an allocated block (ext4_da_should_update_i_disksize() -- * check), we need to update i_disksize here as neither -- * ext4_writepage() nor certain ext4_writepages() paths not -- * allocating blocks update i_disksize. -+ * check), we need to update i_disksize here as certain -+ * ext4_writepages() paths not allocating blocks update i_disksize. - * - * Note that we defer inode dirtying to generic_write_end() / - * ext4_da_write_inline_data_end(). -@@ -3687,24 +3526,26 @@ const struct iomap_ops ext4_iomap_report_ops = { - }; - - /* -- * Whenever the folio is being dirtied, corresponding buffers should already -- * be attached to the transaction (we take care of this in ext4_page_mkwrite() -- * and ext4_write_begin()). However we cannot move buffers to dirty transaction -- * lists here because ->dirty_folio is called under VFS locks and the folio -- * is not necessarily locked. -- * -- * We cannot just dirty the folio and leave attached buffers clean, because the -- * buffers' dirty state is "definitive". We cannot just set the buffers dirty -- * or jbddirty because all the journalling code will explode. -- * -- * So what we do is to mark the folio "pending dirty" and next time writepage -- * is called, propagate that into the buffers appropriately. -+ * For data=journal mode, folio should be marked dirty only when it was -+ * writeably mapped. When that happens, it was already attached to the -+ * transaction and marked as jbddirty (we take care of this in -+ * ext4_page_mkwrite()). 
On transaction commit, we writeprotect page mappings -+ * so we should have nothing to do here, except for the case when someone -+ * had the page pinned and dirtied the page through this pin (e.g. by doing -+ * direct IO to it). In that case we'd need to attach buffers here to the -+ * transaction but we cannot due to lock ordering. We cannot just dirty the -+ * folio and leave attached buffers clean, because the buffers' dirty state is -+ * "definitive". We cannot just set the buffers dirty or jbddirty because all -+ * the journalling code will explode. So what we do is to mark the folio -+ * "pending dirty" and next time ext4_writepages() is called, attach buffers -+ * to the transaction appropriately. - */ - static bool ext4_journalled_dirty_folio(struct address_space *mapping, - struct folio *folio) - { - WARN_ON_ONCE(!folio_buffers(folio)); -- folio_set_checked(folio); -+ if (folio_maybe_dma_pinned(folio)) -+ folio_set_checked(folio); - return filemap_dirty_folio(mapping, folio); - } - -@@ -4872,13 +4713,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, - goto bad_inode; - raw_inode = ext4_raw_inode(&iloc); - -- if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { -- ext4_error_inode(inode, function, line, 0, -- "iget: root inode unallocated"); -- ret = -EFSCORRUPTED; -- goto bad_inode; -- } -- - if ((flags & EXT4_IGET_HANDLE) && - (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { - ret = -ESTALE; -@@ -4951,11 +4785,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, - * NeilBrown 1999oct15 - */ - if (inode->i_nlink == 0) { -- if ((inode->i_mode == 0 || -+ if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && - ino != EXT4_BOOT_LOADER_INO) { -- /* this inode is deleted */ -- ret = -ESTALE; -+ /* this inode is deleted or unallocated */ -+ if (flags & EXT4_IGET_SPECIAL) { -+ ext4_error_inode(inode, function, line, 0, -+ "iget: special inode unallocated"); -+ ret = -EFSCORRUPTED; -+ } else -+ ret = -ESTALE; - goto bad_inode; - } - /* The only unlinked inodes we let through here have -@@ -5382,7 +5221,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) - * If the folio is fully truncated, we don't need to wait for any commit - * (and we even should not as __ext4_journalled_invalidate_folio() may - * strip all buffers from the folio but keep the folio dirty which can then -- * confuse e.g. concurrent ext4_writepage() seeing dirty folio without -+ * confuse e.g. concurrent ext4_writepages() seeing dirty folio without - * buffers). Also we don't need to wait for any commit if all buffers in - * the folio remain valid. This is most beneficial for the common case of - * blocksize == PAGESIZE. 
-@@ -5788,7 +5627,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); - int gdpblocks; - int idxblocks; -- int ret = 0; -+ int ret; - - /* - * How many index blocks need to touch to map @lblocks logical blocks -@@ -6320,18 +6159,8 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) - err = __block_write_begin(page, 0, len, ext4_get_block); - if (!err) { - ret = VM_FAULT_SIGBUS; -- if (ext4_walk_page_buffers(handle, inode, -- page_buffers(page), 0, len, NULL, -- do_journal_get_write_access)) -- goto out_error; -- if (ext4_walk_page_buffers(handle, inode, -- page_buffers(page), 0, len, NULL, -- write_end_fn)) -- goto out_error; -- if (ext4_jbd2_inode_add_write(handle, inode, -- page_offset(page), len)) -+ if (ext4_journal_page_buffers(handle, page, len)) - goto out_error; -- ext4_set_inode_state(inode, EXT4_STATE_JDATA); - } else { - unlock_page(page); - } -diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c -index 8067ccda34e4..2e8c34036313 100644 ---- a/fs/ext4/ioctl.c -+++ b/fs/ext4/ioctl.c -@@ -155,9 +155,6 @@ static int ext4_update_backup_sb(struct super_block *sb, - set_buffer_uptodate(bh); - unlock_buffer(bh); - -- if (err) -- goto out_bh; -- - if (handle) { - err = ext4_handle_dirty_metadata(handle, NULL, bh); - if (err) -diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c -index dd28453d6ea3..270fbcba75b6 100644 ---- a/fs/ext4/namei.c -+++ b/fs/ext4/namei.c -@@ -3872,9 +3872,16 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, - if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) - goto end_rename; - } -+ /* -+ * We need to protect against old.inode directory getting -+ * converted from inline directory format into a normal one. -+ */ -+ inode_lock_nested(old.inode, I_MUTEX_NONDIR2); - retval = ext4_rename_dir_prepare(handle, &old); -- if (retval) -+ if (retval) { -+ inode_unlock(old.inode); - goto end_rename; -+ } - } - /* - * If we're renaming a file within an inline_data dir and adding or -@@ -4006,6 +4013,8 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, - } else { - ext4_journal_stop(handle); - } -+ if (old.dir_bh) -+ inode_unlock(old.inode); - release_bh: - brelse(old.dir_bh); - brelse(old.bh); -diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c -index beaec6d81074..3bc7c7c5b99d 100644 ---- a/fs/ext4/page-io.c -+++ b/fs/ext4/page-io.c -@@ -500,7 +500,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, - - /* Nothing to submit? Just unlock the page... */ - if (!nr_to_submit) -- goto unlock; -+ return 0; - - bh = head = page_buffers(page); - -@@ -548,7 +548,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, - } - bh = bh->b_this_page; - } while (bh != head); -- goto unlock; -+ -+ return ret; - } - } - -@@ -564,7 +565,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, - io_submit_add_bh(io, inode, - bounce_page ? 
bounce_page : page, bh); - } while ((bh = bh->b_this_page) != head); --unlock: -- unlock_page(page); -- return ret; -+ -+ return 0; - } -diff --git a/fs/ext4/super.c b/fs/ext4/super.c -index c81fa0fa9901..2192b4111442 100644 ---- a/fs/ext4/super.c -+++ b/fs/ext4/super.c -@@ -4751,7 +4751,6 @@ static int ext4_group_desc_init(struct super_block *sb, - struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned int db_count; - ext4_fsblk_t block; -- int ret; - int i; - - db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / -@@ -4791,8 +4790,7 @@ static int ext4_group_desc_init(struct super_block *sb, - ext4_msg(sb, KERN_ERR, - "can't read group descriptor %d", i); - sbi->s_gdb_count = i; -- ret = PTR_ERR(bh); -- goto out; -+ return PTR_ERR(bh); - } - rcu_read_lock(); - rcu_dereference(sbi->s_group_desc)[i] = bh; -@@ -4801,13 +4799,10 @@ static int ext4_group_desc_init(struct super_block *sb, - sbi->s_gdb_count = db_count; - if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { - ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); -- ret = -EFSCORRUPTED; -- goto out; -+ return -EFSCORRUPTED; - } -+ - return 0; --out: -- ext4_group_desc_free(sbi); -- return ret; - } - - static int ext4_load_and_init_journal(struct super_block *sb, -@@ -5234,14 +5229,14 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) - if (ext4_geometry_check(sb, es)) - goto failed_mount; - -- err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); -- if (err) -- goto failed_mount; -- - timer_setup(&sbi->s_err_report, print_daily_error_info, 0); - spin_lock_init(&sbi->s_error_lock); - INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); - -+ err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); -+ if (err) -+ goto failed_mount3; -+ - /* Register extent status tree shrinker */ - if (ext4_es_register_shrinker(sbi)) - goto failed_mount3; -@@ -5967,8 +5962,11 @@ static int ext4_load_journal(struct super_block *sb, - if (!really_read_only && journal_devnum && - journal_devnum != le32_to_cpu(es->s_journal_dev)) { - es->s_journal_dev = cpu_to_le32(journal_devnum); -- -- /* Make sure we flush the recovery flag to disk. 
*/ -+ ext4_commit_super(sb); -+ } -+ if (!really_read_only && journal_inum && -+ journal_inum != le32_to_cpu(es->s_journal_inum)) { -+ es->s_journal_inum = cpu_to_le32(journal_inum); - ext4_commit_super(sb); - } - -diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c -index 0c6b011a91b3..62f2ec599218 100644 ---- a/fs/ext4/xattr.c -+++ b/fs/ext4/xattr.c -@@ -184,27 +184,73 @@ ext4_xattr_handler(int name_index) - } - - static int --ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, -- void *value_start) -+check_xattrs(struct inode *inode, struct buffer_head *bh, -+ struct ext4_xattr_entry *entry, void *end, void *value_start, -+ const char *function, unsigned int line) - { - struct ext4_xattr_entry *e = entry; -+ int err = -EFSCORRUPTED; -+ char *err_str; -+ -+ if (bh) { -+ if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || -+ BHDR(bh)->h_blocks != cpu_to_le32(1)) { -+ err_str = "invalid header"; -+ goto errout; -+ } -+ if (buffer_verified(bh)) -+ return 0; -+ if (!ext4_xattr_block_csum_verify(inode, bh)) { -+ err = -EFSBADCRC; -+ err_str = "invalid checksum"; -+ goto errout; -+ } -+ } else { -+ struct ext4_xattr_ibody_header *header = value_start; -+ -+ header -= 1; -+ if (end - (void *)header < sizeof(*header) + sizeof(u32)) { -+ err_str = "in-inode xattr block too small"; -+ goto errout; -+ } -+ if (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { -+ err_str = "bad magic number in in-inode xattr"; -+ goto errout; -+ } -+ } - - /* Find the end of the names list */ - while (!IS_LAST_ENTRY(e)) { - struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); -- if ((void *)next >= end) -- return -EFSCORRUPTED; -- if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) -- return -EFSCORRUPTED; -+ if ((void *)next >= end) { -+ err_str = "e_name out of bounds"; -+ goto errout; -+ } -+ if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) { -+ err_str = "bad e_name length"; -+ goto errout; -+ } - e = next; - } - - /* Check the values */ - while (!IS_LAST_ENTRY(entry)) { - u32 size = le32_to_cpu(entry->e_value_size); -+ unsigned long ea_ino = le32_to_cpu(entry->e_value_inum); - -- if (size > EXT4_XATTR_SIZE_MAX) -- return -EFSCORRUPTED; -+ if (!ext4_has_feature_ea_inode(inode->i_sb) && ea_ino) { -+ err_str = "ea_inode specified without ea_inode feature enabled"; -+ goto errout; -+ } -+ if (ea_ino && ((ea_ino == EXT4_ROOT_INO) || -+ !ext4_valid_inum(inode->i_sb, ea_ino))) { -+ err_str = "invalid ea_ino"; -+ goto errout; -+ } -+ if (size > EXT4_XATTR_SIZE_MAX) { -+ err_str = "e_value size too large"; -+ goto errout; -+ } - - if (size != 0 && entry->e_value_inum == 0) { - u16 offs = le16_to_cpu(entry->e_value_offs); -@@ -216,66 +262,54 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, - * the padded and unpadded sizes, since the size may - * overflow to 0 when adding padding. 
- */ -- if (offs > end - value_start) -- return -EFSCORRUPTED; -+ if (offs > end - value_start) { -+ err_str = "e_value out of bounds"; -+ goto errout; -+ } - value = value_start + offs; - if (value < (void *)e + sizeof(u32) || - size > end - value || -- EXT4_XATTR_SIZE(size) > end - value) -- return -EFSCORRUPTED; -+ EXT4_XATTR_SIZE(size) > end - value) { -+ err_str = "overlapping e_value "; -+ goto errout; -+ } - } - entry = EXT4_XATTR_NEXT(entry); - } -- -+ if (bh) -+ set_buffer_verified(bh); - return 0; -+ -+errout: -+ if (bh) -+ __ext4_error_inode(inode, function, line, 0, -err, -+ "corrupted xattr block %llu: %s", -+ (unsigned long long) bh->b_blocknr, -+ err_str); -+ else -+ __ext4_error_inode(inode, function, line, 0, -err, -+ "corrupted in-inode xattr: %s", err_str); -+ return err; - } - - static inline int - __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, - const char *function, unsigned int line) - { -- int error = -EFSCORRUPTED; -- -- if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || -- BHDR(bh)->h_blocks != cpu_to_le32(1)) -- goto errout; -- if (buffer_verified(bh)) -- return 0; -- -- error = -EFSBADCRC; -- if (!ext4_xattr_block_csum_verify(inode, bh)) -- goto errout; -- error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size, -- bh->b_data); --errout: -- if (error) -- __ext4_error_inode(inode, function, line, 0, -error, -- "corrupted xattr block %llu", -- (unsigned long long) bh->b_blocknr); -- else -- set_buffer_verified(bh); -- return error; -+ return check_xattrs(inode, bh, BFIRST(bh), bh->b_data + bh->b_size, -+ bh->b_data, function, line); - } - - #define ext4_xattr_check_block(inode, bh) \ - __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) - - --static int -+static inline int - __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, - void *end, const char *function, unsigned int line) - { -- int error = -EFSCORRUPTED; -- -- if (end - (void *)header < sizeof(*header) + sizeof(u32) || -- (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) -- goto errout; -- error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header)); --errout: -- if (error) -- __ext4_error_inode(inode, function, line, 0, -error, -- "corrupted in-inode xattr"); -- return error; -+ return check_xattrs(inode, NULL, IFIRST(header), end, IFIRST(header), -+ function, line); - } - - #define xattr_check_inode(inode, header, end) \ -@@ -388,6 +422,17 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, - struct inode *inode; - int err; - -+ /* -+ * We have to check for this corruption early as otherwise -+ * iget_locked() could wait indefinitely for the state of our -+ * parent inode. 
-+ */ -+ if (parent->i_ino == ea_ino) { -+ ext4_error(parent->i_sb, -+ "Parent and EA inode have the same ino %lu", ea_ino); -+ return -EFSCORRUPTED; -+ } -+ - inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); -diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c -index e7537fd305dd..e191ecfb1fde 100644 ---- a/fs/gfs2/bmap.c -+++ b/fs/gfs2/bmap.c -@@ -956,26 +956,40 @@ static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, - goto out; - } - --static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos, -- unsigned len) -+static struct folio * -+gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len) - { -+ struct inode *inode = iter->inode; - unsigned int blockmask = i_blocksize(inode) - 1; - struct gfs2_sbd *sdp = GFS2_SB(inode); - unsigned int blocks; -+ struct folio *folio; -+ int status; - - blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits; -- return gfs2_trans_begin(sdp, RES_DINODE + blocks, 0); -+ status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0); -+ if (status) -+ return ERR_PTR(status); -+ -+ folio = iomap_get_folio(iter, pos); -+ if (IS_ERR(folio)) -+ gfs2_trans_end(sdp); -+ return folio; - } - --static void gfs2_iomap_page_done(struct inode *inode, loff_t pos, -- unsigned copied, struct page *page) -+static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos, -+ unsigned copied, struct folio *folio) - { - struct gfs2_trans *tr = current->journal_info; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - -- if (page && !gfs2_is_stuffed(ip)) -- gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied); -+ if (!gfs2_is_stuffed(ip)) -+ gfs2_page_add_databufs(ip, &folio->page, offset_in_page(pos), -+ copied); -+ -+ folio_unlock(folio); -+ folio_put(folio); - - if (tr->tr_num_buf_new) - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); -@@ -983,9 +997,9 @@ static void gfs2_iomap_page_done(struct inode *inode, loff_t pos, - gfs2_trans_end(sdp); - } - --static const struct iomap_page_ops gfs2_iomap_page_ops = { -- .page_prepare = gfs2_iomap_page_prepare, -- .page_done = gfs2_iomap_page_done, -+static const struct iomap_folio_ops gfs2_iomap_folio_ops = { -+ .get_folio = gfs2_iomap_get_folio, -+ .put_folio = gfs2_iomap_put_folio, - }; - - static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, -@@ -1061,7 +1075,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, - } - - if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip)) -- iomap->page_ops = &gfs2_iomap_page_ops; -+ iomap->folio_ops = &gfs2_iomap_folio_ops; - return 0; - - out_trans_end: -@@ -1277,7 +1291,7 @@ int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, - /* - * NOTE: Never call gfs2_block_zero_range with an open transaction because it - * uses iomap write to perform its actions, which begin their own transactions -- * (iomap_begin, page_prepare, etc.) -+ * (iomap_begin, get_folio, etc.) 
- */ - static int gfs2_block_zero_range(struct inode *inode, loff_t from, - unsigned int length) -diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c -index 356193e44cf0..d3c300563eb8 100644 ---- a/fs/iomap/buffered-io.c -+++ b/fs/iomap/buffered-io.c -@@ -457,6 +457,33 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) - } - EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); - -+/** -+ * iomap_get_folio - get a folio reference for writing -+ * @iter: iteration structure -+ * @pos: start offset of write -+ * -+ * Returns a locked reference to the folio at @pos, or an error pointer if the -+ * folio could not be obtained. -+ */ -+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) -+{ -+ unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; -+ struct folio *folio; -+ -+ if (iter->flags & IOMAP_NOWAIT) -+ fgp |= FGP_NOWAIT; -+ -+ folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, -+ fgp, mapping_gfp_mask(iter->inode->i_mapping)); -+ if (folio) -+ return folio; -+ -+ if (iter->flags & IOMAP_NOWAIT) -+ return ERR_PTR(-EAGAIN); -+ return ERR_PTR(-ENOMEM); -+} -+EXPORT_SYMBOL_GPL(iomap_get_folio); -+ - bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags) - { - trace_iomap_release_folio(folio->mapping->host, folio_pos(folio), -@@ -575,6 +602,30 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, - return 0; - } - -+static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos, -+ size_t len) -+{ -+ const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; -+ -+ if (folio_ops && folio_ops->get_folio) -+ return folio_ops->get_folio(iter, pos, len); -+ else -+ return iomap_get_folio(iter, pos); -+} -+ -+static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret, -+ struct folio *folio) -+{ -+ const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; -+ -+ if (folio_ops && folio_ops->put_folio) { -+ folio_ops->put_folio(iter->inode, pos, ret, folio); -+ } else { -+ folio_unlock(folio); -+ folio_put(folio); -+ } -+} -+ - static int iomap_write_begin_inline(const struct iomap_iter *iter, - struct folio *folio) - { -@@ -587,15 +638,11 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter, - static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, - size_t len, struct folio **foliop) - { -- const struct iomap_page_ops *page_ops = iter->iomap.page_ops; -+ const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; - const struct iomap *srcmap = iomap_iter_srcmap(iter); - struct folio *folio; -- unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; - int status = 0; - -- if (iter->flags & IOMAP_NOWAIT) -- fgp |= FGP_NOWAIT; -- - BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); - if (srcmap != &iter->iomap) - BUG_ON(pos + len > srcmap->offset + srcmap->length); -@@ -606,18 +653,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, - if (!mapping_large_folio_support(iter->inode->i_mapping)) - len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); - -- if (page_ops && page_ops->page_prepare) { -- status = page_ops->page_prepare(iter->inode, pos, len); -- if (status) -- return status; -- } -- -- folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, -- fgp, mapping_gfp_mask(iter->inode->i_mapping)); -- if (!folio) { -- status = (iter->flags & IOMAP_NOWAIT) ? 
-EAGAIN : -ENOMEM; -- goto out_no_page; -- } -+ folio = __iomap_get_folio(iter, pos, len); -+ if (IS_ERR(folio)) -+ return PTR_ERR(folio); - - /* - * Now we have a locked folio, before we do anything with it we need to -@@ -629,9 +667,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, - * could do the wrong thing here (zero a page range incorrectly or fail - * to zero) and corrupt data. - */ -- if (page_ops && page_ops->iomap_valid) { -- bool iomap_valid = page_ops->iomap_valid(iter->inode, -- &iter->iomap); -+ if (folio_ops && folio_ops->iomap_valid) { -+ bool iomap_valid = folio_ops->iomap_valid(iter->inode, -+ &iter->iomap); - if (!iomap_valid) { - iter->iomap.flags |= IOMAP_F_STALE; - status = 0; -@@ -656,13 +694,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, - return 0; - - out_unlock: -- folio_unlock(folio); -- folio_put(folio); -+ __iomap_put_folio(iter, pos, 0, folio); - iomap_write_failed(iter->inode, pos, len); - --out_no_page: -- if (page_ops && page_ops->page_done) -- page_ops->page_done(iter->inode, pos, 0, NULL); - return status; - } - -@@ -712,7 +746,6 @@ static size_t iomap_write_end_inline(const struct iomap_iter *iter, - static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, - size_t copied, struct folio *folio) - { -- const struct iomap_page_ops *page_ops = iter->iomap.page_ops; - const struct iomap *srcmap = iomap_iter_srcmap(iter); - loff_t old_size = iter->inode->i_size; - size_t ret; -@@ -735,14 +768,10 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, - i_size_write(iter->inode, pos + ret); - iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; - } -- folio_unlock(folio); -+ __iomap_put_folio(iter, pos, ret, folio); - - if (old_size < pos) - pagecache_isize_extended(iter->inode, old_size, pos); -- if (page_ops && page_ops->page_done) -- page_ops->page_done(iter->inode, pos, ret, &folio->page); -- folio_put(folio); -- - if (ret < len) - iomap_write_failed(iter->inode, pos + ret, len - ret); - return ret; -diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c -index 9804714b1751..f771001574d0 100644 ---- a/fs/iomap/direct-io.c -+++ b/fs/iomap/direct-io.c -@@ -217,16 +217,10 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, - { - blk_opf_t opflags = REQ_SYNC | REQ_IDLE; - -- if (!(dio->flags & IOMAP_DIO_WRITE)) { -- WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND); -+ if (!(dio->flags & IOMAP_DIO_WRITE)) - return REQ_OP_READ; -- } -- -- if (iomap->flags & IOMAP_F_ZONE_APPEND) -- opflags |= REQ_OP_ZONE_APPEND; -- else -- opflags |= REQ_OP_WRITE; - -+ opflags |= REQ_OP_WRITE; - if (use_fua) - opflags |= REQ_FUA; - else -diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c -index 989cf341779b..f8ff81c3de76 100644 ---- a/fs/xfs/libxfs/xfs_alloc.c -+++ b/fs/xfs/libxfs/xfs_alloc.c -@@ -2472,20 +2472,20 @@ xfs_defer_agfl_block( - struct xfs_owner_info *oinfo) - { - struct xfs_mount *mp = tp->t_mountp; -- struct xfs_extent_free_item *new; /* new element */ -+ struct xfs_extent_free_item *xefi; - - ASSERT(xfs_extfree_item_cache != NULL); - ASSERT(oinfo != NULL); - -- new = kmem_cache_zalloc(xfs_extfree_item_cache, -+ xefi = kmem_cache_zalloc(xfs_extfree_item_cache, - GFP_KERNEL | __GFP_NOFAIL); -- new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); -- new->xefi_blockcount = 1; -- new->xefi_owner = oinfo->oi_owner; -+ xefi->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); -+ xefi->xefi_blockcount = 1; -+ xefi->xefi_owner = oinfo->oi_owner; - - 
trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); - -- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list); -+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); - } - - /* -@@ -2500,7 +2500,7 @@ __xfs_free_extent_later( - const struct xfs_owner_info *oinfo, - bool skip_discard) - { -- struct xfs_extent_free_item *new; /* new element */ -+ struct xfs_extent_free_item *xefi; - #ifdef DEBUG - struct xfs_mount *mp = tp->t_mountp; - xfs_agnumber_t agno; -@@ -2519,27 +2519,27 @@ __xfs_free_extent_later( - #endif - ASSERT(xfs_extfree_item_cache != NULL); - -- new = kmem_cache_zalloc(xfs_extfree_item_cache, -+ xefi = kmem_cache_zalloc(xfs_extfree_item_cache, - GFP_KERNEL | __GFP_NOFAIL); -- new->xefi_startblock = bno; -- new->xefi_blockcount = (xfs_extlen_t)len; -+ xefi->xefi_startblock = bno; -+ xefi->xefi_blockcount = (xfs_extlen_t)len; - if (skip_discard) -- new->xefi_flags |= XFS_EFI_SKIP_DISCARD; -+ xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; - if (oinfo) { - ASSERT(oinfo->oi_offset == 0); - - if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) -- new->xefi_flags |= XFS_EFI_ATTR_FORK; -+ xefi->xefi_flags |= XFS_EFI_ATTR_FORK; - if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) -- new->xefi_flags |= XFS_EFI_BMBT_BLOCK; -- new->xefi_owner = oinfo->oi_owner; -+ xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK; -+ xefi->xefi_owner = oinfo->oi_owner; - } else { -- new->xefi_owner = XFS_RMAP_OWN_NULL; -+ xefi->xefi_owner = XFS_RMAP_OWN_NULL; - } - trace_xfs_bmap_free_defer(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, - XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); -- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list); -+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); - } - - #ifdef DEBUG -diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c -index 0d56a8d862e8..c8c65387136c 100644 ---- a/fs/xfs/libxfs/xfs_bmap.c -+++ b/fs/xfs/libxfs/xfs_bmap.c -@@ -6146,39 +6146,37 @@ xfs_bmap_unmap_extent( - int - xfs_bmap_finish_one( - struct xfs_trans *tp, -- struct xfs_inode *ip, -- enum xfs_bmap_intent_type type, -- int whichfork, -- xfs_fileoff_t startoff, -- xfs_fsblock_t startblock, -- xfs_filblks_t *blockcount, -- xfs_exntst_t state) -+ struct xfs_bmap_intent *bi) - { -+ struct xfs_bmbt_irec *bmap = &bi->bi_bmap; - int error = 0; - - ASSERT(tp->t_firstblock == NULLFSBLOCK); - - trace_xfs_bmap_deferred(tp->t_mountp, -- XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, -- XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), -- ip->i_ino, whichfork, startoff, *blockcount, state); -+ XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock), -+ bi->bi_type, -+ XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock), -+ bi->bi_owner->i_ino, bi->bi_whichfork, -+ bmap->br_startoff, bmap->br_blockcount, -+ bmap->br_state); - -- if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK)) -+ if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK)) - return -EFSCORRUPTED; - - if (XFS_TEST_ERROR(false, tp->t_mountp, - XFS_ERRTAG_BMAP_FINISH_ONE)) - return -EIO; - -- switch (type) { -+ switch (bi->bi_type) { - case XFS_BMAP_MAP: -- error = xfs_bmapi_remap(tp, ip, startoff, *blockcount, -- startblock, 0); -- *blockcount = 0; -+ error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff, -+ bmap->br_blockcount, bmap->br_startblock, 0); -+ bmap->br_blockcount = 0; - break; - case XFS_BMAP_UNMAP: -- error = __xfs_bunmapi(tp, ip, startoff, blockcount, -- XFS_BMAPI_REMAP, 1); -+ error = __xfs_bunmapi(tp, bi->bi_owner, bmap->br_startoff, -+ &bmap->br_blockcount, XFS_BMAPI_REMAP, 1); - break; - default: - 
ASSERT(0); -diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h -index 16db95b11589..01c2df35c3e3 100644 ---- a/fs/xfs/libxfs/xfs_bmap.h -+++ b/fs/xfs/libxfs/xfs_bmap.h -@@ -234,10 +234,7 @@ struct xfs_bmap_intent { - struct xfs_bmbt_irec bi_bmap; - }; - --int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip, -- enum xfs_bmap_intent_type type, int whichfork, -- xfs_fileoff_t startoff, xfs_fsblock_t startblock, -- xfs_filblks_t *blockcount, xfs_exntst_t state); -+int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi); - void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, - struct xfs_bmbt_irec *imap); - void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, -diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c -index 35f574421670..da8c769887fd 100644 ---- a/fs/xfs/libxfs/xfs_btree.c -+++ b/fs/xfs/libxfs/xfs_btree.c -@@ -2913,9 +2913,22 @@ xfs_btree_split_worker( - } - - /* -- * BMBT split requests often come in with little stack to work on. Push -+ * BMBT split requests often come in with little stack to work on so we push - * them off to a worker thread so there is lots of stack to use. For the other - * btree types, just call directly to avoid the context switch overhead here. -+ * -+ * Care must be taken here - the work queue rescuer thread introduces potential -+ * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new -+ * AGFs to allocate blocks. A task being run by the rescuer could attempt to -+ * lock an AGF that is already locked by a task queued to run by the rescuer, -+ * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to -+ * release it until the current thread it is running gains the lock. -+ * -+ * To avoid this issue, we only ever queue BMBT splits that don't have an AGF -+ * already locked to allocate from. The only place that doesn't hold an AGF -+ * locked is unwritten extent conversion at IO completion, but that has already -+ * been offloaded to a worker thread and hence has no stack consumption issues -+ * we have to worry about. 
- */ - STATIC int /* error */ - xfs_btree_split( -@@ -2929,7 +2942,8 @@ xfs_btree_split( - struct xfs_btree_split_args args; - DECLARE_COMPLETION_ONSTACK(done); - -- if (cur->bc_btnum != XFS_BTNUM_BMAP) -+ if (cur->bc_btnum != XFS_BTNUM_BMAP || -+ cur->bc_tp->t_firstblock == NULLFSBLOCK) - return __xfs_btree_split(cur, level, ptrp, key, curp, stat); - - args.cur = cur; -diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c -index 6f7ed9288fe4..bcf46aa0d08b 100644 ---- a/fs/xfs/libxfs/xfs_refcount.c -+++ b/fs/xfs/libxfs/xfs_refcount.c -@@ -1213,37 +1213,33 @@ xfs_refcount_adjust_extents( - STATIC int - xfs_refcount_adjust( - struct xfs_btree_cur *cur, -- xfs_agblock_t agbno, -- xfs_extlen_t aglen, -- xfs_agblock_t *new_agbno, -- xfs_extlen_t *new_aglen, -+ xfs_agblock_t *agbno, -+ xfs_extlen_t *aglen, - enum xfs_refc_adjust_op adj) - { - bool shape_changed; - int shape_changes = 0; - int error; - -- *new_agbno = agbno; -- *new_aglen = aglen; - if (adj == XFS_REFCOUNT_ADJUST_INCREASE) -- trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.pag->pag_agno, -- agbno, aglen); -+ trace_xfs_refcount_increase(cur->bc_mp, -+ cur->bc_ag.pag->pag_agno, *agbno, *aglen); - else -- trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.pag->pag_agno, -- agbno, aglen); -+ trace_xfs_refcount_decrease(cur->bc_mp, -+ cur->bc_ag.pag->pag_agno, *agbno, *aglen); - - /* - * Ensure that no rcextents cross the boundary of the adjustment range. - */ - error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, -- agbno, &shape_changed); -+ *agbno, &shape_changed); - if (error) - goto out_error; - if (shape_changed) - shape_changes++; - - error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, -- agbno + aglen, &shape_changed); -+ *agbno + *aglen, &shape_changed); - if (error) - goto out_error; - if (shape_changed) -@@ -1253,7 +1249,7 @@ xfs_refcount_adjust( - * Try to merge with the left or right extents of the range. 
- */ - error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED, -- new_agbno, new_aglen, adj, &shape_changed); -+ agbno, aglen, adj, &shape_changed); - if (error) - goto out_error; - if (shape_changed) -@@ -1262,7 +1258,7 @@ xfs_refcount_adjust( - cur->bc_ag.refc.shape_changes++; - - /* Now that we've taken care of the ends, adjust the middle extents */ -- error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, adj); -+ error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj); - if (error) - goto out_error; - -@@ -1298,21 +1294,20 @@ xfs_refcount_finish_one_cleanup( - static inline int - xfs_refcount_continue_op( - struct xfs_btree_cur *cur, -- xfs_fsblock_t startblock, -- xfs_agblock_t new_agbno, -- xfs_extlen_t new_len, -- xfs_fsblock_t *new_fsbno) -+ struct xfs_refcount_intent *ri, -+ xfs_agblock_t new_agbno) - { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_perag *pag = cur->bc_ag.pag; - -- if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len))) -+ if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, -+ ri->ri_blockcount))) - return -EFSCORRUPTED; - -- *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); -+ ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); - -- ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len)); -- ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno)); -+ ASSERT(xfs_verify_fsbext(mp, ri->ri_startblock, ri->ri_blockcount)); -+ ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); - - return 0; - } -@@ -1327,11 +1322,7 @@ xfs_refcount_continue_op( - int - xfs_refcount_finish_one( - struct xfs_trans *tp, -- enum xfs_refcount_intent_type type, -- xfs_fsblock_t startblock, -- xfs_extlen_t blockcount, -- xfs_fsblock_t *new_fsb, -- xfs_extlen_t *new_len, -+ struct xfs_refcount_intent *ri, - struct xfs_btree_cur **pcur) - { - struct xfs_mount *mp = tp->t_mountp; -@@ -1339,17 +1330,16 @@ xfs_refcount_finish_one( - struct xfs_buf *agbp = NULL; - int error = 0; - xfs_agblock_t bno; -- xfs_agblock_t new_agbno; - unsigned long nr_ops = 0; - int shape_changes = 0; - struct xfs_perag *pag; - -- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); -- bno = XFS_FSB_TO_AGBNO(mp, startblock); -+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); -+ bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock); - -- trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock), -- type, XFS_FSB_TO_AGBNO(mp, startblock), -- blockcount); -+ trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock), -+ ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock), -+ ri->ri_blockcount); - - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) { - error = -EIO; -@@ -1380,42 +1370,42 @@ xfs_refcount_finish_one( - } - *pcur = rcur; - -- switch (type) { -+ switch (ri->ri_type) { - case XFS_REFCOUNT_INCREASE: -- error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, -- new_len, XFS_REFCOUNT_ADJUST_INCREASE); -+ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, -+ XFS_REFCOUNT_ADJUST_INCREASE); - if (error) - goto out_drop; -- if (*new_len > 0) -- error = xfs_refcount_continue_op(rcur, startblock, -- new_agbno, *new_len, new_fsb); -+ if (ri->ri_blockcount > 0) -+ error = xfs_refcount_continue_op(rcur, ri, bno); - break; - case XFS_REFCOUNT_DECREASE: -- error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, -- new_len, XFS_REFCOUNT_ADJUST_DECREASE); -+ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, -+ XFS_REFCOUNT_ADJUST_DECREASE); - if (error) - goto out_drop; -- if 
(*new_len > 0) -- error = xfs_refcount_continue_op(rcur, startblock, -- new_agbno, *new_len, new_fsb); -+ if (ri->ri_blockcount > 0) -+ error = xfs_refcount_continue_op(rcur, ri, bno); - break; - case XFS_REFCOUNT_ALLOC_COW: -- *new_fsb = startblock + blockcount; -- *new_len = 0; -- error = __xfs_refcount_cow_alloc(rcur, bno, blockcount); -+ error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount); -+ if (error) -+ goto out_drop; -+ ri->ri_blockcount = 0; - break; - case XFS_REFCOUNT_FREE_COW: -- *new_fsb = startblock + blockcount; -- *new_len = 0; -- error = __xfs_refcount_cow_free(rcur, bno, blockcount); -+ error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount); -+ if (error) -+ goto out_drop; -+ ri->ri_blockcount = 0; - break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - } -- if (!error && *new_len > 0) -- trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, type, -- bno, blockcount, new_agbno, *new_len); -+ if (!error && ri->ri_blockcount > 0) -+ trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, -+ ri->ri_type, bno, ri->ri_blockcount); - out_drop: - xfs_perag_put(pag); - return error; -diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h -index 452f30556f5a..c633477ce3ce 100644 ---- a/fs/xfs/libxfs/xfs_refcount.h -+++ b/fs/xfs/libxfs/xfs_refcount.h -@@ -75,9 +75,7 @@ void xfs_refcount_decrease_extent(struct xfs_trans *tp, - extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, - struct xfs_btree_cur *rcur, int error); - extern int xfs_refcount_finish_one(struct xfs_trans *tp, -- enum xfs_refcount_intent_type type, xfs_fsblock_t startblock, -- xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb, -- xfs_extlen_t *new_len, struct xfs_btree_cur **pcur); -+ struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); - - extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, - xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, -diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c -index b56aca1e7c66..df720041cd3d 100644 ---- a/fs/xfs/libxfs/xfs_rmap.c -+++ b/fs/xfs/libxfs/xfs_rmap.c -@@ -2390,13 +2390,7 @@ xfs_rmap_finish_one_cleanup( - int - xfs_rmap_finish_one( - struct xfs_trans *tp, -- enum xfs_rmap_intent_type type, -- uint64_t owner, -- int whichfork, -- xfs_fileoff_t startoff, -- xfs_fsblock_t startblock, -- xfs_filblks_t blockcount, -- xfs_exntst_t state, -+ struct xfs_rmap_intent *ri, - struct xfs_btree_cur **pcur) - { - struct xfs_mount *mp = tp->t_mountp; -@@ -2408,11 +2402,13 @@ xfs_rmap_finish_one( - xfs_agblock_t bno; - bool unwritten; - -- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); -- bno = XFS_FSB_TO_AGBNO(mp, startblock); -+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock)); -+ bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock); - -- trace_xfs_rmap_deferred(mp, pag->pag_agno, type, bno, owner, whichfork, -- startoff, blockcount, state); -+ trace_xfs_rmap_deferred(mp, pag->pag_agno, ri->ri_type, bno, -+ ri->ri_owner, ri->ri_whichfork, -+ ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount, -+ ri->ri_bmap.br_state); - - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) { - error = -EIO; -@@ -2448,35 +2444,37 @@ xfs_rmap_finish_one( - } - *pcur = rcur; - -- xfs_rmap_ino_owner(&oinfo, owner, whichfork, startoff); -- unwritten = state == XFS_EXT_UNWRITTEN; -- bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, startblock); -+ xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork, -+ ri->ri_bmap.br_startoff); -+ unwritten = ri->ri_bmap.br_state == 
XFS_EXT_UNWRITTEN; -+ bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock); - -- switch (type) { -+ switch (ri->ri_type) { - case XFS_RMAP_ALLOC: - case XFS_RMAP_MAP: -- error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo); -+ error = xfs_rmap_map(rcur, bno, ri->ri_bmap.br_blockcount, -+ unwritten, &oinfo); - break; - case XFS_RMAP_MAP_SHARED: -- error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten, -- &oinfo); -+ error = xfs_rmap_map_shared(rcur, bno, -+ ri->ri_bmap.br_blockcount, unwritten, &oinfo); - break; - case XFS_RMAP_FREE: - case XFS_RMAP_UNMAP: -- error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten, -- &oinfo); -+ error = xfs_rmap_unmap(rcur, bno, ri->ri_bmap.br_blockcount, -+ unwritten, &oinfo); - break; - case XFS_RMAP_UNMAP_SHARED: -- error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten, -- &oinfo); -+ error = xfs_rmap_unmap_shared(rcur, bno, -+ ri->ri_bmap.br_blockcount, unwritten, &oinfo); - break; - case XFS_RMAP_CONVERT: -- error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten, -- &oinfo); -+ error = xfs_rmap_convert(rcur, bno, ri->ri_bmap.br_blockcount, -+ !unwritten, &oinfo); - break; - case XFS_RMAP_CONVERT_SHARED: -- error = xfs_rmap_convert_shared(rcur, bno, blockcount, -- !unwritten, &oinfo); -+ error = xfs_rmap_convert_shared(rcur, bno, -+ ri->ri_bmap.br_blockcount, !unwritten, &oinfo); - break; - default: - ASSERT(0); -diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h -index 54741a591a17..2dac88cea28d 100644 ---- a/fs/xfs/libxfs/xfs_rmap.h -+++ b/fs/xfs/libxfs/xfs_rmap.h -@@ -179,10 +179,8 @@ void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, - - void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, - struct xfs_btree_cur *rcur, int error); --int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, -- uint64_t owner, int whichfork, xfs_fileoff_t startoff, -- xfs_fsblock_t startblock, xfs_filblks_t blockcount, -- xfs_exntst_t state, struct xfs_btree_cur **pcur); -+int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri, -+ struct xfs_btree_cur **pcur); - - int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, - uint64_t owner, uint64_t offset, unsigned int flags, -diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c -index 41323da523d1..6e2f0013380a 100644 ---- a/fs/xfs/xfs_bmap_item.c -+++ b/fs/xfs/xfs_bmap_item.c -@@ -246,18 +246,11 @@ static int - xfs_trans_log_finish_bmap_update( - struct xfs_trans *tp, - struct xfs_bud_log_item *budp, -- enum xfs_bmap_intent_type type, -- struct xfs_inode *ip, -- int whichfork, -- xfs_fileoff_t startoff, -- xfs_fsblock_t startblock, -- xfs_filblks_t *blockcount, -- xfs_exntst_t state) -+ struct xfs_bmap_intent *bi) - { - int error; - -- error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff, -- startblock, blockcount, state); -+ error = xfs_bmap_finish_one(tp, bi); - - /* - * Mark the transaction dirty, even on error. This ensures the -@@ -290,24 +283,24 @@ xfs_bmap_update_diff_items( - /* Set the map extent flags for this mapping. 
*/ - static void - xfs_trans_set_bmap_flags( -- struct xfs_map_extent *bmap, -+ struct xfs_map_extent *map, - enum xfs_bmap_intent_type type, - int whichfork, - xfs_exntst_t state) - { -- bmap->me_flags = 0; -+ map->me_flags = 0; - switch (type) { - case XFS_BMAP_MAP: - case XFS_BMAP_UNMAP: -- bmap->me_flags = type; -+ map->me_flags = type; - break; - default: - ASSERT(0); - } - if (state == XFS_EXT_UNWRITTEN) -- bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; -+ map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; - if (whichfork == XFS_ATTR_FORK) -- bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; -+ map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; - } - - /* Log bmap updates in the intent item. */ -@@ -315,7 +308,7 @@ STATIC void - xfs_bmap_update_log_item( - struct xfs_trans *tp, - struct xfs_bui_log_item *buip, -- struct xfs_bmap_intent *bmap) -+ struct xfs_bmap_intent *bi) - { - uint next_extent; - struct xfs_map_extent *map; -@@ -331,12 +324,12 @@ xfs_bmap_update_log_item( - next_extent = atomic_inc_return(&buip->bui_next_extent) - 1; - ASSERT(next_extent < buip->bui_format.bui_nextents); - map = &buip->bui_format.bui_extents[next_extent]; -- map->me_owner = bmap->bi_owner->i_ino; -- map->me_startblock = bmap->bi_bmap.br_startblock; -- map->me_startoff = bmap->bi_bmap.br_startoff; -- map->me_len = bmap->bi_bmap.br_blockcount; -- xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork, -- bmap->bi_bmap.br_state); -+ map->me_owner = bi->bi_owner->i_ino; -+ map->me_startblock = bi->bi_bmap.br_startblock; -+ map->me_startoff = bi->bi_bmap.br_startoff; -+ map->me_len = bi->bi_bmap.br_blockcount; -+ xfs_trans_set_bmap_flags(map, bi->bi_type, bi->bi_whichfork, -+ bi->bi_bmap.br_state); - } - - static struct xfs_log_item * -@@ -348,15 +341,15 @@ xfs_bmap_update_create_intent( - { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_bui_log_item *buip = xfs_bui_init(mp); -- struct xfs_bmap_intent *bmap; -+ struct xfs_bmap_intent *bi; - - ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); - - xfs_trans_add_item(tp, &buip->bui_item); - if (sort) - list_sort(mp, items, xfs_bmap_update_diff_items); -- list_for_each_entry(bmap, items, bi_list) -- xfs_bmap_update_log_item(tp, buip, bmap); -+ list_for_each_entry(bi, items, bi_list) -+ xfs_bmap_update_log_item(tp, buip, bi); - return &buip->bui_item; - } - -@@ -378,25 +371,17 @@ xfs_bmap_update_finish_item( - struct list_head *item, - struct xfs_btree_cur **state) - { -- struct xfs_bmap_intent *bmap; -- xfs_filblks_t count; -+ struct xfs_bmap_intent *bi; - int error; - -- bmap = container_of(item, struct xfs_bmap_intent, bi_list); -- count = bmap->bi_bmap.br_blockcount; -- error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), -- bmap->bi_type, -- bmap->bi_owner, bmap->bi_whichfork, -- bmap->bi_bmap.br_startoff, -- bmap->bi_bmap.br_startblock, -- &count, -- bmap->bi_bmap.br_state); -- if (!error && count > 0) { -- ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); -- bmap->bi_bmap.br_blockcount = count; -+ bi = container_of(item, struct xfs_bmap_intent, bi_list); -+ -+ error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi); -+ if (!error && bi->bi_bmap.br_blockcount > 0) { -+ ASSERT(bi->bi_type == XFS_BMAP_UNMAP); - return -EAGAIN; - } -- kmem_cache_free(xfs_bmap_intent_cache, bmap); -+ kmem_cache_free(xfs_bmap_intent_cache, bi); - return error; - } - -@@ -413,10 +398,10 @@ STATIC void - xfs_bmap_update_cancel_item( - struct list_head *item) - { -- struct xfs_bmap_intent *bmap; -+ struct xfs_bmap_intent *bi; - -- bmap = container_of(item, struct xfs_bmap_intent, 
bi_list); -- kmem_cache_free(xfs_bmap_intent_cache, bmap); -+ bi = container_of(item, struct xfs_bmap_intent, bi_list); -+ kmem_cache_free(xfs_bmap_intent_cache, bi); - } - - const struct xfs_defer_op_type xfs_bmap_update_defer_type = { -@@ -434,18 +419,18 @@ xfs_bui_validate( - struct xfs_mount *mp, - struct xfs_bui_log_item *buip) - { -- struct xfs_map_extent *bmap; -+ struct xfs_map_extent *map; - - /* Only one mapping operation per BUI... */ - if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) - return false; - -- bmap = &buip->bui_format.bui_extents[0]; -+ map = &buip->bui_format.bui_extents[0]; - -- if (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS) -+ if (map->me_flags & ~XFS_BMAP_EXTENT_FLAGS) - return false; - -- switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { -+ switch (map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { - case XFS_BMAP_MAP: - case XFS_BMAP_UNMAP: - break; -@@ -453,13 +438,13 @@ xfs_bui_validate( - return false; - } - -- if (!xfs_verify_ino(mp, bmap->me_owner)) -+ if (!xfs_verify_ino(mp, map->me_owner)) - return false; - -- if (!xfs_verify_fileext(mp, bmap->me_startoff, bmap->me_len)) -+ if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) - return false; - -- return xfs_verify_fsbext(mp, bmap->me_startblock, bmap->me_len); -+ return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); - } - - /* -@@ -471,17 +456,13 @@ xfs_bui_item_recover( - struct xfs_log_item *lip, - struct list_head *capture_list) - { -- struct xfs_bmbt_irec irec; -+ struct xfs_bmap_intent fake = { }; - struct xfs_bui_log_item *buip = BUI_ITEM(lip); - struct xfs_trans *tp; - struct xfs_inode *ip = NULL; - struct xfs_mount *mp = lip->li_log->l_mp; -- struct xfs_map_extent *bmap; -+ struct xfs_map_extent *map; - struct xfs_bud_log_item *budp; -- xfs_filblks_t count; -- xfs_exntst_t state; -- unsigned int bui_type; -- int whichfork; - int iext_delta; - int error = 0; - -@@ -491,14 +472,12 @@ xfs_bui_item_recover( - return -EFSCORRUPTED; - } - -- bmap = &buip->bui_format.bui_extents[0]; -- state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? -- XFS_EXT_UNWRITTEN : XFS_EXT_NORM; -- whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? -+ map = &buip->bui_format.bui_extents[0]; -+ fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; -- bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; -+ fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; - -- error = xlog_recover_iget(mp, bmap->me_owner, &ip); -+ error = xlog_recover_iget(mp, map->me_owner, &ip); - if (error) - return error; - -@@ -512,34 +491,34 @@ xfs_bui_item_recover( - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - -- if (bui_type == XFS_BMAP_MAP) -+ if (fake.bi_type == XFS_BMAP_MAP) - iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; - else - iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; - -- error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta); -+ error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta); - if (error == -EFBIG) - error = xfs_iext_count_upgrade(tp, ip, iext_delta); - if (error) - goto err_cancel; - -- count = bmap->me_len; -- error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, -- whichfork, bmap->me_startoff, bmap->me_startblock, -- &count, state); -+ fake.bi_owner = ip; -+ fake.bi_bmap.br_startblock = map->me_startblock; -+ fake.bi_bmap.br_startoff = map->me_startoff; -+ fake.bi_bmap.br_blockcount = map->me_len; -+ fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? 
-+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; -+ -+ error = xfs_trans_log_finish_bmap_update(tp, budp, &fake); - if (error == -EFSCORRUPTED) -- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bmap, -- sizeof(*bmap)); -+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, -+ sizeof(*map)); - if (error) - goto err_cancel; - -- if (count > 0) { -- ASSERT(bui_type == XFS_BMAP_UNMAP); -- irec.br_startblock = bmap->me_startblock; -- irec.br_blockcount = count; -- irec.br_startoff = bmap->me_startoff; -- irec.br_state = state; -- xfs_bmap_unmap_extent(tp, ip, &irec); -+ if (fake.bi_bmap.br_blockcount > 0) { -+ ASSERT(fake.bi_type == XFS_BMAP_UNMAP); -+ xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap); - } - - /* -@@ -579,18 +558,18 @@ xfs_bui_item_relog( - { - struct xfs_bud_log_item *budp; - struct xfs_bui_log_item *buip; -- struct xfs_map_extent *extp; -+ struct xfs_map_extent *map; - unsigned int count; - - count = BUI_ITEM(intent)->bui_format.bui_nextents; -- extp = BUI_ITEM(intent)->bui_format.bui_extents; -+ map = BUI_ITEM(intent)->bui_format.bui_extents; - - tp->t_flags |= XFS_TRANS_DIRTY; - budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); - - buip = xfs_bui_init(tp->t_mountp); -- memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp)); -+ memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map)); - atomic_set(&buip->bui_next_extent, count); - xfs_trans_add_item(tp, &buip->bui_item); - set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); -diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c -index ae082808cfed..b2cbbba3e15a 100644 ---- a/fs/xfs/xfs_error.c -+++ b/fs/xfs/xfs_error.c -@@ -228,7 +228,7 @@ static struct attribute *xfs_errortag_attrs[] = { - }; - ATTRIBUTE_GROUPS(xfs_errortag); - --static struct kobj_type xfs_errortag_ktype = { -+static const struct kobj_type xfs_errortag_ktype = { - .release = xfs_sysfs_release, - .sysfs_ops = &xfs_errortag_sysfs_ops, - .default_groups = xfs_errortag_groups, -diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h -index dbe6c37dc697..0b9c5ba8a598 100644 ---- a/fs/xfs/xfs_error.h -+++ b/fs/xfs/xfs_error.h -@@ -75,7 +75,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); - - /* - * XFS panic tags -- allow a call to xfs_alert_tag() be turned into -- * a panic by setting xfs_panic_mask in a sysctl. -+ * a panic by setting fs.xfs.panic_mask in a sysctl. 
- */ - #define XFS_NO_PTAG 0u - #define XFS_PTAG_IFLUSH (1u << 0) -@@ -88,6 +88,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); - #define XFS_PTAG_FSBLOCK_ZERO (1u << 7) - #define XFS_PTAG_VERIFIER_ERROR (1u << 8) - -+#define XFS_PTAG_MASK (XFS_PTAG_IFLUSH | \ -+ XFS_PTAG_LOGRES | \ -+ XFS_PTAG_AILDELETE | \ -+ XFS_PTAG_ERROR_REPORT | \ -+ XFS_PTAG_SHUTDOWN_CORRUPT | \ -+ XFS_PTAG_SHUTDOWN_IOERROR | \ -+ XFS_PTAG_SHUTDOWN_LOGERROR | \ -+ XFS_PTAG_FSBLOCK_ZERO | \ -+ XFS_PTAG_VERIFIER_ERROR) -+ - #define XFS_PTAG_STRINGS \ - { XFS_NO_PTAG, "none" }, \ - { XFS_PTAG_IFLUSH, "iflush" }, \ -diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c -index d5130d1fcfae..011b50469301 100644 ---- a/fs/xfs/xfs_extfree_item.c -+++ b/fs/xfs/xfs_extfree_item.c -@@ -345,23 +345,30 @@ static int - xfs_trans_free_extent( - struct xfs_trans *tp, - struct xfs_efd_log_item *efdp, -- xfs_fsblock_t start_block, -- xfs_extlen_t ext_len, -- const struct xfs_owner_info *oinfo, -- bool skip_discard) -+ struct xfs_extent_free_item *xefi) - { -+ struct xfs_owner_info oinfo = { }; - struct xfs_mount *mp = tp->t_mountp; - struct xfs_extent *extp; - uint next_extent; -- xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block); -+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, -+ xefi->xefi_startblock); - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, -- start_block); -+ xefi->xefi_startblock); - int error; - -- trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); -+ oinfo.oi_owner = xefi->xefi_owner; -+ if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) -+ oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; -+ if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) -+ oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; -+ -+ trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, -+ xefi->xefi_blockcount); - -- error = __xfs_free_extent(tp, start_block, ext_len, -- oinfo, XFS_AG_RESV_NONE, skip_discard); -+ error = __xfs_free_extent(tp, xefi->xefi_startblock, -+ xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE, -+ xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); - /* - * Mark the transaction dirty, even on error. 
This ensures the - * transaction is aborted, which: -@@ -375,8 +382,8 @@ xfs_trans_free_extent( - next_extent = efdp->efd_next_extent; - ASSERT(next_extent < efdp->efd_format.efd_nextents); - extp = &(efdp->efd_format.efd_extents[next_extent]); -- extp->ext_start = start_block; -- extp->ext_len = ext_len; -+ extp->ext_start = xefi->xefi_startblock; -+ extp->ext_len = xefi->xefi_blockcount; - efdp->efd_next_extent++; - - return error; -@@ -404,7 +411,7 @@ STATIC void - xfs_extent_free_log_item( - struct xfs_trans *tp, - struct xfs_efi_log_item *efip, -- struct xfs_extent_free_item *free) -+ struct xfs_extent_free_item *xefi) - { - uint next_extent; - struct xfs_extent *extp; -@@ -420,8 +427,8 @@ xfs_extent_free_log_item( - next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; - ASSERT(next_extent < efip->efi_format.efi_nextents); - extp = &efip->efi_format.efi_extents[next_extent]; -- extp->ext_start = free->xefi_startblock; -- extp->ext_len = free->xefi_blockcount; -+ extp->ext_start = xefi->xefi_startblock; -+ extp->ext_len = xefi->xefi_blockcount; - } - - static struct xfs_log_item * -@@ -433,15 +440,15 @@ xfs_extent_free_create_intent( - { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_efi_log_item *efip = xfs_efi_init(mp, count); -- struct xfs_extent_free_item *free; -+ struct xfs_extent_free_item *xefi; - - ASSERT(count > 0); - - xfs_trans_add_item(tp, &efip->efi_item); - if (sort) - list_sort(mp, items, xfs_extent_free_diff_items); -- list_for_each_entry(free, items, xefi_list) -- xfs_extent_free_log_item(tp, efip, free); -+ list_for_each_entry(xefi, items, xefi_list) -+ xfs_extent_free_log_item(tp, efip, xefi); - return &efip->efi_item; - } - -@@ -463,21 +470,13 @@ xfs_extent_free_finish_item( - struct list_head *item, - struct xfs_btree_cur **state) - { -- struct xfs_owner_info oinfo = { }; -- struct xfs_extent_free_item *free; -+ struct xfs_extent_free_item *xefi; - int error; - -- free = container_of(item, struct xfs_extent_free_item, xefi_list); -- oinfo.oi_owner = free->xefi_owner; -- if (free->xefi_flags & XFS_EFI_ATTR_FORK) -- oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; -- if (free->xefi_flags & XFS_EFI_BMBT_BLOCK) -- oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; -- error = xfs_trans_free_extent(tp, EFD_ITEM(done), -- free->xefi_startblock, -- free->xefi_blockcount, -- &oinfo, free->xefi_flags & XFS_EFI_SKIP_DISCARD); -- kmem_cache_free(xfs_extfree_item_cache, free); -+ xefi = container_of(item, struct xfs_extent_free_item, xefi_list); -+ -+ error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); -+ kmem_cache_free(xfs_extfree_item_cache, xefi); - return error; - } - -@@ -494,10 +493,10 @@ STATIC void - xfs_extent_free_cancel_item( - struct list_head *item) - { -- struct xfs_extent_free_item *free; -+ struct xfs_extent_free_item *xefi; - -- free = container_of(item, struct xfs_extent_free_item, xefi_list); -- kmem_cache_free(xfs_extfree_item_cache, free); -+ xefi = container_of(item, struct xfs_extent_free_item, xefi_list); -+ kmem_cache_free(xfs_extfree_item_cache, xefi); - } - - const struct xfs_defer_op_type xfs_extent_free_defer_type = { -@@ -523,7 +522,7 @@ xfs_agfl_free_finish_item( - struct xfs_owner_info oinfo = { }; - struct xfs_mount *mp = tp->t_mountp; - struct xfs_efd_log_item *efdp = EFD_ITEM(done); -- struct xfs_extent_free_item *free; -+ struct xfs_extent_free_item *xefi; - struct xfs_extent *extp; - struct xfs_buf *agbp; - int error; -@@ -532,13 +531,13 @@ xfs_agfl_free_finish_item( - uint next_extent; - struct xfs_perag *pag; - -- free = 
container_of(item, struct xfs_extent_free_item, xefi_list); -- ASSERT(free->xefi_blockcount == 1); -- agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock); -- agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock); -- oinfo.oi_owner = free->xefi_owner; -+ xefi = container_of(item, struct xfs_extent_free_item, xefi_list); -+ ASSERT(xefi->xefi_blockcount == 1); -+ agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock); -+ agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); -+ oinfo.oi_owner = xefi->xefi_owner; - -- trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount); -+ trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, xefi->xefi_blockcount); - - pag = xfs_perag_get(mp, agno); - error = xfs_alloc_read_agf(pag, tp, 0, &agbp); -@@ -559,11 +558,11 @@ xfs_agfl_free_finish_item( - next_extent = efdp->efd_next_extent; - ASSERT(next_extent < efdp->efd_format.efd_nextents); - extp = &(efdp->efd_format.efd_extents[next_extent]); -- extp->ext_start = free->xefi_startblock; -- extp->ext_len = free->xefi_blockcount; -+ extp->ext_start = xefi->xefi_startblock; -+ extp->ext_len = xefi->xefi_blockcount; - efdp->efd_next_extent++; - -- kmem_cache_free(xfs_extfree_item_cache, free); -+ kmem_cache_free(xfs_extfree_item_cache, xefi); - return error; - } - -@@ -599,7 +598,6 @@ xfs_efi_item_recover( - struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_efd_log_item *efdp; - struct xfs_trans *tp; -- struct xfs_extent *extp; - int i; - int error = 0; - -@@ -624,10 +622,17 @@ xfs_efi_item_recover( - efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); - - for (i = 0; i < efip->efi_format.efi_nextents; i++) { -+ struct xfs_extent_free_item fake = { -+ .xefi_owner = XFS_RMAP_OWN_UNKNOWN, -+ }; -+ struct xfs_extent *extp; -+ - extp = &efip->efi_format.efi_extents[i]; -- error = xfs_trans_free_extent(tp, efdp, extp->ext_start, -- extp->ext_len, -- &XFS_RMAP_OINFO_ANY_OWNER, false); -+ -+ fake.xefi_startblock = extp->ext_start; -+ fake.xefi_blockcount = extp->ext_len; -+ -+ error = xfs_trans_free_extent(tp, efdp, &fake); - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - extp, sizeof(*extp)); -diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c -index 88a88506ffff..92ca2017eded 100644 ---- a/fs/xfs/xfs_fsmap.c -+++ b/fs/xfs/xfs_fsmap.c -@@ -761,6 +761,7 @@ xfs_getfsmap_datadev_bnobt( - { - struct xfs_alloc_rec_incore akeys[2]; - -+ memset(akeys, 0, sizeof(akeys)); - info->missing_owner = XFS_FMR_OWN_UNKNOWN; - return __xfs_getfsmap_datadev(tp, keys, info, - xfs_getfsmap_datadev_bnobt_query, &akeys[0]); -diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c -index 4d0a98f920ca..9edc1f2bc939 100644 ---- a/fs/xfs/xfs_globals.c -+++ b/fs/xfs/xfs_globals.c -@@ -4,6 +4,7 @@ - * All Rights Reserved. - */ - #include "xfs.h" -+#include "xfs_error.h" - - /* - * Tunable XFS parameters. 
xfs_params is required even when CONFIG_SYSCTL=n, -@@ -15,7 +16,7 @@ xfs_param_t xfs_params = { - /* MIN DFLT MAX */ - .sgid_inherit = { 0, 0, 1 }, - .symlink_mode = { 0, 0, 1 }, -- .panic_mask = { 0, 0, 256 }, -+ .panic_mask = { 0, 0, XFS_PTAG_MASK}, - .error_level = { 0, 3, 11 }, - .syncd_timer = { 1*100, 30*100, 7200*100}, - .stats_clear = { 0, 0, 1 }, -diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c -index fc1946f80a4a..69dbe7814128 100644 ---- a/fs/xfs/xfs_iomap.c -+++ b/fs/xfs/xfs_iomap.c -@@ -83,7 +83,7 @@ xfs_iomap_valid( - return true; - } - --static const struct iomap_page_ops xfs_iomap_page_ops = { -+static const struct iomap_folio_ops xfs_iomap_folio_ops = { - .iomap_valid = xfs_iomap_valid, - }; - -@@ -133,7 +133,7 @@ xfs_bmbt_to_iomap( - iomap->flags |= IOMAP_F_DIRTY; - - iomap->validity_cookie = sequence_cookie; -- iomap->page_ops = &xfs_iomap_page_ops; -+ iomap->folio_ops = &xfs_iomap_folio_ops; - return 0; - } - -diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c -index 858e3e9eb4a8..48d771a76add 100644 ---- a/fs/xfs/xfs_refcount_item.c -+++ b/fs/xfs/xfs_refcount_item.c -@@ -252,17 +252,12 @@ static int - xfs_trans_log_finish_refcount_update( - struct xfs_trans *tp, - struct xfs_cud_log_item *cudp, -- enum xfs_refcount_intent_type type, -- xfs_fsblock_t startblock, -- xfs_extlen_t blockcount, -- xfs_fsblock_t *new_fsb, -- xfs_extlen_t *new_len, -+ struct xfs_refcount_intent *ri, - struct xfs_btree_cur **pcur) - { - int error; - -- error = xfs_refcount_finish_one(tp, type, startblock, -- blockcount, new_fsb, new_len, pcur); -+ error = xfs_refcount_finish_one(tp, ri, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the -@@ -297,16 +292,16 @@ xfs_refcount_update_diff_items( - /* Set the phys extent flags for this reverse mapping. 
*/ - static void - xfs_trans_set_refcount_flags( -- struct xfs_phys_extent *refc, -+ struct xfs_phys_extent *pmap, - enum xfs_refcount_intent_type type) - { -- refc->pe_flags = 0; -+ pmap->pe_flags = 0; - switch (type) { - case XFS_REFCOUNT_INCREASE: - case XFS_REFCOUNT_DECREASE: - case XFS_REFCOUNT_ALLOC_COW: - case XFS_REFCOUNT_FREE_COW: -- refc->pe_flags |= type; -+ pmap->pe_flags |= type; - break; - default: - ASSERT(0); -@@ -318,10 +313,10 @@ STATIC void - xfs_refcount_update_log_item( - struct xfs_trans *tp, - struct xfs_cui_log_item *cuip, -- struct xfs_refcount_intent *refc) -+ struct xfs_refcount_intent *ri) - { - uint next_extent; -- struct xfs_phys_extent *ext; -+ struct xfs_phys_extent *pmap; - - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); -@@ -333,10 +328,10 @@ xfs_refcount_update_log_item( - */ - next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1; - ASSERT(next_extent < cuip->cui_format.cui_nextents); -- ext = &cuip->cui_format.cui_extents[next_extent]; -- ext->pe_startblock = refc->ri_startblock; -- ext->pe_len = refc->ri_blockcount; -- xfs_trans_set_refcount_flags(ext, refc->ri_type); -+ pmap = &cuip->cui_format.cui_extents[next_extent]; -+ pmap->pe_startblock = ri->ri_startblock; -+ pmap->pe_len = ri->ri_blockcount; -+ xfs_trans_set_refcount_flags(pmap, ri->ri_type); - } - - static struct xfs_log_item * -@@ -348,15 +343,15 @@ xfs_refcount_update_create_intent( - { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count); -- struct xfs_refcount_intent *refc; -+ struct xfs_refcount_intent *ri; - - ASSERT(count > 0); - - xfs_trans_add_item(tp, &cuip->cui_item); - if (sort) - list_sort(mp, items, xfs_refcount_update_diff_items); -- list_for_each_entry(refc, items, ri_list) -- xfs_refcount_update_log_item(tp, cuip, refc); -+ list_for_each_entry(ri, items, ri_list) -+ xfs_refcount_update_log_item(tp, cuip, ri); - return &cuip->cui_item; - } - -@@ -378,25 +373,20 @@ xfs_refcount_update_finish_item( - struct list_head *item, - struct xfs_btree_cur **state) - { -- struct xfs_refcount_intent *refc; -- xfs_fsblock_t new_fsb; -- xfs_extlen_t new_aglen; -+ struct xfs_refcount_intent *ri; - int error; - -- refc = container_of(item, struct xfs_refcount_intent, ri_list); -- error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), -- refc->ri_type, refc->ri_startblock, refc->ri_blockcount, -- &new_fsb, &new_aglen, state); -+ ri = container_of(item, struct xfs_refcount_intent, ri_list); -+ error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri, -+ state); - - /* Did we run out of reservation? Requeue what we didn't finish. 
*/ -- if (!error && new_aglen > 0) { -- ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE || -- refc->ri_type == XFS_REFCOUNT_DECREASE); -- refc->ri_startblock = new_fsb; -- refc->ri_blockcount = new_aglen; -+ if (!error && ri->ri_blockcount > 0) { -+ ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE || -+ ri->ri_type == XFS_REFCOUNT_DECREASE); - return -EAGAIN; - } -- kmem_cache_free(xfs_refcount_intent_cache, refc); -+ kmem_cache_free(xfs_refcount_intent_cache, ri); - return error; - } - -@@ -413,10 +403,10 @@ STATIC void - xfs_refcount_update_cancel_item( - struct list_head *item) - { -- struct xfs_refcount_intent *refc; -+ struct xfs_refcount_intent *ri; - -- refc = container_of(item, struct xfs_refcount_intent, ri_list); -- kmem_cache_free(xfs_refcount_intent_cache, refc); -+ ri = container_of(item, struct xfs_refcount_intent, ri_list); -+ kmem_cache_free(xfs_refcount_intent_cache, ri); - } - - const struct xfs_defer_op_type xfs_refcount_update_defer_type = { -@@ -433,15 +423,15 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { - static inline bool - xfs_cui_validate_phys( - struct xfs_mount *mp, -- struct xfs_phys_extent *refc) -+ struct xfs_phys_extent *pmap) - { - if (!xfs_has_reflink(mp)) - return false; - -- if (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS) -+ if (pmap->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS) - return false; - -- switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { -+ switch (pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { - case XFS_REFCOUNT_INCREASE: - case XFS_REFCOUNT_DECREASE: - case XFS_REFCOUNT_ALLOC_COW: -@@ -451,7 +441,7 @@ xfs_cui_validate_phys( - return false; - } - -- return xfs_verify_fsbext(mp, refc->pe_startblock, refc->pe_len); -+ return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len); - } - - /* -@@ -463,18 +453,13 @@ xfs_cui_item_recover( - struct xfs_log_item *lip, - struct list_head *capture_list) - { -- struct xfs_bmbt_irec irec; - struct xfs_cui_log_item *cuip = CUI_ITEM(lip); -- struct xfs_phys_extent *refc; - struct xfs_cud_log_item *cudp; - struct xfs_trans *tp; - struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = lip->li_log->l_mp; -- xfs_fsblock_t new_fsb; -- xfs_extlen_t new_len; - unsigned int refc_type; - bool requeue_only = false; -- enum xfs_refcount_intent_type type; - int i; - int error = 0; - -@@ -513,14 +498,17 @@ xfs_cui_item_recover( - cudp = xfs_trans_get_cud(tp, cuip); - - for (i = 0; i < cuip->cui_format.cui_nextents; i++) { -- refc = &cuip->cui_format.cui_extents[i]; -- refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; -+ struct xfs_refcount_intent fake = { }; -+ struct xfs_phys_extent *pmap; -+ -+ pmap = &cuip->cui_format.cui_extents[i]; -+ refc_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; - switch (refc_type) { - case XFS_REFCOUNT_INCREASE: - case XFS_REFCOUNT_DECREASE: - case XFS_REFCOUNT_ALLOC_COW: - case XFS_REFCOUNT_FREE_COW: -- type = refc_type; -+ fake.ri_type = refc_type; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, -@@ -529,13 +517,12 @@ xfs_cui_item_recover( - error = -EFSCORRUPTED; - goto abort_error; - } -- if (requeue_only) { -- new_fsb = refc->pe_startblock; -- new_len = refc->pe_len; -- } else -+ -+ fake.ri_startblock = pmap->pe_startblock; -+ fake.ri_blockcount = pmap->pe_len; -+ if (!requeue_only) - error = xfs_trans_log_finish_refcount_update(tp, cudp, -- type, refc->pe_startblock, refc->pe_len, -- &new_fsb, &new_len, &rcur); -+ &fake, &rcur); - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, 
mp, - &cuip->cui_format, -@@ -544,10 +531,13 @@ xfs_cui_item_recover( - goto abort_error; - - /* Requeue what we didn't finish. */ -- if (new_len > 0) { -- irec.br_startblock = new_fsb; -- irec.br_blockcount = new_len; -- switch (type) { -+ if (fake.ri_blockcount > 0) { -+ struct xfs_bmbt_irec irec = { -+ .br_startblock = fake.ri_startblock, -+ .br_blockcount = fake.ri_blockcount, -+ }; -+ -+ switch (fake.ri_type) { - case XFS_REFCOUNT_INCREASE: - xfs_refcount_increase_extent(tp, &irec); - break; -@@ -596,18 +586,18 @@ xfs_cui_item_relog( - { - struct xfs_cud_log_item *cudp; - struct xfs_cui_log_item *cuip; -- struct xfs_phys_extent *extp; -+ struct xfs_phys_extent *pmap; - unsigned int count; - - count = CUI_ITEM(intent)->cui_format.cui_nextents; -- extp = CUI_ITEM(intent)->cui_format.cui_extents; -+ pmap = CUI_ITEM(intent)->cui_format.cui_extents; - - tp->t_flags |= XFS_TRANS_DIRTY; - cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); - - cuip = xfs_cui_init(tp->t_mountp, count); -- memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp)); -+ memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); - atomic_set(&cuip->cui_next_extent, count); - xfs_trans_add_item(tp, &cuip->cui_item); - set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); -diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c -index 534504ede1a3..a1619d67015f 100644 ---- a/fs/xfs/xfs_rmap_item.c -+++ b/fs/xfs/xfs_rmap_item.c -@@ -244,40 +244,40 @@ xfs_trans_get_rud( - /* Set the map extent flags for this reverse mapping. */ - static void - xfs_trans_set_rmap_flags( -- struct xfs_map_extent *rmap, -+ struct xfs_map_extent *map, - enum xfs_rmap_intent_type type, - int whichfork, - xfs_exntst_t state) - { -- rmap->me_flags = 0; -+ map->me_flags = 0; - if (state == XFS_EXT_UNWRITTEN) -- rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; -+ map->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; - if (whichfork == XFS_ATTR_FORK) -- rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; -+ map->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; - switch (type) { - case XFS_RMAP_MAP: -- rmap->me_flags |= XFS_RMAP_EXTENT_MAP; -+ map->me_flags |= XFS_RMAP_EXTENT_MAP; - break; - case XFS_RMAP_MAP_SHARED: -- rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; -+ map->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; - break; - case XFS_RMAP_UNMAP: -- rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP; -+ map->me_flags |= XFS_RMAP_EXTENT_UNMAP; - break; - case XFS_RMAP_UNMAP_SHARED: -- rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; -+ map->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; - break; - case XFS_RMAP_CONVERT: -- rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT; -+ map->me_flags |= XFS_RMAP_EXTENT_CONVERT; - break; - case XFS_RMAP_CONVERT_SHARED: -- rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; -+ map->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; - break; - case XFS_RMAP_ALLOC: -- rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC; -+ map->me_flags |= XFS_RMAP_EXTENT_ALLOC; - break; - case XFS_RMAP_FREE: -- rmap->me_flags |= XFS_RMAP_EXTENT_FREE; -+ map->me_flags |= XFS_RMAP_EXTENT_FREE; - break; - default: - ASSERT(0); -@@ -293,19 +293,12 @@ static int - xfs_trans_log_finish_rmap_update( - struct xfs_trans *tp, - struct xfs_rud_log_item *rudp, -- enum xfs_rmap_intent_type type, -- uint64_t owner, -- int whichfork, -- xfs_fileoff_t startoff, -- xfs_fsblock_t startblock, -- xfs_filblks_t blockcount, -- xfs_exntst_t state, -+ struct xfs_rmap_intent *ri, - struct xfs_btree_cur **pcur) - { - int error; - -- error = xfs_rmap_finish_one(tp, 
type, owner, whichfork, startoff, -- startblock, blockcount, state, pcur); -+ error = xfs_rmap_finish_one(tp, ri, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the -@@ -342,7 +335,7 @@ STATIC void - xfs_rmap_update_log_item( - struct xfs_trans *tp, - struct xfs_rui_log_item *ruip, -- struct xfs_rmap_intent *rmap) -+ struct xfs_rmap_intent *ri) - { - uint next_extent; - struct xfs_map_extent *map; -@@ -358,12 +351,12 @@ xfs_rmap_update_log_item( - next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1; - ASSERT(next_extent < ruip->rui_format.rui_nextents); - map = &ruip->rui_format.rui_extents[next_extent]; -- map->me_owner = rmap->ri_owner; -- map->me_startblock = rmap->ri_bmap.br_startblock; -- map->me_startoff = rmap->ri_bmap.br_startoff; -- map->me_len = rmap->ri_bmap.br_blockcount; -- xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork, -- rmap->ri_bmap.br_state); -+ map->me_owner = ri->ri_owner; -+ map->me_startblock = ri->ri_bmap.br_startblock; -+ map->me_startoff = ri->ri_bmap.br_startoff; -+ map->me_len = ri->ri_bmap.br_blockcount; -+ xfs_trans_set_rmap_flags(map, ri->ri_type, ri->ri_whichfork, -+ ri->ri_bmap.br_state); - } - - static struct xfs_log_item * -@@ -375,15 +368,15 @@ xfs_rmap_update_create_intent( - { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count); -- struct xfs_rmap_intent *rmap; -+ struct xfs_rmap_intent *ri; - - ASSERT(count > 0); - - xfs_trans_add_item(tp, &ruip->rui_item); - if (sort) - list_sort(mp, items, xfs_rmap_update_diff_items); -- list_for_each_entry(rmap, items, ri_list) -- xfs_rmap_update_log_item(tp, ruip, rmap); -+ list_for_each_entry(ri, items, ri_list) -+ xfs_rmap_update_log_item(tp, ruip, ri); - return &ruip->rui_item; - } - -@@ -405,16 +398,14 @@ xfs_rmap_update_finish_item( - struct list_head *item, - struct xfs_btree_cur **state) - { -- struct xfs_rmap_intent *rmap; -+ struct xfs_rmap_intent *ri; - int error; - -- rmap = container_of(item, struct xfs_rmap_intent, ri_list); -- error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), -- rmap->ri_type, rmap->ri_owner, rmap->ri_whichfork, -- rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock, -- rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state, -+ ri = container_of(item, struct xfs_rmap_intent, ri_list); -+ -+ error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri, - state); -- kmem_cache_free(xfs_rmap_intent_cache, rmap); -+ kmem_cache_free(xfs_rmap_intent_cache, ri); - return error; - } - -@@ -431,10 +422,10 @@ STATIC void - xfs_rmap_update_cancel_item( - struct list_head *item) - { -- struct xfs_rmap_intent *rmap; -+ struct xfs_rmap_intent *ri; - -- rmap = container_of(item, struct xfs_rmap_intent, ri_list); -- kmem_cache_free(xfs_rmap_intent_cache, rmap); -+ ri = container_of(item, struct xfs_rmap_intent, ri_list); -+ kmem_cache_free(xfs_rmap_intent_cache, ri); - } - - const struct xfs_defer_op_type xfs_rmap_update_defer_type = { -@@ -451,15 +442,15 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = { - static inline bool - xfs_rui_validate_map( - struct xfs_mount *mp, -- struct xfs_map_extent *rmap) -+ struct xfs_map_extent *map) - { - if (!xfs_has_rmapbt(mp)) - return false; - -- if (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS) -+ if (map->me_flags & ~XFS_RMAP_EXTENT_FLAGS) - return false; - -- switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { -+ switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { - case XFS_RMAP_EXTENT_MAP: - case XFS_RMAP_EXTENT_MAP_SHARED: - case 
XFS_RMAP_EXTENT_UNMAP: -@@ -473,14 +464,14 @@ xfs_rui_validate_map( - return false; - } - -- if (!XFS_RMAP_NON_INODE_OWNER(rmap->me_owner) && -- !xfs_verify_ino(mp, rmap->me_owner)) -+ if (!XFS_RMAP_NON_INODE_OWNER(map->me_owner) && -+ !xfs_verify_ino(mp, map->me_owner)) - return false; - -- if (!xfs_verify_fileext(mp, rmap->me_startoff, rmap->me_len)) -+ if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) - return false; - -- return xfs_verify_fsbext(mp, rmap->me_startblock, rmap->me_len); -+ return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); - } - - /* -@@ -493,15 +484,11 @@ xfs_rui_item_recover( - struct list_head *capture_list) - { - struct xfs_rui_log_item *ruip = RUI_ITEM(lip); -- struct xfs_map_extent *rmap; - struct xfs_rud_log_item *rudp; - struct xfs_trans *tp; - struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = lip->li_log->l_mp; -- enum xfs_rmap_intent_type type; -- xfs_exntst_t state; - int i; -- int whichfork; - int error = 0; - - /* -@@ -526,35 +513,34 @@ xfs_rui_item_recover( - rudp = xfs_trans_get_rud(tp, ruip); - - for (i = 0; i < ruip->rui_format.rui_nextents; i++) { -- rmap = &ruip->rui_format.rui_extents[i]; -- state = (rmap->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? -- XFS_EXT_UNWRITTEN : XFS_EXT_NORM; -- whichfork = (rmap->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? -- XFS_ATTR_FORK : XFS_DATA_FORK; -- switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { -+ struct xfs_rmap_intent fake = { }; -+ struct xfs_map_extent *map; -+ -+ map = &ruip->rui_format.rui_extents[i]; -+ switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { - case XFS_RMAP_EXTENT_MAP: -- type = XFS_RMAP_MAP; -+ fake.ri_type = XFS_RMAP_MAP; - break; - case XFS_RMAP_EXTENT_MAP_SHARED: -- type = XFS_RMAP_MAP_SHARED; -+ fake.ri_type = XFS_RMAP_MAP_SHARED; - break; - case XFS_RMAP_EXTENT_UNMAP: -- type = XFS_RMAP_UNMAP; -+ fake.ri_type = XFS_RMAP_UNMAP; - break; - case XFS_RMAP_EXTENT_UNMAP_SHARED: -- type = XFS_RMAP_UNMAP_SHARED; -+ fake.ri_type = XFS_RMAP_UNMAP_SHARED; - break; - case XFS_RMAP_EXTENT_CONVERT: -- type = XFS_RMAP_CONVERT; -+ fake.ri_type = XFS_RMAP_CONVERT; - break; - case XFS_RMAP_EXTENT_CONVERT_SHARED: -- type = XFS_RMAP_CONVERT_SHARED; -+ fake.ri_type = XFS_RMAP_CONVERT_SHARED; - break; - case XFS_RMAP_EXTENT_ALLOC: -- type = XFS_RMAP_ALLOC; -+ fake.ri_type = XFS_RMAP_ALLOC; - break; - case XFS_RMAP_EXTENT_FREE: -- type = XFS_RMAP_FREE; -+ fake.ri_type = XFS_RMAP_FREE; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, -@@ -563,13 +549,21 @@ xfs_rui_item_recover( - error = -EFSCORRUPTED; - goto abort_error; - } -- error = xfs_trans_log_finish_rmap_update(tp, rudp, type, -- rmap->me_owner, whichfork, -- rmap->me_startoff, rmap->me_startblock, -- rmap->me_len, state, &rcur); -+ -+ fake.ri_owner = map->me_owner; -+ fake.ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? -+ XFS_ATTR_FORK : XFS_DATA_FORK; -+ fake.ri_bmap.br_startblock = map->me_startblock; -+ fake.ri_bmap.br_startoff = map->me_startoff; -+ fake.ri_bmap.br_blockcount = map->me_len; -+ fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? 
-+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; -+ -+ error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake, -+ &rcur); - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, -- rmap, sizeof(*rmap)); -+ map, sizeof(*map)); - if (error) - goto abort_error; - -@@ -600,18 +594,18 @@ xfs_rui_item_relog( - { - struct xfs_rud_log_item *rudp; - struct xfs_rui_log_item *ruip; -- struct xfs_map_extent *extp; -+ struct xfs_map_extent *map; - unsigned int count; - - count = RUI_ITEM(intent)->rui_format.rui_nextents; -- extp = RUI_ITEM(intent)->rui_format.rui_extents; -+ map = RUI_ITEM(intent)->rui_format.rui_extents; - - tp->t_flags |= XFS_TRANS_DIRTY; - rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); - - ruip = xfs_rui_init(tp->t_mountp, count); -- memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp)); -+ memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); - atomic_set(&ruip->rui_next_extent, count); - xfs_trans_add_item(tp, &ruip->rui_item); - set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); -diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c -index f7faf6e70d7f..a3c6b1548723 100644 ---- a/fs/xfs/xfs_sysfs.c -+++ b/fs/xfs/xfs_sysfs.c -@@ -69,7 +69,7 @@ static struct attribute *xfs_mp_attrs[] = { - }; - ATTRIBUTE_GROUPS(xfs_mp); - --struct kobj_type xfs_mp_ktype = { -+const struct kobj_type xfs_mp_ktype = { - .release = xfs_sysfs_release, - .sysfs_ops = &xfs_sysfs_ops, - .default_groups = xfs_mp_groups, -@@ -266,7 +266,7 @@ static struct attribute *xfs_dbg_attrs[] = { - }; - ATTRIBUTE_GROUPS(xfs_dbg); - --struct kobj_type xfs_dbg_ktype = { -+const struct kobj_type xfs_dbg_ktype = { - .release = xfs_sysfs_release, - .sysfs_ops = &xfs_sysfs_ops, - .default_groups = xfs_dbg_groups, -@@ -324,7 +324,7 @@ static struct attribute *xfs_stats_attrs[] = { - }; - ATTRIBUTE_GROUPS(xfs_stats); - --struct kobj_type xfs_stats_ktype = { -+const struct kobj_type xfs_stats_ktype = { - .release = xfs_sysfs_release, - .sysfs_ops = &xfs_sysfs_ops, - .default_groups = xfs_stats_groups, -@@ -410,7 +410,7 @@ static struct attribute *xfs_log_attrs[] = { - }; - ATTRIBUTE_GROUPS(xfs_log); - --struct kobj_type xfs_log_ktype = { -+const struct kobj_type xfs_log_ktype = { - .release = xfs_sysfs_release, - .sysfs_ops = &xfs_sysfs_ops, - .default_groups = xfs_log_groups, -@@ -564,13 +564,13 @@ static struct attribute *xfs_error_attrs[] = { - }; - ATTRIBUTE_GROUPS(xfs_error); - --static struct kobj_type xfs_error_cfg_ktype = { -+static const struct kobj_type xfs_error_cfg_ktype = { - .release = xfs_sysfs_release, - .sysfs_ops = &xfs_sysfs_ops, - .default_groups = xfs_error_groups, - }; - --static struct kobj_type xfs_error_ktype = { -+static const struct kobj_type xfs_error_ktype = { - .release = xfs_sysfs_release, - .sysfs_ops = &xfs_sysfs_ops, - }; -diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h -index 513095e353a5..148893ebfdef 100644 ---- a/fs/xfs/xfs_sysfs.h -+++ b/fs/xfs/xfs_sysfs.h -@@ -7,10 +7,10 @@ - #ifndef __XFS_SYSFS_H__ - #define __XFS_SYSFS_H__ - --extern struct kobj_type xfs_mp_ktype; /* xfs_mount */ --extern struct kobj_type xfs_dbg_ktype; /* debug */ --extern struct kobj_type xfs_log_ktype; /* xlog */ --extern struct kobj_type xfs_stats_ktype; /* stats */ -+extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */ -+extern const struct kobj_type xfs_dbg_ktype; /* debug */ -+extern const struct kobj_type xfs_log_ktype; /* xlog */ -+extern const struct kobj_type xfs_stats_ktype; /* stats */ - - static inline 
struct xfs_kobj * - to_kobj(struct kobject *kobject) -@@ -28,7 +28,7 @@ xfs_sysfs_release(struct kobject *kobject) - static inline int - xfs_sysfs_init( - struct xfs_kobj *kobj, -- struct kobj_type *ktype, -+ const struct kobj_type *ktype, - struct xfs_kobj *parent_kobj, - const char *name) - { -diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h -index 421d1e504ac4..6b0e9ae7c513 100644 ---- a/fs/xfs/xfs_trace.h -+++ b/fs/xfs/xfs_trace.h -@@ -3207,17 +3207,14 @@ DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred); - - TRACE_EVENT(xfs_refcount_finish_one_leftover, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, -- int type, xfs_agblock_t agbno, xfs_extlen_t len, -- xfs_agblock_t new_agbno, xfs_extlen_t new_len), -- TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len), -+ int type, xfs_agblock_t agbno, xfs_extlen_t len), -+ TP_ARGS(mp, agno, type, agbno, len), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_agnumber_t, agno) - __field(int, type) - __field(xfs_agblock_t, agbno) - __field(xfs_extlen_t, len) -- __field(xfs_agblock_t, new_agbno) -- __field(xfs_extlen_t, new_len) - ), - TP_fast_assign( - __entry->dev = mp->m_super->s_dev; -@@ -3225,17 +3222,13 @@ TRACE_EVENT(xfs_refcount_finish_one_leftover, - __entry->type = type; - __entry->agbno = agbno; - __entry->len = len; -- __entry->new_agbno = new_agbno; -- __entry->new_len = new_len; - ), -- TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x new_agbno 0x%x new_fsbcount 0x%x", -+ TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, - __entry->agno, - __entry->agbno, -- __entry->len, -- __entry->new_agbno, -- __entry->new_len) -+ __entry->len) - ); - - /* simple inode-based error/%ip tracepoint class */ -diff --git a/include/linux/bio.h b/include/linux/bio.h -index c1da63f6c808..d766be7152e1 100644 ---- a/include/linux/bio.h -+++ b/include/linux/bio.h -@@ -12,6 +12,8 @@ - - #define BIO_MAX_VECS 256U - -+struct queue_limits; -+ - static inline unsigned int bio_max_segs(unsigned int nr_segs) - { - return min(nr_segs, BIO_MAX_VECS); -@@ -375,6 +377,8 @@ static inline void bip_set_seed(struct bio_integrity_payload *bip, - void bio_trim(struct bio *bio, sector_t offset, sector_t size); - extern struct bio *bio_split(struct bio *bio, int sectors, - gfp_t gfp, struct bio_set *bs); -+struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, -+ unsigned *segs, struct bio_set *bs, unsigned max_bytes); - - /** - * bio_next_split - get next @sectors from a bio, splitting if necessary -diff --git a/include/linux/iomap.h b/include/linux/iomap.h -index 0983dfc9a203..0f8123504e5e 100644 ---- a/include/linux/iomap.h -+++ b/include/linux/iomap.h -@@ -13,6 +13,7 @@ - struct address_space; - struct fiemap_extent_info; - struct inode; -+struct iomap_iter; - struct iomap_dio; - struct iomap_writepage_ctx; - struct iov_iter; -@@ -58,8 +59,7 @@ struct vm_fault; - #define IOMAP_F_SHARED (1U << 2) - #define IOMAP_F_MERGED (1U << 3) - #define IOMAP_F_BUFFER_HEAD (1U << 4) --#define IOMAP_F_ZONE_APPEND (1U << 5) --#define IOMAP_F_XATTR (1U << 6) -+#define IOMAP_F_XATTR (1U << 5) - - /* - * Flags set by the core iomap code during operations: -@@ -85,7 +85,7 @@ struct vm_fault; - */ - #define IOMAP_NULL_ADDR -1ULL /* addr is not valid */ - --struct iomap_page_ops; -+struct iomap_folio_ops; - - struct iomap { - u64 addr; /* disk offset of mapping, bytes */ -@@ -97,7 +97,7 @@ struct iomap { - struct dax_device *dax_dev; /* dax_dev for dax operations 
*/ - void *inline_data; - void *private; /* filesystem private */ -- const struct iomap_page_ops *page_ops; -+ const struct iomap_folio_ops *folio_ops; - u64 validity_cookie; /* used with .iomap_valid() */ - }; - -@@ -125,19 +125,20 @@ static inline bool iomap_inline_data_valid(const struct iomap *iomap) - } - - /* -- * When a filesystem sets page_ops in an iomap mapping it returns, page_prepare -- * and page_done will be called for each page written to. This only applies to -- * buffered writes as unbuffered writes will not typically have pages -+ * When a filesystem sets folio_ops in an iomap mapping it returns, get_folio -+ * and put_folio will be called for each folio written to. This only applies -+ * to buffered writes as unbuffered writes will not typically have folios - * associated with them. - * -- * When page_prepare succeeds, page_done will always be called to do any -- * cleanup work necessary. In that page_done call, @page will be NULL if the -- * associated page could not be obtained. -+ * When get_folio succeeds, put_folio will always be called to do any -+ * cleanup work necessary. put_folio is responsible for unlocking and putting -+ * @folio. - */ --struct iomap_page_ops { -- int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len); -- void (*page_done)(struct inode *inode, loff_t pos, unsigned copied, -- struct page *page); -+struct iomap_folio_ops { -+ struct folio *(*get_folio)(struct iomap_iter *iter, loff_t pos, -+ unsigned len); -+ void (*put_folio)(struct inode *inode, loff_t pos, unsigned copied, -+ struct folio *folio); - - /* - * Check that the cached iomap still maps correctly to the filesystem's -@@ -260,6 +261,7 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, - int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); - void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); - bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); -+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos); - bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); - void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); - int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, -diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h -index 6548b5b5aa60..75d7d22c3a27 100644 ---- a/include/trace/events/btrfs.h -+++ b/include/trace/events/btrfs.h -@@ -32,6 +32,7 @@ struct prelim_ref; - struct btrfs_space_info; - struct btrfs_raid_bio; - struct raid56_bio_trace_info; -+struct find_free_extent_ctl; - - #define show_ref_type(type) \ - __print_symbolic(type, \ -@@ -1241,76 +1242,156 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, - - TRACE_EVENT(find_free_extent, - -- TP_PROTO(const struct btrfs_root *root, u64 num_bytes, -- u64 empty_size, u64 data), -+ TP_PROTO(const struct btrfs_root *root, -+ const struct find_free_extent_ctl *ffe_ctl), - -- TP_ARGS(root, num_bytes, empty_size, data), -+ TP_ARGS(root, ffe_ctl), - - TP_STRUCT__entry_btrfs( - __field( u64, root_objectid ) - __field( u64, num_bytes ) - __field( u64, empty_size ) -- __field( u64, data ) -+ __field( u64, flags ) - ), - - TP_fast_assign_btrfs(root->fs_info, - __entry->root_objectid = root->root_key.objectid; -- __entry->num_bytes = num_bytes; -- __entry->empty_size = empty_size; -- __entry->data = data; -+ __entry->num_bytes = ffe_ctl->num_bytes; -+ __entry->empty_size = ffe_ctl->empty_size; -+ __entry->flags = ffe_ctl->flags; - ), - - 
TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)", - show_root_type(__entry->root_objectid), -- __entry->num_bytes, __entry->empty_size, __entry->data, -- __print_flags((unsigned long)__entry->data, "|", -+ __entry->num_bytes, __entry->empty_size, __entry->flags, -+ __print_flags((unsigned long)__entry->flags, "|", -+ BTRFS_GROUP_FLAGS)) -+); -+ -+TRACE_EVENT(find_free_extent_search_loop, -+ -+ TP_PROTO(const struct btrfs_root *root, -+ const struct find_free_extent_ctl *ffe_ctl), -+ -+ TP_ARGS(root, ffe_ctl), -+ -+ TP_STRUCT__entry_btrfs( -+ __field( u64, root_objectid ) -+ __field( u64, num_bytes ) -+ __field( u64, empty_size ) -+ __field( u64, flags ) -+ __field( u64, loop ) -+ ), -+ -+ TP_fast_assign_btrfs(root->fs_info, -+ __entry->root_objectid = root->root_key.objectid; -+ __entry->num_bytes = ffe_ctl->num_bytes; -+ __entry->empty_size = ffe_ctl->empty_size; -+ __entry->flags = ffe_ctl->flags; -+ __entry->loop = ffe_ctl->loop; -+ ), -+ -+ TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu", -+ show_root_type(__entry->root_objectid), -+ __entry->num_bytes, __entry->empty_size, __entry->flags, -+ __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), -+ __entry->loop) -+); -+ -+TRACE_EVENT(find_free_extent_have_block_group, -+ -+ TP_PROTO(const struct btrfs_root *root, -+ const struct find_free_extent_ctl *ffe_ctl, -+ const struct btrfs_block_group *block_group), -+ -+ TP_ARGS(root, ffe_ctl, block_group), -+ -+ TP_STRUCT__entry_btrfs( -+ __field( u64, root_objectid ) -+ __field( u64, num_bytes ) -+ __field( u64, empty_size ) -+ __field( u64, flags ) -+ __field( u64, loop ) -+ __field( bool, hinted ) -+ __field( u64, bg_start ) -+ __field( u64, bg_flags ) -+ ), -+ -+ TP_fast_assign_btrfs(root->fs_info, -+ __entry->root_objectid = root->root_key.objectid; -+ __entry->num_bytes = ffe_ctl->num_bytes; -+ __entry->empty_size = ffe_ctl->empty_size; -+ __entry->flags = ffe_ctl->flags; -+ __entry->loop = ffe_ctl->loop; -+ __entry->hinted = ffe_ctl->hinted; -+ __entry->bg_start = block_group->start; -+ __entry->bg_flags = block_group->flags; -+ ), -+ -+ TP_printk_btrfs( -+"root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu hinted=%d block_group=%llu bg_flags=%llu(%s)", -+ show_root_type(__entry->root_objectid), -+ __entry->num_bytes, __entry->empty_size, __entry->flags, -+ __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), -+ __entry->loop, __entry->hinted, -+ __entry->bg_start, __entry->bg_flags, -+ __print_flags((unsigned long)__entry->bg_flags, "|", - BTRFS_GROUP_FLAGS)) - ); - - DECLARE_EVENT_CLASS(btrfs__reserve_extent, - -- TP_PROTO(const struct btrfs_block_group *block_group, u64 start, -- u64 len), -+ TP_PROTO(const struct btrfs_block_group *block_group, -+ const struct find_free_extent_ctl *ffe_ctl), - -- TP_ARGS(block_group, start, len), -+ TP_ARGS(block_group, ffe_ctl), - - TP_STRUCT__entry_btrfs( - __field( u64, bg_objectid ) - __field( u64, flags ) -+ __field( int, bg_size_class ) - __field( u64, start ) - __field( u64, len ) -+ __field( u64, loop ) -+ __field( bool, hinted ) -+ __field( int, size_class ) - ), - - TP_fast_assign_btrfs(block_group->fs_info, - __entry->bg_objectid = block_group->start; - __entry->flags = block_group->flags; -- __entry->start = start; -- __entry->len = len; -+ __entry->bg_size_class = block_group->size_class; -+ __entry->start = ffe_ctl->search_start; -+ __entry->len = ffe_ctl->num_bytes; -+ __entry->loop = ffe_ctl->loop; -+ __entry->hinted = 
ffe_ctl->hinted; -+ __entry->size_class = ffe_ctl->size_class; - ), - -- TP_printk_btrfs("root=%llu(%s) block_group=%llu flags=%llu(%s) " -- "start=%llu len=%llu", -+ TP_printk_btrfs( -+"root=%llu(%s) block_group=%llu flags=%llu(%s) bg_size_class=%d start=%llu len=%llu loop=%llu hinted=%d size_class=%d", - show_root_type(BTRFS_EXTENT_TREE_OBJECTID), - __entry->bg_objectid, - __entry->flags, __print_flags((unsigned long)__entry->flags, - "|", BTRFS_GROUP_FLAGS), -- __entry->start, __entry->len) -+ __entry->bg_size_class, __entry->start, __entry->len, -+ __entry->loop, __entry->hinted, __entry->size_class) - ); - - DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, - -- TP_PROTO(const struct btrfs_block_group *block_group, u64 start, -- u64 len), -+ TP_PROTO(const struct btrfs_block_group *block_group, -+ const struct find_free_extent_ctl *ffe_ctl), - -- TP_ARGS(block_group, start, len) -+ TP_ARGS(block_group, ffe_ctl) - ); - - DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, - -- TP_PROTO(const struct btrfs_block_group *block_group, u64 start, -- u64 len), -+ TP_PROTO(const struct btrfs_block_group *block_group, -+ const struct find_free_extent_ctl *ffe_ctl), - -- TP_ARGS(block_group, start, len) -+ TP_ARGS(block_group, ffe_ctl) - ); - - TRACE_EVENT(btrfs_find_cluster, -diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h -index 77b426ae0064..ebccf6a6aa1b 100644 ---- a/include/trace/events/ext4.h -+++ b/include/trace/events/ext4.h -@@ -584,13 +584,6 @@ DECLARE_EVENT_CLASS(ext4__page_op, - (unsigned long) __entry->index) - ); - --DEFINE_EVENT(ext4__page_op, ext4_writepage, -- -- TP_PROTO(struct page *page), -- -- TP_ARGS(page) --); -- - DEFINE_EVENT(ext4__page_op, ext4_readpage, - - TP_PROTO(struct page *page), --- -2.40.0.rc2 - -From 31bc464783789781c2a6885b36f63fcb3751a5bb Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Fri, 10 Mar 2023 18:05:48 +0100 -Subject: [PATCH 08/16] Implement amd-pstate-epp and amd-pstate-guided driver - -Signed-off-by: Peter Jung ---- - .../admin-guide/kernel-parameters.txt | 33 +- - Documentation/admin-guide/pm/amd-pstate.rst | 95 ++- - drivers/acpi/cppc_acpi.c | 188 ++++- - drivers/cpufreq/amd-pstate.c | 794 +++++++++++++++++- - drivers/cpufreq/brcmstb-avs-cpufreq.c | 5 +- - drivers/cpufreq/cpufreq.c | 8 +- - drivers/cpufreq/mediatek-cpufreq-hw.c | 4 +- - drivers/cpufreq/omap-cpufreq.c | 4 +- - drivers/cpufreq/qcom-cpufreq-hw.c | 4 +- - include/acpi/cppc_acpi.h | 23 + - include/linux/amd-pstate.h | 34 + - include/linux/cpufreq.h | 2 +- - 12 files changed, 1136 insertions(+), 58 deletions(-) + .../admin-guide/kernel-parameters.txt | 40 ++-- + Documentation/admin-guide/pm/amd-pstate.rst | 31 ++- + drivers/acpi/cppc_acpi.c | 121 +++++++++++- + drivers/cpufreq/amd-pstate.c | 177 +++++++++++++----- + include/acpi/cppc_acpi.h | 11 ++ + include/linux/amd-pstate.h | 2 + + 6 files changed, 302 insertions(+), 80 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 9595abf34974..f39b8f05392c 100644 +index 4f6761a93715..bf2a402af231 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -339,6 +339,29 @@ @@ -26717,7 +9065,7 @@ index 9595abf34974..f39b8f05392c 100644 amijoy.map= [HW,JOY] Amiga joystick support Map of devices attached to JOY0DAT and JOY1DAT Format: , -@@ -7019,13 +7042,3 @@ +@@ -7068,20 +7091,3 @@ xmon commands. off xmon is disabled. 
@@ -26731,57 +9079,28 @@ index 9595abf34974..f39b8f05392c 100644 - management firmware translates the requests into actual - hardware states (core frequency, data fabric and memory - clocks etc.) +- active +- Use amd_pstate_epp driver instance as the scaling driver, +- driver provides a hint to the hardware if software wants +- to bias toward performance (0x0) or energy efficiency (0xff) +- to the CPPC firmware. then CPPC power algorithm will +- calculate the runtime workload and adjust the realtime cores +- frequency. diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst -index 5376d53faaa8..f24a90007e98 100644 +index 6e5298b521b1..1cf40f69278c 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst -@@ -230,8 +230,8 @@ with :c:macro:`MSR_AMD_CPPC_ENABLE` or ``cppc_set_enable``, it will respond - to the request from AMD P-States. +@@ -303,13 +303,18 @@ efficiency frequency management method on AMD processors. + AMD Pstate Driver Operation Modes + ================================= - --User Space Interface in ``sysfs`` --================================== -+User Space Interface in ``sysfs`` - Per-policy control -+====================================================== - - ``amd-pstate`` exposes several global attributes (files) in ``sysfs`` to - control its functionality at the system level. They are located in the -@@ -262,6 +262,25 @@ lowest non-linear performance in `AMD CPPC Performance Capability - `_.) - This attribute is read-only. - -+``energy_performance_available_preferences`` -+ -+A list of all the supported EPP preferences that could be used for -+``energy_performance_preference`` on this system. -+These profiles represent different hints that are provided -+to the low-level firmware about the user's desired energy vs efficiency -+tradeoff. ``default`` represents the epp value is set by platform -+firmware. This attribute is read-only. -+ -+``energy_performance_preference`` -+ -+The current energy performance preference can be read from this attribute. -+and user can change current preference according to energy or performance needs -+Please get all support profiles list from -+``energy_performance_available_preferences`` attribute, all the profiles are -+integer values defined between 0 to 255 when EPP feature is enabled by platform -+firmware, if EPP feature is disabled, driver will ignore the written value -+This attribute is read-write. -+ - Other performance and frequency values can be read back from - ``/sys/devices/system/cpu/cpuX/acpi_cppc/``, see :ref:`cppc_sysfs`. - -@@ -280,8 +299,35 @@ module which supports the new AMD P-States mechanism on most of the future AMD - platforms. The AMD P-States mechanism is the more performance and energy - efficiency frequency management method on AMD processors. - --Kernel Module Options for ``amd-pstate`` --========================================= -+ -+AMD Pstate Driver Operation Modes -+================================= -+ +-``amd_pstate`` CPPC has two operation modes: CPPC Autonomous(active) mode and +-CPPC non-autonomous(passive) mode. +-active mode and passive mode can be chosen by different kernel parameters. +-When in Autonomous mode, CPPC ignores requests done in the Desired Performance +-Target register and takes into account only the values set to the Minimum requested +-performance, Maximum requested performance, and Energy Performance Preference +-registers. 
When Autonomous is disabled, it only considers the Desired Performance Target. +``amd_pstate`` CPPC has 3 operation modes: autonomous (active) mode, +non-autonomous (passive) mode and guided autonomous (guided) mode. +Active/passive/guided mode can be chosen by different kernel parameters. @@ -26794,23 +9113,10 @@ index 5376d53faaa8..f24a90007e98 100644 +- In guided-autonomous mode, platform sets operating performance level + autonomously according to the current workload and within the limits set by + OS through min and max performance registers. -+ -+Active Mode -+------------ -+ -+``amd_pstate=active`` -+ -+This is the low-level firmware control mode which is implemented by ``amd_pstate_epp`` -+driver with ``amd_pstate=active`` passed to the kernel in the command line. -+In this mode, ``amd_pstate_epp`` driver provides a hint to the hardware if software -+wants to bias toward performance (0x0) or energy efficiency (0xff) to the CPPC firmware. -+then CPPC power algorithm will calculate the runtime workload and adjust the realtime -+cores frequency according to the power supply and thermal, core voltage and some other -+hardware conditions. - Passive Mode + Active Mode ------------ -@@ -297,6 +343,47 @@ to the Performance Reduction Tolerance register. Above the nominal performance l +@@ -338,6 +343,15 @@ to the Performance Reduction Tolerance register. Above the nominal performance l processor must provide at least nominal performance requested and go higher if current operating conditions allow. @@ -26823,123 +9129,27 @@ index 5376d53faaa8..f24a90007e98 100644 +is activated. In this mode, driver requests minimum and maximum performance +level and the platform autonomously selects a performance level in this range +and appropriate to the current workload. -+ -+User Space Interface in ``sysfs`` - General -+=========================================== -+ -+Global Attributes -+----------------- -+ -+``amd-pstate`` exposes several global attributes (files) in ``sysfs`` to -+control its functionality at the system level. They are located in the -+``/sys/devices/system/cpu/amd-pstate/`` directory and affect all CPUs. -+ -+``status`` -+ Operation mode of the driver: "active", "passive" or "disable". -+ -+ "active" -+ The driver is functional and in the ``active mode`` -+ -+ "passive" -+ The driver is functional and in the ``passive mode`` -+ + + User Space Interface in ``sysfs`` - General + =========================================== +@@ -358,6 +372,9 @@ control its functionality at the system level. They are located in the + "passive" + The driver is functional and in the ``passive mode`` + + "guided" + The driver is functional and in the ``guided mode`` + -+ "disable" -+ The driver is unregistered and not functional now. -+ -+ This attribute can be written to in order to change the driver's -+ operation mode or to unregister it. The string written to it must be -+ one of the possible values of it and, if successful, writing one of -+ these values to the sysfs file will cause the driver to switch over -+ to the operation mode represented by that string - or to be -+ unregistered in the "disable" case. + "disable" + The driver is unregistered and not functional now. 
- ``cpupower`` tool support for ``amd-pstate`` - =============================================== diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c -index 0f17b1c32718..0efdbeed6ada 100644 +index c51d3ccb4cca..02a4bfb54967 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c -@@ -1153,6 +1153,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) - return cppc_get_perf(cpunum, NOMINAL_PERF, nominal_perf); +@@ -1433,6 +1433,103 @@ int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) } + EXPORT_SYMBOL_GPL(cppc_set_epp_perf); -+/** -+ * cppc_get_epp_perf - Get the epp register value. -+ * @cpunum: CPU from which to get epp preference value. -+ * @epp_perf: Return address. -+ * -+ * Return: 0 for success, -EIO otherwise. -+ */ -+int cppc_get_epp_perf(int cpunum, u64 *epp_perf) -+{ -+ return cppc_get_perf(cpunum, ENERGY_PERF, epp_perf); -+} -+EXPORT_SYMBOL_GPL(cppc_get_epp_perf); -+ - /** - * cppc_get_perf_caps - Get a CPU's performance capabilities. - * @cpunum: CPU from which to get capabilities info. -@@ -1365,6 +1378,157 @@ int cppc_get_perf_ctrs(int cpunum, struct cppc_perf_fb_ctrs *perf_fb_ctrs) - } - EXPORT_SYMBOL_GPL(cppc_get_perf_ctrs); - -+/* -+ * Set Energy Performance Preference Register value through -+ * Performance Controls Interface -+ */ -+int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) -+{ -+ int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); -+ struct cpc_register_resource *epp_set_reg; -+ struct cpc_register_resource *auto_sel_reg; -+ struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); -+ struct cppc_pcc_data *pcc_ss_data = NULL; -+ int ret; -+ -+ if (!cpc_desc) { -+ pr_debug("No CPC descriptor for CPU:%d\n", cpu); -+ return -ENODEV; -+ } -+ -+ auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE]; -+ epp_set_reg = &cpc_desc->cpc_regs[ENERGY_PERF]; -+ -+ if (CPC_IN_PCC(epp_set_reg) || CPC_IN_PCC(auto_sel_reg)) { -+ if (pcc_ss_id < 0) { -+ pr_debug("Invalid pcc_ss_id for CPU:%d\n", cpu); -+ return -ENODEV; -+ } -+ -+ if (CPC_SUPPORTED(auto_sel_reg)) { -+ ret = cpc_write(cpu, auto_sel_reg, enable); -+ if (ret) -+ return ret; -+ } -+ -+ if (CPC_SUPPORTED(epp_set_reg)) { -+ ret = cpc_write(cpu, epp_set_reg, perf_ctrls->energy_perf); -+ if (ret) -+ return ret; -+ } -+ -+ pcc_ss_data = pcc_data[pcc_ss_id]; -+ -+ down_write(&pcc_ss_data->pcc_lock); -+ /* after writing CPC, transfer the ownership of PCC to platform */ -+ ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE); -+ up_write(&pcc_ss_data->pcc_lock); -+ } else { -+ ret = -ENOTSUPP; -+ pr_debug("_CPC in PCC is not supported\n"); -+ } -+ -+ return ret; -+} -+EXPORT_SYMBOL_GPL(cppc_set_epp_perf); -+ +/* + * cppc_get_auto_sel_caps - Read autonomous selection register. + * @cpunum : CPU from which to read register. @@ -27040,7 +9250,7 @@ index 0f17b1c32718..0efdbeed6ada 100644 /** * cppc_set_enable - Set to enable CPPC on the processor by writing the * Continuous Performance Control package EnableRegister field. 
-@@ -1420,7 +1584,7 @@ EXPORT_SYMBOL_GPL(cppc_set_enable); +@@ -1488,7 +1585,7 @@ EXPORT_SYMBOL_GPL(cppc_set_enable); int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) { struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); @@ -27049,7 +9259,7 @@ index 0f17b1c32718..0efdbeed6ada 100644 int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); struct cppc_pcc_data *pcc_ss_data = NULL; int ret = 0; -@@ -1431,6 +1595,8 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) +@@ -1499,6 +1596,8 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) } desired_reg = &cpc_desc->cpc_regs[DESIRED_PERF]; @@ -27058,7 +9268,7 @@ index 0f17b1c32718..0efdbeed6ada 100644 /* * This is Phase-I where we want to write to CPC registers -@@ -1439,7 +1605,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) +@@ -1507,7 +1606,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) * Since read_lock can be acquired by multiple CPUs simultaneously we * achieve that goal here */ @@ -27067,7 +9277,7 @@ index 0f17b1c32718..0efdbeed6ada 100644 if (pcc_ss_id < 0) { pr_debug("Invalid pcc_ss_id\n"); return -ENODEV; -@@ -1462,13 +1628,19 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) +@@ -1530,13 +1629,19 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) cpc_desc->write_cmd_status = 0; } @@ -27092,7 +9302,7 @@ index 0f17b1c32718..0efdbeed6ada 100644 up_read(&pcc_ss_data->pcc_lock); /* END Phase-I */ /* * This is Phase-II where we transfer the ownership of PCC to Platform -@@ -1516,7 +1688,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) +@@ -1584,7 +1689,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) * case during a CMD_READ and if there are pending writes it delivers * the write command before servicing the read command */ @@ -27102,207 +9312,19 @@ index 0f17b1c32718..0efdbeed6ada 100644 /* Update only if there are pending write commands */ if (pcc_ss_data->pending_pcc_write_cmd) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c -index c17bd845f5fc..f4f96baae500 100644 +index 73c7643b2697..7955cfc91c31 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c -@@ -59,8 +59,173 @@ - * we disable it by default to go acpi-cpufreq on these processors and add a - * module parameter to be able to enable it manually for debugging. - */ -+static struct cpufreq_driver *current_pstate_driver; - static struct cpufreq_driver amd_pstate_driver; --static int cppc_load __initdata; -+static struct cpufreq_driver amd_pstate_epp_driver; -+static int cppc_state = AMD_PSTATE_DISABLE; -+struct kobject *amd_pstate_kobj; -+ -+/* -+ * AMD Energy Preference Performance (EPP) -+ * The EPP is used in the CCLK DPM controller to drive -+ * the frequency that a core is going to operate during -+ * short periods of activity. 
EPP values will be utilized for -+ * different OS profiles (balanced, performance, power savings) -+ * display strings corresponding to EPP index in the -+ * energy_perf_strings[] -+ * index String -+ *------------------------------------- -+ * 0 default -+ * 1 performance -+ * 2 balance_performance -+ * 3 balance_power -+ * 4 power -+ */ -+enum energy_perf_value_index { -+ EPP_INDEX_DEFAULT = 0, -+ EPP_INDEX_PERFORMANCE, -+ EPP_INDEX_BALANCE_PERFORMANCE, -+ EPP_INDEX_BALANCE_POWERSAVE, -+ EPP_INDEX_POWERSAVE, -+}; -+ -+static const char * const energy_perf_strings[] = { -+ [EPP_INDEX_DEFAULT] = "default", -+ [EPP_INDEX_PERFORMANCE] = "performance", -+ [EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance", -+ [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power", -+ [EPP_INDEX_POWERSAVE] = "power", -+ NULL -+}; -+ -+static unsigned int epp_values[] = { -+ [EPP_INDEX_DEFAULT] = 0, -+ [EPP_INDEX_PERFORMANCE] = AMD_CPPC_EPP_PERFORMANCE, -+ [EPP_INDEX_BALANCE_PERFORMANCE] = AMD_CPPC_EPP_BALANCE_PERFORMANCE, -+ [EPP_INDEX_BALANCE_POWERSAVE] = AMD_CPPC_EPP_BALANCE_POWERSAVE, -+ [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE, -+ }; -+ +@@ -106,6 +106,8 @@ static unsigned int epp_values[] = { + [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE, + }; + +typedef int (*cppc_mode_transition_fn)(int); + -+static inline int get_mode_idx_from_str(const char *str, size_t size) -+{ -+ int i; -+ -+ for (i=0; i < AMD_PSTATE_MAX; i++) { -+ if (!strncmp(str, amd_pstate_mode_string[i], size)) -+ return i; -+ } -+ return -EINVAL; -+} -+ -+static DEFINE_MUTEX(amd_pstate_limits_lock); -+static DEFINE_MUTEX(amd_pstate_driver_lock); -+ -+static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) -+{ -+ u64 epp; -+ int ret; -+ -+ if (boot_cpu_has(X86_FEATURE_CPPC)) { -+ if (!cppc_req_cached) { -+ epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, -+ &cppc_req_cached); -+ if (epp) -+ return epp; -+ } -+ epp = (cppc_req_cached >> 24) & 0xFF; -+ } else { -+ ret = cppc_get_epp_perf(cpudata->cpu, &epp); -+ if (ret < 0) { -+ pr_debug("Could not retrieve energy perf value (%d)\n", ret); -+ return -EIO; -+ } -+ } -+ -+ return (s16)(epp & 0xff); -+} -+ -+static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata) -+{ -+ s16 epp; -+ int index = -EINVAL; -+ -+ epp = amd_pstate_get_epp(cpudata, 0); -+ if (epp < 0) -+ return epp; -+ -+ switch (epp) { -+ case AMD_CPPC_EPP_PERFORMANCE: -+ index = EPP_INDEX_PERFORMANCE; -+ break; -+ case AMD_CPPC_EPP_BALANCE_PERFORMANCE: -+ index = EPP_INDEX_BALANCE_PERFORMANCE; -+ break; -+ case AMD_CPPC_EPP_BALANCE_POWERSAVE: -+ index = EPP_INDEX_BALANCE_POWERSAVE; -+ break; -+ case AMD_CPPC_EPP_POWERSAVE: -+ index = EPP_INDEX_POWERSAVE; -+ break; -+ default: -+ break; -+ } -+ -+ return index; -+} -+ -+static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) -+{ -+ int ret; -+ struct cppc_perf_ctrls perf_ctrls; -+ -+ if (boot_cpu_has(X86_FEATURE_CPPC)) { -+ u64 value = READ_ONCE(cpudata->cppc_req_cached); -+ -+ value &= ~GENMASK_ULL(31, 24); -+ value |= (u64)epp << 24; -+ WRITE_ONCE(cpudata->cppc_req_cached, value); -+ -+ ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); -+ if (!ret) -+ cpudata->epp_cached = epp; -+ } else { -+ perf_ctrls.energy_perf = epp; -+ ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); -+ if (ret) { -+ pr_debug("failed to set energy perf value (%d)\n", ret); -+ return ret; -+ } -+ cpudata->epp_cached = epp; -+ } -+ -+ return ret; -+} -+ -+static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, -+ int 
pref_index) -+{ -+ int epp = -EINVAL; -+ int ret; -+ -+ if (!pref_index) { -+ pr_debug("EPP pref_index is invalid\n"); -+ return -EINVAL; -+ } -+ -+ if (epp == -EINVAL) -+ epp = epp_values[pref_index]; -+ -+ if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { -+ pr_debug("EPP cannot be set under performance policy\n"); -+ return -EBUSY; -+ } -+ -+ ret = amd_pstate_set_epp(cpudata, epp); -+ -+ return ret; -+} - - static inline int pstate_enable(bool enable) + static inline int get_mode_idx_from_str(const char *str, size_t size) { -@@ -70,11 +235,21 @@ static inline int pstate_enable(bool enable) - static int cppc_enable(bool enable) - { - int cpu, ret = 0; -+ struct cppc_perf_ctrls perf_ctrls; - - for_each_present_cpu(cpu) { - ret = cppc_set_enable(cpu, enable); - if (ret) - return ret; -+ -+ /* Enable autonomous mode for EPP */ -+ if (cppc_state == AMD_PSTATE_ACTIVE) { -+ /* Set desired perf as zero to allow EPP firmware control */ -+ perf_ctrls.desired_perf = 0; -+ ret = cppc_set_perf(cpu, &perf_ctrls); -+ if (ret) -+ return ret; -+ } - } - - return ret; -@@ -135,7 +310,22 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) + int i; +@@ -308,7 +310,22 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) cppc_perf.lowest_nonlinear_perf); WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf); @@ -27326,7 +9348,7 @@ index c17bd845f5fc..f4f96baae500 100644 } DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf); -@@ -212,12 +402,18 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) +@@ -385,12 +402,18 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) } static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, @@ -27346,7 +9368,7 @@ index c17bd845f5fc..f4f96baae500 100644 value &= ~AMD_CPPC_MIN_PERF(~0L); value |= AMD_CPPC_MIN_PERF(min_perf); -@@ -272,7 +468,7 @@ static int amd_pstate_target(struct cpufreq_policy *policy, +@@ -445,7 +468,7 @@ static int amd_pstate_target(struct cpufreq_policy *policy, cpufreq_freq_transition_begin(policy, &freqs); amd_pstate_update(cpudata, min_perf, des_perf, @@ -27355,7 +9377,7 @@ index c17bd845f5fc..f4f96baae500 100644 cpufreq_freq_transition_end(policy, &freqs, false); return 0; -@@ -306,7 +502,8 @@ static void amd_pstate_adjust_perf(unsigned int cpu, +@@ -479,7 +502,8 @@ static void amd_pstate_adjust_perf(unsigned int cpu, if (max_perf < min_perf) max_perf = min_perf; @@ -27365,99 +9387,10 @@ index c17bd845f5fc..f4f96baae500 100644 cpufreq_cpu_put(policy); } -@@ -418,7 +615,7 @@ static void amd_pstate_boost_init(struct amd_cpudata *cpudata) - return; - - cpudata->boost_supported = true; -- amd_pstate_driver.boost_enabled = true; -+ current_pstate_driver->boost_enabled = true; +@@ -816,6 +840,98 @@ static ssize_t show_energy_performance_preference( + return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]); } - static void amd_perf_ctl_reset(unsigned int cpu) -@@ -501,6 +698,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) - policy->driver_data = cpudata; - - amd_pstate_boost_init(cpudata); -+ if (!current_pstate_driver->adjust_perf) -+ current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; - - return 0; - -@@ -561,7 +760,7 @@ static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy, - if (max_freq < 0) - return max_freq; - -- return sprintf(&buf[0], "%u\n", max_freq); -+ return sysfs_emit(buf, "%u\n", max_freq); - } - - static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *policy, -@@ -574,7 +773,7 @@ static 
ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *poli - if (freq < 0) - return freq; - -- return sprintf(&buf[0], "%u\n", freq); -+ return sysfs_emit(buf, "%u\n", freq); - } - - /* -@@ -589,13 +788,208 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, - - perf = READ_ONCE(cpudata->highest_perf); - -- return sprintf(&buf[0], "%u\n", perf); -+ return sysfs_emit(buf, "%u\n", perf); -+} -+ -+static ssize_t show_energy_performance_available_preferences( -+ struct cpufreq_policy *policy, char *buf) -+{ -+ int i = 0; -+ int offset = 0; -+ -+ while (energy_perf_strings[i] != NULL) -+ offset += sysfs_emit_at(buf, offset, "%s ", energy_perf_strings[i++]); -+ -+ sysfs_emit_at(buf, offset, "\n"); -+ -+ return offset; -+} -+ -+static ssize_t store_energy_performance_preference( -+ struct cpufreq_policy *policy, const char *buf, size_t count) -+{ -+ struct amd_cpudata *cpudata = policy->driver_data; -+ char str_preference[21]; -+ ssize_t ret; -+ -+ ret = sscanf(buf, "%20s", str_preference); -+ if (ret != 1) -+ return -EINVAL; -+ -+ ret = match_string(energy_perf_strings, -1, str_preference); -+ if (ret < 0) -+ return -EINVAL; -+ -+ mutex_lock(&amd_pstate_limits_lock); -+ ret = amd_pstate_set_energy_pref_index(cpudata, ret); -+ mutex_unlock(&amd_pstate_limits_lock); -+ -+ return ret ?: count; -+} -+ -+static ssize_t show_energy_performance_preference( -+ struct cpufreq_policy *policy, char *buf) -+{ -+ struct amd_cpudata *cpudata = policy->driver_data; -+ int preference; -+ -+ preference = amd_pstate_get_energy_pref_index(cpudata); -+ if (preference < 0) -+ return preference; -+ -+ return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]); -+} -+ +static void amd_pstate_driver_cleanup(void) +{ + amd_pstate_enable(false); @@ -27550,633 +9483,117 @@ index c17bd845f5fc..f4f96baae500 100644 + }, +}; + -+static ssize_t amd_pstate_show_status(char *buf) -+{ -+ if (!current_pstate_driver) -+ return sysfs_emit(buf, "disable\n"); -+ -+ return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]); -+} -+ -+static int amd_pstate_update_status(const char *buf, size_t size) -+{ -+ int mode_idx; -+ -+ if (size > strlen("passive") || size < strlen("active")) -+ return -EINVAL; -+ -+ mode_idx = get_mode_idx_from_str(buf, size); -+ -+ if (mode_idx < 0 || mode_idx >= AMD_PSTATE_MAX) -+ return -EINVAL; -+ -+ if (mode_state_machine[cppc_state][mode_idx]) -+ return mode_state_machine[cppc_state][mode_idx](mode_idx); -+ -+ return 0; -+} -+ -+static ssize_t show_status(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ ssize_t ret; -+ -+ mutex_lock(&amd_pstate_driver_lock); -+ ret = amd_pstate_show_status(buf); -+ mutex_unlock(&amd_pstate_driver_lock); -+ -+ return ret; -+} -+ -+static ssize_t store_status(struct kobject *a, struct kobj_attribute *b, -+ const char *buf, size_t count) -+{ -+ char *p = memchr(buf, '\n', count); -+ int ret; -+ -+ mutex_lock(&amd_pstate_driver_lock); -+ ret = amd_pstate_update_status(buf, p ? p - buf : count); -+ mutex_unlock(&amd_pstate_driver_lock); -+ -+ return ret < 0 ? 
ret : count; + static ssize_t amd_pstate_show_status(char *buf) + { + if (!current_pstate_driver) +@@ -824,57 +940,22 @@ static ssize_t amd_pstate_show_status(char *buf) + return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]); } - cpufreq_freq_attr_ro(amd_pstate_max_freq); - cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); - - cpufreq_freq_attr_ro(amd_pstate_highest_perf); -+cpufreq_freq_attr_rw(energy_performance_preference); -+cpufreq_freq_attr_ro(energy_performance_available_preferences); -+define_one_global_rw(status); - - static struct freq_attr *amd_pstate_attr[] = { - &amd_pstate_max_freq, -@@ -604,6 +998,313 @@ static struct freq_attr *amd_pstate_attr[] = { - NULL, - }; - -+static struct freq_attr *amd_pstate_epp_attr[] = { -+ &amd_pstate_max_freq, -+ &amd_pstate_lowest_nonlinear_freq, -+ &amd_pstate_highest_perf, -+ &energy_performance_preference, -+ &energy_performance_available_preferences, -+ NULL, -+}; -+ -+static struct attribute *pstate_global_attributes[] = { -+ &status.attr, -+ NULL -+}; -+ -+static const struct attribute_group amd_pstate_global_attr_group = { -+ .attrs = pstate_global_attributes, -+}; -+ -+static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) -+{ -+ int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; -+ struct amd_cpudata *cpudata; -+ struct device *dev; -+ u64 value; -+ -+ /* -+ * Resetting PERF_CTL_MSR will put the CPU in P0 frequency, -+ * which is ideal for initialization process. -+ */ -+ amd_perf_ctl_reset(policy->cpu); -+ dev = get_cpu_device(policy->cpu); -+ if (!dev) -+ return -ENODEV; -+ -+ cpudata = kzalloc(sizeof(*cpudata), GFP_KERNEL); -+ if (!cpudata) -+ return -ENOMEM; -+ -+ cpudata->cpu = policy->cpu; -+ cpudata->epp_policy = 0; -+ -+ ret = amd_pstate_init_perf(cpudata); -+ if (ret) -+ goto free_cpudata1; -+ -+ min_freq = amd_get_min_freq(cpudata); -+ max_freq = amd_get_max_freq(cpudata); -+ nominal_freq = amd_get_nominal_freq(cpudata); -+ lowest_nonlinear_freq = amd_get_lowest_nonlinear_freq(cpudata); -+ if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) { -+ dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n", -+ min_freq, max_freq); -+ ret = -EINVAL; -+ goto free_cpudata1; -+ } -+ -+ policy->cpuinfo.min_freq = min_freq; -+ policy->cpuinfo.max_freq = max_freq; -+ /* It will be updated by governor */ -+ policy->cur = policy->cpuinfo.min_freq; -+ -+ /* Initial processor data capability frequencies */ -+ cpudata->max_freq = max_freq; -+ cpudata->min_freq = min_freq; -+ cpudata->nominal_freq = nominal_freq; -+ cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq; -+ -+ policy->driver_data = cpudata; -+ -+ cpudata->epp_cached = amd_pstate_get_epp(cpudata, 0); -+ -+ policy->min = policy->cpuinfo.min_freq; -+ policy->max = policy->cpuinfo.max_freq; -+ -+ /* -+ * Set the policy to powersave to provide a valid fallback value in case -+ * the default cpufreq governor is neither powersave nor performance. 
-+ */ -+ policy->policy = CPUFREQ_POLICY_POWERSAVE; -+ -+ if (boot_cpu_has(X86_FEATURE_CPPC)) { -+ policy->fast_switch_possible = true; -+ ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); -+ if (ret) -+ return ret; -+ WRITE_ONCE(cpudata->cppc_req_cached, value); -+ -+ ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &value); -+ if (ret) -+ return ret; -+ WRITE_ONCE(cpudata->cppc_cap1_cached, value); -+ } -+ amd_pstate_boost_init(cpudata); -+ -+ return 0; -+ -+free_cpudata1: -+ kfree(cpudata); -+ return ret; -+} -+ -+static int amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) -+{ -+ pr_debug("CPU %d exiting\n", policy->cpu); -+ policy->fast_switch_possible = false; -+ return 0; -+} -+ -+static void amd_pstate_epp_init(unsigned int cpu) -+{ -+ struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); -+ struct amd_cpudata *cpudata = policy->driver_data; -+ u32 max_perf, min_perf; -+ u64 value; -+ s16 epp; -+ -+ max_perf = READ_ONCE(cpudata->highest_perf); -+ min_perf = READ_ONCE(cpudata->lowest_perf); -+ -+ value = READ_ONCE(cpudata->cppc_req_cached); -+ -+ if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) -+ min_perf = max_perf; -+ -+ /* Initial min/max values for CPPC Performance Controls Register */ -+ value &= ~AMD_CPPC_MIN_PERF(~0L); -+ value |= AMD_CPPC_MIN_PERF(min_perf); -+ -+ value &= ~AMD_CPPC_MAX_PERF(~0L); -+ value |= AMD_CPPC_MAX_PERF(max_perf); -+ -+ /* CPPC EPP feature require to set zero to the desire perf bit */ -+ value &= ~AMD_CPPC_DES_PERF(~0L); -+ value |= AMD_CPPC_DES_PERF(0); -+ -+ if (cpudata->epp_policy == cpudata->policy) -+ goto skip_epp; -+ -+ cpudata->epp_policy = cpudata->policy; -+ -+ /* Get BIOS pre-defined epp value */ -+ epp = amd_pstate_get_epp(cpudata, value); -+ if (epp < 0) { -+ /** -+ * This return value can only be negative for shared_memory -+ * systems where EPP register read/write not supported. 
-+ */ -+ goto skip_epp; -+ } -+ -+ if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) -+ epp = 0; -+ -+ /* Set initial EPP value */ -+ if (boot_cpu_has(X86_FEATURE_CPPC)) { -+ value &= ~GENMASK_ULL(31, 24); -+ value |= (u64)epp << 24; -+ } -+ -+ WRITE_ONCE(cpudata->cppc_req_cached, value); -+ amd_pstate_set_epp(cpudata, epp); -+skip_epp: -+ cpufreq_cpu_put(policy); -+} -+ -+static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) -+{ -+ struct amd_cpudata *cpudata = policy->driver_data; -+ -+ if (!policy->cpuinfo.max_freq) -+ return -ENODEV; -+ -+ pr_debug("set_policy: cpuinfo.max %u policy->max %u\n", -+ policy->cpuinfo.max_freq, policy->max); -+ -+ cpudata->policy = policy->policy; -+ -+ amd_pstate_epp_init(policy->cpu); -+ -+ return 0; -+} -+ -+static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) -+{ -+ struct cppc_perf_ctrls perf_ctrls; -+ u64 value, max_perf; -+ int ret; -+ -+ ret = amd_pstate_enable(true); -+ if (ret) -+ pr_err("failed to enable amd pstate during resume, return %d\n", ret); -+ -+ value = READ_ONCE(cpudata->cppc_req_cached); -+ max_perf = READ_ONCE(cpudata->highest_perf); -+ -+ if (boot_cpu_has(X86_FEATURE_CPPC)) { -+ wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); -+ } else { -+ perf_ctrls.max_perf = max_perf; -+ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached); -+ cppc_set_perf(cpudata->cpu, &perf_ctrls); -+ } -+} -+ -+static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) -+{ -+ struct amd_cpudata *cpudata = policy->driver_data; -+ -+ pr_debug("AMD CPU Core %d going online\n", cpudata->cpu); -+ -+ if (cppc_state == AMD_PSTATE_ACTIVE) { -+ amd_pstate_epp_reenable(cpudata); -+ cpudata->suspended = false; -+ } -+ -+ return 0; -+} -+ -+static void amd_pstate_epp_offline(struct cpufreq_policy *policy) -+{ -+ struct amd_cpudata *cpudata = policy->driver_data; -+ struct cppc_perf_ctrls perf_ctrls; -+ int min_perf; -+ u64 value; -+ -+ min_perf = READ_ONCE(cpudata->lowest_perf); -+ value = READ_ONCE(cpudata->cppc_req_cached); -+ -+ mutex_lock(&amd_pstate_limits_lock); -+ if (boot_cpu_has(X86_FEATURE_CPPC)) { -+ cpudata->epp_policy = CPUFREQ_POLICY_UNKNOWN; -+ -+ /* Set max perf same as min perf */ -+ value &= ~AMD_CPPC_MAX_PERF(~0L); -+ value |= AMD_CPPC_MAX_PERF(min_perf); -+ value &= ~AMD_CPPC_MIN_PERF(~0L); -+ value |= AMD_CPPC_MIN_PERF(min_perf); -+ wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); -+ } else { -+ perf_ctrls.desired_perf = 0; -+ perf_ctrls.max_perf = min_perf; -+ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE); -+ cppc_set_perf(cpudata->cpu, &perf_ctrls); -+ } -+ mutex_unlock(&amd_pstate_limits_lock); -+} -+ -+static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) -+{ -+ struct amd_cpudata *cpudata = policy->driver_data; -+ -+ pr_debug("AMD CPU Core %d going offline\n", cpudata->cpu); -+ -+ if (cpudata->suspended) -+ return 0; -+ -+ if (cppc_state == AMD_PSTATE_ACTIVE) -+ amd_pstate_epp_offline(policy); -+ -+ return 0; -+} -+ -+static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy) -+{ -+ cpufreq_verify_within_cpu_limits(policy); -+ pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min); -+ return 0; -+} -+ -+static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) -+{ -+ struct amd_cpudata *cpudata = policy->driver_data; -+ int ret; -+ -+ /* avoid suspending when EPP is not enabled */ -+ if (cppc_state != AMD_PSTATE_ACTIVE) -+ return 0; -+ -+ /* set this flag to avoid setting core offline*/ -+ 
cpudata->suspended = true; -+ -+ /* disable CPPC in lowlevel firmware */ -+ ret = amd_pstate_enable(false); -+ if (ret) -+ pr_err("failed to suspend, return %d\n", ret); -+ -+ return 0; -+} -+ -+static int amd_pstate_epp_resume(struct cpufreq_policy *policy) -+{ -+ struct amd_cpudata *cpudata = policy->driver_data; -+ -+ if (cpudata->suspended) { -+ mutex_lock(&amd_pstate_limits_lock); -+ -+ /* enable amd pstate from suspend state*/ -+ amd_pstate_epp_reenable(cpudata); -+ -+ mutex_unlock(&amd_pstate_limits_lock); -+ -+ cpudata->suspended = false; -+ } -+ -+ return 0; -+} -+ - static struct cpufreq_driver amd_pstate_driver = { - .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS, - .verify = amd_pstate_verify, -@@ -617,6 +1318,20 @@ static struct cpufreq_driver amd_pstate_driver = { - .attr = amd_pstate_attr, - }; - -+static struct cpufreq_driver amd_pstate_epp_driver = { -+ .flags = CPUFREQ_CONST_LOOPS, -+ .verify = amd_pstate_epp_verify_policy, -+ .setpolicy = amd_pstate_epp_set_policy, -+ .init = amd_pstate_epp_cpu_init, -+ .exit = amd_pstate_epp_cpu_exit, -+ .offline = amd_pstate_epp_cpu_offline, -+ .online = amd_pstate_epp_cpu_online, -+ .suspend = amd_pstate_epp_suspend, -+ .resume = amd_pstate_epp_resume, -+ .name = "amd_pstate_epp", -+ .attr = amd_pstate_epp_attr, -+}; -+ - static int __init amd_pstate_init(void) +-static void amd_pstate_driver_cleanup(void) +-{ +- current_pstate_driver = NULL; +-} +- + static int amd_pstate_update_status(const char *buf, size_t size) { - int ret; -@@ -626,10 +1341,10 @@ static int __init amd_pstate_init(void) - /* - * by default the pstate driver is disabled to load - * enable the amd_pstate passive mode driver explicitly -- * with amd_pstate=passive in kernel command line -+ * with amd_pstate=passive or other modes in kernel command line - */ -- if (!cppc_load) { -- pr_debug("driver load is disabled, boot with amd_pstate=passive to enable this\n"); -+ if (cppc_state == AMD_PSTATE_DISABLE) { -+ pr_info("driver load is disabled, boot with specific mode to enable this\n"); - return -ENODEV; - } +- int ret = 0; + int mode_idx; -@@ -645,7 +1360,8 @@ static int __init amd_pstate_init(void) +- if (size > 7 || size < 6) ++ if (size > strlen("passive") || size < strlen("active")) + return -EINVAL; +- mode_idx = get_mode_idx_from_str(buf, size); + +- switch(mode_idx) { +- case AMD_PSTATE_DISABLE: +- if (!current_pstate_driver) +- return -EINVAL; +- if (cppc_state == AMD_PSTATE_ACTIVE) +- return -EBUSY; +- cpufreq_unregister_driver(current_pstate_driver); +- amd_pstate_driver_cleanup(); +- break; +- case AMD_PSTATE_PASSIVE: +- if (current_pstate_driver) { +- if (current_pstate_driver == &amd_pstate_driver) +- return 0; +- cpufreq_unregister_driver(current_pstate_driver); +- cppc_state = AMD_PSTATE_PASSIVE; +- current_pstate_driver = &amd_pstate_driver; +- } ++ mode_idx = get_mode_idx_from_str(buf, size); + +- ret = cpufreq_register_driver(current_pstate_driver); +- break; +- case AMD_PSTATE_ACTIVE: +- if (current_pstate_driver) { +- if (current_pstate_driver == &amd_pstate_epp_driver) +- return 0; +- cpufreq_unregister_driver(current_pstate_driver); +- current_pstate_driver = &amd_pstate_epp_driver; +- cppc_state = AMD_PSTATE_ACTIVE; +- } ++ if (mode_idx < 0 || mode_idx >= AMD_PSTATE_MAX) ++ return -EINVAL; + +- ret = cpufreq_register_driver(current_pstate_driver); +- break; +- default: +- ret = -EINVAL; +- break; +- } ++ if (mode_state_machine[cppc_state][mode_idx]) ++ return mode_state_machine[cppc_state][mode_idx](mode_idx); + +- return ret; ++ 
return 0; + } + + static ssize_t show_status(struct kobject *kobj, +@@ -1279,7 +1360,7 @@ static int __init amd_pstate_init(void) /* capability check */ if (boot_cpu_has(X86_FEATURE_CPPC)) { pr_debug("AMD CPPC MSR based functionality is supported\n"); -- amd_pstate_driver.adjust_perf = amd_pstate_adjust_perf; +- if (cppc_state == AMD_PSTATE_PASSIVE) + if (cppc_state != AMD_PSTATE_ACTIVE) -+ current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; + current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; } else { pr_debug("AMD CPPC shared memory based functionality is supported\n"); - static_call_update(amd_pstate_enable, cppc_enable); -@@ -656,31 +1372,63 @@ static int __init amd_pstate_init(void) - /* enable amd pstate feature */ - ret = amd_pstate_enable(true); - if (ret) { -- pr_err("failed to enable amd-pstate with return %d\n", ret); -+ pr_err("failed to enable with return %d\n", ret); - return ret; - } +@@ -1341,7 +1422,7 @@ static int __init amd_pstate_param(char *str) + if (cppc_state == AMD_PSTATE_ACTIVE) + current_pstate_driver = &amd_pstate_epp_driver; -- ret = cpufreq_register_driver(&amd_pstate_driver); -+ ret = cpufreq_register_driver(current_pstate_driver); - if (ret) -- pr_err("failed to register amd_pstate_driver with return %d\n", -- ret); -+ pr_err("failed to register with return %d\n", ret); -+ -+ amd_pstate_kobj = kobject_create_and_add("amd_pstate", &cpu_subsys.dev_root->kobj); -+ if (!amd_pstate_kobj) { -+ ret = -EINVAL; -+ pr_err("global sysfs registration failed.\n"); -+ goto kobject_free; -+ } - -+ ret = sysfs_create_group(amd_pstate_kobj, &amd_pstate_global_attr_group); -+ if (ret) { -+ pr_err("sysfs attribute export failed with error %d.\n", ret); -+ goto global_attr_free; -+ } -+ -+ return ret; -+ -+global_attr_free: -+ kobject_put(amd_pstate_kobj); -+kobject_free: -+ cpufreq_unregister_driver(current_pstate_driver); - return ret; - } - device_initcall(amd_pstate_init); - - static int __init amd_pstate_param(char *str) - { -+ size_t size; -+ int mode_idx; -+ - if (!str) - return -EINVAL; - -- if (!strcmp(str, "disable")) { -- cppc_load = 0; -- pr_info("driver is explicitly disabled\n"); -- } else if (!strcmp(str, "passive")) -- cppc_load = 1; -+ size = strlen(str); -+ mode_idx = get_mode_idx_from_str(str, size); - -- return 0; -+ if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) { -+ cppc_state = mode_idx; -+ if (cppc_state == AMD_PSTATE_DISABLE) -+ pr_info("driver is explicitly disabled\n"); -+ -+ if (cppc_state == AMD_PSTATE_ACTIVE) -+ current_pstate_driver = &amd_pstate_epp_driver; -+ +- if (cppc_state == AMD_PSTATE_PASSIVE) + if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED) -+ current_pstate_driver = &amd_pstate_driver; -+ -+ return 0; -+ } -+ -+ return -EINVAL; - } - early_param("amd_pstate", amd_pstate_param); + current_pstate_driver = &amd_pstate_driver; -diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c -index 4153150e20db..ffea6402189d 100644 ---- a/drivers/cpufreq/brcmstb-avs-cpufreq.c -+++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c -@@ -751,10 +751,7 @@ static int brcm_avs_cpufreq_probe(struct platform_device *pdev) - - static int brcm_avs_cpufreq_remove(struct platform_device *pdev) - { -- int ret; -- -- ret = cpufreq_unregister_driver(&brcm_avs_driver); -- WARN_ON(ret); -+ cpufreq_unregister_driver(&brcm_avs_driver); - - brcm_avs_prepare_uninit(pdev); - -diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c -index 7e56a42750ea..85a0bea2dbf1 100644 
---- a/drivers/cpufreq/cpufreq.c -+++ b/drivers/cpufreq/cpufreq.c -@@ -2904,12 +2904,12 @@ EXPORT_SYMBOL_GPL(cpufreq_register_driver); - * Returns zero if successful, and -EINVAL if the cpufreq_driver is - * currently not initialised. - */ --int cpufreq_unregister_driver(struct cpufreq_driver *driver) -+void cpufreq_unregister_driver(struct cpufreq_driver *driver) - { - unsigned long flags; - -- if (!cpufreq_driver || (driver != cpufreq_driver)) -- return -EINVAL; -+ if (WARN_ON(!cpufreq_driver || (driver != cpufreq_driver))) -+ return; - - pr_debug("unregistering driver %s\n", driver->name); - -@@ -2926,8 +2926,6 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) - - write_unlock_irqrestore(&cpufreq_driver_lock, flags); - cpus_read_unlock(); -- -- return 0; - } - EXPORT_SYMBOL_GPL(cpufreq_unregister_driver); - -diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c -index f80339779084..f21a9e3df53d 100644 ---- a/drivers/cpufreq/mediatek-cpufreq-hw.c -+++ b/drivers/cpufreq/mediatek-cpufreq-hw.c -@@ -317,7 +317,9 @@ static int mtk_cpufreq_hw_driver_probe(struct platform_device *pdev) - - static int mtk_cpufreq_hw_driver_remove(struct platform_device *pdev) - { -- return cpufreq_unregister_driver(&cpufreq_mtk_hw_driver); -+ cpufreq_unregister_driver(&cpufreq_mtk_hw_driver); -+ -+ return 0; - } - - static const struct of_device_id mtk_cpufreq_hw_match[] = { -diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c -index 1b50df06c6bc..81649a1969b6 100644 ---- a/drivers/cpufreq/omap-cpufreq.c -+++ b/drivers/cpufreq/omap-cpufreq.c -@@ -184,7 +184,9 @@ static int omap_cpufreq_probe(struct platform_device *pdev) - - static int omap_cpufreq_remove(struct platform_device *pdev) - { -- return cpufreq_unregister_driver(&omap_driver); -+ cpufreq_unregister_driver(&omap_driver); -+ -+ return 0; - } - - static struct platform_driver omap_cpufreq_platdrv = { -diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c -index d3f55ca06ed3..2f581d2d617d 100644 ---- a/drivers/cpufreq/qcom-cpufreq-hw.c -+++ b/drivers/cpufreq/qcom-cpufreq-hw.c -@@ -770,7 +770,9 @@ static int qcom_cpufreq_hw_driver_probe(struct platform_device *pdev) - - static int qcom_cpufreq_hw_driver_remove(struct platform_device *pdev) - { -- return cpufreq_unregister_driver(&cpufreq_qcom_hw_driver); -+ cpufreq_unregister_driver(&cpufreq_qcom_hw_driver); -+ -+ return 0; - } - - static struct platform_driver qcom_cpufreq_hw_driver = { + return 0; diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h -index c5614444031f..6126c977ece0 100644 +index 6b487a5bd638..6126c977ece0 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h -@@ -108,12 +108,15 @@ struct cppc_perf_caps { - u32 lowest_nonlinear_perf; +@@ -109,6 +109,7 @@ struct cppc_perf_caps { u32 lowest_freq; u32 nominal_freq; -+ u32 energy_perf; + u32 energy_perf; + bool auto_sel; }; struct cppc_perf_ctrls { - u32 max_perf; - u32 min_perf; - u32 desired_perf; -+ u32 energy_perf; - }; - - struct cppc_perf_fb_ctrs { -@@ -149,6 +152,10 @@ extern bool cpc_ffh_supported(void); - extern bool cpc_supported_by_cpu(void); - extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); +@@ -153,6 +154,8 @@ extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); extern int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val); -+extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf); -+extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, 
bool enable); + extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf); + extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable); +extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps); +extern int cppc_set_auto_sel(int cpu, bool enable); #else /* !CONFIG_ACPI_CPPC_LIB */ static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) { -@@ -202,6 +209,22 @@ static inline int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) +@@ -214,6 +217,14 @@ static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf) { return -ENOTSUPP; } -+static inline int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) -+{ -+ return -ENOTSUPP; -+} -+static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf) -+{ -+ return -ENOTSUPP; -+} +static inline int cppc_set_auto_sel(int cpu, bool enable) +{ + return -ENOTSUPP; @@ -28189,84 +9606,32 @@ index c5614444031f..6126c977ece0 100644 #endif /* _CPPC_ACPI_H*/ diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h -index 1c4b8659f171..c10ebf8c42e6 100644 +index f5f22418e64b..c10ebf8c42e6 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h -@@ -12,6 +12,11 @@ - - #include - -+#define AMD_CPPC_EPP_PERFORMANCE 0x00 -+#define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80 -+#define AMD_CPPC_EPP_BALANCE_POWERSAVE 0xBF -+#define AMD_CPPC_EPP_POWERSAVE 0xFF -+ - /********************************************************************* - * AMD P-state INTERFACE * - *********************************************************************/ -@@ -47,6 +52,10 @@ struct amd_aperf_mperf { - * @prev: Last Aperf/Mperf/tsc count value read from register - * @freq: current cpu frequency value - * @boost_supported: check whether the Processor or SBIOS supports boost mode -+ * @epp_policy: Last saved policy used to set energy-performance preference -+ * @epp_cached: Cached CPPC energy-performance preference value -+ * @policy: Cpufreq policy value -+ * @cppc_cap1_cached Cached MSR_AMD_CPPC_CAP1 register value - * - * The amd_cpudata is key private data for each CPU thread in AMD P-State, and - * represents all the attributes and goals that AMD P-State requests at runtime. 
-@@ -72,6 +81,31 @@ struct amd_cpudata { - - u64 freq; - bool boost_supported; -+ -+ /* EPP feature related attributes*/ -+ s16 epp_policy; -+ s16 epp_cached; -+ u32 policy; -+ u64 cppc_cap1_cached; -+ bool suspended; +@@ -97,6 +97,7 @@ enum amd_pstate_mode { + AMD_PSTATE_DISABLE = 0, + AMD_PSTATE_PASSIVE, + AMD_PSTATE_ACTIVE, ++ AMD_PSTATE_GUIDED, + AMD_PSTATE_MAX, }; -+/* -+ * enum amd_pstate_mode - driver working mode of amd pstate -+ */ -+enum amd_pstate_mode { -+ AMD_PSTATE_DISABLE = 0, -+ AMD_PSTATE_PASSIVE, -+ AMD_PSTATE_ACTIVE, -+ AMD_PSTATE_GUIDED, -+ AMD_PSTATE_MAX, -+}; -+ -+static const char * const amd_pstate_mode_string[] = { -+ [AMD_PSTATE_DISABLE] = "disable", -+ [AMD_PSTATE_PASSIVE] = "passive", -+ [AMD_PSTATE_ACTIVE] = "active", +@@ -104,6 +105,7 @@ static const char * const amd_pstate_mode_string[] = { + [AMD_PSTATE_DISABLE] = "disable", + [AMD_PSTATE_PASSIVE] = "passive", + [AMD_PSTATE_ACTIVE] = "active", + [AMD_PSTATE_GUIDED] = "guided", -+ NULL, -+}; + NULL, + }; #endif /* _LINUX_AMD_PSTATE_H */ -diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h -index 6a94a6eaad27..65623233ab2f 100644 ---- a/include/linux/cpufreq.h -+++ b/include/linux/cpufreq.h -@@ -448,7 +448,7 @@ struct cpufreq_driver { - #define CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING BIT(6) - - int cpufreq_register_driver(struct cpufreq_driver *driver_data); --int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); -+void cpufreq_unregister_driver(struct cpufreq_driver *driver_data); - - bool cpufreq_driver_test_flags(u16 flags); - const char *cpufreq_get_current_driver(void); -- -2.40.0.rc2 +2.40.0 -From 501028b1bc1da95eeb61b26a0ee82ef93873d5d7 Mon Sep 17 00:00:00 2001 +From 3c01171fc23cece3cf05ed3380e25fa10cd3393d Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 22 Jan 2023 13:41:50 +0100 -Subject: [PATCH 09/16] ksm +Date: Sun, 9 Apr 2023 21:22:26 +0200 +Subject: [PATCH 06/10] ksm Signed-off-by: Peter Jung --- @@ -28292,9 +9657,9 @@ Signed-off-by: Peter Jung include/linux/syscalls.h | 1 + include/uapi/asm-generic/unistd.h | 5 +- kernel/sys_ni.c | 1 + - mm/ksm.c | 88 +++++++++------ - mm/madvise.c | 113 ++++++++++++++++++++ - 24 files changed, 198 insertions(+), 34 deletions(-) + mm/ksm.c | 82 +++++++++----- + mm/madvise.c | 117 ++++++++++++++++++++ + 24 files changed, 199 insertions(+), 31 deletions(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 8ebacf37a8cf..c9d25f85d86d 100644 @@ -28470,7 +9835,7 @@ index 52c94ab5c205..1518e261d882 100644 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common pmadv_ksm sys_pmadv_ksm diff --git a/include/linux/ksm.h b/include/linux/ksm.h -index 7e232ba59b86..632a1a792ebb 100644 +index 7e232ba59b86..57ed92987717 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -16,6 +16,10 @@ @@ -28478,9 +9843,9 @@ index 7e232ba59b86..632a1a792ebb 100644 #ifdef CONFIG_KSM +int ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma, -+ unsigned long *vm_flags); ++ const vm_flags_t *vm_flags); +int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start, -+ unsigned long end, unsigned long *vm_flags); ++ unsigned long end, const vm_flags_t *vm_flags); int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); int __ksm_enter(struct mm_struct *mm); @@ -28526,17 +9891,17 @@ index 860b2dcf3ac4..810e1fcaff94 100644 COND_SYSCALL(mbind); COND_SYSCALL(get_mempolicy); diff --git a/mm/ksm.c b/mm/ksm.c -index 
ee60890cf9b1..bc920121bce9 100644 +index 82029f1d454b..0c206bd8007d 100644 --- a/mm/ksm.c +++ b/mm/ksm.c -@@ -2582,54 +2582,78 @@ static int ksm_scan_thread(void *nothing) +@@ -2576,52 +2576,76 @@ static int ksm_scan_thread(void *nothing) return 0; } -int ksm_madvise(struct vm_area_struct *vma, unsigned long start, - unsigned long end, int advice, unsigned long *vm_flags) +int ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma, -+ unsigned long *vm_flags) ++ const vm_flags_t *vm_flags) { - struct mm_struct *mm = vma->vm_mm; int err; @@ -28584,24 +9949,12 @@ index ee60890cf9b1..bc920121bce9 100644 + if (err) + return err; + } - -- *vm_flags |= VM_MERGEABLE; -- break; -+ *vm_flags |= VM_MERGEABLE; - -- case MADV_UNMERGEABLE: -- if (!(*vm_flags & VM_MERGEABLE)) -- return 0; /* just ignore the advice */ ++ + return 0; +} - -- if (vma->anon_vma) { -- err = unmerge_ksm_pages(vma, start, end); -- if (err) -- return err; -- } ++ +int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start, -+ unsigned long end, unsigned long *vm_flags) ++ unsigned long end, const vm_flags_t *vm_flags) +{ + int err; + @@ -28613,9 +9966,6 @@ index ee60890cf9b1..bc920121bce9 100644 + if (err) + return err; + } - -- *vm_flags &= ~VM_MERGEABLE; -+ *vm_flags &= ~VM_MERGEABLE; + + return 0; +} @@ -28631,20 +9981,30 @@ index ee60890cf9b1..bc920121bce9 100644 + err = ksm_madvise_merge(mm, vma, vm_flags); + if (err) + return err; -+ break; -+ -+ case MADV_UNMERGEABLE: + + *vm_flags |= VM_MERGEABLE; + break; + + case MADV_UNMERGEABLE: +- if (!(*vm_flags & VM_MERGEABLE)) +- return 0; /* just ignore the advice */ +- +- if (vma->anon_vma) { +- err = unmerge_ksm_pages(vma, start, end); +- if (err) +- return err; +- } + err = ksm_madvise_unmerge(vma, start, end, vm_flags); + if (err) + return err; - break; - } + *vm_flags &= ~VM_MERGEABLE; + break; diff --git a/mm/madvise.c b/mm/madvise.c -index b6ea204d4e23..0064dcafb812 100644 +index 340125d08c03..36e756355f04 100644 --- a/mm/madvise.c +++ b/mm/madvise.c -@@ -1527,3 +1527,116 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, +@@ -1522,3 +1522,120 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, out: return ret; } @@ -28710,9 +10070,13 @@ index b6ea204d4e23..0064dcafb812 100644 + switch (behaviour) { + case MADV_MERGEABLE: + ret = ksm_madvise_merge(vma->vm_mm, vma, &vma->vm_flags); ++ if (!ret) ++ vm_flags_set(vma, VM_MERGEABLE); + break; + case MADV_UNMERGEABLE: + ret = ksm_madvise_unmerge(vma, vma->vm_start, vma->vm_end, &vma->vm_flags); ++ if (!ret) ++ vm_flags_clear(vma, VM_MERGEABLE); + break; + default: + /* look, ma, no brain */ @@ -28762,54 +10126,27 @@ index b6ea204d4e23..0064dcafb812 100644 +subsys_initcall(pmadv_sysfs_init); +#endif /* CONFIG_KSM */ -- -2.40.0.rc2 +2.40.0 -From abf71738a315ea5ad029cd3976ec7b2d9456c432 Mon Sep 17 00:00:00 2001 +From d349cbde64499039351b1bb146999948a1319b71 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 10 Mar 2023 18:06:12 +0100 -Subject: [PATCH 10/16] maple-lru +Date: Sun, 9 Apr 2023 21:24:33 +0200 +Subject: [PATCH 07/10] maple-lru Signed-off-by: Peter Jung --- - Documentation/mm/multigen_lru.rst | 128 +++- - include/linux/fs.h | 2 + - include/linux/maple_tree.h | 6 - - include/linux/memcontrol.h | 10 + - include/linux/mm_inline.h | 19 +- - include/linux/mmzone.h | 124 +++- - lib/maple_tree.c | 149 ++-- - mm/fadvise.c | 5 +- - mm/memcontrol.c | 12 + - mm/memory.c | 7 +- - mm/page_alloc.c | 1 + - mm/rmap.c | 42 +- - mm/vmscan.c | 1083 
++++++++++++++++++----------- - mm/workingset.c | 4 +- - tools/testing/radix-tree/maple.c | 18 +- - 15 files changed, 1066 insertions(+), 544 deletions(-) + Documentation/mm/multigen_lru.rst | 44 +++++++++++++++++++++++--- + include/linux/mmzone.h | 2 +- + lib/maple_tree.c | 51 ++++++------------------------- + mm/vmscan.c | 24 ++++++--------- + tools/testing/radix-tree/maple.c | 24 +++++++++++++++ + 5 files changed, 84 insertions(+), 61 deletions(-) diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst -index d7062c6a8946..52ed5092022f 100644 +index 5f1f6ecbb79b..52ed5092022f 100644 --- a/Documentation/mm/multigen_lru.rst +++ b/Documentation/mm/multigen_lru.rst -@@ -89,21 +89,22 @@ variables are monotonically increasing. - - Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)`` - bits in order to fit into the gen counter in ``folio->flags``. Each --truncated generation number is an index to ``lrugen->lists[]``. The -+truncated generation number is an index to ``lrugen->folios[]``. The - sliding window technique is used to track at least ``MIN_NR_GENS`` and - at most ``MAX_NR_GENS`` generations. The gen counter stores a value - within ``[1, MAX_NR_GENS]`` while a page is on one of --``lrugen->lists[]``; otherwise it stores zero. -+``lrugen->folios[]``; otherwise it stores zero. - - Each generation is divided into multiple tiers. A page accessed ``N`` - times through file descriptors is in tier ``order_base_2(N)``. Unlike --generations, tiers do not have dedicated ``lrugen->lists[]``. In -+generations, tiers do not have dedicated ``lrugen->folios[]``. In - contrast to moving across generations, which requires the LRU lock, - moving across tiers only involves atomic operations on +@@ -103,7 +103,8 @@ moving across tiers only involves atomic operations on ``folio->flags`` and therefore has a negligible cost. A feedback loop modeled after the PID controller monitors refaults over all the tiers from anon and file types and decides which tiers from which types to @@ -28819,34 +10156,10 @@ index d7062c6a8946..52ed5092022f 100644 There are two conceptually independent procedures: the aging and the eviction. They form a closed-loop system, i.e., the page reclaim. -@@ -127,7 +128,7 @@ page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``. - Eviction - -------- - The eviction consumes old generations. Given an ``lruvec``, it --increments ``min_seq`` when ``lrugen->lists[]`` indexed by -+increments ``min_seq`` when ``lrugen->folios[]`` indexed by - ``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to - evict from, it first compares ``min_seq[]`` to select the older type. - If both types are equally old, it selects the one whose first tier has -@@ -141,15 +142,124 @@ loop has detected outlying refaults from the tier this page is in. To - this end, the feedback loop uses the first tier as the baseline, for - the reason stated earlier. +@@ -156,6 +157,27 @@ This time-based approach has the following advantages: + and memory sizes. + 2. It is more reliable because it is directly wired to the OOM killer. -+Working set protection -+---------------------- -+Each generation is timestamped at birth. If ``lru_gen_min_ttl`` is -+set, an ``lruvec`` is protected from the eviction when its oldest -+generation was born within ``lru_gen_min_ttl`` milliseconds. In other -+words, it prevents the working set of ``lru_gen_min_ttl`` milliseconds -+from getting evicted. The OOM killer is triggered if this working set -+cannot be kept in memory. 
-+ -+This time-based approach has the following advantages: -+ -+1. It is easier to configure because it is agnostic to applications -+ and memory sizes. -+2. It is more reliable because it is directly wired to the OOM killer. -+ +``mm_struct`` list +------------------ +An ``mm_struct`` list is maintained for each memcg, and an @@ -28868,36 +10181,22 @@ index d7062c6a8946..52ed5092022f 100644 +context switches so that page table walkers can skip processes that +have been sleeping since the last iteration. + -+Rmap/PT walk feedback -+--------------------- -+Searching the rmap for PTEs mapping each page on an LRU list (to test -+and clear the accessed bit) can be expensive because pages from -+different VMAs (PA space) are not cache friendly to the rmap (VA -+space). For workloads mostly using mapped pages, searching the rmap -+can incur the highest CPU cost in the reclaim path. -+ -+``lru_gen_look_around()`` exploits spatial locality to reduce the -+trips into the rmap. It scans the adjacent PTEs of a young PTE and -+promotes hot pages. If the scan was done cacheline efficiently, it -+adds the PMD entry pointing to the PTE table to the Bloom filter. This -+forms a feedback loop between the eviction and the aging. -+ + Rmap/PT walk feedback + --------------------- + Searching the rmap for PTEs mapping each page on an LRU list (to test +@@ -170,7 +192,7 @@ promotes hot pages. If the scan was done cacheline efficiently, it + adds the PMD entry pointing to the PTE table to the Bloom filter. This + forms a feedback loop between the eviction and the aging. + +-Bloom Filters +Bloom filters -+------------- -+Bloom filters are a space and memory efficient data structure for set -+membership test, i.e., test if an element is not in the set or may be -+in the set. -+ -+In the eviction path, specifically, in ``lru_gen_look_around()``, if a -+PMD has a sufficient number of hot pages, its address is placed in the -+filter. In the aging path, set membership means that the PTE range -+will be scanned for young pages. -+ -+Note that Bloom filters are probabilistic on set membership. If a test -+is false positive, the cost is an additional scan of a range of PTEs, -+which may yield hot pages anyway. Parameters of the filter itself can -+control the false positive rate in the limit. -+ + ------------- + Bloom filters are a space and memory efficient data structure for set + membership test, i.e., test if an element is not in the set or may be +@@ -186,6 +208,18 @@ is false positive, the cost is an additional scan of a range of PTEs, + which may yield hot pages anyway. Parameters of the filter itself can + control the false positive rate in the limit. + +PID controller +-------------- +A feedback loop modeled after the Proportional-Integral-Derivative @@ -28910,41 +10209,10 @@ index d7062c6a8946..52ed5092022f 100644 +varying memory pressure. It calculates a moving average for each new +generation to avoid being permanently locked in a suboptimal state. + -+Memcg LRU -+--------- -+An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs, -+since each node and memcg combination has an LRU of folios (see -+``mem_cgroup_lruvec()``). Its goal is to improve the scalability of -+global reclaim, which is critical to system-wide memory overcommit in -+data centers. Note that memcg LRU only applies to global reclaim. -+ -+The basic structure of an memcg LRU can be understood by an analogy to -+the active/inactive LRU (of folios): -+ -+1. 
It has the young and the old (generations), i.e., the counterparts -+ to the active and the inactive; -+2. The increment of ``max_seq`` triggers promotion, i.e., the -+ counterpart to activation; -+3. Other events trigger similar operations, e.g., offlining an memcg -+ triggers demotion, i.e., the counterpart to deactivation. -+ -+In terms of global reclaim, it has two distinct features: -+ -+1. Sharding, which allows each thread to start at a random memcg (in -+ the old generation) and improves parallelism; -+2. Eventual fairness, which allows direct reclaim to bail out at will -+ and reduces latency without affecting fairness over some time. -+ -+In terms of traversing memcgs during global reclaim, it improves the -+best-case complexity from O(n) to O(1) and does not affect the -+worst-case complexity O(n). Therefore, on average, it has a sublinear -+complexity. -+ - Summary - ------- --The multi-gen LRU can be disassembled into the following parts: -+The multi-gen LRU (of folios) can be disassembled into the following -+parts: + Memcg LRU + --------- + An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs, +@@ -223,9 +257,9 @@ parts: * Generations * Rmap walks @@ -28957,696 +10225,105 @@ index d7062c6a8946..52ed5092022f 100644 The aging and the eviction form a producer-consumer model; specifically, the latter drives the former by the sliding window over -diff --git a/include/linux/fs.h b/include/linux/fs.h -index c1769a2c5d70..d353c262d669 100644 ---- a/include/linux/fs.h -+++ b/include/linux/fs.h -@@ -166,6 +166,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, - /* File supports DIRECT IO */ - #define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000) - -+#define FMODE_NOREUSE ((__force fmode_t)0x800000) -+ - /* File was opened by fanotify and shouldn't generate fanotify events */ - #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) - -diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h -index e594db58a0f1..815a27661517 100644 ---- a/include/linux/maple_tree.h -+++ b/include/linux/maple_tree.h -@@ -12,7 +12,6 @@ - #include - #include - /* #define CONFIG_MAPLE_RCU_DISABLED */ --/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */ - - /* - * Allocated nodes are mutable until they have been inserted into the tree, -@@ -483,9 +482,6 @@ static inline bool mas_is_paused(struct ma_state *mas) - return mas->node == MAS_PAUSE; - } - --void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas); --void mas_dup_store(struct ma_state *mas, void *entry); -- - /* - * This finds an empty area from the highest address to the lowest. - * AKA "Topdown" version, -@@ -517,7 +513,6 @@ static inline void mas_reset(struct ma_state *mas) - * entry. - * - * Note: may return the zero entry. 
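For readers unfamiliar with the maple tree API, a minimal usage sketch of the mas_for_each() iterator documented here; illustrative only, the tree "mt" and its entries are assumed to exist:

	/* Walk every entry stored in "mt" under the RCU read lock. */
	static void dump_tree(struct maple_tree *mt)
	{
		MA_STATE(mas, mt, 0, 0);
		void *entry;

		rcu_read_lock();
		mas_for_each(&mas, entry, ULONG_MAX)
			pr_info("[%lu, %lu] -> %p\n", mas.index, mas.last, entry);
		rcu_read_unlock();
	}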
-- * - */ - #define mas_for_each(__mas, __entry, __max) \ - while (((__entry) = mas_find((__mas), (__max))) != NULL) -@@ -639,7 +634,6 @@ static inline void mt_set_in_rcu(struct maple_tree *mt) - } - - static inline unsigned int mt_height(const struct maple_tree *mt) -- - { - return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; - } -diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h -index 85dc9b88ea37..8e0be0680005 100644 ---- a/include/linux/memcontrol.h -+++ b/include/linux/memcontrol.h -@@ -794,6 +794,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) - percpu_ref_put(&objcg->refcnt); - } - -+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) -+{ -+ return !memcg || css_tryget(&memcg->css); -+} -+ - static inline void mem_cgroup_put(struct mem_cgroup *memcg) - { - if (memcg) -@@ -1301,6 +1306,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) - { - } - -+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) -+{ -+ return true; -+} -+ - static inline void mem_cgroup_put(struct mem_cgroup *memcg) - { - } -diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h -index ff3f3f23f649..de1e622dd366 100644 ---- a/include/linux/mm_inline.h -+++ b/include/linux/mm_inline.h -@@ -178,7 +178,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli - int zone = folio_zonenum(folio); - int delta = folio_nr_pages(folio); - enum lru_list lru = type * LRU_INACTIVE_FILE; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); - VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); -@@ -224,7 +224,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, - int gen = folio_lru_gen(folio); - int type = folio_is_file_lru(folio); - int zone = folio_zonenum(folio); -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); - -@@ -256,9 +256,9 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, - lru_gen_update_size(lruvec, folio, -1, gen); - /* for folio_rotate_reclaimable() */ - if (reclaiming) -- list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]); -+ list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); - else -- list_add(&folio->lru, &lrugen->lists[gen][type][zone]); -+ list_add(&folio->lru, &lrugen->folios[gen][type][zone]); - - return true; - } -@@ -577,4 +577,15 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, - #endif - } - -+static inline bool vma_has_recency(struct vm_area_struct *vma) -+{ -+ if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) -+ return false; -+ -+ if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) -+ return false; -+ -+ return true; -+} -+ - #endif diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index cd28a100d9e4..70bd7f55bdd2 100644 +index 9fb1b03b83b2..bf8786d45b31 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h -@@ -7,6 +7,7 @@ - - #include - #include -+#include - #include - #include - #include -@@ -312,7 +313,7 @@ enum lruvec_flags { - * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An - * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the - * corresponding generation. The gen counter in folio->flags stores gen+1 while -- * a page is on one of lrugen->lists[]. 
Otherwise it stores 0. -+ * a page is on one of lrugen->folios[]. Otherwise it stores 0. - * - * A page is added to the youngest generation on faulting. The aging needs to - * check the accessed bit at least twice before handing this page over to the -@@ -324,8 +325,8 @@ enum lruvec_flags { - * rest of generations, if they exist, are considered inactive. See - * lru_gen_is_active(). - * -- * PG_active is always cleared while a page is on one of lrugen->lists[] so that -- * the aging needs not to worry about it. And it's set again when a page -+ * PG_active is always cleared while a page is on one of lrugen->folios[] so -+ * that the aging needs not to worry about it. And it's set again when a page - * considered active is isolated for non-reclaiming purposes, e.g., migration. - * See lru_gen_add_folio() and lru_gen_del_folio(). - * -@@ -404,7 +405,7 @@ enum { - * The number of pages in each generation is eventually consistent and therefore - * can be transiently negative when reset_batch_size() is pending. - */ --struct lru_gen_struct { -+struct lru_gen_folio { - /* the aging increments the youngest generation number */ - unsigned long max_seq; - /* the eviction increments the oldest generation numbers */ -@@ -412,7 +413,7 @@ struct lru_gen_struct { - /* the birth time of each generation in jiffies */ - unsigned long timestamps[MAX_NR_GENS]; - /* the multi-gen LRU lists, lazily sorted on eviction */ -- struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+ struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; - /* the multi-gen LRU sizes, eventually consistent */ - long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; - /* the exponential moving average of refaulted */ -@@ -426,6 +427,14 @@ struct lru_gen_struct { - atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; - /* whether the multi-gen LRU is enabled */ - bool enabled; -+#ifdef CONFIG_MEMCG -+ /* the memcg generation this lru_gen_folio belongs to */ -+ u8 gen; -+ /* the list segment this lru_gen_folio belongs to */ -+ u8 seg; -+ /* per-node lru_gen_folio list for global reclaim */ -+ struct hlist_nulls_node list; -+#endif - }; - - enum { -@@ -461,7 +470,7 @@ struct lru_gen_mm_state { - struct lru_gen_mm_walk { - /* the lruvec under reclaim */ - struct lruvec *lruvec; -- /* unstable max_seq from lru_gen_struct */ -+ /* unstable max_seq from lru_gen_folio */ - unsigned long max_seq; - /* the next address within an mm to scan */ - unsigned long next_addr; -@@ -479,12 +488,87 @@ void lru_gen_init_lruvec(struct lruvec *lruvec); - void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); - - #ifdef CONFIG_MEMCG -+ -+/* -+ * For each node, memcgs are divided into two generations: the old and the -+ * young. For each generation, memcgs are randomly sharded into multiple bins -+ * to improve scalability. For each bin, the hlist_nulls is virtually divided -+ * into three segments: the head, the tail and the default. -+ * -+ * An onlining memcg is added to the tail of a random bin in the old generation. -+ * The eviction starts at the head of a random bin in the old generation. The -+ * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes -+ * the old generation, is incremented when all its bins become empty. -+ * -+ * There are four operations: -+ * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its -+ * current generation (old or young) and updates its "seg" to "head"; -+ * 2. 
MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its -+ * current generation (old or young) and updates its "seg" to "tail"; -+ * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old -+ * generation, updates its "gen" to "old" and resets its "seg" to "default"; -+ * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the -+ * young generation, updates its "gen" to "young" and resets its "seg" to -+ * "default". -+ * -+ * The events that trigger the above operations are: -+ * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; -+ * 2. The first attempt to reclaim an memcg below low, which triggers -+ * MEMCG_LRU_TAIL; -+ * 3. The first attempt to reclaim an memcg below reclaimable size threshold, -+ * which triggers MEMCG_LRU_TAIL; -+ * 4. The second attempt to reclaim an memcg below reclaimable size threshold, -+ * which triggers MEMCG_LRU_YOUNG; -+ * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; -+ * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; -+ * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. -+ * -+ * Note that memcg LRU only applies to global reclaim, and the round-robin -+ * incrementing of their max_seq counters ensures the eventual fairness to all -+ * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). -+ */ -+#define MEMCG_NR_GENS 2 -+#define MEMCG_NR_BINS 8 -+ -+struct lru_gen_memcg { -+ /* the per-node memcg generation counter */ -+ unsigned long seq; -+ /* each memcg has one lru_gen_folio per node */ -+ unsigned long nr_memcgs[MEMCG_NR_GENS]; -+ /* per-node lru_gen_folio list for global reclaim */ -+ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; -+ /* protects the above */ -+ spinlock_t lock; -+}; -+ -+void lru_gen_init_pgdat(struct pglist_data *pgdat); -+ - void lru_gen_init_memcg(struct mem_cgroup *memcg); - void lru_gen_exit_memcg(struct mem_cgroup *memcg); --#endif -+void lru_gen_online_memcg(struct mem_cgroup *memcg); -+void lru_gen_offline_memcg(struct mem_cgroup *memcg); -+void lru_gen_release_memcg(struct mem_cgroup *memcg); -+void lru_gen_soft_reclaim(struct lruvec *lruvec); -+ -+#else /* !CONFIG_MEMCG */ -+ -+#define MEMCG_NR_GENS 1 -+ -+struct lru_gen_memcg { -+}; -+ -+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) -+{ -+} -+ -+#endif /* CONFIG_MEMCG */ - - #else /* !CONFIG_LRU_GEN */ - -+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) -+{ -+} -+ - static inline void lru_gen_init_lruvec(struct lruvec *lruvec) - { - } -@@ -494,6 +578,7 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - } - - #ifdef CONFIG_MEMCG -+ - static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) - { - } -@@ -501,7 +586,24 @@ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) - static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) - { - } --#endif -+ -+static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) -+{ -+} -+ -+static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) -+{ -+} -+ -+static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) -+{ -+} -+ -+static inline void lru_gen_soft_reclaim(struct lruvec *lruvec) -+{ -+} -+ -+#endif /* CONFIG_MEMCG */ - - #endif /* CONFIG_LRU_GEN */ - -@@ -524,7 +626,7 @@ struct lruvec { - unsigned long flags; - #ifdef CONFIG_LRU_GEN - /* evictable pages divided into generations */ -- struct lru_gen_struct lrugen; -+ struct lru_gen_folio lrugen; - /* to 
concurrently iterate lru_gen_mm_list */ - struct lru_gen_mm_state mm_state; - #endif -@@ -1242,7 +1344,9 @@ typedef struct pglist_data { +@@ -1369,7 +1369,7 @@ typedef struct pglist_data { #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ - struct lru_gen_mm_walk mm_walk; + struct lru_gen_mm_walk mm_walk; -+ /* lru_gen_folio list */ -+ struct lru_gen_memcg memcg_lru; + /* lru_gen_folio list */ + struct lru_gen_memcg memcg_lru; #endif - - CACHELINE_PADDING(_pad2_); diff --git a/lib/maple_tree.c b/lib/maple_tree.c -index 5a976393c9ae..a73f83d0eb0e 100644 +index db60edb55f2f..4df6a0ce1c1b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c -@@ -146,16 +146,22 @@ struct maple_subtree_state { - struct maple_big_node *bn; - }; - -+#ifdef CONFIG_KASAN_STACK -+/* Prevent mas_wr_bnode() from exceeding the stack frame limit */ -+#define noinline_for_kasan noinline_for_stack -+#else -+#define noinline_for_kasan inline -+#endif -+ - /* Functions */ - static inline struct maple_node *mt_alloc_one(gfp_t gfp) - { -- return kmem_cache_alloc(maple_node_cache, gfp | __GFP_ZERO); -+ return kmem_cache_alloc(maple_node_cache, gfp); - } - - static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) - { -- return kmem_cache_alloc_bulk(maple_node_cache, gfp | __GFP_ZERO, size, -- nodes); -+ return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); - } - - static inline void mt_free_bulk(size_t size, void __rcu **nodes) -@@ -183,7 +189,6 @@ static void ma_free_rcu(struct maple_node *node) - call_rcu(&node->rcu, mt_free_rcu); - } - -- - static void mas_set_height(struct ma_state *mas) - { - unsigned int new_flags = mas->tree->ma_flags; -@@ -468,7 +473,7 @@ static inline - void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, - unsigned char slot) - { -- unsigned long val = (unsigned long) parent; -+ unsigned long val = (unsigned long)parent; - unsigned long shift; - unsigned long type; - enum maple_type p_type = mte_node_type(parent); -@@ -502,10 +507,9 @@ void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, - */ - static inline unsigned int mte_parent_slot(const struct maple_enode *enode) - { -- unsigned long val = (unsigned long) mte_to_node(enode)->parent; -+ unsigned long val = (unsigned long)mte_to_node(enode)->parent; - -- /* Root. */ -- if (val & 1) -+ if (val & MA_ROOT_PARENT) - return 0; - - /* -@@ -1128,9 +1132,10 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) - { - struct maple_alloc *ret, *node = mas->alloc; - unsigned long total = mas_allocated(mas); -+ unsigned int req = mas_alloc_req(mas); - - /* nothing or a request pending. */ -- if (unlikely(!total)) -+ if (WARN_ON(!total)) - return NULL; - - if (total == 1) { -@@ -1140,27 +1145,25 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) - goto single_node; - } - -- if (!node->node_count) { -+ if (node->node_count == 1) { - /* Single allocation in this node. 
*/ - mas->alloc = node->slot[0]; -- node->slot[0] = NULL; - mas->alloc->total = node->total - 1; - ret = node; - goto new_head; - } -- - node->total--; -- ret = node->slot[node->node_count]; -- node->slot[node->node_count--] = NULL; -+ ret = node->slot[--node->node_count]; -+ node->slot[node->node_count] = NULL; - - single_node: - new_head: -- ret->total = 0; -- ret->node_count = 0; -- if (ret->request_count) { -- mas_set_alloc_req(mas, ret->request_count + 1); -- ret->request_count = 0; -+ if (req) { -+ req++; -+ mas_set_alloc_req(mas, req); - } -+ -+ memset(ret, 0, sizeof(*ret)); - return (struct maple_node *)ret; - } - -@@ -1179,21 +1182,20 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) - unsigned long count; - unsigned int requested = mas_alloc_req(mas); - -- memset(reuse, 0, sizeof(*reuse)); - count = mas_allocated(mas); - -- if (count && (head->node_count < MAPLE_ALLOC_SLOTS - 1)) { -- if (head->slot[0]) -- head->node_count++; -- head->slot[head->node_count] = reuse; -+ reuse->request_count = 0; -+ reuse->node_count = 0; -+ if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) { -+ head->slot[head->node_count++] = reuse; - head->total++; - goto done; - } - - reuse->total = 1; - if ((head) && !((unsigned long)head & 0x1)) { -- head->request_count = 0; - reuse->slot[0] = head; -+ reuse->node_count = 1; - reuse->total += head->total; - } - -@@ -1212,7 +1214,6 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) - { - struct maple_alloc *node; - unsigned long allocated = mas_allocated(mas); -- unsigned long success = allocated; - unsigned int requested = mas_alloc_req(mas); - unsigned int count; - void **slots = NULL; -@@ -1228,24 +1229,29 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) - WARN_ON(!allocated); - } - -- if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS - 1) { -+ if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { - node = (struct maple_alloc *)mt_alloc_one(gfp); - if (!node) - goto nomem_one; - -- if (allocated) -+ if (allocated) { - node->slot[0] = mas->alloc; -+ node->node_count = 1; -+ } else { -+ node->node_count = 0; -+ } - -- success++; - mas->alloc = node; -+ node->total = ++allocated; - requested--; - } - +@@ -1303,26 +1303,18 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) node = mas->alloc; -+ node->request_count = 0; + node->request_count = 0; while (requested) { - max_req = MAPLE_ALLOC_SLOTS; -- if (node->slot[0]) { -- unsigned int offset = node->node_count + 1; -+ if (node->node_count) { -+ unsigned int offset = node->node_count; - - slots = (void **)&node->slot[offset]; - max_req -= offset; -@@ -1259,15 +1265,13 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) +- max_req = MAPLE_ALLOC_SLOTS; +- if (node->node_count) { +- unsigned int offset = node->node_count; +- +- slots = (void **)&node->slot[offset]; +- max_req -= offset; +- } else { +- slots = (void **)&node->slot; +- } +- ++ max_req = MAPLE_ALLOC_SLOTS - node->node_count; ++ slots = (void **)&node->slot[node->node_count]; + max_req = min(requested, max_req); + count = mt_alloc_bulk(gfp, max_req, slots); + if (!count) goto nomem_bulk; ++ if (node->node_count == 0) ++ node->slot[0]->node_count = 0; node->node_count += count; -- /* zero indexed. 
*/ -- if (slots == (void **)&node->slot) -- node->node_count--; -- -- success += count; -+ allocated += count; + allocated += count; node = node->slot[0]; -+ node->node_count = 0; -+ node->request_count = 0; +- node->node_count = 0; +- node->request_count = 0; requested -= count; } -- mas->alloc->total = success; -+ mas->alloc->total = allocated; - return; + mas->alloc->total = allocated; +@@ -2317,9 +2309,7 @@ static inline struct maple_enode *mte_node_or_none(struct maple_enode *enode) + static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) + { + struct ma_state *mas = wr_mas->mas; +- unsigned char count; +- unsigned char offset; +- unsigned long index, min, max; ++ unsigned char count, offset; - nomem_bulk: -@@ -1276,10 +1280,8 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) - nomem_one: - mas_set_alloc_req(mas, requested); - if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) -- mas->alloc->total = success; -+ mas->alloc->total = allocated; - mas_set_err(mas, -ENOMEM); -- return; + if (unlikely(ma_is_dense(wr_mas->type))) { + wr_mas->r_max = wr_mas->r_min = mas->index; +@@ -2332,34 +2322,12 @@ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) + count = wr_mas->node_end = ma_data_end(wr_mas->node, wr_mas->type, + wr_mas->pivots, mas->max); + offset = mas->offset; +- min = mas_safe_min(mas, wr_mas->pivots, offset); +- if (unlikely(offset == count)) +- goto max; - +- max = wr_mas->pivots[offset]; +- index = mas->index; +- if (unlikely(index <= max)) +- goto done; +- +- if (unlikely(!max && offset)) +- goto max; + +- min = max + 1; +- while (++offset < count) { +- max = wr_mas->pivots[offset]; +- if (index <= max) +- goto done; +- else if (unlikely(!max)) +- break; ++ while (offset < count && mas->index > wr_mas->pivots[offset]) ++ offset++; + +- min = max + 1; +- } +- +-max: +- max = mas->max; +-done: +- wr_mas->r_max = max; +- wr_mas->r_min = min; ++ wr_mas->r_max = offset < count ? 
wr_mas->pivots[offset] : mas->max; ++ wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, offset); + wr_mas->offset_end = mas->offset = offset; } - /* -@@ -1887,10 +1889,9 @@ static inline int mab_calc_split(struct ma_state *mas, - - /* Avoid ending a node on a NULL entry */ - split = mab_no_null_split(bn, split, slot_count); -- if (!(*mid_split)) -- return split; - -- *mid_split = mab_no_null_split(bn, *mid_split, slot_count); -+ if (unlikely(*mid_split)) -+ *mid_split = mab_no_null_split(bn, *mid_split, slot_count); - - return split; - } -@@ -2113,7 +2114,7 @@ static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end, - * - * Return: The actual end of the data stored in @b_node - */ --static inline void mas_store_b_node(struct ma_wr_state *wr_mas, -+static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, - struct maple_big_node *b_node, unsigned char offset_end) - { - unsigned char slot; -@@ -2947,7 +2948,7 @@ static inline void *mtree_range_walk(struct ma_state *mas) - mas->min = prev_min; - mas->max = prev_max; - mas->node = last; -- return (void *) next; -+ return (void *)next; - - dead_node: - mas_reset(mas); -@@ -3467,7 +3468,6 @@ static inline bool mas_push_data(struct ma_state *mas, int height, - */ - static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) - { -- - struct maple_subtree_state mast; - int height = 0; - unsigned char mid_split, split = 0; -@@ -3586,7 +3586,7 @@ static inline bool mas_reuse_node(struct ma_wr_state *wr_mas, - * @b_node: The maple big node - * @end: The end of the data. - */ --static inline int mas_commit_b_node(struct ma_wr_state *wr_mas, -+static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, - struct maple_big_node *b_node, unsigned char end) - { - struct maple_node *node; -@@ -3893,7 +3893,7 @@ static inline void *mtree_lookup_walk(struct ma_state *mas) - goto dead_node; - } while (!ma_is_leaf(type)); - -- return (void *) next; -+ return (void *)next; - - dead_node: - mas_reset(mas); -@@ -4711,15 +4711,11 @@ static inline void *mas_next_nentry(struct ma_state *mas, - - static inline void mas_rewalk(struct ma_state *mas, unsigned long index) - { -- - retry: - mas_set(mas, index); - mas_state_walk(mas); - if (mas_is_start(mas)) - goto retry; -- -- return; -- - } - - /* -@@ -5093,35 +5089,21 @@ static inline bool mas_rewind_node(struct ma_state *mas) - */ - static inline bool mas_skip_node(struct ma_state *mas) - { -- unsigned char slot, slot_count; -- unsigned long *pivots; -- enum maple_type mt; -+ if (mas_is_err(mas)) -+ return false; - -- mt = mte_node_type(mas->node); -- slot_count = mt_slots[mt] - 1; - do { - if (mte_is_root(mas->node)) { -- slot = mas->offset; -- if (slot > slot_count) { -+ if (mas->offset >= mas_data_end(mas)) { - mas_set_err(mas, -EBUSY); - return false; - } - } else { - mas_ascend(mas); -- slot = mas->offset; -- mt = mte_node_type(mas->node); -- slot_count = mt_slots[mt] - 1; - } -- } while (slot > slot_count); -- -- mas->offset = ++slot; -- pivots = ma_pivots(mas_mn(mas), mt); -- if (slot > 0) -- mas->min = pivots[slot - 1] + 1; -- -- if (slot <= slot_count) -- mas->max = pivots[slot]; -+ } while (mas->offset >= mas_data_end(mas)); - -+ mas->offset++; - return true; - } - -@@ -5590,8 +5572,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, - - /* - * mte_destroy_walk() - Free a tree or sub-tree. -- * @enode - the encoded maple node (maple_enode) to start -- * @mn - the tree to free - needed for node types. 
-+ * @enode: the encoded maple node (maple_enode) to start -+ * @mt: the tree to free - needed for node types. - * - * Must hold the write lock. - */ -@@ -5620,7 +5602,6 @@ static void mas_wr_store_setup(struct ma_wr_state *wr_mas) - mas_reset(wr_mas->mas); - } - } -- - } - - /* Interface */ -@@ -5733,6 +5714,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) +@@ -5819,6 +5787,7 @@ int mas_preallocate(struct ma_state *mas, gfp_t gfp) mas_reset(mas); return ret; } @@ -29654,519 +10331,11 @@ index 5a976393c9ae..a73f83d0eb0e 100644 /* * mas_destroy() - destroy a maple state. -@@ -5745,6 +5727,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) - void mas_destroy(struct ma_state *mas) - { - struct maple_alloc *node; -+ unsigned long total; - - /* - * When using mas_for_each() to insert an expected number of elements, -@@ -5767,14 +5750,20 @@ void mas_destroy(struct ma_state *mas) - } - mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); - -- while (mas->alloc && !((unsigned long)mas->alloc & 0x1)) { -+ total = mas_allocated(mas); -+ while (total) { - node = mas->alloc; - mas->alloc = node->slot[0]; -- if (node->node_count > 0) -- mt_free_bulk(node->node_count, -- (void __rcu **)&node->slot[1]); -+ if (node->node_count > 1) { -+ size_t count = node->node_count - 1; -+ -+ mt_free_bulk(count, (void __rcu **)&node->slot[1]); -+ total -= count; -+ } - kmem_cache_free(maple_node_cache, node); -+ total--; - } -+ - mas->alloc = NULL; - } - EXPORT_SYMBOL_GPL(mas_destroy); -@@ -6734,7 +6723,7 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, - - if (i < (MAPLE_RANGE64_SLOTS - 1)) - last = node->pivot[i]; -- else if (!node->slot[i] && max != mt_max[mte_node_type(entry)]) -+ else if (!node->slot[i] && max != mt_node_max(entry)) - break; - if (last == 0 && i > 0) - break; -@@ -6841,7 +6830,7 @@ void mt_dump(const struct maple_tree *mt) - if (!xa_is_node(entry)) - mt_dump_entry(entry, 0, 0, 0); - else if (entry) -- mt_dump_node(mt, entry, 0, mt_max[mte_node_type(entry)], 0); -+ mt_dump_node(mt, entry, 0, mt_node_max(entry), 0); - } - EXPORT_SYMBOL_GPL(mt_dump); - -diff --git a/mm/fadvise.c b/mm/fadvise.c -index bf04fec87f35..fb7c5f43fd2a 100644 ---- a/mm/fadvise.c -+++ b/mm/fadvise.c -@@ -80,7 +80,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) - case POSIX_FADV_NORMAL: - file->f_ra.ra_pages = bdi->ra_pages; - spin_lock(&file->f_lock); -- file->f_mode &= ~FMODE_RANDOM; -+ file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE); - spin_unlock(&file->f_lock); - break; - case POSIX_FADV_RANDOM: -@@ -107,6 +107,9 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) - force_page_cache_readahead(mapping, file, start_index, nrpages); - break; - case POSIX_FADV_NOREUSE: -+ spin_lock(&file->f_lock); -+ file->f_mode |= FMODE_NOREUSE; -+ spin_unlock(&file->f_lock); - break; - case POSIX_FADV_DONTNEED: - __filemap_fdatawrite_range(mapping, offset, endbyte, -diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index 2eee092f8f11..802d3868d097 100644 ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c -@@ -477,6 +477,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) - struct mem_cgroup_per_node *mz; - struct mem_cgroup_tree_per_node *mctz; - -+ if (lru_gen_enabled()) { -+ if (soft_limit_excess(memcg)) -+ lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec); -+ return; -+ } -+ - mctz = soft_limit_tree.rb_tree_per_node[nid]; - if (!mctz) - return; -@@ -3526,6 +3532,9 @@ unsigned long 
mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, - struct mem_cgroup_tree_per_node *mctz; - unsigned long excess; - -+ if (lru_gen_enabled()) -+ return 0; -+ - if (order > 0) - return 0; - -@@ -5386,6 +5395,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) - if (unlikely(mem_cgroup_is_root(memcg))) - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, - 2UL*HZ); -+ lru_gen_online_memcg(memcg); - return 0; - offline_kmem: - memcg_offline_kmem(memcg); -@@ -5417,6 +5427,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) - memcg_offline_kmem(memcg); - reparent_shrinker_deferred(memcg); - wb_memcg_offline(memcg); -+ lru_gen_offline_memcg(memcg); - - drain_all_stock(memcg); - -@@ -5428,6 +5439,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - - invalidate_reclaim_iterators(memcg); -+ lru_gen_release_memcg(memcg); - } - - static void mem_cgroup_css_free(struct cgroup_subsys_state *css) -diff --git a/mm/memory.c b/mm/memory.c -index f526b9152bef..4ad62eba3cb7 100644 ---- a/mm/memory.c -+++ b/mm/memory.c -@@ -1392,8 +1392,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, - force_flush = 1; - } - } -- if (pte_young(ptent) && -- likely(!(vma->vm_flags & VM_SEQ_READ))) -+ if (pte_young(ptent) && likely(vma_has_recency(vma))) - mark_page_accessed(page); - } - rss[mm_counter(page)]--; -@@ -5140,8 +5139,8 @@ static inline void mm_account_fault(struct pt_regs *regs, - #ifdef CONFIG_LRU_GEN - static void lru_gen_enter_fault(struct vm_area_struct *vma) - { -- /* the LRU algorithm doesn't apply to sequential or random reads */ -- current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)); -+ /* the LRU algorithm only applies to accesses with recency */ -+ current->in_lru_fault = vma_has_recency(vma); - } - - static void lru_gen_exit_fault(void) -diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 3aec9a6a9cb7..6658cbf43f5d 100644 ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -7943,6 +7943,7 @@ static void __init free_area_init_node(int nid) - pgdat_set_deferred_range(pgdat); - - free_area_init_core(pgdat); -+ lru_gen_init_pgdat(pgdat); - } - - static void __init free_area_init_memoryless_node(int nid) -diff --git a/mm/rmap.c b/mm/rmap.c -index 3b45d049069e..c8701608bb0d 100644 ---- a/mm/rmap.c -+++ b/mm/rmap.c -@@ -823,25 +823,14 @@ static bool folio_referenced_one(struct folio *folio, - } - - if (pvmw.pte) { -- if (lru_gen_enabled() && pte_young(*pvmw.pte) && -- !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) { -+ if (lru_gen_enabled() && pte_young(*pvmw.pte)) { - lru_gen_look_around(&pvmw); - referenced++; - } - - if (ptep_clear_flush_young_notify(vma, address, -- pvmw.pte)) { -- /* -- * Don't treat a reference through -- * a sequentially read mapping as such. -- * If the folio has been used in another mapping, -- * we will catch it; if this other mapping is -- * already gone, the unmap path will have set -- * the referenced flag or activated the folio. 
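The recency changes above are driven from userspace. A hedged illustration of how a one-pass file scanner would opt out of working-set protection after this series: POSIX_FADV_NOREUSE now sets FMODE_NOREUSE on the struct file, and MADV_SEQUENTIAL/MADV_RANDOM still set the VMA flags checked by vma_has_recency(); the helper below is hypothetical and not part of the patch.

	/* Illustrative userspace snippet, not part of the patch. */
	#include <fcntl.h>
	#include <sys/mman.h>

	static void mark_single_pass(int fd, void *map, size_t len)
	{
		(void)posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);	/* whole file */
		(void)madvise(map, len, MADV_SEQUENTIAL);		/* this mapping */
	}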
-- */ -- if (likely(!(vma->vm_flags & VM_SEQ_READ))) -- referenced++; -- } -+ pvmw.pte)) -+ referenced++; - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - if (pmdp_clear_flush_young_notify(vma, address, - pvmw.pmd)) -@@ -875,7 +864,20 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) - struct folio_referenced_arg *pra = arg; - struct mem_cgroup *memcg = pra->memcg; - -- if (!mm_match_cgroup(vma->vm_mm, memcg)) -+ /* -+ * Ignore references from this mapping if it has no recency. If the -+ * folio has been used in another mapping, we will catch it; if this -+ * other mapping is already gone, the unmap path will have set the -+ * referenced flag or activated the folio in zap_pte_range(). -+ */ -+ if (!vma_has_recency(vma)) -+ return true; -+ -+ /* -+ * If we are reclaiming on behalf of a cgroup, skip counting on behalf -+ * of references from different cgroups. -+ */ -+ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) - return true; - - return false; -@@ -906,6 +908,7 @@ int folio_referenced(struct folio *folio, int is_locked, - .arg = (void *)&pra, - .anon_lock = folio_lock_anon_vma_read, - .try_lock = true, -+ .invalid_vma = invalid_folio_referenced_vma, - }; - - *vm_flags = 0; -@@ -921,15 +924,6 @@ int folio_referenced(struct folio *folio, int is_locked, - return 1; - } - -- /* -- * If we are reclaiming on behalf of a cgroup, skip -- * counting on behalf of references from different -- * cgroups -- */ -- if (memcg) { -- rwc.invalid_vma = invalid_folio_referenced_vma; -- } -- - rmap_walk(folio, &rwc); - *vm_flags = pra.vm_flags; - diff --git a/mm/vmscan.c b/mm/vmscan.c -index 160acbbdf111..1a8f3b1c0bad 100644 +index 71a7f4517e5a..8dadd1772661 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c -@@ -55,6 +55,8 @@ - #include - #include - #include -+#include -+#include - - #include - #include -@@ -135,12 +137,6 @@ struct scan_control { - /* Always discard instead of demoting to lower tier memory */ - unsigned int no_demotion:1; - --#ifdef CONFIG_LRU_GEN -- /* help kswapd make better choices among multiple memcgs */ -- unsigned int memcgs_need_aging:1; -- unsigned long last_reclaimed; --#endif -- - /* Allocation order */ - s8 order; - -@@ -453,6 +449,11 @@ static bool cgroup_reclaim(struct scan_control *sc) - return sc->target_mem_cgroup; - } - -+static bool global_reclaim(struct scan_control *sc) -+{ -+ return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); -+} -+ - /** - * writeback_throttling_sane - is the usual dirty throttling mechanism available? 
- * @sc: scan_control in question -@@ -503,6 +504,11 @@ static bool cgroup_reclaim(struct scan_control *sc) - return false; - } - -+static bool global_reclaim(struct scan_control *sc) -+{ -+ return true; -+} -+ - static bool writeback_throttling_sane(struct scan_control *sc) - { - return true; -@@ -3184,6 +3190,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); - for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ - for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) - -+#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) -+#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) -+ - static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) - { - struct pglist_data *pgdat = NODE_DATA(nid); -@@ -3209,6 +3218,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) - struct mem_cgroup *memcg = lruvec_memcg(lruvec); - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - -+ if (!sc->may_swap) -+ return 0; -+ - if (!can_demote(pgdat->node_id, sc) && - mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) - return 0; -@@ -3223,12 +3235,104 @@ static int get_nr_gens(struct lruvec *lruvec, int type) - - static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) - { -- /* see the comment on lru_gen_struct */ -+ /* see the comment on lru_gen_folio */ - return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && - get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && - get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; - } - -+/****************************************************************************** -+ * Bloom filters -+ ******************************************************************************/ -+ -+/* -+ * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when -+ * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of -+ * bits in a bitmap, k is the number of hash functions and n is the number of -+ * inserted items. -+ * -+ * Page table walkers use one of the two filters to reduce their search space. -+ * To get rid of non-leaf entries that no longer have enough leaf entries, the -+ * aging uses the double-buffering technique to flip to the other filter each -+ * time it produces a new generation. For non-leaf entries that have enough -+ * leaf entries, the aging carries them over to the next generation in -+ * walk_pmd_range(); the eviction also report them when walking the rmap -+ * in lru_gen_look_around(). -+ * -+ * For future optimizations: -+ * 1. It's not necessary to keep both filters all the time. The spare one can be -+ * freed after the RCU grace period and reallocated if needed again. -+ * 2. And when reallocating, it's worth scaling its size according to the number -+ * of inserted entries in the other filter, to reduce the memory overhead on -+ * small systems and false positives on large systems. -+ * 3. Jenkins' hash function is an alternative to Knuth's. 
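As a quick sanity check on the rates quoted in this comment, assuming the usual approximation p ~= (1 - e^(-k*n/m))^k: with m = 1 << 15 = 32768 bits and k = 2, n = 10,000 gives (1 - e^(-0.61))^2 ~= 0.21, roughly 1/5, and n = 20,000 gives (1 - e^(-1.22))^2 ~= 0.50, roughly 1/2, matching the figures above.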
-+ */ -+#define BLOOM_FILTER_SHIFT 15 -+ -+static inline int filter_gen_from_seq(unsigned long seq) -+{ -+ return seq % NR_BLOOM_FILTERS; -+} -+ -+static void get_item_key(void *item, int *key) -+{ -+ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); -+ -+ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); -+ -+ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); -+ key[1] = hash >> BLOOM_FILTER_SHIFT; -+} -+ -+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -+{ -+ int key[2]; -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ filter = READ_ONCE(lruvec->mm_state.filters[gen]); -+ if (!filter) -+ return true; -+ -+ get_item_key(item, key); -+ -+ return test_bit(key[0], filter) && test_bit(key[1], filter); -+} -+ -+static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -+{ -+ int key[2]; -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ filter = READ_ONCE(lruvec->mm_state.filters[gen]); -+ if (!filter) -+ return; -+ -+ get_item_key(item, key); -+ -+ if (!test_bit(key[0], filter)) -+ set_bit(key[0], filter); -+ if (!test_bit(key[1], filter)) -+ set_bit(key[1], filter); -+} -+ -+static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) -+{ -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ filter = lruvec->mm_state.filters[gen]; -+ if (filter) { -+ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); -+ return; -+ } -+ -+ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), -+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); -+ WRITE_ONCE(lruvec->mm_state.filters[gen], filter); -+} -+ - /****************************************************************************** - * mm_struct list - ******************************************************************************/ -@@ -3348,94 +3452,6 @@ void lru_gen_migrate_mm(struct mm_struct *mm) - } - #endif - --/* -- * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when -- * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of -- * bits in a bitmap, k is the number of hash functions and n is the number of -- * inserted items. -- * -- * Page table walkers use one of the two filters to reduce their search space. -- * To get rid of non-leaf entries that no longer have enough leaf entries, the -- * aging uses the double-buffering technique to flip to the other filter each -- * time it produces a new generation. For non-leaf entries that have enough -- * leaf entries, the aging carries them over to the next generation in -- * walk_pmd_range(); the eviction also report them when walking the rmap -- * in lru_gen_look_around(). -- * -- * For future optimizations: -- * 1. It's not necessary to keep both filters all the time. The spare one can be -- * freed after the RCU grace period and reallocated if needed again. -- * 2. And when reallocating, it's worth scaling its size according to the number -- * of inserted entries in the other filter, to reduce the memory overhead on -- * small systems and false positives on large systems. -- * 3. Jenkins' hash function is an alternative to Knuth's. 
-- */ --#define BLOOM_FILTER_SHIFT 15 -- --static inline int filter_gen_from_seq(unsigned long seq) --{ -- return seq % NR_BLOOM_FILTERS; --} -- --static void get_item_key(void *item, int *key) --{ -- u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); -- -- BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); -- -- key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); -- key[1] = hash >> BLOOM_FILTER_SHIFT; --} -- --static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) --{ -- unsigned long *filter; -- int gen = filter_gen_from_seq(seq); -- -- filter = lruvec->mm_state.filters[gen]; -- if (filter) { -- bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); -- return; -- } -- -- filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), -- __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); -- WRITE_ONCE(lruvec->mm_state.filters[gen], filter); --} -- --static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) --{ -- int key[2]; -- unsigned long *filter; -- int gen = filter_gen_from_seq(seq); -- -- filter = READ_ONCE(lruvec->mm_state.filters[gen]); -- if (!filter) -- return; -- -- get_item_key(item, key); -- -- if (!test_bit(key[0], filter)) -- set_bit(key[0], filter); -- if (!test_bit(key[1], filter)) -- set_bit(key[1], filter); --} -- --static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) --{ -- int key[2]; -- unsigned long *filter; -- int gen = filter_gen_from_seq(seq); -- -- filter = READ_ONCE(lruvec->mm_state.filters[gen]); -- if (!filter) -- return true; -- -- get_item_key(item, key); -- -- return test_bit(key[0], filter) && test_bit(key[1], filter); --} -- - static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) - { - int i; -@@ -3592,7 +3608,7 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) +@@ -3608,7 +3608,7 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) } /****************************************************************************** @@ -30175,1280 +10344,7 @@ index 160acbbdf111..1a8f3b1c0bad 100644 ******************************************************************************/ /* -@@ -3623,7 +3639,7 @@ struct ctrl_pos { - static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, - struct ctrl_pos *pos) - { -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - int hist = lru_hist_from_seq(lrugen->min_seq[type]); - - pos->refaulted = lrugen->avg_refaulted[type][tier] + -@@ -3638,7 +3654,7 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, - static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) - { - int hist, tier; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; - unsigned long seq = carryover ? 
lrugen->min_seq[type] : lrugen->max_seq + 1; - -@@ -3715,7 +3731,7 @@ static int folio_update_gen(struct folio *folio, int gen) - static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) - { - int type = folio_is_file_lru(folio); -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); - -@@ -3760,7 +3776,7 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, - static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) - { - int gen, type, zone; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - walk->batched = 0; - -@@ -3793,7 +3809,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal - if (is_vm_hugetlb_page(vma)) - return true; - -- if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) -+ if (!vma_has_recency(vma)) -+ return true; -+ -+ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) - return true; - - if (vma == get_gate_vma(vma->vm_mm)) -@@ -3988,8 +4007,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, - } - - #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) --static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, -- struct mm_walk *args, unsigned long *bitmap, unsigned long *start) -+static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, -+ struct mm_walk *args, unsigned long *bitmap, unsigned long *first) - { - int i; - pmd_t *pmd; -@@ -4002,18 +4021,19 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area - VM_WARN_ON_ONCE(pud_leaf(*pud)); - - /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ -- if (*start == -1) { -- *start = next; -+ if (*first == -1) { -+ *first = addr; -+ bitmap_zero(bitmap, MIN_LRU_BATCH); - return; - } - -- i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); -+ i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first); - if (i && i <= MIN_LRU_BATCH) { - __set_bit(i - 1, bitmap); - return; - } - -- pmd = pmd_offset(pud, *start); -+ pmd = pmd_offset(pud, *first); - - ptl = pmd_lockptr(args->mm, pmd); - if (!spin_trylock(ptl)) -@@ -4024,15 +4044,16 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area - do { - unsigned long pfn; - struct folio *folio; -- unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start; -+ -+ /* don't round down the first address */ -+ addr = i ? 
(*first & PMD_MASK) + i * PMD_SIZE : *first; - - pfn = get_pmd_pfn(pmd[i], vma, addr); - if (pfn == -1) - goto next; - - if (!pmd_trans_huge(pmd[i])) { -- if (arch_has_hw_nonleaf_pmd_young() && -- get_cap(LRU_GEN_NONLEAF_YOUNG)) -+ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) - pmdp_test_and_clear_young(vma, addr, pmd + i); - goto next; - } -@@ -4061,12 +4082,11 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area - arch_leave_lazy_mmu_mode(); - spin_unlock(ptl); - done: -- *start = -1; -- bitmap_zero(bitmap, MIN_LRU_BATCH); -+ *first = -1; - } - #else --static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, -- struct mm_walk *args, unsigned long *bitmap, unsigned long *start) -+static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, -+ struct mm_walk *args, unsigned long *bitmap, unsigned long *first) - { - } - #endif -@@ -4079,9 +4099,9 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, - unsigned long next; - unsigned long addr; - struct vm_area_struct *vma; -- unsigned long pos = -1; -+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; -+ unsigned long first = -1; - struct lru_gen_mm_walk *walk = args->private; -- unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; - - VM_WARN_ON_ONCE(pud_leaf(*pud)); - -@@ -4120,18 +4140,17 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, - if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) - continue; - -- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); -+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); - continue; - } - #endif - walk->mm_stats[MM_NONLEAF_TOTAL]++; - -- if (arch_has_hw_nonleaf_pmd_young() && -- get_cap(LRU_GEN_NONLEAF_YOUNG)) { -+ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) { - if (!pmd_young(val)) - continue; - -- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); -+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); - } - - if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) -@@ -4148,7 +4167,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, - update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); - } - -- walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); -+ walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); - - if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) - goto restart; -@@ -4238,7 +4257,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ - } while (err == -EAGAIN); - } - --static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) -+static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc) - { - struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; - -@@ -4246,7 +4265,7 @@ static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) - VM_WARN_ON_ONCE(walk); - - walk = &pgdat->mm_walk; -- } else if (!pgdat && !walk) { -+ } else if (!walk && force_alloc) { - VM_WARN_ON_ONCE(current_is_kswapd()); - - walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); -@@ -4274,7 +4293,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) - { - int zone; - int remaining = MAX_LRU_BATCH; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - int new_gen, 
old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - - if (type == LRU_GEN_ANON && !can_swap) -@@ -4282,7 +4301,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) - - /* prevent cold/hot inversion if force_scan is true */ - for (zone = 0; zone < MAX_NR_ZONES; zone++) { -- struct list_head *head = &lrugen->lists[old_gen][type][zone]; -+ struct list_head *head = &lrugen->folios[old_gen][type][zone]; - - while (!list_empty(head)) { - struct folio *folio = lru_to_folio(head); -@@ -4293,7 +4312,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) - VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); - - new_gen = folio_inc_gen(lruvec, folio, false); -- list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]); -+ list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); - - if (!--remaining) - return false; -@@ -4310,7 +4329,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) - { - int gen, type, zone; - bool success = false; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - DEFINE_MIN_SEQ(lruvec); - - VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); -@@ -4321,7 +4340,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) - gen = lru_gen_from_seq(min_seq[type]); - - for (zone = 0; zone < MAX_NR_ZONES; zone++) { -- if (!list_empty(&lrugen->lists[gen][type][zone])) -+ if (!list_empty(&lrugen->folios[gen][type][zone])) - goto next; - } - -@@ -4331,7 +4350,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) - ; - } - -- /* see the comment on lru_gen_struct */ -+ /* see the comment on lru_gen_folio */ - if (can_swap) { - min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); - min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); -@@ -4353,7 +4372,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) - { - int prev, next; - int type, zone; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - spin_lock_irq(&lruvec->lru_lock); - -@@ -4411,7 +4430,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, - bool success; - struct lru_gen_mm_walk *walk; - struct mm_struct *mm = NULL; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); - -@@ -4427,12 +4446,12 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, - * handful of PTEs. Spreading the work out over a period of time usually - * is less efficient, but it avoids bursty page faults. 
- */ -- if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { -+ if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) { - success = iterate_mm_list_nowalk(lruvec, max_seq); - goto done; - } - -- walk = set_mm_walk(NULL); -+ walk = set_mm_walk(NULL, true); - if (!walk) { - success = iterate_mm_list_nowalk(lruvec, max_seq); - goto done; -@@ -4455,8 +4474,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, - if (sc->priority <= DEF_PRIORITY - 2) - wait_event_killable(lruvec->mm_state.wait, - max_seq < READ_ONCE(lrugen->max_seq)); -- -- return max_seq < READ_ONCE(lrugen->max_seq); -+ return false; - } - - VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); -@@ -4469,97 +4487,56 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, - return true; - } - --static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, -- struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) -+/****************************************************************************** -+ * working set protection -+ ******************************************************************************/ -+ -+static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) - { - int gen, type, zone; -- unsigned long old = 0; -- unsigned long young = 0; - unsigned long total = 0; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ bool can_swap = get_swappiness(lruvec, sc); -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ DEFINE_MAX_SEQ(lruvec); -+ DEFINE_MIN_SEQ(lruvec); - - for (type = !can_swap; type < ANON_AND_FILE; type++) { - unsigned long seq; - - for (seq = min_seq[type]; seq <= max_seq; seq++) { -- unsigned long size = 0; -- - gen = lru_gen_from_seq(seq); - - for (zone = 0; zone < MAX_NR_ZONES; zone++) -- size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); -- -- total += size; -- if (seq == max_seq) -- young += size; -- else if (seq + MIN_NR_GENS == max_seq) -- old += size; -+ total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); - } - } - -- /* try to scrape all its memory if this memcg was deleted */ -- *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; -- -- /* -- * The aging tries to be lazy to reduce the overhead, while the eviction -- * stalls when the number of generations reaches MIN_NR_GENS. Hence, the -- * ideal number of generations is MIN_NR_GENS+1. -- */ -- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) -- return true; -- if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) -- return false; -- -- /* -- * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) -- * of the total number of pages for each generation. A reasonable range -- * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The -- * aging cares about the upper bound of hot pages, while the eviction -- * cares about the lower bound of cold pages. -- */ -- if (young * MIN_NR_GENS > total) -- return true; -- if (old * (MIN_NR_GENS + 2) < total) -- return true; -- -- return false; -+ /* whether the size is big enough to be helpful */ -+ return mem_cgroup_online(memcg) ? 
(total >> sc->priority) : total; - } - --static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) -+static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, -+ unsigned long min_ttl) - { -- bool need_aging; -- unsigned long nr_to_scan; -- int swappiness = get_swappiness(lruvec, sc); -+ int gen; -+ unsigned long birth; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); -- DEFINE_MAX_SEQ(lruvec); - DEFINE_MIN_SEQ(lruvec); - -- VM_WARN_ON_ONCE(sc->memcg_low_reclaim); -- -- mem_cgroup_calculate_protection(NULL, memcg); -+ /* see the comment on lru_gen_folio */ -+ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); -+ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); - -- if (mem_cgroup_below_min(NULL, memcg)) -+ if (time_is_after_jiffies(birth + min_ttl)) - return false; - -- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); -- -- if (min_ttl) { -- int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); -- unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); -- -- if (time_is_after_jiffies(birth + min_ttl)) -- return false; -- -- /* the size is likely too small to be helpful */ -- if (!nr_to_scan && sc->priority != DEF_PRIORITY) -- return false; -- } -+ if (!lruvec_is_sizable(lruvec, sc)) -+ return false; - -- if (need_aging) -- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); -+ mem_cgroup_calculate_protection(NULL, memcg); - -- return true; -+ return !mem_cgroup_below_min(NULL, memcg); - } - - /* to protect the working set of the last N jiffies */ -@@ -4572,46 +4549,30 @@ static unsigned long lru_gen_min_ttl __read_mostly; - static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - { - struct mem_cgroup *memcg; -- bool success = false; - unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); - - VM_WARN_ON_ONCE(!current_is_kswapd()); - -- sc->last_reclaimed = sc->nr_reclaimed; -- -- /* -- * To reduce the chance of going into the aging path, which can be -- * costly, optimistically skip it if the flag below was cleared in the -- * eviction path. This improves the overall performance when multiple -- * memcgs are available. -- */ -- if (!sc->memcgs_need_aging) { -- sc->memcgs_need_aging = true; -+ /* check the order to exclude compaction-induced reclaim */ -+ if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) - return; -- } -- -- set_mm_walk(pgdat); - - memcg = mem_cgroup_iter(NULL, NULL, NULL); - do { - struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); - -- if (age_lruvec(lruvec, sc, min_ttl)) -- success = true; -+ if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) { -+ mem_cgroup_iter_break(NULL, memcg); -+ return; -+ } - - cond_resched(); - } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); - -- clear_mm_walk(); -- -- /* check the order to exclude compaction-induced reclaim */ -- if (success || !min_ttl || sc->order) -- return; -- - /* - * The main goal is to OOM kill if every generation from all memcgs is - * younger than min_ttl. However, another possibility is all memcgs are -- * either below min or empty. -+ * either too small or below min. 
- */ - if (mutex_trylock(&oom_lock)) { - struct oom_control oc = { -@@ -4624,6 +4585,10 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - } - } - -+/****************************************************************************** -+ * rmap/PT walk feedback -+ ******************************************************************************/ -+ - /* - * This function exploits spatial locality when shrink_folio_list() walks the - * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If -@@ -4634,13 +4599,12 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - { - int i; -- pte_t *pte; - unsigned long start; - unsigned long end; -- unsigned long addr; - struct lru_gen_mm_walk *walk; - int young = 0; -- unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; -+ pte_t *pte = pvmw->pte; -+ unsigned long addr = pvmw->address; - struct folio *folio = pfn_folio(pvmw->pfn); - struct mem_cgroup *memcg = folio_memcg(folio); - struct pglist_data *pgdat = folio_pgdat(folio); -@@ -4657,25 +4621,28 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - /* avoid taking the LRU lock under the PTL when possible */ - walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; - -- start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); -- end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; -+ start = max(addr & PMD_MASK, pvmw->vma->vm_start); -+ end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; - - if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { -- if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) -+ if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) - end = start + MIN_LRU_BATCH * PAGE_SIZE; -- else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) -+ else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2) - start = end - MIN_LRU_BATCH * PAGE_SIZE; - else { -- start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; -- end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; -+ start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2; -+ end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2; - } - } - -- pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; -+ /* folio_update_gen() requires stable folio_memcg() */ -+ if (!mem_cgroup_trylock_pages(memcg)) -+ return; - -- rcu_read_lock(); - arch_enter_lazy_mmu_mode(); - -+ pte -= (addr - start) / PAGE_SIZE; -+ - for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { - unsigned long pfn; - -@@ -4700,58 +4667,171 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); - -+ if (walk) { -+ old_gen = folio_update_gen(folio, new_gen); -+ if (old_gen >= 0 && old_gen != new_gen) -+ update_batch_size(walk, folio, old_gen, new_gen); -+ -+ continue; -+ } -+ - old_gen = folio_lru_gen(folio); - if (old_gen < 0) - folio_set_referenced(folio); - else if (old_gen != new_gen) -- __set_bit(i, bitmap); -+ folio_activate(folio); - } - - arch_leave_lazy_mmu_mode(); -- rcu_read_unlock(); -+ mem_cgroup_unlock_pages(); - - /* feedback from rmap walkers to page table walkers */ - if (suitable_to_scan(i, young)) - update_bloom_filter(lruvec, max_seq, pvmw->pmd); -+} - -- if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { -- for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { -- folio = pfn_folio(pte_pfn(pte[i])); -- folio_activate(folio); -- } -- return; 
-+/****************************************************************************** -+ * memcg LRU -+ ******************************************************************************/ -+ -+/* see the comment on MEMCG_NR_GENS */ -+enum { -+ MEMCG_LRU_NOP, -+ MEMCG_LRU_HEAD, -+ MEMCG_LRU_TAIL, -+ MEMCG_LRU_OLD, -+ MEMCG_LRU_YOUNG, -+}; -+ -+#ifdef CONFIG_MEMCG -+ -+static int lru_gen_memcg_seg(struct lruvec *lruvec) -+{ -+ return READ_ONCE(lruvec->lrugen.seg); -+} -+ -+static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) -+{ -+ int seg; -+ int old, new; -+ int bin = get_random_u32_below(MEMCG_NR_BINS); -+ struct pglist_data *pgdat = lruvec_pgdat(lruvec); -+ -+ spin_lock(&pgdat->memcg_lru.lock); -+ -+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); -+ -+ seg = 0; -+ new = old = lruvec->lrugen.gen; -+ -+ /* see the comment on MEMCG_NR_GENS */ -+ if (op == MEMCG_LRU_HEAD) -+ seg = MEMCG_LRU_HEAD; -+ else if (op == MEMCG_LRU_TAIL) -+ seg = MEMCG_LRU_TAIL; -+ else if (op == MEMCG_LRU_OLD) -+ new = get_memcg_gen(pgdat->memcg_lru.seq); -+ else if (op == MEMCG_LRU_YOUNG) -+ new = get_memcg_gen(pgdat->memcg_lru.seq + 1); -+ else -+ VM_WARN_ON_ONCE(true); -+ -+ hlist_nulls_del_rcu(&lruvec->lrugen.list); -+ -+ if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) -+ hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); -+ else -+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); -+ -+ pgdat->memcg_lru.nr_memcgs[old]--; -+ pgdat->memcg_lru.nr_memcgs[new]++; -+ -+ lruvec->lrugen.gen = new; -+ WRITE_ONCE(lruvec->lrugen.seg, seg); -+ -+ if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) -+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); -+ -+ spin_unlock(&pgdat->memcg_lru.lock); -+} -+ -+void lru_gen_online_memcg(struct mem_cgroup *memcg) -+{ -+ int gen; -+ int nid; -+ int bin = get_random_u32_below(MEMCG_NR_BINS); -+ -+ for_each_node(nid) { -+ struct pglist_data *pgdat = NODE_DATA(nid); -+ struct lruvec *lruvec = get_lruvec(memcg, nid); -+ -+ spin_lock(&pgdat->memcg_lru.lock); -+ -+ VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); -+ -+ gen = get_memcg_gen(pgdat->memcg_lru.seq); -+ -+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); -+ pgdat->memcg_lru.nr_memcgs[gen]++; -+ -+ lruvec->lrugen.gen = gen; -+ -+ spin_unlock(&pgdat->memcg_lru.lock); - } -+} - -- /* folio_update_gen() requires stable folio_memcg() */ -- if (!mem_cgroup_trylock_pages(memcg)) -- return; -+void lru_gen_offline_memcg(struct mem_cgroup *memcg) -+{ -+ int nid; - -- if (!walk) { -- spin_lock_irq(&lruvec->lru_lock); -- new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); -+ for_each_node(nid) { -+ struct lruvec *lruvec = get_lruvec(memcg, nid); -+ -+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); - } -+} - -- for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { -- folio = pfn_folio(pte_pfn(pte[i])); -- if (folio_memcg_rcu(folio) != memcg) -- continue; -+void lru_gen_release_memcg(struct mem_cgroup *memcg) -+{ -+ int gen; -+ int nid; - -- old_gen = folio_update_gen(folio, new_gen); -- if (old_gen < 0 || old_gen == new_gen) -- continue; -+ for_each_node(nid) { -+ struct pglist_data *pgdat = NODE_DATA(nid); -+ struct lruvec *lruvec = get_lruvec(memcg, nid); - -- if (walk) -- update_batch_size(walk, folio, old_gen, new_gen); -- else -- lru_gen_update_size(lruvec, folio, old_gen, new_gen); -+ spin_lock(&pgdat->memcg_lru.lock); -+ -+ 
VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); -+ -+ gen = lruvec->lrugen.gen; -+ -+ hlist_nulls_del_rcu(&lruvec->lrugen.list); -+ pgdat->memcg_lru.nr_memcgs[gen]--; -+ -+ if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) -+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); -+ -+ spin_unlock(&pgdat->memcg_lru.lock); - } -+} -+ -+void lru_gen_soft_reclaim(struct lruvec *lruvec) -+{ -+ /* see the comment on MEMCG_NR_GENS */ -+ if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) -+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); -+} - -- if (!walk) -- spin_unlock_irq(&lruvec->lru_lock); -+#else /* !CONFIG_MEMCG */ - -- mem_cgroup_unlock_pages(); -+static int lru_gen_memcg_seg(struct lruvec *lruvec) -+{ -+ return 0; - } - -+#endif -+ - /****************************************************************************** - * the eviction - ******************************************************************************/ -@@ -4765,7 +4845,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) - int delta = folio_nr_pages(folio); - int refs = folio_lru_refs(folio); - int tier = lru_tier_from_refs(refs); -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); - -@@ -4790,7 +4870,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) - - /* promoted */ - if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { -- list_move(&folio->lru, &lrugen->lists[gen][type][zone]); -+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]); - return true; - } - -@@ -4799,7 +4879,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) - int hist = lru_hist_from_seq(lrugen->min_seq[type]); - - gen = folio_inc_gen(lruvec, folio, false); -- list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]); -+ list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); - - WRITE_ONCE(lrugen->protected[hist][type][tier - 1], - lrugen->protected[hist][type][tier - 1] + delta); -@@ -4811,7 +4891,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) - if (folio_test_locked(folio) || folio_test_writeback(folio) || - (type == LRU_GEN_FILE && folio_test_dirty(folio))) { - gen = folio_inc_gen(lruvec, folio, true); -- list_move(&folio->lru, &lrugen->lists[gen][type][zone]); -+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]); - return true; - } - -@@ -4822,12 +4902,8 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca - { - bool success; - -- /* unmapping inhibited */ -- if (!sc->may_unmap && folio_mapped(folio)) -- return false; -- - /* swapping inhibited */ -- if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && -+ if (!(sc->gfp_mask & __GFP_IO) && - (folio_test_dirty(folio) || - (folio_test_anon(folio) && !folio_test_swapcache(folio)))) - return false; -@@ -4865,7 +4941,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, - int scanned = 0; - int isolated = 0; - int remaining = MAX_LRU_BATCH; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); - - VM_WARN_ON_ONCE(!list_empty(list)); -@@ -4878,7 +4954,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, - for (zone = sc->reclaim_idx; zone >= 0; zone--) { - LIST_HEAD(moved); - int skipped = 0; -- struct list_head *head = 
&lrugen->lists[gen][type][zone]; -+ struct list_head *head = &lrugen->folios[gen][type][zone]; - - while (!list_empty(head)) { - struct folio *folio = lru_to_folio(head); -@@ -4924,9 +5000,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, - __count_vm_events(PGSCAN_ANON + type, isolated); - - /* -- * There might not be eligible pages due to reclaim_idx, may_unmap and -- * may_writepage. Check the remaining to prevent livelock if it's not -- * making progress. -+ * There might not be eligible folios due to reclaim_idx. Check the -+ * remaining to prevent livelock if it's not making progress. - */ - return isolated || !remaining ? scanned : 0; - } -@@ -5021,8 +5096,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw - return scanned; - } - --static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -- bool *need_swapping) -+static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) - { - int type; - int scanned; -@@ -5111,153 +5185,348 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap - goto retry; - } - -- if (need_swapping && type == LRU_GEN_ANON) -- *need_swapping = true; -- - return scanned; - } - -+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, -+ struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) -+{ -+ int gen, type, zone; -+ unsigned long old = 0; -+ unsigned long young = 0; -+ unsigned long total = 0; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ DEFINE_MIN_SEQ(lruvec); -+ -+ /* whether this lruvec is completely out of cold folios */ -+ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { -+ *nr_to_scan = 0; -+ return true; -+ } -+ -+ for (type = !can_swap; type < ANON_AND_FILE; type++) { -+ unsigned long seq; -+ -+ for (seq = min_seq[type]; seq <= max_seq; seq++) { -+ unsigned long size = 0; -+ -+ gen = lru_gen_from_seq(seq); -+ -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) -+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); -+ -+ total += size; -+ if (seq == max_seq) -+ young += size; -+ else if (seq + MIN_NR_GENS == max_seq) -+ old += size; -+ } -+ } -+ -+ /* try to scrape all its memory if this memcg was deleted */ -+ *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; -+ -+ /* -+ * The aging tries to be lazy to reduce the overhead, while the eviction -+ * stalls when the number of generations reaches MIN_NR_GENS. Hence, the -+ * ideal number of generations is MIN_NR_GENS+1. -+ */ -+ if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) -+ return false; -+ -+ /* -+ * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) -+ * of the total number of pages for each generation. A reasonable range -+ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The -+ * aging cares about the upper bound of hot pages, while the eviction -+ * cares about the lower bound of cold pages. -+ */ -+ if (young * MIN_NR_GENS > total) -+ return true; -+ if (old * (MIN_NR_GENS + 2) < total) -+ return true; -+ -+ return false; -+} -+ - /* - * For future optimizations: - * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg - * reclaim. 
- */ --static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, -- bool can_swap, bool *need_aging) -+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) - { - unsigned long nr_to_scan; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); - DEFINE_MAX_SEQ(lruvec); -- DEFINE_MIN_SEQ(lruvec); - -- if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) || -- (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) && -- !sc->memcg_low_reclaim)) -+ if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) - return 0; - -- *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); -- if (!*need_aging) -+ if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) - return nr_to_scan; - - /* skip the aging path at the default priority */ - if (sc->priority == DEF_PRIORITY) -- goto done; -+ return nr_to_scan; - -- /* leave the work to lru_gen_age_node() */ -- if (current_is_kswapd()) -- return 0; -+ /* skip this lruvec as it's low on cold folios */ -+ return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0; -+} - -- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) -- return nr_to_scan; --done: -- return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; -+static unsigned long get_nr_to_reclaim(struct scan_control *sc) -+{ -+ /* don't abort memcg reclaim to ensure fairness */ -+ if (!global_reclaim(sc)) -+ return -1; -+ -+ return max(sc->nr_to_reclaim, compact_gap(sc->order)); - } - --static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, -- struct scan_control *sc, bool need_swapping) -+static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) - { -- int i; -- DEFINE_MAX_SEQ(lruvec); -+ long nr_to_scan; -+ unsigned long scanned = 0; -+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); -+ int swappiness = get_swappiness(lruvec, sc); - -- if (!current_is_kswapd()) { -- /* age each memcg at most once to ensure fairness */ -- if (max_seq - seq > 1) -- return true; -+ /* clean file folios are more likely to exist */ -+ if (swappiness && !(sc->gfp_mask & __GFP_IO)) -+ swappiness = 1; - -- /* over-swapping can increase allocation latency */ -- if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping) -- return true; -+ while (true) { -+ int delta; - -- /* give this thread a chance to exit and free its memory */ -- if (fatal_signal_pending(current)) { -- sc->nr_reclaimed += MIN_LRU_BATCH; -- return true; -- } -+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); -+ if (nr_to_scan <= 0) -+ break; - -- if (cgroup_reclaim(sc)) -- return false; -- } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim) -- return false; -+ delta = evict_folios(lruvec, sc, swappiness); -+ if (!delta) -+ break; - -- /* keep scanning at low priorities to ensure fairness */ -- if (sc->priority > DEF_PRIORITY - 2) -- return false; -+ scanned += delta; -+ if (scanned >= nr_to_scan) -+ break; - -- /* -- * A minimum amount of work was done under global memory pressure. For -- * kswapd, it may be overshooting. For direct reclaim, the allocation -- * may succeed if all suitable zones are somewhat safe. In either case, -- * it's better to stop now, and restart later if necessary. 
-- */ -- for (i = 0; i <= sc->reclaim_idx; i++) { -- unsigned long wmark; -- struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; -+ if (sc->nr_reclaimed >= nr_to_reclaim) -+ break; - -- if (!managed_zone(zone)) -+ cond_resched(); -+ } -+ -+ /* whether try_to_inc_max_seq() was successful */ -+ return nr_to_scan < 0; -+} -+ -+static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) -+{ -+ bool success; -+ unsigned long scanned = sc->nr_scanned; -+ unsigned long reclaimed = sc->nr_reclaimed; -+ int seg = lru_gen_memcg_seg(lruvec); -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ struct pglist_data *pgdat = lruvec_pgdat(lruvec); -+ -+ /* see the comment on MEMCG_NR_GENS */ -+ if (!lruvec_is_sizable(lruvec, sc)) -+ return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; -+ -+ mem_cgroup_calculate_protection(NULL, memcg); -+ -+ if (mem_cgroup_below_min(NULL, memcg)) -+ return MEMCG_LRU_YOUNG; -+ -+ if (mem_cgroup_below_low(NULL, memcg)) { -+ /* see the comment on MEMCG_NR_GENS */ -+ if (seg != MEMCG_LRU_TAIL) -+ return MEMCG_LRU_TAIL; -+ -+ memcg_memory_event(memcg, MEMCG_LOW); -+ } -+ -+ success = try_to_shrink_lruvec(lruvec, sc); -+ -+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); -+ -+ if (!sc->proactive) -+ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, -+ sc->nr_reclaimed - reclaimed); -+ -+ sc->nr_reclaimed += current->reclaim_state->reclaimed_slab; -+ current->reclaim_state->reclaimed_slab = 0; -+ -+ return success ? MEMCG_LRU_YOUNG : 0; -+} -+ -+#ifdef CONFIG_MEMCG -+ -+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+ int op; -+ int gen; -+ int bin; -+ int first_bin; -+ struct lruvec *lruvec; -+ struct lru_gen_folio *lrugen; -+ struct mem_cgroup *memcg; -+ const struct hlist_nulls_node *pos; -+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); -+ -+ bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); -+restart: -+ op = 0; -+ memcg = NULL; -+ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); -+ -+ rcu_read_lock(); -+ -+ hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { -+ if (op) -+ lru_gen_rotate_memcg(lruvec, op); -+ -+ mem_cgroup_put(memcg); -+ -+ lruvec = container_of(lrugen, struct lruvec, lrugen); -+ memcg = lruvec_memcg(lruvec); -+ -+ if (!mem_cgroup_tryget(memcg)) { -+ op = 0; -+ memcg = NULL; - continue; -+ } - -- wmark = current_is_kswapd() ? 
high_wmark_pages(zone) : low_wmark_pages(zone); -- if (wmark > zone_page_state(zone, NR_FREE_PAGES)) -- return false; -+ rcu_read_unlock(); -+ -+ op = shrink_one(lruvec, sc); -+ -+ rcu_read_lock(); -+ -+ if (sc->nr_reclaimed >= nr_to_reclaim) -+ break; - } - -- sc->nr_reclaimed += MIN_LRU_BATCH; -+ rcu_read_unlock(); - -- return true; -+ if (op) -+ lru_gen_rotate_memcg(lruvec, op); -+ -+ mem_cgroup_put(memcg); -+ -+ if (sc->nr_reclaimed >= nr_to_reclaim) -+ return; -+ -+ /* restart if raced with lru_gen_rotate_memcg() */ -+ if (gen != get_nulls_value(pos)) -+ goto restart; -+ -+ /* try the rest of the bins of the current generation */ -+ bin = get_memcg_bin(bin + 1); -+ if (bin != first_bin) -+ goto restart; - } - - static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) - { - struct blk_plug plug; -- bool need_aging = false; -- bool need_swapping = false; -- unsigned long scanned = 0; -- unsigned long reclaimed = sc->nr_reclaimed; -- DEFINE_MAX_SEQ(lruvec); -+ -+ VM_WARN_ON_ONCE(global_reclaim(sc)); -+ VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); - - lru_add_drain(); - - blk_start_plug(&plug); - -- set_mm_walk(lruvec_pgdat(lruvec)); -+ set_mm_walk(NULL, sc->proactive); - -- while (true) { -- int delta; -- int swappiness; -- unsigned long nr_to_scan; -+ if (try_to_shrink_lruvec(lruvec, sc)) -+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); - -- if (sc->may_swap) -- swappiness = get_swappiness(lruvec, sc); -- else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) -- swappiness = 1; -- else -- swappiness = 0; -+ clear_mm_walk(); - -- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging); -- if (!nr_to_scan) -- goto done; -+ blk_finish_plug(&plug); -+} - -- delta = evict_folios(lruvec, sc, swappiness, &need_swapping); -- if (!delta) -- goto done; -+#else /* !CONFIG_MEMCG */ - -- scanned += delta; -- if (scanned >= nr_to_scan) -- break; -+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+ BUILD_BUG(); -+} - -- if (should_abort_scan(lruvec, max_seq, sc, need_swapping)) -- break; -+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -+{ -+ BUILD_BUG(); -+} - -- cond_resched(); -- } -+#endif -+ -+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+ int priority; -+ unsigned long reclaimable; -+ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); -+ -+ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) -+ return; -+ /* -+ * Determine the initial priority based on ((total / MEMCG_NR_GENS) >> -+ * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the -+ * estimated reclaimed_to_scanned_ratio = inactive / total. -+ */ -+ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); -+ if (get_swappiness(lruvec, sc)) -+ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); -+ -+ reclaimable /= MEMCG_NR_GENS; -+ -+ /* round down reclaimable and round up sc->nr_to_reclaim */ -+ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); -+ -+ sc->priority = clamp(priority, 0, DEF_PRIORITY); -+} -+ -+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+ struct blk_plug plug; -+ unsigned long reclaimed = sc->nr_reclaimed; -+ -+ VM_WARN_ON_ONCE(!global_reclaim(sc)); -+ -+ /* -+ * Unmapped clean folios are already prioritized. Scanning for more of -+ * them is likely futile and can cause high reclaim latency when there -+ * is a large number of memcgs. 
-+ */ -+ if (!sc->may_writepage || !sc->may_unmap) -+ goto done; -+ -+ lru_add_drain(); -+ -+ blk_start_plug(&plug); -+ -+ set_mm_walk(pgdat, sc->proactive); -+ -+ set_initial_priority(pgdat, sc); -+ -+ if (current_is_kswapd()) -+ sc->nr_reclaimed = 0; -+ -+ if (mem_cgroup_disabled()) -+ shrink_one(&pgdat->__lruvec, sc); -+ else -+ shrink_many(pgdat, sc); -+ -+ if (current_is_kswapd()) -+ sc->nr_reclaimed += reclaimed; - -- /* see the comment in lru_gen_age_node() */ -- if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging) -- sc->memcgs_need_aging = false; --done: - clear_mm_walk(); - - blk_finish_plug(&plug); -+done: -+ /* kswapd should never fail */ -+ pgdat->kswapd_failures = 0; - } - - /****************************************************************************** -@@ -5266,7 +5535,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc - - static bool __maybe_unused state_is_valid(struct lruvec *lruvec) - { -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - if (lrugen->enabled) { - enum lru_list lru; -@@ -5279,7 +5548,7 @@ static bool __maybe_unused state_is_valid(struct lruvec *lruvec) - int gen, type, zone; - - for_each_gen_type_zone(gen, type, zone) { -- if (!list_empty(&lrugen->lists[gen][type][zone])) -+ if (!list_empty(&lrugen->folios[gen][type][zone])) - return false; - } - } -@@ -5324,7 +5593,7 @@ static bool drain_evictable(struct lruvec *lruvec) - int remaining = MAX_LRU_BATCH; - - for_each_gen_type_zone(gen, type, zone) { -- struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; -+ struct list_head *head = &lruvec->lrugen.folios[gen][type][zone]; - - while (!list_empty(head)) { - bool success; -@@ -5402,14 +5671,14 @@ static void lru_gen_change_state(bool enabled) +@@ -5671,14 +5671,14 @@ static void lru_gen_change_state(bool enabled) * sysfs interface ******************************************************************************/ @@ -31467,7 +10363,7 @@ index 160acbbdf111..1a8f3b1c0bad 100644 { unsigned int msecs; -@@ -5421,11 +5690,9 @@ static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, +@@ -5690,11 +5690,9 @@ static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, return len; } @@ -31481,7 +10377,7 @@ index 160acbbdf111..1a8f3b1c0bad 100644 { unsigned int caps = 0; -@@ -5442,7 +5709,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c +@@ -5711,7 +5709,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ @@ -31490,7 +10386,7 @@ index 160acbbdf111..1a8f3b1c0bad 100644 const char *buf, size_t len) { int i; -@@ -5469,9 +5736,7 @@ static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, +@@ -5738,9 +5736,7 @@ static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, return len; } @@ -31501,7 +10397,7 @@ index 160acbbdf111..1a8f3b1c0bad 100644 static struct attribute *lru_gen_attrs[] = { &lru_gen_min_ttl_attr.attr, -@@ -5479,7 +5744,7 @@ static struct attribute *lru_gen_attrs[] = { +@@ -5748,7 +5744,7 @@ static struct attribute *lru_gen_attrs[] = { NULL }; @@ -31510,1959 +10406,2106 @@ index 160acbbdf111..1a8f3b1c0bad 100644 .name = "lru_gen", .attrs = lru_gen_attrs, }; -@@ -5545,7 +5810,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, - int i; - int type, tier; - int hist = lru_hist_from_seq(seq); -- struct 
lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - for (tier = 0; tier < MAX_NR_TIERS; tier++) { - seq_printf(m, " %10d", tier); -@@ -5595,7 +5860,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) - unsigned long seq; - bool full = !debugfs_real_fops(m->file)->write; - struct lruvec *lruvec = v; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - int nid = lruvec_pgdat(lruvec)->node_id; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); - DEFINE_MAX_SEQ(lruvec); -@@ -5692,7 +5957,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co - if (sc->nr_reclaimed >= nr_to_reclaim) - return 0; - -- if (!evict_folios(lruvec, sc, swappiness, NULL)) -+ if (!evict_folios(lruvec, sc, swappiness)) - return 0; - - cond_resched(); -@@ -5713,11 +5978,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, - - if (!mem_cgroup_disabled()) { - rcu_read_lock(); -+ - memcg = mem_cgroup_from_id(memcg_id); --#ifdef CONFIG_MEMCG -- if (memcg && !css_tryget(&memcg->css)) -+ if (!mem_cgroup_tryget(memcg)) - memcg = NULL; --#endif -+ - rcu_read_unlock(); - - if (!memcg) -@@ -5777,7 +6042,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, - set_task_reclaim_state(current, &sc.reclaim_state); - flags = memalloc_noreclaim_save(); - blk_start_plug(&plug); -- if (!set_mm_walk(NULL)) { -+ if (!set_mm_walk(NULL, true)) { - err = -ENOMEM; - goto done; - } -@@ -5849,7 +6114,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) - { - int i; - int gen, type, zone; -- struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct lru_gen_folio *lrugen = &lruvec->lrugen; - - lrugen->max_seq = MIN_NR_GENS + 1; - lrugen->enabled = lru_gen_enabled(); -@@ -5858,13 +6123,26 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) - lrugen->timestamps[i] = jiffies; - - for_each_gen_type_zone(gen, type, zone) -- INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); -+ INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); - - lruvec->mm_state.seq = MIN_NR_GENS; - init_waitqueue_head(&lruvec->mm_state.wait); - } - - #ifdef CONFIG_MEMCG -+ -+void lru_gen_init_pgdat(struct pglist_data *pgdat) -+{ -+ int i, j; -+ -+ spin_lock_init(&pgdat->memcg_lru.lock); -+ -+ for (i = 0; i < MEMCG_NR_GENS; i++) { -+ for (j = 0; j < MEMCG_NR_BINS; j++) -+ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); -+ } -+} -+ - void lru_gen_init_memcg(struct mem_cgroup *memcg) - { - INIT_LIST_HEAD(&memcg->mm_list.fifo); -@@ -5876,19 +6154,25 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) - int i; - int nid; - -+ VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo)); -+ - for_each_node(nid) { - struct lruvec *lruvec = get_lruvec(memcg, nid); - -+ VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers); - VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, - sizeof(lruvec->lrugen.nr_pages))); - -+ lruvec->lrugen.list.next = LIST_POISON1; -+ - for (i = 0; i < NR_BLOOM_FILTERS; i++) { - bitmap_free(lruvec->mm_state.filters[i]); - lruvec->mm_state.filters[i] = NULL; - } - } - } --#endif -+ -+#endif /* CONFIG_MEMCG */ - - static int __init init_lru_gen(void) - { -@@ -5915,6 +6199,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc - { - } - -+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+} -+ - #endif /* CONFIG_LRU_GEN */ - - static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -@@ -5928,7 +6216,7 @@ 
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) - bool proportional_reclaim; - struct blk_plug plug; - -- if (lru_gen_enabled()) { -+ if (lru_gen_enabled() && !global_reclaim(sc)) { - lru_gen_shrink_lruvec(lruvec, sc); - return; - } -@@ -6171,6 +6459,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) - struct lruvec *target_lruvec; - bool reclaimable = false; - -+ if (lru_gen_enabled() && global_reclaim(sc)) { -+ lru_gen_shrink_node(pgdat, sc); -+ return; -+ } -+ - target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); - - again: -diff --git a/mm/workingset.c b/mm/workingset.c -index 1a86645b7b3c..fd666584515c 100644 ---- a/mm/workingset.c -+++ b/mm/workingset.c -@@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct folio *folio) - unsigned long token; - unsigned long min_seq; - struct lruvec *lruvec; -- struct lru_gen_struct *lrugen; -+ struct lru_gen_folio *lrugen; - int type = folio_is_file_lru(folio); - int delta = folio_nr_pages(folio); - int refs = folio_lru_refs(folio); -@@ -252,7 +252,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) - unsigned long token; - unsigned long min_seq; - struct lruvec *lruvec; -- struct lru_gen_struct *lrugen; -+ struct lru_gen_folio *lrugen; - struct mem_cgroup *memcg; - struct pglist_data *pgdat; - int type = folio_is_file_lru(folio); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c -index 81fa7ec2e66a..1f36bc1c5d36 100644 +index 4c89ff333f6f..9286d3baa12d 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c -@@ -173,11 +173,11 @@ static noinline void check_new_node(struct maple_tree *mt) - - if (!MAPLE_32BIT) { - if (i >= 35) -- e = i - 35; -+ e = i - 34; - else if (i >= 5) -- e = i - 5; -+ e = i - 4; - else if (i >= 2) -- e = i - 2; -+ e = i - 1; - } else { - if (i >= 4) - e = i - 4; -@@ -305,17 +305,17 @@ static noinline void check_new_node(struct maple_tree *mt) - MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); -- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); -+ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); - - mn = mas_pop_node(&mas); /* get the next node. 
*/ - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS); -- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 2); -+ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); - - mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); -- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); -+ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); - - /* Check the limit of pop/push/pop */ - mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 2); /* Request */ -@@ -323,14 +323,14 @@ static noinline void check_new_node(struct maple_tree *mt) - MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - MT_BUG_ON(mt, mas_alloc_req(&mas)); -- MT_BUG_ON(mt, mas.alloc->node_count); -+ MT_BUG_ON(mt, mas.alloc->node_count != 1); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); -- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); -+ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); - mas_push_node(&mas, mn); -- MT_BUG_ON(mt, mas.alloc->node_count); -+ MT_BUG_ON(mt, mas.alloc->node_count != 1); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); --- -2.40.0.rc2 - -From d9e434e1093f450c71f9a327b2201f7bdcc75743 Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Fri, 17 Feb 2023 13:41:20 +0100 -Subject: [PATCH 11/16] mm/kvm: lockless accessed bit harvest - -TLDR -==== -This patchset RCU-protects KVM page tables and compare-and-exchanges -KVM PTEs with the accessed bit set by hardware. It significantly -improves the performance of guests when the host is under heavy -memory pressure. - -ChromeOS has been using a similar approach [1] since mid 2021 and it -was proven successful on tens of millions devices. - -[1] https://crrev.com/c/2987928 - -Overview -======== -The goal of this patchset is to optimize the performance of guests -when the host memory is overcommitted. It focuses on the vast -majority of VMs that are not nested and run on hardware that sets the -accessed bit in KVM page tables. - -Note that nested VMs and hardware that does not support the accessed -bit are both out of scope. - -This patchset relies on two techniques, RCU and cmpxchg, to safely -test and clear the accessed bit without taking kvm->mmu_lock. The -former protects KVM page tables from being freed while the latter -clears the accessed bit atomically against both hardware and other -software page table walkers. - -A new MMU notifier API, mmu_notifier_test_clear_young(), is -introduced. It follows two design patterns: fallback and batching. -For any unsupported cases, it can optionally fall back to -mmu_notifier_ops->clear_young(). For a range of KVM PTEs, it can test -or test and clear their accessed bits according to a bitmap provided -by the caller. - -This patchset only applies mmu_notifier_test_clear_young() to MGLRU. -A follow-up patchset will apply it to /proc/PID/pagemap and -/prod/PID/clear_refs. - -Evaluation -========== -An existing selftest can quickly demonstrate the effectiveness of -this patchset. 
On a generic workstation equipped with 64 CPUs and -256GB DRAM: - - $ sudo max_guest_memory_test -c 64 -m 256 -s 256 - - MGLRU run2 - --------------- - Before ~600s - After ~50s - Off ~250s - - kswapd (MGLRU before) - 100.00% balance_pgdat - 100.00% shrink_node - 100.00% shrink_one - 99.97% try_to_shrink_lruvec - 99.06% evict_folios - 97.41% shrink_folio_list - 31.33% folio_referenced - 31.06% rmap_walk_file - 30.89% folio_referenced_one - 20.83% __mmu_notifier_clear_flush_young - 20.54% kvm_mmu_notifier_clear_flush_young - => 19.34% _raw_write_lock - - kswapd (MGLRU after) - 100.00% balance_pgdat - 100.00% shrink_node - 100.00% shrink_one - 99.97% try_to_shrink_lruvec - 99.51% evict_folios - 71.70% shrink_folio_list - 7.08% folio_referenced - 6.78% rmap_walk_file - 6.72% folio_referenced_one - 5.60% lru_gen_look_around - => 1.53% __mmu_notifier_test_clear_young - - kswapd (MGLRU off) - 100.00% balance_pgdat - 100.00% shrink_node - 99.92% shrink_lruvec - 69.95% shrink_folio_list - 19.35% folio_referenced - 18.37% rmap_walk_file - 17.88% folio_referenced_one - 13.20% __mmu_notifier_clear_flush_young - 11.64% kvm_mmu_notifier_clear_flush_young - => 9.93% _raw_write_lock - 26.23% shrink_active_list - 25.50% folio_referenced - 25.35% rmap_walk_file - 25.28% folio_referenced_one - 23.87% __mmu_notifier_clear_flush_young - 23.69% kvm_mmu_notifier_clear_flush_young - => 18.98% _raw_write_lock - -Comprehensive benchmarks are coming soon. - -Yu Zhao (5): - mm/kvm: add mmu_notifier_test_clear_young() - kvm/x86: add kvm_arch_test_clear_young() - kvm/arm64: add kvm_arch_test_clear_young() - kvm/powerpc: add kvm_arch_test_clear_young() - mm: multi-gen LRU: use mmu_notifier_test_clear_young() - -Signed-off-by: Peter Jung ---- - arch/arm64/include/asm/kvm_host.h | 7 ++ - arch/arm64/include/asm/kvm_pgtable.h | 8 ++ - arch/arm64/include/asm/stage2_pgtable.h | 43 ++++++++ - arch/arm64/kvm/arm.c | 1 + - arch/arm64/kvm/hyp/pgtable.c | 51 ++-------- - arch/arm64/kvm/mmu.c | 77 +++++++++++++- - arch/powerpc/include/asm/kvm_host.h | 18 ++++ - arch/powerpc/include/asm/kvm_ppc.h | 14 +-- - arch/powerpc/kvm/book3s.c | 7 ++ - arch/powerpc/kvm/book3s.h | 2 + - arch/powerpc/kvm/book3s_64_mmu_radix.c | 78 ++++++++++++++- - arch/powerpc/kvm/book3s_hv.c | 10 +- - arch/x86/include/asm/kvm_host.h | 27 +++++ - arch/x86/kvm/mmu/spte.h | 12 --- - arch/x86/kvm/mmu/tdp_mmu.c | 41 ++++++++ - include/linux/kvm_host.h | 29 ++++++ - include/linux/mmu_notifier.h | 40 ++++++++ - include/linux/mmzone.h | 6 +- - mm/mmu_notifier.c | 26 +++++ - mm/rmap.c | 8 +- - mm/vmscan.c | 127 +++++++++++++++++++++--- - virt/kvm/kvm_main.c | 58 +++++++++++ - 22 files changed, 593 insertions(+), 97 deletions(-) - -diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h -index 35a159d131b5..572bcd321586 100644 ---- a/arch/arm64/include/asm/kvm_host.h -+++ b/arch/arm64/include/asm/kvm_host.h -@@ -1031,4 +1031,11 @@ static inline void kvm_hyp_reserve(void) { } - void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu); - bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu); - -+/* see the comments on the generic kvm_arch_has_test_clear_young() */ -+#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young -+static inline bool kvm_arch_has_test_clear_young(void) -+{ -+ return IS_ENABLED(CONFIG_KVM) && cpu_has_hw_af() && !is_protected_kvm_enabled(); -+} -+ - #endif /* __ARM64_KVM_HOST_H__ */ -diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h -index 63f81b27a4e3..8c9a04388c88 100644 ---- 
a/arch/arm64/include/asm/kvm_pgtable.h -+++ b/arch/arm64/include/asm/kvm_pgtable.h -@@ -105,6 +105,7 @@ static inline bool kvm_level_supports_block_mapping(u32 level) - * @put_page: Decrement the refcount on a page. When the - * refcount reaches 0 the page is automatically - * freed. -+ * @put_page_rcu: RCU variant of put_page(). - * @page_count: Return the refcount of a page. - * @phys_to_virt: Convert a physical address into a virtual - * address mapped in the current context. -@@ -122,6 +123,7 @@ struct kvm_pgtable_mm_ops { - void (*free_removed_table)(void *addr, u32 level); - void (*get_page)(void *addr); - void (*put_page)(void *addr); -+ void (*put_page_rcu)(void *addr); - int (*page_count)(void *addr); - void* (*phys_to_virt)(phys_addr_t phys); - phys_addr_t (*virt_to_phys)(void *addr); -@@ -188,6 +190,12 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, - * children. - * @KVM_PGTABLE_WALK_SHARED: Indicates the page-tables may be shared - * with other software walkers. -+ * -+ * kvm_arch_test_clear_young() is a special case. It relies on two -+ * techniques, RCU and cmpxchg, to safely test and clear the accessed -+ * bit without taking the MMU lock. The former protects KVM page tables -+ * from being freed while the latter clears the accessed bit atomically -+ * against both the hardware and other software page table walkers. - */ - enum kvm_pgtable_walk_flags { - KVM_PGTABLE_WALK_LEAF = BIT(0), -diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h -index c8dca8ae359c..350437661d4b 100644 ---- a/arch/arm64/include/asm/stage2_pgtable.h -+++ b/arch/arm64/include/asm/stage2_pgtable.h -@@ -30,4 +30,47 @@ - */ - #define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) - -+#define KVM_PTE_TYPE BIT(1) -+#define KVM_PTE_TYPE_BLOCK 0 -+#define KVM_PTE_TYPE_PAGE 1 -+#define KVM_PTE_TYPE_TABLE 1 -+ -+#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) -+ -+#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) -+#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) -+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 -+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 -+#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) -+#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 -+#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) -+ -+#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) -+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) -+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) -+#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) -+#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 -+#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) -+ -+#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) -+ -+#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) -+ -+#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) -+ -+#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) -+ -+#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ -+ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ -+ KVM_PTE_LEAF_ATTR_HI_S2_XN) -+ -+#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) -+#define KVM_MAX_OWNER_ID 1 -+ -+/* -+ * Used to indicate a pte for which a 'break-before-make' sequence is in -+ * progress. 
-+ */ -+#define KVM_INVALID_PTE_LOCKED BIT(10) -+ - #endif /* __ARM64_S2_PGTABLE_H_ */ -diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c -index 9c5573bc4614..6770bc47f5c9 100644 ---- a/arch/arm64/kvm/arm.c -+++ b/arch/arm64/kvm/arm.c -@@ -191,6 +191,7 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) - */ - void kvm_arch_destroy_vm(struct kvm *kvm) - { -+ kvm_free_stage2_pgd(&kvm->arch.mmu); - bitmap_free(kvm->arch.pmu_filter); - free_cpumask_var(kvm->arch.supported_cpus); - -diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c -index b11cf2c618a6..8d65ee4767f1 100644 ---- a/arch/arm64/kvm/hyp/pgtable.c -+++ b/arch/arm64/kvm/hyp/pgtable.c -@@ -12,49 +12,6 @@ - #include - - --#define KVM_PTE_TYPE BIT(1) --#define KVM_PTE_TYPE_BLOCK 0 --#define KVM_PTE_TYPE_PAGE 1 --#define KVM_PTE_TYPE_TABLE 1 -- --#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) -- --#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) --#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) --#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 --#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 --#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) --#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 --#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) -- --#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) --#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) --#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) --#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) --#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 --#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) -- --#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) -- --#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) -- --#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) -- --#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) -- --#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ -- KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ -- KVM_PTE_LEAF_ATTR_HI_S2_XN) -- --#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) --#define KVM_MAX_OWNER_ID 1 -- --/* -- * Used to indicate a pte for which a 'break-before-make' sequence is in -- * progress. 
-- */ --#define KVM_INVALID_PTE_LOCKED BIT(10) -- - struct kvm_pgtable_walk_data { - struct kvm_pgtable_walker *walker; - -@@ -994,8 +951,12 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, - mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), - kvm_granule_size(ctx->level)); - -- if (childp) -- mm_ops->put_page(childp); -+ if (childp) { -+ if (mm_ops->put_page_rcu) -+ mm_ops->put_page_rcu(childp); -+ else -+ mm_ops->put_page(childp); -+ } - - return 0; - } -diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c -index a3ee3b605c9b..761fffc788f5 100644 ---- a/arch/arm64/kvm/mmu.c -+++ b/arch/arm64/kvm/mmu.c -@@ -171,6 +171,21 @@ static int kvm_host_page_count(void *addr) - return page_count(virt_to_page(addr)); - } - -+static void kvm_s2_rcu_put_page(struct rcu_head *head) -+{ -+ put_page(container_of(head, struct page, rcu_head)); -+} -+ -+static void kvm_s2_put_page_rcu(void *addr) -+{ -+ struct page *page = virt_to_page(addr); -+ -+ if (kvm_host_page_count(addr) == 1) -+ kvm_account_pgtable_pages(addr, -1); -+ -+ call_rcu(&page->rcu_head, kvm_s2_rcu_put_page); -+} -+ - static phys_addr_t kvm_host_pa(void *addr) - { - return __pa(addr); -@@ -684,6 +699,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { - .free_removed_table = stage2_free_removed_table, - .get_page = kvm_host_get_page, - .put_page = kvm_s2_put_page, -+ .put_page_rcu = kvm_s2_put_page_rcu, - .page_count = kvm_host_page_count, - .phys_to_virt = kvm_host_va, - .virt_to_phys = kvm_host_pa, -@@ -1624,6 +1640,66 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) - return pte_valid(pte) && pte_young(pte); - } - -+struct test_clear_young_arg { -+ struct kvm_gfn_range *range; -+ gfn_t lsb_gfn; -+ unsigned long *bitmap; -+}; -+ -+static int stage2_test_clear_young(const struct kvm_pgtable_visit_ctx *ctx, -+ enum kvm_pgtable_walk_flags flags) -+{ -+ struct test_clear_young_arg *arg = ctx->arg; -+ gfn_t gfn = ctx->addr / PAGE_SIZE; -+ kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF; -+ -+ VM_WARN_ON_ONCE(!page_count(virt_to_page(ctx->ptep))); -+ VM_WARN_ON_ONCE(gfn < arg->range->start || gfn >= arg->range->end); -+ -+ if (!kvm_pte_valid(new)) -+ return 0; -+ -+ if (new == ctx->old) -+ return 0; -+ -+ /* see the comments on the generic kvm_arch_has_test_clear_young() */ -+ if (__test_and_change_bit(arg->lsb_gfn - gfn, arg->bitmap)) -+ cmpxchg64(ctx->ptep, ctx->old, new); -+ -+ return 0; -+} -+ -+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, -+ gfn_t lsb_gfn, unsigned long *bitmap) -+{ -+ u64 start = range->start * PAGE_SIZE; -+ u64 end = range->end * PAGE_SIZE; -+ struct test_clear_young_arg arg = { -+ .range = range, -+ .lsb_gfn = lsb_gfn, -+ .bitmap = bitmap, -+ }; -+ struct kvm_pgtable_walker walker = { -+ .cb = stage2_test_clear_young, -+ .arg = &arg, -+ .flags = KVM_PGTABLE_WALK_LEAF, -+ }; -+ -+ BUILD_BUG_ON(is_hyp_code()); -+ -+ if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young())) -+ return false; -+ -+ /* see the comments on kvm_pgtable_walk_flags */ -+ rcu_read_lock(); -+ -+ kvm_pgtable_walk(kvm->arch.mmu.pgt, start, end - start, &walker); -+ -+ rcu_read_unlock(); -+ -+ return true; -+} -+ - bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) - { - if (!kvm->arch.mmu.pgt) -@@ -1848,7 +1924,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) - - void kvm_arch_flush_shadow_all(struct kvm *kvm) - { -- kvm_free_stage2_pgd(&kvm->arch.mmu); - } - - void kvm_arch_flush_shadow_memslot(struct kvm *kvm, -diff 
--git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h -index caea15dcb91d..996850029ce0 100644 ---- a/arch/powerpc/include/asm/kvm_host.h -+++ b/arch/powerpc/include/asm/kvm_host.h -@@ -886,4 +886,22 @@ static inline void kvm_arch_exit(void) {} - static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} - static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} - -+static inline int kvmppc_radix_possible(void) -+{ -+ return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled(); -+} -+ -+static inline bool kvmhv_on_pseries(void) -+{ -+ return IS_ENABLED(CONFIG_PPC_PSERIES) && !cpu_has_feature(CPU_FTR_HVMODE); -+} -+ -+/* see the comments on the generic kvm_arch_has_test_clear_young() */ -+#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young -+static inline bool kvm_arch_has_test_clear_young(void) -+{ -+ return IS_ENABLED(CONFIG_KVM) && IS_ENABLED(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && -+ kvmppc_radix_possible() && !kvmhv_on_pseries(); -+} -+ - #endif /* __POWERPC_KVM_HOST_H__ */ -diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h -index eae9619b6190..0bb772fc12b1 100644 ---- a/arch/powerpc/include/asm/kvm_ppc.h -+++ b/arch/powerpc/include/asm/kvm_ppc.h -@@ -277,6 +277,8 @@ struct kvmppc_ops { - bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range); - bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); - bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); -+ bool (*test_clear_young)(struct kvm *kvm, struct kvm_gfn_range *range, -+ gfn_t lsb_gfn, unsigned long *bitmap); - bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); - void (*free_memslot)(struct kvm_memory_slot *slot); - int (*init_vm)(struct kvm *kvm); -@@ -580,18 +582,6 @@ static inline bool kvm_hv_mode_active(void) { return false; } - - #endif - --#ifdef CONFIG_PPC_PSERIES --static inline bool kvmhv_on_pseries(void) --{ -- return !cpu_has_feature(CPU_FTR_HVMODE); --} --#else --static inline bool kvmhv_on_pseries(void) --{ -- return false; --} --#endif -- - #ifdef CONFIG_KVM_XICS - static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) - { -diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c -index 6d525285dbe8..f4cf330e3e81 100644 ---- a/arch/powerpc/kvm/book3s.c -+++ b/arch/powerpc/kvm/book3s.c -@@ -877,6 +877,13 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) - return kvm->arch.kvm_ops->test_age_gfn(kvm, range); - } - -+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, -+ gfn_t lsb_gfn, unsigned long *bitmap) -+{ -+ return kvm->arch.kvm_ops->test_clear_young && -+ kvm->arch.kvm_ops->test_clear_young(kvm, range, lsb_gfn, bitmap); -+} -+ - bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) - { - return kvm->arch.kvm_ops->set_spte_gfn(kvm, range); -diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h -index 58391b4b32ed..fe9cac423817 100644 ---- a/arch/powerpc/kvm/book3s.h -+++ b/arch/powerpc/kvm/book3s.h -@@ -12,6 +12,8 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, - extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range); - extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); - extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); -+extern bool kvmhv_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, -+ gfn_t lsb_gfn, unsigned long *bitmap); - extern bool kvm_set_spte_gfn_hv(struct kvm 
*kvm, struct kvm_gfn_range *range); - - extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu); -diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c -index 9d3743ca16d5..8476646c554c 100644 ---- a/arch/powerpc/kvm/book3s_64_mmu_radix.c -+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c -@@ -1083,6 +1083,78 @@ bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - return ref; - } - -+bool kvmhv_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, -+ gfn_t lsb_gfn, unsigned long *bitmap) -+{ -+ bool success; -+ gfn_t gfn = range->start; -+ -+ if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young())) -+ return false; -+ -+ /* -+ * This function relies on two techniques, RCU and cmpxchg, to safely -+ * test and clear the accessed bit without taking the MMU lock. The -+ * former protects KVM page tables from being freed while the latter -+ * clears the accessed bit atomically against both the hardware and -+ * other software page table walkers. -+ */ -+ rcu_read_lock(); -+ -+ success = kvm_is_radix(kvm); -+ if (!success) -+ goto unlock; -+ -+ /* -+ * case 1: this function kvmppc_switch_mmu_to_hpt() -+ * -+ * rcu_read_lock() -+ * test kvm_is_radix() kvm->arch.radix = 0 -+ * use kvm->arch.pgtable -+ * rcu_read_unlock() -+ * synchronize_rcu() -+ * kvmppc_free_radix() -+ * -+ * -+ * case 2: this function kvmppc_switch_mmu_to_radix() -+ * -+ * kvmppc_init_vm_radix() -+ * smp_wmb() -+ * test kvm_is_radix() kvm->arch.radix = 1 -+ * smp_rmb() -+ * use kvm->arch.pgtable -+ */ -+ smp_rmb(); -+ -+ while (gfn < range->end) { -+ pte_t *ptep; -+ pte_t old, new; -+ unsigned int shift; -+ -+ ptep = find_kvm_secondary_pte_unlocked(kvm, gfn * PAGE_SIZE, &shift); -+ if (!ptep) -+ goto next; -+ -+ VM_WARN_ON_ONCE(!page_count(virt_to_page(ptep))); -+ -+ old = READ_ONCE(*ptep); -+ if (!pte_present(old) || !pte_young(old)) -+ goto next; -+ -+ new = pte_mkold(old); -+ -+ /* see the comments on the generic kvm_arch_has_test_clear_young() */ -+ if (__test_and_change_bit(lsb_gfn - gfn, bitmap)) -+ pte_xchg(ptep, old, new); -+next: -+ gfn += shift ? 
BIT(shift - PAGE_SHIFT) : 1; -+ } -+unlock: -+ rcu_read_unlock(); -+ -+ return success; -+} -+ - /* Returns the number of PAGE_SIZE pages that are dirty */ - static int kvm_radix_test_clear_dirty(struct kvm *kvm, - struct kvm_memory_slot *memslot, int pagenum) -@@ -1464,13 +1536,15 @@ int kvmppc_radix_init(void) - { - unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE; - -- kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor); -+ kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, -+ SLAB_TYPESAFE_BY_RCU, pte_ctor); - if (!kvm_pte_cache) - return -ENOMEM; - - size = sizeof(void *) << RADIX_PMD_INDEX_SIZE; - -- kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor); -+ kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, -+ SLAB_TYPESAFE_BY_RCU, pmd_ctor); - if (!kvm_pmd_cache) { - kmem_cache_destroy(kvm_pte_cache); - return -ENOMEM; -diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c -index 6ba68dd6190b..17b415661282 100644 ---- a/arch/powerpc/kvm/book3s_hv.c -+++ b/arch/powerpc/kvm/book3s_hv.c -@@ -5242,6 +5242,8 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) - spin_lock(&kvm->mmu_lock); - kvm->arch.radix = 0; - spin_unlock(&kvm->mmu_lock); -+ /* see the comments in kvmhv_test_clear_young() */ -+ synchronize_rcu(); - kvmppc_free_radix(kvm); - - lpcr = LPCR_VPM1; -@@ -5266,6 +5268,8 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm) - if (err) - return err; - kvmppc_rmap_reset(kvm); -+ /* see the comments in kvmhv_test_clear_young() */ -+ smp_wmb(); - /* Mutual exclusion with kvm_unmap_gfn_range etc. */ - spin_lock(&kvm->mmu_lock); - kvm->arch.radix = 1; -@@ -6165,6 +6169,7 @@ static struct kvmppc_ops kvm_ops_hv = { - .unmap_gfn_range = kvm_unmap_gfn_range_hv, - .age_gfn = kvm_age_gfn_hv, - .test_age_gfn = kvm_test_age_gfn_hv, -+ .test_clear_young = kvmhv_test_clear_young, - .set_spte_gfn = kvm_set_spte_gfn_hv, - .free_memslot = kvmppc_core_free_memslot_hv, - .init_vm = kvmppc_core_init_vm_hv, -@@ -6225,11 +6230,6 @@ static int kvm_init_subcore_bitmap(void) - return 0; - } - --static int kvmppc_radix_possible(void) --{ -- return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled(); --} -- - static int kvmppc_book3s_init_hv(void) - { - int r; -diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h -index 6aaae18f1854..d2995c9e8f07 100644 ---- a/arch/x86/include/asm/kvm_host.h -+++ b/arch/x86/include/asm/kvm_host.h -@@ -1367,6 +1367,12 @@ struct kvm_arch { - * the MMU lock in read mode + the tdp_mmu_pages_lock or - * the MMU lock in write mode - * -+ * kvm_arch_test_clear_young() is a special case. It relies on two -+ * techniques, RCU and cmpxchg, to safely test and clear the accessed -+ * bit without taking the MMU lock. The former protects KVM page tables -+ * from being freed while the latter clears the accessed bit atomically -+ * against both the hardware and other software page table walkers. -+ * - * Roots will remain in the list until their tdp_mmu_root_count - * drops to zero, at which point the thread that decremented the - * count to zero should removed the root from the list and clean -@@ -2171,4 +2177,25 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); - KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ - KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) - -+extern u64 __read_mostly shadow_accessed_mask; -+ -+/* -+ * Returns true if A/D bits are supported in hardware and are enabled by KVM. -+ * When enabled, KVM uses A/D bits for all non-nested MMUs. 
Because L1 can -+ * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the -+ * scenario where KVM is using A/D bits for L1, but not L2. -+ */ -+static inline bool kvm_ad_enabled(void) -+{ -+ return shadow_accessed_mask; -+} -+ -+/* see the comments on the generic kvm_arch_has_test_clear_young() */ -+#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young -+static inline bool kvm_arch_has_test_clear_young(void) -+{ -+ return IS_ENABLED(CONFIG_KVM) && IS_ENABLED(CONFIG_X86_64) && -+ (!IS_REACHABLE(CONFIG_KVM) || (kvm_ad_enabled() && tdp_enabled)); -+} -+ - #endif /* _ASM_X86_KVM_HOST_H */ -diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h -index 6f54dc9409c9..0dc7fed1f3fd 100644 ---- a/arch/x86/kvm/mmu/spte.h -+++ b/arch/x86/kvm/mmu/spte.h -@@ -153,7 +153,6 @@ extern u64 __read_mostly shadow_mmu_writable_mask; - extern u64 __read_mostly shadow_nx_mask; - extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ - extern u64 __read_mostly shadow_user_mask; --extern u64 __read_mostly shadow_accessed_mask; - extern u64 __read_mostly shadow_dirty_mask; - extern u64 __read_mostly shadow_mmio_value; - extern u64 __read_mostly shadow_mmio_mask; -@@ -247,17 +246,6 @@ static inline bool is_shadow_present_pte(u64 pte) - return !!(pte & SPTE_MMU_PRESENT_MASK); - } - --/* -- * Returns true if A/D bits are supported in hardware and are enabled by KVM. -- * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can -- * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the -- * scenario where KVM is using A/D bits for L1, but not L2. -- */ --static inline bool kvm_ad_enabled(void) --{ -- return !!shadow_accessed_mask; --} -- - static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) - { - return sp->role.ad_disabled; -diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c -index d6df38d371a0..9028e09f1aab 100644 ---- a/arch/x86/kvm/mmu/tdp_mmu.c -+++ b/arch/x86/kvm/mmu/tdp_mmu.c -@@ -1309,6 +1309,47 @@ bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) - return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); - } - -+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, -+ gfn_t lsb_gfn, unsigned long *bitmap) -+{ -+ struct kvm_mmu_page *root; -+ -+ if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young())) -+ return false; -+ -+ if (kvm_memslots_have_rmaps(kvm)) -+ return false; -+ -+ /* see the comments on kvm_arch->tdp_mmu_roots */ -+ rcu_read_lock(); -+ -+ list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) { -+ struct tdp_iter iter; -+ -+ if (kvm_mmu_page_as_id(root) != range->slot->as_id) -+ continue; -+ -+ tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) { -+ u64 *sptep = rcu_dereference(iter.sptep); -+ u64 new_spte = iter.old_spte & ~shadow_accessed_mask; -+ -+ VM_WARN_ON_ONCE(!page_count(virt_to_page(sptep))); -+ VM_WARN_ON_ONCE(iter.gfn < range->start || iter.gfn >= range->end); -+ -+ if (new_spte == iter.old_spte) -+ continue; -+ -+ /* see the comments on the generic kvm_arch_has_test_clear_young() */ -+ if (__test_and_change_bit(lsb_gfn - iter.gfn, bitmap)) -+ cmpxchg64(sptep, iter.old_spte, new_spte); -+ } -+ } -+ -+ rcu_read_unlock(); -+ -+ return true; -+} -+ - static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range) - { -diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h -index 4f26b244f6d0..df46fc815c8b 100644 ---- a/include/linux/kvm_host.h -+++ 
b/include/linux/kvm_host.h -@@ -2281,4 +2281,33 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) - /* Max number of entries allowed for each kvm dirty ring */ - #define KVM_DIRTY_RING_MAX_ENTRIES 65536 - -+/* -+ * Architectures that implement kvm_arch_test_clear_young() should override -+ * kvm_arch_has_test_clear_young(). -+ * -+ * kvm_arch_has_test_clear_young() is allowed to return false positive. It can -+ * return true if kvm_arch_test_clear_young() is supported but disabled due to -+ * some runtime constraint. In this case, kvm_arch_test_clear_young() should -+ * return false. -+ * -+ * The last parameter to kvm_arch_test_clear_young() is a bitmap with the -+ * following specifications: -+ * 1. The offset of each bit is relative to the second to the last parameter -+ * lsb_gfn. E.g., the offset corresponding to gfn is lsb_gfn-gfn. This is to -+ * better suit batching while forward looping. -+ * 2. For each KVM PTE with the accessed bit set, the implementation should flip -+ * the corresponding bit in the bitmap. It should only clear the accessed bit -+ * if the old value is 1. This allows the caller to test or test and clear -+ * the accessed bit. -+ */ -+#ifndef kvm_arch_has_test_clear_young -+static inline bool kvm_arch_has_test_clear_young(void) -+{ -+ return false; -+} -+#endif -+ -+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, -+ gfn_t lsb_gfn, unsigned long *bitmap); -+ - #endif -diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h -index d6c06e140277..521f71ad0467 100644 ---- a/include/linux/mmu_notifier.h -+++ b/include/linux/mmu_notifier.h -@@ -122,6 +122,11 @@ struct mmu_notifier_ops { - struct mm_struct *mm, - unsigned long address); - -+ /* see the comments on mmu_notifier_test_clear_young() */ -+ bool (*test_clear_young)(struct mmu_notifier *mn, struct mm_struct *mm, -+ unsigned long start, unsigned long end, -+ unsigned long *bitmap); -+ - /* - * change_pte is called in cases that pte mapping to page is changed: - * for example, when ksm remaps pte to point to a new shared page. -@@ -391,6 +396,9 @@ extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, - extern int __mmu_notifier_clear_young(struct mm_struct *mm, - unsigned long start, - unsigned long end); -+extern int __mmu_notifier_test_clear_young(struct mm_struct *mm, -+ unsigned long start, unsigned long end, -+ bool fallback, unsigned long *bitmap); - extern int __mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address); - extern void __mmu_notifier_change_pte(struct mm_struct *mm, -@@ -433,6 +441,31 @@ static inline int mmu_notifier_clear_young(struct mm_struct *mm, - return 0; - } - -+/* -+ * This function always returns 0 if fallback is not allowed. If fallback -+ * happens, its return value is similar to that of mmu_notifier_clear_young(). -+ * -+ * The bitmap has the following specifications: -+ * 1. The number of bits should be at least (end-start)/PAGE_SIZE. -+ * 2. The offset of each bit is relative to the end. E.g., the offset -+ * corresponding to addr is (end-addr)/PAGE_SIZE-1. This is to better suit -+ * batching while forward looping. -+ * 3. For each KVM PTE with the accessed bit set (young), this function flips -+ * the corresponding bit in the bitmap. It only clears the accessed bit if -+ * the old value is 1. A caller can test or test and clear the accessed bit -+ * by setting the corresponding bit in the bitmap to 0 or 1, and the new -+ * value will be 1 or 0 for a young KVM PTE. 
-+ */ -+static inline int mmu_notifier_test_clear_young(struct mm_struct *mm, -+ unsigned long start, unsigned long end, -+ bool fallback, unsigned long *bitmap) -+{ -+ if (mm_has_notifiers(mm)) -+ return __mmu_notifier_test_clear_young(mm, start, end, fallback, bitmap); -+ -+ return 0; -+} -+ - static inline int mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address) - { -@@ -687,6 +720,13 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, - return 0; - } - -+static inline int mmu_notifier_test_clear_young(struct mm_struct *mm, -+ unsigned long start, unsigned long end, -+ bool fallback, unsigned long *bitmap) -+{ -+ return 0; -+} -+ - static inline int mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address) - { -diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index 70bd7f55bdd2..0ddbf712708d 100644 ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -379,6 +379,7 @@ enum { - LRU_GEN_CORE, - LRU_GEN_MM_WALK, - LRU_GEN_NONLEAF_YOUNG, -+ LRU_GEN_SPTE_WALK, - NR_LRU_GEN_CAPS +@@ -55,6 +55,28 @@ struct rcu_reader_struct { + struct rcu_test_struct2 *test; }; -@@ -485,7 +486,7 @@ struct lru_gen_mm_walk { - }; - - void lru_gen_init_lruvec(struct lruvec *lruvec); --void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); -+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw); - - #ifdef CONFIG_MEMCG - -@@ -573,8 +574,9 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) - { - } - --static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) -+static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - { -+ return false; - } - - #ifdef CONFIG_MEMCG -diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c -index f45ff1b7626a..324799848fed 100644 ---- a/mm/mmu_notifier.c -+++ b/mm/mmu_notifier.c -@@ -402,6 +402,32 @@ int __mmu_notifier_clear_young(struct mm_struct *mm, - return young; - } - -+/* see the comments on mmu_notifier_test_clear_young() */ -+int __mmu_notifier_test_clear_young(struct mm_struct *mm, -+ unsigned long start, unsigned long end, -+ bool fallback, unsigned long *bitmap) ++static int get_alloc_node_count(struct ma_state *mas) +{ -+ int key; -+ struct mmu_notifier *mn; -+ int young = 0; ++ int count = 1; ++ struct maple_alloc *node = mas->alloc; + -+ key = srcu_read_lock(&srcu); -+ -+ hlist_for_each_entry_srcu(mn, &mm->notifier_subscriptions->list, -+ hlist, srcu_read_lock_held(&srcu)) { -+ if (mn->ops->test_clear_young && -+ mn->ops->test_clear_young(mn, mm, start, end, bitmap)) -+ continue; -+ -+ if (fallback && mn->ops->clear_young) -+ young |= mn->ops->clear_young(mn, mm, start, end); ++ if (!node || ((unsigned long)node & 0x1)) ++ return 0; ++ while (node->node_count) { ++ count += node->node_count; ++ node = node->slot[0]; + } -+ -+ srcu_read_unlock(&srcu, key); -+ -+ return young; ++ return count; +} + - int __mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address) - { -diff --git a/mm/rmap.c b/mm/rmap.c -index c8701608bb0d..8ecbbadab752 100644 ---- a/mm/rmap.c -+++ b/mm/rmap.c -@@ -822,12 +822,10 @@ static bool folio_referenced_one(struct folio *folio, - return false; /* To break the loop */ - } - -- if (pvmw.pte) { -- if (lru_gen_enabled() && pte_young(*pvmw.pte)) { -- lru_gen_look_around(&pvmw); -+ if (lru_gen_enabled() && pvmw.pte) { -+ if (lru_gen_look_around(&pvmw)) - referenced++; -- } -- -+ } else if (pvmw.pte) { - if (ptep_clear_flush_young_notify(vma, address, - pvmw.pte)) - referenced++; -diff --git a/mm/vmscan.c 
b/mm/vmscan.c -index 1a8f3b1c0bad..ec0142165ce7 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -57,6 +57,8 @@ - #include - #include - #include -+#include -+#include - - #include - #include -@@ -3927,6 +3929,55 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, - return folio; - } - -+static bool test_spte_young(struct mm_struct *mm, unsigned long addr, unsigned long end, -+ unsigned long *bitmap, unsigned long *last) ++static void check_mas_alloc_node_count(struct ma_state *mas) +{ -+ if (!kvm_arch_has_test_clear_young() || !get_cap(LRU_GEN_SPTE_WALK)) -+ return false; -+ -+ if (*last > addr) -+ goto done; -+ -+ *last = end - addr > MIN_LRU_BATCH * PAGE_SIZE ? -+ addr + MIN_LRU_BATCH * PAGE_SIZE - 1 : end - 1; -+ bitmap_zero(bitmap, MIN_LRU_BATCH); -+ -+ mmu_notifier_test_clear_young(mm, addr, *last + 1, false, bitmap); -+done: -+ return test_bit((*last - addr) / PAGE_SIZE, bitmap); -+} -+ -+static void clear_spte_young(struct mm_struct *mm, unsigned long addr, -+ unsigned long *bitmap, unsigned long *last) -+{ -+ int i; -+ unsigned long start, end = *last + 1; -+ -+ if (addr + PAGE_SIZE != end) -+ return; -+ -+ i = find_last_bit(bitmap, MIN_LRU_BATCH); -+ if (i == MIN_LRU_BATCH) -+ return; -+ -+ start = end - (i + 1) * PAGE_SIZE; -+ -+ i = find_first_bit(bitmap, MIN_LRU_BATCH); -+ -+ end -= i * PAGE_SIZE; -+ -+ mmu_notifier_test_clear_young(mm, start, end, false, bitmap); -+} -+ -+static void skip_spte_young(struct mm_struct *mm, unsigned long addr, -+ unsigned long *bitmap, unsigned long *last) -+{ -+ if (*last > addr) -+ __clear_bit((*last - addr) / PAGE_SIZE, bitmap); -+ -+ clear_spte_young(mm, addr, bitmap, last); -+} -+ - static bool suitable_to_scan(int total, int young) - { - int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); -@@ -3942,6 +3993,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, - pte_t *pte; - spinlock_t *ptl; - unsigned long addr; -+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; -+ unsigned long last = 0; - int total = 0; - int young = 0; - struct lru_gen_mm_walk *walk = args->private; -@@ -3960,6 +4013,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, - pte = pte_offset_map(pmd, start & PMD_MASK); - restart: - for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { -+ bool success; - unsigned long pfn; - struct folio *folio; - -@@ -3967,20 +4021,27 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, - walk->mm_stats[MM_LEAF_TOTAL]++; - - pfn = get_pte_pfn(pte[i], args->vma, addr); -- if (pfn == -1) -+ if (pfn == -1) { -+ skip_spte_young(args->vma->vm_mm, addr, bitmap, &last); - continue; -+ } - -- if (!pte_young(pte[i])) { -+ success = test_spte_young(args->vma->vm_mm, addr, end, bitmap, &last); -+ if (!success && !pte_young(pte[i])) { -+ skip_spte_young(args->vma->vm_mm, addr, bitmap, &last); - walk->mm_stats[MM_LEAF_OLD]++; - continue; - } - - folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); -- if (!folio) -+ if (!folio) { -+ skip_spte_young(args->vma->vm_mm, addr, bitmap, &last); - continue; -+ } - -- if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) -- VM_WARN_ON_ONCE(true); -+ clear_spte_young(args->vma->vm_mm, addr, bitmap, &last); -+ if (pte_young(pte[i])) -+ ptep_test_and_clear_young(args->vma, addr, pte + i); - - young++; - walk->mm_stats[MM_LEAF_YOUNG]++; -@@ -4589,6 +4650,24 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - * 
rmap/PT walk feedback - ******************************************************************************/ - -+static bool should_look_around(struct vm_area_struct *vma, unsigned long addr, -+ pte_t *pte, int *young) -+{ -+ unsigned long old = true; -+ -+ *young = mmu_notifier_test_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE, true, &old); -+ if (!old) -+ *young = true; -+ -+ if (pte_young(*pte)) { -+ ptep_test_and_clear_young(vma, addr, pte); -+ *young = true; -+ return true; -+ } -+ -+ return !old && get_cap(LRU_GEN_SPTE_WALK); ++ mas_node_count_gfp(mas, MAPLE_ALLOC_SLOTS + 1, GFP_KERNEL); ++ mas_node_count_gfp(mas, MAPLE_ALLOC_SLOTS + 3, GFP_KERNEL); ++ MT_BUG_ON(mas->tree, get_alloc_node_count(mas) != mas->alloc->total); ++ mas_destroy(mas); +} + /* - * This function exploits spatial locality when shrink_folio_list() walks the - * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If -@@ -4596,12 +4675,14 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - * the PTE table to the Bloom filter. This forms a feedback loop between the - * eviction and the aging. - */ --void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) -+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - { - int i; - unsigned long start; - unsigned long end; - struct lru_gen_mm_walk *walk; -+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; -+ unsigned long last = 0; - int young = 0; - pte_t *pte = pvmw->pte; - unsigned long addr = pvmw->address; -@@ -4615,8 +4696,11 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - lockdep_assert_held(pvmw->ptl); - VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); + * check_new_node() - Check the creation of new nodes and error path + * verification. +@@ -69,6 +91,8 @@ static noinline void check_new_node(struct maple_tree *mt) -+ if (!should_look_around(pvmw->vma, addr, pte, &young)) -+ return young; + MA_STATE(mas, mt, 0, 0); + ++ check_mas_alloc_node_count(&mas); + - if (spin_is_contended(pvmw->ptl)) -- return; -+ return young; + /* Try allocating 3 nodes */ + mtree_lock(mt); + mt_set_non_kernel(0); +-- +2.40.0 + +From c9249daec15495e2d4e2a0519e75421784e31ddc Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sun, 9 Apr 2023 21:25:25 +0200 +Subject: [PATCH 08/10] Per-VMA locks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Previous versions: +v3: https://lore.kernel.org/all/20230216051750.3125598-1-surenb@google.com/ +v2: https://lore.kernel.org/lkml/20230127194110.533103-1-surenb@google.com/ +v1: https://lore.kernel.org/all/20230109205336.3665937-1-surenb@google.com/ +RFC: https://lore.kernel.org/all/20220901173516.702122-1-surenb@google.com/ + +LWN article describing the feature: +https://lwn.net/Articles/906852/ + +Per-vma locks idea that was discussed during SPF [1] discussion at LSF/MM +last year [2], which concluded with suggestion that “a reader/writer +semaphore could be put into the VMA itself; that would have the effect of +using the VMA as a sort of range lock. There would still be contention at +the VMA level, but it would be an improvement.” This patchset implements +this suggested approach. + +When handling page faults we lookup the VMA that contains the faulting +page under RCU protection and try to acquire its lock. If that fails we +fall back to using mmap_lock, similar to how SPF handled this situation. + +One notable way the implementation deviates from the proposal is the way +VMAs are read-locked. 
During some of mm updates, multiple VMAs need to be +locked until the end of the update (e.g. vma_merge, split_vma, etc). +Tracking all the locked VMAs, avoiding recursive locks, figuring out when +it's safe to unlock previously locked VMAs would make the code more +complex. So, instead of the usual lock/unlock pattern, the proposed +solution marks a VMA as locked and provides an efficient way to: +1. Identify locked VMAs. +2. Unlock all locked VMAs in bulk. +We also postpone unlocking the locked VMAs until the end of the update, +when we do mmap_write_unlock. Potentially this keeps a VMA locked for +longer than is absolutely necessary but it results in a big reduction of +code complexity. +Read-locking a VMA is done using two sequence numbers - one in the +vm_area_struct and one in the mm_struct. VMA is considered read-locked +when these sequence numbers are equal. To read-lock a VMA we set the +sequence number in vm_area_struct to be equal to the sequence number in +mm_struct. To unlock all VMAs we increment mm_struct's seq number. This +allows for an efficient way to track locked VMAs and to drop the locks on +all VMAs at the end of the update. + +The patchset implements per-VMA locking only for anonymous pages which +are not in swap and avoids userfaultfs as their implementation is more +complex. Additional support for file-back page faults, swapped and user +pages can be added incrementally. + +Performance benchmarks show similar although slightly smaller benefits as +with SPF patchset (~75% of SPF benefits). Still, with lower complexity +this approach might be more desirable. + +Since RFC was posted in September 2022, two separate Google teams outside +of Android evaluated the patchset and confirmed positive results. Here are +the known usecases when per-VMA locks show benefits: + +Android: +Apps with high number of threads (~100) launch times improve by up to 20%. +Each thread mmaps several areas upon startup (Stack and Thread-local +storage (TLS), thread signal stack, indirect ref table), which requires +taking mmap_lock in write mode. Page faults take mmap_lock in read mode. +During app launch, both thread creation and page faults establishing the +active workinget are happening in parallel and that causes lock contention +between mm writers and readers even if updates and page faults are +happening in different VMAs. Per-vma locks prevent this contention by +providing more granular lock. + +Google Fibers: +We have several dynamically sized thread pools that spawn new threads +under increased load and reduce their number when idling. For example, +Google's in-process scheduling/threading framework, UMCG/Fibers, is backed +by such a thread pool. When idling, only a small number of idle worker +threads are available; when a spike of incoming requests arrive, each +request is handled in its own "fiber", which is a work item posted onto a +UMCG worker thread; quite often these spikes lead to a number of new +threads spawning. Each new thread needs to allocate and register an RSEQ +section on its TLS, then register itself with the kernel as a UMCG worker +thread, and only after that it can be considered by the in-process +UMCG/Fiber scheduler as available to do useful work. In short, during an +incoming workload spike new threads have to be spawned, and they perform +several syscalls (RSEQ registration, UMCG worker registration, memory +allocations) before they can actually start doing useful work. 
Removing +any bottlenecks on this thread startup path will greatly improve our +services' latencies when faced with request/workload spikes. +At high scale, mmap_lock contention during thread creation and stack page +faults leads to user-visible multi-second serving latencies in a similar +pattern to Android app startup. Per-VMA locking patchset has been run +successfully in limited experiments with user-facing production workloads. +In these experiments, we observed that the peak thread creation rate was +high enough that thread creation is no longer a bottleneck. + +TCP zerocopy receive: +From the point of view of TCP zerocopy receive, the per-vma lock patch is +massively beneficial. +In today's implementation, a process with N threads where N - 1 are +performing zerocopy receive and 1 thread is performing madvise() with the +write lock taken (e.g. needs to change vm_flags) will result in all N -1 +receive threads blocking until the madvise is done. Conversely, on a busy +process receiving a lot of data, an madvise operation that does need to +take the mmap lock in write mode will need to wait for all of the receives +to be done - a lose:lose proposition. Per-VMA locking _removes_ by +definition this source of contention entirely. +There are other benefits for receive as well, chiefly a reduction in +cacheline bouncing across receiving threads for locking/unlocking the +single mmap lock. On an RPC style synthetic workload with 4KB RPCs: +1a) The find+lock+unlock VMA path in the base case, without the per-vma +lock patchset, is about 0.7% of cycles as measured by perf. +1b) mmap_read_lock + mmap_read_unlock in the base case is about 0.5% +cycles overall - most of this is within the TCP read hotpath (a small +fraction is 'other' usage in the system). +2a) The find+lock+unlock VMA path, with the per-vma patchset and a trivial +patch written to take advantage of it in TCP, is about 0.4% of cycles +(down from 0.7% above) +2b) mmap_read_lock + mmap_read_unlock in the per-vma patchset is < 0.1% +cycles and is out of the TCP read hotpath entirely (down from 0.5% before, +the remaining usage is the 'other' usage in the system). +So, in addition to entirely removing an onerous source of contention, it +also reduces the CPU cycles of TCP receive zerocopy by about 0.5%+ +(compared to overall cycles in perf) for the 'small' RPC scenario. + +The patchset structure is: +0001-0008: Enable maple-tree RCU mode +0009-0031: Main per-vma locks patchset +0032-0033: Performance optimizations + +Changes since v3: +- Changed patch [3] to move vma_prepare before vma_adjust_trans_huge +- Dropped patch [4] from the set as unnecessary, per Hyeonggon Yoo +- Changed patch [5] to do VMA locking inside vma_prepare, per Liam Howlett +- Dropped patch [6] from the set as unnecessary, per Liam Howlett + +[1] https://lore.kernel.org/all/20220128131006.67712-1-michel@lespinasse.org/ +[2] https://lwn.net/Articles/893906/ +[3] https://lore.kernel.org/all/20230216051750.3125598-15-surenb@google.com/ +[4] https://lore.kernel.org/all/20230216051750.3125598-17-surenb@google.com/ +[5] https://lore.kernel.org/all/20230216051750.3125598-18-surenb@google.com/ +[6] https://lore.kernel.org/all/20230216051750.3125598-22-surenb@google.com/ + +The patchset applies cleanly over mm-unstable branch. 
+ +Laurent Dufour (1): + powerc/mm: try VMA lock-based page fault handling first + +Liam Howlett (4): + maple_tree: Be more cautious about dead nodes + maple_tree: Detect dead nodes in mas_start() + maple_tree: Fix freeing of nodes in rcu mode + maple_tree: remove extra smp_wmb() from mas_dead_leaves() + +Liam R. Howlett (4): + maple_tree: Fix write memory barrier of nodes once dead for RCU mode + maple_tree: Add smp_rmb() to dead node detection + maple_tree: Add RCU lock checking to rcu callback functions + mm: Enable maple tree RCU mode by default. + +Michel Lespinasse (1): + mm: rcu safe VMA freeing + +Suren Baghdasaryan (23): + mm: introduce CONFIG_PER_VMA_LOCK + mm: move mmap_lock assert function definitions + mm: add per-VMA lock and helper functions to control it + mm: mark VMA as being written when changing vm_flags + mm/mmap: move vma_prepare before vma_adjust_trans_huge + mm/khugepaged: write-lock VMA while collapsing a huge page + mm/mmap: write-lock VMAs in vma_prepare before modifying them + mm/mremap: write-lock VMA while remapping it to a new address range + mm: write-lock VMAs before removing them from VMA tree + mm: conditionally write-lock VMA in free_pgtables + kernel/fork: assert no VMA readers during its destruction + mm/mmap: prevent pagefault handler from racing with mmu_notifier + registration + mm: introduce vma detached flag + mm: introduce lock_vma_under_rcu to be used from arch-specific code + mm: fall back to mmap_lock if vma->anon_vma is not yet set + mm: add FAULT_FLAG_VMA_LOCK flag + mm: prevent do_swap_page from handling page faults under VMA lock + mm: prevent userfaults to be handled under per-vma lock + mm: introduce per-VMA lock statistics + x86/mm: try VMA lock-based page fault handling first + arm64/mm: try VMA lock-based page fault handling first + mm/mmap: free vm_area_struct without call_rcu in exit_mmap + mm: separate vma->lock from vm_area_struct + +Signed-off-by: Peter Jung +--- + Documentation/admin-guide/mm/userfaultfd.rst | 17 +++ + arch/arm64/Kconfig | 1 + + arch/arm64/mm/fault.c | 36 +++++ + arch/powerpc/mm/fault.c | 37 +++++ + arch/powerpc/platforms/powernv/Kconfig | 1 + + arch/powerpc/platforms/pseries/Kconfig | 1 + + arch/s390/Kconfig | 1 + + arch/s390/mm/fault.c | 24 +++ + arch/x86/Kconfig | 1 + + arch/x86/mm/fault.c | 36 +++++ + fs/userfaultfd.c | 16 ++ + include/linux/mm.h | 127 +++++++++++++++- + include/linux/mm_inline.h | 6 + + include/linux/mm_types.h | 30 +++- + include/linux/mmap_lock.h | 37 +++-- + include/linux/userfaultfd_k.h | 23 +++ + include/linux/vm_event_item.h | 6 + + include/linux/vmstat.h | 6 + + include/uapi/linux/userfaultfd.h | 10 +- + kernel/fork.c | 96 ++++++++++-- + mm/Kconfig | 12 ++ + mm/Kconfig.debug | 6 + + mm/hugetlb.c | 4 + + mm/init-mm.c | 3 + + mm/internal.h | 2 +- + mm/khugepaged.c | 10 +- + mm/memory.c | 146 ++++++++++++++++--- + mm/mmap.c | 48 ++++-- + mm/mprotect.c | 51 +++++-- + mm/mremap.c | 1 + + mm/rmap.c | 31 ++-- + mm/vmstat.c | 6 + + tools/testing/selftests/mm/userfaultfd.c | 45 +++++- + 33 files changed, 783 insertions(+), 94 deletions(-) + +diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst +index 7dc823b56ca4..bd2226299583 100644 +--- a/Documentation/admin-guide/mm/userfaultfd.rst ++++ b/Documentation/admin-guide/mm/userfaultfd.rst +@@ -219,6 +219,23 @@ former will have ``UFFD_PAGEFAULT_FLAG_WP`` set, the latter + you still need to supply a page when ``UFFDIO_REGISTER_MODE_MISSING`` was + used. 
- /* avoid taking the LRU lock under the PTL when possible */ - walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; -@@ -4624,6 +4708,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - start = max(addr & PMD_MASK, pvmw->vma->vm_start); - end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; - -+ if (end - start == PAGE_SIZE) -+ return young; ++Userfaultfd write-protect mode currently behave differently on none ptes ++(when e.g. page is missing) over different types of memories. + - if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { - if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) - end = start + MIN_LRU_BATCH * PAGE_SIZE; -@@ -4637,28 +4724,37 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - - /* folio_update_gen() requires stable folio_memcg() */ - if (!mem_cgroup_trylock_pages(memcg)) -- return; -+ return young; - - arch_enter_lazy_mmu_mode(); - - pte -= (addr - start) / PAGE_SIZE; - - for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { -+ bool success; - unsigned long pfn; - - pfn = get_pte_pfn(pte[i], pvmw->vma, addr); -- if (pfn == -1) -+ if (pfn == -1) { -+ skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); - continue; -+ } - -- if (!pte_young(pte[i])) -+ success = test_spte_young(pvmw->vma->vm_mm, addr, end, bitmap, &last); -+ if (!success && !pte_young(pte[i])) { -+ skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); - continue; -+ } - - folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); -- if (!folio) -+ if (!folio) { -+ skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); - continue; -+ } - -- if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) -- VM_WARN_ON_ONCE(true); -+ clear_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); -+ if (pte_young(pte[i])) -+ ptep_test_and_clear_young(pvmw->vma, addr, pte + i); - - young++; - -@@ -4688,6 +4784,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - /* feedback from rmap walkers to page table walkers */ - if (suitable_to_scan(i, young)) - update_bloom_filter(lruvec, max_seq, pvmw->pmd); ++For anonymous memory, ``ioctl(UFFDIO_WRITEPROTECT)`` will ignore none ptes ++(e.g. when pages are missing and not populated). For file-backed memories ++like shmem and hugetlbfs, none ptes will be write protected just like a ++present pte. In other words, there will be a userfaultfd write fault ++message generated when writing to a missing page on file typed memories, ++as long as the page range was write-protected before. Such a message will ++not be generated on anonymous memories by default. + -+ return young; - } ++If the application wants to be able to write protect none ptes on anonymous ++memory, one can pre-populate the memory with e.g. MADV_POPULATE_READ. On ++newer kernels, one can also detect the feature UFFD_FEATURE_WP_UNPOPULATED ++and set the feature bit in advance to make sure none ptes will also be ++write protected even upon anonymous memory. 
++ + QEMU/KVM + ======== - /****************************************************************************** -@@ -5705,6 +5803,9 @@ static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, c - if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) - caps |= BIT(LRU_GEN_NONLEAF_YOUNG); +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 1023e896d46b..6f104c829731 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -95,6 +95,7 @@ config ARM64 + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 + select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_PAGE_TABLE_CHECK ++ select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_BPF_JIT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c +index f4cb0f85ccf4..9e0db5c387e3 100644 +--- a/arch/arm64/mm/fault.c ++++ b/arch/arm64/mm/fault.c +@@ -535,6 +535,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, + unsigned long vm_flags; + unsigned int mm_flags = FAULT_FLAG_DEFAULT; + unsigned long addr = untagged_addr(far); ++#ifdef CONFIG_PER_VMA_LOCK ++ struct vm_area_struct *vma; ++#endif -+ if (kvm_arch_has_test_clear_young() && get_cap(LRU_GEN_SPTE_WALK)) -+ caps |= BIT(LRU_GEN_SPTE_WALK); -+ - return sysfs_emit(buf, "0x%04x\n", caps); - } + if (kprobe_page_fault(regs, esr)) + return 0; +@@ -585,6 +588,36 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, -diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c -index 07aae60288f9..a115a27b375e 100644 ---- a/virt/kvm/kvm_main.c -+++ b/virt/kvm/kvm_main.c -@@ -875,6 +875,63 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, - return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn); - } + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); -+static bool kvm_test_clear_young(struct kvm *kvm, unsigned long start, -+ unsigned long end, unsigned long *bitmap) -+{ -+ int i; -+ int key; -+ bool success = true; ++#ifdef CONFIG_PER_VMA_LOCK ++ if (!(mm_flags & FAULT_FLAG_USER)) ++ goto lock_mmap; + -+ trace_kvm_age_hva(start, end); ++ vma = lock_vma_under_rcu(mm, addr); ++ if (!vma) ++ goto lock_mmap; + -+ key = srcu_read_lock(&kvm->srcu); ++ if (!(vma->vm_flags & vm_flags)) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } ++ fault = handle_mm_fault(vma, addr & PAGE_MASK, ++ mm_flags | FAULT_FLAG_VMA_LOCK, regs); ++ vma_end_read(vma); + -+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { -+ struct interval_tree_node *node; -+ struct kvm_memslots *slots = __kvm_memslots(kvm, i); ++ if (!(fault & VM_FAULT_RETRY)) { ++ count_vm_vma_lock_event(VMA_LOCK_SUCCESS); ++ goto done; ++ } ++ count_vm_vma_lock_event(VMA_LOCK_RETRY); + -+ kvm_for_each_memslot_in_hva_range(node, slots, start, end - 1) { -+ gfn_t lsb_gfn; -+ unsigned long hva_start, hva_end; -+ struct kvm_gfn_range range = { -+ .slot = container_of(node, struct kvm_memory_slot, -+ hva_node[slots->node_idx]), -+ }; ++ /* Quick path to respond to signals */ ++ if (fault_signal_pending(fault, regs)) { ++ if (!user_mode(regs)) ++ goto no_context; ++ return 0; ++ } ++lock_mmap: ++#endif /* CONFIG_PER_VMA_LOCK */ + /* + * As per x86, we may deadlock here. 
However, since the kernel only + * validly references user space from well defined areas of the code, +@@ -628,6 +661,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, + } + mmap_read_unlock(mm); + ++#ifdef CONFIG_PER_VMA_LOCK ++done: ++#endif + /* + * Handle the "normal" (no error) case first. + */ +diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c +index af46aa88422b..531177a4ee08 100644 +--- a/arch/powerpc/mm/fault.c ++++ b/arch/powerpc/mm/fault.c +@@ -474,6 +474,40 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, + if (is_exec) + flags |= FAULT_FLAG_INSTRUCTION; + ++#ifdef CONFIG_PER_VMA_LOCK ++ if (!(flags & FAULT_FLAG_USER)) ++ goto lock_mmap; + -+ hva_start = max(start, range.slot->userspace_addr); -+ hva_end = min(end - 1, range.slot->userspace_addr + -+ range.slot->npages * PAGE_SIZE - 1); ++ vma = lock_vma_under_rcu(mm, address); ++ if (!vma) ++ goto lock_mmap; + -+ range.start = hva_to_gfn_memslot(hva_start, range.slot); -+ range.end = hva_to_gfn_memslot(hva_end, range.slot) + 1; -+ -+ if (WARN_ON_ONCE(range.end <= range.start)) -+ continue; -+ -+ /* see the comments on the generic kvm_arch_has_test_clear_young() */ -+ lsb_gfn = hva_to_gfn_memslot(end - 1, range.slot); -+ -+ success = kvm_arch_test_clear_young(kvm, &range, lsb_gfn, bitmap); -+ if (!success) -+ break; -+ } ++ if (unlikely(access_pkey_error(is_write, is_exec, ++ (error_code & DSISR_KEYFAULT), vma))) { ++ vma_end_read(vma); ++ goto lock_mmap; + } + -+ srcu_read_unlock(&kvm->srcu, key); ++ if (unlikely(access_error(is_write, is_exec, vma))) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } + -+ return success; ++ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); ++ vma_end_read(vma); ++ ++ if (!(fault & VM_FAULT_RETRY)) { ++ count_vm_vma_lock_event(VMA_LOCK_SUCCESS); ++ goto done; ++ } ++ count_vm_vma_lock_event(VMA_LOCK_RETRY); ++ ++ if (fault_signal_pending(fault, regs)) ++ return user_mode(regs) ? 0 : SIGBUS; ++ ++lock_mmap: ++#endif /* CONFIG_PER_VMA_LOCK */ ++ + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. 
Unfortunately, in the case of an +@@ -550,6 +584,9 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, + + mmap_read_unlock(current->mm); + ++#ifdef CONFIG_PER_VMA_LOCK ++done: ++#endif + if (unlikely(fault & VM_FAULT_ERROR)) + return mm_fault_error(regs, address, fault); + +diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig +index ae248a161b43..70a46acc70d6 100644 +--- a/arch/powerpc/platforms/powernv/Kconfig ++++ b/arch/powerpc/platforms/powernv/Kconfig +@@ -16,6 +16,7 @@ config PPC_POWERNV + select PPC_DOORBELL + select MMU_NOTIFIER + select FORCE_SMP ++ select ARCH_SUPPORTS_PER_VMA_LOCK + default y + + config OPAL_PRD +diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig +index 21b22bf16ce6..4ebf2ef2845d 100644 +--- a/arch/powerpc/platforms/pseries/Kconfig ++++ b/arch/powerpc/platforms/pseries/Kconfig +@@ -22,6 +22,7 @@ config PPC_PSERIES + select HOTPLUG_CPU + select FORCE_SMP + select SWIOTLB ++ select ARCH_SUPPORTS_PER_VMA_LOCK + default y + + config PARAVIRT +diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig +index 9809c74e1240..548b5b587003 100644 +--- a/arch/s390/Kconfig ++++ b/arch/s390/Kconfig +@@ -120,6 +120,7 @@ config S390 + select ARCH_SUPPORTS_DEBUG_PAGEALLOC + select ARCH_SUPPORTS_HUGETLBFS + select ARCH_SUPPORTS_NUMA_BALANCING ++ select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_WANTS_DYNAMIC_TASK_STRUCT +diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c +index a2632fd97d00..b65144c392b0 100644 +--- a/arch/s390/mm/fault.c ++++ b/arch/s390/mm/fault.c +@@ -407,6 +407,30 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) + access = VM_WRITE; + if (access == VM_WRITE) + flags |= FAULT_FLAG_WRITE; ++#ifdef CONFIG_PER_VMA_LOCK ++ if (!(flags & FAULT_FLAG_USER)) ++ goto lock_mmap; ++ vma = lock_vma_under_rcu(mm, address); ++ if (!vma) ++ goto lock_mmap; ++ if (!(vma->vm_flags & access)) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } ++ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); ++ vma_end_read(vma); ++ if (!(fault & VM_FAULT_RETRY)) { ++ count_vm_vma_lock_event(VMA_LOCK_SUCCESS); ++ goto out; ++ } ++ count_vm_vma_lock_event(VMA_LOCK_RETRY); ++ /* Quick path to respond to signals */ ++ if (fault_signal_pending(fault, regs)) { ++ fault = VM_FAULT_SIGNAL; ++ goto out; ++ } ++lock_mmap: ++#endif /* CONFIG_PER_VMA_LOCK */ + mmap_read_lock(mm); + + gmap = NULL; +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index a825bf031f49..df21fba77db1 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -27,6 +27,7 @@ config X86_64 + # Options that are inherently 64-bit kernel only: + select ARCH_HAS_GIGANTIC_PAGE + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 ++ select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_USE_CMPXCHG_LOCKREF + select HAVE_ARCH_SOFT_DIRTY + select MODULES_USE_ELF_RELA +diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c +index a498ae1fbe66..e4399983c50c 100644 +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -19,6 +19,7 @@ + #include /* faulthandler_disabled() */ + #include /* efi_crash_gracefully_on_page_fault()*/ + #include ++#include /* find_and_lock_vma() */ + + #include /* boot_cpu_has, ... */ + #include /* dotraplinkage, ... 
*/ +@@ -1333,6 +1334,38 @@ void do_user_addr_fault(struct pt_regs *regs, + } + #endif + ++#ifdef CONFIG_PER_VMA_LOCK ++ if (!(flags & FAULT_FLAG_USER)) ++ goto lock_mmap; ++ ++ vma = lock_vma_under_rcu(mm, address); ++ if (!vma) ++ goto lock_mmap; ++ ++ if (unlikely(access_error(error_code, vma))) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } ++ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); ++ vma_end_read(vma); ++ ++ if (!(fault & VM_FAULT_RETRY)) { ++ count_vm_vma_lock_event(VMA_LOCK_SUCCESS); ++ goto done; ++ } ++ count_vm_vma_lock_event(VMA_LOCK_RETRY); ++ ++ /* Quick path to respond to signals */ ++ if (fault_signal_pending(fault, regs)) { ++ if (!user_mode(regs)) ++ kernelmode_fixup_or_oops(regs, error_code, address, ++ SIGBUS, BUS_ADRERR, ++ ARCH_DEFAULT_PKEY); ++ return; ++ } ++lock_mmap: ++#endif /* CONFIG_PER_VMA_LOCK */ ++ + /* + * Kernel-mode access to the user address space should only occur + * on well-defined single instructions listed in the exception +@@ -1433,6 +1466,9 @@ void do_user_addr_fault(struct pt_regs *regs, + } + + mmap_read_unlock(mm); ++#ifdef CONFIG_PER_VMA_LOCK ++done: ++#endif + if (likely(!(fault & VM_FAULT_ERROR))) + return; + +diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c +index 44d1ee429eb0..881e9c82b9d1 100644 +--- a/fs/userfaultfd.c ++++ b/fs/userfaultfd.c +@@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) + return ctx->features & UFFD_FEATURE_INITIALIZED; + } + ++/* ++ * Whether WP_UNPOPULATED is enabled on the uffd context. It is only ++ * meaningful when userfaultfd_wp()==true on the vma and when it's ++ * anonymous. ++ */ ++bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) ++{ ++ struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; ++ ++ if (!ctx) ++ return false; ++ ++ return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; +} + -+static bool kvm_mmu_notifier_test_clear_young(struct mmu_notifier *mn, struct mm_struct *mm, -+ unsigned long start, unsigned long end, -+ unsigned long *bitmap) + static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, + vm_flags_t flags) + { +@@ -1971,6 +1986,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, + #endif + #ifndef CONFIG_PTE_MARKER_UFFD_WP + uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; ++ uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; + #endif + uffdio_api.ioctls = UFFD_API_IOCTLS; + ret = -EFAULT; +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 1f79667824eb..c4c9de7d1916 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -256,6 +256,8 @@ void setup_initial_init_mm(void *start_code, void *end_code, + struct vm_area_struct *vm_area_alloc(struct mm_struct *); + struct vm_area_struct *vm_area_dup(struct vm_area_struct *); + void vm_area_free(struct vm_area_struct *); ++/* Use only if VMA has no other users */ ++void __vm_area_free(struct vm_area_struct *vma); + + #ifndef CONFIG_MMU + extern struct rb_root nommu_region_tree; +@@ -478,7 +480,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags) + { FAULT_FLAG_USER, "USER" }, \ + { FAULT_FLAG_REMOTE, "REMOTE" }, \ + { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ +- { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" } ++ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ ++ { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" } + + /* + * vm_fault is filled by the pagefault handler and passed to the vma's +@@ -623,6 +626,117 @@ struct vm_operations_struct { + unsigned long addr); + }; + ++#ifdef CONFIG_PER_VMA_LOCK ++/* ++ * 
Try to read-lock a vma. The function is allowed to occasionally yield false ++ * locked result to avoid performance overhead, in which case we fall back to ++ * using mmap_lock. The function should never yield false unlocked result. ++ */ ++static inline bool vma_start_read(struct vm_area_struct *vma) +{ -+ if (kvm_arch_has_test_clear_young()) -+ return kvm_test_clear_young(mmu_notifier_to_kvm(mn), start, end, bitmap); ++ /* Check before locking. A race might cause false locked result. */ ++ if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) ++ return false; + ++ if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) ++ return false; ++ ++ /* ++ * Overflow might produce false locked result. ++ * False unlocked result is impossible because we modify and check ++ * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq ++ * modification invalidates all existing locks. ++ */ ++ if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) { ++ up_read(&vma->vm_lock->lock); ++ return false; ++ } ++ return true; ++} ++ ++static inline void vma_end_read(struct vm_area_struct *vma) ++{ ++ rcu_read_lock(); /* keeps vma alive till the end of up_read */ ++ up_read(&vma->vm_lock->lock); ++ rcu_read_unlock(); ++} ++ ++static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) ++{ ++ mmap_assert_write_locked(vma->vm_mm); ++ ++ /* ++ * current task is holding mmap_write_lock, both vma->vm_lock_seq and ++ * mm->mm_lock_seq can't be concurrently modified. ++ */ ++ *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq); ++ return (vma->vm_lock_seq == *mm_lock_seq); ++} ++ ++static inline void vma_start_write(struct vm_area_struct *vma) ++{ ++ int mm_lock_seq; ++ ++ if (__is_vma_write_locked(vma, &mm_lock_seq)) ++ return; ++ ++ down_write(&vma->vm_lock->lock); ++ vma->vm_lock_seq = mm_lock_seq; ++ up_write(&vma->vm_lock->lock); ++} ++ ++static inline bool vma_try_start_write(struct vm_area_struct *vma) ++{ ++ int mm_lock_seq; ++ ++ if (__is_vma_write_locked(vma, &mm_lock_seq)) ++ return true; ++ ++ if (!down_write_trylock(&vma->vm_lock->lock)) ++ return false; ++ ++ vma->vm_lock_seq = mm_lock_seq; ++ up_write(&vma->vm_lock->lock); ++ return true; ++} ++ ++static inline void vma_assert_write_locked(struct vm_area_struct *vma) ++{ ++ int mm_lock_seq; ++ ++ VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); ++} ++ ++static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) ++{ ++ /* When detaching vma should be write-locked */ ++ if (detached) ++ vma_assert_write_locked(vma); ++ vma->detached = detached; ++} ++ ++struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, ++ unsigned long address); ++ ++#else /* CONFIG_PER_VMA_LOCK */ ++ ++static inline void vma_init_lock(struct vm_area_struct *vma) {} ++static inline bool vma_start_read(struct vm_area_struct *vma) ++ { return false; } ++static inline void vma_end_read(struct vm_area_struct *vma) {} ++static inline void vma_start_write(struct vm_area_struct *vma) {} ++static inline bool vma_try_start_write(struct vm_area_struct *vma) ++ { return true; } ++static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} ++static inline void vma_mark_detached(struct vm_area_struct *vma, ++ bool detached) {} ++ ++#endif /* CONFIG_PER_VMA_LOCK */ ++ ++/* ++ * WARNING: vma_init does not initialize vma->vm_lock. ++ * Use vm_area_alloc()/vm_area_free() if vma needs locking. 
++ */ + static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) + { + static const struct vm_operations_struct dummy_vm_ops = {}; +@@ -631,6 +745,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) + vma->vm_mm = mm; + vma->vm_ops = &dummy_vm_ops; + INIT_LIST_HEAD(&vma->anon_vma_chain); ++ vma_mark_detached(vma, false); + } + + /* Use when VMA is not part of the VMA tree and needs no locking */ +@@ -644,28 +759,28 @@ static inline void vm_flags_init(struct vm_area_struct *vma, + static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + vm_flags_init(vma, flags); + } + + static inline void vm_flags_reset_once(struct vm_area_struct *vma, + vm_flags_t flags) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); + } + + static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + ACCESS_PRIVATE(vma, __vm_flags) |= flags; + } + + static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; + } + +@@ -686,7 +801,7 @@ static inline void __vm_flags_mod(struct vm_area_struct *vma, + static inline void vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + __vm_flags_mod(vma, set, clear); + } + +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index de1e622dd366..0e1d239a882c 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -557,6 +557,12 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, + /* The current status of the pte should be "cleared" before calling */ + WARN_ON_ONCE(!pte_none(*pte)); + ++ /* ++ * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole ++ * thing, because when zapping either it means it's dropping the ++ * page, or in TTU where the present pte will be quickly replaced ++ * with a swap pte. There's no way of leaking the bit. ++ */ + if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) + return; + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 22b2ac82bffd..ef74ea892c5b 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -471,6 +471,10 @@ struct anon_vma_name { + char name[]; + }; + ++struct vma_lock { ++ struct rw_semaphore lock; ++}; ++ + /* + * This struct describes a virtual memory area. There is one of these + * per VM-area/task. A VM area is any part of the process virtual memory +@@ -480,9 +484,16 @@ struct anon_vma_name { + struct vm_area_struct { + /* The first cache line has the info for VMA tree walking. */ + +- unsigned long vm_start; /* Our start address within vm_mm. */ +- unsigned long vm_end; /* The first byte after our end address +- within vm_mm. */ ++ union { ++ struct { ++ /* VMA covers [vm_start; vm_end) addresses within mm */ ++ unsigned long vm_start; ++ unsigned long vm_end; ++ }; ++#ifdef CONFIG_PER_VMA_LOCK ++ struct rcu_head vm_rcu; /* Used for deferred freeing. */ ++#endif ++ }; + + struct mm_struct *vm_mm; /* The address space we belong to. 
*/ + +@@ -501,6 +512,14 @@ struct vm_area_struct { + vm_flags_t __private __vm_flags; + }; + ++#ifdef CONFIG_PER_VMA_LOCK ++ int vm_lock_seq; ++ struct vma_lock *vm_lock; ++ ++ /* Flag to indicate areas detached from the mm->mm_mt tree */ ++ bool detached; ++#endif ++ + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap interval tree. +@@ -637,6 +656,9 @@ struct mm_struct { + * init_mm.mmlist, and are protected + * by mmlist_lock + */ ++#ifdef CONFIG_PER_VMA_LOCK ++ int mm_lock_seq; ++#endif + + + unsigned long hiwater_rss; /* High-watermark of RSS usage */ +@@ -1042,6 +1064,7 @@ typedef struct { + * mapped after the fault. + * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. + * We should only access orig_pte if this flag set. ++ * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock. + * + * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify + * whether we would allow page faults to retry by specifying these two +@@ -1079,6 +1102,7 @@ enum fault_flag { + FAULT_FLAG_INTERRUPTIBLE = 1 << 9, + FAULT_FLAG_UNSHARE = 1 << 10, + FAULT_FLAG_ORIG_PTE_VALID = 1 << 11, ++ FAULT_FLAG_VMA_LOCK = 1 << 12, + }; + + typedef unsigned int __bitwise zap_flags_t; +diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h +index 96e113e23d04..aab8f1b28d26 100644 +--- a/include/linux/mmap_lock.h ++++ b/include/linux/mmap_lock.h +@@ -60,6 +60,29 @@ static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) + + #endif /* CONFIG_TRACING */ + ++static inline void mmap_assert_locked(struct mm_struct *mm) ++{ ++ lockdep_assert_held(&mm->mmap_lock); ++ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); ++} ++ ++static inline void mmap_assert_write_locked(struct mm_struct *mm) ++{ ++ lockdep_assert_held_write(&mm->mmap_lock); ++ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); ++} ++ ++#ifdef CONFIG_PER_VMA_LOCK ++static inline void vma_end_write_all(struct mm_struct *mm) ++{ ++ mmap_assert_write_locked(mm); ++ /* No races during update due to exclusive mmap_lock being held */ ++ WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1); ++} ++#else ++static inline void vma_end_write_all(struct mm_struct *mm) {} ++#endif ++ + static inline void mmap_init_lock(struct mm_struct *mm) + { + init_rwsem(&mm->mmap_lock); +@@ -102,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm) + static inline void mmap_write_unlock(struct mm_struct *mm) + { + __mmap_lock_trace_released(mm, true); ++ vma_end_write_all(mm); + up_write(&mm->mmap_lock); + } + + static inline void mmap_write_downgrade(struct mm_struct *mm) + { + __mmap_lock_trace_acquire_returned(mm, false, true); ++ vma_end_write_all(mm); + downgrade_write(&mm->mmap_lock); + } + +@@ -150,18 +175,6 @@ static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) + up_read_non_owner(&mm->mmap_lock); + } + +-static inline void mmap_assert_locked(struct mm_struct *mm) +-{ +- lockdep_assert_held(&mm->mmap_lock); +- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); +-} +- +-static inline void mmap_assert_write_locked(struct mm_struct *mm) +-{ +- lockdep_assert_held_write(&mm->mmap_lock); +- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); +-} +- + static inline int mmap_lock_is_contended(struct mm_struct *mm) + { + return rwsem_is_contended(&mm->mmap_lock); +diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h +index 3767f18114ef..0cf8880219da 100644 +--- a/include/linux/userfaultfd_k.h ++++ 
b/include/linux/userfaultfd_k.h +@@ -179,6 +179,7 @@ extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf); + extern void userfaultfd_unmap_complete(struct mm_struct *mm, + struct list_head *uf); ++extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); + + #else /* CONFIG_USERFAULTFD */ + +@@ -274,8 +275,30 @@ static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) + return false; + } + ++static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) ++{ + return false; +} + - static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address) -@@ -903,6 +960,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { - .clear_flush_young = kvm_mmu_notifier_clear_flush_young, - .clear_young = kvm_mmu_notifier_clear_young, - .test_young = kvm_mmu_notifier_test_young, -+ .test_clear_young = kvm_mmu_notifier_test_clear_young, - .change_pte = kvm_mmu_notifier_change_pte, - .release = kvm_mmu_notifier_release, + #endif /* CONFIG_USERFAULTFD */ + ++static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) ++{ ++ /* Only wr-protect mode uses pte markers */ ++ if (!userfaultfd_wp(vma)) ++ return false; ++ ++ /* File-based uffd-wp always need markers */ ++ if (!vma_is_anonymous(vma)) ++ return true; ++ ++ /* ++ * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED ++ * enabled (to apply markers on zero pages). ++ */ ++ return userfaultfd_wp_unpopulated(vma); ++} ++ + static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) + { + #ifdef CONFIG_PTE_MARKER_UFFD_WP +diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h +index 7f5d1caf5890..8abfa1240040 100644 +--- a/include/linux/vm_event_item.h ++++ b/include/linux/vm_event_item.h +@@ -149,6 +149,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, + #ifdef CONFIG_X86 + DIRECT_MAP_LEVEL2_SPLIT, + DIRECT_MAP_LEVEL3_SPLIT, ++#endif ++#ifdef CONFIG_PER_VMA_LOCK_STATS ++ VMA_LOCK_SUCCESS, ++ VMA_LOCK_ABORT, ++ VMA_LOCK_RETRY, ++ VMA_LOCK_MISS, + #endif + NR_VM_EVENT_ITEMS }; --- -2.40.0.rc2 - -From c63e61e48ac0d492af1918ba84350e07a5c95d17 Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Mon, 13 Feb 2023 09:26:09 +0100 -Subject: [PATCH 12/16] objtool - -Signed-off-by: Peter Jung ---- - tools/objtool/.gitignore | 1 + - tools/objtool/Build | 2 - - tools/objtool/Documentation/objtool.txt | 8 +++ - tools/objtool/Makefile | 66 +++++++++++++++++-------- - tools/objtool/builtin-check.c | 2 +- - tools/objtool/check.c | 7 +++ - tools/objtool/elf.c | 42 ++++++++-------- - tools/objtool/include/objtool/builtin.h | 2 - - tools/objtool/include/objtool/elf.h | 9 ++-- - tools/objtool/include/objtool/special.h | 2 +- - tools/objtool/special.c | 6 +-- - 11 files changed, 93 insertions(+), 54 deletions(-) - -diff --git a/tools/objtool/.gitignore b/tools/objtool/.gitignore -index 14236db3677f..4faa4dd72f35 100644 ---- a/tools/objtool/.gitignore -+++ b/tools/objtool/.gitignore -@@ -2,3 +2,4 @@ - arch/x86/lib/inat-tables.c - /objtool - fixdep -+libsubcmd/ -diff --git a/tools/objtool/Build b/tools/objtool/Build -index 33f2ee5a46d3..a3cdf8af6635 100644 ---- a/tools/objtool/Build -+++ b/tools/objtool/Build -@@ -16,8 +16,6 @@ objtool-y += libctype.o - objtool-y += str_error_r.o - objtool-y += librbtree.o +diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h +index 19cf5b6892ce..fed855bae6d8 100644 +--- a/include/linux/vmstat.h ++++ b/include/linux/vmstat.h 
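A note for reviewers on the VMA_LOCK_SUCCESS/ABORT/RETRY/MISS vm_events and the count_vm_vma_lock_event() helper added in this series: they only exist when CONFIG_PER_VMA_LOCK_STATS is enabled (see the mm/Kconfig.debug hunk further down). ABORT and MISS are counted inside lock_vma_under_rcu() itself, while SUCCESS and RETRY are counted by the architecture fault handlers converted elsewhere in the series. The sketch below is a hypothetical helper, loosely modeled on the x86 do_user_addr_fault() changes; names and control flow are simplified and it is not a hunk from this patch.

    /*
     * Hypothetical consumer of lock_vma_under_rcu() -- simplified, for
     * illustration only. The real per-arch code also rechecks access
     * permissions before calling handle_mm_fault().
     */
    static bool try_vma_locked_fault(struct mm_struct *mm, unsigned long address,
                                     unsigned int flags, struct pt_regs *regs)
    {
            struct vm_area_struct *vma;
            vm_fault_t fault;

            vma = lock_vma_under_rcu(mm, address);
            if (!vma)
                    return false;           /* caller falls back to mmap_lock */

            fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
            vma_end_read(vma);

            if (fault & VM_FAULT_RETRY) {
                    /* the handler refused to run under the VMA lock */
                    count_vm_vma_lock_event(VMA_LOCK_RETRY);
                    return false;           /* retry under mmap_lock */
            }

            count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
            return true;
    }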
+@@ -125,6 +125,12 @@ static inline void vm_events_fold_cpu(int cpu) + #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) + #endif --CFLAGS += -I$(srctree)/tools/lib -- - $(OUTPUT)libstring.o: ../lib/string.c FORCE - $(call rule_mkdir) - $(call if_changed_dep,cc_o_c) -diff --git a/tools/objtool/Documentation/objtool.txt b/tools/objtool/Documentation/objtool.txt -index 8a671902a187..8e53fc6735ef 100644 ---- a/tools/objtool/Documentation/objtool.txt -+++ b/tools/objtool/Documentation/objtool.txt -@@ -410,6 +410,14 @@ the objtool maintainers. - can remove this warning by putting the ANNOTATE_INTRA_FUNCTION_CALL - directive right before the call. - -+12. file.o: warning: func(): not an indirect call target ++#ifdef CONFIG_PER_VMA_LOCK_STATS ++#define count_vm_vma_lock_event(x) count_vm_event(x) ++#else ++#define count_vm_vma_lock_event(x) do {} while (0) ++#endif + -+ This means that objtool is running with --ibt and a function expected -+ to be an indirect call target is not. In particular, this happens for -+ init_module() or cleanup_module() if a module relies on these special -+ names and does not use module_init() / module_exit() macros to create -+ them. + #define __count_zid_vm_events(item, zid, delta) \ + __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) + +diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h +index 005e5e306266..90c958952bfc 100644 +--- a/include/uapi/linux/userfaultfd.h ++++ b/include/uapi/linux/userfaultfd.h +@@ -38,7 +38,8 @@ + UFFD_FEATURE_MINOR_HUGETLBFS | \ + UFFD_FEATURE_MINOR_SHMEM | \ + UFFD_FEATURE_EXACT_ADDRESS | \ +- UFFD_FEATURE_WP_HUGETLBFS_SHMEM) ++ UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ ++ UFFD_FEATURE_WP_UNPOPULATED) + #define UFFD_API_IOCTLS \ + ((__u64)1 << _UFFDIO_REGISTER | \ + (__u64)1 << _UFFDIO_UNREGISTER | \ +@@ -203,6 +204,12 @@ struct uffdio_api { + * + * UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd + * write-protection mode is supported on both shmem and hugetlbfs. ++ * ++ * UFFD_FEATURE_WP_UNPOPULATED indicates that userfaultfd ++ * write-protection mode will always apply to unpopulated pages ++ * (i.e. empty ptes). This will be the default behavior for shmem ++ * & hugetlbfs, so this flag only affects anonymous memory behavior ++ * when userfault write-protection mode is registered. + */ + #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) + #define UFFD_FEATURE_EVENT_FORK (1<<1) +@@ -217,6 +224,7 @@ struct uffdio_api { + #define UFFD_FEATURE_MINOR_SHMEM (1<<10) + #define UFFD_FEATURE_EXACT_ADDRESS (1<<11) + #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) ++#define UFFD_FEATURE_WP_UNPOPULATED (1<<13) + __u64 features; + + __u64 ioctls; +diff --git a/kernel/fork.c b/kernel/fork.c +index 49c173e367d2..346ce90d1f33 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -455,13 +455,49 @@ static struct kmem_cache *vm_area_cachep; + /* SLAB cache for mm_struct structures (tsk->mm) */ + static struct kmem_cache *mm_cachep; + ++#ifdef CONFIG_PER_VMA_LOCK + - - If the error doesn't seem to make sense, it could be a bug in objtool. - Feel free to ask the objtool maintainer for help. 
-diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile -index a3a9cc24e0e3..83b100c1e7f6 100644 ---- a/tools/objtool/Makefile -+++ b/tools/objtool/Makefile -@@ -2,19 +2,18 @@ - include ../scripts/Makefile.include - include ../scripts/Makefile.arch - --# always use the host compiler --AR = $(HOSTAR) --CC = $(HOSTCC) --LD = $(HOSTLD) -- - ifeq ($(srctree),) - srctree := $(patsubst %/,%,$(dir $(CURDIR))) - srctree := $(patsubst %/,%,$(dir $(srctree))) - endif - --SUBCMD_SRCDIR = $(srctree)/tools/lib/subcmd/ --LIBSUBCMD_OUTPUT = $(or $(OUTPUT),$(CURDIR)/) --LIBSUBCMD = $(LIBSUBCMD_OUTPUT)libsubcmd.a -+LIBSUBCMD_DIR = $(srctree)/tools/lib/subcmd/ -+ifneq ($(OUTPUT),) -+ LIBSUBCMD_OUTPUT = $(abspath $(OUTPUT))/libsubcmd -+else -+ LIBSUBCMD_OUTPUT = $(CURDIR)/libsubcmd -+endif -+LIBSUBCMD = $(LIBSUBCMD_OUTPUT)/libsubcmd.a - - OBJTOOL := $(OUTPUT)objtool - OBJTOOL_IN := $(OBJTOOL)-in.o -@@ -28,16 +27,29 @@ INCLUDES := -I$(srctree)/tools/include \ - -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \ - -I$(srctree)/tools/arch/$(SRCARCH)/include \ - -I$(srctree)/tools/objtool/include \ -- -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include -+ -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include \ -+ -I$(LIBSUBCMD_OUTPUT)/include -+# Note, EXTRA_WARNINGS here was determined for CC and not HOSTCC, it -+# is passed here to match a legacy behavior. - WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed -Wno-nested-externs --CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS) --LDFLAGS += $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS) -+OBJTOOL_CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS) -+OBJTOOL_LDFLAGS := $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS) - - # Allow old libelf to be used: --elfshdr := $(shell echo '$(pound)include ' | $(CC) $(CFLAGS) -x c -E - | grep elf_getshdr) --CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED) -+elfshdr := $(shell echo '$(pound)include ' | $(HOSTCC) $(OBJTOOL_CFLAGS) -x c -E - | grep elf_getshdr) -+OBJTOOL_CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED) ++/* SLAB cache for vm_area_struct.lock */ ++static struct kmem_cache *vma_lock_cachep; + -+# Always want host compilation. 
-+HOST_OVERRIDES := CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" - - AWK = awk -+MKDIR = mkdir ++static bool vma_lock_alloc(struct vm_area_struct *vma) ++{ ++ vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); ++ if (!vma->vm_lock) ++ return false; + -+ifeq ($(V),1) -+ Q = -+else -+ Q = @ -+endif - - BUILD_ORC := n - -@@ -49,21 +61,33 @@ export BUILD_ORC - export srctree OUTPUT CFLAGS SRCARCH AWK - include $(srctree)/tools/build/Makefile.include - --$(OBJTOOL_IN): fixdep FORCE -- @$(CONFIG_SHELL) ./sync-check.sh -- @$(MAKE) $(build)=objtool -+$(OBJTOOL_IN): fixdep $(LIBSUBCMD) FORCE -+ $(Q)$(CONFIG_SHELL) ./sync-check.sh -+ $(Q)$(MAKE) $(build)=objtool $(HOST_OVERRIDES) CFLAGS="$(OBJTOOL_CFLAGS)" \ -+ LDFLAGS="$(OBJTOOL_LDFLAGS)" ++ init_rwsem(&vma->vm_lock->lock); ++ vma->vm_lock_seq = -1; + - - $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN) -- $(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@ -+ $(QUIET_LINK)$(HOSTCC) $(OBJTOOL_IN) $(OBJTOOL_LDFLAGS) -o $@ ++ return true; ++} + ++static inline void vma_lock_free(struct vm_area_struct *vma) ++{ ++ kmem_cache_free(vma_lock_cachep, vma->vm_lock); ++} + -+$(LIBSUBCMD_OUTPUT): -+ $(Q)$(MKDIR) -p $@ ++#else /* CONFIG_PER_VMA_LOCK */ ++ ++static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } ++static inline void vma_lock_free(struct vm_area_struct *vma) {} ++ ++#endif /* CONFIG_PER_VMA_LOCK */ ++ + struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) + { + struct vm_area_struct *vma; -+$(LIBSUBCMD): fixdep $(LIBSUBCMD_OUTPUT) FORCE -+ $(Q)$(MAKE) -C $(LIBSUBCMD_DIR) O=$(LIBSUBCMD_OUTPUT) \ -+ DESTDIR=$(LIBSUBCMD_OUTPUT) prefix= subdir= \ -+ $(HOST_OVERRIDES) EXTRA_CFLAGS="$(OBJTOOL_CFLAGS)" \ -+ $@ install_headers - --$(LIBSUBCMD): fixdep FORCE -- $(Q)$(MAKE) -C $(SUBCMD_SRCDIR) OUTPUT=$(LIBSUBCMD_OUTPUT) -+$(LIBSUBCMD)-clean: -+ $(call QUIET_CLEAN, libsubcmd) -+ $(Q)$(RM) -r -- $(LIBSUBCMD_OUTPUT) - --clean: -+clean: $(LIBSUBCMD)-clean - $(call QUIET_CLEAN, objtool) $(RM) $(OBJTOOL) - $(Q)find $(OUTPUT) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete -- $(Q)$(RM) $(OUTPUT)arch/x86/lib/inat-tables.c $(OUTPUT)fixdep $(LIBSUBCMD) -+ $(Q)$(RM) $(OUTPUT)arch/x86/lib/inat-tables.c $(OUTPUT)fixdep - - FORCE: - -diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c -index a4f39407bf59..7c175198d09f 100644 ---- a/tools/objtool/builtin-check.c -+++ b/tools/objtool/builtin-check.c -@@ -65,7 +65,7 @@ static int parse_hacks(const struct option *opt, const char *str, int unset) - return found ? 
0 : -1; + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); +- if (vma) +- vma_init(vma, mm); ++ if (!vma) ++ return NULL; ++ ++ vma_init(vma, mm); ++ if (!vma_lock_alloc(vma)) { ++ kmem_cache_free(vm_area_cachep, vma); ++ return NULL; ++ } ++ + return vma; } --const struct option check_options[] = { -+static const struct option check_options[] = { - OPT_GROUP("Actions:"), - OPT_CALLBACK_OPTARG('h', "hacks", NULL, NULL, "jump_label,noinstr,skylake", "patch toolchain bugs/limitations", parse_hacks), - OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"), -diff --git a/tools/objtool/check.c b/tools/objtool/check.c -index ea1e7cdeb1b3..384b7df3fbb2 100644 ---- a/tools/objtool/check.c -+++ b/tools/objtool/check.c -@@ -856,8 +856,15 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file) - list_for_each_entry(insn, &file->endbr_list, call_node) { +@@ -469,26 +505,54 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) + { + struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - int *site = (int *)sec->data->d_buf + idx; -+ struct symbol *sym = insn->sym; - *site = 0; - -+ if (opts.module && sym && sym->type == STT_FUNC && -+ insn->offset == sym->offset && -+ (!strcmp(sym->name, "init_module") || -+ !strcmp(sym->name, "cleanup_module"))) -+ WARN("%s(): not an indirect call target", sym->name); +- if (new) { +- ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); +- ASSERT_EXCLUSIVE_WRITER(orig->vm_file); +- /* +- * orig->shared.rb may be modified concurrently, but the clone +- * will be reinitialized. +- */ +- data_race(memcpy(new, orig, sizeof(*new))); +- INIT_LIST_HEAD(&new->anon_vma_chain); +- dup_anon_vma_name(orig, new); ++ if (!new) ++ return NULL; + - if (elf_add_reloc_to_insn(file->elf, sec, - idx * sizeof(int), - R_X86_64_PC32, -diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c -index 64443a7f4bbf..6806ce01d933 100644 ---- a/tools/objtool/elf.c -+++ b/tools/objtool/elf.c -@@ -284,13 +284,13 @@ static int read_sections(struct elf *elf) - !elf_alloc_hash(section_name, sections_nr)) - return -1; ++ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); ++ ASSERT_EXCLUSIVE_WRITER(orig->vm_file); ++ /* ++ * orig->shared.rb may be modified concurrently, but the clone ++ * will be reinitialized. ++ */ ++ data_race(memcpy(new, orig, sizeof(*new))); ++ if (!vma_lock_alloc(new)) { ++ kmem_cache_free(vm_area_cachep, new); ++ return NULL; + } ++ INIT_LIST_HEAD(&new->anon_vma_chain); ++ dup_anon_vma_name(orig, new); ++ + return new; + } -+ elf->section_data = calloc(sections_nr, sizeof(*sec)); -+ if (!elf->section_data) { -+ perror("calloc"); -+ return -1; -+ } - for (i = 0; i < sections_nr; i++) { -- sec = malloc(sizeof(*sec)); -- if (!sec) { -- perror("malloc"); -- return -1; -- } -- memset(sec, 0, sizeof(*sec)); -+ sec = &elf->section_data[i]; +-void vm_area_free(struct vm_area_struct *vma) ++void __vm_area_free(struct vm_area_struct *vma) + { + free_anon_vma_name(vma); ++ vma_lock_free(vma); + kmem_cache_free(vm_area_cachep, vma); + } - INIT_LIST_HEAD(&sec->symbol_list); - INIT_LIST_HEAD(&sec->reloc_list); -@@ -422,13 +422,13 @@ static int read_symbols(struct elf *elf) - !elf_alloc_hash(symbol_name, symbols_nr)) - return -1; ++#ifdef CONFIG_PER_VMA_LOCK ++static void vm_area_free_rcu_cb(struct rcu_head *head) ++{ ++ struct vm_area_struct *vma = container_of(head, struct vm_area_struct, ++ vm_rcu); ++ ++ /* The vma should not be locked while being destroyed. 
*/ ++ VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); ++ __vm_area_free(vma); ++} ++#endif ++ ++void vm_area_free(struct vm_area_struct *vma) ++{ ++#ifdef CONFIG_PER_VMA_LOCK ++ call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); ++#else ++ __vm_area_free(vma); ++#endif ++} ++ + static void account_kernel_stack(struct task_struct *tsk, int account) + { + if (IS_ENABLED(CONFIG_VMAP_STACK)) { +@@ -1132,6 +1196,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + seqcount_init(&mm->write_protect_seq); + mmap_init_lock(mm); + INIT_LIST_HEAD(&mm->mmlist); ++#ifdef CONFIG_PER_VMA_LOCK ++ mm->mm_lock_seq = 0; ++#endif + mm_pgtables_bytes_init(mm); + mm->map_count = 0; + mm->locked_vm = 0; +@@ -3073,6 +3140,9 @@ void __init proc_caches_init(void) + NULL); -+ elf->symbol_data = calloc(symbols_nr, sizeof(*sym)); -+ if (!elf->symbol_data) { -+ perror("calloc"); -+ return -1; -+ } - for (i = 0; i < symbols_nr; i++) { -- sym = malloc(sizeof(*sym)); -- if (!sym) { -- perror("malloc"); -- return -1; -- } -- memset(sym, 0, sizeof(*sym)); -+ sym = &elf->symbol_data[i]; + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); ++#ifdef CONFIG_PER_VMA_LOCK ++ vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); ++#endif + mmap_init(); + nsproxy_cache_init(); + } +diff --git a/mm/Kconfig b/mm/Kconfig +index cf2e47030fe8..459af2123189 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -1202,6 +1202,18 @@ config LRU_GEN_STATS + This option has a per-memcg and per-node memory overhead. + # } - sym->idx = i; ++config ARCH_SUPPORTS_PER_VMA_LOCK ++ def_bool n ++ ++config PER_VMA_LOCK ++ def_bool y ++ depends on ARCH_SUPPORTS_PER_VMA_LOCK && MMU && SMP ++ help ++ Allow per-vma locking during page fault handling. ++ ++ This feature allows locking each virtual memory area separately when ++ handling page faults instead of taking mmap_lock. ++ + source "mm/damon/Kconfig" -@@ -918,13 +918,13 @@ static int read_relocs(struct elf *elf) - sec->base->reloc = sec; + endmenu +diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug +index c3547a373c9c..4965a7333a3f 100644 +--- a/mm/Kconfig.debug ++++ b/mm/Kconfig.debug +@@ -279,3 +279,9 @@ config DEBUG_KMEMLEAK_AUTO_SCAN - nr_reloc = 0; -+ sec->reloc_data = calloc(sec->sh.sh_size / sec->sh.sh_entsize, sizeof(*reloc)); -+ if (!sec->reloc_data) { -+ perror("calloc"); -+ return -1; -+ } - for (i = 0; i < sec->sh.sh_size / sec->sh.sh_entsize; i++) { -- reloc = malloc(sizeof(*reloc)); -- if (!reloc) { -- perror("malloc"); -- return -1; -- } -- memset(reloc, 0, sizeof(*reloc)); -+ reloc = &sec->reloc_data[i]; - switch (sec->sh.sh_type) { - case SHT_REL: - if (read_rel_reloc(sec, i, reloc, &symndx)) -@@ -1453,16 +1453,16 @@ void elf_close(struct elf *elf) - list_for_each_entry_safe(sym, tmpsym, &sec->symbol_list, list) { - list_del(&sym->list); - hash_del(&sym->hash); -- free(sym); - } - list_for_each_entry_safe(reloc, tmpreloc, &sec->reloc_list, list) { - list_del(&reloc->list); - hash_del(&reloc->hash); -- free(reloc); - } - list_del(&sec->list); -- free(sec); -+ free(sec->reloc_data); + If unsure, say Y. + ++config PER_VMA_LOCK_STATS ++ bool "Statistics for per-vma locks" ++ depends on PER_VMA_LOCK ++ default y ++ help ++ Statistics for per-vma locks. 
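For readers following the kernel/fork.c and mm/Kconfig hunks above: the per-VMA vm_lock allocated here backs the read/write protocol defined in include/linux/mm.h earlier in this patch. In outline, writers already hold mmap_lock for write and mark a VMA locked by taking vma->vm_lock and storing the current mm->mm_lock_seq into vma->vm_lock_seq; vma_end_write_all() (called from mmap_write_unlock() and mmap_write_downgrade()) bumps mm_lock_seq, which unmarks every write-locked VMA at once. Page-fault readers only ever trylock and back out whenever the sequence numbers match. A condensed illustration of the two helpers, simplified and not the literal code in this series:

    static inline void vma_start_write(struct vm_area_struct *vma)
    {
            mmap_assert_write_locked(vma->vm_mm);

            /* Already write-locked since the last vma_end_write_all()? */
            if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
                    return;

            down_write(&vma->vm_lock->lock);
            vma->vm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
            up_write(&vma->vm_lock->lock);
    }

    static inline bool vma_start_read(struct vm_area_struct *vma)
    {
            /* A matching sequence number means a writer owns this VMA. */
            if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
                    return false;

            if (!down_read_trylock(&vma->vm_lock->lock))
                    return false;

            /* Recheck: a writer may have marked it while we were acquiring. */
            if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) {
                    up_read(&vma->vm_lock->lock);
                    return false;
            }
            return true;
    }

This is also why vm_area_free() above defers the actual freeing through call_rcu(): lock_vma_under_rcu() walks the maple tree under rcu_read_lock() only, so a VMA that has just been unlinked must stay valid until a grace period has passed.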
+diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index 245038a9fe4e..4d860b53a14a 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -6004,6 +6004,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, + int need_wait_lock = 0; + unsigned long haddr = address & huge_page_mask(h); + ++ /* TODO: Handle faults under the VMA lock */ ++ if (flags & FAULT_FLAG_VMA_LOCK) ++ return VM_FAULT_RETRY; ++ + /* + * Serialize hugepage allocation and instantiation, so that we don't + * get spurious allocation failures if two CPUs race to instantiate +diff --git a/mm/init-mm.c b/mm/init-mm.c +index c9327abb771c..33269314e060 100644 +--- a/mm/init-mm.c ++++ b/mm/init-mm.c +@@ -37,6 +37,9 @@ struct mm_struct init_mm = { + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), ++#ifdef CONFIG_PER_VMA_LOCK ++ .mm_lock_seq = 0, ++#endif + .user_ns = &init_user_ns, + .cpu_bitmap = CPU_BITS_NONE, + #ifdef CONFIG_IOMMU_SVA +diff --git a/mm/internal.h b/mm/internal.h +index 7920a8b7982e..0c455d6e4e3e 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -105,7 +105,7 @@ void folio_activate(struct folio *folio); + + void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *start_vma, unsigned long floor, +- unsigned long ceiling); ++ unsigned long ceiling, bool mm_wr_locked); + void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); + + struct zap_details; +diff --git a/mm/khugepaged.c b/mm/khugepaged.c +index 92e6f56a932d..042007f0bfa1 100644 +--- a/mm/khugepaged.c ++++ b/mm/khugepaged.c +@@ -1049,6 +1049,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, + if (result != SCAN_SUCCEED) + goto out_up_write; + ++ vma_start_write(vma); + anon_vma_lock_write(vma->anon_vma); + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, +@@ -1172,7 +1173,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, + * enabled swap entries. Please see + * comment below for pte_uffd_wp(). 
+ */ +- if (pte_swp_uffd_wp(pteval)) { ++ if (pte_swp_uffd_wp_any(pteval)) { + result = SCAN_PTE_UFFD_WP; + goto out_unmap; + } +@@ -1512,6 +1513,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + goto drop_hpage; } -+ free(elf->symbol_data); -+ free(elf->section_data); - free(elf); - } -diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h -index fa45044e3863..2a108e648b7a 100644 ---- a/tools/objtool/include/objtool/builtin.h -+++ b/tools/objtool/include/objtool/builtin.h -@@ -7,8 +7,6 @@ - - #include - --extern const struct option check_options[]; -- - struct opts { - /* actions: */ - bool dump_orc; -diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h -index bb60fd42b46f..ad0024da262b 100644 ---- a/tools/objtool/include/objtool/elf.h -+++ b/tools/objtool/include/objtool/elf.h -@@ -39,6 +39,7 @@ struct section { - char *name; - int idx; - bool changed, text, rodata, noinstr, init, truncate; -+ struct reloc *reloc_data; - }; - - struct symbol { -@@ -49,12 +50,11 @@ struct symbol { - GElf_Sym sym; - struct section *sec; - char *name; -- unsigned int idx; -- unsigned char bind, type; -+ unsigned int idx, len; - unsigned long offset; -- unsigned int len; - unsigned long __subtree_last; - struct symbol *pfunc, *cfunc, *alias; -+ unsigned char bind, type; - u8 uaccess_safe : 1; - u8 static_call_tramp : 1; - u8 retpoline_thunk : 1; -@@ -104,6 +104,9 @@ struct elf { - struct hlist_head *section_hash; - struct hlist_head *section_name_hash; - struct hlist_head *reloc_hash; ++ /* Lock the vma before taking i_mmap and page table locks */ ++ vma_start_write(vma); + -+ struct section *section_data; -+ struct symbol *symbol_data; - }; + /* + * We need to lock the mapping so that from here on, only GUP-fast and + * hardware page walks can access the parts of the page tables that +@@ -1689,6 +1693,10 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, + result = SCAN_PTE_MAPPED_HUGEPAGE; + if ((cc->is_khugepaged || is_target) && + mmap_write_trylock(mm)) { ++ /* trylock for the same lock inversion as above */ ++ if (!vma_try_start_write(vma)) ++ goto unlock_next; ++ + /* + * Re-check whether we have an ->anon_vma, because + * collapse_and_free_pmd() requires that either no +diff --git a/mm/memory.c b/mm/memory.c +index 01a23ad48a04..e7ffdadb684d 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -104,6 +104,20 @@ EXPORT_SYMBOL(mem_map); + #endif - #define OFFSET_STRIDE_BITS 4 -diff --git a/tools/objtool/include/objtool/special.h b/tools/objtool/include/objtool/special.h -index dc4721e19002..86d4af9c5aa9 100644 ---- a/tools/objtool/include/objtool/special.h -+++ b/tools/objtool/include/objtool/special.h -@@ -19,6 +19,7 @@ struct special_alt { - bool skip_orig; - bool skip_alt; - bool jump_or_nop; -+ u8 key_addend; + static vm_fault_t do_fault(struct vm_fault *vmf); ++static vm_fault_t do_anonymous_page(struct vm_fault *vmf); ++static bool vmf_pte_changed(struct vm_fault *vmf); ++ ++/* ++ * Return true if the original pte was a uffd-wp pte marker (so the pte was ++ * wr-protected). 
++ */ ++static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) ++{ ++ if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) ++ return false; ++ ++ return pte_marker_uffd_wp(vmf->orig_pte); ++} - struct section *orig_sec; - unsigned long orig_off; -@@ -27,7 +28,6 @@ struct special_alt { - unsigned long new_off; + /* + * A number of key systems in x86 including ioremap() rely on the assumption +@@ -348,7 +362,7 @@ void free_pgd_range(struct mmu_gather *tlb, - unsigned int orig_len, new_len; /* group only */ -- u8 key_addend; - }; + void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *vma, unsigned long floor, +- unsigned long ceiling) ++ unsigned long ceiling, bool mm_wr_locked) + { + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); - int special_get_alts(struct elf *elf, struct list_head *alts); -diff --git a/tools/objtool/special.c b/tools/objtool/special.c -index 9c8d827f69af..baa85c31526b 100644 ---- a/tools/objtool/special.c -+++ b/tools/objtool/special.c -@@ -26,7 +26,7 @@ struct special_entry { - unsigned char key; /* jump_label key */ - }; +@@ -366,6 +380,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + * Hide vma from rmap and truncate_pagecache before freeing + * pgtables + */ ++ if (mm_wr_locked) ++ vma_start_write(vma); + unlink_anon_vmas(vma); + unlink_file_vma(vma); --struct special_entry entries[] = { -+static const struct special_entry entries[] = { - { - .sec = ".altinstructions", - .group = true, -@@ -65,7 +65,7 @@ static void reloc_to_sec_off(struct reloc *reloc, struct section **sec, - *off = reloc->sym->offset + reloc->addend; +@@ -380,6 +396,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + && !is_vm_hugetlb_page(next)) { + vma = next; + next = mas_find(&mas, ceiling - 1); ++ if (mm_wr_locked) ++ vma_start_write(vma); + unlink_anon_vmas(vma); + unlink_file_vma(vma); + } +@@ -1345,6 +1363,10 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte, + struct zap_details *details, pte_t pteval) + { ++ /* Zap on anonymous always means dropping everything */ ++ if (vma_is_anonymous(vma)) ++ return; ++ + if (zap_drop_file_uffd_wp(details)) + return; + +@@ -1451,8 +1473,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, + continue; + rss[mm_counter(page)]--; + } else if (pte_marker_entry_uffd_wp(entry)) { +- /* Only drop the uffd-wp marker if explicitly requested */ +- if (!zap_drop_file_uffd_wp(details)) ++ /* ++ * For anon: always drop the marker; for file: only ++ * drop the marker if explicitly requested. 
++ */ ++ if (!vma_is_anonymous(vma) && ++ !zap_drop_file_uffd_wp(details)) + continue; + } else if (is_hwpoison_entry(entry) || + is_swapin_error_entry(entry)) { +@@ -3322,6 +3348,9 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) + struct vm_area_struct *vma = vmf->vma; + struct folio *folio = NULL; + ++ if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vma)) ++ return VM_FAULT_RETRY; ++ + if (likely(!unshare)) { + if (userfaultfd_pte_wp(vma, *vmf->pte)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); +@@ -3633,6 +3662,14 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) + return 0; } --static int get_alt_entry(struct elf *elf, struct special_entry *entry, -+static int get_alt_entry(struct elf *elf, const struct special_entry *entry, - struct section *sec, int idx, - struct special_alt *alt) - { -@@ -139,7 +139,7 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, ++static vm_fault_t do_pte_missing(struct vm_fault *vmf) ++{ ++ if (vma_is_anonymous(vmf->vma)) ++ return do_anonymous_page(vmf); ++ else ++ return do_fault(vmf); ++} ++ + /* + * This is actually a page-missing access, but with uffd-wp special pte + * installed. It means this pte was wr-protected before being unmapped. +@@ -3643,11 +3680,10 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) + * Just in case there're leftover special ptes even after the region + * got unregistered - we can simply clear them. + */ +- if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma))) ++ if (unlikely(!userfaultfd_wp(vmf->vma))) + return pte_marker_clear(vmf); + +- /* do_fault() can handle pte markers too like none pte */ +- return do_fault(vmf); ++ return do_pte_missing(vmf); + } + + static vm_fault_t handle_pte_marker(struct vm_fault *vmf) +@@ -3698,6 +3734,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) + if (!pte_unmap_same(vmf)) + goto out; + ++ if (vmf->flags & FAULT_FLAG_VMA_LOCK) { ++ ret = VM_FAULT_RETRY; ++ goto out; ++ } ++ + entry = pte_to_swp_entry(vmf->orig_pte); + if (unlikely(non_swap_entry(entry))) { + if (is_migration_entry(entry)) { +@@ -4012,6 +4053,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ - int special_get_alts(struct elf *elf, struct list_head *alts) + static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { -- struct special_entry *entry; -+ const struct special_entry *entry; - struct section *sec; - unsigned int nr_entries; - struct special_alt *alt; ++ bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); + struct vm_area_struct *vma = vmf->vma; + struct folio *folio; + vm_fault_t ret = 0; +@@ -4045,7 +4087,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) + vma->vm_page_prot)); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); +- if (!pte_none(*vmf->pte)) { ++ if (vmf_pte_changed(vmf)) { + update_mmu_tlb(vma, vmf->address, vmf->pte); + goto unlock; + } +@@ -4085,7 +4127,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) + + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); +- if (!pte_none(*vmf->pte)) { ++ if (vmf_pte_changed(vmf)) { + update_mmu_tlb(vma, vmf->address, vmf->pte); + goto release; + } +@@ -4105,6 +4147,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) + folio_add_new_anon_rmap(folio, vma, vmf->address); + folio_add_lru_vma(folio, vma); + setpte: ++ if (uffd_wp) ++ entry = pte_mkuffd_wp(entry); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); + + /* No need to invalidate - it was non-present before */ +@@ -4272,7 
+4316,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) + void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) + { + struct vm_area_struct *vma = vmf->vma; +- bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte); ++ bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); + bool write = vmf->flags & FAULT_FLAG_WRITE; + bool prefault = vmf->address != addr; + pte_t entry; +@@ -4503,6 +4547,8 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) + return ret; + } + ++ if (vmf->flags & FAULT_FLAG_VMA_LOCK) ++ return VM_FAULT_RETRY; + ret = __do_fault(vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; +@@ -4519,6 +4565,9 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) + struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret; + ++ if (vmf->flags & FAULT_FLAG_VMA_LOCK) ++ return VM_FAULT_RETRY; ++ + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + +@@ -4558,6 +4607,9 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) + struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret, tmp; + ++ if (vmf->flags & FAULT_FLAG_VMA_LOCK) ++ return VM_FAULT_RETRY; ++ + ret = __do_fault(vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; +@@ -4916,12 +4968,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) + } + } + +- if (!vmf->pte) { +- if (vma_is_anonymous(vmf->vma)) +- return do_anonymous_page(vmf); +- else +- return do_fault(vmf); +- } ++ if (!vmf->pte) ++ return do_pte_missing(vmf); + + if (!pte_present(vmf->orig_pte)) + return do_swap_page(vmf); +@@ -4929,6 +4977,9 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) + if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) + return do_numa_page(vmf); + ++ if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vmf->vma)) ++ return VM_FAULT_RETRY; ++ + vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + entry = vmf->orig_pte; +@@ -4965,10 +5016,10 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) + } + + /* +- * By the time we get here, we already hold the mm semaphore +- * +- * The mmap_lock may have been released depending on flags and our +- * return value. See filemap_fault() and __folio_lock_or_retry(). ++ * On entry, we hold either the VMA lock or the mmap_lock ++ * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in ++ * the result, the mmap_lock is not held on exit. See filemap_fault() ++ * and __folio_lock_or_retry(). + */ + static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +@@ -5230,6 +5281,63 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, + } + EXPORT_SYMBOL_GPL(handle_mm_fault); + ++#ifdef CONFIG_PER_VMA_LOCK ++/* ++ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be ++ * stable and not isolated. If the VMA is not found or is being modified the ++ * function returns NULL. 
++ */ ++struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, ++ unsigned long address) ++{ ++ MA_STATE(mas, &mm->mm_mt, address, address); ++ struct vm_area_struct *vma; ++ ++ rcu_read_lock(); ++retry: ++ vma = mas_walk(&mas); ++ if (!vma) ++ goto inval; ++ ++ /* find_mergeable_anon_vma uses adjacent vmas which are not locked */ ++ if (vma_is_anonymous(vma) && !vma->anon_vma) ++ goto inval; ++ ++ if (!vma_start_read(vma)) ++ goto inval; ++ ++ /* ++ * Due to the possibility of userfault handler dropping mmap_lock, avoid ++ * it for now and fall back to page fault handling under mmap_lock. ++ */ ++ if (userfaultfd_armed(vma)) { ++ vma_end_read(vma); ++ goto inval; ++ } ++ ++ /* Check since vm_start/vm_end might change before we lock the VMA */ ++ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { ++ vma_end_read(vma); ++ goto inval; ++ } ++ ++ /* Check if the VMA got isolated after we found it */ ++ if (vma->detached) { ++ vma_end_read(vma); ++ count_vm_vma_lock_event(VMA_LOCK_MISS); ++ /* The area was replaced with another one */ ++ goto retry; ++ } ++ ++ rcu_read_unlock(); ++ return vma; ++inval: ++ rcu_read_unlock(); ++ count_vm_vma_lock_event(VMA_LOCK_ABORT); ++ return NULL; ++} ++#endif /* CONFIG_PER_VMA_LOCK */ ++ + #ifndef __PAGETABLE_P4D_FOLDED + /* + * Allocate p4d page table. +diff --git a/mm/mmap.c b/mm/mmap.c +index ff68a67a2a7c..a2bc2d9432b8 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -133,7 +133,7 @@ void unlink_file_vma(struct vm_area_struct *vma) + /* + * Close a vm structure and free it. + */ +-static void remove_vma(struct vm_area_struct *vma) ++static void remove_vma(struct vm_area_struct *vma, bool unreachable) + { + might_sleep(); + if (vma->vm_ops && vma->vm_ops->close) +@@ -141,7 +141,10 @@ static void remove_vma(struct vm_area_struct *vma) + if (vma->vm_file) + fput(vma->vm_file); + mpol_put(vma_policy(vma)); +- vm_area_free(vma); ++ if (unreachable) ++ __vm_area_free(vma); ++ else ++ vm_area_free(vma); + } + + static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, +@@ -502,6 +505,15 @@ static inline void init_vma_prep(struct vma_prepare *vp, + */ + static inline void vma_prepare(struct vma_prepare *vp) + { ++ vma_start_write(vp->vma); ++ if (vp->adj_next) ++ vma_start_write(vp->adj_next); ++ /* vp->insert is always a newly created VMA, no need for locking */ ++ if (vp->remove) ++ vma_start_write(vp->remove); ++ if (vp->remove2) ++ vma_start_write(vp->remove2); ++ + if (vp->file) { + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); + +@@ -590,6 +602,7 @@ static inline void vma_complete(struct vma_prepare *vp, + + if (vp->remove) { + again: ++ vma_mark_detached(vp->remove, true); + if (vp->file) { + uprobe_munmap(vp->remove, vp->remove->vm_start, + vp->remove->vm_end); +@@ -683,12 +696,12 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, + if (vma_iter_prealloc(vmi)) + goto nomem; + ++ vma_prepare(&vp); + vma_adjust_trans_huge(vma, start, end, 0); + /* VMA iterator points to previous, so set to start if necessary */ + if (vma_iter_addr(vmi) != start) + vma_iter_set(vmi, start); + +- vma_prepare(&vp); + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; +@@ -723,8 +736,8 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, + return -ENOMEM; + + init_vma_prep(&vp, vma); +- vma_adjust_trans_huge(vma, start, end, 0); + vma_prepare(&vp); ++ vma_adjust_trans_huge(vma, start, end, 0); + + if (vma->vm_start < start) + vma_iter_clear(vmi, vma->vm_start, 
start); +@@ -994,12 +1007,12 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, + if (vma_iter_prealloc(vmi)) + return NULL; + +- vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next); + init_multi_vma_prep(&vp, vma, adjust, remove, remove2); + VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && + vp.anon_vma != adjust->anon_vma); + + vma_prepare(&vp); ++ vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next); + if (vma_start < vma->vm_start || vma_end > vma->vm_end) + vma_expanded = true; + +@@ -2119,7 +2132,7 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) + if (vma->vm_flags & VM_ACCOUNT) + nr_accounted += nrpages; + vm_stat_account(mm, vma->vm_flags, -nrpages); +- remove_vma(vma); ++ remove_vma(vma, false); + } + vm_unacct_memory(nr_accounted); + validate_mm(mm); +@@ -2142,7 +2155,8 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, + update_hiwater_rss(mm); + unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked); + free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, +- next ? next->vm_start : USER_PGTABLES_CEILING); ++ next ? next->vm_start : USER_PGTABLES_CEILING, ++ mm_wr_locked); + tlb_finish_mmu(&tlb); + } + +@@ -2198,10 +2212,10 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + +- vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); + init_vma_prep(&vp, vma); + vp.insert = new; + vma_prepare(&vp); ++ vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); + + if (new_below) { + vma->vm_start = addr; +@@ -2245,10 +2259,12 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + static inline int munmap_sidetree(struct vm_area_struct *vma, + struct ma_state *mas_detach) + { ++ vma_start_write(vma); + mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1); + if (mas_store_gfp(mas_detach, vma, GFP_KERNEL)) + return -ENOMEM; + ++ vma_mark_detached(vma, true); + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm -= vma_pages(vma); + +@@ -2904,9 +2920,9 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, + if (vma_iter_prealloc(vmi)) + goto unacct_fail; + +- vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); + init_vma_prep(&vp, vma); + vma_prepare(&vp); ++ vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); + vma->vm_end = addr + len; + vm_flags_set(vma, VM_SOFTDIRTY); + vma_iter_store(vmi, vma); +@@ -3039,7 +3055,7 @@ void exit_mmap(struct mm_struct *mm) + mmap_write_lock(mm); + mt_clear_in_rcu(&mm->mm_mt); + free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, +- USER_PGTABLES_CEILING); ++ USER_PGTABLES_CEILING, true); + tlb_finish_mmu(&tlb); + + /* +@@ -3050,7 +3066,7 @@ void exit_mmap(struct mm_struct *mm) + do { + if (vma->vm_flags & VM_ACCOUNT) + nr_accounted += vma_pages(vma); +- remove_vma(vma); ++ remove_vma(vma, true); + count++; + cond_resched(); + } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); +@@ -3173,6 +3189,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); ++ vma_start_write(new_vma); + if (vma_link(mm, new_vma)) + goto out_vma_link; + *need_rmap_locks = false; +@@ -3467,6 +3484,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) + * of mm/rmap.c: + * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for + * hugetlb mapping); ++ 
* - all vmas marked locked + * - all i_mmap_rwsem locks; + * - all anon_vma->rwseml + * +@@ -3489,6 +3507,13 @@ int mm_take_all_locks(struct mm_struct *mm) + + mutex_lock(&mm_all_locks_mutex); + ++ mas_for_each(&mas, vma, ULONG_MAX) { ++ if (signal_pending(current)) ++ goto out_unlock; ++ vma_start_write(vma); ++ } ++ ++ mas_set(&mas, 0); + mas_for_each(&mas, vma, ULONG_MAX) { + if (signal_pending(current)) + goto out_unlock; +@@ -3578,6 +3603,7 @@ void mm_drop_all_locks(struct mm_struct *mm) + if (vma->vm_file && vma->vm_file->f_mapping) + vm_unlock_mapping(vma->vm_file->f_mapping); + } ++ vma_end_write_all(mm); + + mutex_unlock(&mm_all_locks_mutex); + } +diff --git a/mm/mprotect.c b/mm/mprotect.c +index 13e84d8c0797..b9da9a5f87fe 100644 +--- a/mm/mprotect.c ++++ b/mm/mprotect.c +@@ -276,7 +276,15 @@ static long change_pte_range(struct mmu_gather *tlb, + } else { + /* It must be an none page, or what else?.. */ + WARN_ON_ONCE(!pte_none(oldpte)); +- if (unlikely(uffd_wp && !vma_is_anonymous(vma))) { ++ ++ /* ++ * Nobody plays with any none ptes besides ++ * userfaultfd when applying the protections. ++ */ ++ if (likely(!uffd_wp)) ++ continue; ++ ++ if (userfaultfd_wp_use_markers(vma)) { + /* + * For file-backed mem, we need to be able to + * wr-protect a none pte, because even if the +@@ -320,23 +328,46 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) + return 0; + } + +-/* Return true if we're uffd wr-protecting file-backed memory, or false */ ++/* ++ * Return true if we want to split THPs into PTE mappings in change ++ * protection procedure, false otherwise. ++ */ + static inline bool +-uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) ++pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags) + { ++ /* ++ * pte markers only resides in pte level, if we need pte markers, ++ * we need to split. We cannot wr-protect shmem thp because file ++ * thp is handled differently when split by erasing the pmd so far. ++ */ + return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma); + } + + /* +- * If wr-protecting the range for file-backed, populate pgtable for the case +- * when pgtable is empty but page cache exists. When {pte|pmd|...}_alloc() +- * failed we treat it the same way as pgtable allocation failures during +- * page faults by kicking OOM and returning error. ++ * Return true if we want to populate pgtables in change protection ++ * procedure, false otherwise ++ */ ++static inline bool ++pgtable_populate_needed(struct vm_area_struct *vma, unsigned long cp_flags) ++{ ++ /* If not within ioctl(UFFDIO_WRITEPROTECT), then don't bother */ ++ if (!(cp_flags & MM_CP_UFFD_WP)) ++ return false; ++ ++ /* Populate if the userfaultfd mode requires pte markers */ ++ return userfaultfd_wp_use_markers(vma); ++} ++ ++/* ++ * Populate the pgtable underneath for whatever reason if requested. ++ * When {pte|pmd|...}_alloc() failed we treat it the same way as pgtable ++ * allocation failures during page faults by kicking OOM and returning ++ * error. 
+ */ + #define change_pmd_prepare(vma, pmd, cp_flags) \ + ({ \ + long err = 0; \ +- if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ ++ if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \ + if (pte_alloc(vma->vm_mm, pmd)) \ + err = -ENOMEM; \ + } \ +@@ -351,7 +382,7 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) + #define change_prepare(vma, high, low, addr, cp_flags) \ + ({ \ + long err = 0; \ +- if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ ++ if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \ + low##_t *p = low##_alloc(vma->vm_mm, high, addr); \ + if (p == NULL) \ + err = -ENOMEM; \ +@@ -404,7 +435,7 @@ static inline long change_pmd_range(struct mmu_gather *tlb, + + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if ((next - addr != HPAGE_PMD_SIZE) || +- uffd_wp_protect_file(vma, cp_flags)) { ++ pgtable_split_needed(vma, cp_flags)) { + __split_huge_pmd(vma, pmd, addr, false, NULL); + /* + * For file-backed, the pmd could have been +diff --git a/mm/mremap.c b/mm/mremap.c +index 411a85682b58..dd541e59edda 100644 +--- a/mm/mremap.c ++++ b/mm/mremap.c +@@ -623,6 +623,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, + return -ENOMEM; + } + ++ vma_start_write(vma); + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); + new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, + &need_rmap_locks); +diff --git a/mm/rmap.c b/mm/rmap.c +index 8632e02661ac..cfdaa56cad3e 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -25,21 +25,22 @@ + * mapping->invalidate_lock (in filemap_fault) + * page->flags PG_locked (lock_page) + * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) +- * mapping->i_mmap_rwsem +- * anon_vma->rwsem +- * mm->page_table_lock or pte_lock +- * swap_lock (in swap_duplicate, swap_info_get) +- * mmlist_lock (in mmput, drain_mmlist and others) +- * mapping->private_lock (in block_dirty_folio) +- * folio_lock_memcg move_lock (in block_dirty_folio) +- * i_pages lock (widely used) +- * lruvec->lru_lock (in folio_lruvec_lock_irq) +- * inode->i_lock (in set_page_dirty's __mark_inode_dirty) +- * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) +- * sb_lock (within inode_lock in fs/fs-writeback.c) +- * i_pages lock (widely used, in set_page_dirty, +- * in arch-dependent flush_dcache_mmap_lock, +- * within bdi.wb->list_lock in __sync_single_inode) ++ * vma_start_write ++ * mapping->i_mmap_rwsem ++ * anon_vma->rwsem ++ * mm->page_table_lock or pte_lock ++ * swap_lock (in swap_duplicate, swap_info_get) ++ * mmlist_lock (in mmput, drain_mmlist and others) ++ * mapping->private_lock (in block_dirty_folio) ++ * folio_lock_memcg move_lock (in block_dirty_folio) ++ * i_pages lock (widely used) ++ * lruvec->lru_lock (in folio_lruvec_lock_irq) ++ * inode->i_lock (in set_page_dirty's __mark_inode_dirty) ++ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) ++ * sb_lock (within inode_lock in fs/fs-writeback.c) ++ * i_pages lock (widely used, in set_page_dirty, ++ * in arch-dependent flush_dcache_mmap_lock, ++ * within bdi.wb->list_lock in __sync_single_inode) + * + * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) + * ->tasklist_lock +diff --git a/mm/vmstat.c b/mm/vmstat.c +index 1ea6a5ce1c41..4f1089a1860e 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -1399,6 +1399,12 @@ const char * const vmstat_text[] = { + "direct_map_level2_splits", + "direct_map_level3_splits", + #endif ++#ifdef CONFIG_PER_VMA_LOCK_STATS ++ 
"vma_lock_success", ++ "vma_lock_abort", ++ "vma_lock_retry", ++ "vma_lock_miss", ++#endif + #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ + }; + #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ +diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c +index 7f22844ed704..e030d63c031a 100644 +--- a/tools/testing/selftests/mm/userfaultfd.c ++++ b/tools/testing/selftests/mm/userfaultfd.c +@@ -1444,6 +1444,43 @@ static int pagemap_test_fork(bool present) + return result; + } + ++static void userfaultfd_wp_unpopulated_test(int pagemap_fd) ++{ ++ uint64_t value; ++ ++ /* Test applying pte marker to anon unpopulated */ ++ wp_range(uffd, (uint64_t)area_dst, page_size, true); ++ value = pagemap_read_vaddr(pagemap_fd, area_dst); ++ pagemap_check_wp(value, true); ++ ++ /* Test unprotect on anon pte marker */ ++ wp_range(uffd, (uint64_t)area_dst, page_size, false); ++ value = pagemap_read_vaddr(pagemap_fd, area_dst); ++ pagemap_check_wp(value, false); ++ ++ /* Test zap on anon marker */ ++ wp_range(uffd, (uint64_t)area_dst, page_size, true); ++ if (madvise(area_dst, page_size, MADV_DONTNEED)) ++ err("madvise(MADV_DONTNEED) failed"); ++ value = pagemap_read_vaddr(pagemap_fd, area_dst); ++ pagemap_check_wp(value, false); ++ ++ /* Test fault in after marker removed */ ++ *area_dst = 1; ++ value = pagemap_read_vaddr(pagemap_fd, area_dst); ++ pagemap_check_wp(value, false); ++ /* Drop it to make pte none again */ ++ if (madvise(area_dst, page_size, MADV_DONTNEED)) ++ err("madvise(MADV_DONTNEED) failed"); ++ ++ /* Test read-zero-page upon pte marker */ ++ wp_range(uffd, (uint64_t)area_dst, page_size, true); ++ *(volatile char *)area_dst; ++ /* Drop it to make pte none again */ ++ if (madvise(area_dst, page_size, MADV_DONTNEED)) ++ err("madvise(MADV_DONTNEED) failed"); ++} ++ + static void userfaultfd_pagemap_test(unsigned int test_pgsize) + { + struct uffdio_register uffdio_register; +@@ -1462,7 +1499,7 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize) + /* Flush so it doesn't flush twice in parent/child later */ + fflush(stdout); + +- uffd_test_ctx_init(0); ++ uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED); + + if (test_pgsize > page_size) { + /* This is a thp test */ +@@ -1482,6 +1519,10 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize) + + pagemap_fd = pagemap_open(); + ++ /* Smoke test WP_UNPOPULATED first when it's still empty */ ++ if (test_pgsize == page_size) ++ userfaultfd_wp_unpopulated_test(pagemap_fd); ++ + /* Touch the page */ + *area_dst = 1; + wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); +@@ -1526,7 +1567,7 @@ static int userfaultfd_stress(void) + struct uffdio_register uffdio_register; + struct uffd_stats uffd_stats[nr_cpus]; + +- uffd_test_ctx_init(0); ++ uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED); + + if (posix_memalign(&area, page_size, page_size)) + err("out of memory"); -- -2.40.0.rc2 +2.40.0 -From 56bbff019101b84507c1e796512b1be6840c6eda Mon Sep 17 00:00:00 2001 +From d0f327c32c39cafbdeefac5fd65a0087a603e76f Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 3 Mar 2023 17:02:07 +0100 -Subject: [PATCH 13/16] sched +Date: Sun, 9 Apr 2023 21:25:55 +0200 +Subject: [PATCH 09/10] sched Signed-off-by: Peter Jung --- - arch/x86/kernel/itmt.c | 23 +- - arch/x86/kernel/smpboot.c | 2 +- - include/linux/sched.h | 3 + - include/linux/sched/sd_flags.h | 5 +- - kernel/sched/core.c | 77 ++-- - kernel/sched/cpufreq_schedutil.c | 43 +-- - kernel/sched/deadline.c | 42 ++- - 
kernel/sched/debug.c | 1 + - kernel/sched/fair.c | 591 ++++++++++++++++++++----------- - kernel/sched/features.h | 1 + - kernel/sched/pelt.c | 60 ++++ - kernel/sched/pelt.h | 42 ++- - kernel/sched/sched.h | 28 +- - 13 files changed, 601 insertions(+), 317 deletions(-) + arch/x86/kernel/itmt.c | 23 +-- + arch/x86/kernel/smpboot.c | 4 +- + include/linux/sched.h | 3 + + include/linux/sched/sd_flags.h | 5 +- + kernel/sched/core.c | 4 +- + kernel/sched/debug.c | 1 + + kernel/sched/fair.c | 265 ++++++++++++++++++++------------- + kernel/sched/features.h | 1 + + kernel/sched/pelt.c | 60 ++++++++ + kernel/sched/pelt.h | 42 +++++- + kernel/sched/sched.h | 23 ++- + 11 files changed, 294 insertions(+), 137 deletions(-) diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 9ff480e94511..6510883c5e81 100644 @@ -33507,7 +12550,7 @@ index 9ff480e94511..6510883c5e81 100644 + per_cpu(sched_core_priority, cpu) = prio; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c -index 55cad72715d9..0213d066a9a9 100644 +index 9013bb28255a..cea297d97034 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -547,7 +547,7 @@ static int x86_core_flags(void) @@ -33519,8 +12562,17 @@ index 55cad72715d9..0213d066a9a9 100644 } #endif #ifdef CONFIG_SCHED_CLUSTER +@@ -578,7 +578,7 @@ static struct sched_domain_topology_level x86_hybrid_topology[] = { + #ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, + #endif +- { cpu_cpu_mask, SD_INIT_NAME(DIE) }, ++ { cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(DIE) }, + { NULL, }, + }; + diff --git a/include/linux/sched.h b/include/linux/sched.h -index 853d08f7562b..28ce1be0ba47 100644 +index 63d242164b1a..6d398b337b0d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -557,6 +557,9 @@ struct sched_entity { @@ -33534,7 +12586,7 @@ index 853d08f7562b..28ce1be0ba47 100644 #ifdef CONFIG_FAIR_GROUP_SCHED int depth; diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h -index 57bde66d95f7..800238854ba5 100644 +index 57bde66d95f7..fad77b5172e2 100644 --- a/include/linux/sched/sd_flags.h +++ b/include/linux/sched/sd_flags.h @@ -132,12 +132,9 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) @@ -33547,15 +12599,15 @@ index 57bde66d95f7..800238854ba5 100644 * NEEDS_GROUPS: Load balancing flag. */ -SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) -+SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS) ++SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS) /* * Prefer to place tasks in a sibling domain diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 2a4918a1faa9..5237639786b7 100644 +index 0d18c3969f90..17bb9637f314 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -722,7 +722,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) +@@ -724,7 +724,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) update_irq_load_avg(rq, irq_delta + steal); #endif @@ -33564,112 +12616,7 @@ index 2a4918a1faa9..5237639786b7 100644 } void update_rq_clock(struct rq *rq) -@@ -3675,14 +3675,39 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) - } - - /* -- * Mark the task runnable and perform wakeup-preemption. -+ * Mark the task runnable. 
- */ --static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, -- struct rq_flags *rf) -+static inline void ttwu_do_wakeup(struct task_struct *p) - { -- check_preempt_curr(rq, p, wake_flags); - WRITE_ONCE(p->__state, TASK_RUNNING); - trace_sched_wakeup(p); -+} -+ -+static void -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, -+ struct rq_flags *rf) -+{ -+ int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; -+ -+ lockdep_assert_rq_held(rq); -+ -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible--; -+ -+#ifdef CONFIG_SMP -+ if (wake_flags & WF_MIGRATED) -+ en_flags |= ENQUEUE_MIGRATED; -+ else -+#endif -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+ activate_task(rq, p, en_flags); -+ check_preempt_curr(rq, p, wake_flags); -+ -+ ttwu_do_wakeup(p); - - #ifdef CONFIG_SMP - if (p->sched_class->task_woken) { -@@ -3712,31 +3737,6 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, - #endif - } - --static void --ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, -- struct rq_flags *rf) --{ -- int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; -- -- lockdep_assert_rq_held(rq); -- -- if (p->sched_contributes_to_load) -- rq->nr_uninterruptible--; -- --#ifdef CONFIG_SMP -- if (wake_flags & WF_MIGRATED) -- en_flags |= ENQUEUE_MIGRATED; -- else --#endif -- if (p->in_iowait) { -- delayacct_blkio_end(p); -- atomic_dec(&task_rq(p)->nr_iowait); -- } -- -- activate_task(rq, p, en_flags); -- ttwu_do_wakeup(rq, p, wake_flags, rf); --} -- - /* - * Consider @p being inside a wait loop: - * -@@ -3770,9 +3770,15 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) - - rq = __task_rq_lock(p, &rf); - if (task_on_rq_queued(p)) { -- /* check_preempt_curr() may use rq clock */ -- update_rq_clock(rq); -- ttwu_do_wakeup(rq, p, wake_flags, &rf); -+ if (!task_on_cpu(rq, p)) { -+ /* -+ * When on_rq && !on_cpu the task is preempted, see if -+ * it should preempt the task that is current now. 
-+ */ -+ update_rq_clock(rq); -+ check_preempt_curr(rq, p, wake_flags); -+ } -+ ttwu_do_wakeup(p); - ret = 1; - } - __task_rq_unlock(rq, &rf); -@@ -4138,8 +4144,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) - goto out; - - trace_sched_waking(p); -- WRITE_ONCE(p->__state, TASK_RUNNING); -- trace_sched_wakeup(p); -+ ttwu_do_wakeup(p); - goto out; - } - -@@ -4424,6 +4429,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4434,6 +4434,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; @@ -33678,215 +12625,6 @@ index 2a4918a1faa9..5237639786b7 100644 INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_FAIR_GROUP_SCHED -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index 1207c78f85c1..5c840151f3bb 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -48,7 +48,6 @@ struct sugov_cpu { - - unsigned long util; - unsigned long bw_dl; -- unsigned long max; - - /* The field below is for single-CPU policies only: */ - #ifdef CONFIG_NO_HZ_COMMON -@@ -158,7 +157,6 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) - { - struct rq *rq = cpu_rq(sg_cpu->cpu); - -- sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); - sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), - FREQUENCY_UTIL, NULL); -@@ -238,6 +236,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, - * sugov_iowait_apply() - Apply the IO boost to a CPU. - * @sg_cpu: the sugov data for the cpu to boost - * @time: the update time from the caller -+ * @max_cap: the max CPU capacity - * - * A CPU running a task which woken up after an IO operation can have its - * utilization boosted to speed up the completion of those IO operations. -@@ -251,7 +250,8 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, - * This mechanism is designed to boost high frequently IO waiting tasks, while - * being more conservative on tasks which does sporadic IO operations. - */ --static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) -+static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, -+ unsigned long max_cap) - { - unsigned long boost; - -@@ -280,7 +280,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) - * sg_cpu->util is already in capacity scale; convert iowait_boost - * into the same scale so we can compare. 
- */ -- boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT; -+ boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; - boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); - if (sg_cpu->util < boost) - sg_cpu->util = boost; -@@ -310,7 +310,8 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) - } - - static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, -- u64 time, unsigned int flags) -+ u64 time, unsigned long max_cap, -+ unsigned int flags) - { - sugov_iowait_boost(sg_cpu, time, flags); - sg_cpu->last_update = time; -@@ -321,7 +322,7 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, - return false; - - sugov_get_util(sg_cpu); -- sugov_iowait_apply(sg_cpu, time); -+ sugov_iowait_apply(sg_cpu, time, max_cap); - - return true; - } -@@ -332,12 +333,15 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, - struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); - struct sugov_policy *sg_policy = sg_cpu->sg_policy; - unsigned int cached_freq = sg_policy->cached_raw_freq; -+ unsigned long max_cap; - unsigned int next_f; - -- if (!sugov_update_single_common(sg_cpu, time, flags)) -+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); -+ -+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) - return; - -- next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max); -+ next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); - /* - * Do not reduce the frequency if the CPU has not been idle - * recently, as the reduction is likely to be premature then. -@@ -374,6 +378,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, - { - struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); - unsigned long prev_util = sg_cpu->util; -+ unsigned long max_cap; - - /* - * Fall back to the "frequency" path if frequency invariance is not -@@ -385,7 +390,9 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, - return; - } - -- if (!sugov_update_single_common(sg_cpu, time, flags)) -+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); -+ -+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) - return; - - /* -@@ -399,7 +406,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, - sg_cpu->util = prev_util; - - cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), -- map_util_perf(sg_cpu->util), sg_cpu->max); -+ map_util_perf(sg_cpu->util), max_cap); - - sg_cpu->sg_policy->last_freq_update_time = time; - } -@@ -408,25 +415,21 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) - { - struct sugov_policy *sg_policy = sg_cpu->sg_policy; - struct cpufreq_policy *policy = sg_policy->policy; -- unsigned long util = 0, max = 1; -+ unsigned long util = 0, max_cap; - unsigned int j; - -+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); -+ - for_each_cpu(j, policy->cpus) { - struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); -- unsigned long j_util, j_max; - - sugov_get_util(j_sg_cpu); -- sugov_iowait_apply(j_sg_cpu, time); -- j_util = j_sg_cpu->util; -- j_max = j_sg_cpu->max; -+ sugov_iowait_apply(j_sg_cpu, time, max_cap); - -- if (j_util * max > j_max * util) { -- util = j_util; -- max = j_max; -- } -+ util = max(j_sg_cpu->util, util); - } - -- return get_next_freq(sg_policy, util, max); -+ return get_next_freq(sg_policy, util, max_cap); - } - - static void -diff --git a/kernel/sched/deadline.c 
b/kernel/sched/deadline.c -index 0d97d54276cc..71b24371a6f7 100644 ---- a/kernel/sched/deadline.c -+++ b/kernel/sched/deadline.c -@@ -2663,17 +2663,20 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) - static void prio_changed_dl(struct rq *rq, struct task_struct *p, - int oldprio) - { -- if (task_on_rq_queued(p) || task_current(rq, p)) { -+ if (!task_on_rq_queued(p)) -+ return; -+ - #ifdef CONFIG_SMP -- /* -- * This might be too much, but unfortunately -- * we don't have the old deadline value, and -- * we can't argue if the task is increasing -- * or lowering its prio, so... -- */ -- if (!rq->dl.overloaded) -- deadline_queue_pull_task(rq); -+ /* -+ * This might be too much, but unfortunately -+ * we don't have the old deadline value, and -+ * we can't argue if the task is increasing -+ * or lowering its prio, so... -+ */ -+ if (!rq->dl.overloaded) -+ deadline_queue_pull_task(rq); - -+ if (task_current(rq, p)) { - /* - * If we now have a earlier deadline task than p, - * then reschedule, provided p is still on this -@@ -2681,15 +2684,24 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, - */ - if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline)) - resched_curr(rq); --#else -+ } else { - /* -- * Again, we don't know if p has a earlier -- * or later deadline, so let's blindly set a -- * (maybe not needed) rescheduling point. -+ * Current may not be deadline in case p was throttled but we -+ * have just replenished it (e.g. rt_mutex_setprio()). -+ * -+ * Otherwise, if p was given an earlier deadline, reschedule. - */ -- resched_curr(rq); --#endif /* CONFIG_SMP */ -+ if (!dl_task(rq->curr) || -+ dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) -+ resched_curr(rq); - } -+#else -+ /* -+ * We don't know if p has a earlier or later deadline, so let's blindly -+ * set a (maybe not needed) rescheduling point. -+ */ -+ resched_curr(rq); -+#endif - } - - DEFINE_SCHED_CLASS(dl) = { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 1637b65ba07a..8d64fba16cfe 100644 --- a/kernel/sched/debug.c @@ -33900,7 +12638,7 @@ index 1637b65ba07a..8d64fba16cfe 100644 P(se.avg.load_sum); P(se.avg.runnable_sum); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 86a988c830ef..84254f52c56a 100644 +index dcdd8422de72..115be8a965f2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1082,6 +1082,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -33951,311 +12689,7 @@ index 86a988c830ef..84254f52c56a 100644 struct task_numa_env { struct task_struct *p; -@@ -4494,17 +4494,9 @@ static inline int util_fits_cpu(unsigned long util, - * - * For uclamp_max, we can tolerate a drop in performance level as the - * goal is to cap the task. So it's okay if it's getting less. -- * -- * In case of capacity inversion we should honour the inverted capacity -- * for both uclamp_min and uclamp_max all the time. - */ -- capacity_orig = cpu_in_capacity_inversion(cpu); -- if (capacity_orig) { -- capacity_orig_thermal = capacity_orig; -- } else { -- capacity_orig = capacity_orig_of(cpu); -- capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); -- } -+ capacity_orig = capacity_orig_of(cpu); -+ capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); - - /* - * We want to force a task to fit a cpu as implied by uclamp_max. -@@ -4579,8 +4571,8 @@ static inline int util_fits_cpu(unsigned long util, - * handle the case uclamp_min > uclamp_max. 
- */ - uclamp_min = min(uclamp_min, uclamp_max); -- if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE) -- fits = fits && (uclamp_min <= capacity_orig_thermal); -+ if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) -+ return -1; - - return fits; - } -@@ -4590,7 +4582,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu) - unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); - unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); - unsigned long util = task_util_est(p); -- return util_fits_cpu(util, uclamp_min, uclamp_max, cpu); -+ /* -+ * Return true only if the cpu fully fits the task requirements, which -+ * include the utilization but also the performance hints. -+ */ -+ return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0); - } - - static inline void update_misfit_status(struct task_struct *p, struct rq *rq) -@@ -4674,6 +4670,7 @@ static void - place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) - { - u64 vruntime = cfs_rq->min_vruntime; -+ u64 sleep_time; - - /* - * The 'current' period is already promised to the current tasks, -@@ -4703,8 +4700,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) - vruntime -= thresh; - } - -- /* ensure we never gain time by being placed backwards. */ -- se->vruntime = max_vruntime(se->vruntime, vruntime); -+ /* -+ * Pull vruntime of the entity being placed to the base level of -+ * cfs_rq, to prevent boosting it if placed backwards. If the entity -+ * slept for a long time, don't even try to compare its vruntime with -+ * the base as it may be too far off and the comparison may get -+ * inversed due to s64 overflow. -+ */ -+ sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; -+ if ((s64)sleep_time > 60LL * NSEC_PER_SEC) -+ se->vruntime = vruntime; -+ else -+ se->vruntime = max_vruntime(se->vruntime, vruntime); - } - - static void check_enqueue_throttle(struct cfs_rq *cfs_rq); -@@ -4914,7 +4921,13 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) - struct sched_entity *se; - s64 delta; - -- ideal_runtime = sched_slice(cfs_rq, curr); -+ /* -+ * When many tasks blow up the sched_period; it is possible that -+ * sched_slice() reports unusually large results (when many tasks are -+ * very light for example). Therefore impose a maximum. -+ */ -+ ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); -+ - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) { - resched_curr(rq_of(cfs_rq)); -@@ -5479,22 +5492,105 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) - resched_curr(rq); - } - --static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) -+#ifdef CONFIG_SMP -+static void __cfsb_csd_unthrottle(void *arg) - { -- struct cfs_rq *cfs_rq; -+ struct cfs_rq *cursor, *tmp; -+ struct rq *rq = arg; -+ struct rq_flags rf; -+ -+ rq_lock(rq, &rf); -+ -+ /* -+ * Since we hold rq lock we're safe from concurrent manipulation of -+ * the CSD list. However, this RCU critical section annotates the -+ * fact that we pair with sched_free_group_rcu(), so that we cannot -+ * race with group being freed in the window between removing it -+ * from the list and advancing to the next entry in the list. 
-+ */ -+ rcu_read_lock(); -+ -+ list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list, -+ throttled_csd_list) { -+ list_del_init(&cursor->throttled_csd_list); -+ -+ if (cfs_rq_throttled(cursor)) -+ unthrottle_cfs_rq(cursor); -+ } -+ -+ rcu_read_unlock(); -+ -+ rq_unlock(rq, &rf); -+} -+ -+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) -+{ -+ struct rq *rq = rq_of(cfs_rq); -+ bool first; -+ -+ if (rq == this_rq()) { -+ unthrottle_cfs_rq(cfs_rq); -+ return; -+ } -+ -+ /* Already enqueued */ -+ if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) -+ return; -+ -+ first = list_empty(&rq->cfsb_csd_list); -+ list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list); -+ if (first) -+ smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); -+} -+#else -+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) -+{ -+ unthrottle_cfs_rq(cfs_rq); -+} -+#endif -+ -+static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) -+{ -+ lockdep_assert_rq_held(rq_of(cfs_rq)); -+ -+ if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || -+ cfs_rq->runtime_remaining <= 0)) -+ return; -+ -+ __unthrottle_cfs_rq_async(cfs_rq); -+} -+ -+static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) -+{ -+ struct cfs_rq *local_unthrottle = NULL; -+ int this_cpu = smp_processor_id(); - u64 runtime, remaining = 1; -+ bool throttled = false; -+ struct cfs_rq *cfs_rq; -+ struct rq_flags rf; -+ struct rq *rq; - - rcu_read_lock(); - list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, - throttled_list) { -- struct rq *rq = rq_of(cfs_rq); -- struct rq_flags rf; -+ rq = rq_of(cfs_rq); -+ -+ if (!remaining) { -+ throttled = true; -+ break; -+ } - - rq_lock_irqsave(rq, &rf); - if (!cfs_rq_throttled(cfs_rq)) - goto next; - -- /* By the above check, this should never be true */ -+#ifdef CONFIG_SMP -+ /* Already queued for async unthrottle */ -+ if (!list_empty(&cfs_rq->throttled_csd_list)) -+ goto next; -+#endif -+ -+ /* By the above checks, this should never be true */ - SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); - - raw_spin_lock(&cfs_b->lock); -@@ -5508,16 +5604,30 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) - cfs_rq->runtime_remaining += runtime; - - /* we check whether we're throttled above */ -- if (cfs_rq->runtime_remaining > 0) -- unthrottle_cfs_rq(cfs_rq); -+ if (cfs_rq->runtime_remaining > 0) { -+ if (cpu_of(rq) != this_cpu || -+ SCHED_WARN_ON(local_unthrottle)) -+ unthrottle_cfs_rq_async(cfs_rq); -+ else -+ local_unthrottle = cfs_rq; -+ } else { -+ throttled = true; -+ } - - next: - rq_unlock_irqrestore(rq, &rf); -- -- if (!remaining) -- break; - } - rcu_read_unlock(); -+ -+ if (local_unthrottle) { -+ rq = cpu_rq(this_cpu); -+ rq_lock_irqsave(rq, &rf); -+ if (cfs_rq_throttled(local_unthrottle)) -+ unthrottle_cfs_rq(local_unthrottle); -+ rq_unlock_irqrestore(rq, &rf); -+ } -+ -+ return throttled; - } - - /* -@@ -5562,10 +5672,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u - while (throttled && cfs_b->runtime > 0) { - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); - /* we can't nest cfs_b->lock while distributing bandwidth */ -- distribute_cfs_runtime(cfs_b); -+ throttled = distribute_cfs_runtime(cfs_b); - raw_spin_lock_irqsave(&cfs_b->lock, flags); -- -- throttled = !list_empty(&cfs_b->throttled_cfs_rq); - } - - /* -@@ -5842,6 +5950,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) - { - cfs_rq->runtime_enabled = 0; - INIT_LIST_HEAD(&cfs_rq->throttled_list); -+#ifdef CONFIG_SMP -+ 
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); -+#endif - } - - void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -@@ -5858,12 +5969,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) - - static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) - { -+ int __maybe_unused i; -+ - /* init_cfs_bandwidth() was not called */ - if (!cfs_b->throttled_cfs_rq.next) - return; - - hrtimer_cancel(&cfs_b->period_timer); - hrtimer_cancel(&cfs_b->slack_timer); -+ -+ /* -+ * It is possible that we still have some cfs_rq's pending on a CSD -+ * list, though this race is very rare. In order for this to occur, we -+ * must have raced with the last task leaving the group while there -+ * exist throttled cfs_rq(s), and the period_timer must have queued the -+ * CSD item but the remote cpu has not yet processed it. To handle this, -+ * we can simply flush all pending CSD work inline here. We're -+ * guaranteed at this point that no additional cfs_rq of this group can -+ * join a CSD list. -+ */ -+#ifdef CONFIG_SMP -+ for_each_possible_cpu(i) { -+ struct rq *rq = cpu_rq(i); -+ unsigned long flags; -+ -+ if (list_empty(&rq->cfsb_csd_list)) -+ continue; -+ -+ local_irq_save(flags); -+ __cfsb_csd_unthrottle(rq); -+ local_irq_restore(flags); -+ } -+#endif - } - - /* -@@ -6026,6 +6163,7 @@ static inline bool cpu_overutilized(int cpu) - unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); - unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); - -+ /* Return true only if the utilization doesn't fit CPU's capacity */ - return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); - } - -@@ -6159,6 +6297,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6333,6 +6333,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) static void set_next_buddy(struct sched_entity *se); @@ -34274,7 +12708,7 @@ index 86a988c830ef..84254f52c56a 100644 /* * The dequeue_task method is called before nr_running is * decreased. We remove the task from the rbtree and -@@ -6231,6 +6381,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6405,6 +6417,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) dequeue_throttle: util_est_update(&rq->cfs, p, task_sleep); @@ -34282,7 +12716,7 @@ index 86a988c830ef..84254f52c56a 100644 hrtick_update(rq); } -@@ -6364,6 +6515,23 @@ static int wake_wide(struct task_struct *p) +@@ -6538,6 +6551,23 @@ static int wake_wide(struct task_struct *p) return 1; } @@ -34306,7 +12740,7 @@ index 86a988c830ef..84254f52c56a 100644 /* * The purpose of wake_affine() is to quickly determine on which CPU we can run * soonest. 
For the purpose of speed we only consider the waking and previous -@@ -6400,6 +6568,11 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) +@@ -6574,6 +6604,11 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) if (available_idle_cpu(prev_cpu)) return prev_cpu; @@ -34318,7 +12752,7 @@ index 86a988c830ef..84254f52c56a 100644 return nr_cpumask_bits; } -@@ -6774,6 +6947,20 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool +@@ -6948,6 +6983,20 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool /* overloaded LLC is unlikely to have idle cpu/core */ if (nr == 1) return -1; @@ -34339,262 +12773,28 @@ index 86a988c830ef..84254f52c56a 100644 } } -@@ -6819,6 +7006,7 @@ static int - select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) - { - unsigned long task_util, util_min, util_max, best_cap = 0; -+ int fits, best_fits = 0; - int cpu, best_cpu = -1; - struct cpumask *cpus; - -@@ -6834,12 +7022,28 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) - - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) - continue; -- if (util_fits_cpu(task_util, util_min, util_max, cpu)) -+ -+ fits = util_fits_cpu(task_util, util_min, util_max, cpu); -+ -+ /* This CPU fits with all requirements */ -+ if (fits > 0) - return cpu; -+ /* -+ * Only the min performance hint (i.e. uclamp_min) doesn't fit. -+ * Look for the CPU with best capacity. -+ */ -+ else if (fits < 0) -+ cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu)); - -- if (cpu_cap > best_cap) { -+ /* -+ * First, select CPU which fits better (-1 being better than 0). -+ * Then, select the one with best capacity at same level. -+ */ -+ if ((fits < best_fits) || -+ ((fits == best_fits) && (cpu_cap > best_cap))) { - best_cap = cpu_cap; - best_cpu = cpu; -+ best_fits = fits; - } - } - -@@ -6852,7 +7056,11 @@ static inline bool asym_fits_cpu(unsigned long util, - int cpu) - { - if (sched_asym_cpucap_active()) -- return util_fits_cpu(util, util_min, util_max, cpu); -+ /* -+ * Return true only if the cpu fully fits the task requirements -+ * which include the utilization and the performance hints. -+ */ -+ return (util_fits_cpu(util, util_min, util_max, cpu) > 0); - - return true; +@@ -9288,96 +9337,65 @@ group_type group_classify(unsigned int imbalance_pct, } -@@ -7219,6 +7427,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) - unsigned long p_util_max = uclamp_is_used() ? 
uclamp_eff_value(p, UCLAMP_MAX) : 1024; - struct root_domain *rd = this_rq()->rd; - int cpu, best_energy_cpu, target = -1; -+ int prev_fits = -1, best_fits = -1; -+ unsigned long best_thermal_cap = 0; -+ unsigned long prev_thermal_cap = 0; - struct sched_domain *sd; - struct perf_domain *pd; - struct energy_env eenv; -@@ -7254,6 +7465,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) - unsigned long prev_spare_cap = 0; - int max_spare_cap_cpu = -1; - unsigned long base_energy; -+ int fits, max_fits = -1; - cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); - -@@ -7303,7 +7515,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) - util_min = max(rq_util_min, p_util_min); - util_max = max(rq_util_max, p_util_max); - } -- if (!util_fits_cpu(util, util_min, util_max, cpu)) -+ -+ fits = util_fits_cpu(util, util_min, util_max, cpu); -+ if (!fits) - continue; - - lsub_positive(&cpu_cap, util); -@@ -7311,7 +7525,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) - if (cpu == prev_cpu) { - /* Always use prev_cpu as a candidate. */ - prev_spare_cap = cpu_cap; -- } else if (cpu_cap > max_spare_cap) { -+ prev_fits = fits; -+ } else if ((fits > max_fits) || -+ ((fits == max_fits) && (cpu_cap > max_spare_cap))) { - /* - * Find the CPU with the maximum spare capacity - * among the remaining CPUs in the performance -@@ -7319,6 +7535,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) - */ - max_spare_cap = cpu_cap; - max_spare_cap_cpu = cpu; -+ max_fits = fits; - } - } - -@@ -7337,26 +7554,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) - if (prev_delta < base_energy) - goto unlock; - prev_delta -= base_energy; -+ prev_thermal_cap = cpu_thermal_cap; - best_delta = min(best_delta, prev_delta); - } - - /* Evaluate the energy impact of using max_spare_cap_cpu. */ - if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) { -+ /* Current best energy cpu fits better */ -+ if (max_fits < best_fits) -+ continue; -+ -+ /* -+ * Both don't fit performance hint (i.e. uclamp_min) -+ * but best energy cpu has better capacity. -+ */ -+ if ((max_fits < 0) && -+ (cpu_thermal_cap <= best_thermal_cap)) -+ continue; -+ - cur_delta = compute_energy(&eenv, pd, cpus, p, - max_spare_cap_cpu); - /* CPU utilization has changed */ - if (cur_delta < base_energy) - goto unlock; - cur_delta -= base_energy; -- if (cur_delta < best_delta) { -- best_delta = cur_delta; -- best_energy_cpu = max_spare_cap_cpu; -- } -+ -+ /* -+ * Both fit for the task but best energy cpu has lower -+ * energy impact. 
-+ */ -+ if ((max_fits > 0) && (best_fits > 0) && -+ (cur_delta >= best_delta)) -+ continue; -+ -+ best_delta = cur_delta; -+ best_energy_cpu = max_spare_cap_cpu; -+ best_fits = max_fits; -+ best_thermal_cap = cpu_thermal_cap; - } - } - rcu_read_unlock(); - -- if (best_delta < prev_delta) -+ if ((best_fits > prev_fits) || -+ ((best_fits > 0) && (best_delta < prev_delta)) || -+ ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap))) - target = best_energy_cpu; - - return target; -@@ -8856,82 +9097,16 @@ static unsigned long scale_rt_capacity(int cpu) - - static void update_cpu_capacity(struct sched_domain *sd, int cpu) - { -- unsigned long capacity_orig = arch_scale_cpu_capacity(cpu); - unsigned long capacity = scale_rt_capacity(cpu); - struct sched_group *sdg = sd->groups; -- struct rq *rq = cpu_rq(cpu); - -- rq->cpu_capacity_orig = capacity_orig; -+ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); - - if (!capacity) - capacity = 1; - -- rq->cpu_capacity = capacity; -- -- /* -- * Detect if the performance domain is in capacity inversion state. -- * -- * Capacity inversion happens when another perf domain with equal or -- * lower capacity_orig_of() ends up having higher capacity than this -- * domain after subtracting thermal pressure. -- * -- * We only take into account thermal pressure in this detection as it's -- * the only metric that actually results in *real* reduction of -- * capacity due to performance points (OPPs) being dropped/become -- * unreachable due to thermal throttling. -- * -- * We assume: -- * * That all cpus in a perf domain have the same capacity_orig -- * (same uArch). -- * * Thermal pressure will impact all cpus in this perf domain -- * equally. -- */ -- if (sched_energy_enabled()) { -- unsigned long inv_cap = capacity_orig - thermal_load_avg(rq); -- struct perf_domain *pd; -- -- rcu_read_lock(); -- -- pd = rcu_dereference(rq->rd->pd); -- rq->cpu_capacity_inverted = 0; -- -- for (; pd; pd = pd->next) { -- struct cpumask *pd_span = perf_domain_span(pd); -- unsigned long pd_cap_orig, pd_cap; -- -- /* We can't be inverted against our own pd */ -- if (cpumask_test_cpu(cpu_of(rq), pd_span)) -- continue; -- -- cpu = cpumask_any(pd_span); -- pd_cap_orig = arch_scale_cpu_capacity(cpu); -- -- if (capacity_orig < pd_cap_orig) -- continue; -- -- /* -- * handle the case of multiple perf domains have the -- * same capacity_orig but one of them is under higher -- * thermal pressure. We record it as capacity -- * inversion. 
-- */ -- if (capacity_orig == pd_cap_orig) { -- pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu)); -- -- if (pd_cap > inv_cap) { -- rq->cpu_capacity_inverted = inv_cap; -- break; -- } -- } else if (pd_cap_orig > inv_cap) { -- rq->cpu_capacity_inverted = inv_cap; -- break; -- } -- } -- -- rcu_read_unlock(); -- } -- -- trace_sched_cpu_capacity_tp(rq); -+ cpu_rq(cpu)->cpu_capacity = capacity; -+ trace_sched_cpu_capacity_tp(cpu_rq(cpu)); - - sdg->sgc->capacity = capacity; - sdg->sgc->min_capacity = capacity; -@@ -9135,20 +9310,15 @@ group_type group_classify(unsigned int imbalance_pct, - * @sgs: Load-balancing statistics of the candidate busiest group - * @sg: The candidate busiest group - * + /** +- * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks +- * @dst_cpu: Destination CPU of the load balancing +- * @sds: Load-balancing data with statistics of the local group +- * @sgs: Load-balancing statistics of the candidate busiest group +- * @sg: The candidate busiest group +- * - * Check the state of the SMT siblings of both @sds::local and @sg and decide - * if @dst_cpu can pull tasks. -+ * Check the state of the SMT siblings of @sg and decide if @dst_cpu can pull -+ * tasks. ++ * sched_use_asym_prio - Check whether asym_packing priority must be used ++ * @sd: The scheduling domain of the load balancing ++ * @cpu: A CPU * - * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of - * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks - * only if @dst_cpu has higher priority. -+ * This function must be called only if all the SMT siblings of @dst_cpu are -+ * idle, if any. ++ * Always use CPU priority when balancing load between SMT siblings. When ++ * balancing load between cores, it is not sufficient that @cpu is idle. Only ++ * use CPU priority if the whole core is idle. * - * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more - * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority. @@ -34603,24 +12803,24 @@ index 86a988c830ef..84254f52c56a 100644 - * - * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings - * of @dst_cpu are idle and @sg has lower priority. -+ * @dst_cpu can pull tasks if @sg has exactly one busy CPU (i.e., one more than -+ * @sds::local) and has lower group priority than @sds::local. Bigger imbalances -+ * in the number of busy CPUs will be dealt with in find_busiest_group(). - * - * Return: true if @dst_cpu can pull tasks, false otherwise. +- * +- * Return: true if @dst_cpu can pull tasks, false otherwise. ++ * Returns: True if the priority of @cpu must be followed. False otherwise. 
*/ -@@ -9157,51 +9327,16 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, - struct sched_group *sg) +-static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, +- struct sg_lb_stats *sgs, +- struct sched_group *sg) ++static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) { #ifdef CONFIG_SCHED_SMT - bool local_is_smt, sg_is_smt; - int sg_busy_cpus; - +- int sg_busy_cpus; +- - local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; - sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY; - - sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; - +- sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; +- - if (!local_is_smt) { - /* - * If we are here, @dst_cpu is idle and does not have SMT @@ -34652,36 +12852,70 @@ index 86a988c830ef..84254f52c56a 100644 - return false; - } - - /* +- /* - * @sg does not have SMT siblings. Ensure that @sds::local does not end - * up with more than one busy SMT sibling and only pull tasks if there - * are not busy CPUs (i.e., no CPU has running tasks). -+ * If the difference in the number of busy CPUs is two or more, let -+ * find_busiest_group() take care of it. We only care if @sg has -+ * exactly one busy CPU. This covers SMT and non-SMT sched groups. - */ +- */ - if (!sds->local_stat.sum_nr_running) -+ if (sg_busy_cpus == 1) - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); +- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); ++ if (!sched_smt_active()) ++ return true; - return false; -@@ -9215,7 +9350,14 @@ static inline bool +- return false; ++ return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu); + #else +- /* Always return false so that callers deal with non-SMT cases. */ +- return false; ++ return true; + #endif + } + ++/** ++ * sched_asym - Check if the destination CPU can do asym_packing load balance ++ * @env: The load balancing environment ++ * @sds: Load-balancing data with statistics of the local group ++ * @sgs: Load-balancing statistics of the candidate busiest group ++ * @group: The candidate busiest group ++ * ++ * @env::dst_cpu can do asym_packing if it has higher priority than the ++ * preferred CPU of @group. ++ * ++ * SMT is a special case. If we are balancing load between cores, @env::dst_cpu ++ * can do asym_packing balance only if all its SMT siblings are idle. Also, it ++ * can only do it if @group is an SMT group and has exactly on busy CPU. Larger ++ * imbalances in the number of CPUS are dealt with in find_busiest_group(). ++ * ++ * If we are balancing load within an SMT core, or at DIE domain level, always ++ * proceed. ++ * ++ * Return: true if @env::dst_cpu can do with asym_packing load balance. False ++ * otherwise. ++ */ + static inline bool sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, struct sched_group *group) { - /* Only do SMT checks if either local or candidate have SMT siblings */ -+ /* -+ * If the destination CPU has SMT siblings, env->idle != CPU_NOT_IDLE -+ * is not sufficient. We need to make sure the whole core is idle. -+ */ -+ if (sds->local->flags & SD_SHARE_CPUCAPACITY && !is_core_idle(env->dst_cpu)) +- if ((sds->local->flags & SD_SHARE_CPUCAPACITY) || +- (group->flags & SD_SHARE_CPUCAPACITY)) +- return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); ++ /* Ensure that the whole local core is idle, if applicable. */ ++ if (!sched_use_asym_prio(env->sd, env->dst_cpu)) + return false; + -+ /* Only do SMT checks if either local or candidate have SMT siblings. 
*/ - if ((sds->local->flags & SD_SHARE_CPUCAPACITY) || - (group->flags & SD_SHARE_CPUCAPACITY)) - return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); -@@ -9408,10 +9550,22 @@ static bool update_sd_pick_busiest(struct lb_env *env, ++ /* ++ * CPU priorities does not make sense for SMT cores with more than one ++ * busy sibling. ++ */ ++ if (group->flags & SD_SHARE_CPUCAPACITY) { ++ if (sgs->group_weight - sgs->idle_cpus != 1) ++ return false; ++ } + + return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); + } +@@ -9567,10 +9585,22 @@ static bool update_sd_pick_busiest(struct lb_env *env, * contention when accessing shared HW resources. * * XXX for now avg_load is not computed and always 0 so we @@ -34706,7 +12940,7 @@ index 86a988c830ef..84254f52c56a 100644 break; case group_has_spare: -@@ -9886,7 +10040,6 @@ static void update_idle_cpu_scan(struct lb_env *env, +@@ -10045,7 +10075,6 @@ static void update_idle_cpu_scan(struct lb_env *env, static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { @@ -34714,72 +12948,35 @@ index 86a988c830ef..84254f52c56a 100644 struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; -@@ -9927,9 +10080,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd +@@ -10086,8 +10115,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sg = sg->next; } while (sg != env->sd->groups); - /* Tag domain that child domain prefers tasks go to siblings first */ - sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; -- + /* -+ * Tag domain that @env::sd prefers to spread excess tasks among -+ * sibling sched groups. ++ * Indicate that the child domain of the busiest group prefers tasks ++ * go to a child's sibling domains first. NB the flags of a sched group ++ * are those of the child domain. + */ -+ sds->prefer_sibling = env->sd->flags & SD_PREFER_SIBLING; ++ if (sds->busiest) ++ sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING); + if (env->sd->flags & SD_NUMA) - env->fbq_type = fbq_classify_group(&sds->busiest_stat); -@@ -10159,24 +10314,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env) - */ - update_sd_lb_stats(env, &sds); - -- if (sched_energy_enabled()) { -- struct root_domain *rd = env->dst_rq->rd; -- -- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) -- goto out_balanced; -- } -- -- local = &sds.local_stat; -- busiest = &sds.busiest_stat; -- - /* There is no busy sibling group to pull tasks from */ - if (!sds.busiest) - goto out_balanced; - -+ busiest = &sds.busiest_stat; -+ - /* Misfit tasks should be dealt with regardless of the avg load */ - if (busiest->group_type == group_misfit_task) - goto force_balance; - -+ if (sched_energy_enabled()) { -+ struct root_domain *rd = env->dst_rq->rd; -+ -+ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) -+ goto out_balanced; -+ } -+ - /* ASYM feature bypasses nice load balance check */ - if (busiest->group_type == group_asym_packing) - goto force_balance; -@@ -10189,6 +10343,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) - if (busiest->group_type == group_imbalanced) - goto force_balance; - -+ local = &sds.local_stat; - /* - * If the local group is busier than the selected busiest group - * don't try and pull any tasks. 
-@@ -10228,7 +10383,6 @@ static struct sched_group *find_busiest_group(struct lb_env *env) +@@ -10387,7 +10421,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto out_balanced; } - /* Try to move all excess tasks to child's sibling domain */ ++ /* ++ * Try to move all excess tasks to a sibling domain of the busiest ++ * group's child domain. ++ */ if (sds.prefer_sibling && local->group_type == group_has_spare && busiest->sum_nr_running > local->sum_nr_running + 1) goto force_balance; -@@ -10330,11 +10484,20 @@ static struct rq *find_busiest_queue(struct lb_env *env, +@@ -10489,8 +10526,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, nr_running == 1) continue; @@ -34792,72 +12989,49 @@ index 86a988c830ef..84254f52c56a 100644 + * SMT cores with more than one busy sibling. + */ if ((env->sd->flags & SD_ASYM_PACKING) && ++ sched_use_asym_prio(env->sd, i) && sched_asym_prefer(i, env->dst_cpu) && -- nr_running == 1) -- continue; -+ nr_running == 1) { -+ if (env->sd->flags & SD_SHARE_CPUCAPACITY || -+ (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && is_core_idle(i))) -+ continue; -+ } - - switch (env->migration_type) { - case migrate_load: -@@ -10424,8 +10587,20 @@ asym_active_balance(struct lb_env *env) - * lower priority CPUs in order to pack all tasks in the - * highest priority CPUs. + nr_running == 1) + continue; +@@ -10579,12 +10623,19 @@ static inline bool + asym_active_balance(struct lb_env *env) + { + /* +- * ASYM_PACKING needs to force migrate tasks from busy but +- * lower priority CPUs in order to pack all tasks in the +- * highest priority CPUs. ++ * ASYM_PACKING needs to force migrate tasks from busy but lower ++ * priority CPUs in order to pack all tasks in the highest priority ++ * CPUs. When done between cores, do it only if the whole core if the ++ * whole core is idle. ++ * ++ * If @env::src_cpu is an SMT core with busy siblings, let ++ * the lower priority @env::dst_cpu help it. Do not follow ++ * CPU priority. */ -- return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && + return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && - sched_asym_prefer(env->dst_cpu, env->src_cpu); -+ if (env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING)) { -+ /* Always obey priorities between SMT siblings. */ -+ if (env->sd->flags & SD_SHARE_CPUCAPACITY) -+ return sched_asym_prefer(env->dst_cpu, env->src_cpu); -+ -+ /* -+ * A lower priority CPU can help an SMT core with more than one -+ * busy sibling. -+ */ -+ return sched_asym_prefer(env->dst_cpu, env->src_cpu) || -+ !is_core_idle(env->src_cpu); -+ } -+ -+ return false; ++ sched_use_asym_prio(env->sd, env->dst_cpu) && ++ (sched_asym_prefer(env->dst_cpu, env->src_cpu) || ++ !sched_use_asym_prio(env->sd, env->src_cpu)); } static inline bool -@@ -11162,8 +11337,17 @@ static void nohz_balancer_kick(struct rq *rq) +@@ -11318,9 +11369,13 @@ static void nohz_balancer_kick(struct rq *rq) + * When ASYM_PACKING; see if there's a more preferred CPU + * currently idle; in which case, kick the ILB to move tasks + * around. ++ * ++ * When balancing betwen cores, all the SMT siblings of the ++ * preferred CPU must be idle. */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { - if (sched_asym_prefer(i, cpu)) { -- flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; -- goto unlock; -+ /* -+ * Always do ASYM_PACKING balance in the SMT -+ * domain. In upper domains, the core must be -+ * fully idle. 
-+ */ -+ if (sd->flags & SD_SHARE_CPUCAPACITY || -+ (!(sd->flags & SD_SHARE_CPUCAPACITY) && -+ is_core_idle(i))) { -+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; -+ goto unlock; -+ } +- if (sched_asym_prefer(i, cpu)) { ++ if (sched_use_asym_prio(sd, i) && ++ sched_asym_prefer(i, cpu)) { + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; + goto unlock; } - } - } -@@ -12498,6 +12682,11 @@ __init void init_sched_fair_class(void) - for_each_possible_cpu(i) { - zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); - zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); -+ -+#ifdef CONFIG_CFS_BANDWIDTH -+ INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i)); -+ INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list); -+#endif - } - - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index ee7f23c76bd3..efdc29c42161 100644 --- a/kernel/sched/features.h @@ -35025,20 +13199,10 @@ index 3a0e0dc28721..9b35b5072bae 100644 static inline void update_idle_rq_clock_pelt(struct rq *rq) { } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 771f8ddb7053..9e8bb6278604 100644 +index 3e8df6d31c1e..7331d436ebc4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h -@@ -645,6 +645,9 @@ struct cfs_rq { - int throttled; - int throttle_count; - struct list_head throttled_list; -+#ifdef CONFIG_SMP -+ struct list_head throttled_csd_list; -+#endif - #endif /* CONFIG_CFS_BANDWIDTH */ - #endif /* CONFIG_FAIR_GROUP_SCHED */ - }; -@@ -1015,6 +1018,7 @@ struct rq { +@@ -1018,6 +1018,7 @@ struct rq { u64 clock; /* Ensure that all clocks are in the same cache line */ u64 clock_task ____cacheline_aligned; @@ -35046,560 +13210,67 @@ index 771f8ddb7053..9e8bb6278604 100644 u64 clock_pelt; unsigned long lost_idle_time; u64 clock_pelt_idle; -@@ -1041,7 +1045,6 @@ struct rq { +@@ -1772,6 +1773,13 @@ queue_balance_callback(struct rq *rq, + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ + __sd; __sd = __sd->parent) - unsigned long cpu_capacity; - unsigned long cpu_capacity_orig; -- unsigned long cpu_capacity_inverted; - - struct balance_callback *balance_callback; - -@@ -1154,6 +1157,11 @@ struct rq { - - /* Scratch cpumask to be temporarily used under rq_lock */ - cpumask_var_t scratch_mask; ++/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */ ++#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) | ++static const unsigned int SD_SHARED_CHILD_MASK = ++#include ++0; ++#undef SD_FLAG + -+#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP) -+ call_single_data_t cfsb_csd; -+ struct list_head cfsb_csd_list; -+#endif - }; - - #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -2893,24 +2901,6 @@ static inline unsigned long capacity_orig_of(int cpu) - return cpu_rq(cpu)->cpu_capacity_orig; - } - --/* -- * Returns inverted capacity if the CPU is in capacity inversion state. -- * 0 otherwise. -- * -- * Capacity inversion detection only considers thermal impact where actual -- * performance points (OPPs) gets dropped. -- * -- * Capacity inversion state happens when another performance domain that has -- * equal or lower capacity_orig_of() becomes effectively larger than the perf -- * domain this CPU belongs to due to thermal pressure throttling it hard. -- * -- * See comment in update_cpu_capacity(). 
-- */ --static inline unsigned long cpu_in_capacity_inversion(int cpu) --{ -- return cpu_rq(cpu)->cpu_capacity_inverted; --} -- /** - * enum cpu_util_type - CPU utilization type - * @FREQUENCY_UTIL: Utilization used to select frequency --- -2.40.0.rc2 - -From e0cfd01287f19367a61351b05d43cf4471156ffd Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Mon, 6 Feb 2023 09:53:13 +0100 -Subject: [PATCH 14/16] zram - -Signed-off-by: Peter Jung ---- - Documentation/admin-guide/blockdev/zram.rst | 2 + - drivers/block/zram/zram_drv.c | 319 +++++++++++++++++++- - drivers/block/zram/zram_drv.h | 7 + - 3 files changed, 322 insertions(+), 6 deletions(-) - -diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst -index e4551579cb12..a1dd202efca1 100644 ---- a/Documentation/admin-guide/blockdev/zram.rst -+++ b/Documentation/admin-guide/blockdev/zram.rst -@@ -209,6 +209,7 @@ compact WO trigger memory compaction - debug_stat RO this file is used for zram debugging purposes - backing_dev RW set up backend storage for zram to write out - idle WO mark allocated slot as idle -+merge WO trigger merge identical pages - ====================== ====== =============================================== - - -@@ -267,6 +268,7 @@ line of text and contains the following stats separated by whitespace: - pages_compacted the number of pages freed during compaction - huge_pages the number of incompressible pages - huge_pages_since the number of incompressible pages since zram set up -+ pages_merged the number of identical pages merged into single one - ================ ============================================================= - - File /sys/block/zram/bd_stat -diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c -index e290d6d97047..084f8f830bde 100644 ---- a/drivers/block/zram/zram_drv.c -+++ b/drivers/block/zram/zram_drv.c -@@ -33,12 +33,15 @@ - #include - #include - #include -+#include -+#include - - #include "zram_drv.h" - - static DEFINE_IDR(zram_index_idr); - /* idr index must be protected */ - static DEFINE_MUTEX(zram_index_mutex); -+static DEFINE_MUTEX(zram_rbtree_mutex); - - static int zram_major; - static const char *default_compressor = CONFIG_ZRAM_DEF_COMP; -@@ -57,6 +60,16 @@ static void zram_free_page(struct zram *zram, size_t index); - static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset, struct bio *bio); - -+struct zram_rbtree_node { -+ struct rb_node node; -+ unsigned long key; -+ unsigned long cnt; -+}; -+ -+struct zram_hash_node { -+ unsigned long index; -+ struct hlist_node next; -+}; - - static int zram_slot_trylock(struct zram *zram, u32 index) + * highest_flag_domain - Return highest sched_domain containing flag. + * @cpu: The CPU whose highest level of sched domain is to +@@ -1779,16 +1787,25 @@ queue_balance_callback(struct rq *rq, + * @flag: The flag to check for the highest sched_domain + * for the given CPU. + * +- * Returns the highest sched_domain of a CPU which contains the given flag. ++ * Returns the highest sched_domain of a CPU which contains @flag. If @flag has ++ * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag. 
+ */ + static inline struct sched_domain *highest_flag_domain(int cpu, int flag) { -@@ -1140,7 +1153,7 @@ static ssize_t recomp_algorithm_store(struct device *dev, - while (*args) { - args = next_arg(args, ¶m, &val); + struct sched_domain *sd, *hsd = NULL; -- if (!*val) -+ if (!val || !*val) - return -EINVAL; - - if (!strcmp(param, "algo")) { -@@ -1184,6 +1197,30 @@ static ssize_t compact_store(struct device *dev, - return len; - } - -+static int zram_do_scan(struct zram *zram); -+ -+static ssize_t merge_store(struct device *dev, -+ struct device_attribute *attr, const char *buf, size_t len) -+{ -+ struct zram *zram = dev_to_zram(dev); -+ int ret; -+ -+ down_read(&zram->init_lock); -+ if (!init_done(zram)) { -+ up_read(&zram->init_lock); -+ return -EINVAL; -+ } -+ -+ ret = zram_do_scan(zram); -+ if (ret != 0) { -+ up_read(&zram->init_lock); -+ return -ENOMEM; -+ } -+ -+ up_read(&zram->init_lock); -+ return len; -+} -+ - static ssize_t io_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) - { -@@ -1223,7 +1260,7 @@ static ssize_t mm_stat_show(struct device *dev, - max_used = atomic_long_read(&zram->stats.max_used_pages); - - ret = scnprintf(buf, PAGE_SIZE, -- "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n", -+ "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu %8llu\n", - orig_size << PAGE_SHIFT, - (u64)atomic64_read(&zram->stats.compr_data_size), - mem_used << PAGE_SHIFT, -@@ -1232,7 +1269,8 @@ static ssize_t mm_stat_show(struct device *dev, - (u64)atomic64_read(&zram->stats.same_pages), - atomic_long_read(&pool_stats.pages_compacted), - (u64)atomic64_read(&zram->stats.huge_pages), -- (u64)atomic64_read(&zram->stats.huge_pages_since)); -+ (u64)atomic64_read(&zram->stats.huge_pages_since), -+ (u64)atomic64_read(&zram->stats.pages_merged)); - up_read(&zram->init_lock); - - return ret; -@@ -1283,6 +1321,248 @@ static DEVICE_ATTR_RO(bd_stat); - #endif - static DEVICE_ATTR_RO(debug_stat); - -+static bool zram_rbtree_insert(struct rb_root *root, struct zram_rbtree_node *data) -+{ -+ struct rb_node **new = &(root->rb_node), *parent = NULL; -+ struct zram_rbtree_node *this; -+ -+ while (*new) { -+ this = rb_entry(*new, struct zram_rbtree_node, node); -+ parent = *new; -+ if (data->key < this->key) -+ new = &((*new)->rb_left); -+ else if (data->key > this->key) -+ new = &((*new)->rb_right); -+ else -+ return false; -+ } -+ -+ rb_link_node(&data->node, parent, new); -+ rb_insert_color(&data->node, root); -+ return true; -+} -+ -+static struct zram_rbtree_node *zram_rbtree_search(struct rb_root *root, -+ unsigned long key) -+{ -+ struct rb_node *node = root->rb_node; -+ struct zram_rbtree_node *data; -+ -+ while (node) { -+ data = rb_entry(node, struct zram_rbtree_node, node); -+ if (key < data->key) -+ node = node->rb_left; -+ else if (key > data->key) -+ node = node->rb_right; -+ else -+ return data; -+ } -+ -+ return NULL; -+} -+ -+static unsigned long zram_calc_hash(void *src, size_t len) -+{ -+ return xxhash(src, len, 0); -+} -+ -+static int zram_cmp_obj_and_merge(struct zram *zram, struct hlist_head *htable, -+ size_t htable_size, size_t index) -+{ -+ struct zram_rbtree_node *rb_node; -+ struct zram_hash_node *node; -+ unsigned long handle, cur_handle; -+ size_t obj_size; -+ char *src, *buf; -+ unsigned long hash; -+ int ret = 0; -+ -+ handle = zram_get_handle(zram, index); -+ if (!handle) -+ return ret; -+ -+ obj_size = zram_get_obj_size(zram, index); -+ buf = kmalloc(obj_size, GFP_KERNEL); -+ if (!buf) { -+ pr_err("Failed to allocate zs_map_object buffer\n"); -+ 
return -ENOMEM; -+ } -+ -+ src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); -+ memcpy(buf, src, obj_size); -+ zs_unmap_object(zram->mem_pool, handle); -+ hash = zram_calc_hash(buf, obj_size); -+ -+ mutex_lock(&zram_rbtree_mutex); -+ hlist_for_each_entry(node, &htable[hash % htable_size], next) { -+ int cmp; -+ -+ zram_slot_lock(zram, node->index); + for_each_domain(cpu, sd) { +- if (!(sd->flags & flag)) ++ if (sd->flags & flag) { ++ hsd = sd; ++ continue; ++ } + + /* -+ * Page may change as the hash table is being formed, -+ * so the checks below are necessary. ++ * Stop the search if @flag is known to be shared at lower ++ * levels. It will not be found further up. + */ -+ cur_handle = zram_get_handle(zram, node->index); -+ if (handle == cur_handle || -+ obj_size != zram_get_obj_size(zram, node->index)) { -+ zram_slot_unlock(zram, node->index); -+ continue; -+ } -+ -+ src = zs_map_object(zram->mem_pool, cur_handle, ZS_MM_RO); -+ cmp = memcmp(buf, src, obj_size); -+ zs_unmap_object(zram->mem_pool, cur_handle); -+ -+ if (!cmp) { -+ rb_node = zram_rbtree_search(&zram->sph_rbtree, handle); -+ -+ /* -+ * This check is necessary in order not to zs_free an object -+ * that someone already refers to. This situation is possible -+ * when with repeated calls to zram_do_scan(). For example: -+ * -+ * [slot0] [slot1] [slot2] [slot3] [slot4] -+ * [obj0] [obj1] [obj2] [obj3] [obj4] -+ * -+ * Let's imagine that obj2 and obj3 are equal, and we called -+ * zram_do_scan() function: -+ * -+ * [slot0] [slot1] [slot2] [slot3] [slot4] -+ * [obj0] [obj1] [obj2] [obj2] [obj4] -+ * -+ * Now, slot2 and slot3 refers to obj2 zsmalloc object. -+ * Time passed, now slot0 refres to obj0_n, which is equal -+ * to obj2: -+ * -+ * [slot0] [slot1] [slot2] [slot3] [slot4] -+ * [obj0_n] [obj1] [obj2] [obj2] [obj4] -+ * -+ * Now we call zram_do_scan() function again. We get to slot2, -+ * and we understand that obj2 and obj0_n hashes are the same. We -+ * try to zs_free(obj2), but slot3 also already refers to it. -+ * -+ * This is not correct! 
-+ */ -+ if (unlikely(rb_node)) -+ if (rb_node->cnt > 1) { -+ zram_slot_unlock(zram, node->index); -+ continue; -+ } -+ -+ zram_set_handle(zram, index, cur_handle); -+ zs_free(zram->mem_pool, handle); -+ -+ rb_node = zram_rbtree_search(&zram->sph_rbtree, cur_handle); -+ -+ if (!rb_node) { -+ rb_node = kzalloc(sizeof(struct zram_rbtree_node), -+ GFP_KERNEL); -+ if (!rb_node) { -+ pr_err("Failed to allocate rb_node\n"); -+ ret = -ENOMEM; -+ zram_slot_unlock(zram, node->index); -+ mutex_unlock(&zram_rbtree_mutex); -+ goto merged_or_err; -+ } -+ -+ rb_node->key = cur_handle; -+ /* Two slots refers to an zsmalloc object with cur_handle key */ -+ rb_node->cnt = 2; -+ zram_rbtree_insert(&zram->sph_rbtree, rb_node); -+ } else { -+ rb_node->cnt++; -+ } -+ -+ atomic64_inc(&zram->stats.pages_merged); -+ atomic64_sub(obj_size, &zram->stats.compr_data_size); -+ zram_set_flag(zram, index, ZRAM_MERGED); -+ zram_set_flag(zram, node->index, ZRAM_MERGED); -+ -+ zram_slot_unlock(zram, node->index); -+ mutex_unlock(&zram_rbtree_mutex); -+ goto merged_or_err; -+ } -+ -+ zram_slot_unlock(zram, node->index); -+ } -+ -+ mutex_unlock(&zram_rbtree_mutex); -+ -+ node = kmalloc(sizeof(struct zram_hash_node), GFP_KERNEL); -+ if (!node) { -+ ret = -ENOMEM; -+ goto merged_or_err; -+ } -+ -+ node->index = index; -+ hlist_add_head(&node->next, &htable[hash % htable_size]); -+ -+merged_or_err: -+ kfree(buf); -+ return ret; -+} -+ -+static void zram_free_htable_entries(struct hlist_head *htable, -+ size_t htable_size) -+{ -+ struct hlist_node *n; -+ struct zram_hash_node *node; -+ -+ hlist_for_each_entry_safe(node, n, htable, next) { -+ hlist_del(&node->next); -+ kfree(node); -+ } -+} -+ -+static int zram_do_scan(struct zram *zram) -+{ -+ size_t num_pages = zram->disksize >> PAGE_SHIFT; -+ size_t htable_size = num_pages; -+ size_t index; -+ struct hlist_head *htable; -+ int i, ret = 0; -+ -+ htable = vzalloc(htable_size * sizeof(struct hlist_head)); -+ if (!htable) { -+ pr_err("Failed to allocate hash table\n"); -+ return -ENOMEM; -+ } -+ -+ for (i = 0; i < htable_size; i++) -+ INIT_HLIST_HEAD(&htable[i]); -+ -+ for (index = 0; index < num_pages; index++) { -+ zram_slot_lock(zram, index); -+ -+ if (!zram_allocated(zram, index)) { -+ zram_slot_unlock(zram, index); -+ continue; -+ } -+ -+ if (zram_test_flag(zram, index, ZRAM_UNDER_WB) || -+ zram_test_flag(zram, index, ZRAM_WB) || -+ zram_test_flag(zram, index, ZRAM_SAME)) { -+ zram_slot_unlock(zram, index); -+ continue; -+ } -+ -+ /* Ignore pages that have been recompressed */ -+ if (zram_get_priority(zram, index) != 0) -+ continue; -+ -+ ret = zram_cmp_obj_and_merge(zram, htable, htable_size, index); -+ zram_slot_unlock(zram, index); -+ if (ret != 0) -+ goto out; -+ } -+ -+out: -+ zram_free_htable_entries(htable, htable_size); -+ vfree(htable); -+ return ret; -+} -+ - static void zram_meta_free(struct zram *zram, u64 disksize) - { - size_t num_pages = disksize >> PAGE_SHIFT; -@@ -1324,6 +1604,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) - static void zram_free_page(struct zram *zram, size_t index) - { - unsigned long handle; -+ struct zram_rbtree_node *node; ++ if (flag & SD_SHARED_CHILD_MASK) + break; +- hsd = sd; + } - #ifdef CONFIG_ZRAM_MEMORY_TRACKING - zram->table[index].ac_time = 0; -@@ -1361,7 +1642,28 @@ static void zram_free_page(struct zram *zram, size_t index) - if (!handle) - return; - -- zs_free(zram->mem_pool, handle); -+ if (zram_test_flag(zram, index, ZRAM_MERGED)) { -+ zram_clear_flag(zram, index, ZRAM_MERGED); -+ 
mutex_lock(&zram_rbtree_mutex); -+ -+ node = zram_rbtree_search(&zram->sph_rbtree, handle); -+ BUG_ON(!node); -+ -+ node->cnt--; -+ if (node->cnt == 0) { -+ rb_erase(&node->node, &zram->sph_rbtree); -+ mutex_unlock(&zram_rbtree_mutex); -+ -+ zs_free(zram->mem_pool, handle); -+ kfree(node); -+ } else { -+ mutex_unlock(&zram_rbtree_mutex); -+ } -+ -+ atomic64_dec(&zram->stats.pages_merged); -+ } else { -+ zs_free(zram->mem_pool, handle); -+ } - - atomic64_sub(zram_get_obj_size(zram, index), - &zram->stats.compr_data_size); -@@ -1824,7 +2126,7 @@ static ssize_t recompress_store(struct device *dev, - while (*args) { - args = next_arg(args, ¶m, &val); - -- if (!*val) -+ if (!val || !*val) - return -EINVAL; - - if (!strcmp(param, "type")) { -@@ -1909,7 +2211,8 @@ static ssize_t recompress_store(struct device *dev, - if (zram_test_flag(zram, index, ZRAM_WB) || - zram_test_flag(zram, index, ZRAM_UNDER_WB) || - zram_test_flag(zram, index, ZRAM_SAME) || -- zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) -+ zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE) || -+ zram_test_flag(zram, index, ZRAM_MERGED)) - goto next; - - err = zram_recompress(zram, index, page, threshold, -@@ -2295,6 +2598,7 @@ static const struct block_device_operations zram_devops = { - }; - - static DEVICE_ATTR_WO(compact); -+static DEVICE_ATTR_WO(merge); - static DEVICE_ATTR_RW(disksize); - static DEVICE_ATTR_RO(initstate); - static DEVICE_ATTR_WO(reset); -@@ -2335,6 +2639,7 @@ static struct attribute *zram_disk_attrs[] = { - #ifdef CONFIG_ZRAM_WRITEBACK - &dev_attr_bd_stat.attr, - #endif -+ &dev_attr_merge.attr, - &dev_attr_debug_stat.attr, - #ifdef CONFIG_ZRAM_MULTI_COMP - &dev_attr_recomp_algorithm.attr, -@@ -2421,6 +2726,8 @@ static int zram_add(void) - - comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); - -+ zram->sph_rbtree = RB_ROOT; -+ - zram_debugfs_register(zram); - pr_info("Added device: %s\n", zram->disk->disk_name); - return device_id; -diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h -index c5254626f051..2afdbf76a1aa 100644 ---- a/drivers/block/zram/zram_drv.h -+++ b/drivers/block/zram/zram_drv.h -@@ -56,6 +56,7 @@ enum zram_pageflags { - - ZRAM_COMP_PRIORITY_BIT1, /* First bit of comp priority index */ - ZRAM_COMP_PRIORITY_BIT2, /* Second bit of comp priority index */ -+ ZRAM_MERGED, /* page was merged */ - - __NR_ZRAM_PAGEFLAGS, - }; -@@ -87,6 +88,7 @@ struct zram_stats { - atomic_long_t max_used_pages; /* no. of maximum pages stored */ - atomic64_t writestall; /* no. of write slow paths */ - atomic64_t miss_free; /* no. of missed free */ -+ atomic64_t pages_merged; /* no. of pages, which merged into single one */ - #ifdef CONFIG_ZRAM_WRITEBACK - atomic64_t bd_count; /* no. of pages in backing device */ - atomic64_t bd_reads; /* no. 
of reads from backing device */ -@@ -140,5 +142,10 @@ struct zram { - #ifdef CONFIG_ZRAM_MEMORY_TRACKING - struct dentry *debugfs_dir; - #endif -+ /* -+ * This is same pages handle's rb tree, where the key is a handle -+ * to same pages and the value is a link counter -+ */ -+ struct rb_root sph_rbtree; - }; - #endif + return hsd; -- -2.40.0.rc2 +2.40.0 -From 02b507dfef3f09d3de2785ed80164e15c8ed7844 Mon Sep 17 00:00:00 2001 +From 6c867f735d5efe4f7df3cc9cf96dc0928914c438 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Tue, 14 Feb 2023 22:02:09 +0100 -Subject: [PATCH 15/16] zstd import v1.5.4 +Date: Sun, 9 Apr 2023 21:26:12 +0200 +Subject: [PATCH 10/10] zstd Signed-off-by: Peter Jung --- include/linux/zstd.h | 2 +- include/linux/zstd_errors.h | 23 +- - include/linux/zstd_lib.h | 569 +++++-- + include/linux/zstd_lib.h | 703 +++++-- + kernel/module/decompress.c | 2 +- lib/zstd/Makefile | 2 +- - lib/zstd/common/bits.h | 124 ++ - lib/zstd/common/bitstream.h | 51 +- + lib/zstd/common/allocations.h | 56 + + lib/zstd/common/bits.h | 149 ++ + lib/zstd/common/bitstream.h | 53 +- lib/zstd/common/compiler.h | 14 +- lib/zstd/common/cpu.h | 3 +- lib/zstd/common/debug.c | 3 +- @@ -35612,46 +13283,47 @@ Signed-off-by: Peter Jung lib/zstd/common/huf.h | 222 +-- lib/zstd/common/mem.h | 2 +- lib/zstd/common/portability_macros.h | 26 +- - lib/zstd/common/zstd_common.c | 3 +- + lib/zstd/common/zstd_common.c | 38 +- lib/zstd/common/zstd_deps.h | 2 +- - lib/zstd/common/zstd_internal.h | 94 +- + lib/zstd/common/zstd_internal.h | 99 +- lib/zstd/compress/clevels.h | 3 +- lib/zstd/compress/fse_compress.c | 59 +- lib/zstd/compress/hist.c | 3 +- lib/zstd/compress/hist.h | 3 +- lib/zstd/compress/huf_compress.c | 372 ++-- - lib/zstd/compress/zstd_compress.c | 1491 ++++++++++++----- - lib/zstd/compress/zstd_compress_internal.h | 267 +-- + lib/zstd/compress/zstd_compress.c | 1762 ++++++++++++----- + lib/zstd/compress/zstd_compress_internal.h | 333 +++- lib/zstd/compress/zstd_compress_literals.c | 155 +- lib/zstd/compress/zstd_compress_literals.h | 25 +- lib/zstd/compress/zstd_compress_sequences.c | 7 +- lib/zstd/compress/zstd_compress_sequences.h | 3 +- lib/zstd/compress/zstd_compress_superblock.c | 47 +- lib/zstd/compress/zstd_compress_superblock.h | 3 +- - lib/zstd/compress/zstd_cwksp.h | 5 +- + lib/zstd/compress/zstd_cwksp.h | 149 +- lib/zstd/compress/zstd_double_fast.c | 129 +- lib/zstd/compress/zstd_double_fast.h | 6 +- - lib/zstd/compress/zstd_fast.c | 582 +++++-- + lib/zstd/compress/zstd_fast.c | 582 ++++-- lib/zstd/compress/zstd_fast.h | 6 +- - lib/zstd/compress/zstd_lazy.c | 364 ++-- + lib/zstd/compress/zstd_lazy.c | 518 ++--- lib/zstd/compress/zstd_lazy.h | 7 +- lib/zstd/compress/zstd_ldm.c | 11 +- lib/zstd/compress/zstd_ldm.h | 3 +- lib/zstd/compress/zstd_ldm_geartab.h | 3 +- - lib/zstd/compress/zstd_opt.c | 185 +- + lib/zstd/compress/zstd_opt.c | 187 +- lib/zstd/compress/zstd_opt.h | 3 +- - lib/zstd/decompress/huf_decompress.c | 731 ++++---- - lib/zstd/decompress/zstd_ddict.c | 8 +- + lib/zstd/decompress/huf_decompress.c | 731 ++++--- + lib/zstd/decompress/zstd_ddict.c | 9 +- lib/zstd/decompress/zstd_ddict.h | 3 +- - lib/zstd/decompress/zstd_decompress.c | 215 ++- - lib/zstd/decompress/zstd_decompress_block.c | 252 ++- - lib/zstd/decompress/zstd_decompress_block.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 269 ++- + lib/zstd/decompress/zstd_decompress_block.c | 283 ++- + lib/zstd/decompress/zstd_decompress_block.h | 8 +- .../decompress/zstd_decompress_internal.h | 7 +- lib/zstd/decompress_sources.h | 2 
+- - lib/zstd/zstd_common_module.c | 2 +- + lib/zstd/zstd_common_module.c | 5 +- lib/zstd/zstd_compress_module.c | 2 +- lib/zstd/zstd_decompress_module.c | 4 +- - 57 files changed, 4086 insertions(+), 2268 deletions(-) + 59 files changed, 4732 insertions(+), 2612 deletions(-) + create mode 100644 lib/zstd/common/allocations.h create mode 100644 lib/zstd/common/bits.h diff --git a/include/linux/zstd.h b/include/linux/zstd.h @@ -35733,7 +13405,7 @@ index 58b6dd45a969..6d5cf55f0bf3 100644 } ZSTD_ErrorCode; diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h -index 79d55465d5c1..dc7e9605a624 100644 +index 79d55465d5c1..738fe8ea4ead 100644 --- a/include/linux/zstd_lib.h +++ b/include/linux/zstd_lib.h @@ -1,5 +1,6 @@ @@ -35796,11 +13468,21 @@ index 79d55465d5c1..dc7e9605a624 100644 #define ZSTD_VERSION_MAJOR 1 #define ZSTD_VERSION_MINOR 5 -#define ZSTD_VERSION_RELEASE 2 -+#define ZSTD_VERSION_RELEASE 4 ++#define ZSTD_VERSION_RELEASE 5 #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) /*! ZSTD_versionNumber() : -@@ -156,7 +176,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t +@@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionString(void); + ***************************************/ + /*! ZSTD_compress() : + * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, +@@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t * "empty", "unknown" and "error" results to the same return value (0), * while ZSTD_getFrameContentSize() gives them separate return values. * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ @@ -35811,7 +13493,7 @@ index 79d55465d5c1..dc7e9605a624 100644 /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ * `src` should point to the start of a ZSTD frame or skippable frame. -@@ -168,8 +190,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) +@@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) /*====== Helper functions ======*/ @@ -35844,7 +13526,7 @@ index 79d55465d5c1..dc7e9605a624 100644 ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ -@@ -412,6 +456,9 @@ typedef enum { +@@ -412,6 +457,9 @@ typedef enum { * ZSTD_c_validateSequences * ZSTD_c_useBlockSplitter * ZSTD_c_useRowMatchFinder @@ -35854,7 +13536,7 @@ index 79d55465d5c1..dc7e9605a624 100644 * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. * note : never ever use experimentalParam? names directly; * also, the enums values themselves are unstable and can still change. 
-@@ -430,7 +477,11 @@ typedef enum { +@@ -430,7 +478,11 @@ typedef enum { ZSTD_c_experimentalParam12=1009, ZSTD_c_experimentalParam13=1010, ZSTD_c_experimentalParam14=1011, @@ -35867,7 +13549,7 @@ index 79d55465d5c1..dc7e9605a624 100644 } ZSTD_cParameter; typedef struct { -@@ -493,7 +544,7 @@ typedef enum { +@@ -493,7 +545,7 @@ typedef enum { * They will be used to compress next frame. * Resetting session never fails. * - The parameters : changes all parameters back to "default". @@ -35876,7 +13558,17 @@ index 79d55465d5c1..dc7e9605a624 100644 * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) * - Both : similar to resetting the session, followed by resetting parameters. -@@ -543,13 +594,15 @@ typedef enum { +@@ -506,7 +558,8 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data, though it is possible it fails for other reasons. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +@@ -543,13 +596,15 @@ typedef enum { * ZSTD_d_stableOutBuffer * ZSTD_d_forceIgnoreChecksum * ZSTD_d_refMultipleDDicts @@ -35893,7 +13585,7 @@ index 79d55465d5c1..dc7e9605a624 100644 } ZSTD_dParameter; -@@ -728,8 +781,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output +@@ -728,8 +783,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output * This following is a legacy streaming API, available since v1.0+ . * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). * It is redundant, but remains fully supported. @@ -35902,7 +13594,7 @@ index 79d55465d5c1..dc7e9605a624 100644 ******************************************************************************/ /*! -@@ -738,6 +789,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output +@@ -738,6 +791,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); @@ -35912,7 +13604,7 @@ index 79d55465d5c1..dc7e9605a624 100644 */ ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); /*! 
-@@ -788,13 +842,31 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer +@@ -788,13 +844,31 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer /*===== Streaming decompression functions =====*/ @@ -35945,7 +13637,7 @@ index 79d55465d5c1..dc7e9605a624 100644 ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ -@@ -913,7 +985,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); +@@ -913,7 +987,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); * If @return == 0, the dictID could not be decoded. * This could for one of the following reasons : * - The frame does not require a dictionary to be decoded (most common case). @@ -35954,7 +13646,22 @@ index 79d55465d5c1..dc7e9605a624 100644 * Note : this use case also happens when using a non-conformant dictionary. * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). * - This is not a Zstandard frame. -@@ -937,8 +1009,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); +@@ -925,9 +999,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Advanced dictionary and prefix API (Requires v1.4.0+) + * + * This API allows dictionaries to be used with ZSTD_compress2(), +- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and +- * only reset with the context is reset with ZSTD_reset_parameters or +- * ZSTD_reset_session_and_parameters. Prefixes are single-use. ++ * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). ++ * Dictionaries are sticky, they remain valid when same context is re-used, ++ * they only reset when the context is reset ++ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. ++ * In contrast, Prefixes are single-use. + ******************************************************************************/ + + +@@ -937,8 +1013,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); * @result : 0, or an error code (which can be tested with ZSTD_isError()). * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, * meaning "return to no-dictionary mode". @@ -35966,7 +13673,16 @@ index 79d55465d5c1..dc7e9605a624 100644 * Note 2 : Loading a dictionary involves building tables. * It's also a CPU consuming operation, with non-negligible impact on latency. * Tables are dependent on compression parameters, and for this reason, -@@ -951,7 +1024,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); +@@ -947,11 +1024,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() +- * to precisely select how dictionary content must be interpreted. */ ++ * to precisely select how dictionary content must be interpreted. ++ * Note 5 : This method does not benefit from LDM (long distance mode). ++ * If you want to employ LDM on some large dictionary content, ++ * prefer employing ZSTD_CCtx_refPrefix() described below. 
++ */ ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ @@ -35975,7 +13691,15 @@ index 79d55465d5c1..dc7e9605a624 100644 * Note that compression parameters are enforced from within CDict, * and supersede any compression parameter previously set within CCtx. * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. -@@ -986,9 +1059,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, +@@ -970,6 +1051,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). ++ * This method is compatible with LDM (long distance mode). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. +@@ -986,9 +1068,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize); /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ @@ -35988,7 +13712,7 @@ index 79d55465d5c1..dc7e9605a624 100644 * @result : 0, or an error code (which can be tested with ZSTD_isError()). * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, * meaning "return to no-dictionary mode". -@@ -1012,9 +1085,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s +@@ -1012,9 +1094,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s * The memory for the table is allocated on the first call to refDDict, and can be * freed with ZSTD_freeDCtx(). * @@ -36001,7 +13725,7 @@ index 79d55465d5c1..dc7e9605a624 100644 * Special: referencing a NULL DDict means "return to no-dictionary mode". * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. */ -@@ -1071,24 +1145,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); +@@ -1071,24 +1154,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE #endif @@ -36026,7 +13750,7 @@ index 79d55465d5c1..dc7e9605a624 100644 /* ************************************************************************************** * experimental API (static linking only) **************************************************************************************** -@@ -1123,6 +1179,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); +@@ -1123,6 +1188,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ #define ZSTD_STRATEGY_MIN ZSTD_fast #define ZSTD_STRATEGY_MAX ZSTD_btultra2 @@ -36034,10 +13758,43 @@ index 79d55465d5c1..dc7e9605a624 100644 #define ZSTD_OVERLAPLOG_MIN 0 -@@ -1350,29 +1407,85 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size +@@ -1303,7 +1369,7 @@ typedef enum { + } ZSTD_paramSwitch_e; + + /* ************************************* +-* Frame size functions ++* Frame header and size functions + ***************************************/ + + /*! 
ZSTD_findDecompressedSize() : +@@ -1350,29 +1416,109 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size * or an error code (if srcSize is too small) */ ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); ++typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; ++typedef struct { ++ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ ++ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ ++ unsigned blockSizeMax; ++ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ ++ unsigned headerSize; ++ unsigned dictID; ++ unsigned checksumFlag; ++ unsigned _reserved1; ++ unsigned _reserved2; ++} ZSTD_frameHeader; ++ ++/*! ZSTD_getFrameHeader() : ++ * decode Frame Header, or requires larger `srcSize`. ++ * @return : 0, `zfhPtr` is correctly filled, ++ * >0, `srcSize` is too small, value is wanted `srcSize` amount, ++ * or an error code, which can be tested using ZSTD_isError() */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ ++/*! ZSTD_getFrameHeader_advanced() : ++ * same as ZSTD_getFrameHeader(), ++ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); ++ +/*! ZSTD_decompressionMargin() : + * Zstd supports in-place decompression, where the input and output buffers overlap. + * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, @@ -36125,7 +13882,7 @@ index 79d55465d5c1..dc7e9605a624 100644 /*! ZSTD_mergeBlockDelimiters() : * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals -@@ -1388,7 +1501,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o +@@ -1388,7 +1534,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); /*! ZSTD_compressSequences() : @@ -36136,7 +13893,7 @@ index 79d55465d5c1..dc7e9605a624 100644 * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) * The entire source is compressed into a single frame. * -@@ -1413,11 +1528,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si +@@ -1413,11 +1561,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, * and cannot emit an RLE block that disagrees with the repcode history @@ -36153,7 +13910,7 @@ index 79d55465d5c1..dc7e9605a624 100644 /*! ZSTD_writeSkippableFrame() : -@@ -1481,8 +1597,11 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); +@@ -1481,8 +1630,11 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). 
* Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. * @@ -36166,7 +13923,7 @@ index 79d55465d5c1..dc7e9605a624 100644 */ ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); -@@ -1501,7 +1620,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); +@@ -1501,7 +1653,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), * an internal ?Dict will be created, which additional size is not estimated here. @@ -36180,16 +13937,30 @@ index 79d55465d5c1..dc7e9605a624 100644 ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); -@@ -1649,22 +1773,31 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); +@@ -1649,22 +1806,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); * This function never fails (wide contract) */ ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); +/*! ZSTD_CCtx_setCParams() : -+ * Set all parameters provided within @cparams into the working @cctx. ++ * Set all parameters provided within @p cparams into the working @p cctx. + * Note : if modifying parameters during compression (MT mode only), + * note that changes to the .windowLog parameter will be ignored. -+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */ ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ * On failure, no parameters are updated. ++ */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); ++ ++/*! ZSTD_CCtx_setFParams() : ++ * Set all parameters provided within @p fparams into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); ++ ++/*! ZSTD_CCtx_setParams() : ++ * Set all parameters provided within @p params into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); + /*! ZSTD_compress_advanced() : * Note : this function is now DEPRECATED. @@ -36216,7 +13987,7 @@ index 79d55465d5c1..dc7e9605a624 100644 size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, -@@ -1808,13 +1941,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo +@@ -1808,13 +1988,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo * Experimental parameter. * Default is 0 == disabled. Set to 1 to enable. 
* @@ -36240,7 +14011,7 @@ index 79d55465d5c1..dc7e9605a624 100644 * * When this flag is enabled zstd won't allocate an input window buffer, * because the user guarantees it can reference the ZSTD_inBuffer until -@@ -1822,18 +1958,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo +@@ -1822,18 +2005,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also * avoid the memcpy() from the input buffer to the input window buffer. * @@ -36264,7 +14035,7 @@ index 79d55465d5c1..dc7e9605a624 100644 */ #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 -@@ -1878,7 +2011,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo +@@ -1878,7 +2058,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo * Without validation, providing a sequence that does not conform to the zstd spec will cause * undefined behavior, and may produce a corrupted block. * @@ -36273,7 +14044,7 @@ index 79d55465d5c1..dc7e9605a624 100644 * specifics regarding offset/matchlength requirements) then the function will bail out and * return an error. * -@@ -1928,6 +2061,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo +@@ -1928,6 +2108,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo */ #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 @@ -36321,7 +14092,7 @@ index 79d55465d5c1..dc7e9605a624 100644 + * This parameter can be used to set an upper bound on the blocksize + * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper + * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make -+ * compressBound() innacurate). Only currently meant to be used for testing. ++ * compressBound() inaccurate). Only currently meant to be used for testing. + * + */ +#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 @@ -36353,7 +14124,7 @@ index 79d55465d5c1..dc7e9605a624 100644 /*! ZSTD_CCtx_getParameter() : * Get the requested compression parameter value, selected by enum ZSTD_cParameter, * and store it into int* value. -@@ -2084,7 +2290,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete +@@ -2084,7 +2337,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete * in the range [dst, dst + pos) MUST not be modified during decompression * or you will get data corruption. * @@ -36362,7 +14133,7 @@ index 79d55465d5c1..dc7e9605a624 100644 * it can write directly to the ZSTD_outBuffer, but it will still allocate * an input buffer large enough to fit any compressed block. This will also * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. -@@ -2137,6 +2343,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete +@@ -2137,6 +2390,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete */ #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 @@ -36380,7 +14151,7 @@ index 79d55465d5c1..dc7e9605a624 100644 /*! ZSTD_DCtx_setFormat() : * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). -@@ -2145,6 +2362,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete +@@ -2145,6 +2409,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete * such ZSTD_f_zstd1_magicless for example. 
* @return : 0, or an error code (which can be tested using ZSTD_isError()). */ ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") @@ -36388,7 +14159,7 @@ index 79d55465d5c1..dc7e9605a624 100644 size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); /*! ZSTD_decompressStream_simpleArgs() : -@@ -2181,6 +2399,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( +@@ -2181,6 +2446,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") @@ -36396,7 +14167,7 @@ index 79d55465d5c1..dc7e9605a624 100644 size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize); -@@ -2198,6 +2417,7 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, +@@ -2198,17 +2464,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") @@ -36404,7 +14175,20 @@ index 79d55465d5c1..dc7e9605a624 100644 size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); -@@ -2218,6 +2438,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + + /*! ZSTD_initCStream_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd parameter and leave the rest as-is. +- * for ((param, value) : params) { +- * ZSTD_CCtx_setParameter(zcs, param, value); +- * } ++ * ZSTD_CCtx_setParams(zcs, params); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * +@@ -2218,6 +2482,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") @@ -36412,7 +14196,7 @@ index 79d55465d5c1..dc7e9605a624 100644 size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize, ZSTD_parameters params, -@@ -2232,6 +2453,7 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, +@@ -2232,15 +2497,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") @@ -36420,7 +14204,18 @@ index 79d55465d5c1..dc7e9605a624 100644 size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); /*! ZSTD_initCStream_usingCDict_advanced() : -@@ -2250,6 +2472,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. +- * for ((fParam, value) : fParams) { +- * ZSTD_CCtx_setParameter(zcs, fParam, value); +- * } ++ * ZSTD_CCtx_setFParams(zcs, fParams); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * +@@ -2250,6 +2513,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); * This prototype will generate compilation warnings. 
*/ ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") @@ -36428,7 +14223,7 @@ index 79d55465d5c1..dc7e9605a624 100644 size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams, -@@ -2274,6 +2497,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, +@@ -2274,6 +2538,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") @@ -36436,7 +14231,7 @@ index 79d55465d5c1..dc7e9605a624 100644 size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); -@@ -2319,8 +2543,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); +@@ -2319,8 +2584,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); * * note: no dictionary will be used if dict == NULL or dictSize < 8 @@ -36446,7 +14241,7 @@ index 79d55465d5c1..dc7e9605a624 100644 ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); /*! -@@ -2330,20 +2554,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo +@@ -2330,27 +2595,185 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo * ZSTD_DCtx_refDDict(zds, ddict); * * note : ddict is referenced, it must outlive decompression session @@ -36464,98 +14259,7 @@ index 79d55465d5c1..dc7e9605a624 100644 - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x - */ -ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); -- - - /* ******************************************************************* - * Buffer-less and synchronous inner streaming functions -@@ -2362,7 +2576,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); - - Start by initializing a context. - Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. -- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() - - Then, consume your input using ZSTD_compressContinue(). - There are some important considerations to keep in mind when using this advanced function : -@@ -2387,15 +2600,20 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); - ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); - ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); - ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ --ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + -+ZSTD_DEPRECATED("This function will likely be removed in a future release. 
It is misleading and has very limited utility.") -+ZSTDLIB_STATIC_API -+size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ - - ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); - ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); - - /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ - ZSTD_DEPRECATED("use advanced API to access custom parameters") -+ZSTDLIB_STATIC_API - size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ - ZSTD_DEPRECATED("use advanced API to access custom parameters") -+ZSTDLIB_STATIC_API - size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ - /* - Buffer-less streaming decompression (synchronous mode) -@@ -2408,8 +2626,8 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ - Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. - Data fragment must be large enough to ensure successful decoding. - `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. -- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. -- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. -+ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. -+ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. - errorCode, which can be tested using ZSTD_isError(). - - It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, -@@ -2428,7 +2646,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ - - The most memory efficient way is to use a round buffer of sufficient size. - Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), -- which can @return an error code if required value is too large for current system (in 32-bits mode). -+ which can return an error code if required value is too large for current system (in 32-bits mode). - In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, - up to the moment there is not enough room left in the buffer to guarantee decoding another full block, - which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. -@@ -2448,7 +2666,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ - ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). - ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. - -- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). 
-+ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). - It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. - It can also be an error code, which can be tested with ZSTD_isError(). - -@@ -2480,6 +2698,8 @@ typedef struct { - unsigned headerSize; - unsigned dictID; - unsigned checksumFlag; -+ unsigned _reserved1; -+ unsigned _reserved2; - } ZSTD_frameHeader; - - /*! ZSTD_getFrameHeader() : -@@ -2502,6 +2722,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); - ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); - - /* misc */ -+ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") - ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); - typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; - ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); -@@ -2524,7 +2745,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); - - It is necessary to init context before starting - + compression : any ZSTD_compressBegin*() variant, including with dictionary - + decompression : any ZSTD_decompressBegin*() variant, including with dictionary -- + copyCCtx() and copyDCtx() can be used too - - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB - + If input is larger than a block size, it's necessary to split input data into multiple blocks - + For inputs larger than a single block, consider using regular ZSTD_compress() instead. -@@ -2547,5 +2767,166 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_ - ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ - - +/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* + * + * *** OVERVIEW *** @@ -36716,9 +14420,194 @@ index 79d55465d5c1..dc7e9605a624 100644 + void* sequenceProducerState, + ZSTD_sequenceProducer_F* sequenceProducer +); + + + /* ******************************************************************* +-* Buffer-less and synchronous inner streaming functions ++* Buffer-less and synchronous inner streaming functions (DEPRECATED) + * +-* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +-* But it's also a complex one, with several restrictions, documented below. +-* Prefer normal streaming API for an easier experience. ++* This API is deprecated, and will be removed in a future version. ++* It allows streaming (de)compression with user allocated buffers. ++* However, it is hard to use, and not as well tested as the rest of ++* our API. ++* ++* Please use the normal streaming API instead: ZSTD_compressStream2, ++* and ZSTD_decompressStream. ++* If there is functionality that you need, but it doesn't provide, ++* please open an issue on our GitHub. + ********************************************************************* */ + + /* +@@ -2362,7 +2785,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + Start by initializing a context. 
+ Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. +- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : +@@ -2384,18 +2806,28 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + */ + + /*===== Buffer-less streaming compression functions =====*/ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + ++ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") ++ZSTDLIB_STATIC_API ++size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ + /* + Buffer-less streaming decompression (synchronous mode) +@@ -2408,8 +2840,8 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. 
+- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. +- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. ++ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. ++ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, +@@ -2428,7 +2860,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), +- which can @return an error code if required value is too large for current system (in 32-bits mode). ++ which can return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. +@@ -2448,7 +2880,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + +- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). ++ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + +@@ -2471,27 +2903,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + */ + + /*===== Buffer-less streaming decompression functions =====*/ +-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +-typedef struct { +- unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ +- unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ +- unsigned blockSizeMax; +- ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ +- unsigned headerSize; +- unsigned dictID; +- unsigned checksumFlag; +-} ZSTD_frameHeader; + +-/*! ZSTD_getFrameHeader() : +- * decode Frame Header, or requires larger `srcSize`. +- * @return : 0, `zfhPtr` is correctly filled, +- * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ +-/*! 
ZSTD_getFrameHeader_advanced() : +- * same as ZSTD_getFrameHeader(), +- * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + + ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +@@ -2502,6 +2914,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); + ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* misc */ ++ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") + ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); + typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; + ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +@@ -2509,11 +2922,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + +-/* ============================ */ +-/* Block level API */ +-/* ============================ */ ++/* ========================================= */ ++/* Block level API (DEPRECATED) */ ++/* ========================================= */ + + /*! ++ ++ This API is deprecated in favor of the regular compression API. ++ You can get the frame header down to 2 bytes by setting: ++ - ZSTD_c_format = ZSTD_f_zstd1_magicless ++ - ZSTD_c_contentSizeFlag = 0 ++ - ZSTD_c_checksumFlag = 0 ++ - ZSTD_c_dictIDFlag = 0 ++ ++ This API is not as well tested as our normal API, so we recommend not using it. ++ We will be removing it in a future version. If the normal API doesn't provide ++ the functionality you need, please open a GitHub issue. ++ + Block functions produce and decode raw zstd blocks, without frame metadata. + Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. +@@ -2524,7 +2949,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary +- + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. +@@ -2541,11 +2965,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + */ + + /*===== Raw zstd block functions =====*/ ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. 
See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + +- #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ +diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c +index bb79ac1a6d8f..7ddc87bee274 100644 +--- a/kernel/module/decompress.c ++++ b/kernel/module/decompress.c +@@ -267,7 +267,7 @@ static ssize_t module_zstd_decompress(struct load_info *info, + zstd_dec.size = PAGE_SIZE; + + ret = zstd_decompress_stream(dstream, &zstd_dec, &zstd_buf); +- kunmap(page); ++ kunmap_local(zstd_dec.dst); + retval = zstd_get_error_code(ret); + if (retval) + break; diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile index 20f08c644b71..464c410b2768 100644 --- a/lib/zstd/Makefile @@ -36731,12 +14620,74 @@ index 20f08c644b71..464c410b2768 100644 # All rights reserved. # # This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h +new file mode 100644 +index 000000000000..05adbbeccaa9 +--- /dev/null ++++ b/lib/zstd/common/allocations.h +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */ ++ ++/* This file provides custom allocation primitives ++ */ ++ ++#define ZSTD_DEPS_NEED_MALLOC ++#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ ++ ++#include "mem.h" /* MEM_STATIC */ ++#define ZSTD_STATIC_LINKING_ONLY ++#include /* ZSTD_customMem */ ++ ++#ifndef ZSTD_ALLOCATIONS_H ++#define ZSTD_ALLOCATIONS_H ++ ++/* custom memory allocation functions */ ++ ++MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) ++ return customMem.customAlloc(customMem.opaque, size); ++ return ZSTD_malloc(size); ++} ++ ++MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) { ++ /* calloc implemented as malloc+memset; ++ * not as efficient as calloc, but next best guess for custom malloc */ ++ void* const ptr = customMem.customAlloc(customMem.opaque, size); ++ ZSTD_memset(ptr, 0, size); ++ return ptr; ++ } ++ return ZSTD_calloc(1, size); ++} ++ ++MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) ++{ ++ if (ptr!=NULL) { ++ if (customMem.customFree) ++ customMem.customFree(customMem.opaque, ptr); ++ else ++ ZSTD_free(ptr); ++ } ++} ++ ++#endif /* ZSTD_ALLOCATIONS_H */ diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h new file mode 100644 -index 000000000000..bb7967def569 +index 000000000000..aa3487ec4b6a --- /dev/null +++ b/lib/zstd/common/bits.h -@@ -0,0 +1,124 @@ +@@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. @@ -36757,7 +14708,7 @@ index 000000000000..bb7967def569 +{ + assert(val != 0); + { -+ static const int DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, ++ static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, + 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, + 26, 12, 18, 6, 11, 5, 10, 9}; @@ -36860,9 +14811,34 @@ index 000000000000..bb7967def569 + return 31 - ZSTD_countLeadingZeros32(val); +} + ++/* ZSTD_rotateRight_*(): ++ * Rotates a bitfield to the right by "count" bits. ++ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts ++ */ ++MEM_STATIC ++U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { ++ assert(count < 64); ++ count &= 0x3F; /* for fickle pattern recognition */ ++ return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); ++} ++ ++MEM_STATIC ++U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { ++ assert(count < 32); ++ count &= 0x1F; /* for fickle pattern recognition */ ++ return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); ++} ++ ++MEM_STATIC ++U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { ++ assert(count < 16); ++ count &= 0x0F; /* for fickle pattern recognition */ ++ return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++} ++ +#endif /* ZSTD_BITS_H */ diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h -index feef3a1b1d60..83a180c65faf 100644 +index feef3a1b1d60..444dc4f85c64 100644 --- a/lib/zstd/common/bitstream.h +++ b/lib/zstd/common/bitstream.h @@ -1,7 +1,8 @@ @@ -36979,6 +14955,15 @@ index feef3a1b1d60..83a180c65faf 100644 MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) { size_t const value = BIT_lookBitsFast(bitD, nbBits); +@@ -408,7 +383,7 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) + * This function is safe, it guarantees it will not read beyond src buffer. 
+ * @return : status of `BIT_DStream_t` internal register. + * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ +-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) ++MEM_STATIC FORCE_INLINE_ATTR BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) + { + if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ + return BIT_DStream_overflow; diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h index c42d39faf9bd..c437e0975575 100644 --- a/lib/zstd/common/compiler.h @@ -37933,7 +15918,7 @@ index 0e3b2c0a527d..7ede8cf1ffe5 100644 + #endif /* ZSTD_PORTABILITY_MACROS_H */ diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c -index 3d7e35b309b5..5a9abca10944 100644 +index 3d7e35b309b5..44b95b25344a 100644 --- a/lib/zstd/common/zstd_common.c +++ b/lib/zstd/common/zstd_common.c @@ -1,5 +1,6 @@ @@ -37944,8 +15929,54 @@ index 3d7e35b309b5..5a9abca10944 100644 * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,6 @@ + * Dependencies + ***************************************/ + #define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ + #include "error_private.h" + #include "zstd_internal.h" + +@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); } + /*! ZSTD_getErrorString() : + * provides error code string from enum */ + const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); } +- +- +- +-/*=************************************************************** +-* Custom allocator +-****************************************************************/ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) +- return customMem.customAlloc(customMem.opaque, size); +- return ZSTD_malloc(size); +-} +- +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) { +- /* calloc implemented as malloc+memset; +- * not as efficient as calloc, but next best guess for custom malloc */ +- void* const ptr = customMem.customAlloc(customMem.opaque, size); +- ZSTD_memset(ptr, 0, size); +- return ptr; +- } +- return ZSTD_calloc(1, size); +-} +- +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) +-{ +- if (ptr!=NULL) { +- if (customMem.customFree) +- customMem.customFree(customMem.opaque, ptr); +- else +- ZSTD_free(ptr); +- } +-} diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h -index 7a5bf44839c9..925161416033 100644 +index f06df065dec0..670c5fa2a952 100644 --- a/lib/zstd/common/zstd_deps.h +++ b/lib/zstd/common/zstd_deps.h @@ -1,6 +1,6 @@ @@ -37957,7 +15988,7 @@ index 7a5bf44839c9..925161416033 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h -index 93305d9b41bb..170cd1db4819 100644 +index 93305d9b41bb..7f023e4d4774 100644 --- a/lib/zstd/common/zstd_internal.h +++ b/lib/zstd/common/zstd_internal.h @@ -1,5 +1,6 @@ @@ -38067,7 +16098,7 @@ index 93305d9b41bb..170cd1db4819 100644 } } return seqLen; -@@ -337,12 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore +@@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` */ typedef 
struct { @@ -38078,14 +16109,13 @@ index 93305d9b41bb..170cd1db4819 100644 const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ -void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ -+int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ - - /* custom memory allocation functions */ - void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); -@@ -350,61 +347,6 @@ void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); - void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); - - +- +-/* custom memory allocation functions */ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); +- +- -MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ -{ - assert(val != 0); @@ -38139,11 +16169,10 @@ index 93305d9b41bb..170cd1db4819 100644 -# endif - } -} -- -- ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ + + /* ZSTD_invalidateRepCodes() : - * ensures next compression will not use repcodes from previous block. - * Note : only works with regular variant; diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h index d9a76112ec3a..6ab8be6532ef 100644 --- a/lib/zstd/compress/clevels.h @@ -39061,7 +17090,7 @@ index 74ef0db47621..83241abafe35 100644 } - diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c -index f620cafca633..81b8cd119cd8 100644 +index f620cafca633..c1c316e9e289 100644 --- a/lib/zstd/compress/zstd_compress.c +++ b/lib/zstd/compress/zstd_compress.c @@ -1,5 +1,6 @@ @@ -39072,7 +17101,13 @@ index f620cafca633..81b8cd119cd8 100644 * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the -@@ -16,7 +17,6 @@ +@@ -11,12 +12,12 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ + #include "../common/mem.h" #include "hist.h" /* HIST_countFast_wksp */ #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ #include "../common/fse.h" @@ -39080,15 +17115,15 @@ index f620cafca633..81b8cd119cd8 100644 #include "../common/huf.h" #include "zstd_compress_internal.h" #include "zstd_compress_sequences.h" -@@ -27,6 +27,7 @@ +@@ -27,6 +28,7 @@ #include "zstd_opt.h" #include "zstd_ldm.h" #include "zstd_compress_superblock.h" -+#include "../common/bits.h" /* ZSTD_highbit32 */ ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ /* *************************************************************** * Tuning parameters -@@ -55,14 +56,17 @@ +@@ -55,14 +57,17 @@ * Helper functions ***************************************/ /* ZSTD_compressBound() @@ -39112,7 +17147,7 @@ index f620cafca633..81b8cd119cd8 100644 } -@@ -171,12 +175,9 @@ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) +@@ -171,12 +176,9 @@ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) if (cctx==NULL) return 0; /* support free on NULL */ RETURN_ERROR_IF(cctx->staticSize, memory_allocation, "not compatible with static CCtx"); @@ -39127,7 +17162,7 @@ index f620cafca633..81b8cd119cd8 100644 } return 0; } -@@ -257,9 +258,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, +@@ -257,9 +259,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); } @@ -39139,7 +17174,7 @@ index f620cafca633..81b8cd119cd8 100644 */ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, const ZSTD_compressionParameters* const cParams) { -@@ -267,6 +268,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, +@@ -267,6 +269,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? 
ZSTD_ps_enable : ZSTD_ps_disable; } @@ -39174,7 +17209,7 @@ index f620cafca633..81b8cd119cd8 100644 static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( ZSTD_compressionParameters cParams) { -@@ -284,6 +313,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( +@@ -284,6 +314,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( } cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); @@ -39185,7 +17220,7 @@ index f620cafca633..81b8cd119cd8 100644 assert(!ZSTD_checkCParams(cParams)); return cctxParams; } -@@ -329,10 +362,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) +@@ -329,10 +363,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) #define ZSTD_NO_CLEVEL 0 /* @@ -39201,7 +17236,7 @@ index f620cafca633..81b8cd119cd8 100644 { assert(!ZSTD_checkCParams(params->cParams)); ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); -@@ -345,6 +381,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par +@@ -345,6 +382,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); @@ -39211,7 +17246,7 @@ index f620cafca633..81b8cd119cd8 100644 DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); } -@@ -359,7 +398,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete +@@ -359,7 +399,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete /* * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. 
@@ -39220,7 +17255,7 @@ index f620cafca633..81b8cd119cd8 100644 */ static void ZSTD_CCtxParams_setZstdParams( ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) -@@ -455,8 +494,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) +@@ -455,8 +495,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) return bounds; case ZSTD_c_enableLongDistanceMatching: @@ -39231,7 +17266,7 @@ index f620cafca633..81b8cd119cd8 100644 return bounds; case ZSTD_c_ldmHashLog: -@@ -549,6 +588,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) +@@ -549,6 +589,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) bounds.upperBound = 1; return bounds; @@ -39258,7 +17293,7 @@ index f620cafca633..81b8cd119cd8 100644 default: bounds.error = ERROR(parameter_unsupported); return bounds; -@@ -613,6 +672,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) +@@ -613,6 +673,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) case ZSTD_c_useBlockSplitter: case ZSTD_c_useRowMatchFinder: case ZSTD_c_deterministicRefPrefix: @@ -39269,7 +17304,7 @@ index f620cafca633..81b8cd119cd8 100644 default: return 0; } -@@ -625,7 +688,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) +@@ -625,7 +689,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) if (ZSTD_isUpdateAuthorized(param)) { cctx->cParamsChanged = 1; } else { @@ -39278,7 +17313,7 @@ index f620cafca633..81b8cd119cd8 100644 } } switch(param) -@@ -668,6 +731,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) +@@ -668,6 +732,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) case ZSTD_c_useBlockSplitter: case ZSTD_c_useRowMatchFinder: case ZSTD_c_deterministicRefPrefix: @@ -39289,7 +17324,7 @@ index f620cafca633..81b8cd119cd8 100644 break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); -@@ -723,12 +790,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, +@@ -723,12 +791,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, case ZSTD_c_minMatch : if (value!=0) /* 0 => use default */ BOUNDCHECK(ZSTD_c_minMatch, value); @@ -39304,7 +17339,7 @@ index f620cafca633..81b8cd119cd8 100644 return CCtxParams->cParams.targetLength; case ZSTD_c_strategy : -@@ -741,12 +808,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, +@@ -741,12 +809,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, /* Content size written in frame header _when known_ (default:1) */ DEBUGLOG(4, "set content size flag = %u", (value!=0)); CCtxParams->fParams.contentSizeFlag = value != 0; @@ -39319,7 +17354,7 @@ index f620cafca633..81b8cd119cd8 100644 case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); -@@ -755,18 +822,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, +@@ -755,18 +823,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, case ZSTD_c_forceMaxWindow : CCtxParams->forceWindow = (value != 0); @@ -39341,7 +17376,7 @@ index f620cafca633..81b8cd119cd8 100644 CCtxParams->literalCompressionMode = lcm; return CCtxParams->literalCompressionMode; } -@@ -789,47 +856,48 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, +@@ -789,47 +857,48 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, case ZSTD_c_enableDedicatedDictSearch : 
CCtxParams->enableDedicatedDictSearch = (value!=0); @@ -39397,7 +17432,7 @@ index f620cafca633..81b8cd119cd8 100644 case ZSTD_c_stableInBuffer: BOUNDCHECK(ZSTD_c_stableInBuffer, value); -@@ -866,6 +934,27 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, +@@ -866,6 +935,27 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, CCtxParams->deterministicRefPrefix = !!value; return CCtxParams->deterministicRefPrefix; @@ -39425,7 +17460,7 @@ index f620cafca633..81b8cd119cd8 100644 default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } } -@@ -980,6 +1069,18 @@ size_t ZSTD_CCtxParams_getParameter( +@@ -980,6 +1070,18 @@ size_t ZSTD_CCtxParams_getParameter( case ZSTD_c_deterministicRefPrefix: *value = (int)CCtxParams->deterministicRefPrefix; break; @@ -39444,22 +17479,45 @@ index f620cafca633..81b8cd119cd8 100644 default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } return 0; -@@ -1006,9 +1107,24 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( +@@ -1006,9 +1108,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( return 0; } +size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) +{ ++ ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); + DEBUGLOG(4, "ZSTD_CCtx_setCParams"); -+ assert(cctx != NULL); -+ if (cctx->streamStage != zcss_init) { -+ /* All parameters in @cparams are allowed to be updated during MT compression. -+ * This must be signaled, so that MT compression picks up the changes */ -+ cctx->cParamsChanged = 1; -+ } -+ /* only update if parameters are valid */ ++ /* only update if all parameters are valid */ + FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); -+ cctx->requestedParams.cParams = cparams; ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setFParams"); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) ++{ ++ DEBUGLOG(4, "ZSTD_CCtx_setParams"); ++ /* First check cParams, because we want to update all or none. */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); ++ /* Next set fParams, because this could fail if the cctx isn't in init stage. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); ++ /* Finally set cParams, which should succeed. 
*/ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); + return 0; +} + @@ -39470,15 +17528,82 @@ index f620cafca633..81b8cd119cd8 100644 RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, "Can't set pledgedSrcSize when not in init stage."); cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; -@@ -1151,6 +1267,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) +@@ -1024,9 +1164,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams); + + /* +- * Initializes the local dict using the requested parameters. +- * NOTE: This does not use the pledged src size, because it may be used for more +- * than one compression. ++ * Initializes the local dictionary using requested parameters. ++ * NOTE: Initialization does not employ the pledged src size, ++ * because the dictionary may be used for multiple compressions. + */ + static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + { +@@ -1039,8 +1179,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + return 0; + } + if (dl->cdict != NULL) { +- assert(cctx->cdict == dl->cdict); + /* Local dictionary already initialized. */ ++ assert(cctx->cdict == dl->cdict); + return 0; + } + assert(dl->dictSize > 0); +@@ -1060,26 +1200,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + } + + size_t ZSTD_CCtx_loadDictionary_advanced( +- ZSTD_CCtx* cctx, const void* dict, size_t dictSize, +- ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) ++ ZSTD_CCtx* cctx, ++ const void* dict, size_t dictSize, ++ ZSTD_dictLoadMethod_e dictLoadMethod, ++ ZSTD_dictContentType_e dictContentType) + { +- RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't load a dictionary when ctx is not in init stage."); + DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); +- ZSTD_clearAllDicts(cctx); /* in case one already exists */ +- if (dict == NULL || dictSize == 0) /* no dictionary mode */ ++ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, ++ "Can't load a dictionary when cctx is not in init stage."); ++ ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */ ++ if (dict == NULL || dictSize == 0) /* no dictionary */ + return 0; + if (dictLoadMethod == ZSTD_dlm_byRef) { + cctx->localDict.dict = dict; + } else { ++ /* copy dictionary content inside CCtx to own its lifetime */ + void* dictBuffer; + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, +- "no malloc for static CCtx"); ++ "static CCtx can't allocate for an internal copy of dictionary"); + dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem); +- RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!"); ++ RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation, ++ "allocation failed for dictionary content"); + ZSTD_memcpy(dictBuffer, dict, dictSize); +- cctx->localDict.dictBuffer = dictBuffer; +- cctx->localDict.dict = dictBuffer; ++ cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */ ++ cctx->localDict.dict = dictBuffer; /* read-only reference */ + } + cctx->localDict.dictSize = dictSize; + cctx->localDict.dictContentType = dictContentType; +@@ -1149,8 +1293,9 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, - "Can't reset parameters only when not in init stage."); +- "Can't reset parameters only when not in init stage."); ++ "Reset parameters is only 
possible during init stage."); ZSTD_clearAllDicts(cctx); + ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx)); return ZSTD_CCtxParams_reset(&cctx->requestedParams); } return 0; -@@ -1247,7 +1364,8 @@ static ZSTD_compressionParameters +@@ -1247,7 +1392,8 @@ static ZSTD_compressionParameters ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize, @@ -39488,7 +17613,7 @@ index f620cafca633..81b8cd119cd8 100644 { const U64 minSrcSize = 513; /* (1<<9) + 1 */ const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); -@@ -1281,8 +1399,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, +@@ -1281,8 +1427,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, } /* resize windowLog if input is small enough, to use less memory */ @@ -39499,7 +17624,7 @@ index f620cafca633..81b8cd119cd8 100644 U32 const tSize = (U32)(srcSize + dictSize); static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : -@@ -1300,6 +1418,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, +@@ -1300,6 +1446,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ @@ -39542,7 +17667,7 @@ index f620cafca633..81b8cd119cd8 100644 return cPar; } -@@ -1310,7 +1464,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, +@@ -1310,7 +1492,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, { cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; @@ -39551,7 +17676,7 @@ index f620cafca633..81b8cd119cd8 100644 } static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); -@@ -1341,7 +1495,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( +@@ -1341,7 +1523,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); assert(!ZSTD_checkCParams(cParams)); /* srcSizeHint == 0 means 0 */ @@ -39560,7 +17685,16 @@ index f620cafca633..81b8cd119cd8 100644 } static size_t -@@ -1386,6 +1540,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, +@@ -1370,7 +1552,7 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)) + + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); + size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder) +- ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) ++ ? ZSTD_cwksp_aligned_alloc_size(hSize) + : 0; + size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) + ? 
optPotentialSpace +@@ -1386,6 +1568,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; } @@ -39574,7 +17708,7 @@ index f620cafca633..81b8cd119cd8 100644 static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( const ZSTD_compressionParameters* cParams, const ldmParams_t* ldmParams, -@@ -1393,12 +1554,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( +@@ -1393,12 +1582,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( const ZSTD_paramSwitch_e useRowMatchFinder, const size_t buffInSize, const size_t buffOutSize, @@ -39592,7 +17726,7 @@ index f620cafca633..81b8cd119cd8 100644 size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); -@@ -1417,6 +1579,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( +@@ -1417,6 +1607,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; @@ -39604,7 +17738,7 @@ index f620cafca633..81b8cd119cd8 100644 size_t const neededSpace = cctxSpace + entropySpace + -@@ -1425,7 +1592,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( +@@ -1425,7 +1620,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( ldmSeqSpace + matchStateSize + tokenSpace + @@ -39614,7 +17748,7 @@ index f620cafca633..81b8cd119cd8 100644 DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); return neededSpace; -@@ -1443,7 +1611,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) +@@ -1443,7 +1639,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) * be needed. However, we still allocate two 0-sized buffers, which can * take space under ASAN. */ return ZSTD_estimateCCtxSize_usingCCtxParams_internal( @@ -39623,7 +17757,7 @@ index f620cafca633..81b8cd119cd8 100644 } size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) -@@ -1493,7 +1661,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) +@@ -1493,7 +1689,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); { ZSTD_compressionParameters const cParams = ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); @@ -39632,7 +17766,7 @@ index f620cafca633..81b8cd119cd8 100644 size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) ? 
((size_t)1 << cParams.windowLog) + blockSize : 0; -@@ -1504,7 +1672,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) +@@ -1504,7 +1700,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) return ZSTD_estimateCCtxSize_usingCCtxParams_internal( &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, @@ -39641,7 +17775,83 @@ index f620cafca633..81b8cd119cd8 100644 } } -@@ -1768,6 +1936,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, +@@ -1637,6 +1833,19 @@ typedef enum { + ZSTD_resetTarget_CCtx + } ZSTD_resetTarget_e; + ++/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */ ++static U64 ZSTD_bitmix(U64 val, U64 len) { ++ val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24); ++ val *= 0x9FB21C651E98DF25ULL; ++ val ^= (val >> 35) + len ; ++ val *= 0x9FB21C651E98DF25ULL; ++ return val ^ (val >> 28); ++} ++ ++/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */ ++static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) { ++ ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4); ++} + + static size_t + ZSTD_reset_matchState(ZSTD_matchState_t* ms, +@@ -1664,6 +1873,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + } + + ms->hashLog3 = hashLog3; ++ ms->lazySkipping = 0; + + ZSTD_invalidateMatchState(ms); + +@@ -1685,6 +1895,27 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + ZSTD_cwksp_clean_tables(ws); + } + ++ if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { ++ /* Row match finder needs an additional table of hashes ("tags") */ ++ size_t const tagTableSize = hSize; ++ /* We want to generate a new salt in case we reset a Cctx, but we always want to use ++ * 0 when we reset a Cdict */ ++ if(forWho == ZSTD_resetTarget_CCtx) { ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); ++ ZSTD_advanceHashSalt(ms); ++ } else { ++ /* When we are not salting we want to always memset the memory */ ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize); ++ ZSTD_memset(ms->tagTable, 0, tagTableSize); ++ ms->hashSalt = 0; ++ } ++ { /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); ++ assert(cParams->hashLog >= rowLog); ++ ms->rowHashLog = cParams->hashLog - rowLog; ++ } ++ } ++ + /* opt parser space */ + if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { + DEBUGLOG(4, "reserving optimal parser space"); +@@ -1696,19 +1927,6 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); + } + +- if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { +- { /* Row match finder needs an additional table of hashes ("tags") */ +- size_t const tagTableSize = hSize*sizeof(U16); +- ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize); +- if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); +- } +- { /* Switch to 32-entry rows if searchLog is 5 (or more) */ +- U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); +- assert(cParams->hashLog >= rowLog); +- ms->rowHashLog = cParams->hashLog - rowLog; +- } +- } +- + ms->cParams = *cParams; + + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, +@@ -1768,6 +1986,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, assert(params->useRowMatchFinder != ZSTD_ps_auto); assert(params->useBlockSplitter != 
ZSTD_ps_auto); assert(params->ldmParams.enableLdm != ZSTD_ps_auto); @@ -39649,7 +17859,7 @@ index f620cafca633..81b8cd119cd8 100644 if (params->ldmParams.enableLdm == ZSTD_ps_enable) { /* Adjust long distance matching parameters */ ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); -@@ -1776,9 +1945,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, +@@ -1776,9 +1995,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, } { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); @@ -39661,7 +17871,7 @@ index f620cafca633..81b8cd119cd8 100644 size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) ? ZSTD_compressBound(blockSize) + 1 : 0; -@@ -1795,7 +1963,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, +@@ -1795,7 +2013,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, size_t const neededSpace = ZSTD_estimateCCtxSize_usingCCtxParams_internal( ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, @@ -39670,7 +17880,7 @@ index f620cafca633..81b8cd119cd8 100644 int resizeWorkspace; FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); -@@ -1838,6 +2006,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, +@@ -1838,6 +2056,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, /* init params */ zc->blockState.matchState.cParams = params->cParams; @@ -39678,10 +17888,34 @@ index f620cafca633..81b8cd119cd8 100644 zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; zc->consumedSrcSize = 0; zc->producedCSize = 0; -@@ -1907,6 +2076,14 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, - zc->ldmState.loadedDictEnd = 0; - } +@@ -1854,13 +2073,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + ++ FORWARD_IF_ERROR(ZSTD_reset_matchState( ++ &zc->blockState.matchState, ++ ws, ++ ¶ms->cParams, ++ params->useRowMatchFinder, ++ crp, ++ needsIndexReset, ++ ZSTD_resetTarget_CCtx), ""); ++ ++ zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); ++ ++ /* ldm hash table */ ++ if (params->ldmParams.enableLdm == ZSTD_ps_enable) { ++ /* TODO: avoid memset? */ ++ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; ++ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); ++ ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); ++ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); ++ zc->maxNbLdmSequences = maxNbLdmSeq; ++ ++ ZSTD_window_init(&zc->ldmState.window); ++ zc->ldmState.loadedDictEnd = 0; ++ } ++ + /* reserve space for block-level external sequences */ + if (params->useSequenceProducer) { + size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); @@ -39690,10 +17924,53 @@ index f620cafca633..81b8cd119cd8 100644 + (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); + } + - DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); - assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); ++ /* buffers */ ++ + /* ZSTD_wildcopy() is used to copy into the literals buffer, + * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. 
+ */ + zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.maxNbLit = blockSize; -@@ -1980,7 +2157,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, +- /* buffers */ + zc->bufferedPolicy = zbuff; + zc->inBuffSize = buffInSize; + zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); +@@ -1883,32 +2135,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); +- zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); +- +- FORWARD_IF_ERROR(ZSTD_reset_matchState( +- &zc->blockState.matchState, +- ws, +- ¶ms->cParams, +- params->useRowMatchFinder, +- crp, +- needsIndexReset, +- ZSTD_resetTarget_CCtx), ""); +- +- /* ldm hash table */ +- if (params->ldmParams.enableLdm == ZSTD_ps_enable) { +- /* TODO: avoid memset? */ +- size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; +- zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); +- ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); +- zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); +- zc->maxNbLdmSequences = maxNbLdmSeq; +- +- ZSTD_window_init(&zc->ldmState.window); +- zc->ldmState.loadedDictEnd = 0; +- } + + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); +- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); ++ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); + + zc->initialized = 1; + +@@ -1980,7 +2209,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, } params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, @@ -39703,7 +17980,7 @@ index f620cafca633..81b8cd119cd8 100644 params.cParams.windowLog = windowLog; params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, -@@ -2019,6 +2197,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, +@@ -2019,6 +2249,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, return 0; } @@ -39726,7 +18003,7 @@ index f620cafca633..81b8cd119cd8 100644 static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, ZSTD_CCtx_params params, -@@ -2054,14 +2248,15 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, +@@ -2054,21 +2300,23 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, : 0; size_t const hSize = (size_t)1 << cdict_cParams->hashLog; @@ -39748,7 +18025,18 @@ index f620cafca633..81b8cd119cd8 100644 } /* copy tag table */ if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { -@@ -2147,6 +2342,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, +- size_t const tagTableSize = hSize*sizeof(U16); ++ size_t const tagTableSize = hSize; + ZSTD_memcpy(cctx->blockState.matchState.tagTable, +- cdict->matchState.tagTable, +- tagTableSize); ++ cdict->matchState.tagTable, ++ tagTableSize); ++ cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; + } + } + +@@ -2147,6 +2395,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; params.ldmParams = 
srcCCtx->appliedParams.ldmParams; params.fParams = fParams; @@ -39756,7 +18044,7 @@ index f620cafca633..81b8cd119cd8 100644 ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, /* loadedDictSize */ 0, ZSTDcrp_leaveDirty, zbuff); -@@ -2294,7 +2490,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par +@@ -2294,7 +2543,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par /* See doc/zstd_compression_format.md for detailed format description */ @@ -39765,7 +18053,7 @@ index f620cafca633..81b8cd119cd8 100644 { const seqDef* const sequences = seqStorePtr->sequencesStart; BYTE* const llCodeTable = seqStorePtr->llCode; -@@ -2302,18 +2498,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) +@@ -2302,18 +2551,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) BYTE* const mlCodeTable = seqStorePtr->mlCode; U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); U32 u; @@ -39791,7 +18079,7 @@ index f620cafca633..81b8cd119cd8 100644 } /* ZSTD_useTargetCBlockSize(): -@@ -2347,6 +2549,7 @@ typedef struct { +@@ -2347,6 +2602,7 @@ typedef struct { U32 MLtype; size_t size; size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ @@ -39799,7 +18087,7 @@ index f620cafca633..81b8cd119cd8 100644 } ZSTD_symbolEncodingTypeStats_t; /* ZSTD_buildSequencesStatistics(): -@@ -2357,11 +2560,13 @@ typedef struct { +@@ -2357,11 +2613,13 @@ typedef struct { * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) */ static ZSTD_symbolEncodingTypeStats_t @@ -39818,7 +18106,7 @@ index f620cafca633..81b8cd119cd8 100644 BYTE* const ostart = dst; const BYTE* const oend = dstEnd; BYTE* op = ostart; -@@ -2375,7 +2580,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +@@ -2375,7 +2633,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, stats.lastCountSize = 0; /* convert length/distances into codes */ @@ -39827,7 +18115,7 @@ index f620cafca633..81b8cd119cd8 100644 assert(op <= oend); assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ /* build CTable for Literal Lengths */ -@@ -2480,22 +2685,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +@@ -2480,22 +2738,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, */ #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 MEM_STATIC size_t @@ -39859,7 +18147,7 @@ index f620cafca633..81b8cd119cd8 100644 const BYTE* const ofCodeTable = seqStorePtr->ofCode; const BYTE* const llCodeTable = seqStorePtr->llCode; const BYTE* const mlCodeTable = seqStorePtr->mlCode; -@@ -2503,29 +2708,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +@@ -2503,29 +2761,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, BYTE* const oend = ostart + dstCapacity; BYTE* op = ostart; size_t lastCountSize; @@ -39898,7 +18186,7 @@ index f620cafca633..81b8cd119cd8 100644 FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); assert(cSize <= dstCapacity); op += cSize; -@@ -2551,11 +2758,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +@@ -2551,11 +2811,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); return (size_t)(op - ostart); } @@ -39913,7 +18201,7 @@ index f620cafca633..81b8cd119cd8 100644 &prevEntropy->fse, &nextEntropy->fse, op, oend, strategy, count, -@@ -2564,6 
+2770,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +@@ -2564,6 +2823,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); lastCountSize = stats.lastCountSize; op += stats.size; @@ -39921,7 +18209,7 @@ index f620cafca633..81b8cd119cd8 100644 } { size_t const bitstreamSize = ZSTD_encodeSequences( -@@ -2598,14 +2805,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +@@ -2598,14 +2858,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, } MEM_STATIC size_t @@ -39945,7 +18233,7 @@ index f620cafca633..81b8cd119cd8 100644 { size_t const cSize = ZSTD_entropyCompressSeqStore_internal( seqStorePtr, prevEntropy, nextEntropy, cctxParams, -@@ -2615,15 +2823,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, +@@ -2615,15 +2876,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. */ @@ -39969,7 +18257,7 @@ index f620cafca633..81b8cd119cd8 100644 return cSize; } -@@ -2718,6 +2932,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) +@@ -2718,6 +2985,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) ssPtr->longLengthType = ZSTD_llt_none; } @@ -40042,7 +18330,7 @@ index f620cafca633..81b8cd119cd8 100644 typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) -@@ -2727,7 +3007,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2727,7 +3060,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) assert(srcSize <= ZSTD_BLOCKSIZE_MAX); /* Assert that we have correctly flushed the ctx params into the ms's copy */ ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); @@ -40053,7 +18341,7 @@ index f620cafca633..81b8cd119cd8 100644 if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); } else { -@@ -2763,6 +3045,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2763,6 +3098,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) } if (zc->externSeqStore.pos < zc->externSeqStore.size) { assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); @@ -40069,7 +18357,7 @@ index f620cafca633..81b8cd119cd8 100644 /* Updates ldmSeqStore.pos */ lastLLSize = ZSTD_ldm_blockCompress(&zc->externSeqStore, -@@ -2774,6 +3065,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2774,6 +3118,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { rawSeqStore_t ldmSeqStore = kNullRawSeqStore; @@ -40084,7 +18372,7 @@ index f620cafca633..81b8cd119cd8 100644 ldmSeqStore.seq = zc->ldmSequences; ldmSeqStore.capacity = zc->maxNbLdmSequences; /* Updates ldmSeqStore.size */ -@@ -2788,7 +3087,68 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2788,7 +3140,68 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) zc->appliedParams.useRowMatchFinder, src, srcSize); assert(ldmSeqStore.pos == ldmSeqStore.size); @@ -40154,7 +18442,7 @@ index f620cafca633..81b8cd119cd8 100644 
ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, zc->appliedParams.useRowMatchFinder, dictMode); -@@ -2849,7 +3209,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) +@@ -2849,7 +3262,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode so we provide seqStoreSeqs[i].offset - 1 */ ZSTD_updateRep(updatedRepcodes.rep, @@ -40163,7 +18451,7 @@ index f620cafca633..81b8cd119cd8 100644 seqStoreSeqs[i].litLength == 0); literalsRead += outSeqs[i].litLength; } -@@ -2865,6 +3225,10 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) +@@ -2865,6 +3278,10 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) zc->seqCollector.seqIndex += seqStoreSeqSize; } @@ -40174,7 +18462,7 @@ index f620cafca633..81b8cd119cd8 100644 size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, size_t outSeqsSize, const void* src, size_t srcSize) { -@@ -2910,19 +3274,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { +@@ -2910,19 +3327,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { const size_t unrollMask = unrollSize - 1; const size_t prefixLength = length & unrollMask; size_t i; @@ -40196,7 +18484,7 @@ index f620cafca633..81b8cd119cd8 100644 return 1; } -@@ -2938,7 +3300,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) +@@ -2938,7 +3353,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) return nbSeqs < 4 && nbLits < 10; } @@ -40206,7 +18494,7 @@ index f620cafca633..81b8cd119cd8 100644 { ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; bs->prevCBlock = bs->nextCBlock; -@@ -2946,7 +3309,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c +@@ -2946,7 +3362,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c } /* Writes the block header */ @@ -40217,7 +18505,7 @@ index f620cafca633..81b8cd119cd8 100644 U32 const cBlockHeader = cSize == 1 ? lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); -@@ -2959,13 +3324,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB +@@ -2959,13 +3377,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB * Stores literals block type (raw, rle, compressed, repeat) and * huffman description table to hufMetadata. * Requires ENTROPY_WORKSPACE_SIZE workspace @@ -40241,7 +18529,7 @@ index f620cafca633..81b8cd119cd8 100644 { BYTE* const wkspStart = (BYTE*)workspace; BYTE* const wkspEnd = wkspStart + wkspSize; -@@ -2973,9 +3341,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi +@@ -2973,9 +3394,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi unsigned* const countWksp = (unsigned*)workspace; const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); BYTE* const nodeWksp = countWkspStart + countWkspSize; @@ -40253,7 +18541,7 @@ index f620cafca633..81b8cd119cd8 100644 HUF_repeat repeat = prevHuf->repeatMode; DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); -@@ -2990,73 +3358,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi +@@ -2990,73 +3411,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi /* small ? 
don't even attempt compression (speed opt) */ #ifndef COMPRESS_LITERALS_SIZE_MIN @@ -40365,7 +18653,7 @@ index f620cafca633..81b8cd119cd8 100644 } } -@@ -3066,8 +3438,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi +@@ -3066,8 +3491,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi * and updates nextEntropy to the appropriate repeatMode. */ static ZSTD_symbolEncodingTypeStats_t @@ -40377,7 +18665,7 @@ index f620cafca633..81b8cd119cd8 100644 nextEntropy->litlength_repeatMode = FSE_repeat_none; nextEntropy->offcode_repeatMode = FSE_repeat_none; nextEntropy->matchlength_repeatMode = FSE_repeat_none; -@@ -3078,16 +3451,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { +@@ -3078,16 +3504,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { * Builds entropy for the sequences. * Stores symbol compression modes and fse table to fseMetadata. * Requires ENTROPY_WORKSPACE_SIZE wksp. @@ -40404,7 +18692,7 @@ index f620cafca633..81b8cd119cd8 100644 BYTE* const ostart = fseMetadata->fseTablesBuffer; BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); BYTE* op = ostart; -@@ -3114,23 +3489,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, +@@ -3114,23 +3542,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, /* ZSTD_buildBlockEntropyStats() : * Builds entropy for the block. * Requires workspace size ENTROPY_WORKSPACE_SIZE @@ -40444,7 +18732,7 @@ index f620cafca633..81b8cd119cd8 100644 FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); entropyMetadata->fseMetadata.fseTablesSize = ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, -@@ -3143,11 +3523,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +@@ -3143,11 +3576,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, } /* Returns the size estimate for the literals section (header + content) of a block */ @@ -40462,7 +18750,7 @@ index f620cafca633..81b8cd119cd8 100644 { unsigned* const countWksp = (unsigned*)workspace; unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; -@@ -3169,12 +3550,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz +@@ -3169,12 +3603,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz } /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ @@ -40482,7 +18770,7 @@ index f620cafca633..81b8cd119cd8 100644 { unsigned* const countWksp = (unsigned*)workspace; const BYTE* ctp = codeTable; -@@ -3206,99 +3588,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, +@@ -3206,99 +3641,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, } /* Returns the size estimate for the sequences section (header + content) of a block */ @@ -40635,7 +18923,7 @@ index f620cafca633..81b8cd119cd8 100644 return matchBytes; } -@@ -3307,15 +3697,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { +@@ -3307,15 +3750,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { */ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, const seqStore_t* originalSeqStore, @@ -40654,7 +18942,7 @@ index f620cafca633..81b8cd119cd8 100644 } /* Move longLengthPos into the correct position if necessary */ -@@ -3328,13 +3715,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* 
resultSeqStore, +@@ -3328,13 +3768,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, } resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; @@ -40671,7 +18959,7 @@ index f620cafca633..81b8cd119cd8 100644 } resultSeqStore->llCode += startIdx; resultSeqStore->mlCode += startIdx; -@@ -3342,20 +3728,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, +@@ -3342,20 +3781,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, } /* @@ -40708,7 +18996,7 @@ index f620cafca633..81b8cd119cd8 100644 } /* -@@ -3371,30 +3763,32 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c +@@ -3371,30 +3816,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c * 1-3 : repcode 1-3 * 4+ : real_offset+3 */ @@ -40719,14 +19007,16 @@ index f620cafca633..81b8cd119cd8 100644 + const seqStore_t* const seqStore, U32 const nbSeq) +{ U32 idx = 0; ++ U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? seqStore->longLengthPos : nbSeq; for (; idx < nbSeq; ++idx) { seqDef* const seq = seqStore->sequencesStart + idx; - U32 const ll0 = (seq->litLength == 0); +- U32 const ll0 = (seq->litLength == 0); - U32 const offCode = OFFBASE_TO_STORED(seq->offBase); - assert(seq->offBase > 0); - if (STORED_IS_REPCODE(offCode)) { - U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); - U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); ++ U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); + U32 const offBase = seq->offBase; + assert(offBase > 0); + if (OFFBASE_IS_REPCODE(offBase)) { @@ -40751,7 +19041,7 @@ index f620cafca633..81b8cd119cd8 100644 } } -@@ -3404,10 +3798,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_ +@@ -3404,10 +3852,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_ * Returns the total size of that block (including header) or a ZSTD error code. */ static size_t @@ -40765,7 +19055,7 @@ index f620cafca633..81b8cd119cd8 100644 U32 lastBlock, U32 isPartition) { const U32 rleMaxLength = 25; -@@ -3481,45 +3876,49 @@ typedef struct { +@@ -3481,45 +3930,49 @@ typedef struct { /* Helper function to perform the recursive search for block splits. * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. @@ -40825,7 +19115,7 @@ index f620cafca633..81b8cd119cd8 100644 ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); splits->splitLocations[splits->idx] = (U32)midIdx; splits->idx++; -@@ -3527,14 +3926,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end +@@ -3527,14 +3980,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end } } @@ -40849,7 +19139,7 @@ index f620cafca633..81b8cd119cd8 100644 /* Refuse to try and split anything with less than 4 sequences */ return 0; } -@@ -3550,18 +3953,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) +@@ -3550,18 +4007,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) * Returns combined size of all blocks (which includes headers), or a ZSTD error code. 
*/ static size_t @@ -40876,7 +19166,7 @@ index f620cafca633..81b8cd119cd8 100644 /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two -@@ -3583,30 +3988,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac +@@ -3583,30 +4042,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); @@ -40917,7 +19207,7 @@ index f620cafca633..81b8cd119cd8 100644 srcBytesTotal += srcBytes; if (lastPartition) { /* This is the final partition, need to account for possible last literals */ -@@ -3621,7 +4027,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac +@@ -3621,7 +4081,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac op, dstCapacity, ip, srcBytes, lastBlockEntireSrc, 1 /* isPartition */); @@ -40927,7 +19217,7 @@ index f620cafca633..81b8cd119cd8 100644 FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); ip += srcBytes; -@@ -3629,10 +4036,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac +@@ -3629,10 +4090,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac dstCapacity -= cSizeChunk; cSize += cSizeChunk; *currSeqStore = *nextSeqStore; @@ -40941,7 +19231,7 @@ index f620cafca633..81b8cd119cd8 100644 */ ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); return cSize; -@@ -3643,8 +4050,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, +@@ -3643,8 +4104,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) { @@ -40950,7 +19240,7 @@ index f620cafca633..81b8cd119cd8 100644 U32 nbSeq; size_t cSize; DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); -@@ -3655,7 +4060,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, +@@ -3655,7 +4114,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, if (bss == ZSTDbss_noCompress) { if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; @@ -40959,7 +19249,7 @@ index f620cafca633..81b8cd119cd8 100644 FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); return cSize; -@@ -3673,9 +4078,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, +@@ -3673,9 +4132,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 frame) { @@ -40972,7 +19262,7 @@ index f620cafca633..81b8cd119cd8 100644 */ const U32 rleMaxLength = 25; size_t cSize; -@@ -3767,10 +4172,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, +@@ -3767,10 +4226,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, * * cSize >= blockBound(srcSize): We have expanded the block too much so * emit an uncompressed block. 
*/ @@ -40987,7 +19277,7 @@ index f620cafca633..81b8cd119cd8 100644 FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); -@@ -3778,7 +4184,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, +@@ -3778,7 +4238,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, } } } @@ -40996,7 +19286,7 @@ index f620cafca633..81b8cd119cd8 100644 DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); /* Superblock compression failed, attempt to emit a single no compress block. -@@ -3836,7 +4242,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, +@@ -3836,7 +4296,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, * All blocks will be terminated, all input will be consumed. * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. * Frame is supposed already started (header already produced) @@ -41005,7 +19295,7 @@ index f620cafca633..81b8cd119cd8 100644 */ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, -@@ -3860,7 +4266,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, +@@ -3860,7 +4320,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, ZSTD_matchState_t* const ms = &cctx->blockState.matchState; U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); @@ -41016,7 +19306,7 @@ index f620cafca633..81b8cd119cd8 100644 dstSize_tooSmall, "not enough space to store compressed block"); if (remaining < blockSize) blockSize = remaining; -@@ -3899,7 +4307,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, +@@ -3899,7 +4361,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, MEM_writeLE24(op, cBlockHeader); cSize += ZSTD_blockHeaderSize; } @@ -41025,7 +19315,31 @@ index f620cafca633..81b8cd119cd8 100644 ip += blockSize; -@@ -4091,7 +4499,7 @@ size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) +@@ -4078,31 +4540,51 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + } + } + +-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressContinue_public() */ ++size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize); ++} + +-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) { ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; assert(!ZSTD_checkCParams(cParams)); @@ -41033,8 +19347,34 @@ index f620cafca633..81b8cd119cd8 100644 + return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); } - size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) -@@ -4111,31 +4519,47 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, +-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++/* NOTE: Must just wrap 
ZSTD_getBlockSize_deprecated() */ ++size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++{ ++ return ZSTD_getBlockSize_deprecated(cctx); ++} ++ ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); +- { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); ++ { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++{ ++ return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); ++} ++ + /*! ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +@@ -4111,25 +4593,36 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, ZSTD_cwksp* ws, ZSTD_CCtx_params const* params, const void* src, size_t srcSize, @@ -41079,19 +19419,50 @@ index f620cafca633..81b8cd119cd8 100644 /* If the dictionary is too large, only load the suffix of the dictionary. */ if (srcSize > maxDictSize) { ip = iend - maxDictSize; - src = ip; - srcSize = maxDictSize; -- } -+ } } -+ +@@ -4138,30 +4631,46 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } + } + +- DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); + if (srcSize > ZSTD_CHUNKSIZE_MAX) { + /* We must have cleared our windows when our source is this large. */ + assert(ZSTD_window_isEmpty(ms->window)); + if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); ++ } + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); +- ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); +- ms->forceNonContiguous = params->deterministicRefPrefix; + +- if (loadLdmDict) { ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ ++ if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); + ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); ++ ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); ++ } ++ ++ /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */ ++ if (params->cParams.strategy < ZSTD_btultra) { ++ U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28); ++ if (srcSize > maxDictSize) { ++ ip = iend - maxDictSize; ++ src = ip; ++ srcSize = maxDictSize; ++ } } - DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); -@@ -4158,10 +4582,10 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, ++ ms->nextToUpdate = (U32)(ip - ms->window.base); ++ ms->loadedDictEnd = params->forceWindow ? 
0 : (U32)(iend - ms->window.base); ++ ms->forceNonContiguous = params->deterministicRefPrefix; ++ + if (srcSize <= HASH_READ_SIZE) return 0; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); + +- if (loadLdmDict) +- ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); +- switch(params->cParams.strategy) { case ZSTD_fast: @@ -41104,7 +19475,16 @@ index f620cafca633..81b8cd119cd8 100644 break; case ZSTD_greedy: -@@ -4327,6 +4751,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, +@@ -4174,7 +4683,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { +- size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); ++ size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); +@@ -4327,6 +4836,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_CCtx_params const* params, const void* dict, size_t dictSize, ZSTD_dictTableLoadMethod_e dtlm, @@ -41112,7 +19492,7 @@ index f620cafca633..81b8cd119cd8 100644 void* workspace) { const BYTE* dictPtr = (const BYTE*)dict; -@@ -4345,7 +4770,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, +@@ -4345,7 +4855,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, { size_t const dictContentSize = (size_t)(dictEnd - dictPtr); FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( @@ -41121,7 +19501,7 @@ index f620cafca633..81b8cd119cd8 100644 } return dictID; } -@@ -4361,6 +4786,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, +@@ -4361,6 +4871,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, ZSTD_dictTableLoadMethod_e dtlm, @@ -41129,7 +19509,7 @@ index f620cafca633..81b8cd119cd8 100644 void* workspace) { DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); -@@ -4373,13 +4799,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, +@@ -4373,13 +4884,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, /* dict restricted modes */ if (dictContentType == ZSTD_dct_rawContent) @@ -41145,7 +19525,7 @@ index f620cafca633..81b8cd119cd8 100644 } RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); assert(0); /* impossible */ -@@ -4387,13 +4813,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, +@@ -4387,13 +4898,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, /* dict as full zstd dictionary */ return ZSTD_loadZstdDictionary( @@ -41161,7 +19541,7 @@ index f620cafca633..81b8cd119cd8 100644 * @return : 0, or an error code */ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, -@@ -4426,11 +4853,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, +@@ -4426,11 +4938,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, cctx->blockState.prevCBlock, &cctx->blockState.matchState, &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, cdict->dictContentSize, cdict->dictContentType, dtlm, @@ -41175,13 +19555,13 @@ index f620cafca633..81b8cd119cd8 100644 FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); assert(dictID <= UINT_MAX); cctx->dictID 
= (U32)dictID; -@@ -4471,11 +4898,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, +@@ -4471,11 +4983,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, &cctxParams, pledgedSrcSize); } -size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) -+size_t -+ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++static size_t ++ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) { ZSTD_CCtx_params cctxParams; - { @@ -41190,7 +19570,61 @@ index f620cafca633..81b8cd119cd8 100644 ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel); } DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); -@@ -4709,7 +5136,7 @@ static size_t ZSTD_initCDict_internal( +@@ -4483,9 +4995,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); + } + ++size_t ++ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++{ ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); ++} ++ + size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) + { +- return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); + } + + +@@ -4537,9 +5055,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) + (void)extraCSize; + } + +-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, +@@ -4563,6 +5081,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, + return cSize + endResult; + } + ++/* NOTE: Must just wrap ZSTD_compressEnd_public() */ ++size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); ++} ++ + size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -4591,7 +5117,7 @@ size_t ZSTD_compress_advanced_internal( + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, +@@ -4709,7 +5235,7 @@ static size_t ZSTD_initCDict_internal( { size_t const dictID = ZSTD_compress_insertDictionary( &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, ¶ms, cdict->dictContent, cdict->dictContentSize, @@ -41199,7 +19633,43 @@ index f620cafca633..81b8cd119cd8 100644 FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); assert(dictID <= (size_t)(U32)-1); cdict->dictID = (U32)dictID; -@@ -5197,30 +5624,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) +@@ -4906,6 +5432,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; 
++ cdict->compressionLevel = ZSTD_NO_CLEVEL; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, +@@ -4985,12 +5512,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced( + + /* ZSTD_compressBegin_usingCDict() : + * cdict must be != NULL */ +-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) + { + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); + } + ++size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++{ ++ return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); ++} ++ + /*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. + */ +@@ -5000,7 +5532,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) + { + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + /*! ZSTD_compress_usingCDict_advanced(): +@@ -5197,30 +5729,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) { @@ -41253,7 +19723,7 @@ index f620cafca633..81b8cd119cd8 100644 if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { assert(zcs->inBuff != NULL); assert(zcs->inBuffSize > 0); -@@ -5229,8 +5667,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, +@@ -5229,8 +5772,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, assert(zcs->outBuff != NULL); assert(zcs->outBuffSize > 0); } @@ -41265,7 +19735,16 @@ index f620cafca633..81b8cd119cd8 100644 assert((U32)flushMode <= (U32)ZSTD_e_end); while (someMoreWork) { -@@ -5262,8 +5702,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, +@@ -5245,7 +5790,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ +- size_t const cSize = ZSTD_compressEnd(zcs, ++ size_t const cSize = ZSTD_compressEnd_public(zcs, + op, oend-op, ip, iend-ip); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); +@@ -5262,8 +5807,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, zcs->inBuff + zcs->inBuffPos, toLoad, ip, iend-ip); zcs->inBuffPos += loaded; @@ -41275,7 +19754,7 @@ index f620cafca633..81b8cd119cd8 100644 if ( (flushMode == ZSTD_e_continue) && (zcs->inBuffPos < zcs->inBuffTarget) ) { /* not enough input to fill full block : stop here */ -@@ -5274,6 +5713,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, +@@ -5274,6 +5818,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, /* empty */ someMoreWork = 0; break; } @@ -41296,7 +19775,7 @@ index f620cafca633..81b8cd119cd8 100644 } /* compress current block (note : this stage cannot be stopped in the middle) */ DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); -@@ -5281,9 +5734,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, +@@ 
-5281,9 +5839,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, void* cDst; size_t cSize; size_t oSize = oend-op; @@ -41308,7 +19787,19 @@ index f620cafca633..81b8cd119cd8 100644 if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) cDst = op; /* compress into output buffer, to skip flush stage */ else -@@ -5306,19 +5758,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, +@@ -5291,9 +5848,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (inputBuffered) { + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ++ ZSTD_compressEnd_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ++ ZSTD_compressContinue_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +@@ -5306,19 +5863,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, if (!lastBlock) assert(zcs->inBuffTarget <= zcs->inBuffSize); zcs->inToCompress = zcs->inBuffPos; @@ -41318,8 +19809,10 @@ index f620cafca633..81b8cd119cd8 100644 + } else { /* !inputBuffered, hence ZSTD_bm_stable */ + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); cSize = lastBlock ? - ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : - ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); +- ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); ++ ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : ++ ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); /* Consume the input prior to error checking to mirror buffered mode. */ - if (iSize > 0) - ip += iSize; @@ -41332,7 +19825,7 @@ index f620cafca633..81b8cd119cd8 100644 } if (cDst == op) { /* no need to flush */ op += cSize; -@@ -5388,8 +5837,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf +@@ -5388,8 +5942,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf /* After a compression call set the expected input/output buffer. * This is validated at the start of the next compression call. */ @@ -41344,7 +19837,7 @@ index f620cafca633..81b8cd119cd8 100644 if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { cctx->expectedInBuffer = *input; } -@@ -5408,22 +5859,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, +@@ -5408,22 +5964,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, { if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ZSTD_inBuffer const expect = cctx->expectedInBuffer; @@ -41373,7 +19866,7 @@ index f620cafca633..81b8cd119cd8 100644 ZSTD_CCtx_params params = cctx->requestedParams; ZSTD_prefixDict const prefixDict = cctx->prefixDict; FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */ -@@ -5437,9 +5888,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, +@@ -5437,9 +5993,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, params.compressionLevel = cctx->cdict->compressionLevel; } DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); @@ -41386,7 +19879,7 @@ index f620cafca633..81b8cd119cd8 100644 ? prefixDict.dictSize : (cctx->cdict ? 
cctx->cdict->dictContentSize : 0); ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); -@@ -5451,6 +5902,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, +@@ -5451,6 +6007,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); @@ -41396,7 +19889,7 @@ index f620cafca633..81b8cd119cd8 100644 { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); -@@ -5477,6 +5931,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, +@@ -5477,6 +6036,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, return 0; } @@ -41405,7 +19898,7 @@ index f620cafca633..81b8cd119cd8 100644 size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input, -@@ -5491,8 +5947,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, +@@ -5491,8 +6052,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, /* transparent initialization stage */ if (cctx->streamStage == zcss_init) { @@ -41435,7 +19928,7 @@ index f620cafca633..81b8cd119cd8 100644 } /* end of transparent initialization stage */ -@@ -5510,13 +5985,20 @@ size_t ZSTD_compressStream2_simpleArgs ( +@@ -5510,13 +6090,20 @@ size_t ZSTD_compressStream2_simpleArgs ( const void* src, size_t srcSize, size_t* srcPos, ZSTD_EndDirective endOp) { @@ -41462,7 +19955,7 @@ index f620cafca633..81b8cd119cd8 100644 } size_t ZSTD_compress2(ZSTD_CCtx* cctx, -@@ -5539,6 +6021,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5539,6 +6126,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, /* Reset to the original values. 
*/ cctx->requestedParams.inBufferMode = originalInBufferMode; cctx->requestedParams.outBufferMode = originalOutBufferMode; @@ -41470,7 +19963,7 @@ index f620cafca633..81b8cd119cd8 100644 FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); if (result != 0) { /* compression not completed, due to lack of output space */ assert(oPos == dstCapacity); -@@ -5549,64 +6032,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5549,64 +6137,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, } } @@ -41555,7 +20048,7 @@ index f620cafca633..81b8cd119cd8 100644 if (cctx->cdict) { dictSize = (U32)cctx->cdict->dictContentSize; } else if (cctx->prefixDict.dict) { -@@ -5615,25 +6095,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, +@@ -5615,25 +6200,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, dictSize = 0; } ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); @@ -41620,7 +20113,7 @@ index f620cafca633..81b8cd119cd8 100644 ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); if (inSeqs[idx].litLength) { -@@ -5642,26 +6152,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, +@@ -5642,26 +6257,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ip += inSeqs[idx].litLength; seqPos->posInSrc += inSeqs[idx].litLength; } @@ -41650,7 +20143,7 @@ index f620cafca633..81b8cd119cd8 100644 { U32 idx = seqPos->idx; U32 startPosInSequence = seqPos->posInSequence; -@@ -5673,6 +6172,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5673,6 +6277,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* U32 bytesAdjustment = 0; U32 finalMatchSplit = 0; @@ -41660,7 +20153,7 @@ index f620cafca633..81b8cd119cd8 100644 if (cctx->cdict) { dictSize = cctx->cdict->dictContentSize; } else if (cctx->prefixDict.dict) { -@@ -5680,7 +6182,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5680,7 +6287,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* } else { dictSize = 0; } @@ -41669,7 +20162,7 @@ index f620cafca633..81b8cd119cd8 100644 DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { -@@ -5688,7 +6190,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5688,7 +6295,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* U32 litLength = currSeq.litLength; U32 matchLength = currSeq.matchLength; U32 const rawOffset = currSeq.offset; @@ -41678,7 +20171,7 @@ index f620cafca633..81b8cd119cd8 100644 /* Modify the sequence depending on where endPosInSequence lies */ if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { -@@ -5702,7 +6204,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5702,7 +6309,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* /* Move to the next sequence */ endPosInSequence -= currSeq.litLength + currSeq.matchLength; startPosInSequence = 0; @@ -41686,7 +20179,7 @@ index f620cafca633..81b8cd119cd8 100644 } else { /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence does not reach the end of the match. 
So, we have to split the sequence */ -@@ -5742,21 +6243,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5742,21 +6348,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* } /* Check if this offset can be represented with a repcode */ { U32 const ll0 = (litLength == 0); @@ -41717,7 +20210,7 @@ index f620cafca633..81b8cd119cd8 100644 } DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); -@@ -5779,7 +6282,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* +@@ -5779,7 +6387,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, @@ -41726,7 +20219,7 @@ index f620cafca633..81b8cd119cd8 100644 static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) { ZSTD_sequenceCopier sequenceCopier = NULL; -@@ -5793,6 +6296,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) +@@ -5793,6 +6401,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) return sequenceCopier; } @@ -41784,7 +20277,7 @@ index f620cafca633..81b8cd119cd8 100644 /* Compress, block-by-block, all of the sequences given. * * Returns the cumulative size of all compressed blocks (including their headers), -@@ -5805,9 +6359,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5805,9 +6464,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, const void* src, size_t srcSize) { size_t cSize = 0; @@ -41794,7 +20287,7 @@ index f620cafca633..81b8cd119cd8 100644 size_t remaining = srcSize; ZSTD_sequencePosition seqPos = {0, 0, 0}; -@@ -5827,22 +6378,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5827,22 +6483,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, } while (remaining) { @@ -41830,7 +20323,7 @@ index f620cafca633..81b8cd119cd8 100644 cSize += cBlockSize; ip += blockSize; op += cBlockSize; -@@ -5851,6 +6409,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5851,6 +6514,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, continue; } @@ -41838,7 +20331,7 @@ index f620cafca633..81b8cd119cd8 100644 compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, &cctx->appliedParams, -@@ -5859,11 +6418,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5859,11 +6523,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, cctx->bmi2); FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); @@ -41852,7 +20345,7 @@ index f620cafca633..81b8cd119cd8 100644 /* We don't want to emit our first block as a RLE even if it qualifies because * doing so will cause the decoder (cli only) to throw a "should consume all input error." 
* This is only an issue for zstd <= v1.4.3 -@@ -5874,12 +6433,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5874,12 +6538,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, if (compressedSeqsSize == 0) { /* ZSTD_noCompressBlock writes the block header as well */ cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); @@ -41869,7 +20362,7 @@ index f620cafca633..81b8cd119cd8 100644 } else { U32 cBlockHeader; /* Error checking and repcodes update */ -@@ -5891,11 +6450,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5891,11 +6555,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); MEM_writeLE24(op, cBlockHeader); cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; @@ -41882,7 +20375,7 @@ index f620cafca633..81b8cd119cd8 100644 if (lastBlock) { break; -@@ -5906,12 +6464,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +@@ -5906,12 +6569,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, dstCapacity -= cBlockSize; cctx->isFirstBlock = 0; } @@ -41899,7 +20392,7 @@ index f620cafca633..81b8cd119cd8 100644 const ZSTD_Sequence* inSeqs, size_t inSeqsSize, const void* src, size_t srcSize) { -@@ -5921,7 +6482,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci +@@ -5921,7 +6587,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci size_t frameHeaderSize = 0; /* Transparent initialization stage, same as compressStream2() */ @@ -41908,7 +20401,7 @@ index f620cafca633..81b8cd119cd8 100644 assert(cctx != NULL); FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); /* Begin writing output, starting with frame header */ -@@ -5949,26 +6510,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci +@@ -5949,26 +6615,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci cSize += 4; } @@ -41947,7 +20440,7 @@ index f620cafca633..81b8cd119cd8 100644 if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ /* single thread mode : attempt to calculate remaining to flush more precisely */ { size_t const lastBlockSize = zcs->frameEnded ? 
0 : ZSTD_BLOCKHEADERSIZE; -@@ -6090,7 +6659,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, +@@ -6090,7 +6764,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, cp.targetLength = (unsigned)(-clampedCompressionLevel); } /* refine parameters based on srcSize & dictSize */ @@ -41956,7 +20449,7 @@ index f620cafca633..81b8cd119cd8 100644 } } -@@ -6125,3 +6694,21 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH +@@ -6125,3 +6799,21 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); } @@ -41979,7 +20472,7 @@ index f620cafca633..81b8cd119cd8 100644 + } +} diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h -index 71697a11ae30..0198c8f5cac0 100644 +index 71697a11ae30..899f5e2de8e9 100644 --- a/lib/zstd/compress/zstd_compress_internal.h +++ b/lib/zstd/compress/zstd_compress_internal.h @@ -1,5 +1,6 @@ @@ -42031,7 +20524,19 @@ index 71697a11ae30..0198c8f5cac0 100644 UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; typedef struct { -@@ -228,6 +237,11 @@ struct ZSTD_matchState_t { +@@ -212,8 +221,10 @@ struct ZSTD_matchState_t { + U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ +- U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ ++ BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ ++ U64 hashSalt; /* For row-based matchFinder: salts the hash for re-use of tag table */ ++ U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ + + U32* hashTable; + U32* hashTable3; +@@ -228,6 +239,18 @@ struct ZSTD_matchState_t { const ZSTD_matchState_t* dictMatchState; ZSTD_compressionParameters cParams; const rawSeqStore_t* ldmSeqStore; @@ -42040,10 +20545,17 @@ index 71697a11ae30..0198c8f5cac0 100644 + * This behavior is controlled from the cctx ms. + * This parameter has no effect in the cdict ms. */ + int prefetchCDictTables; ++ ++ /* When == 0, lazy match finders insert every position. ++ * When != 0, lazy match finders only insert positions they search. ++ * This allows them to skip much faster over incompressible data, ++ * at a small cost to compression ratio. ++ */ ++ int lazySkipping; }; typedef struct { -@@ -324,6 +338,24 @@ struct ZSTD_CCtx_params_s { +@@ -324,6 +347,24 @@ struct ZSTD_CCtx_params_s { /* Internal use, for createCCtxParams() and freeCCtxParams() only */ ZSTD_customMem customMem; @@ -42068,7 +20580,7 @@ index 71697a11ae30..0198c8f5cac0 100644 }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) -@@ -355,6 +387,14 @@ typedef struct { +@@ -355,6 +396,14 @@ typedef struct { ZSTD_entropyCTablesMetadata_t entropyMetadata; } ZSTD_blockSplitCtx; @@ -42083,7 +20595,7 @@ index 71697a11ae30..0198c8f5cac0 100644 struct ZSTD_CCtx_s { ZSTD_compressionStage_e stage; int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. 
Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */ -@@ -404,6 +444,7 @@ struct ZSTD_CCtx_s { +@@ -404,6 +453,7 @@ struct ZSTD_CCtx_s { /* Stable in/out buffer verification */ ZSTD_inBuffer expectedInBuffer; @@ -42091,7 +20603,7 @@ index 71697a11ae30..0198c8f5cac0 100644 size_t expectedOutBufferSize; /* Dictionary */ -@@ -417,9 +458,13 @@ struct ZSTD_CCtx_s { +@@ -417,9 +467,13 @@ struct ZSTD_CCtx_s { /* Workspace for block splitter */ ZSTD_blockSplitCtx blockSplitCtx; @@ -42105,7 +20617,7 @@ index 71697a11ae30..0198c8f5cac0 100644 typedef enum { ZSTD_noDict = 0, -@@ -441,7 +486,7 @@ typedef enum { +@@ -441,7 +495,7 @@ typedef enum { * In this mode we take both the source size and the dictionary size * into account when selecting and adjusting the parameters. */ @@ -42114,7 +20626,7 @@ index 71697a11ae30..0198c8f5cac0 100644 * We don't know what these parameters are for. We default to the legacy * behavior of taking both the source size and the dict size into account * when selecting and adjusting parameters. -@@ -500,9 +545,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) +@@ -500,9 +554,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) /* ZSTD_noCompressBlock() : * Writes uncompressed block to dst buffer from given src. * Returns the size of the block */ @@ -42127,7 +20639,7 @@ index 71697a11ae30..0198c8f5cac0 100644 RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, dstSize_tooSmall, "dst buf too small for uncompressed block"); MEM_writeLE24(dst, cBlockHeader24); -@@ -510,7 +557,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi +@@ -510,7 +566,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi return ZSTD_blockHeaderSize + srcSize; } @@ -42137,7 +20649,7 @@ index 71697a11ae30..0198c8f5cac0 100644 { BYTE* const op = (BYTE*)dst; U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); -@@ -529,7 +577,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) +@@ -529,7 +586,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) { U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6; ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); @@ -42146,7 +20658,7 @@ index 71697a11ae30..0198c8f5cac0 100644 return (srcSize >> minlog) + 2; } -@@ -565,29 +613,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con +@@ -565,29 +622,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con while (ip < iend) *op++ = *ip++; } @@ -42190,7 +20702,7 @@ index 71697a11ae30..0198c8f5cac0 100644 size_t matchLength) { BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; -@@ -596,8 +642,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, +@@ -596,8 +651,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, static const BYTE* g_start = NULL; if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ { U32 const pos = (U32)((const BYTE*)literals - g_start); @@ -42201,7 +20713,7 @@ index 71697a11ae30..0198c8f5cac0 100644 } #endif assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); -@@ -607,9 +653,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, +@@ -607,9 +662,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, assert(literals + litLength <= litLimit); if (litEnd <= litLimit_w) { /* Common case we can use wildcopy. 
@@ -42214,7 +20726,7 @@ index 71697a11ae30..0198c8f5cac0 100644 ZSTD_copy16(seqStorePtr->lit, literals); if (litLength > 16) { ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); -@@ -628,7 +674,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, +@@ -628,7 +683,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, seqStorePtr->sequences[0].litLength = (U16)litLength; /* match offset */ @@ -42223,7 +20735,7 @@ index 71697a11ae30..0198c8f5cac0 100644 /* match Length */ assert(matchLength >= MINMATCH); -@@ -646,17 +692,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, +@@ -646,17 +701,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, /* ZSTD_updateRep() : * updates in-place @rep (array of repeat offsets) @@ -42246,7 +20758,7 @@ index 71697a11ae30..0198c8f5cac0 100644 if (repCode > 0) { /* note : if repCode==0, no change */ U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; rep[2] = (repCode >= 2) ? rep[1] : rep[2]; -@@ -673,11 +719,11 @@ typedef struct repcodes_s { +@@ -673,11 +728,11 @@ typedef struct repcodes_s { } repcodes_t; MEM_STATIC repcodes_t @@ -42260,7 +20772,7 @@ index 71697a11ae30..0198c8f5cac0 100644 return newReps; } -@@ -685,59 +731,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 +@@ -685,59 +740,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 /*-************************************* * Match length counter ***************************************/ @@ -42320,39 +20832,51 @@ index 71697a11ae30..0198c8f5cac0 100644 MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) { const BYTE* const pStart = pIn; -@@ -783,32 +776,36 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, +@@ -783,32 +785,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, * Hashes ***************************************/ static const U32 prime3bytes = 506832829U; -static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } -+static U32 ZSTD_hash3(U32 u, U32 h) { assert(h <= 32); return ((u << (32-24)) * prime3bytes) >> (32-h) ; } - MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ +-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ ++static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } ++MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ ++MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } static const U32 prime4bytes = 2654435761U; -static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } -static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } -+static U32 ZSTD_hash4(U32 u, U32 h) { assert(h <= 32); return (u * prime4bytes) >> (32-h) ; } -+static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h); } ++static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } ++static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } ++static size_t ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); } static const U64 prime5bytes = 889523592379ULL; 
-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } -+static size_t ZSTD_hash5(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } - static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } +-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } ++static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } static const U64 prime6bytes = 227718039650203ULL; -static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } -+static size_t ZSTD_hash6(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } - static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } +-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } ++static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } static const U64 prime7bytes = 58295818150454627ULL; -static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } -+static size_t ZSTD_hash7(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } - static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } +-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } ++static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; -static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } -+static size_t ZSTD_hash8(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u) * prime8bytes) >> (64-h)) ; } - static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } +-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } ++static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } ++ MEM_STATIC FORCE_INLINE_ATTR size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) @@ -42364,7 +20888,32 @@ index 71697a11ae30..0198c8f5cac0 100644 switch(mls) { default: -@@ -1167,10 +1164,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, +@@ -820,6 +833,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + } + } + ++MEM_STATIC FORCE_INLINE_ATTR ++size_t ZSTD_hashPtrSalted(const void* 
p, U32 hBits, U32 mls, const U64 hashSalt) { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ ++ switch(mls) ++ { ++ default: ++ case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); ++ case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); ++ case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); ++ case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); ++ case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); ++ } ++} ++ ++ + /* ZSTD_ipow() : + * Return base^exponent. + */ +@@ -1167,10 +1198,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); assert(blockEndIdx >= loadedDictEnd); @@ -42381,7 +20930,7 @@ index 71697a11ae30..0198c8f5cac0 100644 */ DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); *loadedDictEndPtr = 0; -@@ -1302,6 +1304,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) +@@ -1302,6 +1338,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) #endif @@ -42424,7 +20973,7 @@ index 71697a11ae30..0198c8f5cac0 100644 /* =============================================================== -@@ -1396,4 +1434,31 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); +@@ -1396,4 +1468,51 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); */ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); @@ -42454,6 +21003,26 @@ index 71697a11ae30..0198c8f5cac0 100644 +ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++ ++/* =============================================================== ++ * Deprecated definitions that are still used internally to avoid ++ * deprecation warnings. These functions are exactly equivalent to ++ * their public variants, but avoid the deprecation warnings. ++ * =============================================================== */ ++ ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); ++ ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ + #endif /* ZSTD_COMPRESS_H */ diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c @@ -42896,7 +21465,7 @@ index 224ece79546e..826bbc9e029b 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h -index 349fc923c355..ef5e65cfcf9a 100644 +index 349fc923c355..65ea53b62844 100644 --- a/lib/zstd/compress/zstd_cwksp.h +++ b/lib/zstd/compress/zstd_cwksp.h @@ -1,5 +1,6 @@ @@ -42907,7 +21476,246 @@ index 349fc923c355..ef5e65cfcf9a 100644 * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the -@@ -451,7 +452,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { +@@ -14,7 +15,9 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_internal.h" ++#include "../common/portability_macros.h" + + + /*-************************************* +@@ -41,8 +44,9 @@ + ***************************************/ + typedef enum { + ZSTD_cwksp_alloc_objects, +- ZSTD_cwksp_alloc_buffers, +- ZSTD_cwksp_alloc_aligned ++ ZSTD_cwksp_alloc_aligned_init_once, ++ ZSTD_cwksp_alloc_aligned, ++ ZSTD_cwksp_alloc_buffers + } ZSTD_cwksp_alloc_phase_e; + + /* +@@ -95,8 +99,8 @@ typedef enum { + * + * Workspace Layout: + * +- * [ ... workspace ... ] +- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] ++ * [ ... workspace ... ] ++ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] + * + * The various objects that live in the workspace are divided into the + * following categories, and are allocated separately: +@@ -120,9 +124,18 @@ typedef enum { + * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). + * Their sizes depend on the cparams. These tables are 64-byte aligned. + * +- * - Aligned: these buffers are used for various purposes that require 4 byte +- * alignment, but don't require any initialization before they're used. These +- * buffers are each aligned to 64 bytes. ++ * - Init once: these buffers require to be initialized at least once before ++ * use. They should be used when we want to skip memory initialization ++ * while not triggering memory checkers (like Valgrind) when reading from ++ * from this memory without writing to it first. ++ * These buffers should be used carefully as they might contain data ++ * from previous compressions. ++ * Buffers are aligned to 64 bytes. ++ * ++ * - Aligned: these buffers don't require any initialization before they're ++ * used. The user of the buffer should make sure they write into a buffer ++ * location before reading from it. ++ * Buffers are aligned to 64 bytes. + * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can +@@ -134,8 +147,9 @@ typedef enum { + * correctly packed into the workspace buffer. That order is: + * + * 1. Objects +- * 2. Buffers +- * 3. Aligned/Tables ++ * 2. Init once / Tables ++ * 3. Aligned / Tables ++ * 4. Buffers / Tables + * + * Attempts to reserve objects of different types out of order will fail. 
+ */ +@@ -147,6 +161,7 @@ typedef struct { + void* tableEnd; + void* tableValidEnd; + void* allocStart; ++ void* initOnceStart; + + BYTE allocFailed; + int workspaceOversizedDuration; +@@ -159,6 +174,7 @@ typedef struct { + ***************************************/ + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); + + MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + (void)ws; +@@ -168,6 +184,8 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + assert(ws->tableEnd <= ws->allocStart); + assert(ws->tableValidEnd <= ws->allocStart); + assert(ws->allocStart <= ws->workspaceEnd); ++ assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); ++ assert(ws->workspace <= ws->initOnceStart); + } + + /* +@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { + * for internal purposes (currently only alignment). + */ + MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { +- /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes +- * to align the beginning of tables section, as well as another n_2=[0, 63] bytes +- * to align the beginning of the aligned section. +- * +- * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and +- * aligneds being sized in multiples of 64 bytes. ++ /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES ++ * bytes to align the beginning of tables section and end of buffers; + */ +- size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; ++ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; + return slackSpace; + } + +@@ -230,10 +244,18 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; + assert((alignBytes & alignBytesMask) == 0); +- assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); ++ assert(bytes < alignBytes); + return bytes; + } + ++/* ++ * Returns the initial value for allocStart which is used to determine the position from ++ * which we can allocate from the end of the workspace. ++ */ ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) { ++ return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1)); ++} ++ + /* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, +@@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + { + assert(phase >= ws->phase); + if (phase > ws->phase) { +- /* Going from allocating objects to allocating buffers */ +- if (ws->phase < ZSTD_cwksp_alloc_buffers && +- phase >= ZSTD_cwksp_alloc_buffers) { ++ /* Going from allocating objects to allocating initOnce / tables */ ++ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && ++ phase >= ZSTD_cwksp_alloc_aligned_init_once) { + ws->tableValidEnd = ws->objectEnd; +- } ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + +- /* Going from allocating buffers to allocating aligneds/tables */ +- if (ws->phase < ZSTD_cwksp_alloc_aligned && +- phase >= ZSTD_cwksp_alloc_aligned) { +- { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. 
*/ +- size_t const bytesToAlign = +- ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); +- DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); +- ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ +- RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), +- memory_allocation, "aligned phase - alignment initial allocation failed!"); +- } + { /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */ +- void* const alloc = ws->objectEnd; ++ void *const alloc = ws->objectEnd; + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); +- void* const objectEnd = (BYTE*)alloc + bytesToAlign; ++ void *const objectEnd = (BYTE *) alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, + "table phase - alignment initial allocation failed!"); +@@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + ws->tableEnd = objectEnd; /* table area starts being empty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; +- } } } ++ } ++ } ++ } + ws->phase = phase; + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + */ + MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) + { +- return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); ++ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); + } + + /* +@@ -343,6 +356,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) + return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); + } + ++/* ++ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ * This memory has been initialized at least once in the past. ++ * This doesn't mean it has been initialized this time, and it might contain data from previous ++ * operations. ++ * The main usage is for algorithms that might need read access into uninitialized memory. ++ * The algorithm must maintain safety under these conditions and must make sure it doesn't ++ * leak any of the past data (directly or in side channels). ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) ++{ ++ size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); ++ void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ if(ptr && ptr < ws->initOnceStart) { ++ /* We assume the memory following the current allocation is either: ++ * 1. Not usable as initOnce memory (end of workspace) ++ * 2. Another initOnce buffer that has been allocated before (and so was previously memset) ++ * 3. An ASAN redzone, in which case we don't want to write on it ++ * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. ++ * Note that we assume here that MSAN and ASAN cannot run in the same time. 
*/ ++ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); ++ ws->initOnceStart = ptr; ++ } ++ return ptr; ++} ++ + /* + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). + */ +@@ -361,13 +401,17 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) + */ + MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + { +- const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; ++ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; + void* alloc; + void* end; + void* top; + +- if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { +- return NULL; ++ /* We can only start allocating tables after we are done reserving space for objects at the ++ * start of the workspace */ ++ if(ws->phase < phase) { ++ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { ++ return NULL; ++ } + } + alloc = ws->tableEnd; + end = (BYTE *)alloc + bytes; +@@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { assert(ws->tableValidEnd >= ws->objectEnd); assert(ws->tableValidEnd <= ws->allocStart); if (ws->tableValidEnd < ws->tableEnd) { @@ -42916,6 +21724,51 @@ index 349fc923c355..ef5e65cfcf9a 100644 } ZSTD_cwksp_mark_tables_clean(ws); } +@@ -478,10 +522,10 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { + + + ws->tableEnd = ws->objectEnd; +- ws->allocStart = ws->workspaceEnd; ++ ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); + ws->allocFailed = 0; +- if (ws->phase > ZSTD_cwksp_alloc_buffers) { +- ws->phase = ZSTD_cwksp_alloc_buffers; ++ if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { ++ ws->phase = ZSTD_cwksp_alloc_aligned_init_once; + } + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -498,6 +542,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c + ws->workspaceEnd = (BYTE*)start + size; + ws->objectEnd = ws->workspace; + ws->tableValidEnd = ws->objectEnd; ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + ws->phase = ZSTD_cwksp_alloc_objects; + ws->isStatic = isStatic; + ZSTD_cwksp_clear(ws); +@@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + * Returns if the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, +- size_t const estimatedSpace, int resizedWorkspace) { +- if (resizedWorkspace) { +- /* Resized/newly allocated wksp should have exact bounds */ +- return ZSTD_cwksp_used(ws) == estimatedSpace; +- } else { +- /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes +- * than estimatedSpace. See the comments in zstd_cwksp.h for details. 
+- */ +- return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); +- } ++MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { ++ /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice ++ * the alignment bytes difference between estimation and actual usage */ ++ return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && ++ ZSTD_cwksp_used(ws) <= estimatedSpace; + } + + diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c index 76933dea2624..ab9440a99603 100644 --- a/lib/zstd/compress/zstd_double_fast.c @@ -44057,7 +22910,7 @@ index fddc2f532d21..e64d9e1b2d39 100644 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c -index 0298a01a7504..83727cd46f91 100644 +index 0298a01a7504..f6b4978ceba7 100644 --- a/lib/zstd/compress/zstd_lazy.c +++ b/lib/zstd/compress/zstd_lazy.c @@ -1,5 +1,6 @@ @@ -44068,15 +22921,17 @@ index 0298a01a7504..83727cd46f91 100644 * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the -@@ -10,6 +11,7 @@ +@@ -10,6 +11,9 @@ #include "zstd_compress_internal.h" #include "zstd_lazy.h" +#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ ++ ++#define kLazySkippingStep 8 /*-************************************* -@@ -197,8 +199,8 @@ ZSTD_DUBT_findBetterDictMatch ( +@@ -197,8 +201,8 @@ ZSTD_DUBT_findBetterDictMatch ( U32 matchIndex = dictMatchIndex + dictIndexDelta; if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", @@ -44087,7 +22942,7 @@ index 0298a01a7504..83727cd46f91 100644 } if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ break; /* drop, to guarantee consistency (miss a little bit of compression) */ -@@ -218,7 +220,7 @@ ZSTD_DUBT_findBetterDictMatch ( +@@ -218,7 +222,7 @@ ZSTD_DUBT_findBetterDictMatch ( } if (bestLength >= MINMATCH) { @@ -44096,7 +22951,7 @@ index 0298a01a7504..83727cd46f91 100644 DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", curr, (U32)bestLength, (U32)*offsetPtr, mIndex); } -@@ -230,7 +232,7 @@ ZSTD_DUBT_findBetterDictMatch ( +@@ -230,7 +234,7 @@ ZSTD_DUBT_findBetterDictMatch ( static size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, @@ -44105,7 +22960,7 @@ index 0298a01a7504..83727cd46f91 100644 U32 const mls, const ZSTD_dictMode_e dictMode) { -@@ -327,8 +329,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, +@@ -327,8 +331,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, if (matchLength > bestLength) { if (matchLength > matchEndIdx - matchIndex) matchEndIdx = matchIndex + (U32)matchLength; @@ -44116,7 +22971,7 @@ index 0298a01a7504..83727cd46f91 100644 if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ if (dictMode == ZSTD_dictMatchState) { nbCompares = 0; /* in addition to avoiding checking any -@@ -361,16 +363,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, +@@ -361,16 +365,16 @@ 
ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, if (dictMode == ZSTD_dictMatchState && nbCompares) { bestLength = ZSTD_DUBT_findBetterDictMatch( ms, ip, iend, @@ -44136,7 +22991,7 @@ index 0298a01a7504..83727cd46f91 100644 } return bestLength; } -@@ -381,14 +383,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, +@@ -381,14 +385,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, FORCE_INLINE_TEMPLATE size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, @@ -44153,7 +23008,7 @@ index 0298a01a7504..83727cd46f91 100644 } /* ********************************* -@@ -561,7 +563,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb +@@ -561,7 +565,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb /* save best solution */ if (currentMl > ml) { ml = currentMl; @@ -44162,7 +23017,7 @@ index 0298a01a7504..83727cd46f91 100644 if (ip+currentMl == iLimit) { /* best possible, avoids read overflow on next attempt */ return ml; -@@ -598,7 +600,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb +@@ -598,7 +602,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb /* save best solution */ if (currentMl > ml) { ml = currentMl; @@ -44171,7 +23026,43 @@ index 0298a01a7504..83727cd46f91 100644 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } } -@@ -691,7 +693,8 @@ size_t ZSTD_HcFindBestMatch( +@@ -617,7 +621,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, +- const BYTE* ip, U32 const mls) ++ const BYTE* ip, U32 const mls, U32 const lazySkipping) + { + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; +@@ -632,6 +636,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; ++ /* Stop inserting every position when in the lazy skipping mode. 
*/ ++ if (lazySkipping) ++ break; + } + + ms->nextToUpdate = target; +@@ -640,7 +647,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); ++ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); + } + + /* inlining is important to hardwire a hot branch (template emulation) */ +@@ -684,14 +691,15 @@ size_t ZSTD_HcFindBestMatch( + } + + /* HC4 match finder */ +- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); ++ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); + + for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { const BYTE* const match = base + matchIndex; assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ @@ -44181,7 +23072,7 @@ index 0298a01a7504..83727cd46f91 100644 currentMl = ZSTD_count(ip, match, iLimit); } else { const BYTE* const match = dictBase + matchIndex; -@@ -703,7 +706,7 @@ size_t ZSTD_HcFindBestMatch( +@@ -703,7 +711,7 @@ size_t ZSTD_HcFindBestMatch( /* save best solution */ if (currentMl > ml) { ml = currentMl; @@ -44190,7 +23081,7 @@ index 0298a01a7504..83727cd46f91 100644 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } -@@ -739,7 +742,7 @@ size_t ZSTD_HcFindBestMatch( +@@ -739,7 +747,7 @@ size_t ZSTD_HcFindBestMatch( if (currentMl > ml) { ml = currentMl; assert(curr > matchIndex + dmsIndexDelta); @@ -44199,15 +23090,16 @@ index 0298a01a7504..83727cd46f91 100644 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } -@@ -757,7 +760,6 @@ size_t ZSTD_HcFindBestMatch( +@@ -756,8 +764,6 @@ size_t ZSTD_HcFindBestMatch( + * (SIMD) Row-based matchfinder ***********************************/ /* Constants for row-based hash */ - #define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +-#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ -#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ -@@ -769,29 +771,8 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr +@@ -769,64 +775,19 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr * Starting from the LSB, returns the idx of the next non-zero bit. * Basically counting the nb of trailing zeroes. */ @@ -44234,12 +23126,122 @@ index 0298a01a7504..83727cd46f91 100644 - val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); - return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); -# endif +-} +- +-/* ZSTD_rotateRight_*(): +- * Rotates a bitfield to the right by "count" bits. 
+- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts +- */ +-FORCE_INLINE_TEMPLATE +-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { +- assert(count < 64); +- count &= 0x3F; /* for fickle pattern recognition */ +- return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { +- assert(count < 32); +- count &= 0x1F; /* for fickle pattern recognition */ +- return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { +- assert(count < 16); +- count &= 0x0F; /* for fickle pattern recognition */ +- return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); +MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { + return ZSTD_countTrailingZeros64(val); } - /* ZSTD_rotateRight_*(): -@@ -971,7 +952,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { + /* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" +- * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) ++ * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) + */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { +- U32 const next = (*tagRow - 1) & rowMask; +- *tagRow = (BYTE)next; +- return next; ++ U32 next = (*tagRow-1) & rowMask; ++ next += (next == 0) ? rowMask : 0; /* skip first position */ ++ *tagRow = (BYTE)next; ++ return next; + } + + /* ZSTD_isAligned(): +@@ -840,7 +801,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + /* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { ++FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); +@@ -864,13 +825,13 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B + U32 idx, const BYTE* const iLimit) + { + U32 const* const hashTable = ms->hashTable; +- U16 const* const tagTable = ms->tagTable; ++ BYTE const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { +- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; +@@ -886,11 +847,12 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. 
+ */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, +- U16 const* tagTable, BYTE const* base, ++ BYTE const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, +- U32 const rowLog, U32 const mls) ++ U32 const rowLog, U32 const mls, ++ U64 const hashSalt) + { +- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; +@@ -908,22 +870,21 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, + U32 const rowMask, U32 const useCache) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { +- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) +- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) ++ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; +- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. +- Explicit cast allows us to get exact desired position within each row */ ++ BYTE* tagRow = tagTable + relRow; + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + +- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); +- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; ++ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); ++ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = updateStartIdx; + } + } +@@ -971,7 +932,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); @@ -44276,7 +23278,7 @@ index 0298a01a7504..83727cd46f91 100644 } #if defined(ZSTD_ARCH_X86_SSE2) -@@ -994,71 +1003,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U +@@ -994,71 +983,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U } #endif @@ -44285,8 +23287,7 @@ index 0298a01a7504..83727cd46f91 100644 - * Each row is a circular buffer beginning at the value of "head". 
So we must rotate the "matches" bitfield - * to match up with the actual layout of the entries within the hashTable */ +#if defined(ZSTD_ARCH_ARM_NEON) - FORCE_INLINE_TEMPLATE ZSTD_VecMask --ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) ++FORCE_INLINE_TEMPLATE ZSTD_VecMask +ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) +{ + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); @@ -44339,10 +23340,12 @@ index 0298a01a7504..83727cd46f91 100644 + * Each row is a circular buffer beginning at the value of "headGrouped". So we + * must rotate the "matches" bitfield to match up with the actual layout of the + * entries within the hashTable */ -+FORCE_INLINE_TEMPLATE ZSTD_VecMask + FORCE_INLINE_TEMPLATE ZSTD_VecMask +-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) +ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) { - const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; +- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; ++ const BYTE* const src = tagRow; assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); + assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); @@ -44407,7 +23410,7 @@ index 0298a01a7504..83727cd46f91 100644 const size_t shiftAmount = ((chunkSize * 8) - chunkSize); const size_t xFF = ~((size_t)0); const size_t x01 = xFF / 0xFF; -@@ -1091,11 +1111,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, +@@ -1091,11 +1091,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, } matches = ~matches; if (rowEntries == 16) { @@ -44422,15 +23425,56 @@ index 0298a01a7504..83727cd46f91 100644 } } #endif -@@ -1143,6 +1163,7 @@ size_t ZSTD_RowFindBestMatch( +@@ -1125,7 +1125,7 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowLog) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -1143,8 +1143,11 @@ size_t ZSTD_RowFindBestMatch( const U32 rowEntries = (1U << rowLog); const U32 rowMask = rowEntries - 1; const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ + const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); ++ const U64 hashSalt = ms->hashSalt; U32 nbAttempts = 1U << cappedSearchLog; size_t ml=4-1; ++ U32 hash; -@@ -1185,15 +1206,15 @@ size_t ZSTD_RowFindBestMatch( + /* DMS/DDS variables that may be referenced laster */ + const ZSTD_matchState_t* const dms = ms->dictMatchState; +@@ -1168,7 +1171,7 @@ size_t ZSTD_RowFindBestMatch( + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; +- U16* const dmsTagTable = dms->tagTable; ++ BYTE* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; +@@ -1178,23 +1181,34 @@ size_t ZSTD_RowFindBestMatch( + } + + /* Update the hashTable and tagTable up to (but not including) ip */ +- 
ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ if (!ms->lazySkipping) { ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); ++ } else { ++ /* Stop inserting every position when in the lazy skipping mode. ++ * The hash cache is also not kept up to date in this mode. ++ */ ++ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); ++ ms->nextToUpdate = curr; ++ } ++ ms->hashSaltEntropy += hash; /* collect salt entropy */ ++ + { /* Get the hash for ip, compute the appropriate row */ +- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; U32* const row = hashTable + relRow; BYTE* tagRow = (BYTE*)(tagTable + relRow); @@ -44443,13 +23487,32 @@ index 0298a01a7504..83727cd46f91 100644 + ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); /* Cycle through the matches and prefetch */ - for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { - U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { + U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; U32 const matchIndex = row[matchPos]; ++ if(matchPos == 0) continue; assert(numMatches < rowEntries); if (matchIndex < lowLimit) -@@ -1224,7 +1245,8 @@ size_t ZSTD_RowFindBestMatch( + break; +@@ -1204,13 +1218,14 @@ size_t ZSTD_RowFindBestMatch( + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. 
*/ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); +- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; ++ tagRow[pos] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + +@@ -1224,7 +1239,8 @@ size_t ZSTD_RowFindBestMatch( if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { const BYTE* const match = base + matchIndex; assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ @@ -44459,7 +23522,7 @@ index 0298a01a7504..83727cd46f91 100644 currentMl = ZSTD_count(ip, match, iLimit); } else { const BYTE* const match = dictBase + matchIndex; -@@ -1236,7 +1258,7 @@ size_t ZSTD_RowFindBestMatch( +@@ -1236,7 +1252,7 @@ size_t ZSTD_RowFindBestMatch( /* Save best solution */ if (currentMl > ml) { ml = currentMl; @@ -44468,7 +23531,7 @@ index 0298a01a7504..83727cd46f91 100644 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } } -@@ -1254,14 +1276,14 @@ size_t ZSTD_RowFindBestMatch( +@@ -1254,19 +1270,21 @@ size_t ZSTD_RowFindBestMatch( const U32 dmsSize = (U32)(dmsEnd - dmsBase); const U32 dmsIndexDelta = dictLimit - dmsSize; @@ -44480,13 +23543,21 @@ index 0298a01a7504..83727cd46f91 100644 - ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); + ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); - for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { - U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { + U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; U32 const matchIndex = dmsRow[matchPos]; ++ if(matchPos == 0) continue; if (matchIndex < dmsLowestIndex) break; -@@ -1285,7 +1307,7 @@ size_t ZSTD_RowFindBestMatch( + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Return the longest match */ +@@ -1285,7 +1303,7 @@ size_t ZSTD_RowFindBestMatch( if (currentMl > ml) { ml = currentMl; assert(curr > matchIndex + dmsIndexDelta); @@ -44495,7 +23566,7 @@ index 0298a01a7504..83727cd46f91 100644 if (ip+currentMl == iLimit) break; } } -@@ -1491,7 +1513,8 @@ ZSTD_compressBlock_lazy_generic( +@@ -1491,7 +1509,8 @@ ZSTD_compressBlock_lazy_generic( const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); @@ -44505,7 +23576,7 @@ index 0298a01a7504..83727cd46f91 100644 const int isDMS = dictMode == ZSTD_dictMatchState; const int isDDS = dictMode == ZSTD_dedicatedDictSearch; -@@ -1512,8 +1535,8 @@ ZSTD_compressBlock_lazy_generic( +@@ -1512,8 +1531,8 @@ ZSTD_compressBlock_lazy_generic( U32 const curr = (U32)(ip - base); U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); U32 const maxRep = curr - windowLow; @@ -44516,7 +23587,22 @@ index 0298a01a7504..83727cd46f91 100644 } if (isDxS) { /* dictMatchState repCode checks don't currently handle repCode == 0 -@@ -1537,7 +1560,7 @@ ZSTD_compressBlock_lazy_generic( +@@ -1522,10 +1541,11 @@ ZSTD_compressBlock_lazy_generic( + assert(offset_2 <= dictAndPrefixLength); + } + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ 
ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1537,7 +1557,7 @@ ZSTD_compressBlock_lazy_generic( #endif while (ip < ilimit) { size_t matchLength=0; @@ -44525,7 +23611,7 @@ index 0298a01a7504..83727cd46f91 100644 const BYTE* start=ip+1; DEBUGLOG(7, "search baseline (depth 0)"); -@@ -1562,10 +1585,10 @@ ZSTD_compressBlock_lazy_generic( +@@ -1562,14 +1582,23 @@ ZSTD_compressBlock_lazy_generic( } /* first search (depth 0) */ @@ -44539,7 +23625,21 @@ index 0298a01a7504..83727cd46f91 100644 } if (matchLength < 4) { -@@ -1579,12 +1602,12 @@ ZSTD_compressBlock_lazy_generic( +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */; ++ ip += step; ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1579,12 +1608,12 @@ ZSTD_compressBlock_lazy_generic( DEBUGLOG(7, "search depth 1"); ip ++; if ( (dictMode == ZSTD_noDict) @@ -44555,7 +23655,7 @@ index 0298a01a7504..83727cd46f91 100644 } if (isDxS) { const U32 repIndex = (U32)(ip - base) - offset_1; -@@ -1596,17 +1619,17 @@ ZSTD_compressBlock_lazy_generic( +@@ -1596,17 +1625,17 @@ ZSTD_compressBlock_lazy_generic( const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 = (int)(mlRep * 3); @@ -44580,7 +23680,7 @@ index 0298a01a7504..83727cd46f91 100644 continue; /* search a better one */ } } -@@ -1615,12 +1638,12 @@ ZSTD_compressBlock_lazy_generic( +@@ -1615,12 +1644,12 @@ ZSTD_compressBlock_lazy_generic( DEBUGLOG(7, "search depth 2"); ip ++; if ( (dictMode == ZSTD_noDict) @@ -44596,7 +23696,7 @@ index 0298a01a7504..83727cd46f91 100644 } if (isDxS) { const U32 repIndex = (U32)(ip - base) - offset_1; -@@ -1632,17 +1655,17 @@ ZSTD_compressBlock_lazy_generic( +@@ -1632,17 +1661,17 @@ ZSTD_compressBlock_lazy_generic( const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 = (int)(mlRep * 4); @@ -44621,7 +23721,7 @@ index 0298a01a7504..83727cd46f91 100644 continue; } } } break; /* nothing found : store previous solution */ -@@ -1653,24 +1676,24 @@ ZSTD_compressBlock_lazy_generic( +@@ -1653,26 +1682,33 @@ ZSTD_compressBlock_lazy_generic( * notably if `value` is unsigned, resulting in a large positive `-value`. */ /* catch up */ @@ -44651,8 +23751,17 @@ index 0298a01a7504..83727cd46f91 100644 + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); anchor = ip = start + matchLength; } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. 
*/ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } -@@ -1686,8 +1709,8 @@ ZSTD_compressBlock_lazy_generic( + /* check immediate repcode */ + if (isDxS) { +@@ -1686,8 +1722,8 @@ ZSTD_compressBlock_lazy_generic( && (MEM_read32(repMatch) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; @@ -44663,7 +23772,7 @@ index 0298a01a7504..83727cd46f91 100644 ip += matchLength; anchor = ip; continue; -@@ -1701,16 +1724,20 @@ ZSTD_compressBlock_lazy_generic( +@@ -1701,16 +1737,20 @@ ZSTD_compressBlock_lazy_generic( && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { /* store sequence */ matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; @@ -44689,7 +23798,24 @@ index 0298a01a7504..83727cd46f91 100644 /* Return the last literals size */ return (size_t)(iend - anchor); -@@ -1903,7 +1930,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( +@@ -1886,12 +1926,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + /* init */ + ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1903,7 +1944,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( #endif while (ip < ilimit) { size_t matchLength=0; @@ -44698,7 +23824,7 @@ index 0298a01a7504..83727cd46f91 100644 const BYTE* start=ip+1; U32 curr = (U32)(ip-base); -@@ -1922,10 +1949,10 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( +@@ -1922,14 +1963,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( } } /* first search (depth 0) */ @@ -44712,7 +23838,21 @@ index 0298a01a7504..83727cd46f91 100644 } if (matchLength < 4) { -@@ -1939,7 +1966,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); ++ ip += step + 1; /* jump faster over incompressible sections */ ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1939,7 +1989,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( ip ++; curr++; /* check repCode */ @@ -44721,7 +23861,7 @@ index 0298a01a7504..83727cd46f91 100644 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); const U32 repIndex = (U32)(curr - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; -@@ -1951,18 +1978,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( +@@ -1951,18 +2001,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 = (int)(repLength * 3); @@ -44747,7 +23887,7 @@ index 0298a01a7504..83727cd46f91 100644 continue; /* search a better one */ } } -@@ -1971,7 +1998,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( +@@ -1971,7 +2021,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( ip ++; curr++; /* check repCode */ @@ -44756,7 +23896,7 @@ index 0298a01a7504..83727cd46f91 100644 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); const U32 repIndex = (U32)(curr - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; -@@ -1983,36 +2010,36 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( +@@ -1983,38 +2033,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 = (int)(repLength * 4); @@ -44803,8 +23943,17 @@ index 0298a01a7504..83727cd46f91 100644 + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); anchor = ip = start + matchLength; } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } -@@ -2029,8 +2056,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* check immediate repcode */ + while (ip <= ilimit) { +@@ -2029,8 +2086,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( /* repcode detected we should take it */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; @@ -44815,7 +23964,7 @@ index 0298a01a7504..83727cd46f91 100644 ip += matchLength; anchor = ip; continue; /* faster when present ... (?) */ -@@ -2096,7 +2123,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row( +@@ -2096,7 +2153,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row( size_t ZSTD_compressBlock_lazy2_extDict_row( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) @@ -44922,7 +24071,7 @@ index 647f865be290..cfccfc46f6f7 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c -index fd82acfda62f..a6bf7f856437 100644 +index fd82acfda62f..1e41cb04f482 100644 --- a/lib/zstd/compress/zstd_opt.c +++ b/lib/zstd/compress/zstd_opt.c @@ -1,5 +1,6 @@ @@ -45281,7 +24430,16 @@ index fd82acfda62f..a6bf7f856437 100644 (*nbMatches)++; } } -@@ -1098,14 +1123,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, +@@ -1062,6 +1087,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + ZSTD_optimal_t lastSequence; + ZSTD_optLdm_t optLdm; + ++ ZSTD_memset(&lastSequence, 0, sizeof(ZSTD_optimal_t)); ++ + optLdm.seqStore = ms->ldmSeqStore ? 
*ms->ldmSeqStore : kNullRawSeqStore; + optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; + ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); +@@ -1098,14 +1125,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, /* large match -> immediate encoding */ { U32 const maxML = matches[nbMatches-1].len; @@ -45300,7 +24458,7 @@ index fd82acfda62f..a6bf7f856437 100644 DEBUGLOG(6, "large match (%u>%u), immediate encoding", maxML, sufficient_len); cur = 0; -@@ -1122,15 +1147,15 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, +@@ -1122,15 +1149,15 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ } for (matchNb = 0; matchNb < nbMatches; matchNb++) { @@ -45320,7 +24478,7 @@ index fd82acfda62f..a6bf7f856437 100644 opt[pos].litlen = litlen; opt[pos].price = (int)sequencePrice; } } -@@ -1230,7 +1255,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, +@@ -1230,7 +1257,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch; U32 mlen; @@ -45329,7 +24487,7 @@ index fd82acfda62f..a6bf7f856437 100644 matchNb, matches[matchNb].off, lastML, litlen); for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ -@@ -1296,7 +1321,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, +@@ -1296,7 +1323,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, for (storePos=storeStart; storePos <= storeEnd; storePos++) { U32 const llen = opt[storePos].litlen; U32 const mlen = opt[storePos].mlen; @@ -45338,7 +24496,7 @@ index fd82acfda62f..a6bf7f856437 100644 U32 const advance = llen + mlen; DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", anchor - istart, (unsigned)llen, (unsigned)mlen); -@@ -1308,8 +1333,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, +@@ -1308,8 +1335,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, } assert(anchor + llen <= iend); @@ -45349,7 +24507,7 @@ index fd82acfda62f..a6bf7f856437 100644 anchor += advance; ip = anchor; } } -@@ -1349,7 +1374,7 @@ size_t ZSTD_compressBlock_btopt( +@@ -1349,7 +1376,7 @@ size_t ZSTD_compressBlock_btopt( /* ZSTD_initStats_ultra(): * make a first compression pass, just to seed stats with more accurate starting values. * only works on first block, with no dictionary and no ldm. 
@@ -45358,7 +24516,7 @@ index fd82acfda62f..a6bf7f856437 100644 */ static void ZSTD_initStats_ultra(ZSTD_matchState_t* ms, -@@ -1368,7 +1393,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, +@@ -1368,7 +1395,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ @@ -45367,7 +24525,7 @@ index fd82acfda62f..a6bf7f856437 100644 ZSTD_resetSeqStore(seqStore); ms->window.base -= srcSize; ms->window.dictLimit += (U32)srcSize; -@@ -1392,20 +1417,20 @@ size_t ZSTD_compressBlock_btultra2( +@@ -1392,20 +1419,20 @@ size_t ZSTD_compressBlock_btultra2( U32 const curr = (U32)((const BYTE*)src - ms->window.base); DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); @@ -45407,7 +24565,7 @@ index 22b862858ba7..faa73ff4b03d 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c -index 60958afebc41..28a036f7543b 100644 +index 60958afebc41..d172e35fbd9a 100644 --- a/lib/zstd/decompress/huf_decompress.c +++ b/lib/zstd/decompress/huf_decompress.c @@ -1,7 +1,8 @@ @@ -45779,7 +24937,7 @@ index 60958afebc41..28a036f7543b 100644 + + /* Copy the arguments to local variables */ + ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); -+ ZSTD_memcpy(&ip, &args->ip, sizeof(ip)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); + ZSTD_memcpy(&op, &args->op, sizeof(op)); + + assert(MEM_isLittleEndian()); @@ -45863,7 +25021,7 @@ index 60958afebc41..28a036f7543b 100644 -static HUF_ASM_X86_64_BMI2_ATTRS + /* Save the final values of each of the state variables back to args. */ + ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); -+ ZSTD_memcpy(&args->ip, &ip, sizeof(ip)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); + ZSTD_memcpy(&args->op, &op, sizeof(op)); +} + @@ -46128,7 +25286,7 @@ index 60958afebc41..28a036f7543b 100644 + + /* Copy the arguments to local registers. */ + ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); -+ ZSTD_memcpy(&ip, &args->ip, sizeof(ip)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); + ZSTD_memcpy(&op, &args->op, sizeof(op)); + + oend[0] = op[1]; @@ -46251,7 +25409,7 @@ index 60958afebc41..28a036f7543b 100644 + + /* Save the final values of each of the state variables back to args. */ + ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); -+ ZSTD_memcpy(&args->ip, &ip, sizeof(ip)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); + ZSTD_memcpy(&args->op, &op, sizeof(op)); +} + @@ -46598,7 +25756,7 @@ index 60958afebc41..28a036f7543b 100644 } - diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c -index dbbc7919de53..4f801e0dd564 100644 +index dbbc7919de53..30ef65e1ab5c 100644 --- a/lib/zstd/decompress/zstd_ddict.c +++ b/lib/zstd/decompress/zstd_ddict.c @@ -1,5 +1,6 @@ @@ -46609,7 +25767,13 @@ index dbbc7919de53..4f801e0dd564 100644 * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the -@@ -19,7 +20,6 @@ +@@ -14,12 +15,12 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ + #include "../common/cpu.h" /* bmi2 */ #include "../common/mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "../common/fse.h" @@ -46617,7 +25781,7 @@ index dbbc7919de53..4f801e0dd564 100644 #include "../common/huf.h" #include "zstd_decompress_internal.h" #include "zstd_ddict.h" -@@ -131,7 +131,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, +@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, ZSTD_memcpy(internalBuffer, dict, dictSize); } ddict->dictSize = dictSize; @@ -46626,7 +25790,7 @@ index dbbc7919de53..4f801e0dd564 100644 /* parse dictionary content */ FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); -@@ -237,5 +237,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) +@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) { if (ddict==NULL) return 0; @@ -46646,7 +25810,7 @@ index 8c1a79d666f8..de459a0dacd1 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c -index b9b935a9f5c0..d7eebb17a2c5 100644 +index 6b3177c94711..5e2a3ef03732 100644 --- a/lib/zstd/decompress/zstd_decompress.c +++ b/lib/zstd/decompress/zstd_decompress.c @@ -1,5 +1,6 @@ @@ -46657,7 +25821,12 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the -@@ -56,13 +57,13 @@ +@@ -52,17 +53,18 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ #include "../common/mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "../common/fse.h" @@ -46672,7 +25841,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 -@@ -72,11 +73,11 @@ +@@ -72,11 +74,11 @@ *************************************/ #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 @@ -46689,7 +25858,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 #define DDICT_HASHSET_TABLE_BASE_SIZE 64 #define DDICT_HASHSET_RESIZE_FACTOR 2 -@@ -237,6 +238,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) +@@ -237,6 +239,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) dctx->outBufferMode = ZSTD_bm_buffered; dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; @@ -46697,7 +25866,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 } static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) -@@ -421,16 +423,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) +@@ -421,16 +424,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless * @return : 0, `zfhPtr` is correctly filled, * >0, `srcSize` is too small, value is wanted `srcSize` amount, @@ -46742,7 +25911,119 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 if ( (format != ZSTD_f_zstd1_magicless) && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { -@@ -730,10 +756,11 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize +@@ -540,49 +567,52 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); +- { +- size_t const skippableSize = skippableHeaderSize + sizeU32; ++ { size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } + } + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves content of a skippable frame, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * +- * Returns an error if destination buffer is not large enough, or if the frame is not skippable. ++ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. + * + * @return : number of bytes written or a ZSTD error. 
+ */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize) ++size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, /* optional, can be NULL */ ++ const void* src, size_t srcSize) + { +- U32 const magicNumber = MEM_readLE32(src); +- size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); +- size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; +- +- /* check input validity */ +- RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); +- RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); +- RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + +- /* deliver payload */ +- if (skippableContentSize > 0 && dst != NULL) +- ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); +- if (magicVariant != NULL) +- *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; +- return skippableContentSize; ++ { U32 const magicNumber = MEM_readLE32(src); ++ size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); ++ size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; ++ ++ /* check input validity */ ++ RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); ++ RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); ++ RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ ++ /* deliver payload */ ++ if (skippableContentSize > 0 && dst != NULL) ++ ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); ++ if (magicVariant != NULL) ++ *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; ++ return skippableContentSize; ++ } + } + + /* ZSTD_findDecompressedSize() : +- * compatible with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames +- * @return : decompressed size of the frames contained */ ++ * note: compatible with legacy mode ++ * @return : decompressed size of the frames contained */ + unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + { + unsigned long long totalDstSize = 0; +@@ -592,9 +622,7 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- if (ZSTD_isError(skippableSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; +@@ -602,17 +630,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + continue; + } + +- { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); +- if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; ++ { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); ++ if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; + +- /* check for overflow */ +- if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; +- totalDstSize += ret; ++ if (totalDstSize + fcs < totalDstSize) ++ return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ ++ totalDstSize += fcs; + } ++ /* skip 
to next frame */ + { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); +- if (ZSTD_isError(frameSrcSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; ++ assert(frameSrcSize <= srcSize); + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; +@@ -730,10 +758,11 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize ip += 4; } @@ -46755,7 +26036,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 return frameSizeInfo; } } -@@ -773,6 +800,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) +@@ -773,6 +802,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) return bound; } @@ -46804,61 +26085,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 /*-************************************************************* * Frame decoding -@@ -798,7 +867,7 @@ static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity, - if (srcSize == 0) return 0; - RETURN_ERROR(dstBuffer_null, ""); - } -- ZSTD_memcpy(dst, src, srcSize); -+ ZSTD_memmove(dst, src, srcSize); - return srcSize; - } - -@@ -858,6 +927,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, - - /* Loop on each block */ - while (1) { -+ BYTE* oBlockEnd = oend; - size_t decodedSize; - blockProperties_t blockProperties; - size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSrcSize, &blockProperties); -@@ -867,16 +937,34 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, - remainingSrcSize -= ZSTD_blockHeaderSize; - RETURN_ERROR_IF(cBlockSize > remainingSrcSize, srcSize_wrong, ""); - -+ if (ip >= op && ip < oBlockEnd) { -+ /* We are decompressing in-place. Limit the output pointer so that we -+ * don't overwrite the block that we are currently reading. This will -+ * fail decompression if the input & output pointers aren't spaced -+ * far enough apart. -+ * -+ * This is important to set, even when the pointers are far enough -+ * apart, because ZSTD_decompressBlock_internal() can decide to store -+ * literals in the output buffer, after the block it is decompressing. -+ * Since we don't want anything to overwrite our input, we have to tell -+ * ZSTD_decompressBlock_internal to never write past ip. -+ * -+ * See ZSTD_allocateLiteralsBuffer() for reference. -+ */ -+ oBlockEnd = op + (ip - op); -+ } -+ - switch(blockProperties.blockType) - { - case bt_compressed: -- decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oend-op), ip, cBlockSize, /* frame */ 1, not_streaming); -+ decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming); - break; - case bt_raw : -+ /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. 
*/ - decodedSize = ZSTD_copyRawBlock(op, (size_t)(oend-op), ip, cBlockSize); - break; - case bt_rle : -- decodedSize = ZSTD_setRleBlock(op, (size_t)(oend-op), *ip, blockProperties.origSize); -+ decodedSize = ZSTD_setRleBlock(op, (size_t)(oBlockEnd-op), *ip, blockProperties.origSize); - break; - case bt_reserved : - default: -@@ -911,6 +999,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, +@@ -930,6 +1001,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, } ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); /* Allow caller to get size read */ @@ -46866,7 +26093,31 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 *srcPtr = ip; *srcSizePtr = remainingSrcSize; return (size_t)(op-ostart); -@@ -1042,8 +1131,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr +@@ -955,17 +1027,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + + +- { U32 const magicNumber = MEM_readLE32(src); +- DEBUGLOG(4, "reading magic number %08X (expecting %08X)", +- (unsigned)magicNumber, ZSTD_MAGICNUMBER); ++ if (srcSize >= 4) { ++ U32 const magicNumber = MEM_readLE32(src); ++ DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { ++ /* skippable frame detected : skip it */ + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); ++ FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; +- continue; ++ continue; /* check next frame */ + } } + + if (ddict) { +@@ -1061,8 +1134,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } /* @@ -46877,7 +26128,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 * be streamed. * * For blocks that can be streamed, this allows us to reduce the latency until we produce -@@ -1243,7 +1332,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c +@@ -1262,7 +1335,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c default: assert(0); /* impossible */ @@ -46886,7 +26137,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 } } -@@ -1284,11 +1373,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, +@@ -1303,11 +1376,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, /* in minimal huffman, we always use X1 variants */ size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, dictPtr, dictEnd - dictPtr, @@ -46900,7 +26151,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 #endif RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); dictPtr += hSize; -@@ -1384,7 +1473,7 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) +@@ -1403,7 +1476,7 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) dctx->prefixStart = NULL; dctx->virtualStart = NULL; dctx->dictEnd = NULL; @@ -46909,7 +26160,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 dctx->litEntropy = dctx->fseEntropy = 0; dctx->dictID = 0; dctx->bType = bt_reserved; -@@ -1446,7 +1535,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) +@@ -1465,7 +1538,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) * This could for one of the following reasons : * - The frame does not require a dictionary (most common case). 
* - The frame was built with dictID intentionally removed. @@ -46918,7 +26169,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 * Note : this use case also happens when using a non-conformant dictionary. * - `srcSize` is too small, and as a result, frame header could not be decoded. * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. -@@ -1455,7 +1544,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) +@@ -1474,7 +1547,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) * ZSTD_getFrameHeader(), which will provide a more precise error code. */ unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) { @@ -46927,7 +26178,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); if (ZSTD_isError(hError)) return 0; return zfp.dictID; -@@ -1562,7 +1651,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di +@@ -1581,7 +1654,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di size_t ZSTD_initDStream(ZSTD_DStream* zds) { DEBUGLOG(4, "ZSTD_initDStream"); @@ -46938,7 +26189,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 } /* ZSTD_initDStream_usingDDict() : -@@ -1570,20 +1661,12 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) +@@ -1589,20 +1664,12 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) * this function cannot fail */ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) { @@ -46960,7 +26211,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) { -@@ -1651,6 +1734,11 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) +@@ -1670,6 +1737,11 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; return bounds; @@ -46972,7 +26223,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 default:; } bounds.error = ERROR(parameter_unsupported); -@@ -1691,6 +1779,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value +@@ -1710,6 +1782,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value case ZSTD_d_refMultipleDDicts: *value = (int)dctx->refMultipleDDicts; return 0; @@ -46982,7 +26233,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 default:; } RETURN_ERROR(parameter_unsupported, ""); -@@ -1724,6 +1815,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value +@@ -1743,6 +1818,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value } dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; return 0; @@ -46993,7 +26244,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 default:; } RETURN_ERROR(parameter_unsupported, ""); -@@ -1899,7 +1994,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -1918,7 +1997,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB if (zds->refMultipleDDicts && zds->ddictSet) { ZSTD_DCtx_selectFrameDDict(zds); } @@ -47001,7 +26252,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 if (ZSTD_isError(hSize)) { return hSize; /* error */ } -@@ -1913,6 +2007,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -1932,6 +2010,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB zds->lhSize += remainingInput; } input->pos = input->size; @@ -47013,7 +26264,7 @@ index 
b9b935a9f5c0..d7eebb17a2c5 100644 return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ } assert(ip != NULL); -@@ -1930,8 +2029,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -1949,8 +2032,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); if (ZSTD_isError(decompressedSize)) return decompressedSize; DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") @@ -47024,7 +26275,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 zds->expected = 0; zds->streamStage = zdss_init; someMoreWork = 0; -@@ -2015,6 +2115,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -2034,6 +2118,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB } if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); @@ -47032,7 +26283,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 ip += neededInSize; /* Function modifies the stage so we must break */ break; -@@ -2029,7 +2130,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -2048,7 +2133,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB int const isSkipFrame = ZSTD_isSkipFrame(zds); size_t loadedSize; /* At this point we shouldn't be decompressing a block that we can stream. */ @@ -47041,7 +26292,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 if (isSkipFrame) { loadedSize = MIN(toLoad, (size_t)(iend-ip)); } else { -@@ -2038,8 +2139,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -2057,8 +2142,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB "should never happen"); loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); } @@ -47055,7 +26306,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ /* decode loaded input */ -@@ -2049,14 +2153,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -2068,14 +2156,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB break; } case zdss_flush: @@ -47076,7 +26327,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", (int)(zds->outBuffSize - zds->outStart), (U32)zds->fParams.blockSizeMax); -@@ -2070,7 +2177,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -2089,7 +2180,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB default: assert(0); /* impossible */ @@ -47085,7 +26336,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 } } /* result */ -@@ -2083,8 +2190,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB +@@ -2102,8 +2193,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB if ((ip==istart) && (op==ostart)) { /* no forward progress */ zds->noForwardProgress ++; if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { @@ -47096,7 +26347,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 assert(0); } } else { -@@ -2121,11 +2228,17 @@ 
size_t ZSTD_decompressStream_simpleArgs ( +@@ -2140,11 +2231,17 @@ size_t ZSTD_decompressStream_simpleArgs ( void* dst, size_t dstCapacity, size_t* dstPos, const void* src, size_t srcSize, size_t* srcPos) { @@ -47122,7 +26373,7 @@ index b9b935a9f5c0..d7eebb17a2c5 100644 + } } diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c -index c1913b8e7c89..ffbe53ba0346 100644 +index c1913b8e7c89..9f5577e5bc19 100644 --- a/lib/zstd/decompress/zstd_decompress_block.c +++ b/lib/zstd/decompress/zstd_decompress_block.c @@ -1,5 +1,6 @@ @@ -47337,7 +26588,7 @@ index c1913b8e7c89..ffbe53ba0346 100644 seq.matchLength = mlDInfo->baseValue; seq.litLength = llDInfo->baseValue; { U32 const ofBase = ofDInfo->baseValue; -@@ -1186,9 +1221,13 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) +@@ -1186,28 +1221,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) U32 const llnbBits = llDInfo->nbBits; U32 const mlnbBits = mlDInfo->nbBits; U32 const ofnbBits = ofDInfo->nbBits; @@ -47352,8 +26603,13 @@ index c1913b8e7c89..ffbe53ba0346 100644 * performance. */ -@@ -1201,13 +1240,16 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) - #endif + /* sequence */ + { size_t offset; +- #if defined(__clang__) +- if (LIKELY(ofBits > 1)) { +- #else + if (ofBits > 1) { +- #endif ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); - assert(ofBits <= MaxOff); @@ -47373,7 +26629,31 @@ index c1913b8e7c89..ffbe53ba0346 100644 } else { offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); -@@ -1552,7 +1594,7 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, +@@ -1232,11 +1270,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + seq.offset = offset; + } + +- #if defined(__clang__) +- if (UNLIKELY(mlBits > 0)) +- #else + if (mlBits > 0) +- #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) +@@ -1246,11 +1280,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + +- #if defined(__clang__) +- if (UNLIKELY(llBits > 0)) +- #else + if (llBits > 0) +- #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) +@@ -1552,7 +1582,7 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); @@ -47382,12 +26662,12 @@ index c1913b8e7c89..ffbe53ba0346 100644 (void)frame; /* Regen sequences */ -@@ -1945,34 +1987,79 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, +@@ -1945,34 +1975,79 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ +/* -+ * @returns The total size of the history referencable by zstd, including ++ * @returns The total size of the history referenceable by zstd, including + * both the prefix and the extDict. At @p op any offset larger than this + * is invalid. 
+ */ @@ -47395,15 +26675,15 @@ index c1913b8e7c89..ffbe53ba0346 100644 +{ + return (size_t)(op - virtualStart); +} - --#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ -- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) --/* ZSTD_getLongOffsetsShare() : ++ +typedef struct { + unsigned longOffsetShare; + unsigned maxNbAdditionalBits; +} ZSTD_OffsetInfo; -+ + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +-/* ZSTD_getLongOffsetsShare() : +/* ZSTD_getOffsetInfo() : * condition : offTable must be valid * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) @@ -47482,7 +26762,7 @@ index c1913b8e7c89..ffbe53ba0346 100644 size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, -@@ -1980,20 +2067,21 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, +@@ -1980,20 +2055,21 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, const void* src, size_t srcSize, const int frame, const streaming_operation streaming) { /* blockType == blockCompressed */ const BYTE* ip = (const BYTE*)src; @@ -47513,7 +26793,7 @@ index c1913b8e7c89..ffbe53ba0346 100644 if (ZSTD_isError(litCSize)) return litCSize; ip += litCSize; srcSize -= litCSize; -@@ -2001,6 +2089,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, +@@ -2001,6 +2077,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, /* Build Decoding Tables */ { @@ -47537,7 +26817,7 @@ index c1913b8e7c89..ffbe53ba0346 100644 /* These macros control at build-time which decompressor implementation * we use. If neither is defined, we do some inspection and dispatch at * runtime. -@@ -2008,6 +2113,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, +@@ -2008,6 +2101,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) int usePrefetchDecoder = dctx->ddictIsCold; @@ -47549,9 +26829,14 @@ index c1913b8e7c89..ffbe53ba0346 100644 #endif int nbSeq; size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); -@@ -2017,26 +2127,38 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, +@@ -2015,28 +2113,42 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + ip += seqHSize; + srcSize -= seqHSize; - RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, ++ "invalid dst"); -#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ - !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) @@ -47598,8 +26883,34 @@ index c1913b8e7c89..ffbe53ba0346 100644 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG /* else */ +@@ -2060,9 +2172,9 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + } + + +-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t dSize; + ZSTD_checkContinuity(dctx, dst, dstCapacity); +@@ -2070,3 +2182,12 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; + } ++ ++ ++/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ ++size_t ZSTD_decompressBlock(ZSTD_DCtx* 
dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); ++} diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h -index 3d2d57a5d25a..e372f048d186 100644 +index 3d2d57a5d25a..5888e6cc788b 100644 --- a/lib/zstd/decompress/zstd_decompress_block.h +++ b/lib/zstd/decompress/zstd_decompress_block.h @@ -1,5 +1,6 @@ @@ -47610,6 +26921,17 @@ index 3d2d57a5d25a..e372f048d186 100644 * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the +@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + unsigned tableLog, void* wksp, size_t wkspSize, + int bmi2); + ++/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */ ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ + + #endif /* ZSTD_DEC_BLOCK_H */ diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h index 98102edb6a83..32f79fb2873d 100644 --- a/lib/zstd/decompress/zstd_decompress_internal.h @@ -47658,7 +26980,7 @@ index a06ca187aab5..8a47eb2a4514 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c -index 22686e367e6f..2fead39eb743 100644 +index 22686e367e6f..466828e35752 100644 --- a/lib/zstd/zstd_common_module.c +++ b/lib/zstd/zstd_common_module.c @@ -1,6 +1,6 @@ @@ -47669,6 +26991,16 @@ index 22686e367e6f..2fead39eb743 100644 * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the +@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp); + EXPORT_SYMBOL_GPL(ZSTD_isError); + EXPORT_SYMBOL_GPL(ZSTD_getErrorName); + EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); +-EXPORT_SYMBOL_GPL(ZSTD_customMalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customCalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customFree); + + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("Zstd Common"); diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c index 04e1b5c01d9b..8ecf43226af2 100644 --- a/lib/zstd/zstd_compress_module.c @@ -47703,3518 +27035,4 @@ index f4ed952ed485..7d31518e9d5a 100644 EXPORT_SYMBOL(zstd_reset_dstream); -- -2.40.0.rc2 - -From 16b77e5461b5cc96bf4476bde0fee2ecc25aca83 Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Fri, 10 Mar 2023 19:28:54 +0100 -Subject: [PATCH 16/16] v4l2-core: add v4l2loopback - -Signed-off-by: Peter Jung ---- - drivers/media/v4l2-core/Kconfig | 5 + - drivers/media/v4l2-core/Makefile | 2 + - drivers/media/v4l2-core/v4l2loopback.c | 2906 +++++++++++++++++ - drivers/media/v4l2-core/v4l2loopback.h | 96 + - .../media/v4l2-core/v4l2loopback_formats.h | 445 +++ - 5 files changed, 3454 insertions(+) - create mode 100644 drivers/media/v4l2-core/v4l2loopback.c - create mode 100644 drivers/media/v4l2-core/v4l2loopback.h - create mode 100644 drivers/media/v4l2-core/v4l2loopback_formats.h - -diff --git a/drivers/media/v4l2-core/Kconfig b/drivers/media/v4l2-core/Kconfig -index 348559bc2468..32a46fcc751f 100644 ---- a/drivers/media/v4l2-core/Kconfig -+++ b/drivers/media/v4l2-core/Kconfig -@@ -40,6 +40,11 @@ config VIDEO_TUNER - config V4L2_JPEG_HELPER - tristate - -+config V4L2_LOOPBACK -+ tristate "V4L2 loopback device" -+ help -+ V4L2 loopback device -+ - # Used by drivers that need v4l2-h264.ko - config V4L2_H264 - tristate -diff 
--git a/drivers/media/v4l2-core/Makefile b/drivers/media/v4l2-core/Makefile -index 41d91bd10cf2..4de37a844f95 100644 ---- a/drivers/media/v4l2-core/Makefile -+++ b/drivers/media/v4l2-core/Makefile -@@ -32,6 +32,8 @@ obj-$(CONFIG_V4L2_JPEG_HELPER) += v4l2-jpeg.o - obj-$(CONFIG_V4L2_MEM2MEM_DEV) += v4l2-mem2mem.o - obj-$(CONFIG_V4L2_VP9) += v4l2-vp9.o - -+obj-$(CONFIG_V4L2_LOOPBACK) += v4l2loopback.o -+ - obj-$(CONFIG_VIDEOBUF_DMA_CONTIG) += videobuf-dma-contig.o - obj-$(CONFIG_VIDEOBUF_DMA_SG) += videobuf-dma-sg.o - obj-$(CONFIG_VIDEOBUF_GEN) += videobuf-core.o -diff --git a/drivers/media/v4l2-core/v4l2loopback.c b/drivers/media/v4l2-core/v4l2loopback.c -new file mode 100644 -index 000000000000..2ab1f760cfb5 ---- /dev/null -+++ b/drivers/media/v4l2-core/v4l2loopback.c -@@ -0,0 +1,2906 @@ -+/* -*- c-file-style: "linux" -*- */ -+/* -+ * v4l2loopback.c -- video4linux2 loopback driver -+ * -+ * Copyright (C) 2005-2009 Vasily Levin (vasaka@gmail.com) -+ * Copyright (C) 2010-2019 IOhannes m zmoelnig (zmoelnig@iem.at) -+ * Copyright (C) 2011 Stefan Diewald (stefan.diewald@mytum.de) -+ * Copyright (C) 2012 Anton Novikov (random.plant@gmail.com) -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include "v4l2loopback.h" -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 6, 1) -+#define kstrtoul strict_strtoul -+#endif -+ -+#if defined(timer_setup) && defined(from_timer) -+#define HAVE_TIMER_SETUP -+#endif -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) -+#define VFL_TYPE_VIDEO VFL_TYPE_GRABBER -+#endif -+ -+#define V4L2LOOPBACK_VERSION_CODE \ -+ KERNEL_VERSION(V4L2LOOPBACK_VERSION_MAJOR, V4L2LOOPBACK_VERSION_MINOR, \ -+ V4L2LOOPBACK_VERSION_BUGFIX) -+ -+MODULE_DESCRIPTION("V4L2 loopback video device"); -+MODULE_AUTHOR("Vasily Levin, " -+ "IOhannes m zmoelnig ," -+ "Stefan Diewald," -+ "Anton Novikov" -+ "et al."); -+#ifdef SNAPSHOT_VERSION -+MODULE_VERSION(__stringify(SNAPSHOT_VERSION)); -+#else -+MODULE_VERSION("" __stringify(V4L2LOOPBACK_VERSION_MAJOR) "." __stringify( -+ V4L2LOOPBACK_VERSION_MINOR) "." __stringify(V4L2LOOPBACK_VERSION_BUGFIX)); -+#endif -+MODULE_LICENSE("GPL"); -+ -+/* -+ * helpers -+ */ -+#define dprintk(fmt, args...) \ -+ do { \ -+ if (debug > 0) { \ -+ printk(KERN_INFO "v4l2-loopback[" __stringify( \ -+ __LINE__) "], pid(%d): " fmt, \ -+ task_pid_nr(current), ##args); \ -+ } \ -+ } while (0) -+ -+#define MARK() \ -+ do { \ -+ if (debug > 1) { \ -+ printk(KERN_INFO "%s:%d[%s], pid(%d)\n", __FILE__, \ -+ __LINE__, __func__, task_pid_nr(current)); \ -+ } \ -+ } while (0) -+ -+#define dprintkrw(fmt, args...) \ -+ do { \ -+ if (debug > 2) { \ -+ printk(KERN_INFO "v4l2-loopback[" __stringify( \ -+ __LINE__) "], pid(%d): " fmt, \ -+ task_pid_nr(current), ##args); \ -+ } \ -+ } while (0) -+ -+/* TODO: Make sure that function is never interrupted. 
*/ -+static inline int mod_inc(int *number, int mod) -+{ -+ int result; -+ result = (*number + 1) % mod; -+ if (unlikely(result < 0)) -+ result += mod; -+ *number = result; -+ return result; -+} -+ -+static inline void v4l2l_get_timestamp(struct v4l2_buffer *b) -+{ -+ /* ktime_get_ts is considered deprecated, so use ktime_get_ts64 if possible */ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 17, 0) -+ struct timespec ts; -+ ktime_get_ts(&ts); -+#else -+ struct timespec64 ts; -+ ktime_get_ts64(&ts); -+#endif -+ -+ b->timestamp.tv_sec = ts.tv_sec; -+ b->timestamp.tv_usec = (ts.tv_nsec / NSEC_PER_USEC); -+} -+ -+#if !defined(__poll_t) -+typedef unsigned __poll_t; -+#endif -+ -+/* module constants -+ * can be overridden during he build process using something like -+ * make KCPPFLAGS="-DMAX_DEVICES=100" -+ */ -+ -+/* maximum number of v4l2loopback devices that can be created */ -+#ifndef MAX_DEVICES -+#define MAX_DEVICES 8 -+#endif -+ -+/* whether the default is to announce capabilities exclusively or not */ -+#ifndef V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS -+#define V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS 0 -+#endif -+ -+/* when a producer is considered to have gone stale */ -+#ifndef MAX_TIMEOUT -+#define MAX_TIMEOUT (100 * 1000) /* in msecs */ -+#endif -+ -+/* max buffers that can be mapped, actually they -+ * are all mapped to max_buffers buffers */ -+#ifndef MAX_BUFFERS -+#define MAX_BUFFERS 32 -+#endif -+ -+/* module parameters */ -+static int debug = 0; -+module_param(debug, int, S_IRUGO | S_IWUSR); -+MODULE_PARM_DESC(debug, "debugging level (higher values == more verbose)"); -+ -+#define V4L2LOOPBACK_DEFAULT_MAX_BUFFERS 2 -+static int max_buffers = V4L2LOOPBACK_DEFAULT_MAX_BUFFERS; -+module_param(max_buffers, int, S_IRUGO); -+MODULE_PARM_DESC(max_buffers, -+ "how many buffers should be allocated [DEFAULT: " __stringify( -+ V4L2LOOPBACK_DEFAULT_MAX_BUFFERS) "]"); -+ -+/* how many times a device can be opened -+ * the per-module default value can be overridden on a per-device basis using -+ * the /sys/devices interface -+ * -+ * note that max_openers should be at least 2 in order to get a working system: -+ * one opener for the producer and one opener for the consumer -+ * however, we leave that to the user -+ */ -+#define V4L2LOOPBACK_DEFAULT_MAX_OPENERS 10 -+static int max_openers = V4L2LOOPBACK_DEFAULT_MAX_OPENERS; -+module_param(max_openers, int, S_IRUGO | S_IWUSR); -+MODULE_PARM_DESC( -+ max_openers, -+ "how many users can open the loopback device [DEFAULT: " __stringify( -+ V4L2LOOPBACK_DEFAULT_MAX_OPENERS) "]"); -+ -+static int devices = -1; -+module_param(devices, int, 0); -+MODULE_PARM_DESC(devices, "how many devices should be created"); -+ -+static int video_nr[MAX_DEVICES] = { [0 ...(MAX_DEVICES - 1)] = -1 }; -+module_param_array(video_nr, int, NULL, 0444); -+MODULE_PARM_DESC(video_nr, -+ "video device numbers (-1=auto, 0=/dev/video0, etc.)"); -+ -+static char *card_label[MAX_DEVICES]; -+module_param_array(card_label, charp, NULL, 0000); -+MODULE_PARM_DESC(card_label, "card labels for each device"); -+ -+static bool exclusive_caps[MAX_DEVICES] = { -+ [0 ...(MAX_DEVICES - 1)] = V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS -+}; -+module_param_array(exclusive_caps, bool, NULL, 0444); -+/* FIXXME: wording */ -+MODULE_PARM_DESC( -+ exclusive_caps, -+ "whether to announce OUTPUT/CAPTURE capabilities exclusively or not [DEFAULT: " __stringify( -+ V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS) "]"); -+ -+/* format specifications */ -+#define V4L2LOOPBACK_SIZE_MIN_WIDTH 48 -+#define V4L2LOOPBACK_SIZE_MIN_HEIGHT 32 
-+#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH 8192 -+#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT 8192 -+ -+#define V4L2LOOPBACK_SIZE_DEFAULT_WIDTH 640 -+#define V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT 480 -+ -+static int max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH; -+module_param(max_width, int, S_IRUGO); -+MODULE_PARM_DESC(max_width, -+ "maximum allowed frame width [DEFAULT: " __stringify( -+ V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH) "]"); -+static int max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT; -+module_param(max_height, int, S_IRUGO); -+MODULE_PARM_DESC(max_height, -+ "maximum allowed frame height [DEFAULT: " __stringify( -+ V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT) "]"); -+ -+static DEFINE_IDR(v4l2loopback_index_idr); -+static DEFINE_MUTEX(v4l2loopback_ctl_mutex); -+ -+/* frame intervals */ -+#define V4L2LOOPBACK_FPS_MIN 0 -+#define V4L2LOOPBACK_FPS_MAX 1000 -+ -+/* control IDs */ -+#define V4L2LOOPBACK_CID_BASE (V4L2_CID_USER_BASE | 0xf000) -+#define CID_KEEP_FORMAT (V4L2LOOPBACK_CID_BASE + 0) -+#define CID_SUSTAIN_FRAMERATE (V4L2LOOPBACK_CID_BASE + 1) -+#define CID_TIMEOUT (V4L2LOOPBACK_CID_BASE + 2) -+#define CID_TIMEOUT_IMAGE_IO (V4L2LOOPBACK_CID_BASE + 3) -+ -+static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl); -+static const struct v4l2_ctrl_ops v4l2loopback_ctrl_ops = { -+ .s_ctrl = v4l2loopback_s_ctrl, -+}; -+static const struct v4l2_ctrl_config v4l2loopback_ctrl_keepformat = { -+ // clang-format off -+ .ops = &v4l2loopback_ctrl_ops, -+ .id = CID_KEEP_FORMAT, -+ .name = "keep_format", -+ .type = V4L2_CTRL_TYPE_BOOLEAN, -+ .min = 0, -+ .max = 1, -+ .step = 1, -+ .def = 0, -+ // clang-format on -+}; -+static const struct v4l2_ctrl_config v4l2loopback_ctrl_sustainframerate = { -+ // clang-format off -+ .ops = &v4l2loopback_ctrl_ops, -+ .id = CID_SUSTAIN_FRAMERATE, -+ .name = "sustain_framerate", -+ .type = V4L2_CTRL_TYPE_BOOLEAN, -+ .min = 0, -+ .max = 1, -+ .step = 1, -+ .def = 0, -+ // clang-format on -+}; -+static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeout = { -+ // clang-format off -+ .ops = &v4l2loopback_ctrl_ops, -+ .id = CID_TIMEOUT, -+ .name = "timeout", -+ .type = V4L2_CTRL_TYPE_INTEGER, -+ .min = 0, -+ .max = MAX_TIMEOUT, -+ .step = 1, -+ .def = 0, -+ // clang-format on -+}; -+static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeoutimageio = { -+ // clang-format off -+ .ops = &v4l2loopback_ctrl_ops, -+ .id = CID_TIMEOUT_IMAGE_IO, -+ .name = "timeout_image_io", -+ .type = V4L2_CTRL_TYPE_BOOLEAN, -+ .min = 0, -+ .max = 1, -+ .step = 1, -+ .def = 0, -+ // clang-format on -+}; -+ -+/* module structures */ -+struct v4l2loopback_private { -+ int device_nr; -+}; -+ -+/* TODO(vasaka) use typenames which are common to kernel, but first find out if -+ * it is needed */ -+/* struct keeping state and settings of loopback device */ -+ -+struct v4l2l_buffer { -+ struct v4l2_buffer buffer; -+ struct list_head list_head; -+ int use_count; -+}; -+ -+struct v4l2_loopback_device { -+ struct v4l2_device v4l2_dev; -+ struct v4l2_ctrl_handler ctrl_handler; -+ struct video_device *vdev; -+ /* pixel and stream format */ -+ struct v4l2_pix_format pix_format; -+ struct v4l2_captureparm capture_param; -+ unsigned long frame_jiffies; -+ -+ /* ctrls */ -+ int keep_format; /* CID_KEEP_FORMAT; stay ready_for_capture even when all -+ openers close() the device */ -+ int sustain_framerate; /* CID_SUSTAIN_FRAMERATE; duplicate frames to maintain -+ (close to) nominal framerate */ -+ -+ /* buffers stuff */ -+ u8 *image; /* pointer to actual buffers data */ -+ unsigned long int imagesize; 
/* size of buffers data */ -+ int buffers_number; /* should not be big, 4 is a good choice */ -+ struct v4l2l_buffer buffers[MAX_BUFFERS]; /* inner driver buffers */ -+ int used_buffers; /* number of the actually used buffers */ -+ int max_openers; /* how many times can this device be opened */ -+ -+ int write_position; /* number of last written frame + 1 */ -+ struct list_head outbufs_list; /* buffers in output DQBUF order */ -+ int bufpos2index -+ [MAX_BUFFERS]; /* mapping of (read/write_position % used_buffers) -+ * to inner buffer index */ -+ long buffer_size; -+ -+ /* sustain_framerate stuff */ -+ struct timer_list sustain_timer; -+ unsigned int reread_count; -+ -+ /* timeout stuff */ -+ unsigned long timeout_jiffies; /* CID_TIMEOUT; 0 means disabled */ -+ int timeout_image_io; /* CID_TIMEOUT_IMAGE_IO; next opener will -+ * read/write to timeout_image */ -+ u8 *timeout_image; /* copy of it will be captured when timeout passes */ -+ struct v4l2l_buffer timeout_image_buffer; -+ struct timer_list timeout_timer; -+ int timeout_happened; -+ -+ /* sync stuff */ -+ atomic_t open_count; -+ -+ int ready_for_capture; /* set to the number of writers that opened the -+ * device and negotiated format. */ -+ int ready_for_output; /* set to true when no writer is currently attached -+ * this differs slightly from !ready_for_capture, -+ * e.g. when using fallback images */ -+ int active_readers; /* increase if any reader starts streaming */ -+ int announce_all_caps; /* set to false, if device caps (OUTPUT/CAPTURE) -+ * should only be announced if the resp. "ready" -+ * flag is set; default=TRUE */ -+ -+ int max_width; -+ int max_height; -+ -+ char card_label[32]; -+ -+ wait_queue_head_t read_event; -+ spinlock_t lock; -+}; -+ -+/* types of opener shows what opener wants to do with loopback */ -+enum opener_type { -+ // clang-format off -+ UNNEGOTIATED = 0, -+ READER = 1, -+ WRITER = 2, -+ // clang-format on -+}; -+ -+/* struct keeping state and type of opener */ -+struct v4l2_loopback_opener { -+ enum opener_type type; -+ int read_position; /* number of last processed frame + 1 or -+ * write_position - 1 if reader went out of sync */ -+ unsigned int reread_count; -+ struct v4l2_buffer *buffers; -+ int buffers_number; /* should not be big, 4 is a good choice */ -+ int timeout_image_io; -+ -+ struct v4l2_fh fh; -+}; -+ -+#define fh_to_opener(ptr) container_of((ptr), struct v4l2_loopback_opener, fh) -+ -+/* this is heavily inspired by the bttv driver found in the linux kernel */ -+struct v4l2l_format { -+ char *name; -+ int fourcc; /* video4linux 2 */ -+ int depth; /* bit/pixel */ -+ int flags; -+}; -+/* set the v4l2l_format.flags to PLANAR for non-packed formats */ -+#define FORMAT_FLAGS_PLANAR 0x01 -+#define FORMAT_FLAGS_COMPRESSED 0x02 -+ -+#include "v4l2loopback_formats.h" -+ -+static const unsigned int FORMATS = ARRAY_SIZE(formats); -+ -+static char *fourcc2str(unsigned int fourcc, char buf[4]) -+{ -+ buf[0] = (fourcc >> 0) & 0xFF; -+ buf[1] = (fourcc >> 8) & 0xFF; -+ buf[2] = (fourcc >> 16) & 0xFF; -+ buf[3] = (fourcc >> 24) & 0xFF; -+ -+ return buf; -+} -+ -+static const struct v4l2l_format *format_by_fourcc(int fourcc) -+{ -+ unsigned int i; -+ -+ for (i = 0; i < FORMATS; i++) { -+ if (formats[i].fourcc == fourcc) -+ return formats + i; -+ } -+ -+ dprintk("unsupported format '%c%c%c%c'\n", (fourcc >> 0) & 0xFF, -+ (fourcc >> 8) & 0xFF, (fourcc >> 16) & 0xFF, -+ (fourcc >> 24) & 0xFF); -+ return NULL; -+} -+ -+static void pix_format_set_size(struct v4l2_pix_format *f, -+ const struct v4l2l_format 
*fmt, -+ unsigned int width, unsigned int height) -+{ -+ f->width = width; -+ f->height = height; -+ -+ if (fmt->flags & FORMAT_FLAGS_PLANAR) { -+ f->bytesperline = width; /* Y plane */ -+ f->sizeimage = (width * height * fmt->depth) >> 3; -+ } else if (fmt->flags & FORMAT_FLAGS_COMPRESSED) { -+ /* doesn't make sense for compressed formats */ -+ f->bytesperline = 0; -+ f->sizeimage = (width * height * fmt->depth) >> 3; -+ } else { -+ f->bytesperline = (width * fmt->depth) >> 3; -+ f->sizeimage = height * f->bytesperline; -+ } -+} -+ -+static int set_timeperframe(struct v4l2_loopback_device *dev, -+ struct v4l2_fract *tpf) -+{ -+ if ((tpf->denominator < 1) || (tpf->numerator < 1)) { -+ return -EINVAL; -+ } -+ dev->capture_param.timeperframe = *tpf; -+ dev->frame_jiffies = max(1UL, msecs_to_jiffies(1000) * tpf->numerator / -+ tpf->denominator); -+ return 0; -+} -+ -+static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd); -+ -+/* device attributes */ -+/* available via sysfs: /sys/devices/virtual/video4linux/video* */ -+ -+static ssize_t attr_show_format(struct device *cd, -+ struct device_attribute *attr, char *buf) -+{ -+ /* gets the current format as "FOURCC:WxH@f/s", e.g. "YUYV:320x240@1000/30" */ -+ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); -+ const struct v4l2_fract *tpf; -+ char buf4cc[5], buf_fps[32]; -+ -+ if (!dev || !dev->ready_for_capture) -+ return 0; -+ tpf = &dev->capture_param.timeperframe; -+ -+ fourcc2str(dev->pix_format.pixelformat, buf4cc); -+ buf4cc[4] = 0; -+ if (tpf->numerator == 1) -+ snprintf(buf_fps, sizeof(buf_fps), "%d", tpf->denominator); -+ else -+ snprintf(buf_fps, sizeof(buf_fps), "%d/%d", tpf->denominator, -+ tpf->numerator); -+ return sprintf(buf, "%4s:%dx%d@%s\n", buf4cc, dev->pix_format.width, -+ dev->pix_format.height, buf_fps); -+} -+ -+static ssize_t attr_store_format(struct device *cd, -+ struct device_attribute *attr, const char *buf, -+ size_t len) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); -+ int fps_num = 0, fps_den = 1; -+ -+ if (!dev) -+ return -ENODEV; -+ -+ /* only fps changing is supported */ -+ if (sscanf(buf, "@%d/%d", &fps_num, &fps_den) > 0) { -+ struct v4l2_fract f = { .numerator = fps_den, -+ .denominator = fps_num }; -+ int err = 0; -+ if ((err = set_timeperframe(dev, &f)) < 0) -+ return err; -+ return len; -+ } -+ return -EINVAL; -+} -+ -+static DEVICE_ATTR(format, S_IRUGO | S_IWUSR, attr_show_format, -+ attr_store_format); -+ -+static ssize_t attr_show_buffers(struct device *cd, -+ struct device_attribute *attr, char *buf) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); -+ -+ if (!dev) -+ return -ENODEV; -+ -+ return sprintf(buf, "%d\n", dev->used_buffers); -+} -+ -+static DEVICE_ATTR(buffers, S_IRUGO, attr_show_buffers, NULL); -+ -+static ssize_t attr_show_maxopeners(struct device *cd, -+ struct device_attribute *attr, char *buf) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); -+ -+ if (!dev) -+ return -ENODEV; -+ -+ return sprintf(buf, "%d\n", dev->max_openers); -+} -+ -+static ssize_t attr_store_maxopeners(struct device *cd, -+ struct device_attribute *attr, -+ const char *buf, size_t len) -+{ -+ struct v4l2_loopback_device *dev = NULL; -+ unsigned long curr = 0; -+ -+ if (kstrtoul(buf, 0, &curr)) -+ return -EINVAL; -+ -+ dev = v4l2loopback_cd2dev(cd); -+ if (!dev) -+ return -ENODEV; -+ -+ if (dev->max_openers == curr) -+ return len; -+ -+ if (curr > __INT_MAX__ || dev->open_count.counter > curr) { -+ /* request to limit to less 
openers as are currently attached to us */ -+ return -EINVAL; -+ } -+ -+ dev->max_openers = (int)curr; -+ -+ return len; -+} -+ -+static DEVICE_ATTR(max_openers, S_IRUGO | S_IWUSR, attr_show_maxopeners, -+ attr_store_maxopeners); -+ -+static void v4l2loopback_remove_sysfs(struct video_device *vdev) -+{ -+#define V4L2_SYSFS_DESTROY(x) device_remove_file(&vdev->dev, &dev_attr_##x) -+ -+ if (vdev) { -+ V4L2_SYSFS_DESTROY(format); -+ V4L2_SYSFS_DESTROY(buffers); -+ V4L2_SYSFS_DESTROY(max_openers); -+ /* ... */ -+ } -+} -+ -+static void v4l2loopback_create_sysfs(struct video_device *vdev) -+{ -+ int res = 0; -+ -+#define V4L2_SYSFS_CREATE(x) \ -+ res = device_create_file(&vdev->dev, &dev_attr_##x); \ -+ if (res < 0) \ -+ break -+ if (!vdev) -+ return; -+ do { -+ V4L2_SYSFS_CREATE(format); -+ V4L2_SYSFS_CREATE(buffers); -+ V4L2_SYSFS_CREATE(max_openers); -+ /* ... */ -+ } while (0); -+ -+ if (res >= 0) -+ return; -+ dev_err(&vdev->dev, "%s error: %d\n", __func__, res); -+} -+ -+/* Event APIs */ -+ -+#define V4L2LOOPBACK_EVENT_BASE (V4L2_EVENT_PRIVATE_START) -+#define V4L2LOOPBACK_EVENT_OFFSET 0x08E00000 -+#define V4L2_EVENT_PRI_CLIENT_USAGE \ -+ (V4L2LOOPBACK_EVENT_BASE + V4L2LOOPBACK_EVENT_OFFSET + 1) -+ -+struct v4l2_event_client_usage { -+ __u32 count; -+}; -+ -+/* global module data */ -+/* find a device based on it's device-number (e.g. '3' for /dev/video3) */ -+struct v4l2loopback_lookup_cb_data { -+ int device_nr; -+ struct v4l2_loopback_device *device; -+}; -+static int v4l2loopback_lookup_cb(int id, void *ptr, void *data) -+{ -+ struct v4l2_loopback_device *device = ptr; -+ struct v4l2loopback_lookup_cb_data *cbdata = data; -+ if (cbdata && device && device->vdev) { -+ if (device->vdev->num == cbdata->device_nr) { -+ cbdata->device = device; -+ cbdata->device_nr = id; -+ return 1; -+ } -+ } -+ return 0; -+} -+static int v4l2loopback_lookup(int device_nr, -+ struct v4l2_loopback_device **device) -+{ -+ struct v4l2loopback_lookup_cb_data data = { -+ .device_nr = device_nr, -+ .device = NULL, -+ }; -+ int err = idr_for_each(&v4l2loopback_index_idr, &v4l2loopback_lookup_cb, -+ &data); -+ if (1 == err) { -+ if (device) -+ *device = data.device; -+ return data.device_nr; -+ } -+ return -ENODEV; -+} -+static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd) -+{ -+ struct video_device *loopdev = to_video_device(cd); -+ struct v4l2loopback_private *ptr = -+ (struct v4l2loopback_private *)video_get_drvdata(loopdev); -+ int nr = ptr->device_nr; -+ -+ return idr_find(&v4l2loopback_index_idr, nr); -+} -+ -+static struct v4l2_loopback_device *v4l2loopback_getdevice(struct file *f) -+{ -+ struct v4l2loopback_private *ptr = video_drvdata(f); -+ int nr = ptr->device_nr; -+ -+ return idr_find(&v4l2loopback_index_idr, nr); -+} -+ -+/* forward declarations */ -+static void client_usage_queue_event(struct video_device *vdev); -+static void init_buffers(struct v4l2_loopback_device *dev); -+static int allocate_buffers(struct v4l2_loopback_device *dev); -+static void free_buffers(struct v4l2_loopback_device *dev); -+static void try_free_buffers(struct v4l2_loopback_device *dev); -+static int allocate_timeout_image(struct v4l2_loopback_device *dev); -+static void check_timers(struct v4l2_loopback_device *dev); -+static const struct v4l2_file_operations v4l2_loopback_fops; -+static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops; -+ -+/* Queue helpers */ -+/* next functions sets buffer flags and adjusts counters accordingly */ -+static inline void set_done(struct v4l2l_buffer *buffer) -+{ 
-+ buffer->buffer.flags &= ~V4L2_BUF_FLAG_QUEUED; -+ buffer->buffer.flags |= V4L2_BUF_FLAG_DONE; -+} -+ -+static inline void set_queued(struct v4l2l_buffer *buffer) -+{ -+ buffer->buffer.flags &= ~V4L2_BUF_FLAG_DONE; -+ buffer->buffer.flags |= V4L2_BUF_FLAG_QUEUED; -+} -+ -+static inline void unset_flags(struct v4l2l_buffer *buffer) -+{ -+ buffer->buffer.flags &= ~V4L2_BUF_FLAG_QUEUED; -+ buffer->buffer.flags &= ~V4L2_BUF_FLAG_DONE; -+} -+ -+/* V4L2 ioctl caps and params calls */ -+/* returns device capabilities -+ * called on VIDIOC_QUERYCAP -+ */ -+static int vidioc_querycap(struct file *file, void *priv, -+ struct v4l2_capability *cap) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ int device_nr = -+ ((struct v4l2loopback_private *)video_get_drvdata(dev->vdev)) -+ ->device_nr; -+ __u32 capabilities = V4L2_CAP_STREAMING | V4L2_CAP_READWRITE; -+ -+ strlcpy(cap->driver, "v4l2 loopback", sizeof(cap->driver)); -+ snprintf(cap->card, sizeof(cap->card), "%s", dev->card_label); -+ snprintf(cap->bus_info, sizeof(cap->bus_info), -+ "platform:v4l2loopback-%03d", device_nr); -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 1, 0) -+ /* since 3.1.0, the v4l2-core system is supposed to set the version */ -+ cap->version = V4L2LOOPBACK_VERSION_CODE; -+#endif -+ -+ if (dev->announce_all_caps) { -+ capabilities |= V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT; -+ } else { -+ if (dev->ready_for_capture) { -+ capabilities |= V4L2_CAP_VIDEO_CAPTURE; -+ } -+ if (dev->ready_for_output) { -+ capabilities |= V4L2_CAP_VIDEO_OUTPUT; -+ } -+ } -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) -+ dev->vdev->device_caps = -+#endif /* >=linux-4.7.0 */ -+ cap->device_caps = cap->capabilities = capabilities; -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 3, 0) -+ cap->capabilities |= V4L2_CAP_DEVICE_CAPS; -+#endif -+ -+ memset(cap->reserved, 0, sizeof(cap->reserved)); -+ return 0; -+} -+ -+static int vidioc_enum_framesizes(struct file *file, void *fh, -+ struct v4l2_frmsizeenum *argp) -+{ -+ struct v4l2_loopback_device *dev; -+ -+ /* there can be only one... */ -+ if (argp->index) -+ return -EINVAL; -+ -+ dev = v4l2loopback_getdevice(file); -+ if (dev->ready_for_capture) { -+ /* format has already been negotiated -+ * cannot change during runtime -+ */ -+ if (argp->pixel_format != dev->pix_format.pixelformat) -+ return -EINVAL; -+ -+ argp->type = V4L2_FRMSIZE_TYPE_DISCRETE; -+ -+ argp->discrete.width = dev->pix_format.width; -+ argp->discrete.height = dev->pix_format.height; -+ } else { -+ /* if the format has not been negotiated yet, we accept anything -+ */ -+ if (NULL == format_by_fourcc(argp->pixel_format)) -+ return -EINVAL; -+ -+ argp->type = V4L2_FRMSIZE_TYPE_CONTINUOUS; -+ -+ argp->stepwise.min_width = V4L2LOOPBACK_SIZE_MIN_WIDTH; -+ argp->stepwise.min_height = V4L2LOOPBACK_SIZE_MIN_HEIGHT; -+ -+ argp->stepwise.max_width = dev->max_width; -+ argp->stepwise.max_height = dev->max_height; -+ -+ argp->stepwise.step_width = 1; -+ argp->stepwise.step_height = 1; -+ } -+ return 0; -+} -+ -+/* returns frameinterval (fps) for the set resolution -+ * called on VIDIOC_ENUM_FRAMEINTERVALS -+ */ -+static int vidioc_enum_frameintervals(struct file *file, void *fh, -+ struct v4l2_frmivalenum *argp) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ -+ /* there can be only one... 
*/ -+ if (argp->index) -+ return -EINVAL; -+ -+ if (dev->ready_for_capture) { -+ if (argp->width != dev->pix_format.width || -+ argp->height != dev->pix_format.height || -+ argp->pixel_format != dev->pix_format.pixelformat) -+ return -EINVAL; -+ -+ argp->type = V4L2_FRMIVAL_TYPE_DISCRETE; -+ argp->discrete = dev->capture_param.timeperframe; -+ } else { -+ if (argp->width < V4L2LOOPBACK_SIZE_MIN_WIDTH || -+ argp->width > max_width || -+ argp->height < V4L2LOOPBACK_SIZE_MIN_HEIGHT || -+ argp->height > max_height || -+ NULL == format_by_fourcc(argp->pixel_format)) -+ return -EINVAL; -+ -+ argp->type = V4L2_FRMIVAL_TYPE_CONTINUOUS; -+ argp->stepwise.min.numerator = 1; -+ argp->stepwise.min.denominator = V4L2LOOPBACK_FPS_MAX; -+ argp->stepwise.max.numerator = 1; -+ argp->stepwise.max.denominator = V4L2LOOPBACK_FPS_MIN; -+ argp->stepwise.step.numerator = 1; -+ argp->stepwise.step.denominator = 1; -+ } -+ -+ return 0; -+} -+ -+/* ------------------ CAPTURE ----------------------- */ -+ -+/* returns device formats -+ * called on VIDIOC_ENUM_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE -+ */ -+static int vidioc_enum_fmt_cap(struct file *file, void *fh, -+ struct v4l2_fmtdesc *f) -+{ -+ struct v4l2_loopback_device *dev; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ if (f->index) -+ return -EINVAL; -+ if (dev->ready_for_capture) { -+ const __u32 format = dev->pix_format.pixelformat; -+ -+ snprintf(f->description, sizeof(f->description), "[%c%c%c%c]", -+ (format >> 0) & 0xFF, (format >> 8) & 0xFF, -+ (format >> 16) & 0xFF, (format >> 24) & 0xFF); -+ -+ f->pixelformat = dev->pix_format.pixelformat; -+ } else { -+ return -EINVAL; -+ } -+ f->flags = 0; -+ MARK(); -+ return 0; -+} -+ -+/* returns current video format -+ * called on VIDIOC_G_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE -+ */ -+static int vidioc_g_fmt_cap(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ struct v4l2_loopback_device *dev; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ if (!dev->ready_for_capture) -+ return -EINVAL; -+ -+ fmt->fmt.pix = dev->pix_format; -+ MARK(); -+ return 0; -+} -+ -+/* checks if it is OK to change to format fmt; -+ * actual check is done by inner_try_fmt_cap -+ * just checking that pixelformat is OK and set other parameters, app should -+ * obey this decision -+ * called on VIDIOC_TRY_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE -+ */ -+static int vidioc_try_fmt_cap(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ struct v4l2_loopback_device *dev; -+ char buf[5]; -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ if (0 == dev->ready_for_capture) { -+ dprintk("setting fmt_cap not possible yet\n"); -+ return -EBUSY; -+ } -+ -+ if (fmt->fmt.pix.pixelformat != dev->pix_format.pixelformat) -+ return -EINVAL; -+ -+ fmt->fmt.pix = dev->pix_format; -+ -+ buf[4] = 0; -+ dprintk("capFOURCC=%s\n", fourcc2str(dev->pix_format.pixelformat, buf)); -+ return 0; -+} -+ -+/* sets new output format, if possible -+ * actually format is set by input and we even do not check it, just return -+ * current one, but it is possible to set subregions of input TODO(vasaka) -+ * called on VIDIOC_S_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE -+ */ -+static int vidioc_s_fmt_cap(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ return vidioc_try_fmt_cap(file, priv, fmt); -+} -+ -+/* ------------------ OUTPUT ----------------------- */ -+ -+/* returns device formats; -+ * LATER: allow all formats -+ * called on 
VIDIOC_ENUM_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT -+ */ -+static int vidioc_enum_fmt_out(struct file *file, void *fh, -+ struct v4l2_fmtdesc *f) -+{ -+ struct v4l2_loopback_device *dev; -+ const struct v4l2l_format *fmt; -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ if (dev->ready_for_capture) { -+ const __u32 format = dev->pix_format.pixelformat; -+ -+ /* format has been fixed by the writer, so only one single format is supported */ -+ if (f->index) -+ return -EINVAL; -+ -+ fmt = format_by_fourcc(format); -+ if (NULL == fmt) -+ return -EINVAL; -+ -+ /* f->flags = ??; */ -+ snprintf(f->description, sizeof(f->description), "%s", -+ fmt->name); -+ -+ f->pixelformat = dev->pix_format.pixelformat; -+ } else { -+ /* fill in a dummy format */ -+ /* coverity[unsigned_compare] */ -+ if (f->index < 0 || f->index >= FORMATS) -+ return -EINVAL; -+ -+ fmt = &formats[f->index]; -+ -+ f->pixelformat = fmt->fourcc; -+ snprintf(f->description, sizeof(f->description), "%s", -+ fmt->name); -+ } -+ f->flags = 0; -+ -+ return 0; -+} -+ -+/* returns current video format format fmt */ -+/* NOTE: this is called from the producer -+ * so if format has not been negotiated yet, -+ * it should return ALL of available formats, -+ * called on VIDIOC_G_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT -+ */ -+static int vidioc_g_fmt_out(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ struct v4l2_loopback_device *dev; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ /* -+ * LATER: this should return the currently valid format -+ * gstreamer doesn't like it, if this returns -EINVAL, as it -+ * then concludes that there is _no_ valid format -+ * CHECK whether this assumption is wrong, -+ * or whether we have to always provide a valid format -+ */ -+ -+ fmt->fmt.pix = dev->pix_format; -+ return 0; -+} -+ -+/* checks if it is OK to change to format fmt; -+ * if format is negotiated do not change it -+ * called on VIDIOC_TRY_FMT with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT -+ */ -+static int vidioc_try_fmt_out(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ struct v4l2_loopback_device *dev; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ /* TODO(vasaka) loopback does not care about formats writer want to set, -+ * maybe it is a good idea to restrict format somehow */ -+ if (dev->ready_for_capture) { -+ fmt->fmt.pix = dev->pix_format; -+ } else { -+ __u32 w = fmt->fmt.pix.width; -+ __u32 h = fmt->fmt.pix.height; -+ __u32 pixfmt = fmt->fmt.pix.pixelformat; -+ const struct v4l2l_format *format = format_by_fourcc(pixfmt); -+ -+ if (w > dev->max_width) -+ w = dev->max_width; -+ if (h > dev->max_height) -+ h = dev->max_height; -+ -+ dprintk("trying image %dx%d\n", w, h); -+ -+ if (w < 1) -+ w = V4L2LOOPBACK_SIZE_DEFAULT_WIDTH; -+ -+ if (h < 1) -+ h = V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT; -+ -+ if (NULL == format) -+ format = &formats[0]; -+ -+ pix_format_set_size(&fmt->fmt.pix, format, w, h); -+ -+ fmt->fmt.pix.pixelformat = format->fourcc; -+ -+ if ((fmt->fmt.pix.colorspace == V4L2_COLORSPACE_DEFAULT) || -+ (fmt->fmt.pix.colorspace > V4L2_COLORSPACE_DCI_P3)) -+ fmt->fmt.pix.colorspace = V4L2_COLORSPACE_SRGB; -+ -+ if (V4L2_FIELD_ANY == fmt->fmt.pix.field) -+ fmt->fmt.pix.field = V4L2_FIELD_NONE; -+ -+ /* FIXXME: try_fmt should never modify the device-state */ -+ dev->pix_format = fmt->fmt.pix; -+ } -+ return 0; -+} -+ -+/* sets new output format, if possible; -+ * allocate data here because we do not know if it will be streaming or -+ * 
read/write IO -+ * called on VIDIOC_S_FMT with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT -+ */ -+static int vidioc_s_fmt_out(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ struct v4l2_loopback_device *dev; -+ char buf[5]; -+ int ret; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ ret = vidioc_try_fmt_out(file, priv, fmt); -+ -+ dprintk("s_fmt_out(%d) %d...%d\n", ret, dev->ready_for_capture, -+ dev->pix_format.sizeimage); -+ -+ buf[4] = 0; -+ dprintk("outFOURCC=%s\n", fourcc2str(dev->pix_format.pixelformat, buf)); -+ -+ if (ret < 0) -+ return ret; -+ -+ if (!dev->ready_for_capture) { -+ dev->buffer_size = PAGE_ALIGN(dev->pix_format.sizeimage); -+ fmt->fmt.pix.sizeimage = dev->buffer_size; -+ ret = allocate_buffers(dev); -+ } -+ return ret; -+} -+ -+// #define V4L2L_OVERLAY -+#ifdef V4L2L_OVERLAY -+/* ------------------ OVERLAY ----------------------- */ -+/* currently unsupported */ -+/* GSTreamer's v4l2sink is buggy, as it requires the overlay to work -+ * while it should only require it, if overlay is requested -+ * once the gstreamer element is fixed, remove the overlay dummies -+ */ -+#warning OVERLAY dummies -+static int vidioc_g_fmt_overlay(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ return 0; -+} -+ -+static int vidioc_s_fmt_overlay(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ return 0; -+} -+#endif /* V4L2L_OVERLAY */ -+ -+/* ------------------ PARAMs ----------------------- */ -+ -+/* get some data flow parameters, only capability, fps and readbuffers has -+ * effect on this driver -+ * called on VIDIOC_G_PARM -+ */ -+static int vidioc_g_parm(struct file *file, void *priv, -+ struct v4l2_streamparm *parm) -+{ -+ /* do not care about type of opener, hope these enums would always be -+ * compatible */ -+ struct v4l2_loopback_device *dev; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ parm->parm.capture = dev->capture_param; -+ return 0; -+} -+ -+/* get some data flow parameters, only capability, fps and readbuffers has -+ * effect on this driver -+ * called on VIDIOC_S_PARM -+ */ -+static int vidioc_s_parm(struct file *file, void *priv, -+ struct v4l2_streamparm *parm) -+{ -+ struct v4l2_loopback_device *dev; -+ int err = 0; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ dprintk("vidioc_s_parm called frate=%d/%d\n", -+ parm->parm.capture.timeperframe.numerator, -+ parm->parm.capture.timeperframe.denominator); -+ -+ switch (parm->type) { -+ case V4L2_BUF_TYPE_VIDEO_CAPTURE: -+ if ((err = set_timeperframe( -+ dev, &parm->parm.capture.timeperframe)) < 0) -+ return err; -+ break; -+ case V4L2_BUF_TYPE_VIDEO_OUTPUT: -+ if ((err = set_timeperframe( -+ dev, &parm->parm.capture.timeperframe)) < 0) -+ return err; -+ break; -+ default: -+ return -1; -+ } -+ -+ parm->parm.capture = dev->capture_param; -+ return 0; -+} -+ -+#ifdef V4L2LOOPBACK_WITH_STD -+/* sets a tv standard, actually we do not need to handle this any special way -+ * added to support effecttv -+ * called on VIDIOC_S_STD -+ */ -+static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id *_std) -+{ -+ v4l2_std_id req_std = 0, supported_std = 0; -+ const v4l2_std_id all_std = V4L2_STD_ALL, no_std = 0; -+ -+ if (_std) { -+ req_std = *_std; -+ *_std = all_std; -+ } -+ -+ /* we support everything in V4L2_STD_ALL, but not more... 
*/ -+ supported_std = (all_std & req_std); -+ if (no_std == supported_std) -+ return -EINVAL; -+ -+ return 0; -+} -+ -+/* gets a fake video standard -+ * called on VIDIOC_G_STD -+ */ -+static int vidioc_g_std(struct file *file, void *fh, v4l2_std_id *norm) -+{ -+ if (norm) -+ *norm = V4L2_STD_ALL; -+ return 0; -+} -+/* gets a fake video standard -+ * called on VIDIOC_QUERYSTD -+ */ -+static int vidioc_querystd(struct file *file, void *fh, v4l2_std_id *norm) -+{ -+ if (norm) -+ *norm = V4L2_STD_ALL; -+ return 0; -+} -+#endif /* V4L2LOOPBACK_WITH_STD */ -+ -+static int v4l2loopback_set_ctrl(struct v4l2_loopback_device *dev, u32 id, -+ s64 val) -+{ -+ switch (id) { -+ case CID_KEEP_FORMAT: -+ if (val < 0 || val > 1) -+ return -EINVAL; -+ dev->keep_format = val; -+ try_free_buffers( -+ dev); /* will only free buffers if !keep_format */ -+ break; -+ case CID_SUSTAIN_FRAMERATE: -+ if (val < 0 || val > 1) -+ return -EINVAL; -+ spin_lock_bh(&dev->lock); -+ dev->sustain_framerate = val; -+ check_timers(dev); -+ spin_unlock_bh(&dev->lock); -+ break; -+ case CID_TIMEOUT: -+ if (val < 0 || val > MAX_TIMEOUT) -+ return -EINVAL; -+ spin_lock_bh(&dev->lock); -+ dev->timeout_jiffies = msecs_to_jiffies(val); -+ check_timers(dev); -+ spin_unlock_bh(&dev->lock); -+ allocate_timeout_image(dev); -+ break; -+ case CID_TIMEOUT_IMAGE_IO: -+ if (val < 0 || val > 1) -+ return -EINVAL; -+ dev->timeout_image_io = val; -+ break; -+ default: -+ return -EINVAL; -+ } -+ return 0; -+} -+ -+static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl) -+{ -+ struct v4l2_loopback_device *dev = container_of( -+ ctrl->handler, struct v4l2_loopback_device, ctrl_handler); -+ return v4l2loopback_set_ctrl(dev, ctrl->id, ctrl->val); -+} -+ -+/* returns set of device outputs, in our case there is only one -+ * called on VIDIOC_ENUMOUTPUT -+ */ -+static int vidioc_enum_output(struct file *file, void *fh, -+ struct v4l2_output *outp) -+{ -+ __u32 index = outp->index; -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ MARK(); -+ -+ if (!dev->announce_all_caps && !dev->ready_for_output) -+ return -ENOTTY; -+ -+ if (0 != index) -+ return -EINVAL; -+ -+ /* clear all data (including the reserved fields) */ -+ memset(outp, 0, sizeof(*outp)); -+ -+ outp->index = index; -+ strlcpy(outp->name, "loopback in", sizeof(outp->name)); -+ outp->type = V4L2_OUTPUT_TYPE_ANALOG; -+ outp->audioset = 0; -+ outp->modulator = 0; -+#ifdef V4L2LOOPBACK_WITH_STD -+ outp->std = V4L2_STD_ALL; -+#ifdef V4L2_OUT_CAP_STD -+ outp->capabilities |= V4L2_OUT_CAP_STD; -+#endif /* V4L2_OUT_CAP_STD */ -+#endif /* V4L2LOOPBACK_WITH_STD */ -+ -+ return 0; -+} -+ -+/* which output is currently active, -+ * called on VIDIOC_G_OUTPUT -+ */ -+static int vidioc_g_output(struct file *file, void *fh, unsigned int *i) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ if (!dev->announce_all_caps && !dev->ready_for_output) -+ return -ENOTTY; -+ if (i) -+ *i = 0; -+ return 0; -+} -+ -+/* set output, can make sense if we have more than one video src, -+ * called on VIDIOC_S_OUTPUT -+ */ -+static int vidioc_s_output(struct file *file, void *fh, unsigned int i) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ if (!dev->announce_all_caps && !dev->ready_for_output) -+ return -ENOTTY; -+ -+ if (i) -+ return -EINVAL; -+ -+ return 0; -+} -+ -+/* returns set of device inputs, in our case there is only one, -+ * but later I may add more -+ * called on VIDIOC_ENUMINPUT -+ */ -+static int vidioc_enum_input(struct file *file, void 
*fh, -+ struct v4l2_input *inp) -+{ -+ __u32 index = inp->index; -+ MARK(); -+ -+ if (0 != index) -+ return -EINVAL; -+ -+ /* clear all data (including the reserved fields) */ -+ memset(inp, 0, sizeof(*inp)); -+ -+ inp->index = index; -+ strlcpy(inp->name, "loopback", sizeof(inp->name)); -+ inp->type = V4L2_INPUT_TYPE_CAMERA; -+ inp->audioset = 0; -+ inp->tuner = 0; -+ inp->status = 0; -+ -+#ifdef V4L2LOOPBACK_WITH_STD -+ inp->std = V4L2_STD_ALL; -+#ifdef V4L2_IN_CAP_STD -+ inp->capabilities |= V4L2_IN_CAP_STD; -+#endif -+#endif /* V4L2LOOPBACK_WITH_STD */ -+ -+ return 0; -+} -+ -+/* which input is currently active, -+ * called on VIDIOC_G_INPUT -+ */ -+static int vidioc_g_input(struct file *file, void *fh, unsigned int *i) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ if (!dev->announce_all_caps && !dev->ready_for_capture) -+ return -ENOTTY; -+ if (i) -+ *i = 0; -+ return 0; -+} -+ -+/* set input, can make sense if we have more than one video src, -+ * called on VIDIOC_S_INPUT -+ */ -+static int vidioc_s_input(struct file *file, void *fh, unsigned int i) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ if (!dev->announce_all_caps && !dev->ready_for_capture) -+ return -ENOTTY; -+ if (i == 0) -+ return 0; -+ return -EINVAL; -+} -+ -+/* --------------- V4L2 ioctl buffer related calls ----------------- */ -+ -+/* negotiate buffer type -+ * only mmap streaming supported -+ * called on VIDIOC_REQBUFS -+ */ -+static int vidioc_reqbufs(struct file *file, void *fh, -+ struct v4l2_requestbuffers *b) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ int i; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ -+ dprintk("reqbufs: %d\t%d=%d\n", b->memory, b->count, -+ dev->buffers_number); -+ if (opener->timeout_image_io) { -+ if (b->memory != V4L2_MEMORY_MMAP) -+ return -EINVAL; -+ b->count = 1; -+ return 0; -+ } -+ -+ init_buffers(dev); -+ switch (b->memory) { -+ case V4L2_MEMORY_MMAP: -+ /* do nothing here, buffers are always allocated */ -+ if (b->count < 1 || dev->buffers_number < 1) -+ return 0; -+ -+ if (b->count > dev->buffers_number) -+ b->count = dev->buffers_number; -+ -+ /* make sure that outbufs_list contains buffers from 0 to used_buffers-1 -+ * actually, it will have been already populated via v4l2_loopback_init() -+ * at this point */ -+ if (list_empty(&dev->outbufs_list)) { -+ for (i = 0; i < dev->used_buffers; ++i) -+ list_add_tail(&dev->buffers[i].list_head, -+ &dev->outbufs_list); -+ } -+ -+ /* also, if dev->used_buffers is going to be decreased, we should remove -+ * out-of-range buffers from outbufs_list, and fix bufpos2index mapping */ -+ if (b->count < dev->used_buffers) { -+ struct v4l2l_buffer *pos, *n; -+ -+ list_for_each_entry_safe(pos, n, &dev->outbufs_list, -+ list_head) { -+ if (pos->buffer.index >= b->count) -+ list_del(&pos->list_head); -+ } -+ -+ /* after we update dev->used_buffers, buffers in outbufs_list will -+ * correspond to dev->write_position + [0;b->count-1] range */ -+ i = dev->write_position; -+ list_for_each_entry(pos, &dev->outbufs_list, -+ list_head) { -+ dev->bufpos2index[mod_inc(&i, b->count)] = -+ pos->buffer.index; -+ } -+ } -+ -+ opener->buffers_number = b->count; -+ if (opener->buffers_number < dev->used_buffers) -+ dev->used_buffers = opener->buffers_number; -+ return 0; -+ default: -+ return -EINVAL; -+ } -+} -+ -+/* returns buffer asked for; -+ * give app as many buffers as it wants, if it less than MAX, -+ * but map them in 
our inner buffers -+ * called on VIDIOC_QUERYBUF -+ */ -+static int vidioc_querybuf(struct file *file, void *fh, struct v4l2_buffer *b) -+{ -+ enum v4l2_buf_type type; -+ int index; -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ -+ MARK(); -+ -+ type = b->type; -+ index = b->index; -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ -+ if ((b->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) && -+ (b->type != V4L2_BUF_TYPE_VIDEO_OUTPUT)) { -+ return -EINVAL; -+ } -+ if (b->index > max_buffers) -+ return -EINVAL; -+ -+ if (opener->timeout_image_io) -+ *b = dev->timeout_image_buffer.buffer; -+ else -+ *b = dev->buffers[b->index % dev->used_buffers].buffer; -+ -+ b->type = type; -+ b->index = index; -+ dprintkrw("buffer type: %d (of %d with size=%ld)\n", b->memory, -+ dev->buffers_number, dev->buffer_size); -+ -+ /* Hopefully fix 'DQBUF return bad index if queue bigger then 2 for capture' -+ https://github.com/umlaeute/v4l2loopback/issues/60 */ -+ b->flags &= ~V4L2_BUF_FLAG_DONE; -+ b->flags |= V4L2_BUF_FLAG_QUEUED; -+ -+ return 0; -+} -+ -+static void buffer_written(struct v4l2_loopback_device *dev, -+ struct v4l2l_buffer *buf) -+{ -+ del_timer_sync(&dev->sustain_timer); -+ del_timer_sync(&dev->timeout_timer); -+ spin_lock_bh(&dev->lock); -+ -+ dev->bufpos2index[mod_inc(&dev->write_position, dev->used_buffers)] = -+ buf->buffer.index; -+ list_move_tail(&buf->list_head, &dev->outbufs_list); -+ dev->reread_count = 0; -+ -+ check_timers(dev); -+ spin_unlock_bh(&dev->lock); -+} -+ -+/* put buffer to queue -+ * called on VIDIOC_QBUF -+ */ -+static int vidioc_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ struct v4l2l_buffer *b; -+ int index; -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ -+ if (buf->index > max_buffers) -+ return -EINVAL; -+ if (opener->timeout_image_io) -+ return 0; -+ -+ index = buf->index % dev->used_buffers; -+ b = &dev->buffers[index]; -+ -+ switch (buf->type) { -+ case V4L2_BUF_TYPE_VIDEO_CAPTURE: -+ dprintkrw("capture QBUF index: %d\n", index); -+ set_queued(b); -+ return 0; -+ case V4L2_BUF_TYPE_VIDEO_OUTPUT: -+ dprintkrw("output QBUF pos: %d index: %d\n", -+ dev->write_position, index); -+ if (buf->timestamp.tv_sec == 0 && buf->timestamp.tv_usec == 0) -+ v4l2l_get_timestamp(&b->buffer); -+ else -+ b->buffer.timestamp = buf->timestamp; -+ b->buffer.bytesused = buf->bytesused; -+ set_done(b); -+ buffer_written(dev, b); -+ -+ /* Hopefully fix 'DQBUF return bad index if queue bigger then 2 for capture' -+ https://github.com/umlaeute/v4l2loopback/issues/60 */ -+ buf->flags &= ~V4L2_BUF_FLAG_DONE; -+ buf->flags |= V4L2_BUF_FLAG_QUEUED; -+ -+ wake_up_all(&dev->read_event); -+ return 0; -+ default: -+ return -EINVAL; -+ } -+} -+ -+static int can_read(struct v4l2_loopback_device *dev, -+ struct v4l2_loopback_opener *opener) -+{ -+ int ret; -+ -+ spin_lock_bh(&dev->lock); -+ check_timers(dev); -+ ret = dev->write_position > opener->read_position || -+ dev->reread_count > opener->reread_count || dev->timeout_happened; -+ spin_unlock_bh(&dev->lock); -+ return ret; -+} -+ -+static int get_capture_buffer(struct file *file) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ struct v4l2_loopback_opener *opener = fh_to_opener(file->private_data); -+ int pos, ret; -+ int timeout_happened; -+ -+ if ((file->f_flags & O_NONBLOCK) && -+ (dev->write_position <= opener->read_position && -+ dev->reread_count 
<= opener->reread_count && -+ !dev->timeout_happened)) -+ return -EAGAIN; -+ wait_event_interruptible(dev->read_event, can_read(dev, opener)); -+ -+ spin_lock_bh(&dev->lock); -+ if (dev->write_position == opener->read_position) { -+ if (dev->reread_count > opener->reread_count + 2) -+ opener->reread_count = dev->reread_count - 1; -+ ++opener->reread_count; -+ pos = (opener->read_position + dev->used_buffers - 1) % -+ dev->used_buffers; -+ } else { -+ opener->reread_count = 0; -+ if (dev->write_position > -+ opener->read_position + dev->used_buffers) -+ opener->read_position = dev->write_position - 1; -+ pos = mod_inc(&opener->read_position, dev->used_buffers); -+ } -+ timeout_happened = dev->timeout_happened; -+ dev->timeout_happened = 0; -+ spin_unlock_bh(&dev->lock); -+ -+ ret = dev->bufpos2index[pos]; -+ if (timeout_happened) { -+ if (ret < 0) { -+ dprintk("trying to return not mapped buf[%d]\n", ret); -+ return -EFAULT; -+ } -+ /* although allocated on-demand, timeout_image is freed only -+ * in free_buffers(), so we don't need to worry about it being -+ * deallocated suddenly */ -+ memcpy(dev->image + dev->buffers[ret].buffer.m.offset, -+ dev->timeout_image, dev->buffer_size); -+ } -+ return ret; -+} -+ -+/* put buffer to dequeue -+ * called on VIDIOC_DQBUF -+ */ -+static int vidioc_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ int index; -+ struct v4l2l_buffer *b; -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ if (opener->timeout_image_io) { -+ *buf = dev->timeout_image_buffer.buffer; -+ return 0; -+ } -+ -+ switch (buf->type) { -+ case V4L2_BUF_TYPE_VIDEO_CAPTURE: -+ index = get_capture_buffer(file); -+ if (index < 0) -+ return index; -+ dprintkrw("capture DQBUF pos: %d index: %d\n", -+ opener->read_position - 1, index); -+ if (!(dev->buffers[index].buffer.flags & -+ V4L2_BUF_FLAG_MAPPED)) { -+ dprintk("trying to return not mapped buf[%d]\n", index); -+ return -EINVAL; -+ } -+ unset_flags(&dev->buffers[index]); -+ *buf = dev->buffers[index].buffer; -+ return 0; -+ case V4L2_BUF_TYPE_VIDEO_OUTPUT: -+ b = list_entry(dev->outbufs_list.prev, struct v4l2l_buffer, -+ list_head); -+ list_move_tail(&b->list_head, &dev->outbufs_list); -+ dprintkrw("output DQBUF index: %d\n", b->buffer.index); -+ unset_flags(b); -+ *buf = b->buffer; -+ buf->type = V4L2_BUF_TYPE_VIDEO_OUTPUT; -+ return 0; -+ default: -+ return -EINVAL; -+ } -+} -+ -+/* ------------- STREAMING ------------------- */ -+ -+/* start streaming -+ * called on VIDIOC_STREAMON -+ */ -+static int vidioc_streamon(struct file *file, void *fh, enum v4l2_buf_type type) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ -+ switch (type) { -+ case V4L2_BUF_TYPE_VIDEO_OUTPUT: -+ if (!dev->ready_for_capture) { -+ int ret = allocate_buffers(dev); -+ if (ret < 0) -+ return ret; -+ } -+ opener->type = WRITER; -+ dev->ready_for_output = 0; -+ dev->ready_for_capture++; -+ return 0; -+ case V4L2_BUF_TYPE_VIDEO_CAPTURE: -+ if (!dev->ready_for_capture) -+ return -EIO; -+ opener->type = READER; -+ dev->active_readers++; -+ client_usage_queue_event(dev->vdev); -+ return 0; -+ default: -+ return -EINVAL; -+ } -+ return -EINVAL; -+} -+ -+/* stop streaming -+ * called on VIDIOC_STREAMOFF -+ */ -+static int vidioc_streamoff(struct file *file, void *fh, -+ enum v4l2_buf_type type) -+{ -+ struct v4l2_loopback_device *dev; 
-+ struct v4l2_loopback_opener *opener; -+ -+ MARK(); -+ dprintk("%d\n", type); -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ switch (type) { -+ case V4L2_BUF_TYPE_VIDEO_OUTPUT: -+ if (dev->ready_for_capture > 0) -+ dev->ready_for_capture--; -+ return 0; -+ case V4L2_BUF_TYPE_VIDEO_CAPTURE: -+ if (opener->type == READER) { -+ opener->type = 0; -+ dev->active_readers--; -+ client_usage_queue_event(dev->vdev); -+ } -+ return 0; -+ default: -+ return -EINVAL; -+ } -+ return -EINVAL; -+} -+ -+#ifdef CONFIG_VIDEO_V4L1_COMPAT -+static int vidiocgmbuf(struct file *file, void *fh, struct video_mbuf *p) -+{ -+ struct v4l2_loopback_device *dev; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ p->frames = dev->buffers_number; -+ p->offsets[0] = 0; -+ p->offsets[1] = 0; -+ p->size = dev->buffer_size; -+ return 0; -+} -+#endif -+ -+static void client_usage_queue_event(struct video_device *vdev) -+{ -+ struct v4l2_event ev; -+ struct v4l2_loopback_device *dev; -+ -+ dev = container_of(vdev->v4l2_dev, struct v4l2_loopback_device, -+ v4l2_dev); -+ -+ memset(&ev, 0, sizeof(ev)); -+ ev.type = V4L2_EVENT_PRI_CLIENT_USAGE; -+ ((struct v4l2_event_client_usage *)&ev.u)->count = dev->active_readers; -+ -+ v4l2_event_queue(vdev, &ev); -+} -+ -+static int client_usage_ops_add(struct v4l2_subscribed_event *sev, -+ unsigned elems) -+{ -+ if (!(sev->flags & V4L2_EVENT_SUB_FL_SEND_INITIAL)) -+ return 0; -+ -+ client_usage_queue_event(sev->fh->vdev); -+ return 0; -+} -+ -+static void client_usage_ops_replace(struct v4l2_event *old, -+ const struct v4l2_event *new) -+{ -+ *((struct v4l2_event_client_usage *)&old->u) = -+ *((struct v4l2_event_client_usage *)&new->u); -+} -+ -+static void client_usage_ops_merge(const struct v4l2_event *old, -+ struct v4l2_event *new) -+{ -+ *((struct v4l2_event_client_usage *)&new->u) = -+ *((struct v4l2_event_client_usage *)&old->u); -+} -+ -+const struct v4l2_subscribed_event_ops client_usage_ops = { -+ .add = client_usage_ops_add, -+ .replace = client_usage_ops_replace, -+ .merge = client_usage_ops_merge, -+}; -+ -+static int vidioc_subscribe_event(struct v4l2_fh *fh, -+ const struct v4l2_event_subscription *sub) -+{ -+ switch (sub->type) { -+ case V4L2_EVENT_CTRL: -+ return v4l2_ctrl_subscribe_event(fh, sub); -+ case V4L2_EVENT_PRI_CLIENT_USAGE: -+ return v4l2_event_subscribe(fh, sub, 0, &client_usage_ops); -+ } -+ -+ return -EINVAL; -+} -+ -+/* file operations */ -+static void vm_open(struct vm_area_struct *vma) -+{ -+ struct v4l2l_buffer *buf; -+ MARK(); -+ -+ buf = vma->vm_private_data; -+ buf->use_count++; -+ -+ buf->buffer.flags |= V4L2_BUF_FLAG_MAPPED; -+} -+ -+static void vm_close(struct vm_area_struct *vma) -+{ -+ struct v4l2l_buffer *buf; -+ MARK(); -+ -+ buf = vma->vm_private_data; -+ buf->use_count--; -+ -+ if (buf->use_count <= 0) -+ buf->buffer.flags &= ~V4L2_BUF_FLAG_MAPPED; -+} -+ -+static struct vm_operations_struct vm_ops = { -+ .open = vm_open, -+ .close = vm_close, -+}; -+ -+static int v4l2_loopback_mmap(struct file *file, struct vm_area_struct *vma) -+{ -+ u8 *addr; -+ unsigned long start; -+ unsigned long size; -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ struct v4l2l_buffer *buffer = NULL; -+ MARK(); -+ -+ start = (unsigned long)vma->vm_start; -+ size = (unsigned long)(vma->vm_end - vma->vm_start); -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(file->private_data); -+ -+ if (size > dev->buffer_size) { -+ dprintk("userspace tries to mmap too much, fail\n"); -+ return 
-EINVAL; -+ } -+ if (opener->timeout_image_io) { -+ /* we are going to map the timeout_image_buffer */ -+ if ((vma->vm_pgoff << PAGE_SHIFT) != -+ dev->buffer_size * MAX_BUFFERS) { -+ dprintk("invalid mmap offset for timeout_image_io mode\n"); -+ return -EINVAL; -+ } -+ } else if ((vma->vm_pgoff << PAGE_SHIFT) > -+ dev->buffer_size * (dev->buffers_number - 1)) { -+ dprintk("userspace tries to mmap too far, fail\n"); -+ return -EINVAL; -+ } -+ -+ /* FIXXXXXME: allocation should not happen here! */ -+ if (NULL == dev->image) -+ if (allocate_buffers(dev) < 0) -+ return -EINVAL; -+ -+ if (opener->timeout_image_io) { -+ buffer = &dev->timeout_image_buffer; -+ addr = dev->timeout_image; -+ } else { -+ int i; -+ for (i = 0; i < dev->buffers_number; ++i) { -+ buffer = &dev->buffers[i]; -+ if ((buffer->buffer.m.offset >> PAGE_SHIFT) == -+ vma->vm_pgoff) -+ break; -+ } -+ -+ if (i >= dev->buffers_number) -+ return -EINVAL; -+ -+ addr = dev->image + (vma->vm_pgoff << PAGE_SHIFT); -+ } -+ -+ while (size > 0) { -+ struct page *page; -+ -+ page = vmalloc_to_page(addr); -+ -+ if (vm_insert_page(vma, start, page) < 0) -+ return -EAGAIN; -+ -+ start += PAGE_SIZE; -+ addr += PAGE_SIZE; -+ size -= PAGE_SIZE; -+ } -+ -+ vma->vm_ops = &vm_ops; -+ vma->vm_private_data = buffer; -+ -+ vm_open(vma); -+ -+ MARK(); -+ return 0; -+} -+ -+static unsigned int v4l2_loopback_poll(struct file *file, -+ struct poll_table_struct *pts) -+{ -+ struct v4l2_loopback_opener *opener; -+ struct v4l2_loopback_device *dev; -+ __poll_t req_events = poll_requested_events(pts); -+ int ret_mask = 0; -+ MARK(); -+ -+ opener = fh_to_opener(file->private_data); -+ dev = v4l2loopback_getdevice(file); -+ -+ if (req_events & POLLPRI) { -+ if (!v4l2_event_pending(&opener->fh)) -+ poll_wait(file, &opener->fh.wait, pts); -+ if (v4l2_event_pending(&opener->fh)) { -+ ret_mask |= POLLPRI; -+ if (!(req_events & DEFAULT_POLLMASK)) -+ return ret_mask; -+ } -+ } -+ -+ switch (opener->type) { -+ case WRITER: -+ ret_mask |= POLLOUT | POLLWRNORM; -+ break; -+ case READER: -+ if (!can_read(dev, opener)) { -+ if (ret_mask) -+ return ret_mask; -+ poll_wait(file, &dev->read_event, pts); -+ } -+ if (can_read(dev, opener)) -+ ret_mask |= POLLIN | POLLRDNORM; -+ if (v4l2_event_pending(&opener->fh)) -+ ret_mask |= POLLPRI; -+ break; -+ default: -+ break; -+ } -+ -+ MARK(); -+ return ret_mask; -+} -+ -+/* do not want to limit device opens, it can be as many readers as user want, -+ * writers are limited by means of setting writer field */ -+static int v4l2_loopback_open(struct file *file) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ MARK(); -+ dev = v4l2loopback_getdevice(file); -+ if (dev->open_count.counter >= dev->max_openers) -+ return -EBUSY; -+ /* kfree on close */ -+ opener = kzalloc(sizeof(*opener), GFP_KERNEL); -+ if (opener == NULL) -+ return -ENOMEM; -+ -+ atomic_inc(&dev->open_count); -+ -+ opener->timeout_image_io = dev->timeout_image_io; -+ if (opener->timeout_image_io) { -+ int r = allocate_timeout_image(dev); -+ -+ if (r < 0) { -+ dprintk("timeout image allocation failed\n"); -+ -+ atomic_dec(&dev->open_count); -+ -+ kfree(opener); -+ return r; -+ } -+ } -+ -+ dev->timeout_image_io = 0; -+ -+ v4l2_fh_init(&opener->fh, video_devdata(file)); -+ file->private_data = &opener->fh; -+ -+ v4l2_fh_add(&opener->fh); -+ dprintk("opened dev:%p with image:%p\n", dev, dev ? 
dev->image : NULL); -+ MARK(); -+ return 0; -+} -+ -+static int v4l2_loopback_close(struct file *file) -+{ -+ struct v4l2_loopback_opener *opener; -+ struct v4l2_loopback_device *dev; -+ int is_writer = 0, is_reader = 0; -+ MARK(); -+ -+ opener = fh_to_opener(file->private_data); -+ dev = v4l2loopback_getdevice(file); -+ -+ if (WRITER == opener->type) -+ is_writer = 1; -+ if (READER == opener->type) -+ is_reader = 1; -+ -+ atomic_dec(&dev->open_count); -+ if (dev->open_count.counter == 0) { -+ del_timer_sync(&dev->sustain_timer); -+ del_timer_sync(&dev->timeout_timer); -+ } -+ try_free_buffers(dev); -+ -+ v4l2_fh_del(&opener->fh); -+ v4l2_fh_exit(&opener->fh); -+ -+ kfree(opener); -+ if (is_writer) -+ dev->ready_for_output = 1; -+ if (is_reader) { -+ dev->active_readers--; -+ client_usage_queue_event(dev->vdev); -+ } -+ MARK(); -+ return 0; -+} -+ -+static ssize_t v4l2_loopback_read(struct file *file, char __user *buf, -+ size_t count, loff_t *ppos) -+{ -+ int read_index; -+ struct v4l2_loopback_device *dev; -+ struct v4l2_buffer *b; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ read_index = get_capture_buffer(file); -+ if (read_index < 0) -+ return read_index; -+ if (count > dev->buffer_size) -+ count = dev->buffer_size; -+ b = &dev->buffers[read_index].buffer; -+ if (count > b->bytesused) -+ count = b->bytesused; -+ if (copy_to_user((void *)buf, (void *)(dev->image + b->m.offset), -+ count)) { -+ printk(KERN_ERR -+ "v4l2-loopback: failed copy_to_user() in read buf\n"); -+ return -EFAULT; -+ } -+ dprintkrw("leave v4l2_loopback_read()\n"); -+ return count; -+} -+ -+static ssize_t v4l2_loopback_write(struct file *file, const char __user *buf, -+ size_t count, loff_t *ppos) -+{ -+ struct v4l2_loopback_opener *opener; -+ struct v4l2_loopback_device *dev; -+ int write_index; -+ struct v4l2_buffer *b; -+ int err = 0; -+ -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(file->private_data); -+ -+ if (UNNEGOTIATED == opener->type) { -+ spin_lock(&dev->lock); -+ -+ if (dev->ready_for_output) { -+ err = vidioc_streamon(file, file->private_data, -+ V4L2_BUF_TYPE_VIDEO_OUTPUT); -+ } -+ -+ spin_unlock(&dev->lock); -+ -+ if (err < 0) -+ return err; -+ } -+ -+ if (WRITER != opener->type) -+ return -EINVAL; -+ -+ if (!dev->ready_for_capture) { -+ int ret = allocate_buffers(dev); -+ if (ret < 0) -+ return ret; -+ dev->ready_for_capture = 1; -+ } -+ dprintkrw("v4l2_loopback_write() trying to write %zu bytes\n", count); -+ if (count > dev->buffer_size) -+ count = dev->buffer_size; -+ -+ write_index = dev->write_position % dev->used_buffers; -+ b = &dev->buffers[write_index].buffer; -+ -+ if (copy_from_user((void *)(dev->image + b->m.offset), (void *)buf, -+ count)) { -+ printk(KERN_ERR -+ "v4l2-loopback: failed copy_from_user() in write buf, could not write %zu\n", -+ count); -+ return -EFAULT; -+ } -+ v4l2l_get_timestamp(b); -+ b->bytesused = count; -+ b->sequence = dev->write_position; -+ buffer_written(dev, &dev->buffers[write_index]); -+ wake_up_all(&dev->read_event); -+ dprintkrw("leave v4l2_loopback_write()\n"); -+ return count; -+} -+ -+/* init functions */ -+/* frees buffers, if already allocated */ -+static void free_buffers(struct v4l2_loopback_device *dev) -+{ -+ MARK(); -+ dprintk("freeing image@%p for dev:%p\n", dev ? 
dev->image : NULL, dev); -+ if (!dev) -+ return; -+ if (dev->image) { -+ vfree(dev->image); -+ dev->image = NULL; -+ } -+ if (dev->timeout_image) { -+ vfree(dev->timeout_image); -+ dev->timeout_image = NULL; -+ } -+ dev->imagesize = 0; -+} -+/* frees buffers, if they are no longer needed */ -+static void try_free_buffers(struct v4l2_loopback_device *dev) -+{ -+ MARK(); -+ if (0 == dev->open_count.counter && !dev->keep_format) { -+ free_buffers(dev); -+ dev->ready_for_capture = 0; -+ dev->buffer_size = 0; -+ dev->write_position = 0; -+ } -+} -+/* allocates buffers, if buffer_size is set */ -+static int allocate_buffers(struct v4l2_loopback_device *dev) -+{ -+ int err; -+ -+ MARK(); -+ /* vfree on close file operation in case no open handles left */ -+ -+ if (dev->buffer_size < 1 || dev->buffers_number < 1) -+ return -EINVAL; -+ -+ if ((__LONG_MAX__ / dev->buffer_size) < dev->buffers_number) -+ return -ENOSPC; -+ -+ if (dev->image) { -+ dprintk("allocating buffers again: %ld %ld\n", -+ dev->buffer_size * dev->buffers_number, dev->imagesize); -+ /* FIXME: prevent double allocation more intelligently! */ -+ if (dev->buffer_size * dev->buffers_number == dev->imagesize) -+ return 0; -+ -+ /* if there is only one writer, no problem should occur */ -+ if (dev->open_count.counter == 1) -+ free_buffers(dev); -+ else -+ return -EINVAL; -+ } -+ -+ dev->imagesize = (unsigned long)dev->buffer_size * -+ (unsigned long)dev->buffers_number; -+ -+ dprintk("allocating %ld = %ldx%d\n", dev->imagesize, dev->buffer_size, -+ dev->buffers_number); -+ err = -ENOMEM; -+ -+ if (dev->timeout_jiffies > 0) { -+ err = allocate_timeout_image(dev); -+ if (err < 0) -+ goto error; -+ } -+ -+ dev->image = vmalloc(dev->imagesize); -+ if (dev->image == NULL) -+ goto error; -+ -+ dprintk("vmallocated %ld bytes\n", dev->imagesize); -+ MARK(); -+ -+ init_buffers(dev); -+ return 0; -+ -+error: -+ free_buffers(dev); -+ return err; -+} -+ -+/* init inner buffers, they are capture mode and flags are set as -+ * for capture mod buffers */ -+static void init_buffers(struct v4l2_loopback_device *dev) -+{ -+ int i; -+ int buffer_size; -+ int bytesused; -+ MARK(); -+ -+ buffer_size = dev->buffer_size; -+ bytesused = dev->pix_format.sizeimage; -+ -+ for (i = 0; i < dev->buffers_number; ++i) { -+ struct v4l2_buffer *b = &dev->buffers[i].buffer; -+ b->index = i; -+ b->bytesused = bytesused; -+ b->length = buffer_size; -+ b->field = V4L2_FIELD_NONE; -+ b->flags = 0; -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 6, 1) -+ b->input = 0; -+#endif -+ b->m.offset = i * buffer_size; -+ b->memory = V4L2_MEMORY_MMAP; -+ b->sequence = 0; -+ b->timestamp.tv_sec = 0; -+ b->timestamp.tv_usec = 0; -+ b->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -+ -+ v4l2l_get_timestamp(b); -+ } -+ dev->timeout_image_buffer = dev->buffers[0]; -+ dev->timeout_image_buffer.buffer.m.offset = MAX_BUFFERS * buffer_size; -+ MARK(); -+} -+ -+static int allocate_timeout_image(struct v4l2_loopback_device *dev) -+{ -+ MARK(); -+ if (dev->buffer_size <= 0) -+ return -EINVAL; -+ -+ if (dev->timeout_image == NULL) { -+ dev->timeout_image = vzalloc(dev->buffer_size); -+ if (dev->timeout_image == NULL) -+ return -ENOMEM; -+ } -+ return 0; -+} -+ -+/* fills and register video device */ -+static void init_vdev(struct video_device *vdev, int nr) -+{ -+ MARK(); -+ -+#ifdef V4L2LOOPBACK_WITH_STD -+ vdev->tvnorms = V4L2_STD_ALL; -+#endif /* V4L2LOOPBACK_WITH_STD */ -+ -+ vdev->vfl_type = VFL_TYPE_VIDEO; -+ vdev->fops = &v4l2_loopback_fops; -+ vdev->ioctl_ops = &v4l2_loopback_ioctl_ops; -+ 
vdev->release = &video_device_release; -+ vdev->minor = -1; -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) -+ vdev->device_caps = V4L2_CAP_DEVICE_CAPS | V4L2_CAP_VIDEO_CAPTURE | -+ V4L2_CAP_VIDEO_OUTPUT | V4L2_CAP_READWRITE | -+ V4L2_CAP_STREAMING; -+#endif -+ -+ if (debug > 1) -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 20, 0) -+ vdev->debug = V4L2_DEBUG_IOCTL | V4L2_DEBUG_IOCTL_ARG; -+#else -+ vdev->dev_debug = V4L2_DEV_DEBUG_IOCTL | -+ V4L2_DEV_DEBUG_IOCTL_ARG; -+#endif -+ -+ /* since kernel-3.7, there is a new field 'vfl_dir' that has to be -+ * set to VFL_DIR_M2M for bidirectional devices */ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) -+ vdev->vfl_dir = VFL_DIR_M2M; -+#endif -+ -+ MARK(); -+} -+ -+/* init default capture parameters, only fps may be changed in future */ -+static void init_capture_param(struct v4l2_captureparm *capture_param) -+{ -+ MARK(); -+ capture_param->capability = 0; -+ capture_param->capturemode = 0; -+ capture_param->extendedmode = 0; -+ capture_param->readbuffers = max_buffers; -+ capture_param->timeperframe.numerator = 1; -+ capture_param->timeperframe.denominator = 30; -+} -+ -+static void check_timers(struct v4l2_loopback_device *dev) -+{ -+ if (!dev->ready_for_capture) -+ return; -+ -+ if (dev->timeout_jiffies > 0 && !timer_pending(&dev->timeout_timer)) -+ mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies); -+ if (dev->sustain_framerate && !timer_pending(&dev->sustain_timer)) -+ mod_timer(&dev->sustain_timer, -+ jiffies + dev->frame_jiffies * 3 / 2); -+} -+#ifdef HAVE_TIMER_SETUP -+static void sustain_timer_clb(struct timer_list *t) -+{ -+ struct v4l2_loopback_device *dev = from_timer(dev, t, sustain_timer); -+#else -+static void sustain_timer_clb(unsigned long nr) -+{ -+ struct v4l2_loopback_device *dev = -+ idr_find(&v4l2loopback_index_idr, nr); -+#endif -+ spin_lock(&dev->lock); -+ if (dev->sustain_framerate) { -+ dev->reread_count++; -+ dprintkrw("reread: %d %d\n", dev->write_position, -+ dev->reread_count); -+ if (dev->reread_count == 1) -+ mod_timer(&dev->sustain_timer, -+ jiffies + max(1UL, dev->frame_jiffies / 2)); -+ else -+ mod_timer(&dev->sustain_timer, -+ jiffies + dev->frame_jiffies); -+ wake_up_all(&dev->read_event); -+ } -+ spin_unlock(&dev->lock); -+} -+#ifdef HAVE_TIMER_SETUP -+static void timeout_timer_clb(struct timer_list *t) -+{ -+ struct v4l2_loopback_device *dev = from_timer(dev, t, timeout_timer); -+#else -+static void timeout_timer_clb(unsigned long nr) -+{ -+ struct v4l2_loopback_device *dev = -+ idr_find(&v4l2loopback_index_idr, nr); -+#endif -+ spin_lock(&dev->lock); -+ if (dev->timeout_jiffies > 0) { -+ dev->timeout_happened = 1; -+ mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies); -+ wake_up_all(&dev->read_event); -+ } -+ spin_unlock(&dev->lock); -+} -+ -+/* init loopback main structure */ -+#define DEFAULT_FROM_CONF(confmember, default_condition, default_value) \ -+ ((conf) ? \ -+ ((conf->confmember default_condition) ? (default_value) : \ -+ (conf->confmember)) : \ -+ default_value) -+ -+static int v4l2_loopback_add(struct v4l2_loopback_config *conf, int *ret_nr) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_ctrl_handler *hdl; -+ struct v4l2loopback_private *vdev_priv = NULL; -+ -+ int err = -ENOMEM; -+ -+ int _max_width = DEFAULT_FROM_CONF( -+ max_width, < V4L2LOOPBACK_SIZE_MIN_WIDTH, max_width); -+ int _max_height = DEFAULT_FROM_CONF( -+ max_height, < V4L2LOOPBACK_SIZE_MIN_HEIGHT, max_height); -+ bool _announce_all_caps = (conf && conf->announce_all_caps >= 0) ? 
-+ (conf->announce_all_caps) : -+ V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS; -+ int _max_buffers = DEFAULT_FROM_CONF(max_buffers, <= 0, max_buffers); -+ int _max_openers = DEFAULT_FROM_CONF(max_openers, <= 0, max_openers); -+ -+ int nr = -1; -+ -+ _announce_all_caps = (!!_announce_all_caps); -+ -+ if (conf) { -+ if (conf->capture_nr >= 0 && -+ conf->output_nr == conf->capture_nr) { -+ nr = conf->capture_nr; -+ } else if (conf->capture_nr < 0 && conf->output_nr < 0) { -+ nr = -1; -+ } else if (conf->capture_nr < 0) { -+ nr = conf->output_nr; -+ } else if (conf->output_nr < 0) { -+ nr = conf->capture_nr; -+ } else { -+ printk(KERN_ERR -+ "split OUTPUT and CAPTURE devices not yet supported."); -+ printk(KERN_INFO -+ "both devices must have the same number (%d != %d).", -+ conf->output_nr, conf->capture_nr); -+ return -EINVAL; -+ } -+ } -+ -+ if (idr_find(&v4l2loopback_index_idr, nr)) -+ return -EEXIST; -+ -+ dprintk("creating v4l2loopback-device #%d\n", nr); -+ dev = kzalloc(sizeof(*dev), GFP_KERNEL); -+ if (!dev) -+ return -ENOMEM; -+ -+ /* allocate id, if @id >= 0, we're requesting that specific id */ -+ if (nr >= 0) { -+ err = idr_alloc(&v4l2loopback_index_idr, dev, nr, nr + 1, -+ GFP_KERNEL); -+ if (err == -ENOSPC) -+ err = -EEXIST; -+ } else { -+ err = idr_alloc(&v4l2loopback_index_idr, dev, 0, 0, GFP_KERNEL); -+ } -+ if (err < 0) -+ goto out_free_dev; -+ nr = err; -+ err = -ENOMEM; -+ -+ if (conf && conf->card_label[0]) { -+ snprintf(dev->card_label, sizeof(dev->card_label), "%s", -+ conf->card_label); -+ } else { -+ snprintf(dev->card_label, sizeof(dev->card_label), -+ "Dummy video device (0x%04X)", nr); -+ } -+ snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name), -+ "v4l2loopback-%03d", nr); -+ -+ err = v4l2_device_register(NULL, &dev->v4l2_dev); -+ if (err) -+ goto out_free_idr; -+ MARK(); -+ -+ dev->vdev = video_device_alloc(); -+ if (dev->vdev == NULL) { -+ err = -ENOMEM; -+ goto out_unregister; -+ } -+ -+ vdev_priv = kzalloc(sizeof(struct v4l2loopback_private), GFP_KERNEL); -+ if (vdev_priv == NULL) { -+ err = -ENOMEM; -+ goto out_unregister; -+ } -+ -+ video_set_drvdata(dev->vdev, vdev_priv); -+ if (video_get_drvdata(dev->vdev) == NULL) { -+ err = -ENOMEM; -+ goto out_unregister; -+ } -+ -+ MARK(); -+ snprintf(dev->vdev->name, sizeof(dev->vdev->name), "%s", -+ dev->card_label); -+ -+ vdev_priv->device_nr = nr; -+ -+ init_vdev(dev->vdev, nr); -+ dev->vdev->v4l2_dev = &dev->v4l2_dev; -+ init_capture_param(&dev->capture_param); -+ err = set_timeperframe(dev, &dev->capture_param.timeperframe); -+ if (err) -+ goto out_unregister; -+ dev->keep_format = 0; -+ dev->sustain_framerate = 0; -+ -+ dev->announce_all_caps = _announce_all_caps; -+ dev->max_width = _max_width; -+ dev->max_height = _max_height; -+ dev->max_openers = _max_openers; -+ dev->buffers_number = dev->used_buffers = _max_buffers; -+ -+ dev->write_position = 0; -+ -+ MARK(); -+ spin_lock_init(&dev->lock); -+ INIT_LIST_HEAD(&dev->outbufs_list); -+ if (list_empty(&dev->outbufs_list)) { -+ int i; -+ -+ for (i = 0; i < dev->used_buffers; ++i) -+ list_add_tail(&dev->buffers[i].list_head, -+ &dev->outbufs_list); -+ } -+ memset(dev->bufpos2index, 0, sizeof(dev->bufpos2index)); -+ atomic_set(&dev->open_count, 0); -+ dev->ready_for_capture = 0; -+ dev->ready_for_output = 1; -+ -+ dev->buffer_size = 0; -+ dev->image = NULL; -+ dev->imagesize = 0; -+#ifdef HAVE_TIMER_SETUP -+ timer_setup(&dev->sustain_timer, sustain_timer_clb, 0); -+ timer_setup(&dev->timeout_timer, timeout_timer_clb, 0); -+#else -+ setup_timer(&dev->sustain_timer, 
sustain_timer_clb, nr); -+ setup_timer(&dev->timeout_timer, timeout_timer_clb, nr); -+#endif -+ dev->reread_count = 0; -+ dev->timeout_jiffies = 0; -+ dev->timeout_image = NULL; -+ dev->timeout_happened = 0; -+ -+ hdl = &dev->ctrl_handler; -+ err = v4l2_ctrl_handler_init(hdl, 4); -+ if (err) -+ goto out_unregister; -+ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_keepformat, NULL); -+ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_sustainframerate, NULL); -+ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeout, NULL); -+ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeoutimageio, NULL); -+ if (hdl->error) { -+ err = hdl->error; -+ goto out_free_handler; -+ } -+ dev->v4l2_dev.ctrl_handler = hdl; -+ -+ err = v4l2_ctrl_handler_setup(hdl); -+ if (err) -+ goto out_free_handler; -+ -+ /* FIXME set buffers to 0 */ -+ -+ /* Set initial format */ -+ dev->pix_format.width = 0; /* V4L2LOOPBACK_SIZE_DEFAULT_WIDTH; */ -+ dev->pix_format.height = 0; /* V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT; */ -+ dev->pix_format.pixelformat = formats[0].fourcc; -+ dev->pix_format.colorspace = -+ V4L2_COLORSPACE_SRGB; /* do we need to set this ? */ -+ dev->pix_format.field = V4L2_FIELD_NONE; -+ -+ dev->buffer_size = PAGE_ALIGN(dev->pix_format.sizeimage); -+ dprintk("buffer_size = %ld (=%d)\n", dev->buffer_size, -+ dev->pix_format.sizeimage); -+ -+ if (dev->buffer_size && ((err = allocate_buffers(dev)) < 0)) -+ goto out_free_handler; -+ -+ init_waitqueue_head(&dev->read_event); -+ -+ /* register the device -> it creates /dev/video* */ -+ if (video_register_device(dev->vdev, VFL_TYPE_VIDEO, nr) < 0) { -+ printk(KERN_ERR -+ "v4l2loopback: failed video_register_device()\n"); -+ err = -EFAULT; -+ goto out_free_device; -+ } -+ v4l2loopback_create_sysfs(dev->vdev); -+ -+ MARK(); -+ if (ret_nr) -+ *ret_nr = dev->vdev->num; -+ return 0; -+ -+out_free_device: -+ video_device_release(dev->vdev); -+out_free_handler: -+ v4l2_ctrl_handler_free(&dev->ctrl_handler); -+out_unregister: -+ video_set_drvdata(dev->vdev, NULL); -+ if (vdev_priv != NULL) -+ kfree(vdev_priv); -+ v4l2_device_unregister(&dev->v4l2_dev); -+out_free_idr: -+ idr_remove(&v4l2loopback_index_idr, nr); -+out_free_dev: -+ kfree(dev); -+ return err; -+} -+ -+static void v4l2_loopback_remove(struct v4l2_loopback_device *dev) -+{ -+ free_buffers(dev); -+ v4l2loopback_remove_sysfs(dev->vdev); -+ kfree(video_get_drvdata(dev->vdev)); -+ video_unregister_device(dev->vdev); -+ v4l2_device_unregister(&dev->v4l2_dev); -+ v4l2_ctrl_handler_free(&dev->ctrl_handler); -+ kfree(dev); -+} -+ -+static long v4l2loopback_control_ioctl(struct file *file, unsigned int cmd, -+ unsigned long parm) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_config conf; -+ struct v4l2_loopback_config *confptr = &conf; -+ int device_nr; -+ int ret; -+ -+ ret = mutex_lock_killable(&v4l2loopback_ctl_mutex); -+ if (ret) -+ return ret; -+ -+ ret = -EINVAL; -+ switch (cmd) { -+ default: -+ ret = -ENOSYS; -+ break; -+ /* add a v4l2loopback device (pair), based on the user-provided specs */ -+ case V4L2LOOPBACK_CTL_ADD: -+ if (parm) { -+ if ((ret = copy_from_user(&conf, (void *)parm, -+ sizeof(conf))) < 0) -+ break; -+ } else -+ confptr = NULL; -+ ret = v4l2_loopback_add(confptr, &device_nr); -+ if (ret >= 0) -+ ret = device_nr; -+ break; -+ /* remove a v4l2loopback device (both capture and output) */ -+ case V4L2LOOPBACK_CTL_REMOVE: -+ ret = v4l2loopback_lookup((int)parm, &dev); -+ if (ret >= 0 && dev) { -+ int nr = ret; -+ ret = -EBUSY; -+ if (dev->open_count.counter > 0) -+ break; -+ 
idr_remove(&v4l2loopback_index_idr, nr); -+ v4l2_loopback_remove(dev); -+ ret = 0; -+ }; -+ break; -+ /* get information for a loopback device. -+ * this is mostly about limits (which cannot be queried directly with VIDIOC_G_FMT and friends -+ */ -+ case V4L2LOOPBACK_CTL_QUERY: -+ if (!parm) -+ break; -+ if ((ret = copy_from_user(&conf, (void *)parm, sizeof(conf))) < -+ 0) -+ break; -+ device_nr = (conf.output_nr < 0) ? conf.capture_nr : -+ conf.output_nr; -+ MARK(); -+ /* get the device from either capture_nr or output_nr (whatever is valid) */ -+ if ((ret = v4l2loopback_lookup(device_nr, &dev)) < 0) -+ break; -+ MARK(); -+ /* if we got the device from output_nr and there is a valid capture_nr, -+ * make sure that both refer to the same device (or bail out) -+ */ -+ if ((device_nr != conf.capture_nr) && (conf.capture_nr >= 0) && -+ (ret != v4l2loopback_lookup(conf.capture_nr, 0))) -+ break; -+ MARK(); -+ /* if otoh, we got the device from capture_nr and there is a valid output_nr, -+ * make sure that both refer to the same device (or bail out) -+ */ -+ if ((device_nr != conf.output_nr) && (conf.output_nr >= 0) && -+ (ret != v4l2loopback_lookup(conf.output_nr, 0))) -+ break; -+ MARK(); -+ -+ /* v4l2_loopback_config identified a single device, so fetch the data */ -+ snprintf(conf.card_label, sizeof(conf.card_label), "%s", -+ dev->card_label); -+ MARK(); -+ conf.output_nr = conf.capture_nr = dev->vdev->num; -+ conf.max_width = dev->max_width; -+ conf.max_height = dev->max_height; -+ conf.announce_all_caps = dev->announce_all_caps; -+ conf.max_buffers = dev->buffers_number; -+ conf.max_openers = dev->max_openers; -+ conf.debug = debug; -+ MARK(); -+ if (copy_to_user((void *)parm, &conf, sizeof(conf))) { -+ ret = -EFAULT; -+ break; -+ } -+ MARK(); -+ ret = 0; -+ ; -+ break; -+ } -+ -+ MARK(); -+ mutex_unlock(&v4l2loopback_ctl_mutex); -+ MARK(); -+ return ret; -+} -+ -+/* LINUX KERNEL */ -+ -+static const struct file_operations v4l2loopback_ctl_fops = { -+ // clang-format off -+ .owner = THIS_MODULE, -+ .open = nonseekable_open, -+ .unlocked_ioctl = v4l2loopback_control_ioctl, -+ .compat_ioctl = v4l2loopback_control_ioctl, -+ .llseek = noop_llseek, -+ // clang-format on -+}; -+ -+static struct miscdevice v4l2loopback_misc = { -+ // clang-format off -+ .minor = MISC_DYNAMIC_MINOR, -+ .name = "v4l2loopback", -+ .fops = &v4l2loopback_ctl_fops, -+ // clang-format on -+}; -+ -+static const struct v4l2_file_operations v4l2_loopback_fops = { -+ // clang-format off -+ .owner = THIS_MODULE, -+ .open = v4l2_loopback_open, -+ .release = v4l2_loopback_close, -+ .read = v4l2_loopback_read, -+ .write = v4l2_loopback_write, -+ .poll = v4l2_loopback_poll, -+ .mmap = v4l2_loopback_mmap, -+ .unlocked_ioctl = video_ioctl2, -+ // clang-format on -+}; -+ -+static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops = { -+ // clang-format off -+ .vidioc_querycap = &vidioc_querycap, -+ .vidioc_enum_framesizes = &vidioc_enum_framesizes, -+ .vidioc_enum_frameintervals = &vidioc_enum_frameintervals, -+ -+ .vidioc_enum_output = &vidioc_enum_output, -+ .vidioc_g_output = &vidioc_g_output, -+ .vidioc_s_output = &vidioc_s_output, -+ -+ .vidioc_enum_input = &vidioc_enum_input, -+ .vidioc_g_input = &vidioc_g_input, -+ .vidioc_s_input = &vidioc_s_input, -+ -+ .vidioc_enum_fmt_vid_cap = &vidioc_enum_fmt_cap, -+ .vidioc_g_fmt_vid_cap = &vidioc_g_fmt_cap, -+ .vidioc_s_fmt_vid_cap = &vidioc_s_fmt_cap, -+ .vidioc_try_fmt_vid_cap = &vidioc_try_fmt_cap, -+ -+ .vidioc_enum_fmt_vid_out = &vidioc_enum_fmt_out, -+ .vidioc_s_fmt_vid_out = 
&vidioc_s_fmt_out, -+ .vidioc_g_fmt_vid_out = &vidioc_g_fmt_out, -+ .vidioc_try_fmt_vid_out = &vidioc_try_fmt_out, -+ -+#ifdef V4L2L_OVERLAY -+ .vidioc_s_fmt_vid_overlay = &vidioc_s_fmt_overlay, -+ .vidioc_g_fmt_vid_overlay = &vidioc_g_fmt_overlay, -+#endif -+ -+#ifdef V4L2LOOPBACK_WITH_STD -+ .vidioc_s_std = &vidioc_s_std, -+ .vidioc_g_std = &vidioc_g_std, -+ .vidioc_querystd = &vidioc_querystd, -+#endif /* V4L2LOOPBACK_WITH_STD */ -+ -+ .vidioc_g_parm = &vidioc_g_parm, -+ .vidioc_s_parm = &vidioc_s_parm, -+ -+ .vidioc_reqbufs = &vidioc_reqbufs, -+ .vidioc_querybuf = &vidioc_querybuf, -+ .vidioc_qbuf = &vidioc_qbuf, -+ .vidioc_dqbuf = &vidioc_dqbuf, -+ -+ .vidioc_streamon = &vidioc_streamon, -+ .vidioc_streamoff = &vidioc_streamoff, -+ -+#ifdef CONFIG_VIDEO_V4L1_COMPAT -+ .vidiocgmbuf = &vidiocgmbuf, -+#endif -+ -+ .vidioc_subscribe_event = &vidioc_subscribe_event, -+ .vidioc_unsubscribe_event = &v4l2_event_unsubscribe, -+ // clang-format on -+}; -+ -+static int free_device_cb(int id, void *ptr, void *data) -+{ -+ struct v4l2_loopback_device *dev = ptr; -+ v4l2_loopback_remove(dev); -+ return 0; -+} -+static void free_devices(void) -+{ -+ idr_for_each(&v4l2loopback_index_idr, &free_device_cb, NULL); -+ idr_destroy(&v4l2loopback_index_idr); -+} -+ -+static int __init v4l2loopback_init_module(void) -+{ -+ int err; -+ int i; -+ MARK(); -+ -+ err = misc_register(&v4l2loopback_misc); -+ if (err < 0) -+ return err; -+ -+ if (devices < 0) { -+ devices = 1; -+ -+ /* try guessing the devices from the "video_nr" parameter */ -+ for (i = MAX_DEVICES - 1; i >= 0; i--) { -+ if (video_nr[i] >= 0) { -+ devices = i + 1; -+ break; -+ } -+ } -+ } -+ -+ if (devices > MAX_DEVICES) { -+ devices = MAX_DEVICES; -+ printk(KERN_INFO -+ "v4l2loopback: number of initial devices is limited to: %d\n", -+ MAX_DEVICES); -+ } -+ -+ if (max_buffers > MAX_BUFFERS) { -+ max_buffers = MAX_BUFFERS; -+ printk(KERN_INFO -+ "v4l2loopback: number of buffers is limited to: %d\n", -+ MAX_BUFFERS); -+ } -+ -+ if (max_openers < 0) { -+ printk(KERN_INFO -+ "v4l2loopback: allowing %d openers rather than %d\n", -+ 2, max_openers); -+ max_openers = 2; -+ } -+ -+ if (max_width < V4L2LOOPBACK_SIZE_MIN_WIDTH) { -+ max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH; -+ printk(KERN_INFO "v4l2loopback: using max_width %d\n", -+ max_width); -+ } -+ if (max_height < V4L2LOOPBACK_SIZE_MIN_HEIGHT) { -+ max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT; -+ printk(KERN_INFO "v4l2loopback: using max_height %d\n", -+ max_height); -+ } -+ -+ /* kfree on module release */ -+ for (i = 0; i < devices; i++) { -+ struct v4l2_loopback_config cfg = { -+ // clang-format off -+ .output_nr = video_nr[i], -+ .capture_nr = video_nr[i], -+ .max_width = max_width, -+ .max_height = max_height, -+ .announce_all_caps = (!exclusive_caps[i]), -+ .max_buffers = max_buffers, -+ .max_openers = max_openers, -+ .debug = debug, -+ // clang-format on -+ }; -+ cfg.card_label[0] = 0; -+ if (card_label[i]) -+ snprintf(cfg.card_label, sizeof(cfg.card_label), "%s", -+ card_label[i]); -+ err = v4l2_loopback_add(&cfg, 0); -+ if (err) { -+ free_devices(); -+ goto error; -+ } -+ } -+ -+ dprintk("module installed\n"); -+ -+ printk(KERN_INFO "v4l2loopback driver version %d.%d.%d%s loaded\n", -+ // clang-format off -+ (V4L2LOOPBACK_VERSION_CODE >> 16) & 0xff, -+ (V4L2LOOPBACK_VERSION_CODE >> 8) & 0xff, -+ (V4L2LOOPBACK_VERSION_CODE ) & 0xff, -+#ifdef SNAPSHOT_VERSION -+ " (" __stringify(SNAPSHOT_VERSION) ")" -+#else -+ "" -+#endif -+ ); -+ // clang-format on -+ -+ return 0; -+error: -+ 
misc_deregister(&v4l2loopback_misc); -+ return err; -+} -+ -+static void v4l2loopback_cleanup_module(void) -+{ -+ MARK(); -+ /* unregister the device -> it deletes /dev/video* */ -+ free_devices(); -+ /* and get rid of /dev/v4l2loopback */ -+ misc_deregister(&v4l2loopback_misc); -+ dprintk("module removed\n"); -+} -+ -+MODULE_ALIAS_MISCDEV(MISC_DYNAMIC_MINOR); -+ -+module_init(v4l2loopback_init_module); -+module_exit(v4l2loopback_cleanup_module); -diff --git a/drivers/media/v4l2-core/v4l2loopback.h b/drivers/media/v4l2-core/v4l2loopback.h -new file mode 100644 -index 000000000000..10f8e662d37a ---- /dev/null -+++ b/drivers/media/v4l2-core/v4l2loopback.h -@@ -0,0 +1,96 @@ -+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ -+/* -+ * v4l2loopback.h -+ * -+ * Written by IOhannes m zmölnig, 7/1/20. -+ * -+ * Copyright 2020 by IOhannes m zmölnig. Redistribution of this file is -+ * permitted under the GNU General Public License. -+ */ -+#ifndef _V4L2LOOPBACK_H -+#define _V4L2LOOPBACK_H -+ -+#define V4L2LOOPBACK_VERSION_MAJOR 0 -+#define V4L2LOOPBACK_VERSION_MINOR 12 -+#define V4L2LOOPBACK_VERSION_BUGFIX 7 -+ -+/* /dev/v4l2loopback interface */ -+ -+struct v4l2_loopback_config { -+ /** -+ * the device-number (/dev/video) -+ * V4L2LOOPBACK_CTL_ADD: -+ * setting this to a value<0, will allocate an available one -+ * if nr>=0 and the device already exists, the ioctl will EEXIST -+ * if output_nr and capture_nr are the same, only a single device will be created -+ * NOTE: currently split-devices (where output_nr and capture_nr differ) -+ * are not implemented yet. -+ * until then, requesting different device-IDs will result in EINVAL. -+ * -+ * V4L2LOOPBACK_CTL_QUERY: -+ * either both output_nr and capture_nr must refer to the same loopback, -+ * or one (and only one) of them must be -1 -+ * -+ */ -+ int output_nr; -+ int capture_nr; -+ -+ /** -+ * a nice name for your device -+ * if (*card_label)==0, an automatic name is assigned -+ */ -+ char card_label[32]; -+ -+ /** -+ * maximum allowed frame size -+ * if too low, default values are used -+ */ -+ int max_width; -+ int max_height; -+ -+ /** -+ * number of buffers to allocate for the queue -+ * if set to <=0, default values are used -+ */ -+ int max_buffers; -+ -+ /** -+ * how many consumers are allowed to open this device concurrently -+ * if set to <=0, default values are used -+ */ -+ int max_openers; -+ -+ /** -+ * set the debugging level for this device -+ */ -+ int debug; -+ -+ /** -+ * whether to announce OUTPUT/CAPTURE capabilities exclusively -+ * for this device or not -+ * (!exclusive_caps) -+ * NOTE: this is going to be removed once separate output/capture -+ * devices are implemented -+ */ -+ int announce_all_caps; -+}; -+ -+/* a pointer to a (struct v4l2_loopback_config) that has all values you wish to impose on the -+ * to-be-created device set. -+ * if the ptr is NULL, a new device is created with default values at the driver's discretion. 
-+ * -+ * returns the device_nr of the OUTPUT device (which can be used with V4L2LOOPBACK_CTL_QUERY, -+ * to get more information on the device) -+ */ -+#define V4L2LOOPBACK_CTL_ADD 0x4C80 -+ -+/* a pointer to a (struct v4l2_loopback_config) that has output_nr and/or capture_nr set -+ * (the two values must either refer to video-devices associated with the same loopback device -+ * or exactly one of them must be <0 -+ */ -+#define V4L2LOOPBACK_CTL_QUERY 0x4C82 -+ -+/* the device-number (either CAPTURE or OUTPUT) associated with the loopback-device */ -+#define V4L2LOOPBACK_CTL_REMOVE 0x4C81 -+ -+#endif /* _V4L2LOOPBACK_H */ -diff --git a/drivers/media/v4l2-core/v4l2loopback_formats.h b/drivers/media/v4l2-core/v4l2loopback_formats.h -new file mode 100644 -index 000000000000..d855a3796554 ---- /dev/null -+++ b/drivers/media/v4l2-core/v4l2loopback_formats.h -@@ -0,0 +1,445 @@ -+static const struct v4l2l_format formats[] = { -+#ifndef V4L2_PIX_FMT_VP9 -+#define V4L2_PIX_FMT_VP9 v4l2_fourcc('V', 'P', '9', '0') -+#endif -+#ifndef V4L2_PIX_FMT_HEVC -+#define V4L2_PIX_FMT_HEVC v4l2_fourcc('H', 'E', 'V', 'C') -+#endif -+ -+ /* here come the packed formats */ -+ { -+ .name = "32 bpp RGB, le", -+ .fourcc = V4L2_PIX_FMT_BGR32, -+ .depth = 32, -+ .flags = 0, -+ }, -+ { -+ .name = "32 bpp RGB, be", -+ .fourcc = V4L2_PIX_FMT_RGB32, -+ .depth = 32, -+ .flags = 0, -+ }, -+ { -+ .name = "24 bpp RGB, le", -+ .fourcc = V4L2_PIX_FMT_BGR24, -+ .depth = 24, -+ .flags = 0, -+ }, -+ { -+ .name = "24 bpp RGB, be", -+ .fourcc = V4L2_PIX_FMT_RGB24, -+ .depth = 24, -+ .flags = 0, -+ }, -+#ifdef V4L2_PIX_FMT_ABGR32 -+ { -+ .name = "32 bpp RGBA, le", -+ .fourcc = V4L2_PIX_FMT_ABGR32, -+ .depth = 32, -+ .flags = 0, -+ }, -+#endif -+#ifdef V4L2_PIX_FMT_RGBA32 -+ { -+ .name = "32 bpp RGBA", -+ .fourcc = V4L2_PIX_FMT_RGBA32, -+ .depth = 32, -+ .flags = 0, -+ }, -+#endif -+#ifdef V4L2_PIX_FMT_RGB332 -+ { -+ .name = "8 bpp RGB-3-3-2", -+ .fourcc = V4L2_PIX_FMT_RGB332, -+ .depth = 8, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB332 */ -+#ifdef V4L2_PIX_FMT_RGB444 -+ { -+ .name = "16 bpp RGB (xxxxrrrr ggggbbbb)", -+ .fourcc = V4L2_PIX_FMT_RGB444, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB444 */ -+#ifdef V4L2_PIX_FMT_RGB555 -+ { -+ .name = "16 bpp RGB-5-5-5", -+ .fourcc = V4L2_PIX_FMT_RGB555, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB555 */ -+#ifdef V4L2_PIX_FMT_RGB565 -+ { -+ .name = "16 bpp RGB-5-6-5", -+ .fourcc = V4L2_PIX_FMT_RGB565, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB565 */ -+#ifdef V4L2_PIX_FMT_RGB555X -+ { -+ .name = "16 bpp RGB-5-5-5 BE", -+ .fourcc = V4L2_PIX_FMT_RGB555X, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB555X */ -+#ifdef V4L2_PIX_FMT_RGB565X -+ { -+ .name = "16 bpp RGB-5-6-5 BE", -+ .fourcc = V4L2_PIX_FMT_RGB565X, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB565X */ -+#ifdef V4L2_PIX_FMT_BGR666 -+ { -+ .name = "18 bpp BGR-6-6-6", -+ .fourcc = V4L2_PIX_FMT_BGR666, -+ .depth = 18, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_BGR666 */ -+ { -+ .name = "4:2:2, packed, YUYV", -+ .fourcc = V4L2_PIX_FMT_YUYV, -+ .depth = 16, -+ .flags = 0, -+ }, -+ { -+ .name = "4:2:2, packed, UYVY", -+ .fourcc = V4L2_PIX_FMT_UYVY, -+ .depth = 16, -+ .flags = 0, -+ }, -+#ifdef V4L2_PIX_FMT_YVYU -+ { -+ .name = "4:2:2, packed YVYU", -+ .fourcc = V4L2_PIX_FMT_YVYU, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif -+#ifdef V4L2_PIX_FMT_VYUY -+ { -+ .name = "4:2:2, packed VYUY", -+ .fourcc = V4L2_PIX_FMT_VYUY, -+ .depth 
= 16, -+ .flags = 0, -+ }, -+#endif -+ { -+ .name = "4:2:2, packed YYUV", -+ .fourcc = V4L2_PIX_FMT_YYUV, -+ .depth = 16, -+ .flags = 0, -+ }, -+ { -+ .name = "YUV-8-8-8-8", -+ .fourcc = V4L2_PIX_FMT_YUV32, -+ .depth = 32, -+ .flags = 0, -+ }, -+ { -+ .name = "8 bpp, Greyscale", -+ .fourcc = V4L2_PIX_FMT_GREY, -+ .depth = 8, -+ .flags = 0, -+ }, -+#ifdef V4L2_PIX_FMT_Y4 -+ { -+ .name = "4 bpp Greyscale", -+ .fourcc = V4L2_PIX_FMT_Y4, -+ .depth = 4, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_Y4 */ -+#ifdef V4L2_PIX_FMT_Y6 -+ { -+ .name = "6 bpp Greyscale", -+ .fourcc = V4L2_PIX_FMT_Y6, -+ .depth = 6, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_Y6 */ -+#ifdef V4L2_PIX_FMT_Y10 -+ { -+ .name = "10 bpp Greyscale", -+ .fourcc = V4L2_PIX_FMT_Y10, -+ .depth = 10, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_Y10 */ -+#ifdef V4L2_PIX_FMT_Y12 -+ { -+ .name = "12 bpp Greyscale", -+ .fourcc = V4L2_PIX_FMT_Y12, -+ .depth = 12, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_Y12 */ -+ { -+ .name = "16 bpp, Greyscale", -+ .fourcc = V4L2_PIX_FMT_Y16, -+ .depth = 16, -+ .flags = 0, -+ }, -+#ifdef V4L2_PIX_FMT_YUV444 -+ { -+ .name = "16 bpp xxxxyyyy uuuuvvvv", -+ .fourcc = V4L2_PIX_FMT_YUV444, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_YUV444 */ -+#ifdef V4L2_PIX_FMT_YUV555 -+ { -+ .name = "16 bpp YUV-5-5-5", -+ .fourcc = V4L2_PIX_FMT_YUV555, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_YUV555 */ -+#ifdef V4L2_PIX_FMT_YUV565 -+ { -+ .name = "16 bpp YUV-5-6-5", -+ .fourcc = V4L2_PIX_FMT_YUV565, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_YUV565 */ -+ -+/* bayer formats */ -+#ifdef V4L2_PIX_FMT_SRGGB8 -+ { -+ .name = "Bayer RGGB 8bit", -+ .fourcc = V4L2_PIX_FMT_SRGGB8, -+ .depth = 8, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_SRGGB8 */ -+#ifdef V4L2_PIX_FMT_SGRBG8 -+ { -+ .name = "Bayer GRBG 8bit", -+ .fourcc = V4L2_PIX_FMT_SGRBG8, -+ .depth = 8, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_SGRBG8 */ -+#ifdef V4L2_PIX_FMT_SGBRG8 -+ { -+ .name = "Bayer GBRG 8bit", -+ .fourcc = V4L2_PIX_FMT_SGBRG8, -+ .depth = 8, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_SGBRG8 */ -+#ifdef V4L2_PIX_FMT_SBGGR8 -+ { -+ .name = "Bayer BA81 8bit", -+ .fourcc = V4L2_PIX_FMT_SBGGR8, -+ .depth = 8, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_SBGGR8 */ -+ -+ /* here come the planar formats */ -+ { -+ .name = "4:1:0, planar, Y-Cr-Cb", -+ .fourcc = V4L2_PIX_FMT_YVU410, -+ .depth = 9, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+ { -+ .name = "4:2:0, planar, Y-Cr-Cb", -+ .fourcc = V4L2_PIX_FMT_YVU420, -+ .depth = 12, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+ { -+ .name = "4:1:0, planar, Y-Cb-Cr", -+ .fourcc = V4L2_PIX_FMT_YUV410, -+ .depth = 9, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+ { -+ .name = "4:2:0, planar, Y-Cb-Cr", -+ .fourcc = V4L2_PIX_FMT_YUV420, -+ .depth = 12, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+#ifdef V4L2_PIX_FMT_YUV422P -+ { -+ .name = "16 bpp YVU422 planar", -+ .fourcc = V4L2_PIX_FMT_YUV422P, -+ .depth = 16, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+#endif /* V4L2_PIX_FMT_YUV422P */ -+#ifdef V4L2_PIX_FMT_YUV411P -+ { -+ .name = "16 bpp YVU411 planar", -+ .fourcc = V4L2_PIX_FMT_YUV411P, -+ .depth = 16, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+#endif /* V4L2_PIX_FMT_YUV411P */ -+#ifdef V4L2_PIX_FMT_Y41P -+ { -+ .name = "12 bpp YUV 4:1:1", -+ .fourcc = V4L2_PIX_FMT_Y41P, -+ .depth = 12, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+#endif /* V4L2_PIX_FMT_Y41P */ -+#ifdef V4L2_PIX_FMT_NV12 -+ { -+ .name = "12 bpp Y/CbCr 4:2:0 ", -+ .fourcc = V4L2_PIX_FMT_NV12, -+ 
.depth = 12, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+#endif /* V4L2_PIX_FMT_NV12 */ -+ -+/* here come the compressed formats */ -+ -+#ifdef V4L2_PIX_FMT_MJPEG -+ { -+ .name = "Motion-JPEG", -+ .fourcc = V4L2_PIX_FMT_MJPEG, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_MJPEG */ -+#ifdef V4L2_PIX_FMT_JPEG -+ { -+ .name = "JFIF JPEG", -+ .fourcc = V4L2_PIX_FMT_JPEG, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_JPEG */ -+#ifdef V4L2_PIX_FMT_DV -+ { -+ .name = "DV1394", -+ .fourcc = V4L2_PIX_FMT_DV, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_DV */ -+#ifdef V4L2_PIX_FMT_MPEG -+ { -+ .name = "MPEG-1/2/4 Multiplexed", -+ .fourcc = V4L2_PIX_FMT_MPEG, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_MPEG */ -+#ifdef V4L2_PIX_FMT_H264 -+ { -+ .name = "H264 with start codes", -+ .fourcc = V4L2_PIX_FMT_H264, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_H264 */ -+#ifdef V4L2_PIX_FMT_H264_NO_SC -+ { -+ .name = "H264 without start codes", -+ .fourcc = V4L2_PIX_FMT_H264_NO_SC, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_H264_NO_SC */ -+#ifdef V4L2_PIX_FMT_H264_MVC -+ { -+ .name = "H264 MVC", -+ .fourcc = V4L2_PIX_FMT_H264_MVC, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_H264_MVC */ -+#ifdef V4L2_PIX_FMT_H263 -+ { -+ .name = "H263", -+ .fourcc = V4L2_PIX_FMT_H263, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_H263 */ -+#ifdef V4L2_PIX_FMT_MPEG1 -+ { -+ .name = "MPEG-1 ES", -+ .fourcc = V4L2_PIX_FMT_MPEG1, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_MPEG1 */ -+#ifdef V4L2_PIX_FMT_MPEG2 -+ { -+ .name = "MPEG-2 ES", -+ .fourcc = V4L2_PIX_FMT_MPEG2, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_MPEG2 */ -+#ifdef V4L2_PIX_FMT_MPEG4 -+ { -+ .name = "MPEG-4 part 2 ES", -+ .fourcc = V4L2_PIX_FMT_MPEG4, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_MPEG4 */ -+#ifdef V4L2_PIX_FMT_XVID -+ { -+ .name = "Xvid", -+ .fourcc = V4L2_PIX_FMT_XVID, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_XVID */ -+#ifdef V4L2_PIX_FMT_VC1_ANNEX_G -+ { -+ .name = "SMPTE 421M Annex G compliant stream", -+ .fourcc = V4L2_PIX_FMT_VC1_ANNEX_G, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_VC1_ANNEX_G */ -+#ifdef V4L2_PIX_FMT_VC1_ANNEX_L -+ { -+ .name = "SMPTE 421M Annex L compliant stream", -+ .fourcc = V4L2_PIX_FMT_VC1_ANNEX_L, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_VC1_ANNEX_L */ -+#ifdef V4L2_PIX_FMT_VP8 -+ { -+ .name = "VP8", -+ .fourcc = V4L2_PIX_FMT_VP8, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_VP8 */ -+#ifdef V4L2_PIX_FMT_VP9 -+ { -+ .name = "VP9", -+ .fourcc = V4L2_PIX_FMT_VP9, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_VP9 */ -+#ifdef V4L2_PIX_FMT_HEVC -+ { -+ .name = "HEVC", -+ .fourcc = V4L2_PIX_FMT_HEVC, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_HEVC */ -+}; --- -2.40.0.rc2 +2.40.0 diff --git a/patches/0002-eevdf.patch b/patches/0002-eevdf.patch index 11213cb..0df5c7b 100644 --- a/patches/0002-eevdf.patch +++ b/patches/0002-eevdf.patch @@ -1,80 +1,29 @@ -From 
b6d3ec3be2639fe928a09b558e979c36b41ea63b Mon Sep 17 00:00:00 2001 +From 40a2f9f3e7e56936385c5a97957cd43fbb85fd32 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sat, 11 Mar 2023 18:42:39 +0100 +Date: Sun, 9 Apr 2023 21:35:07 +0200 Subject: [PATCH] EEVDF -Ever since looking at the latency-nice patches, I've wondered if EEVDF would -not make more sense, and I did point Vincent at some older patches I had for -that (which is here his augmented rbtree thing comes from). - -Also, since I really dislike the dual tree, I also figured we could dynamically -switch between an augmented tree and not (and while I have code for that, -that's not included in this posting because with the current results I don't -think we actually need this). - -Anyway, since I'm somewhat under the weather, I spend last week desperately -trying to connect a small cluster of neurons in defiance of the snot overlord -and bring back the EEVDF patches from the dark crypts where they'd been -gathering cobwebs for the past 13 odd years. - -By friday they worked well enough, and this morning (because obviously I forgot -the weekend is ideal to run benchmarks) I ran a bunch of hackbenck, netperf, -tbench and sysbench -- there's a bunch of wins and losses, but nothing that -indicates a total fail. - -( in fact, some of the schbench results seem to indicate EEVDF schedules a lot - more consistent than CFS and has a bunch of latency wins ) - -( hackbench also doesn't show the augmented tree and generally more expensive - pick to be a loss, in fact it shows a slight win here ) - - hackbech load + cyclictest --policy other results: - - EEVDF CFS - - # Min Latencies: 00053 - LNICE(19) # Avg Latencies: 04350 - # Max Latencies: 76019 - - # Min Latencies: 00052 00053 - LNICE(0) # Avg Latencies: 00690 00687 - # Max Latencies: 14145 13913 - - # Min Latencies: 00019 - LNICE(-19) # Avg Latencies: 00261 - # Max Latencies: 05642 - -The nice -19 numbers aren't as pretty as Vincent's, but at the end I was going -cross-eyed from staring at tree prints and I just couldn't figure out where it -was going side-ways. - -There's definitely more benchmarking/tweaking to be done (0-day already -reported a stress-ng loss), but if we can pull this off we can delete a whole -much of icky heuristics code. EEVDF is a much better defined policy than what -we currently have. 
- Signed-off-by: Peter Jung --- - Documentation/admin-guide/cgroup-v2.rst | 10 + - include/linux/rbtree_augmented.h | 26 ++ - include/linux/sched.h | 8 + - include/linux/sched/prio.h | 27 ++ - include/uapi/linux/sched.h | 4 +- - include/uapi/linux/sched/types.h | 19 + - init/init_task.c | 1 + - kernel/sched/core.c | 66 ++++ - kernel/sched/debug.c | 39 +- - kernel/sched/fair.c | 486 ++++++++++++++++++++---- - kernel/sched/features.h | 10 +- - kernel/sched/sched.h | 12 + - tools/include/uapi/linux/sched.h | 4 +- - 13 files changed, 614 insertions(+), 98 deletions(-) + Documentation/admin-guide/cgroup-v2.rst | 10 + + include/linux/rbtree_augmented.h | 26 + + include/linux/sched.h | 9 +- + include/uapi/linux/sched.h | 4 +- + include/uapi/linux/sched/types.h | 19 + + init/init_task.c | 3 +- + kernel/sched/core.c | 67 +- + kernel/sched/debug.c | 50 +- + kernel/sched/fair.c | 1171 ++++++++++------------- + kernel/sched/features.h | 28 +- + kernel/sched/sched.h | 23 +- + tools/include/uapi/linux/sched.h | 4 +- + 12 files changed, 697 insertions(+), 717 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst -index 74cec76be9f2..2e511d4a4c6a 100644 +index f67c0829350b..a39dfda3d032 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst -@@ -1118,6 +1118,16 @@ All time durations are in microseconds. +@@ -1121,6 +1121,16 @@ All time durations are in microseconds. values similar to the sched_setattr(2). This maximum utilization value is used to clamp the task specific maximum utilization clamp. @@ -129,7 +78,7 @@ index d1c53e9d8c75..a78e692a9ff5 100644 * Template for declaring augmented rbtree callbacks (generic case) * diff --git a/include/linux/sched.h b/include/linux/sched.h -index 28ce1be0ba47..764df627c243 100644 +index 6d398b337b0d..6a719374f688 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -548,6 +548,9 @@ struct sched_entity { @@ -142,25 +91,21 @@ index 28ce1be0ba47..764df627c243 100644 struct list_head group_node; unsigned int on_rq; -@@ -555,6 +558,8 @@ struct sched_entity { +@@ -555,11 +558,10 @@ struct sched_entity { u64 sum_exec_runtime; u64 vruntime; u64 prev_sum_exec_runtime; -+ s64 lag; ++ s64 vlag; + u64 slice; u64 nr_migrations; - u64 prev_sleep_sum_runtime; -@@ -571,6 +576,8 @@ struct sched_entity { - /* cached value of my_q->h_nr_running */ - unsigned long runnable_weight; - #endif -+ /* preemption offset in ns */ -+ long latency_offset; +- u64 prev_sleep_sum_runtime; +- /* average duration of a task */ +- u64 dur_avg; - #ifdef CONFIG_SMP - /* -@@ -787,6 +794,7 @@ struct task_struct { + #ifdef CONFIG_FAIR_GROUP_SCHED + int depth; +@@ -787,6 +789,7 @@ struct task_struct { int static_prio; int normal_prio; unsigned int rt_priority; @@ -168,42 +113,6 @@ index 28ce1be0ba47..764df627c243 100644 struct sched_entity se; struct sched_rt_entity rt; -diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h -index ab83d85e1183..be79503d86af 100644 ---- a/include/linux/sched/prio.h -+++ b/include/linux/sched/prio.h -@@ -42,4 +42,31 @@ static inline long rlimit_to_nice(long prio) - return (MAX_NICE - prio + 1); - } - -+/* -+ * Latency nice is meant to provide scheduler hints about the relative -+ * latency requirements of a task with respect to other tasks. -+ * Thus a task with latency_nice == 19 can be hinted as the task with no -+ * latency requirements, in contrast to the task with latency_nice == -20 -+ * which should be given priority in terms of lower latency. 
-+ */ -+#define MAX_LATENCY_NICE 19 -+#define MIN_LATENCY_NICE -20 -+ -+#define LATENCY_NICE_WIDTH \ -+ (MAX_LATENCY_NICE - MIN_LATENCY_NICE + 1) -+ -+/* -+ * Default tasks should be treated as a task with latency_nice = 0. -+ */ -+#define DEFAULT_LATENCY_NICE 0 -+#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2) -+ -+/* -+ * Convert user-nice values [ -20 ... 0 ... 19 ] -+ * to static latency [ 0..39 ], -+ * and back. -+ */ -+#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO) -+#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO) -+ - #endif /* _LINUX_SCHED_PRIO_H */ diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 3bac0a8ceab2..b2e932c25be6 100644 --- a/include/uapi/linux/sched.h @@ -270,96 +179,113 @@ index f2c4589d4dbf..db1e8199e8c8 100644 #endif /* _UAPI_LINUX_SCHED_TYPES_H */ diff --git a/init/init_task.c b/init/init_task.c -index ff6c4b9bfe6b..071deff8dbd1 100644 +index ff6c4b9bfe6b..511cbcf3510d 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -78,6 +78,7 @@ struct task_struct init_task .prio = MAX_PRIO - 20, .static_prio = MAX_PRIO - 20, .normal_prio = MAX_PRIO - 20, -+ .latency_prio = DEFAULT_LATENCY_PRIO, ++ .latency_prio = DEFAULT_PRIO, .policy = SCHED_NORMAL, .cpus_ptr = &init_task.cpus_mask, .user_cpus_ptr = NULL, +@@ -89,7 +90,7 @@ struct task_struct init_task + .fn = do_no_restart_syscall, + }, + .se = { +- .group_node = LIST_HEAD_INIT(init_task.se.group_node), ++ .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, + .rt = { + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 5237639786b7..9db5f9ec9022 100644 +index 17bb9637f314..fbc08605b068 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -1283,6 +1283,11 @@ static void set_load_weight(struct task_struct *p, bool update_load) +@@ -1285,6 +1285,12 @@ static void set_load_weight(struct task_struct *p, bool update_load) } } -+static void set_latency_offset(struct task_struct *p) ++static inline void set_latency_prio(struct task_struct *p, int prio) +{ -+ p->se.latency_offset = calc_latency_offset(p->latency_prio); ++ p->latency_prio = prio; ++ set_latency_fair(&p->se, prio - MAX_RT_PRIO); +} + #ifdef CONFIG_UCLAMP_TASK /* * Serializes updates of utilization clamp values -@@ -4431,8 +4436,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4434,10 +4440,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; p->se.vruntime = 0; - p->se.dur_avg = 0; - p->se.prev_sleep_sum_runtime = 0; -+ p->se.lag = 0; +- p->se.dur_avg = 0; +- p->se.prev_sleep_sum_runtime = 0; ++ p->se.vlag = 0; INIT_LIST_HEAD(&p->se.group_node); -+ set_latency_offset(p); ++ set_latency_prio(p, p->latency_prio); + #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; #endif -@@ -4684,6 +4692,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4688,6 +4695,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + p->prio = p->normal_prio = p->static_prio; set_load_weight(p, false); ++ set_latency_prio(p, NICE_TO_PRIO(0)); -+ p->latency_prio = NICE_TO_LATENCY(0); -+ set_latency_offset(p); -+ /* * We don't need the reset flag anymore after the fork. 
It has - * fulfilled its duty: -@@ -7446,6 +7457,15 @@ static void __setscheduler_params(struct task_struct *p, +@@ -7433,7 +7441,7 @@ static struct task_struct *find_process_by_pid(pid_t pid) + #define SETPARAM_POLICY -1 + + static void __setscheduler_params(struct task_struct *p, +- const struct sched_attr *attr) ++ const struct sched_attr *attr) + { + int policy = attr->sched_policy; + +@@ -7457,6 +7465,13 @@ static void __setscheduler_params(struct task_struct *p, set_load_weight(p, true); } +static void __setscheduler_latency(struct task_struct *p, -+ const struct sched_attr *attr) ++ const struct sched_attr *attr) +{ -+ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { -+ p->latency_prio = NICE_TO_LATENCY(attr->sched_latency_nice); -+ set_latency_offset(p); -+ } ++ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) ++ set_latency_prio(p, NICE_TO_PRIO(attr->sched_latency_nice)); +} + /* * Check the target process has a UID that matches the current process's: */ -@@ -7586,6 +7606,13 @@ static int __sched_setscheduler(struct task_struct *p, +@@ -7597,6 +7612,13 @@ static int __sched_setscheduler(struct task_struct *p, return retval; } + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { -+ if (attr->sched_latency_nice > MAX_LATENCY_NICE) ++ if (attr->sched_latency_nice > MAX_NICE) + return -EINVAL; -+ if (attr->sched_latency_nice < MIN_LATENCY_NICE) ++ if (attr->sched_latency_nice < MIN_NICE) + return -EINVAL; + } + if (pi) cpuset_read_lock(); -@@ -7620,6 +7647,9 @@ static int __sched_setscheduler(struct task_struct *p, +@@ -7631,6 +7653,9 @@ static int __sched_setscheduler(struct task_struct *p, goto change; if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) goto change; + if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE && -+ attr->sched_latency_nice != LATENCY_TO_NICE(p->latency_prio)) ++ attr->sched_latency_nice != PRIO_TO_NICE(p->latency_prio)) + goto change; p->sched_reset_on_fork = reset_on_fork; retval = 0; -@@ -7708,6 +7738,7 @@ static int __sched_setscheduler(struct task_struct *p, +@@ -7719,6 +7744,7 @@ static int __sched_setscheduler(struct task_struct *p, __setscheduler_params(p, attr); __setscheduler_prio(p, newprio); } @@ -367,7 +293,7 @@ index 5237639786b7..9db5f9ec9022 100644 __setscheduler_uclamp(p, attr); if (queued) { -@@ -7918,6 +7949,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a +@@ -7929,6 +7955,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a size < SCHED_ATTR_SIZE_VER1) return -EINVAL; @@ -377,16 +303,16 @@ index 5237639786b7..9db5f9ec9022 100644 /* * XXX: Do we want to be lenient like existing syscalls; or do we want * to be strict and return an error on out-of-bounds values? 
-@@ -8155,6 +8189,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, +@@ -8166,6 +8195,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, get_params(p, &kattr); kattr.sched_flags &= SCHED_FLAG_ALL; -+ kattr.sched_latency_nice = LATENCY_TO_NICE(p->latency_prio); ++ kattr.sched_latency_nice = PRIO_TO_NICE(p->latency_prio); + #ifdef CONFIG_UCLAMP_TASK /* * This could race with another potential updater, but this is fine -@@ -11027,6 +11063,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, +@@ -11038,6 +11069,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, { return sched_group_set_idle(css_tg(css), idle); } @@ -394,7 +320,7 @@ index 5237639786b7..9db5f9ec9022 100644 +static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ -+ return LATENCY_TO_NICE(css_tg(css)->latency_prio); ++ return PRIO_TO_NICE(css_tg(css)->latency_prio); +} + +static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css, @@ -402,17 +328,17 @@ index 5237639786b7..9db5f9ec9022 100644 +{ + int prio; + -+ if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE) ++ if (nice < MIN_NICE || nice > MAX_NICE) + return -ERANGE; + -+ prio = NICE_TO_LATENCY(nice); ++ prio = NICE_TO_PRIO(nice); + + return sched_group_set_latency(css_tg(css), prio); +} #endif static struct cftype cpu_legacy_files[] = { -@@ -11041,6 +11096,11 @@ static struct cftype cpu_legacy_files[] = { +@@ -11052,6 +11102,11 @@ static struct cftype cpu_legacy_files[] = { .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, }, @@ -424,7 +350,7 @@ index 5237639786b7..9db5f9ec9022 100644 #endif #ifdef CONFIG_CFS_BANDWIDTH { -@@ -11258,6 +11318,12 @@ static struct cftype cpu_files[] = { +@@ -11269,6 +11324,12 @@ static struct cftype cpu_files[] = { .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, }, @@ -438,10 +364,22 @@ index 5237639786b7..9db5f9ec9022 100644 #ifdef CONFIG_CFS_BANDWIDTH { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 8d64fba16cfe..fe9edfa43f65 100644 +index 8d64fba16cfe..e0d10ac21016 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c -@@ -535,9 +535,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +@@ -308,10 +308,7 @@ static __init int sched_init_debug(void) + debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); + #endif + +- debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency); +- debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity); +- debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity); +- debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity); ++ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); + + debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); + debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); +@@ -535,9 +532,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " %c", task_state_to_char(p)); @@ -456,7 +394,7 @@ index 8d64fba16cfe..fe9edfa43f65 100644 (long long)(p->nvcsw + p->nivcsw), p->prio); -@@ -580,10 +584,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) +@@ -580,10 +581,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) 
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { @@ -469,7 +407,7 @@ index 8d64fba16cfe..fe9edfa43f65 100644 unsigned long flags; #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -597,26 +600,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) +@@ -597,26 +597,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_rq_lock_irqsave(rq, flags); @@ -509,7 +447,27 @@ index 8d64fba16cfe..fe9edfa43f65 100644 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); -@@ -1044,6 +1046,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, +@@ -817,10 +816,7 @@ static void sched_debug_header(struct seq_file *m) + SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) + #define PN(x) \ + SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) +- PN(sysctl_sched_latency); +- PN(sysctl_sched_min_granularity); +- PN(sysctl_sched_idle_min_granularity); +- PN(sysctl_sched_wakeup_granularity); ++ PN(sysctl_sched_base_slice); + P(sysctl_sched_child_runs_first); + P(sysctl_sched_features); + #undef PN +@@ -1024,7 +1020,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + __PS("nr_involuntary_switches", p->nivcsw); + + P(se.load.weight); +- P(se.dur_avg); + #ifdef CONFIG_SMP + P(se.avg.load_sum); + P(se.avg.runnable_sum); +@@ -1044,6 +1039,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, #endif P(policy); P(prio); @@ -518,7 +476,7 @@ index 8d64fba16cfe..fe9edfa43f65 100644 P(dl.runtime); P(dl.deadline); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 84254f52c56a..c40b775452bc 100644 +index 115be8a965f2..76bd212ee5bd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -47,6 +47,7 @@ @@ -529,7 +487,128 @@ index 84254f52c56a..c40b775452bc 100644 #include -@@ -619,13 +620,134 @@ static inline bool entity_before(struct sched_entity *a, +@@ -56,26 +57,6 @@ + #include "stats.h" + #include "autogroup.h" + +-/* +- * Targeted preemption latency for CPU-bound tasks: +- * +- * NOTE: this latency value is not the same as the concept of +- * 'timeslice length' - timeslices in CFS are of variable length +- * and have no persistent notion like in traditional, time-slice +- * based scheduling concepts. +- * +- * (to see the precise effective timeslice length of your workload, +- * run vmstat and monitor the context-switches (cs) field) +- * +- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) +- */ +-#ifdef CONFIG_CACHY +-unsigned int sysctl_sched_latency = 3000000ULL; +-static unsigned int normalized_sysctl_sched_latency = 3000000ULL; +-#else +-unsigned int sysctl_sched_latency = 6000000ULL; +-static unsigned int normalized_sysctl_sched_latency = 6000000ULL; +-#endif + /* + * The initial- and re-scaling of tunables is configurable + * +@@ -94,26 +75,8 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; + * + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ +-#ifdef CONFIG_CACHY +-unsigned int sysctl_sched_min_granularity = 400000ULL; +-static unsigned int normalized_sysctl_sched_min_granularity = 400000ULL; +-#else +-unsigned int sysctl_sched_min_granularity = 750000ULL; +-static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +-#endif +- +-/* +- * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks. +- * Applies only when SCHED_IDLE tasks compete with normal tasks. 
+- * +- * (default: 0.75 msec) +- */ +-unsigned int sysctl_sched_idle_min_granularity = 750000ULL; +- +-/* +- * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity +- */ +-static unsigned int sched_nr_latency = 8; ++unsigned int sysctl_sched_base_slice = 750000ULL; ++static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; + + /* + * After fork, child runs first. If set to 0 (default) then +@@ -121,23 +84,6 @@ static unsigned int sched_nr_latency = 8; + */ + unsigned int sysctl_sched_child_runs_first __read_mostly; + +-/* +- * SCHED_OTHER wake-up granularity. +- * +- * This option delays the preemption effects of decoupled workloads +- * and reduces their over-scheduling. Synchronous workloads will still +- * have immediate wakeup/sleep latencies. +- * +- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) +- */ +-#ifdef CONFIG_CACHY +-unsigned int sysctl_sched_wakeup_granularity = 500000UL; +-static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; +-#else +-unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +-static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +-#endif +- + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + + int sched_thermal_decay_shift; +@@ -189,12 +135,8 @@ int __weak arch_asym_cpu_priority(int cpu) + * + * (default: 5 msec, units: microseconds) + */ +-#ifdef CONFIG_CACHY +-static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; +-#else + static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; + #endif +-#endif + + #ifdef CONFIG_NUMA_BALANCING + /* Restrict the NUMA promotion throughput (MB/s) for each target node. */ +@@ -295,9 +237,7 @@ static void update_sysctl(void) + + #define SET_SYSCTL(name) \ + (sysctl_##name = (factor) * normalized_sysctl_##name) +- SET_SYSCTL(sched_min_granularity); +- SET_SYSCTL(sched_latency); +- SET_SYSCTL(sched_wakeup_granularity); ++ SET_SYSCTL(sched_base_slice); + #undef SET_SYSCTL + } + +@@ -365,6 +305,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight + return mul_u64_u32_shr(delta_exec, fact, shift); + } + ++/* ++ * delta /= w ++ */ ++static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) ++{ ++ if (unlikely(se->load.weight != NICE_0_LOAD)) ++ delta = __calc_delta(delta, NICE_0_LOAD, &se->load); ++ ++ return delta; ++} + + const struct sched_class fair_sched_class; + +@@ -619,35 +569,203 @@ static inline bool entity_before(const struct sched_entity *a, return (s64)(a->vruntime - b->vruntime) < 0; } @@ -541,6 +620,7 @@ index 84254f52c56a..c40b775452bc 100644 #define __node_2_se(node) \ rb_entry((node), struct sched_entity, run_node) +-static void update_min_vruntime(struct cfs_rq *cfs_rq) +/* + * Compute virtual time from the per-task service numbers: + * @@ -576,17 +656,23 @@ index 84254f52c56a..c40b775452bc 100644 +static void +avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ ++ unsigned long weight = scale_load_down(se->load.weight); + s64 key = entity_key(cfs_rq, se); -+ cfs_rq->avg_vruntime += key * se->load.weight; -+ cfs_rq->avg_load += se->load.weight; ++ ++ cfs_rq->avg_vruntime += key * weight; ++ cfs_rq->avg_slice += se->slice * weight; ++ cfs_rq->avg_load += weight; +} + +static void +avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ ++ unsigned long weight = scale_load_down(se->load.weight); + s64 key = entity_key(cfs_rq, se); -+ cfs_rq->avg_vruntime -= key * se->load.weight; -+ cfs_rq->avg_load -= se->load.weight; ++ 
++ cfs_rq->avg_vruntime -= key * weight; ++ cfs_rq->avg_slice -= se->slice * weight; ++ cfs_rq->avg_load -= weight; +} + +static inline @@ -599,27 +685,69 @@ index 84254f52c56a..c40b775452bc 100644 +} + +u64 avg_vruntime(struct cfs_rq *cfs_rq) -+{ -+ struct sched_entity *curr = cfs_rq->curr; -+ s64 lag = cfs_rq->avg_vruntime; + { + struct sched_entity *curr = cfs_rq->curr; +- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); ++ s64 avg = cfs_rq->avg_vruntime; + long load = cfs_rq->avg_load; -+ + +- u64 vruntime = cfs_rq->min_vruntime; + if (curr && curr->on_rq) { -+ lag += entity_key(cfs_rq, curr) * curr->load.weight; -+ load += curr->load.weight; -+ } -+ ++ unsigned long weight = scale_load_down(curr->load.weight); + +- if (curr) { +- if (curr->on_rq) +- vruntime = curr->vruntime; +- else +- curr = NULL; ++ avg += entity_key(cfs_rq, curr) * weight; ++ load += weight; + } + +- if (leftmost) { /* non-empty tree */ +- struct sched_entity *se = __node_2_se(leftmost); + if (load) -+ lag = div_s64(lag, load); ++ avg = div_s64(avg, load); + +- if (!curr) +- vruntime = se->vruntime; +- else +- vruntime = min_vruntime(vruntime, se->vruntime); ++ return cfs_rq->min_vruntime + avg; ++} + -+ return cfs_rq->min_vruntime + lag; ++/* ++ * lag_i = S - s_i = w_i * (V - v_i) ++ * ++ * However, since V is approximated by the weighted average of all entities it ++ * is possible -- by addition/removal/reweight to the tree -- to move V around ++ * and end up with a larger lag than we started with. ++ * ++ * Limit this to either double the slice length with a minimum of TICK_NSEC ++ * since that is the timing granularity. ++ * ++ * EEVDF gives the following limit for a steady state system: ++ * ++ * -r_max < lag < max(r_max, q) ++ * ++ * XXX could add max_slice to the augmented data to track this. ++ */ ++void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) ++{ ++ s64 lag, limit; ++ ++ SCHED_WARN_ON(!se->on_rq); ++ lag = avg_vruntime(cfs_rq) - se->vruntime; ++ ++ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); ++ se->vlag = clamp(lag, -limit, limit); +} + +/* + * Entity is eligible once it received less service than it ought to have, + * eg. lag >= 0. + * -+ * lag_i = S - s_i = w_i*(V - w_i) ++ * lag_i = S - s_i = w_i*(V - v_i) + * + * lag_i >= 0 -> V >= v_i + * @@ -628,19 +756,27 @@ index 84254f52c56a..c40b775452bc 100644 + * \Sum w_i + * + * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) ++ * ++ * Note: using 'avg_vruntime() > se->vruntime' is inacurate due ++ * to the loss in precision caused by the division. + */ +int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct sched_entity *curr = cfs_rq->curr; -+ s64 avg_vruntime = cfs_rq->avg_vruntime; -+ long avg_load = cfs_rq->avg_load; ++ s64 avg = cfs_rq->avg_vruntime; ++ long load = cfs_rq->avg_load; + + if (curr && curr->on_rq) { -+ avg_vruntime += entity_key(cfs_rq, curr) * curr->load.weight; -+ avg_load += curr->load.weight; -+ } ++ unsigned long weight = scale_load_down(curr->load.weight); + -+ return avg_vruntime >= entity_key(cfs_rq, se) * avg_load; ++ avg += entity_key(cfs_rq, curr) * weight; ++ load += weight; + } + +- /* ensure we never gain time by being placed backwards. 
*/ +- u64_u32_store(cfs_rq->min_vruntime, +- max_vruntime(cfs_rq->min_vruntime, vruntime)); ++ return avg >= entity_key(cfs_rq, se) * load; +} + +static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) @@ -657,35 +793,43 @@ index 84254f52c56a..c40b775452bc 100644 + return min_vruntime; +} + - static void update_min_vruntime(struct cfs_rq *cfs_rq) - { -+ struct sched_entity *se = __pick_first_entity(cfs_rq); - struct sched_entity *curr = cfs_rq->curr; -- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); - - u64 vruntime = cfs_rq->min_vruntime; - -@@ -636,9 +758,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) - curr = NULL; - } - -- if (leftmost) { /* non-empty tree */ -- struct sched_entity *se = __node_2_se(leftmost); -- -+ if (se) { - if (!curr) - vruntime = se->vruntime; - else -@@ -647,7 +767,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) - - /* ensure we never gain time by being placed backwards. */ - u64_u32_store(cfs_rq->min_vruntime, -- max_vruntime(cfs_rq->min_vruntime, vruntime)); -+ __update_min_vruntime(cfs_rq, vruntime)); ++static void update_min_vruntime(struct cfs_rq *cfs_rq) ++{ ++ if (sched_feat(MINIMAL_VA)) { ++ u64 vruntime = avg_vruntime(cfs_rq); ++ s64 delta = (s64)(vruntime - cfs_rq->min_vruntime); ++ ++ avg_vruntime_update(cfs_rq, delta); ++ ++ u64_u32_store(cfs_rq->min_vruntime, vruntime); ++ } else { ++ struct sched_entity *se = __pick_first_entity(cfs_rq); ++ struct sched_entity *curr = cfs_rq->curr; ++ ++ u64 vruntime = cfs_rq->min_vruntime; ++ ++ if (curr) { ++ if (curr->on_rq) ++ vruntime = curr->vruntime; ++ else ++ curr = NULL; ++ } ++ ++ if (se) { ++ if (!curr) ++ vruntime = se->vruntime; ++ else ++ vruntime = min_vruntime(vruntime, se->vruntime); ++ } ++ ++ /* ensure we never gain time by being placed backwards. */ ++ u64_u32_store(cfs_rq->min_vruntime, ++ __update_min_vruntime(cfs_rq, vruntime)); ++ } } static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) -@@ -655,17 +775,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +@@ -655,17 +773,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) return entity_before(__node_2_se(a), __node_2_se(b)); } @@ -739,24 +883,11 @@ index 84254f52c56a..c40b775452bc 100644 } struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) -@@ -688,6 +842,101 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) - return __node_2_se(next); +@@ -678,14 +830,81 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) + return __node_2_se(left); } -+static struct sched_entity *pick_cfs(struct cfs_rq *cfs_rq, struct sched_entity *curr) -+{ -+ struct sched_entity *left = __pick_first_entity(cfs_rq); -+ -+ /* -+ * If curr is set we have to see if its left of the leftmost entity -+ * still in the tree, provided there was anything in the tree at all. -+ */ -+ if (!left || (curr && entity_before(curr, left))) -+ left = curr; -+ -+ return left; -+} -+ +-static struct sched_entity *__pick_next_entity(struct sched_entity *se) +/* + * Earliest Eligible Virtual Deadline First + * @@ -777,11 +908,14 @@ index 84254f52c56a..c40b775452bc 100644 + * Which allows an EDF like search on (sub)trees. 
+ */ +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) -+{ + { +- struct rb_node *next = rb_next(&se->run_node); + struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; + struct sched_entity *curr = cfs_rq->curr; + struct sched_entity *best = NULL; -+ + +- if (!next) +- return NULL; + if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) + curr = NULL; + @@ -790,10 +924,6 @@ index 84254f52c56a..c40b775452bc 100644 + + /* + * If this entity is not eligible, try the left subtree. -+ * -+ * XXX: would it be worth it to do the single division for -+ * avg_vruntime() once, instead of the multiplication -+ * in entity_eligible() O(log n) times? + */ + if (!entity_eligible(cfs_rq, se)) { + node = node->rb_left; @@ -834,81 +964,153 @@ index 84254f52c56a..c40b775452bc 100644 + return left; + } + } -+ + +- return __node_2_se(next); + return best; -+} -+ + } + #ifdef CONFIG_SCHED_DEBUG - struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +@@ -707,104 +926,43 @@ int sched_update_scaling(void) { -@@ -721,6 +970,14 @@ int sched_update_scaling(void) + unsigned int factor = get_update_sysctl_factor(); + +- sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, +- sysctl_sched_min_granularity); +- + #define WRT_SYSCTL(name) \ + (normalized_sysctl_##name = sysctl_##name / (factor)) +- WRT_SYSCTL(sched_min_granularity); +- WRT_SYSCTL(sched_latency); +- WRT_SYSCTL(sched_wakeup_granularity); ++ WRT_SYSCTL(sched_base_slice); + #undef WRT_SYSCTL + + return 0; } #endif -+long calc_latency_offset(int prio) -+{ +-/* +- * delta /= w +- */ +-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) ++void set_latency_fair(struct sched_entity *se, int prio) + { +- if (unlikely(se->load.weight != NICE_0_LOAD)) +- delta = __calc_delta(delta, NICE_0_LOAD, &se->load); + u32 weight = sched_prio_to_weight[prio]; -+ u64 base = sysctl_sched_min_granularity; -+ -+ return div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight); -+} -+ - /* - * delta /= w - */ -@@ -797,14 +1054,30 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) - return slice; ++ u64 base = sysctl_sched_base_slice; + +- return delta; +-} +- +-/* +- * The idea is to set a period in which each task runs once. +- * +- * When there are too many tasks (sched_nr_latency) we have to stretch +- * this period because otherwise the slices get too small. +- * +- * p = (nr <= nl) ? l : l*nr/nl +- */ +-static u64 __sched_period(unsigned long nr_running) +-{ +- if (unlikely(nr_running > sched_nr_latency)) +- return nr_running * sysctl_sched_min_granularity; +- else +- return sysctl_sched_latency; ++ /* ++ * For EEVDF the virtual time slope is determined by w_i (iow. ++ * nice) while the request time r_i is determined by ++ * latency-nice. ++ * ++ * Smaller request gets better latency. ++ */ ++ se->slice = div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight); } +-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq); +- + /* +- * We calculate the wall-time slice from the period by taking a part +- * proportional to the weight. +- * +- * s = p*P[w/rw] ++ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i ++ * this is probably good enough. 
+ */ +-static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) ++static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) + { +- unsigned int nr_running = cfs_rq->nr_running; +- struct sched_entity *init_se = se; +- unsigned int min_gran; +- u64 slice; +- +- if (sched_feat(ALT_PERIOD)) +- nr_running = rq_of(cfs_rq)->cfs.h_nr_running; +- +- slice = __sched_period(nr_running + !se->on_rq); +- +- for_each_sched_entity(se) { +- struct load_weight *load; +- struct load_weight lw; +- struct cfs_rq *qcfs_rq; +- +- qcfs_rq = cfs_rq_of(se); +- load = &qcfs_rq->load; +- +- if (unlikely(!se->on_rq)) { +- lw = qcfs_rq->load; +- +- update_load_add(&lw, se->load.weight); +- load = &lw; +- } +- slice = __calc_delta(slice, se->load.weight, load); +- } +- +- if (sched_feat(BASE_SLICE)) { +- if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq)) +- min_gran = sysctl_sched_idle_min_granularity; +- else +- min_gran = sysctl_sched_min_granularity; +- +- slice = max_t(u64, slice, min_gran); +- } +- +- return slice; +-} ++ if ((s64)(se->vruntime - se->deadline) < 0) ++ return; + -/* - * We calculate the vruntime slice of a to-be-inserted task. - * - * vs = s/w - */ -static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) -+static void set_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) - { +-{ - return calc_delta_fair(sched_slice(cfs_rq, se), se); -+ if (sched_feat(EEVDF)) { -+ /* -+ * For EEVDF the virtual time slope is determined by w_i (iow. -+ * nice) while the request time r_i is determined by -+ * latency-nice. -+ */ -+ se->slice = se->latency_offset; -+ } else { -+ /* -+ * When many tasks blow up the sched_period; it is possible -+ * that sched_slice() reports unusually large results (when -+ * many tasks are very light for example). Therefore impose a -+ * maximum. -+ */ -+ se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency); -+ } -+ + /* -+ * vd_i = ve_i + r_i / w_i ++ * EEVDF: vd_i = ve_i + r_i / w_i + */ + se->deadline = se->vruntime + calc_delta_fair(se->slice, se); -+ se->min_deadline = se->deadline; } #include "pelt.h" -@@ -939,6 +1212,13 @@ static void update_curr(struct cfs_rq *cfs_rq) +@@ -939,6 +1097,7 @@ static void update_curr(struct cfs_rq *cfs_rq) schedstat_add(cfs_rq->exec_clock, delta_exec); curr->vruntime += calc_delta_fair(delta_exec, curr); -+ /* -+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i -+ * this is probably good enough. -+ */ -+ if ((s64)(curr->vruntime - curr->deadline) > 0) -+ set_slice(cfs_rq, curr); -+ ++ update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { -@@ -3340,6 +3620,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, +@@ -3336,16 +3495,28 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) + { ++ unsigned long old_weight = se->load.weight; ++ + if (se->on_rq) { /* commit outstanding execution time */ if (cfs_rq->curr == se) update_curr(cfs_rq); @@ -917,7 +1119,21 @@ index 84254f52c56a..c40b775452bc 100644 update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); -@@ -3355,9 +3637,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + + update_load_set(&se->load, weight); + ++ if (!se->on_rq) { ++ /* ++ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v), ++ * we need to scale se->vlag when w_i changes. 
++ */ ++ se->vlag = div_s64(se->vlag * old_weight, weight); ++ } ++ + #ifdef CONFIG_SMP + do { + u32 divider = get_pelt_divider(&se->avg); +@@ -3355,9 +3526,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, #endif enqueue_load_avg(cfs_rq, se); @@ -931,170 +1147,484 @@ index 84254f52c56a..c40b775452bc 100644 } void reweight_task(struct task_struct *p, int prio) -@@ -4669,49 +4953,49 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) - static void - place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) - { -- u64 vruntime = cfs_rq->min_vruntime; -- u64 sleep_time; -+ u64 vruntime = avg_vruntime(cfs_rq); +@@ -4653,158 +4826,135 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} -- /* + #endif /* CONFIG_SMP */ + +-static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) +-{ +-#ifdef CONFIG_SCHED_DEBUG +- s64 d = se->vruntime - cfs_rq->min_vruntime; +- +- if (d < 0) +- d = -d; +- +- if (d > 3*sysctl_sched_latency) +- schedstat_inc(cfs_rq->nr_spread_over); +-#endif +-} +- +-static inline bool entity_is_long_sleeper(struct sched_entity *se) ++static void ++place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +- struct cfs_rq *cfs_rq; +- u64 sleep_time; ++ u64 vslice = calc_delta_fair(se->slice, se); ++ u64 vruntime = avg_vruntime(cfs_rq); ++ s64 lag = 0; + +- if (se->exec_start == 0) +- return false; ++ /* ++ * Due to how V is constructed as the weighted average of entities, ++ * adding tasks with positive lag, or removing tasks with negative lag ++ * will move 'time' backwards, this can screw around with the lag of ++ * other tasks. ++ * ++ * EEVDF: placement strategy #1 / #2 ++ */ ++ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { ++ struct sched_entity *curr = cfs_rq->curr; ++ unsigned long load; + +- cfs_rq = cfs_rq_of(se); ++ lag = se->vlag; + +- sleep_time = rq_clock_task(rq_of(cfs_rq)); ++ /* ++ * For latency sensitive tasks; those that have a shorter than ++ * average slice and do not fully consume the slice, transition ++ * to EEVDF placement strategy #2. 
++ */ ++ if (sched_feat(PLACE_FUDGE) && ++ cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) { ++ lag += vslice; ++ if (lag > 0) ++ lag = 0; ++ } + +- /* Happen while migrating because of clock task divergence */ +- if (sleep_time <= se->exec_start) +- return false; ++ /* ++ * If we want to place a task and preserve lag, we have to ++ * consider the effect of the new entity on the weighted ++ * average and compensate for this, otherwise lag can quickly ++ * evaporate: ++ * ++ * l_i = V - v_i <=> v_i = V - l_i ++ * ++ * V = v_avg = W*v_avg / W ++ * ++ * V' = (W*v_avg + w_i*v_i) / (W + w_i) ++ * = (W*v_avg + w_i(v_avg - l_i)) / (W + w_i) ++ * = v_avg + w_i*l_i/(W + w_i) ++ * ++ * l_i' = V' - v_i = v_avg + w_i*l_i/(W + w_i) - (v_avg - l) ++ * = l_i - w_i*l_i/(W + w_i) ++ * ++ * l_i = (W + w_i) * l_i' / W ++ */ ++ load = cfs_rq->avg_load; ++ if (curr && curr->on_rq) ++ load += scale_load_down(curr->load.weight); + +- sleep_time -= se->exec_start; +- if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD))) +- return true; ++ lag *= load + scale_load_down(se->load.weight); ++ if (WARN_ON_ONCE(!load)) ++ load = 1; ++ lag = div_s64(lag, load); + +- return false; +-} ++ vruntime -= lag; ++ } + +-static void +-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) +-{ +- u64 vruntime = cfs_rq->min_vruntime; ++ /* ++ * Base the deadline on the 'normal' EEVDF placement policy in an ++ * attempt to not let the bonus crud below wreck things completely. ++ */ ++ se->deadline = vruntime; + + /* - * The 'current' period is already promised to the current tasks, - * however the extra weight of the new task will slow them down a - * little, place the new task so that it fits in the slot that - * stays open at the end. -- */ ++ * The whole 'sleeper' bonus hack... :-/ This is strictly unfair. ++ * ++ * By giving a sleeping task a little boost, it becomes possible for a ++ * 50% task to compete equally with a 100% task. That is, strictly fair ++ * that setup would result in a 67% / 33% split. Sleeper bonus will ++ * change that to 50% / 50%. ++ * ++ * This thing hurts my brain, because tasks leaving with negative lag ++ * will move 'time' backward, so comparing against a historical ++ * se->vruntime is dodgy as heck. + */ - if (initial && sched_feat(START_DEBIT)) - vruntime += sched_vslice(cfs_rq, se); -+ if (sched_feat(PRESERVE_LAG)) -+ vruntime -= se->lag; ++ if (sched_feat(PLACE_BONUS) && ++ (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)) { ++ /* ++ * If se->vruntime is ahead of vruntime, something dodgy ++ * happened and we cannot give bonus due to not having valid ++ * history. ++ */ ++ if ((s64)(se->vruntime - vruntime) < 0) { ++ vruntime -= se->slice/2; ++ vruntime = max_vruntime(se->vruntime, vruntime); ++ } ++ } - /* sleeps up to a single latency don't count. */ - if (!initial) { - unsigned long thresh; -+ if (sched_feat(FAIR_SLEEPERS)) { -+// u64 sleep_time; ++ se->vruntime = vruntime; - if (se_is_idle(se)) - thresh = sysctl_sched_min_granularity; - else - thresh = sysctl_sched_latency; -+ /* sleeps up to a single latency don't count. 
*/ -+ if (!initial) { -+ unsigned long thresh = TICK_NSEC; -+ -+ if (!sched_feat(EEVDF)) { -+ if (se_is_idle(se)) -+ thresh = sysctl_sched_min_granularity; -+ else -+ thresh = sysctl_sched_latency; -+ } -+ -+ /* -+ * Halve their sleep time's effect, to allow -+ * for a gentler effect of sleepers: -+ */ -+ if (sched_feat(GENTLE_FAIR_SLEEPERS)) -+ thresh >>= 1; -+ -+ vruntime -= calc_delta_fair(thresh, se); -+ } ++ /* ++ * When joining the competition; the exisiting tasks will be, ++ * on average, halfway through their slice, as such start tasks ++ * off with half a slice to ease into the competition. ++ */ ++ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) ++ vslice /= 2; - /* +- /* - * Halve their sleep time's effect, to allow - * for a gentler effect of sleepers: -+ * Pull vruntime of the entity being placed to the base level of -+ * cfs_rq, to prevent boosting it if placed backwards. If the entity -+ * slept for a long time, don't even try to compare its vruntime with -+ * the base as it may be too far off and the comparison may get -+ * inversed due to s64 overflow. -+ sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; -+ if ((s64)sleep_time < 60LL * NSEC_PER_SEC) - */ +- */ - if (sched_feat(GENTLE_FAIR_SLEEPERS)) - thresh >>= 1; - - vruntime -= thresh; -+ vruntime = max_vruntime(se->vruntime, vruntime); - } - +- } +- - /* - * Pull vruntime of the entity being placed to the base level of -- * cfs_rq, to prevent boosting it if placed backwards. If the entity -- * slept for a long time, don't even try to compare its vruntime with -- * the base as it may be too far off and the comparison may get -- * inversed due to s64 overflow. +- * cfs_rq, to prevent boosting it if placed backwards. +- * However, min_vruntime can advance much faster than real time, with +- * the extreme being when an entity with the minimal weight always runs +- * on the cfs_rq. If the waking entity slept for a long time, its +- * vruntime difference from min_vruntime may overflow s64 and their +- * comparison may get inversed, so ignore the entity's original +- * vruntime in that case. +- * The maximal vruntime speedup is given by the ratio of normal to +- * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES. +- * When placing a migrated waking entity, its exec_start has been set +- * from a different rq. In order to take into account a possible +- * divergence between new and prev rq's clocks task because of irq and +- * stolen time, we take an additional margin. +- * So, cutting off on the sleep time of +- * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days +- * should be safe. - */ -- sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; -- if ((s64)sleep_time > 60LL * NSEC_PER_SEC) +- if (entity_is_long_sleeper(se)) - se->vruntime = vruntime; - else - se->vruntime = max_vruntime(se->vruntime, vruntime); -+ se->vruntime = vruntime; -+ set_slice(cfs_rq, se); ++ /* ++ * EEVDF: vd_i = ve_i + r_i/w_i ++ */ ++ se->deadline += vslice; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); -@@ -4879,6 +5163,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + + static inline bool cfs_bandwidth_used(void); + +-/* +- * MIGRATION +- * +- * dequeue +- * update_curr() +- * update_min_vruntime() +- * vruntime -= min_vruntime +- * +- * enqueue +- * update_curr() +- * update_min_vruntime() +- * vruntime += min_vruntime +- * +- * this way the vruntime transition between RQs is done when both +- * min_vruntime are up-to-date. 
+- * +- * WAKEUP (remote) +- * +- * ->migrate_task_rq_fair() (p->state == TASK_WAKING) +- * vruntime -= min_vruntime +- * +- * enqueue +- * update_curr() +- * update_min_vruntime() +- * vruntime += min_vruntime +- * +- * this way we don't have the most up-to-date min_vruntime on the originating +- * CPU and an up-to-date min_vruntime on the destination CPU. +- */ +- + static void + enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +- bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); + bool curr = cfs_rq->curr == se; + + /* + * If we're the current task, we must renormalise before calling + * update_curr(). + */ +- if (renorm && curr) +- se->vruntime += cfs_rq->min_vruntime; ++ if (curr) ++ place_entity(cfs_rq, se, flags); + + update_curr(cfs_rq); + +- /* +- * Otherwise, renormalise after, such that we're placed at the current +- * moment in time, instead of some random moment in the past. Being +- * placed in the past could significantly boost this task to the +- * fairness detriment of existing tasks. +- */ +- if (renorm && !curr) +- se->vruntime += cfs_rq->min_vruntime; +- + /* + * When enqueuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. +@@ -4816,18 +4966,28 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + */ + update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); + se_update_runnable(se); ++ /* ++ * XXX update_load_avg() above will have attached us to the pelt sum; ++ * but update_cfs_group() here will re-adjust the weight and have to ++ * undo/redo all that. Seems wasteful. ++ */ + update_cfs_group(se); ++ ++ /* ++ * XXX now that the entity has been re-weighted, and it's lag adjusted, ++ * we can place the entity. ++ */ ++ if (!curr) ++ place_entity(cfs_rq, se, flags); ++ + account_entity_enqueue(cfs_rq, se); + +- if (flags & ENQUEUE_WAKEUP) +- place_entity(cfs_rq, se, 0); + /* Entity has migrated, no longer consider this task hot */ + if (flags & ENQUEUE_MIGRATED) + se->exec_start = 0; + + check_schedstat_required(); + update_stats_enqueue_fair(cfs_rq, se, flags); +- check_spread(cfs_rq, se); + if (!curr) + __enqueue_entity(cfs_rq, se); + se->on_rq = 1; +@@ -4839,17 +4999,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + } + } + +-static void __clear_buddies_last(struct sched_entity *se) +-{ +- for_each_sched_entity(se) { +- struct cfs_rq *cfs_rq = cfs_rq_of(se); +- if (cfs_rq->last != se) +- break; +- +- cfs_rq->last = NULL; +- } +-} +- + static void __clear_buddies_next(struct sched_entity *se) + { + for_each_sched_entity(se) { +@@ -4861,27 +5010,10 @@ static void __clear_buddies_next(struct sched_entity *se) + } + } + +-static void __clear_buddies_skip(struct sched_entity *se) +-{ +- for_each_sched_entity(se) { +- struct cfs_rq *cfs_rq = cfs_rq_of(se); +- if (cfs_rq->skip != se) +- break; +- +- cfs_rq->skip = NULL; +- } +-} +- + static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) + { +- if (cfs_rq->last == se) +- __clear_buddies_last(se); +- + if (cfs_rq->next == se) + __clear_buddies_next(se); +- +- if (cfs_rq->skip == se) +- __clear_buddies_skip(se); + } + + static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); +@@ -4915,20 +5047,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) clear_buddies(cfs_rq, se); -+ if (sched_feat(PRESERVE_LAG) && (flags & DEQUEUE_SLEEP)) -+ se->lag = avg_vruntime(cfs_rq) - se->vruntime; -+ ++ update_entity_lag(cfs_rq, se); if 
(se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; -@@ -4917,19 +5204,20 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + account_entity_dequeue(cfs_rq, se); + +- /* +- * Normalize after update_curr(); which will also have moved +- * min_vruntime if @se is the one holding it back. But before doing +- * update_min_vruntime() again, which will discount @se's position and +- * can move min_vruntime forward still more. +- */ +- if (!(flags & DEQUEUE_SLEEP)) +- se->vruntime -= cfs_rq->min_vruntime; +- + /* return excess runtime on last dequeue */ + return_cfs_rq_runtime(cfs_rq); + +@@ -4953,44 +5077,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) static void check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - unsigned long ideal_runtime, delta_exec; -+ unsigned long delta_exec; - struct sched_entity *se; - s64 delta; - +- struct sched_entity *se; +- s64 delta; +- - /* - * When many tasks blow up the sched_period; it is possible that - * sched_slice() reports unusually large results (when many tasks are - * very light for example). Therefore impose a maximum. - */ - ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); -+ if (sched_feat(EEVDF)) { -+ if (pick_eevdf(cfs_rq) != curr) -+ goto preempt; -+ -+ return; -+ } - - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; +- +- delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) { -+ if (delta_exec > curr->slice) { -+preempt: ++ if (pick_eevdf(cfs_rq) != curr) { resched_curr(rq_of(cfs_rq)); /* * The current task ran long enough, ensure it doesn't get -@@ -4953,7 +5241,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) - if (delta < 0) - return; - + * re-elected due to buddy favours. + */ + clear_buddies(cfs_rq, curr); +- return; + } +- +- /* +- * Ensure that a task that missed wakeup preemption by a +- * narrow margin doesn't have to wait for a full slice. +- * This also mitigates buddy induced latencies under load. +- */ +- if (delta_exec < sysctl_sched_min_granularity) +- return; +- +- se = __pick_first_entity(cfs_rq); +- delta = curr->vruntime - se->vruntime; +- +- if (delta < 0) +- return; +- - if (delta > ideal_runtime) -+ if (delta > curr->slice) - resched_curr(rq_of(cfs_rq)); +- resched_curr(rq_of(cfs_rq)); } -@@ -5008,17 +5296,20 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); + static void +@@ -5031,9 +5125,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + se->prev_sum_exec_runtime = se->sum_exec_runtime; + } + +-static int +-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); +- + /* + * Pick the next process, keeping these things in mind, in this order: + * 1) keep things fair between processes/task groups +@@ -5044,50 +5135,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *left = __pick_first_entity(cfs_rq); - struct sched_entity *se; -+ struct sched_entity *left, *se; - -- /* +- + /* - * If curr is set we have to see if its left of the leftmost entity - * still in the tree, provided there was anything in the tree at all. -- */ ++ * Enabling NEXT_BUDDY will affect latency but not fairness. 
+ */ - if (!left || (curr && entity_before(curr, left))) - left = curr; -+ if (sched_feat(EEVDF)) { -+ /* -+ * Enabling NEXT_BUDDY will affect latency but not fairness. -+ */ -+ if (sched_feat(NEXT_BUDDY) && -+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) -+ return cfs_rq->next; - +- - se = left; /* ideally we run the leftmost entity */ -+ return pick_eevdf(cfs_rq); -+ } -+ -+ se = left = pick_cfs(cfs_rq, curr); ++ if (sched_feat(NEXT_BUDDY) && ++ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) ++ return cfs_rq->next; - /* - * Avoid running the skip buddy, if running something else can -@@ -6113,13 +6404,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} +- /* +- * Avoid running the skip buddy, if running something else can +- * be done without getting too unfair. +- */ +- if (cfs_rq->skip && cfs_rq->skip == se) { +- struct sched_entity *second; +- +- if (se == curr) { +- second = __pick_first_entity(cfs_rq); +- } else { +- second = __pick_next_entity(se); +- if (!second || (curr && entity_before(curr, second))) +- second = curr; +- } +- +- if (second && wakeup_preempt_entity(second, left) < 1) +- se = second; +- } +- +- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { +- /* +- * Someone really wants this to run. If it's not unfair, run it. +- */ +- se = cfs_rq->next; +- } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) { +- /* +- * Prefer last buddy, try to return the CPU to a preempted task. +- */ +- se = cfs_rq->last; +- } +- +- return se; ++ return pick_eevdf(cfs_rq); + } + + static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +@@ -5104,8 +5159,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) + /* throttle cfs_rqs exceeding runtime */ + check_cfs_rq_runtime(cfs_rq); + +- check_spread(cfs_rq, prev); +- + if (prev->on_rq) { + update_stats_wait_start_fair(cfs_rq, prev); + /* Put 'current' back into the tree. */ +@@ -6149,13 +6202,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; @@ -1109,46 +1639,331 @@ index 84254f52c56a..c40b775452bc 100644 s64 delta = slice - ran; if (delta < 0) { -@@ -7891,7 +8181,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -6179,8 +6231,7 @@ static void hrtick_update(struct rq *rq) + if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) + return; + +- if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) +- hrtick_start_fair(rq, curr); ++ hrtick_start_fair(rq, curr); + } + #else /* !CONFIG_SCHED_HRTICK */ + static inline void +@@ -6221,17 +6272,6 @@ static int sched_idle_rq(struct rq *rq) + rq->nr_running); + } + +-/* +- * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use +- * of idle_nr_running, which does not consider idle descendants of normal +- * entities. 
+- */ +-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq) +-{ +- return cfs_rq->nr_running && +- cfs_rq->nr_running == cfs_rq->idle_nr_running; +-} +- + #ifdef CONFIG_SMP + static int sched_idle_cpu(int cpu) + { +@@ -6333,18 +6373,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + + static void set_next_buddy(struct sched_entity *se); + +-static inline void dur_avg_update(struct task_struct *p, bool task_sleep) +-{ +- u64 dur; +- +- if (!task_sleep) +- return; +- +- dur = p->se.sum_exec_runtime - p->se.prev_sleep_sum_runtime; +- p->se.prev_sleep_sum_runtime = p->se.sum_exec_runtime; +- update_avg(&p->se.dur_avg, dur); +-} +- + /* + * The dequeue_task method is called before nr_running is + * decreased. We remove the task from the rbtree and +@@ -6417,7 +6445,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + + dequeue_throttle: + util_est_update(&rq->cfs, p, task_sleep); +- dur_avg_update(p, task_sleep); + hrtick_update(rq); + } + +@@ -6551,23 +6578,6 @@ static int wake_wide(struct task_struct *p) + return 1; + } + +-/* +- * If a task switches in and then voluntarily relinquishes the +- * CPU quickly, it is regarded as a short duration task. +- * +- * SIS_SHORT tries to wake up the short wakee on current CPU. This +- * aims to avoid race condition among CPUs due to frequent context +- * switch. Besides, the candidate short task should not be the one +- * that wakes up more than one tasks, otherwise SIS_SHORT might +- * stack too many tasks on current CPU. +- */ +-static inline int is_short_task(struct task_struct *p) +-{ +- return sched_feat(SIS_SHORT) && !p->wakee_flips && +- p->se.dur_avg && +- ((p->se.dur_avg * 8) < sysctl_sched_min_granularity); +-} +- + /* + * The purpose of wake_affine() is to quickly determine on which CPU we can run + * soonest. For the purpose of speed we only consider the waking and previous +@@ -6604,11 +6614,6 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) + if (available_idle_cpu(prev_cpu)) + return prev_cpu; + +- /* The only running task is a short duration one. */ +- if (cpu_rq(this_cpu)->nr_running == 1 && +- is_short_task(rcu_dereference(cpu_curr(this_cpu)))) +- return this_cpu; +- + return nr_cpumask_bits; + } + +@@ -6983,20 +6988,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool + /* overloaded LLC is unlikely to have idle cpu/core */ + if (nr == 1) + return -1; +- +- /* +- * If the scan number suggested by SIS_UTIL is smaller +- * than 60% of llc_weight, it indicates a util_avg% higher +- * than 50%. System busier than this could lower its bar to +- * choose a compromised "idle" CPU. This co-exists with +- * !has_idle_core to not stack too many tasks on one CPU. +- */ +- if (!has_idle_core && this == target && +- (5 * nr < 3 * sd->span_weight) && +- cpu_rq(target)->nr_running <= 1 && +- is_short_task(p) && +- is_short_task(rcu_dereference(cpu_curr(target)))) +- return target; + } + } + +@@ -7729,18 +7720,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) + { + struct sched_entity *se = &p->se; + +- /* +- * As blocked tasks retain absolute vruntime the migration needs to +- * deal with this by subtracting the old and adding the new +- * min_vruntime -- the latter is done by enqueue_entity() when placing +- * the task on the new runqueue. 
+- */ +- if (READ_ONCE(p->__state) == TASK_WAKING) { +- struct cfs_rq *cfs_rq = cfs_rq_of(se); +- +- se->vruntime -= u64_u32_load(cfs_rq->min_vruntime); +- } +- + if (!task_on_rq_migrating(p)) { + remove_entity_load_avg(se); + +@@ -7778,66 +7757,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + } + #endif /* CONFIG_SMP */ + +-static unsigned long wakeup_gran(struct sched_entity *se) +-{ +- unsigned long gran = sysctl_sched_wakeup_granularity; +- +- /* +- * Since its curr running now, convert the gran from real-time +- * to virtual-time in his units. +- * +- * By using 'se' instead of 'curr' we penalize light tasks, so +- * they get preempted easier. That is, if 'se' < 'curr' then +- * the resulting gran will be larger, therefore penalizing the +- * lighter, if otoh 'se' > 'curr' then the resulting gran will +- * be smaller, again penalizing the lighter task. +- * +- * This is especially important for buddies when the leftmost +- * task is higher priority than the buddy. +- */ +- return calc_delta_fair(gran, se); +-} +- +-/* +- * Should 'se' preempt 'curr'. +- * +- * |s1 +- * |s2 +- * |s3 +- * g +- * |<--->|c +- * +- * w(c, s1) = -1 +- * w(c, s2) = 0 +- * w(c, s3) = 1 +- * +- */ +-static int +-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) +-{ +- s64 gran, vdiff = curr->vruntime - se->vruntime; +- +- if (vdiff <= 0) +- return -1; +- +- gran = wakeup_gran(se); +- if (vdiff > gran) +- return 1; +- +- return 0; +-} +- +-static void set_last_buddy(struct sched_entity *se) +-{ +- for_each_sched_entity(se) { +- if (SCHED_WARN_ON(!se->on_rq)) +- return; +- if (se_is_idle(se)) +- return; +- cfs_rq_of(se)->last = se; +- } +-} +- + static void set_next_buddy(struct sched_entity *se) + { + for_each_sched_entity(se) { +@@ -7849,12 +7768,6 @@ static void set_next_buddy(struct sched_entity *se) + } + } + +-static void set_skip_buddy(struct sched_entity *se) +-{ +- for_each_sched_entity(se) +- cfs_rq_of(se)->skip = se; +-} +- + /* + * Preempt the current task with a newly woken task if needed: + */ +@@ -7863,7 +7776,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + struct task_struct *curr = rq->curr; + struct sched_entity *se = &curr->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); +- int scale = cfs_rq->nr_running >= sched_nr_latency; + int next_buddy_marked = 0; + int cse_is_idle, pse_is_idle; + +@@ -7879,7 +7791,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) + return; + +- if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { ++ if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) { + set_next_buddy(pse); + next_buddy_marked = 1; + } +@@ -7924,35 +7836,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (cse_is_idle != pse_is_idle) return; - update_curr(cfs_rq_of(se)); +- if (wakeup_preempt_entity(se, pse) == 1) { +- /* +- * Bias pick_next to pick the sched entity that is +- * triggering this preemption. +- */ +- if (!next_buddy_marked) +- set_next_buddy(pse); + cfs_rq = cfs_rq_of(se); + update_curr(cfs_rq); + -+ if (sched_feat(EEVDF)) { -+ /* -+ * XXX pick_eevdf(cfs_rq) != se ? 
-+ */ -+ if (pick_eevdf(cfs_rq) == pse) -+ goto preempt; -+ -+ return; -+ } -+ - if (wakeup_preempt_entity(se, pse) == 1) { - /* - * Bias pick_next to pick the sched entity that is -@@ -8137,7 +8439,7 @@ static void yield_task_fair(struct rq *rq) ++ /* ++ * XXX pick_eevdf(cfs_rq) != se ? ++ */ ++ if (pick_eevdf(cfs_rq) == pse) + goto preempt; +- } + + return; + + preempt: + resched_curr(rq); +- /* +- * Only set the backward buddy when the current task is still +- * on the rq. This can happen when a wakeup gets interleaved +- * with schedule on the ->pre_schedule() or idle_balance() +- * point, either of which can * drop the rq lock. +- * +- * Also, during early boot the idle thread is in the fair class, +- * for obvious reasons its a bad idea to schedule back to it. +- */ +- if (unlikely(!se->on_rq || curr == rq->idle)) +- return; +- +- if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) +- set_last_buddy(se); + } + + #ifdef CONFIG_SMP +@@ -8153,8 +8049,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) + + /* + * sched_yield() is very simple +- * +- * The magic of dealing with the ->skip buddy is in pick_next_entity. + */ + static void yield_task_fair(struct rq *rq) + { +@@ -8170,21 +8064,19 @@ static void yield_task_fair(struct rq *rq) clear_buddies(cfs_rq, se); - if (curr->policy != SCHED_BATCH) { -+ if (sched_feat(EEVDF) || curr->policy != SCHED_BATCH) { - update_rq_clock(rq); - /* - * Update run-time statistics of the 'current'. -@@ -8150,6 +8452,8 @@ static void yield_task_fair(struct rq *rq) - */ - rq_clock_skip_update(rq); - } -+ if (sched_feat(EEVDF)) -+ se->deadline += calc_delta_fair(se->slice, se); +- update_rq_clock(rq); +- /* +- * Update run-time statistics of the 'current'. +- */ +- update_curr(cfs_rq); +- /* +- * Tell update_rq_clock() that we've just updated, +- * so we don't do microscopic update in schedule() +- * and double the fastpath cost. +- */ +- rq_clock_skip_update(rq); +- } ++ update_rq_clock(rq); ++ /* ++ * Update run-time statistics of the 'current'. ++ */ ++ update_curr(cfs_rq); ++ /* ++ * Tell update_rq_clock() that we've just updated, ++ * so we don't do microscopic update in schedule() ++ * and double the fastpath cost. 
++ */ ++ rq_clock_skip_update(rq); - set_skip_buddy(se); +- set_skip_buddy(se); ++ se->deadline += calc_delta_fair(se->slice, se); } -@@ -11902,8 +12206,8 @@ static void rq_offline_fair(struct rq *rq) + + static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) +@@ -8427,8 +8319,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + * Buddy candidates are cache hot: + */ + if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && +- (&p->se == cfs_rq_of(&p->se)->next || +- &p->se == cfs_rq_of(&p->se)->last)) ++ (&p->se == cfs_rq_of(&p->se)->next)) + return 1; + + if (sysctl_sched_migration_cost == -1) +@@ -11932,8 +11823,8 @@ static void rq_offline_fair(struct rq *rq) static inline bool __entity_slice_used(struct sched_entity *se, int min_nr_tasks) { @@ -1158,31 +1973,130 @@ index 84254f52c56a..c40b775452bc 100644 return (rtime * min_nr_tasks > slice); } -@@ -12330,6 +12634,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +@@ -12077,8 +11968,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + */ + static void task_fork_fair(struct task_struct *p) + { +- struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se, *curr; ++ struct cfs_rq *cfs_rq; + struct rq *rq = this_rq(); + struct rq_flags rf; + +@@ -12087,22 +11978,9 @@ static void task_fork_fair(struct task_struct *p) + + cfs_rq = task_cfs_rq(current); + curr = cfs_rq->curr; +- if (curr) { ++ if (curr) + update_curr(cfs_rq); +- se->vruntime = curr->vruntime; +- } +- place_entity(cfs_rq, se, 1); +- +- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { +- /* +- * Upon rescheduling, sched_class::put_prev_task() will place +- * 'current' within the tree based on its new key value. +- */ +- swap(curr->vruntime, se->vruntime); +- resched_curr(rq); +- } +- +- se->vruntime -= cfs_rq->min_vruntime; ++ place_entity(cfs_rq, se, ENQUEUE_INITIAL); + rq_unlock(rq, &rf); + } + +@@ -12131,34 +12009,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) + check_preempt_curr(rq, p, 0); + } + +-static inline bool vruntime_normalized(struct task_struct *p) +-{ +- struct sched_entity *se = &p->se; +- +- /* +- * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, +- * the dequeue_entity(.flags=0) will already have normalized the +- * vruntime. +- */ +- if (p->on_rq) +- return true; +- +- /* +- * When !on_rq, vruntime of the task has usually NOT been normalized. +- * But there are some cases where it has already been normalized: +- * +- * - A forked child which is waiting for being woken up by +- * wake_up_new_task(). +- * - A task which has been woken up by try_to_wake_up() and +- * waiting for actually being woken up by sched_ttwu_pending(). +- */ +- if (!se->sum_exec_runtime || +- (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup)) +- return true; +- +- return false; +-} +- + #ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Propagate the changes of the sched_entity across the tg tree to make it +@@ -12229,16 +12079,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) + static void detach_task_cfs_rq(struct task_struct *p) + { + struct sched_entity *se = &p->se; +- struct cfs_rq *cfs_rq = cfs_rq_of(se); +- +- if (!vruntime_normalized(p)) { +- /* +- * Fix up our vruntime so that the current sleep doesn't +- * cause 'unlimited' sleep bonus. 
+- */ +- place_entity(cfs_rq, se, 0); +- se->vruntime -= cfs_rq->min_vruntime; +- } + + detach_entity_cfs_rq(se); + } +@@ -12246,12 +12086,8 @@ static void detach_task_cfs_rq(struct task_struct *p) + static void attach_task_cfs_rq(struct task_struct *p) + { + struct sched_entity *se = &p->se; +- struct cfs_rq *cfs_rq = cfs_rq_of(se); + + attach_entity_cfs_rq(se); +- +- if (!vruntime_normalized(p)) +- se->vruntime += cfs_rq->min_vruntime; + } + + static void switched_from_fair(struct rq *rq, struct task_struct *p) +@@ -12362,6 +12198,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) goto err; tg->shares = NICE_0_LOAD; -+ tg->latency_prio = DEFAULT_LATENCY_PRIO; ++ tg->latency_prio = DEFAULT_PRIO; init_cfs_bandwidth(tg_cfs_bandwidth(tg)); -@@ -12428,6 +12733,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, +@@ -12460,6 +12297,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, } se->my_q = cfs_rq; + -+ se->latency_offset = calc_latency_offset(tg->latency_prio); ++ set_latency_fair(se, tg->latency_prio - MAX_RT_PRIO); + /* guarantee group entities always have weight */ update_load_set(&se->load, NICE_0_LOAD); se->parent = parent; -@@ -12558,6 +12866,34 @@ int sched_group_set_idle(struct task_group *tg, long idle) +@@ -12590,6 +12430,29 @@ int sched_group_set_idle(struct task_group *tg, long idle) return 0; } +int sched_group_set_latency(struct task_group *tg, int prio) +{ -+ long latency_offset; + int i; + + if (tg == &root_task_group) @@ -1196,13 +2110,9 @@ index 84254f52c56a..c40b775452bc 100644 + } + + tg->latency_prio = prio; -+ latency_offset = calc_latency_offset(prio); + -+ for_each_possible_cpu(i) { -+ struct sched_entity *se = tg->se[i]; -+ -+ WRITE_ONCE(se->latency_offset, latency_offset); -+ } ++ for_each_possible_cpu(i) ++ set_latency_fair(tg->se[i], prio - MAX_RT_PRIO); + + mutex_unlock(&shares_mutex); + return 0; @@ -1211,7 +2121,7 @@ index 84254f52c56a..c40b775452bc 100644 #else /* CONFIG_FAIR_GROUP_SCHED */ void free_fair_sched_group(struct task_group *tg) { } -@@ -12584,7 +12920,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task +@@ -12616,7 +12479,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * idle runqueue: */ if (rq->cfs.load.weight) @@ -1221,39 +2131,65 @@ index 84254f52c56a..c40b775452bc 100644 return rr_interval; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index efdc29c42161..49c7e6fa4c71 100644 +index efdc29c42161..d4b7d3f7c044 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h -@@ -1,16 +1,18 @@ +@@ -1,16 +1,15 @@ /* SPDX-License-Identifier: GPL-2.0 */ -+ - /* - * Only give sleepers 50% of their service deficit. This allows - * them to run sooner, but does not allow tons of sleepers to - * rip the spread apart. - */ -+SCHED_FEAT(FAIR_SLEEPERS, false) - SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) +-/* +- * Only give sleepers 50% of their service deficit. This allows +- * them to run sooner, but does not allow tons of sleepers to +- * rip the spread apart. +- */ +-SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) /* - * Place new tasks ahead so that they do not starve already running - * tasks -+ * Using the avg_vruntime, do the right thing and preserve lag -+ * across sleep+wake cycles. ++ * Using the avg_vruntime, do the right thing and preserve lag across ++ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. 
*/ -SCHED_FEAT(START_DEBIT, true) -+SCHED_FEAT(PRESERVE_LAG, true) ++SCHED_FEAT(PLACE_LAG, true) ++SCHED_FEAT(PLACE_FUDGE, true) ++SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) ++SCHED_FEAT(PLACE_BONUS, false) ++ ++SCHED_FEAT(MINIMAL_VA, false) /* * Prefer to schedule the task we woke last (assuming it failed -@@ -102,3 +104,5 @@ SCHED_FEAT(LATENCY_WARN, false) +@@ -19,13 +18,6 @@ SCHED_FEAT(START_DEBIT, true) + */ + SCHED_FEAT(NEXT_BUDDY, false) - SCHED_FEAT(ALT_PERIOD, true) - SCHED_FEAT(BASE_SLICE, true) -+ -+SCHED_FEAT(EEVDF, true) +-/* +- * Prefer to schedule the task that ran last (when we did +- * wake-preempt) as that likely will touch the same data, increases +- * cache locality. +- */ +-SCHED_FEAT(LAST_BUDDY, true) +- + /* + * Consider buddies to be cache hot, decreases the likeliness of a + * cache buddy being migrated away, increases cache locality. +@@ -62,7 +54,6 @@ SCHED_FEAT(TTWU_QUEUE, true) + */ + SCHED_FEAT(SIS_PROP, false) + SCHED_FEAT(SIS_UTIL, true) +-SCHED_FEAT(SIS_SHORT, true) + + /* + * Issue a WARN when we do multiple update_rq_clock() calls +@@ -99,6 +90,3 @@ SCHED_FEAT(UTIL_EST, true) + SCHED_FEAT(UTIL_EST_FASTUP, true) + + SCHED_FEAT(LATENCY_WARN, false) +- +-SCHED_FEAT(ALT_PERIOD, true) +-SCHED_FEAT(BASE_SLICE, true) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 9e8bb6278604..fe5af7aaa931 100644 +index 7331d436ebc4..bfce45b21441 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -378,6 +378,8 @@ struct task_group { @@ -1274,28 +2210,67 @@ index 9e8bb6278604..fe5af7aaa931 100644 #ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); -@@ -554,6 +558,9 @@ struct cfs_rq { +@@ -554,6 +558,10 @@ struct cfs_rq { unsigned int idle_nr_running; /* SCHED_IDLE */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ + s64 avg_vruntime; ++ u64 avg_slice; + u64 avg_load; + u64 exec_clock; u64 min_vruntime; #ifdef CONFIG_SCHED_CORE -@@ -2478,6 +2485,8 @@ extern unsigned int sysctl_numa_balancing_scan_size; +@@ -573,8 +581,6 @@ struct cfs_rq { + */ + struct sched_entity *curr; + struct sched_entity *next; +- struct sched_entity *last; +- struct sched_entity *skip; + + #ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +@@ -2154,7 +2160,7 @@ extern const u32 sched_prio_to_wmult[40]; + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_MIGRATED - the task was migrated during wakeup +- * ++ * ENQUEUE_INITIAL - place a new task (fork/clone) + */ + + #define DEQUEUE_SLEEP 0x01 +@@ -2174,6 +2180,7 @@ extern const u32 sched_prio_to_wmult[40]; + #else + #define ENQUEUE_MIGRATED 0x00 + #endif ++#define ENQUEUE_INITIAL 0x80 + + #define RETRY_TASK ((void *)-1UL) + +@@ -2476,10 +2483,7 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; + extern const_debug unsigned int sysctl_sched_migration_cost; + + #ifdef CONFIG_SCHED_DEBUG +-extern unsigned int sysctl_sched_latency; +-extern unsigned int sysctl_sched_min_granularity; +-extern unsigned int sysctl_sched_idle_min_granularity; +-extern unsigned int sysctl_sched_wakeup_granularity; ++extern unsigned int sysctl_sched_base_slice; + extern int sysctl_resched_latency_warn_ms; + extern int sysctl_resched_latency_warn_once; + +@@ -2492,6 +2496,8 @@ extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_hot_threshold; #endif -+extern long calc_latency_offset(int prio); ++extern void set_latency_fair(struct 
sched_entity *se, int prio); + #ifdef CONFIG_SCHED_HRTICK /* -@@ -3251,4 +3260,7 @@ static inline void update_current_exec_runtime(struct task_struct *curr, - cgroup_account_cputime(curr, delta_exec); - } +@@ -3323,4 +3329,7 @@ static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *n + static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { } + #endif +extern u64 avg_vruntime(struct cfs_rq *cfs_rq); +extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1323,4 +2298,4 @@ index 3bac0a8ceab2..b2e932c25be6 100644 #endif /* _UAPI_LINUX_SCHED_H */ -- -2.40.0.rc2 +2.40.0 diff --git a/patches/0003-bore.patch b/patches/0003-bore.patch index 2ba8854..09b4f45 100644 --- a/patches/0003-bore.patch +++ b/patches/0003-bore.patch @@ -1,38 +1,40 @@ -From e016cce088886f56617becc8fcc598a0114e4faa Mon Sep 17 00:00:00 2001 +From d1d05832308e210422f7c52d052b026deb9fabf1 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sat, 11 Mar 2023 18:44:19 +0100 -Subject: [PATCH] bore-eevdf +Date: Thu, 6 Apr 2023 19:12:01 +0200 +Subject: [PATCH] bore Signed-off-by: Peter Jung --- - include/linux/sched.h | 5 ++ - init/Kconfig | 20 +++++++ - kernel/sched/core.c | 29 ++++++++++ - kernel/sched/debug.c | 3 + - kernel/sched/fair.c | 124 +++++++++++++++++++++++++++++++++++++++++- - 5 files changed, 180 insertions(+), 1 deletion(-) + include/linux/sched.h | 6 ++ + init/Kconfig | 20 ++++++ + kernel/sched/core.c | 30 ++++++++ + kernel/sched/debug.c | 3 + + kernel/sched/fair.c | 149 +++++++++++++++++++++++++++++++++++++++- + kernel/sched/features.h | 8 +++ + 6 files changed, 213 insertions(+), 3 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h -index 764df627c243..f912da35db34 100644 +index 63d242164b1a..39a046d6cf90 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h -@@ -558,6 +558,11 @@ struct sched_entity { +@@ -555,6 +555,12 @@ struct sched_entity { u64 sum_exec_runtime; u64 vruntime; u64 prev_sum_exec_runtime; +#ifdef CONFIG_SCHED_BORE + u64 prev_burst_time; + u64 burst_time; -+ u8 burst_score; ++ u64 max_burst_time; ++ u8 penalty_score; +#endif // CONFIG_SCHED_BORE - s64 lag; - u64 slice; + + u64 nr_migrations; diff --git a/init/Kconfig b/init/Kconfig -index 748a9491ca12..d10f1e6257cd 100644 +index 1fb5f313d18f..6595e5ed2416 100644 --- a/init/Kconfig +++ b/init/Kconfig -@@ -1318,6 +1318,26 @@ config CHECKPOINT_RESTORE +@@ -1285,6 +1285,26 @@ config CHECKPOINT_RESTORE If unsure, say N here. 
@@ -60,89 +62,90 @@ index 748a9491ca12..d10f1e6257cd 100644 bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 9db5f9ec9022..1f1e1f586407 100644 +index 0d18c3969f90..34db768f6ba8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -4418,6 +4418,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) +@@ -4418,6 +4418,22 @@ int wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } +#ifdef CONFIG_SCHED_BORE -+static inline void sched_fork_update_prev_burst(struct task_struct *p) ++static inline void adjust_prev_burst(struct task_struct *p) +{ -+ struct task_struct *sib; + u32 cnt = 0; + u64 sum = 0, avg = 0; ++ struct task_struct *sib; + list_for_each_entry(sib, &p->sibling, sibling) { + cnt++; -+ sum += sib->se.prev_burst_time >> 8; ++ sum += sib->se.max_burst_time >> 8; + } + if (cnt) avg = div_u64(sum, cnt) << 8; + if (p->se.prev_burst_time < avg) p->se.prev_burst_time = avg; ++ p->se.max_burst_time = p->se.prev_burst_time; +} +#endif // CONFIG_SCHED_BORE + /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. -@@ -4434,6 +4449,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4434,6 +4450,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; +#ifdef CONFIG_SCHED_BORE + p->se.burst_time = 0; +#endif // CONFIG_SCHED_BORE - p->se.dur_avg = 0; - p->se.prev_sleep_sum_runtime = 0; - p->se.lag = 0; -@@ -4664,6 +4682,10 @@ late_initcall(sched_core_sysctl_init); + INIT_LIST_HEAD(&p->se.group_node); + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -4659,6 +4678,9 @@ late_initcall(sched_core_sysctl_init); int sched_fork(unsigned long clone_flags, struct task_struct *p) { __sched_fork(clone_flags, p); +#ifdef CONFIG_SCHED_BORE -+ sched_fork_update_prev_burst(p); -+ p->se.burst_time = 0; ++ adjust_prev_burst(p); +#endif // CONFIG_SCHED_BORE /* * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external -@@ -9153,6 +9175,9 @@ void __init init_idle(struct task_struct *idle, int cpu) +@@ -9126,6 +9148,10 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->__state = TASK_RUNNING; idle->se.exec_start = sched_clock(); +#ifdef CONFIG_SCHED_BORE + idle->se.prev_burst_time = 0; ++ idle->se.max_burst_time = 0; +#endif //CONFIG_SCHED_BORE /* * PF_KTHREAD should already be set at this point; regardless, make it * look like a proper per-CPU kthread. 
-@@ -9820,6 +9845,10 @@ void __init sched_init(void) +@@ -9793,6 +9819,10 @@ void __init sched_init(void) BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif +#ifdef CONFIG_SCHED_BORE -+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification for 1.7-eevdf2 by Masahito Suzuki"); ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 2.1.1 by Masahito Suzuki"); +#endif // CONFIG_SCHED_BORE + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index fe9edfa43f65..3672df7c1f6a 100644 +index 1637b65ba07a..752c43a9ff13 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c -@@ -551,6 +551,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +@@ -547,6 +547,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); +#ifdef CONFIG_SCHED_BORE -+ SEQ_printf(m, " %2d", p->se.burst_score); ++ SEQ_printf(m, " %2d", p->se.penalty_score); +#endif #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index c40b775452bc..1e4ca5419a11 100644 +index 6986ea31c984..ee461e4586ca 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -19,6 +19,9 @@ @@ -151,19 +154,19 @@ index c40b775452bc..1e4ca5419a11 100644 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * Burst-Oriented Response Enhancer (BORE) CPU Scheduler -+ * Copyright (C) 2021 Masahito Suzuki ++ * Copyright (C) 2021-2023 Masahito Suzuki */ #include #include -@@ -141,6 +144,16 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +@@ -126,6 +129,16 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#ifdef CONFIG_SCHED_BORE -+unsigned int __read_mostly sched_bore = 1; -+unsigned int __read_mostly sched_burst_penalty_scale = 1280; -+unsigned int __read_mostly sched_burst_granularity = 6; -+unsigned int __read_mostly sched_burst_smoothness = 2; ++unsigned int __read_mostly sched_bore = 3; ++unsigned int __read_mostly sched_burst_penalty_offset = 12; ++unsigned int __read_mostly sched_burst_penalty_scale = 1292; ++unsigned int __read_mostly sched_burst_smoothness = 1; +static int three = 3; +static int sixty_four = 64; +static int maxval_12_bits = 4095; @@ -172,7 +175,7 @@ index c40b775452bc..1e4ca5419a11 100644 int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { -@@ -204,6 +217,44 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; +@@ -185,6 +198,44 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { @@ -184,7 +187,16 @@ index c40b775452bc..1e4ca5419a11 100644 + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_ONE, ++ .extra2 = &three, ++ }, ++ { ++ .procname = "sched_burst_penalty_offset", ++ .data = &sched_burst_penalty_offset, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &sixty_four, + }, + { + .procname = "sched_burst_penalty_scale", @@ -196,15 +208,6 @@ index c40b775452bc..1e4ca5419a11 100644 + .extra2 = &maxval_12_bits, + }, + { -+ .procname = 
"sched_burst_granularity", -+ .data = &sched_burst_granularity, -+ .maxlen = sizeof(unsigned int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = &sixty_four, -+ }, -+ { + .procname = "sched_burst_smoothness", + .data = &sched_burst_smoothness, + .maxlen = sizeof(unsigned int), @@ -217,61 +220,70 @@ index c40b775452bc..1e4ca5419a11 100644 { .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, -@@ -1182,6 +1233,39 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) +@@ -891,6 +942,47 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) } #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_BORE -+static inline void update_burst_score(struct sched_entity *se) { -+ u64 burst_time; -+ s32 bits; -+ u32 intgr, fdigs, dec10; -+ -+ burst_time = max(se->burst_time, se->prev_burst_time); -+ bits = fls64(burst_time); -+ intgr = max((u32)bits, sched_burst_granularity) - sched_burst_granularity; -+ fdigs = max(bits - 1, (s32)sched_burst_granularity); -+ dec10 = (intgr << 10) | (burst_time << (64 - fdigs) >> 54); -+ se->burst_score = min((u32)39, dec10 * sched_burst_penalty_scale >> 20); ++static inline u32 __calc_bits10(u64 burst_time) { ++ u32 bits = fls64(burst_time); ++ u32 fdigs = likely(bits) ? bits - 1 : 0; ++ return (bits << 10) | (burst_time << (64 - fdigs) >> 54); +} + -+static u64 burst_scale(u64 delta, struct sched_entity *se) { -+ return mul_u64_u32_shr(delta, sched_prio_to_wmult[se->burst_score], 22); ++static inline u32 __calc_burst_score(u32 bits10, u32 offset) { ++ u32 val10 = max((s32)0, (s32)bits10 - (s32)(offset << 10)); ++ return min((u32)39, val10 * sched_burst_penalty_scale >> 20); +} + -+static u64 calc_delta_fair_bscale(u64 delta, struct sched_entity *se) { -+ return burst_scale(calc_delta_fair(delta, se), se); ++static void update_burst_score(struct sched_entity *se) { ++ u32 bits10 = __calc_bits10(se->max_burst_time); ++ se->penalty_score = __calc_burst_score(bits10, sched_burst_penalty_offset); ++} ++ ++static inline u64 penalty_scale(u64 delta, struct sched_entity *se) { ++ return mul_u64_u32_shr(delta, sched_prio_to_wmult[se->penalty_score], 22); ++} ++ ++static inline u64 preempt_scale( ++ u64 delta, struct sched_entity *curr, struct sched_entity *se) { ++ ++ u32 score = max(0, (s32)se->penalty_score - (s32)curr->penalty_score) >> 1; ++ return mul_u64_u32_shr(delta, sched_prio_to_wmult[min(39, 20 + score)], 22); +} + +static inline u64 binary_smooth(u64 old, u64 new, unsigned int smoothness) { + return (new + old * ((1 << smoothness) - 1)) >> smoothness; +} + -+static inline void reset_burst(struct sched_entity *se) { ++static void reset_burst(struct sched_entity *se) { + se->prev_burst_time = binary_smooth( + se->prev_burst_time, se->burst_time, sched_burst_smoothness); + se->burst_time = 0; ++ ++ se->max_burst_time = se->prev_burst_time; +} +#endif // CONFIG_SCHED_BORE + /* * Update the current task's runtime statistics. 
*/ -@@ -1211,6 +1295,13 @@ static void update_curr(struct cfs_rq *cfs_rq) +@@ -920,6 +1012,14 @@ static void update_curr(struct cfs_rq *cfs_rq) curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq->exec_clock, delta_exec); +#ifdef CONFIG_SCHED_BORE + curr->burst_time += delta_exec; ++ curr->max_burst_time = max(curr->max_burst_time, curr->burst_time); + update_burst_score(curr); -+ if (sched_bore) -+ curr->vruntime += calc_delta_fair_bscale(delta_exec, curr); ++ if (sched_bore & 1) ++ curr->vruntime += penalty_scale(calc_delta_fair(delta_exec, curr), curr); + else +#endif // CONFIG_SCHED_BORE curr->vruntime += calc_delta_fair(delta_exec, curr); - /* - * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i -@@ -5283,6 +5374,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + update_min_vruntime(cfs_rq); + +@@ -5013,8 +5113,14 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -279,18 +291,30 @@ index c40b775452bc..1e4ca5419a11 100644 +static int +wakeup_preempt_entity_bscale(struct sched_entity *curr, + struct sched_entity *se, bool do_scale); -+#endif // CONFIG_SCHED_BORE ++#else // CONFIG_SCHED_BORE static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); ++#endif // CONFIG_SCHED_BORE -@@ -5330,7 +5426,13 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) + /* + * Pick the next process, keeping these things in mind, in this order: +@@ -5053,16 +5159,34 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) + second = curr; + } + ++#ifdef CONFIG_SCHED_BORE ++ if (second && wakeup_preempt_entity_bscale( ++ second, left, sched_bore & 2) < 1) ++#else // CONFIG_SCHED_BORE + if (second && wakeup_preempt_entity(second, left) < 1) ++#endif // CONFIG_SCHED_BORE se = second; } - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { +#ifdef CONFIG_SCHED_BORE + if (cfs_rq->next && wakeup_preempt_entity_bscale( -+ cfs_rq->next, left, sched_bore & 2) < 1) ++ cfs_rq->next, left, sched_bore & 2) < 1) +#else // CONFIG_SCHED_BORE + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) +#endif // CONFIG_SCHED_BORE @@ -298,7 +322,20 @@ index c40b775452bc..1e4ca5419a11 100644 /* * Someone really wants this to run. If it's not unfair, run it. */ -@@ -6615,6 +6717,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + se = cfs_rq->next; +- } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) { ++ } ++#ifdef CONFIG_SCHED_BORE ++ else if (cfs_rq->last && wakeup_preempt_entity_bscale( ++ cfs_rq->last, left, sched_bore & 2) < 1) ++#else // CONFIG_SCHED_BORE ++ else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) ++#endif // CONFIG_SCHED_BORE ++ { + /* + * Prefer last buddy, try to return the CPU to a preempted task. 
+ */ +@@ -6331,6 +6455,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_dequeue(&rq->cfs, p); for_each_sched_entity(se) { @@ -308,7 +345,7 @@ index c40b775452bc..1e4ca5419a11 100644 cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); -@@ -8070,7 +8175,12 @@ static unsigned long wakeup_gran(struct sched_entity *se) +@@ -7746,7 +7873,12 @@ static unsigned long wakeup_gran(struct sched_entity *se) * */ static int @@ -321,28 +358,31 @@ index c40b775452bc..1e4ca5419a11 100644 { s64 gran, vdiff = curr->vruntime - se->vruntime; -@@ -8078,11 +8188,20 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) +@@ -7754,6 +7886,9 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) return -1; gran = wakeup_gran(se); +#ifdef CONFIG_SCHED_BORE -+ if (do_scale) gran = burst_scale(gran, se); ++ if (do_scale) gran = preempt_scale(gran, curr, se); +#endif // CONFIG_SCHED_BORE if (vdiff > gran) return 1; - return 0; - } -+#ifdef CONFIG_SCHED_BORE -+static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) -+{ -+ return wakeup_preempt_entity_bscale(curr, se, false); -+} -+#endif // CONFIG_SCHED_BORE +@@ -7858,7 +7993,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + return; - static void set_last_buddy(struct sched_entity *se) - { -@@ -8430,6 +8549,9 @@ static void yield_task_fair(struct rq *rq) + update_curr(cfs_rq_of(se)); +- if (wakeup_preempt_entity(se, pse) == 1) { ++#ifdef CONFIG_SCHED_BORE ++ if (wakeup_preempt_entity_bscale(se, pse, sched_bore & 2) == 1) ++#else // CONFIG_SCHED_BORE ++ if (wakeup_preempt_entity(se, pse) == 1) ++#endif // CONFIG_SCHED_BORE ++ { + /* + * Bias pick_next to pick the sched entity that is + * triggering this preemption. +@@ -8094,6 +8234,9 @@ static void yield_task_fair(struct rq *rq) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se; @@ -352,5 +392,33 @@ index c40b775452bc..1e4ca5419a11 100644 /* * Are we the only task in the tree? +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index ee7f23c76bd3..3115bde98211 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -4,7 +4,11 @@ + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ ++#ifdef CONFIG_SCHED_BORE ++SCHED_FEAT(GENTLE_FAIR_SLEEPERS, false) ++#else // CONFIG_SCHED_BORE + SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) ++#endif // CONFIG_SCHED_BORE + + /* + * Place new tasks ahead so that they do not starve already running +@@ -17,7 +21,11 @@ SCHED_FEAT(START_DEBIT, true) + * wakeup-preemption), since its likely going to consume data we + * touched, increases cache locality. 
+ */ ++#ifdef CONFIG_SCHED_BORE ++SCHED_FEAT(NEXT_BUDDY, true) ++#else // CONFIG_SCHED_BORE + SCHED_FEAT(NEXT_BUDDY, false) ++#endif // CONFIG_SCHED_BORE + + /* + * Prefer to schedule the task that ran last (when we did -- -2.40.0.rc2 +2.40.0 diff --git a/patches/0006-Nintendo-controller-one.patch b/patches/0006-Nintendo-controller-one.patch new file mode 100644 index 0000000..a27203e --- /dev/null +++ b/patches/0006-Nintendo-controller-one.patch @@ -0,0 +1,46 @@ +diff --git a/drivers/hid/hid-nintendo.c b/drivers/hid/hid-nintendo.c +index 5bfc0c4504608..2b781cc9082b4 100644 +--- a/drivers/hid/hid-nintendo.c ++++ b/drivers/hid/hid-nintendo.c +@@ -1527,6 +1527,7 @@ static int joycon_set_rumble(struct joycon_ctlr *ctlr, u16 amp_r, u16 amp_l, + u16 freq_l_low; + u16 freq_l_high; + unsigned long flags; ++ int next_rq_head; + + spin_lock_irqsave(&ctlr->lock, flags); + freq_r_low = ctlr->rumble_rl_freq; +@@ -1547,8 +1548,21 @@ static int joycon_set_rumble(struct joycon_ctlr *ctlr, u16 amp_r, u16 amp_l, + joycon_encode_rumble(data, freq_l_low, freq_l_high, amp); + + spin_lock_irqsave(&ctlr->lock, flags); +- if (++ctlr->rumble_queue_head >= JC_RUMBLE_QUEUE_SIZE) +- ctlr->rumble_queue_head = 0; ++ ++ next_rq_head = ctlr->rumble_queue_head + 1; ++ if (next_rq_head >= JC_RUMBLE_QUEUE_SIZE) ++ next_rq_head = 0; ++ ++ /* Did we overrun the circular buffer? ++ * If so, be sure we keep the latest intended rumble state. ++ */ ++ if (next_rq_head == ctlr->rumble_queue_tail) { ++ hid_dbg(ctlr->hdev, "rumble queue is full"); ++ /* overwrite the prior value at the end of the circular buf */ ++ next_rq_head = ctlr->rumble_queue_head; ++ } ++ ++ ctlr->rumble_queue_head = next_rq_head; + memcpy(ctlr->rumble_data[ctlr->rumble_queue_head], data, + JC_RUMBLE_DATA_SIZE); + +@@ -2128,7 +2142,7 @@ static int nintendo_hid_probe(struct hid_device *hdev, + + ctlr->hdev = hdev; + ctlr->ctlr_state = JOYCON_CTLR_STATE_INIT; +- ctlr->rumble_queue_head = JC_RUMBLE_QUEUE_SIZE - 1; ++ ctlr->rumble_queue_head = 0; + ctlr->rumble_queue_tail = 0; + hid_set_drvdata(hdev, ctlr); + mutex_init(&ctlr->output_mutex); +generated by cgit (git 2.34.1) at 2023-04-10 17:10:33 +0000 \ No newline at end of file diff --git a/patches/0006-Nintendo-controller-two.patch b/patches/0006-Nintendo-controller-two.patch new file mode 100644 index 0000000..a61397f --- /dev/null +++ b/patches/0006-Nintendo-controller-two.patch @@ -0,0 +1,116 @@ + +diff --git a/drivers/hid/hid-nintendo.c b/drivers/hid/hid-nintendo.c +index 2b781cc9082b4..250f5d2f888ab 100644 +--- a/drivers/hid/hid-nintendo.c ++++ b/drivers/hid/hid-nintendo.c +@@ -433,7 +433,9 @@ struct joycon_ctlr { + u8 usb_ack_match; + u8 subcmd_ack_match; + bool received_input_report; ++ unsigned int last_input_report_msecs; + unsigned int last_subcmd_sent_msecs; ++ unsigned int consecutive_valid_report_deltas; + + /* factory calibration data */ + struct joycon_stick_cal left_stick_cal_x; +@@ -543,19 +545,54 @@ static void joycon_wait_for_input_report(struct joycon_ctlr *ctlr) + * Sending subcommands and/or rumble data at too high a rate can cause bluetooth + * controller disconnections. + */ ++#define JC_INPUT_REPORT_MIN_DELTA 8 ++#define JC_INPUT_REPORT_MAX_DELTA 17 ++#define JC_SUBCMD_TX_OFFSET_MS 4 ++#define JC_SUBCMD_VALID_DELTA_REQ 3 ++#define JC_SUBCMD_RATE_MAX_ATTEMPTS 500 ++#define JC_SUBCMD_RATE_LIMITER_USB_MS 20 ++#define JC_SUBCMD_RATE_LIMITER_BT_MS 60 ++#define JC_SUBCMD_RATE_LIMITER_MS(ctlr) ((ctlr)->hdev->bus == BUS_USB ? 
JC_SUBCMD_RATE_LIMITER_USB_MS : JC_SUBCMD_RATE_LIMITER_BT_MS) + static void joycon_enforce_subcmd_rate(struct joycon_ctlr *ctlr) + { +- static const unsigned int max_subcmd_rate_ms = 25; +- unsigned int current_ms = jiffies_to_msecs(jiffies); +- unsigned int delta_ms = current_ms - ctlr->last_subcmd_sent_msecs; ++ unsigned int current_ms; ++ unsigned long subcmd_delta; ++ int consecutive_valid_deltas = 0; ++ int attempts = 0; ++ unsigned long flags; ++ ++ if (unlikely(ctlr->ctlr_state != JOYCON_CTLR_STATE_READ)) ++ return; + +- while (delta_ms < max_subcmd_rate_ms && +- ctlr->ctlr_state == JOYCON_CTLR_STATE_READ) { ++ do { + joycon_wait_for_input_report(ctlr); + current_ms = jiffies_to_msecs(jiffies); +- delta_ms = current_ms - ctlr->last_subcmd_sent_msecs; ++ subcmd_delta = current_ms - ctlr->last_subcmd_sent_msecs; ++ ++ spin_lock_irqsave(&ctlr->lock, flags); ++ consecutive_valid_deltas = ctlr->consecutive_valid_report_deltas; ++ spin_unlock_irqrestore(&ctlr->lock, flags); ++ ++ attempts++; ++ } while ((consecutive_valid_deltas < JC_SUBCMD_VALID_DELTA_REQ || ++ subcmd_delta < JC_SUBCMD_RATE_LIMITER_MS(ctlr)) && ++ ctlr->ctlr_state == JOYCON_CTLR_STATE_READ && ++ attempts < JC_SUBCMD_RATE_MAX_ATTEMPTS); ++ ++ if (attempts >= JC_SUBCMD_RATE_MAX_ATTEMPTS) { ++ hid_warn(ctlr->hdev, "%s: exceeded max attempts", __func__); ++ return; + } ++ + ctlr->last_subcmd_sent_msecs = current_ms; ++ ++ /* ++ * Wait a short time after receiving an input report before ++ * transmitting. This should reduce odds of a TX coinciding with an RX. ++ * Minimizing concurrent BT traffic with the controller seems to lower ++ * the rate of disconnections. ++ */ ++ msleep(JC_SUBCMD_TX_OFFSET_MS); + } + + static int joycon_hid_send_sync(struct joycon_ctlr *ctlr, u8 *data, size_t len, +@@ -1223,6 +1260,7 @@ static void joycon_parse_report(struct joycon_ctlr *ctlr, + u8 tmp; + u32 btns; + unsigned long msecs = jiffies_to_msecs(jiffies); ++ unsigned long report_delta_ms = msecs - ctlr->last_input_report_msecs; + + spin_lock_irqsave(&ctlr->lock, flags); + if (IS_ENABLED(CONFIG_NINTENDO_FF) && rep->vibrator_report && +@@ -1364,6 +1402,31 @@ static void joycon_parse_report(struct joycon_ctlr *ctlr, + + input_sync(dev); + ++ spin_lock_irqsave(&ctlr->lock, flags); ++ ctlr->last_input_report_msecs = msecs; ++ /* ++ * Was this input report a reasonable time delta compared to the prior ++ * report? We use this information to decide when a safe time is to send ++ * rumble packets or subcommand packets. ++ */ ++ if (report_delta_ms >= JC_INPUT_REPORT_MIN_DELTA && ++ report_delta_ms <= JC_INPUT_REPORT_MAX_DELTA) { ++ if (ctlr->consecutive_valid_report_deltas < JC_SUBCMD_VALID_DELTA_REQ) ++ ctlr->consecutive_valid_report_deltas++; ++ } else { ++ ctlr->consecutive_valid_report_deltas = 0; ++ } ++ /* ++ * Our consecutive valid report tracking is only relevant for ++ * bluetooth-connected controllers. For USB devices, we're beholden to ++ * USB's underlying polling rate anyway. Always set to the consecutive ++ * delta requirement. ++ */ ++ if (ctlr->hdev->bus == BUS_USB) ++ ctlr->consecutive_valid_report_deltas = JC_SUBCMD_VALID_DELTA_REQ; ++ ++ spin_unlock_irqrestore(&ctlr->lock, flags); ++ + /* + * Immediately after receiving a report is the most reliable time to + * send a subcommand to the controller. 
Wake any subcommand senders \ No newline at end of file diff --git a/scripts/patch.sh b/scripts/patch.sh index 5b711eb..04f493e 100755 --- a/scripts/patch.sh +++ b/scripts/patch.sh @@ -13,6 +13,10 @@ patch -Np1 < "../patches/0003-bore.patch" patch -Np1 < "../patches/0004-hdr.patch" # AMD GPU USB C fix patch patch -Np1 < "../patches/0005-amd-usbc-fix.patch" +# Nintendo controller rumble patch +patch -Np1 < "../patches/0006-Nintendo-controller-one.patch" +# Nintendo controller BT patch +patch -Np1 < "../patches/0006-Nintendo-controller-two.patch" # Nobara patches are here: https://github.com/sammilucia/nobara-kernel-fork # Extra Leigon laptop goodies patch -Np1 < "../patches/0001-Add-legion-laptop-v0.1.patch" diff --git a/scripts/source.sh b/scripts/source.sh index 3a486a5..4c457c0 100755 --- a/scripts/source.sh +++ b/scripts/source.sh @@ -2,7 +2,7 @@ echo "Pika Kernel - Getting source" -wget -nv https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.2.6.tar.gz -tar -zxf ./linux-6.2.6.tar.gz +wget -nv https://git.kernel.org/torvalds/t/linux-6.3-rc6.tar.gz +tar -zxf ./linux-6.3-rc6.tar.gz -cd linux-6.2.6 +cd linux-6.3-rc6
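
Note on the rumble-queue change carried in 0006-Nintendo-controller-one.patch: when advancing the head index would collide with the tail, the head stays where it is, so the newest report overwrites the previously queued slot instead of running over entries that have not been transmitted yet. Below is a minimal user-space sketch of that behaviour, not the driver itself; rumble_push(), RUMBLE_QUEUE_SIZE and RUMBLE_DATA_SIZE are stand-ins for the patched joycon_set_rumble() logic and the driver's JC_RUMBLE_QUEUE_SIZE / JC_RUMBLE_DATA_SIZE constants.

#include <stdio.h>
#include <string.h>

#define RUMBLE_QUEUE_SIZE 8	/* stand-in for JC_RUMBLE_QUEUE_SIZE */
#define RUMBLE_DATA_SIZE  4	/* stand-in for JC_RUMBLE_DATA_SIZE */

struct rumble_queue {
	unsigned char data[RUMBLE_QUEUE_SIZE][RUMBLE_DATA_SIZE];
	int head;	/* slot holding the most recently queued report */
	int tail;	/* next slot the transmit side would drain */
};

/* Queue one rumble report the way the patched code does: advance head,
 * wrap at the end of the ring, and if that would run into tail, keep
 * head in place so the newest report overwrites the prior value rather
 * than the entries still waiting to be sent. */
static void rumble_push(struct rumble_queue *q, const unsigned char *report)
{
	int next_head = q->head + 1;

	if (next_head >= RUMBLE_QUEUE_SIZE)
		next_head = 0;

	if (next_head == q->tail)	/* ring is full */
		next_head = q->head;	/* overwrite the latest slot */

	q->head = next_head;
	memcpy(q->data[q->head], report, RUMBLE_DATA_SIZE);
}

int main(void)
{
	struct rumble_queue q = { .head = 0, .tail = 0 };
	unsigned char report[RUMBLE_DATA_SIZE] = { 0x00, 0x01, 0x40, 0x40 };

	/* Push far faster than anything drains; head parks one slot behind tail. */
	for (int i = 0; i < 3 * RUMBLE_QUEUE_SIZE; i++)
		rumble_push(&q, report);

	printf("head=%d tail=%d\n", q.head, q.tail);
	return 0;
}

The alternative of advancing head onto tail would clobber the entry the work handler is about to send, and simply dropping the new report would discard the latest intended rumble state; overwriting the prior head slot avoids both, which is what the "rumble queue is full" branch in the patch does.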