diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..4ca88b6 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,28 @@ + +name: PikaOS Kernel Build Only + +on: + workflow_dispatch + +jobs: + build: + runs-on: self-hosted + container: + image: ubuntu:latest + volumes: + - /proc:/proc + options: --privileged -it + + steps: + - uses: actions/checkout@v3 + + - name: Install needed packages + run: apt update && apt install bc bison build-essential ccache cpio fakeroot flex git kmod libelf-dev libncurses5-dev libssl-dev lz4 qtbase5-dev rsync schedtool wget zstd tar -y + + - name: Build Kernel + run: ./main.sh + + - uses: actions/upload-artifact@v3 + with: + name: PikaOS Kernel + path: builds/ \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..e8a9f3a --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,25 @@ +name: PikaOS Kernel Build And Release + +on: + workflow_dispatch + +jobs: + build: + runs-on: self-hosted + container: + image: ubuntu:latest + volumes: + - /proc:/proc + options: --privileged -it + + steps: + - uses: actions/checkout@v3 + + - name: Install needed packages + run: apt update && apt install bc bison build-essential ccache cpio fakeroot flex git kmod libelf-dev libncurses5-dev libssl-dev lz4 qtbase5-dev rsync schedtool wget zstd tar -y + + - name: Build Kernel + run: ./main.sh + + - name: Release Kernel + run: ./scripts/release.sh \ No newline at end of file diff --git a/main.sh b/main.sh new file mode 100644 index 0000000..0c27e16 --- /dev/null +++ b/main.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +bash ./scripts/source.sh +bash ./scripts/patch.sh +bash ./scripts/config.sh +bash ./scripts/build.sh +bash ./scripts/output.sh + + diff --git a/patches/0001-cachy-all.patch b/patches/0001-cachy-all.patch new file mode 100644 index 0000000..59a9269 --- /dev/null +++ b/patches/0001-cachy-all.patch @@ -0,0 +1,45207 @@ +From 6e4bce513a02a0be7b1f30c06751eb146cf20b1b Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sun, 15 Jan 2023 16:50:23 +0100 +Subject: [PATCH 01/15] bbr2 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 3 +- + include/net/inet_connection_sock.h | 3 +- + include/net/tcp.h | 41 +- + include/uapi/linux/inet_diag.h | 33 + + net/ipv4/Kconfig | 22 + + net/ipv4/Makefile | 1 + + net/ipv4/tcp.c | 1 + + net/ipv4/tcp_bbr.c | 38 +- + net/ipv4/tcp_bbr2.c | 2674 ++++++++++++++++++++++++++++ + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 27 +- + net/ipv4/tcp_output.c | 26 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 1 + + 14 files changed, 2867 insertions(+), 34 deletions(-) + create mode 100644 net/ipv4/tcp_bbr2.c + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index ca7f05a130d2..09dbcd67ee8e 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -255,7 +255,8 @@ struct tcp_sock { + u8 compressed_ack; + u8 dup_ack_counter:2, + tlp_retrans:1, /* TLP is a retransmission */ +- unused:5; ++ fast_ack_mode:2, /* which fast ack mode ? 
*/ ++ unused:3; + u32 chrono_start; /* Start time in jiffies of a TCP chrono */ + u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ + u8 chrono_type:2, /* current chronograph type */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index c2b15f7e5516..d85858efa571 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -135,7 +135,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; ++/* XXX inflated by temporary internal debugging info */ ++ u64 icsk_ca_priv[216 / sizeof(u64)]; + #define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) + }; + +diff --git a/include/net/tcp.h b/include/net/tcp.h +index db9f828e9d1e..e1f05c2b4707 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -375,6 +375,7 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, + #define TCP_ECN_QUEUE_CWR 2 + #define TCP_ECN_DEMAND_CWR 4 + #define TCP_ECN_SEEN 8 ++#define TCP_ECN_ECT_PERMANENT 16 + + enum tcp_tw_status { + TCP_TW_SUCCESS = 0, +@@ -823,6 +824,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) + { + return tcp_ns_to_ts(skb->skb_mstamp_ns); +@@ -898,9 +904,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1026,7 +1037,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED 0x1 + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN 0x2 +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */ ++#define TCP_CONG_WANTS_CE_EVENTS 0x4 ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1046,8 +1061,11 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ + s32 delivered_ce; /* number of packets delivered w/ CE marks*/ + long interval_us; /* time for tp->delivered to incr "delivered" */ +@@ -1061,6 +1079,7 @@ struct rate_sample { + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? 
*/ + }; + + struct tcp_congestion_ops { +@@ -1084,8 +1103,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) +@@ -1148,6 +1170,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1167,6 +1197,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 50655de04c9b..0e24f11627d5 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -231,9 +231,42 @@ struct tcp_bbr_info { + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ + }; + ++/* Phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr2_phase { ++ BBR2_PHASE_INVALID = 0, ++ BBR2_PHASE_STARTUP = 1, ++ BBR2_PHASE_DRAIN = 2, ++ BBR2_PHASE_PROBE_RTT = 3, ++ BBR2_PHASE_PROBE_BW_UP = 4, ++ BBR2_PHASE_PROBE_BW_DOWN = 5, ++ BBR2_PHASE_PROBE_BW_CRUISE = 6, ++ BBR2_PHASE_PROBE_BW_REFILL = 7 ++}; ++ ++struct tcp_bbr2_info { ++ /* u64 bw: bandwidth (app throughput) estimate in Byte per sec: */ ++ __u32 bbr_bw_lsb; /* lower 32 bits of bw */ ++ __u32 bbr_bw_msb; /* upper 32 bits of bw */ ++ __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ ++ __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ ++ __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* MUST be at this offset in struct */ ++ __u32 bbr_inflight_lo; /* lower/short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher/long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ + union tcp_cc_info { + struct tcpvegas_info vegas; + struct tcp_dctcp_info dctcp; + struct tcp_bbr_info bbr; ++ struct tcp_bbr2_info bbr2; + }; + #endif /* _UAPI_INET_DIAG_H_ */ +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 2dfb12230f08..b6bec331a82e 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -678,6 +678,24 @@ config TCP_CONG_BBR + AQM schemes that do not provide a delay signal. It requires the fq + ("Fair Queue") pacing packet scheduler. + ++config TCP_CONG_BBR2 ++ tristate "BBR2 TCP" ++ default n ++ help ++ ++ BBR2 TCP congestion control is a model-based congestion control ++ algorithm that aims to maximize network utilization, keep queues and ++ retransmit rates low, and to be able to coexist with Reno/CUBIC in ++ common scenarios. It builds an explicit model of the network path. It ++ tolerates a targeted degree of random packet loss and delay that are ++ unrelated to congestion. It can operate over LAN, WAN, cellular, wifi, ++ or cable modem links, and can use DCTCP-L4S-style ECN signals. It can ++ coexist with flows that use loss-based congestion control, and can ++ operate with shallow buffers, deep buffers, bufferbloat, policers, or ++ AQM schemes that do not provide a delay signal. It requires pacing, ++ using either TCP internal pacing or the fq ("Fair Queue") pacing packet ++ scheduler. 
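Beyond picking a build-time default in the Kconfig choice below, the new TCP_CONG_BBR2 algorithm can also be selected at runtime, either system-wide (sysctl -w net.ipv4.tcp_congestion_control=bbr2) or per socket with the standard TCP_CONGESTION socket option. A minimal user-space sketch of the per-socket path, assuming a kernel built from this tree with CONFIG_TCP_CONG_BBR2 enabled and, for unprivileged callers, bbr2 listed in net.ipv4.tcp_allowed_congestion_control:

/* Illustrative sketch only, not part of the patch: opt one socket into bbr2. */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	const char cc[] = "bbr2";
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* Fails with ENOENT if the algorithm is neither built in nor loadable. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, cc, strlen(cc)) < 0)
		perror("setsockopt(TCP_CONGESTION)");
	close(fd);
	return 0;
}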
++ + choice + prompt "Default TCP congestion control" + default DEFAULT_CUBIC +@@ -715,6 +733,9 @@ choice + config DEFAULT_BBR + bool "BBR" if TCP_CONG_BBR=y + ++ config DEFAULT_BBR2 ++ bool "BBR2" if TCP_CONG_BBR2=y ++ + config DEFAULT_RENO + bool "Reno" + endchoice +@@ -739,6 +760,7 @@ config DEFAULT_TCP_CONG + default "dctcp" if DEFAULT_DCTCP + default "cdg" if DEFAULT_CDG + default "bbr" if DEFAULT_BBR ++ default "bbr2" if DEFAULT_BBR2 + default "cubic" + + config TCP_MD5SIG +diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile +index af7d2cf490fb..e7a86a50838a 100644 +--- a/net/ipv4/Makefile ++++ b/net/ipv4/Makefile +@@ -46,6 +46,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o + obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o + obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o + obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o ++obj-$(CONFIG_TCP_CONG_BBR2) += tcp_bbr2.o + obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o + obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o + obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 33f559f491c8..e9e8040d6491 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3191,6 +3191,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index d2c470524e58..af08fb3cb139 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -294,26 +294,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + sk->sk_pacing_rate = rate; + } + +-/* override sysctl_tcp_min_tso_segs */ + static u32 bbr_min_tso_segs(struct sock *sk) + { + return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; + } + ++/* Return the number of segments BBR would like in a TSO/GSO skb, given ++ * a particular max gso size as a constraint. ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ u32 segs; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, div_u64(bytes, mss_now), bbr_min_tso_segs(sk)); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) ++{ ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); ++} ++ ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. 
+- */ +- bytes = min_t(unsigned long, +- sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -1149,7 +1163,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c +new file mode 100644 +index 000000000000..85f8052144d1 +--- /dev/null ++++ b/net/ipv4/tcp_bbr2.c +@@ -0,0 +1,2674 @@ ++/* BBR (Bottleneck Bandwidth and RTT) congestion control, v2 ++ * ++ * BBRv2 is a model-based congestion control algorithm that aims for low ++ * queues, low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model ++ * of the network path, it uses measurements of bandwidth and RTT, as well as ++ * (if they occur) packet loss and/or DCTCP/L4S-style ECN signals. Note that ++ * although it can use ECN or loss signals explicitly, it does not require ++ * either; it can bound its in-flight data based on its estimate of the BDP. ++ * ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. ++ * ++ * Here is a state transition diagram for BBR: ++ * ++ * | ++ * V ++ * +---> STARTUP ----+ ++ * | | | ++ * | V | ++ * | DRAIN ----+ ++ * | | | ++ * | V | ++ * +---> PROBE_BW ----+ ++ * | ^ | | ++ * | | | | ++ * | +----+ | ++ * | | ++ * +---- PROBE_RTT <--+ ++ * ++ * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. ++ * When it estimates the pipe is full, it enters DRAIN to drain the queue. ++ * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. ++ * A long-lived BBR flow spends the vast majority of its time remaining ++ * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth ++ * in a fair manner, with a small, bounded queue. *If* a flow has been ++ * continuously sending for the entire min_rtt window, and hasn't seen an RTT ++ * sample that matches or decreases its min_rtt estimate for 10 seconds, then ++ * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe ++ * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if ++ * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; ++ * otherwise we enter STARTUP to try to fill the pipe. ++ * ++ * BBR is described in detail in: ++ * "BBR: Congestion-Based Congestion Control", ++ * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, ++ * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. 
++ * ++ * There is a public e-mail list for discussing BBR development and testing: ++ * https://groups.google.com/forum/#!forum/bbr-dev ++ * ++ * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, ++ * otherwise TCP stack falls back to an internal pacing using one high ++ * resolution timer per TCP socket and may use more resources. ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++#include "tcp_dctcp.h" ++ ++/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth ++ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. ++ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. ++ * Since the minimum window is >=4 packets, the lower bound isn't ++ * an issue. The upper bound isn't an issue with existing technologies. ++ */ ++#define BW_SCALE 24 ++#define BW_UNIT (1 << BW_SCALE) ++ ++#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */ ++#define BBR_UNIT (1 << BBR_SCALE) ++ ++#define FLAG_DEBUG_VERBOSE 0x1 /* Verbose debugging messages */ ++#define FLAG_DEBUG_LOOPBACK 0x2 /* Do NOT skip loopback addr */ ++ ++#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++ ++/* BBR has the following modes for deciding how fast to send: */ ++enum bbr_mode { ++ BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ ++ BBR_DRAIN, /* drain any queue created during startup */ ++ BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ ++ BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ ++}; ++ ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ ++/* BBR congestion control block */ ++struct bbr { ++ u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ ++ u32 min_rtt_stamp; /* timestamp of min_rtt_us */ ++ u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ ++ u32 probe_rtt_min_us; /* min RTT in bbr_probe_rtt_win_ms window */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ ++ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u64 cycle_mstamp; /* time of this cycle phase start */ ++ u32 mode:3, /* current bbr_mode in state machine */ ++ prev_ca_state:3, /* CA state on previous ACK */ ++ packet_conservation:1, /* use packet conservation? */ ++ round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ ++ unused2:11, ++ idle_restart:1, /* restarting after idle? */ ++ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ ++ cycle_idx:3, /* current index in pacing_gain cycle array */ ++ has_seen_rtt:1; /* have we seen an RTT sample yet? */ ++ u32 pacing_gain:11, /* current gain for setting pacing rate */ ++ cwnd_gain:11, /* current gain for setting cwnd */ ++ full_bw_reached:1, /* reached full bw in Startup? 
*/ ++ full_bw_cnt:2, /* number of rounds without large bw gains */ ++ init_cwnd:7; /* initial cwnd */ ++ u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ ++ u32 full_bw; /* recent bw, to estimate if pipe is full */ ++ ++ /* For tracking ACK aggregation: */ ++ u64 ack_epoch_mstamp; /* start of ACK sampling epoch */ ++ u16 extra_acked[2]; /* max excess data ACKed in epoch */ ++ u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ ++ extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ ++ extra_acked_win_idx:1, /* current index in extra_acked array */ ++ /* BBR v2 state: */ ++ unused1:2, ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1; /* ECN in this cycle? */ ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* upper bound of sending bandwidth range*/ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ /* Params configurable using setsockopt. Refer to correspoding ++ * module param for detailed description of params. 
++ */ ++ struct bbr_params { ++ u32 high_gain:11, /* max allowed value: 2047 */ ++ drain_gain:10, /* max allowed value: 1023 */ ++ cwnd_gain:11; /* max allowed value: 2047 */ ++ u32 cwnd_min_target:4, /* max allowed value: 15 */ ++ min_rtt_win_sec:5, /* max allowed value: 31 */ ++ probe_rtt_mode_ms:9, /* max allowed value: 511 */ ++ full_bw_cnt:3, /* max allowed value: 7 */ ++ cwnd_tso_budget:1, /* allowed values: {0, 1} */ ++ unused3:6, ++ drain_to_target:1, /* boolean */ ++ precise_ece_ack:1, /* boolean */ ++ extra_acked_in_startup:1, /* allowed values: {0, 1} */ ++ fast_path:1; /* boolean */ ++ u32 full_bw_thresh:10, /* max allowed value: 1023 */ ++ startup_cwnd_gain:11, /* max allowed value: 2047 */ ++ bw_probe_pif_gain:9, /* max allowed value: 511 */ ++ usage_based_cwnd:1, /* boolean */ ++ unused2:1; ++ u16 probe_rtt_win_ms:14, /* max allowed value: 16383 */ ++ refill_add_inc:2; /* max allowed value: 3 */ ++ u16 extra_acked_gain:11, /* max allowed value: 2047 */ ++ extra_acked_win_rtts:5; /* max allowed value: 31*/ ++ u16 pacing_gain[CYCLE_LEN]; /* max allowed value: 1023 */ ++ /* Mostly BBR v2 parameters below here: */ ++ u32 ecn_alpha_gain:8, /* max allowed value: 255 */ ++ ecn_factor:8, /* max allowed value: 255 */ ++ ecn_thresh:8, /* max allowed value: 255 */ ++ beta:8; /* max allowed value: 255 */ ++ u32 ecn_max_rtt_us:19, /* max allowed value: 524287 */ ++ bw_probe_reno_gain:9, /* max allowed value: 511 */ ++ full_loss_cnt:4; /* max allowed value: 15 */ ++ u32 probe_rtt_cwnd_gain:8, /* max allowed value: 255 */ ++ inflight_headroom:8, /* max allowed value: 255 */ ++ loss_thresh:8, /* max allowed value: 255 */ ++ bw_probe_max_rounds:8; /* max allowed value: 255 */ ++ u32 bw_probe_rand_rounds:4, /* max allowed value: 15 */ ++ bw_probe_base_us:26, /* usecs: 0..2^26-1 (67 secs) */ ++ full_ecn_cnt:2; /* max allowed value: 3 */ ++ u32 bw_probe_rand_us:26, /* usecs: 0..2^26-1 (67 secs) */ ++ undo:1, /* boolean */ ++ tso_rtt_shift:4, /* max allowed value: 15 */ ++ unused5:1; ++ u32 ecn_reprobe_gain:9, /* max allowed value: 511 */ ++ unused1:14, ++ ecn_alpha_init:9; /* max allowed value: 256 */ ++ } params; ++ ++ struct { ++ u32 snd_isn; /* Initial sequence number */ ++ u32 rs_bw; /* last valid rate sample bw */ ++ u32 target_cwnd; /* target cwnd, based on BDP */ ++ u8 undo:1, /* Undo even happened but not yet logged */ ++ unused:7; ++ char event; /* single-letter event debug codes */ ++ u16 unused2; ++ } debug; ++}; ++ ++struct bbr_context { ++ u32 sample_bw; ++ u32 target_cwnd; ++ u32 log:1; ++}; ++ ++/* Window length of min_rtt filter (in sec). Max allowed value is 31 (0x1F) */ ++static u32 bbr_min_rtt_win_sec = 10; ++/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode. ++ * Max allowed value is 511 (0x1FF). ++ */ ++static u32 bbr_probe_rtt_mode_ms = 200; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static u32 bbr_probe_rtt_win_ms = 5000; ++/* Skip TSO below the following bandwidth (bits/sec): */ ++static int bbr_min_tso_rate = 1200000; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. By default we cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. 
++ */ ++static u32 bbr_tso_rtt_shift = 9; /* halve allowance per 2^9 usecs, 512us */ ++ ++/* Select cwnd TSO budget approach: ++ * 0: padding ++ * 1: flooring ++ */ ++static uint bbr_cwnd_tso_budget = 1; ++ ++/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. ++ * In order to help drive the network toward lower queues and low latency while ++ * maintaining high utilization, the average pacing rate aims to be slightly ++ * lower than the estimated bandwidth. This is an important aspect of the ++ * design. ++ */ ++static const int bbr_pacing_margin_percent = 1; ++ ++/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++ * that will allow a smoothly increasing pacing rate that will double each RTT ++ * and send the same number of packets per RTT that an un-paced, slow-starting ++ * Reno or CUBIC flow would. Max allowed value is 2047 (0x7FF). ++ */ ++static int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; ++/* The gain for deriving startup cwnd. Max allowed value is 2047 (0x7FF). */ ++static int bbr_startup_cwnd_gain = BBR_UNIT * 2885 / 1000 + 1; ++/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++ * the queue created in BBR_STARTUP in a single round. Max allowed value ++ * is 1023 (0x3FF). ++ */ ++static int bbr_drain_gain = BBR_UNIT * 1000 / 2885; ++/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs. ++ * Max allowed value is 2047 (0x7FF). ++ */ ++static int bbr_cwnd_gain = BBR_UNIT * 2; ++/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw. ++ * Max allowed value for each element is 1023 (0x3FF). ++ */ ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ ++}; ++static int bbr_pacing_gain[] = { ++ BBR_UNIT * 5 / 4, /* probe for more available bw */ ++ BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ ++ BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ ++ BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++}; ++ ++/* Try to keep at least this many packets in flight, if things go smoothly. For ++ * smooth functioning, a sliding window protocol ACKing every other packet ++ * needs at least 4 packets in flight. Max allowed value is 15 (0xF). ++ */ ++static u32 bbr_cwnd_min_target = 4; ++ ++/* Cwnd to BDP proportion in PROBE_RTT mode scaled by BBR_UNIT. Default: 50%. ++ * Use 0 to disable. Max allowed value is 255. ++ */ ++static u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* If bw has increased significantly (1.25x), there may be more bw available. ++ * Max allowed value is 1023 (0x3FF). ++ */ ++static u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; ++/* But after 3 rounds w/o significant bw growth, estimate pipe is full. ++ * Max allowed value is 7 (0x7). ++ */ ++static u32 bbr_full_bw_cnt = 3; ++ ++static u32 bbr_flags; /* Debugging related stuff */ ++ ++/* Whether to debug using printk. ++ */ ++static bool bbr_debug_with_printk; ++ ++/* Whether to debug using ftrace event tcp:tcp_bbr_event. ++ * Ignored when bbr_debug_with_printk is set. ++ */ ++static bool bbr_debug_ftrace; ++ ++/* Experiment: each cycle, try to hold sub-unity gain until inflight <= BDP. 
*/ ++static bool bbr_drain_to_target = true; /* default: enabled */ ++ ++/* Experiment: Flags to control BBR with ECN behavior. ++ */ ++static bool bbr_precise_ece_ack = true; /* default: enabled */ ++ ++/* The max rwin scaling shift factor is 14 (RFC 1323), so the max sane rwin is ++ * (2^(16+14) B)/(1024 B/packet) = 1M packets. ++ */ ++static u32 bbr_cwnd_warn_val = 1U << 20; ++ ++static u16 bbr_debug_port_mask; ++ ++/* BBR module parameters. These are module parameters only in Google prod. ++ * Upstream these are intentionally not module parameters. ++ */ ++static int bbr_pacing_gain_size = CYCLE_LEN; ++ ++/* Gain factor for adding extra_acked to target cwnd: */ ++static int bbr_extra_acked_gain = 256; ++ ++/* Window length of extra_acked window. Max allowed val is 31. */ ++static u32 bbr_extra_acked_win_rtts = 5; ++ ++/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */ ++static u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; ++ ++/* Time period for clamping cwnd increment due to ack aggregation */ ++static u32 bbr_extra_acked_max_us = 100 * 1000; ++ ++/* Use extra acked in startup ? ++ * 0: disabled ++ * 1: use latest extra_acked value from 1-2 rtt in startup ++ */ ++static int bbr_extra_acked_in_startup = 1; /* default: enabled */ ++ ++/* Experiment: don't grow cwnd beyond twice of what we just probed. */ ++static bool bbr_usage_based_cwnd; /* default: disabled */ ++ ++/* For lab testing, researchers can enable BBRv2 ECN support with this flag, ++ * when they know that any ECN marks that the connections experience will be ++ * DCTCP/L4S-style ECN marks, rather than RFC3168 ECN marks. ++ * TODO(ncardwell): Production use of the BBRv2 ECN functionality depends on ++ * negotiation or configuration that is outside the scope of the BBRv2 ++ * alpha release. 
++ */ ++static bool bbr_ecn_enable = false; ++ ++module_param_named(min_tso_rate, bbr_min_tso_rate, int, 0644); ++module_param_named(tso_rtt_shift, bbr_tso_rtt_shift, int, 0644); ++module_param_named(high_gain, bbr_high_gain, int, 0644); ++module_param_named(drain_gain, bbr_drain_gain, int, 0644); ++module_param_named(startup_cwnd_gain, bbr_startup_cwnd_gain, int, 0644); ++module_param_named(cwnd_gain, bbr_cwnd_gain, int, 0644); ++module_param_array_named(pacing_gain, bbr_pacing_gain, int, ++ &bbr_pacing_gain_size, 0644); ++module_param_named(cwnd_min_target, bbr_cwnd_min_target, uint, 0644); ++module_param_named(probe_rtt_cwnd_gain, ++ bbr_probe_rtt_cwnd_gain, uint, 0664); ++module_param_named(cwnd_warn_val, bbr_cwnd_warn_val, uint, 0664); ++module_param_named(debug_port_mask, bbr_debug_port_mask, ushort, 0644); ++module_param_named(flags, bbr_flags, uint, 0644); ++module_param_named(debug_ftrace, bbr_debug_ftrace, bool, 0644); ++module_param_named(debug_with_printk, bbr_debug_with_printk, bool, 0644); ++module_param_named(min_rtt_win_sec, bbr_min_rtt_win_sec, uint, 0644); ++module_param_named(probe_rtt_mode_ms, bbr_probe_rtt_mode_ms, uint, 0644); ++module_param_named(probe_rtt_win_ms, bbr_probe_rtt_win_ms, uint, 0644); ++module_param_named(full_bw_thresh, bbr_full_bw_thresh, uint, 0644); ++module_param_named(full_bw_cnt, bbr_full_bw_cnt, uint, 0644); ++module_param_named(cwnd_tso_bduget, bbr_cwnd_tso_budget, uint, 0664); ++module_param_named(extra_acked_gain, bbr_extra_acked_gain, int, 0664); ++module_param_named(extra_acked_win_rtts, ++ bbr_extra_acked_win_rtts, uint, 0664); ++module_param_named(extra_acked_max_us, ++ bbr_extra_acked_max_us, uint, 0664); ++module_param_named(ack_epoch_acked_reset_thresh, ++ bbr_ack_epoch_acked_reset_thresh, uint, 0664); ++module_param_named(drain_to_target, bbr_drain_to_target, bool, 0664); ++module_param_named(precise_ece_ack, bbr_precise_ece_ack, bool, 0664); ++module_param_named(extra_acked_in_startup, ++ bbr_extra_acked_in_startup, int, 0664); ++module_param_named(usage_based_cwnd, bbr_usage_based_cwnd, bool, 0664); ++module_param_named(ecn_enable, bbr_ecn_enable, bool, 0664); ++ ++static void bbr2_exit_probe_rtt(struct sock *sk); ++static void bbr2_reset_congestion_signals(struct sock *sk); ++ ++static void bbr_check_probe_rtt_done(struct sock *sk); ++ ++/* Do we estimate that STARTUP filled the pipe? */ ++static bool bbr_full_bw_reached(const struct sock *sk) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return bbr->full_bw_reached; ++} ++ ++/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ ++static u32 bbr_max_bw(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); ++} ++ ++/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ ++static u32 bbr_bw(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return min(bbr_max_bw(sk), bbr->bw_lo); ++} ++ ++/* Return maximum extra acked in past k-2k round trips, ++ * where k = bbr_extra_acked_win_rtts. ++ */ ++static u16 bbr_extra_acked(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return max(bbr->extra_acked[0], bbr->extra_acked[1]); ++} ++ ++/* Return rate in bytes per second, optionally with a gain. ++ * The order here is chosen carefully to avoid overflow of u64. This should ++ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
++ */ ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) ++{ ++ unsigned int mss = tcp_sk(sk)->mss_cache; ++ ++ rate *= mss; ++ rate *= gain; ++ rate >>= BBR_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); ++} ++ ++static u64 bbr_rate_kbps(struct sock *sk, u64 rate) ++{ ++ rate = bbr_bw_bytes_per_sec(sk, rate); ++ rate *= 8; ++ do_div(rate, 1000); ++ return rate; ++} ++ ++static u32 bbr_tso_segs_goal(struct sock *sk); ++static void bbr_debug(struct sock *sk, u32 acked, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ static const char ca_states[] = { ++ [TCP_CA_Open] = 'O', ++ [TCP_CA_Disorder] = 'D', ++ [TCP_CA_CWR] = 'C', ++ [TCP_CA_Recovery] = 'R', ++ [TCP_CA_Loss] = 'L', ++ }; ++ static const char mode[] = { ++ 'G', /* Growing - BBR_STARTUP */ ++ 'D', /* Drain - BBR_DRAIN */ ++ 'W', /* Window - BBR_PROBE_BW */ ++ 'M', /* Min RTT - BBR_PROBE_RTT */ ++ }; ++ static const char ack_phase[] = { /* bbr_ack_phase strings */ ++ 'I', /* BBR_ACKS_INIT - 'Init' */ ++ 'R', /* BBR_ACKS_REFILLING - 'Refilling' */ ++ 'B', /* BBR_ACKS_PROBE_STARTING - 'Before' */ ++ 'F', /* BBR_ACKS_PROBE_FEEDBACK - 'Feedback' */ ++ 'A', /* BBR_ACKS_PROBE_STOPPING - 'After' */ ++ }; ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 una = tp->snd_una - bbr->debug.snd_isn; ++ const u32 fack = tcp_highest_sack_seq(tp); ++ const u16 dport = ntohs(inet_sk(sk)->inet_dport); ++ bool is_port_match = (bbr_debug_port_mask && ++ ((dport & bbr_debug_port_mask) == 0)); ++ char debugmsg[320]; ++ ++ if (sk->sk_state == TCP_SYN_SENT) ++ return; /* no bbr_init() yet if SYN retransmit -> CA_Loss */ ++ ++ if (!tp->snd_cwnd || tp->snd_cwnd > bbr_cwnd_warn_val) { ++ char addr[INET6_ADDRSTRLEN + 10] = { 0 }; ++ ++ if (sk->sk_family == AF_INET) ++ snprintf(addr, sizeof(addr), "%pI4:%u", ++ &inet_sk(sk)->inet_daddr, dport); ++ else if (sk->sk_family == AF_INET6) ++ snprintf(addr, sizeof(addr), "%pI6:%u", ++ &sk->sk_v6_daddr, dport); ++ ++ WARN_ONCE(1, ++ "BBR %s cwnd alert: %u " ++ "snd_una: %u ca: %d pacing_gain: %u cwnd_gain: %u " ++ "bw: %u rtt: %u min_rtt: %u " ++ "acked: %u tso_segs: %u " ++ "bw: %d %ld %d pif: %u\n", ++ addr, tp->snd_cwnd, ++ una, inet_csk(sk)->icsk_ca_state, ++ bbr->pacing_gain, bbr->cwnd_gain, ++ bbr_max_bw(sk), (tp->srtt_us >> 3), bbr->min_rtt_us, ++ acked, bbr_tso_segs_goal(sk), ++ rs->delivered, rs->interval_us, rs->is_retrans, ++ tcp_packets_in_flight(tp)); ++ } ++ ++ if (likely(!bbr_debug_with_printk && !bbr_debug_ftrace)) ++ return; ++ ++ if (!sock_flag(sk, SOCK_DBG) && !is_port_match) ++ return; ++ ++ if (!ctx->log && !tp->app_limited && !(bbr_flags & FLAG_DEBUG_VERBOSE)) ++ return; ++ ++ if (ipv4_is_loopback(inet_sk(sk)->inet_daddr) && ++ !(bbr_flags & FLAG_DEBUG_LOOPBACK)) ++ return; ++ ++ snprintf(debugmsg, sizeof(debugmsg) - 1, ++ "BBR %pI4:%-5u %5u,%03u:%-7u %c " ++ "%c %2u br %2u cr %2d rtt %5ld d %2d i %5ld mrtt %d %cbw %llu " ++ "bw %llu lb %llu ib %llu qb %llu " ++ "a %u if %2u %c %c dl %u l %u al %u # %u t %u %c %c " ++ "lr %d er %d ea %d bwl %lld il %d ih %d c %d " ++ "v %d %c %u %c %s\n", ++ &inet_sk(sk)->inet_daddr, dport, ++ una / 1000, una % 1000, fack - tp->snd_una, ++ ca_states[inet_csk(sk)->icsk_ca_state], ++ bbr->debug.undo ? 
'@' : mode[bbr->mode], ++ tp->snd_cwnd, ++ bbr_extra_acked(sk), /* br (legacy): extra_acked */ ++ rs->tx_in_flight, /* cr (legacy): tx_inflight */ ++ rs->rtt_us, ++ rs->delivered, ++ rs->interval_us, ++ bbr->min_rtt_us, ++ rs->is_app_limited ? '_' : 'l', ++ bbr_rate_kbps(sk, ctx->sample_bw), /* lbw: latest sample bw */ ++ bbr_rate_kbps(sk, bbr_max_bw(sk)), /* bw: max bw */ ++ 0ULL, /* lb: [obsolete] */ ++ 0ULL, /* ib: [obsolete] */ ++ div_u64((u64)sk->sk_pacing_rate * 8, 1000), ++ acked, ++ tcp_packets_in_flight(tp), ++ rs->is_ack_delayed ? 'd' : '.', ++ bbr->round_start ? '*' : '.', ++ tp->delivered, tp->lost, ++ tp->app_limited, ++ 0, /* #: [obsolete] */ ++ ctx->target_cwnd, ++ tp->reord_seen ? 'r' : '.', /* r: reordering seen? */ ++ ca_states[bbr->prev_ca_state], ++ (rs->lost + rs->delivered) > 0 ? ++ (1000 * rs->lost / ++ (rs->lost + rs->delivered)) : 0, /* lr: loss rate x1000 */ ++ (rs->delivered) > 0 ? ++ (1000 * rs->delivered_ce / ++ (rs->delivered)) : 0, /* er: ECN rate x1000 */ ++ 1000 * bbr->ecn_alpha >> BBR_SCALE, /* ea: ECN alpha x1000 */ ++ bbr->bw_lo == ~0U ? ++ -1 : (s64)bbr_rate_kbps(sk, bbr->bw_lo), /* bwl */ ++ bbr->inflight_lo, /* il */ ++ bbr->inflight_hi, /* ih */ ++ bbr->bw_probe_up_cnt, /* c */ ++ 2, /* v: version */ ++ bbr->debug.event, ++ bbr->cycle_idx, ++ ack_phase[bbr->ack_phase], ++ bbr->bw_probe_samples ? "Y" : "N"); ++ debugmsg[sizeof(debugmsg) - 1] = 0; ++ ++ /* printk takes a higher precedence. */ ++ if (bbr_debug_with_printk) ++ printk(KERN_DEBUG "%s", debugmsg); ++ ++ if (unlikely(bbr->debug.undo)) ++ bbr->debug.undo = 0; ++} ++ ++/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ ++static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) ++{ ++ u64 rate = bw; ++ ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); ++ rate = min_t(u64, rate, sk->sk_max_pacing_rate); ++ return rate; ++} ++ ++/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++static void bbr_init_pacing_rate_from_rtt(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ u32 rtt_us; ++ ++ if (tp->srtt_us) { /* any RTT sample yet? */ ++ rtt_us = max(tp->srtt_us >> 3, 1U); ++ bbr->has_seen_rtt = 1; ++ } else { /* no RTT sample yet */ ++ rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ ++ } ++ bw = (u64)tp->snd_cwnd * BW_UNIT; ++ do_div(bw, rtt_us); ++ sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr->params.high_gain); ++} ++ ++/* Pace using current bw estimate and a gain factor. */ ++static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); ++ ++ if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) ++ bbr_init_pacing_rate_from_rtt(sk); ++ if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) ++ sk->sk_pacing_rate = rate; ++} ++ ++static u32 bbr_min_tso_segs(struct sock *sk) ++{ ++ return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; ++} ++ ++/* Return the number of segments BBR would like in a TSO/GSO skb, given ++ * a particular max gso size as a constraint. ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). 
*/ ++ bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr->params.tso_rtt_shift) { ++ r = bbr->min_rtt_us >> bbr->params.tso_rtt_shift; ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, div_u64(bytes, mss_now), bbr_min_tso_segs(sk)); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) ++{ ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); ++} ++ ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ ++static u32 bbr_tso_segs_goal(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); ++} ++ ++/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ ++static void bbr_save_cwnd(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) ++ bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ ++ else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ ++ bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); ++} ++ ++static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (event == CA_EVENT_TX_START && tp->app_limited) { ++ bbr->idle_restart = 1; ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ /* Avoid pointless buffer overflows: pace at est. bw if we don't ++ * need more speed (we're restarting from idle and app-limited). ++ */ ++ if (bbr->mode == BBR_PROBE_BW) ++ bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); ++ else if (bbr->mode == BBR_PROBE_RTT) ++ bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_ecn_enable && ++ bbr->params.precise_ece_ack) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ if (tp->fast_ack_mode == 2 && event == CA_EVENT_ECN_IS_CE) ++ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); ++ } ++} ++ ++/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: ++ * ++ * bdp = ceil(bw * min_rtt * gain) ++ * ++ * The key factor, gain, controls the amount of queue. While a small gain ++ * builds a smaller queue, it becomes more vulnerable to noise in RTT ++ * measurements (e.g., delayed ACKs or other ACK compression effects). This ++ * noise may cause BBR to under-estimate the rate. ++ */ ++static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; ++ u64 w; ++ ++ /* If we've never had a valid RTT sample, cap cwnd at the initial ++ * default. This should only happen when the connection is not using TCP ++ * timestamps and has retransmitted all of the SYN/SYNACK/data packets ++ * ACKed so far. In this case, an RTO can cut cwnd to 1, in which ++ * case we need to slow-start up toward something safe: initial cwnd. ++ */ ++ if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? 
*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ ++ ++ w = (u64)bw * bbr->min_rtt_us; ++ ++ /* Apply a gain to the given value, remove the BW_SCALE shift, and ++ * round the value up to avoid a negative feedback loop. ++ */ ++ bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; ++ ++ return bdp; ++} ++ ++/* To achieve full performance in high-speed paths, we budget enough cwnd to ++ * fit full-sized skbs in-flight on both end hosts to fully utilize the path: ++ * - one skb in sending host Qdisc, ++ * - one skb in sending host TSO/GSO engine ++ * - one skb being received by receiver host LRO/GRO/delayed-ACK engine ++ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because ++ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * which allows 2 outstanding 2-packet sequences, to try to keep pipe ++ * full even with ACK-every-other-packet delayed ACKs. ++ */ ++static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; ++ ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); ++ ++ /* Allow enough full-sized skbs in flight to utilize end systems. */ ++ if (bbr->params.cwnd_tso_budget == 1) { ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); ++ } else { ++ cwnd += tso_segs_goal; ++ cwnd = (cwnd + 1) & ~1U; ++ } ++ /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ cwnd += 2; ++ ++ return cwnd; ++} ++ ++/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */ ++static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) ++{ ++ u32 inflight; ++ ++ inflight = bbr_bdp(sk, bw, gain); ++ inflight = bbr_quantization_budget(sk, inflight); ++ ++ return inflight; ++} ++ ++/* With pacing at lower layers, there's often less data "in the network" than ++ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), ++ * we often have several skbs queued in the pacing layer with a pre-scheduled ++ * earliest departure time (EDT). BBR adapts its pacing rate based on the ++ * inflight level that it estimates has already been "baked in" by previous ++ * departure time decisions. We calculate a rough estimate of the number of our ++ * packets that might be in the network at the earliest departure time for the ++ * next skb scheduled: ++ * in_network_at_edt = inflight_at_edt - (EDT - now) * bw ++ * If we're increasing inflight, then we want to know if the transmit of the ++ * EDT skb will push inflight above the target, so inflight_at_edt includes ++ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight, ++ * then estimate if inflight will sink too low just before the EDT transmit. 
++ */ ++static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 now_ns, edt_ns, interval_us; ++ u32 interval_delivered, inflight_at_edt; ++ ++ now_ns = tp->tcp_clock_cache; ++ edt_ns = max(tp->tcp_wstamp_ns, now_ns); ++ interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC); ++ interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE; ++ inflight_at_edt = inflight_now; ++ if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */ ++ inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */ ++ if (interval_delivered >= inflight_at_edt) ++ return 0; ++ return inflight_at_edt - interval_delivered; ++} ++ ++/* Find the cwnd increment based on estimate of ack aggregation */ ++static u32 bbr_ack_aggregation_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 max_aggr_cwnd, aggr_cwnd = 0; ++ ++ if (bbr->params.extra_acked_gain && ++ (bbr_full_bw_reached(sk) || bbr->params.extra_acked_in_startup)) { ++ max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) ++ / BW_UNIT; ++ aggr_cwnd = (bbr->params.extra_acked_gain * bbr_extra_acked(sk)) ++ >> BBR_SCALE; ++ aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); ++ } ++ ++ return aggr_cwnd; ++} ++ ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->params.probe_rtt_cwnd_gain == 0) ++ return bbr->params.cwnd_min_target; ++ return max_t(u32, bbr->params.cwnd_min_target, ++ bbr_bdp(sk, bbr_bw(sk), bbr->params.probe_rtt_cwnd_gain)); ++} ++ ++/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss ++ * has drawn us down below target), or snap down to target if we're above it. ++ */ ++static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 target_cwnd = 0, prev_cwnd = tp->snd_cwnd, max_probe; ++ ++ if (!acked) ++ goto done; /* no packet fully ACKed; just apply caps */ ++ ++ target_cwnd = bbr_bdp(sk, bw, gain); ++ ++ /* Increment the cwnd to account for excess ACKed data that seems ++ * due to aggregation (of data and/or ACKs) visible in the ACK stream. ++ */ ++ target_cwnd += bbr_ack_aggregation_cwnd(sk); ++ target_cwnd = bbr_quantization_budget(sk, target_cwnd); ++ ++ /* If we're below target cwnd, slow start cwnd toward target cwnd. */ ++ bbr->debug.target_cwnd = target_cwnd; ++ ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } ++ ++ /* When growing cwnd, don't grow beyond twice what we just probed. 
*/ ++ if (bbr->params.usage_based_cwnd) { ++ max_probe = max(2 * tp->max_packets_out, tp->snd_cwnd); ++ cwnd = min(cwnd, max_probe); ++ } ++ ++ cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); ++done: ++ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ ++ if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ ++ tp->snd_cwnd = min_t(u32, tp->snd_cwnd, bbr_probe_rtt_cwnd(sk)); ++ ++ ctx->target_cwnd = target_cwnd; ++ ctx->log = (tp->snd_cwnd != prev_cwnd); ++} ++ ++/* See if we have reached next round trip */ ++static void bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->round_start = 0; ++ ++ /* See if we've reached the next RTT */ ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->round_start = 1; ++ } ++} ++ ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw = 0; ++ ++ /* Divide delivered by the interval to find a (lower bound) bottleneck ++ * bandwidth sample. Delivered is in packets and interval_us in uS and ++ * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. ++ */ ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); ++ } ++ ++ ctx->sample_bw = bw; ++ bbr->debug.rs_bw = bw; ++} ++ ++/* Estimates the windowed max degree of ack aggregation. ++ * This is used to provision extra in-flight data to keep sending during ++ * inter-ACK silences. ++ * ++ * Degree of ack aggregation is estimated as extra data acked beyond expected. ++ * ++ * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval" ++ * cwnd += max_extra_acked ++ * ++ * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). ++ * Max filter is an approximate sliding window of 5-10 (packet timed) round ++ * trips for non-startup phase, and 1-2 round trips for startup. ++ */ ++static void bbr_update_ack_aggregation(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ u32 epoch_us, expected_acked, extra_acked; ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr->params.extra_acked_win_rtts; ++ ++ if (!bbr->params.extra_acked_gain || rs->acked_sacked <= 0 || ++ rs->delivered < 0 || rs->interval_us <= 0) ++ return; ++ ++ if (bbr->round_start) { ++ bbr->extra_acked_win_rtts = min(0x1F, ++ bbr->extra_acked_win_rtts + 1); ++ if (bbr->params.extra_acked_in_startup && ++ !bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? ++ 0 : 1; ++ bbr->extra_acked[bbr->extra_acked_win_idx] = 0; ++ } ++ } ++ ++ /* Compute how many packets we expected to be delivered over epoch. 
*/ ++ epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp, ++ bbr->ack_epoch_mstamp); ++ expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT; ++ ++ /* Reset the aggregation epoch if ACK rate is below expected rate or ++ * significantly large no. of ack received since epoch (potentially ++ * quite old epoch). ++ */ ++ if (bbr->ack_epoch_acked <= expected_acked || ++ (bbr->ack_epoch_acked + rs->acked_sacked >= ++ bbr_ack_epoch_acked_reset_thresh)) { ++ bbr->ack_epoch_acked = 0; ++ bbr->ack_epoch_mstamp = tp->delivered_mstamp; ++ expected_acked = 0; ++ } ++ ++ /* Compute excess data delivered, beyond what was expected. */ ++ bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, ++ bbr->ack_epoch_acked + rs->acked_sacked); ++ extra_acked = bbr->ack_epoch_acked - expected_acked; ++ extra_acked = min(extra_acked, tp->snd_cwnd); ++ if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) ++ bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. ++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh; ++ ++ if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) ++ return; ++ ++ bw_thresh = (u64)bbr->full_bw * bbr->params.full_bw_thresh >> BBR_SCALE; ++ if (bbr_max_bw(sk) >= bw_thresh) { ++ bbr->full_bw = bbr_max_bw(sk); ++ bbr->full_bw_cnt = 0; ++ return; ++ } ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_reached = bbr->full_bw_cnt >= bbr->params.full_bw_cnt; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static bool bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr2_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) ++ return true; /* exiting DRAIN now */ ++ return false; ++} ++ ++static void bbr_check_probe_rtt_done(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (!(bbr->probe_rtt_done_stamp && ++ after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) ++ return; ++ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ ++ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); ++ bbr2_exit_probe_rtt(sk); ++} ++ ++/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and ++ * periodically drain the bottleneck queue, to converge to measure the true ++ * min_rtt (unloaded propagation delay). This allows the flows to keep queues ++ * small (reducing queuing delay and packet loss) and achieve fairness among ++ * BBR flows. 
++ * ++ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, ++ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. ++ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed ++ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and ++ * re-enter the previous mode. BBR uses 200ms to approximately bound the ++ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). ++ * ++ * Note that flows need only pay 2% if they are busy sending over the last 10 ++ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have ++ * natural silences or low-rate periods within 10 seconds where the rate is low ++ * enough for long enough to drain its queue in the bottleneck. We pick up ++ * these min RTT measurements opportunistically with our min_rtt filter. :-) ++ */ ++static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; ++ ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr->params.probe_rtt_win_ms); ++ probe_rtt_expired = after(tcp_jiffies32, expire); ++ if (rs->rtt_us >= 0 && ++ (rs->rtt_us <= bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr->params.min_rtt_win_sec * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; ++ } ++ ++ if (bbr->params.probe_rtt_mode_ms > 0 && probe_rtt_expired && ++ !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { ++ bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ ++ bbr_save_cwnd(sk); /* note cwnd so we can restore it */ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ } ++ ++ if (bbr->mode == BBR_PROBE_RTT) { ++ /* Ignore low rate samples during this mode. */ ++ tp->app_limited = ++ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; ++ /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ ++ if (!bbr->probe_rtt_done_stamp && ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { ++ bbr->probe_rtt_done_stamp = tcp_jiffies32 + ++ msecs_to_jiffies(bbr->params.probe_rtt_mode_ms); ++ bbr->probe_rtt_round_done = 0; ++ bbr->next_rtt_delivered = tp->delivered; ++ } else if (bbr->probe_rtt_done_stamp) { ++ if (bbr->round_start) ++ bbr->probe_rtt_round_done = 1; ++ if (bbr->probe_rtt_round_done) ++ bbr_check_probe_rtt_done(sk); ++ } ++ } ++ /* Restart after idle ends only once we process a new S/ACK for data */ ++ if (rs->delivered > 0) ++ bbr->idle_restart = 0; ++} ++ ++static void bbr_update_gains(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ bbr->pacing_gain = bbr->params.high_gain; ++ bbr->cwnd_gain = bbr->params.startup_cwnd_gain; ++ break; ++ case BBR_DRAIN: ++ bbr->pacing_gain = bbr->params.drain_gain; /* slow, to drain */ ++ bbr->cwnd_gain = bbr->params.startup_cwnd_gain; /* keep cwnd */ ++ break; ++ case BBR_PROBE_BW: ++ bbr->pacing_gain = bbr->params.pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr->params.cwnd_gain; ++ break; ++ case BBR_PROBE_RTT: ++ bbr->pacing_gain = BBR_UNIT; ++ bbr->cwnd_gain = BBR_UNIT; ++ break; ++ default: ++ WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode); ++ break; ++ } ++} ++ ++static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ int i; ++ ++ WARN_ON_ONCE(tp->snd_cwnd >= bbr_cwnd_warn_val); ++ ++ bbr->initialized = 1; ++ bbr->params.high_gain = min(0x7FF, bbr_high_gain); ++ bbr->params.drain_gain = min(0x3FF, bbr_drain_gain); ++ bbr->params.startup_cwnd_gain = min(0x7FF, bbr_startup_cwnd_gain); ++ bbr->params.cwnd_gain = min(0x7FF, bbr_cwnd_gain); ++ bbr->params.cwnd_tso_budget = min(0x1U, bbr_cwnd_tso_budget); ++ bbr->params.cwnd_min_target = min(0xFU, bbr_cwnd_min_target); ++ bbr->params.min_rtt_win_sec = min(0x1FU, bbr_min_rtt_win_sec); ++ bbr->params.probe_rtt_mode_ms = min(0x1FFU, bbr_probe_rtt_mode_ms); ++ bbr->params.full_bw_cnt = min(0x7U, bbr_full_bw_cnt); ++ bbr->params.full_bw_thresh = min(0x3FFU, bbr_full_bw_thresh); ++ bbr->params.extra_acked_gain = min(0x7FF, bbr_extra_acked_gain); ++ bbr->params.extra_acked_win_rtts = min(0x1FU, bbr_extra_acked_win_rtts); ++ bbr->params.drain_to_target = bbr_drain_to_target ? 1 : 0; ++ bbr->params.precise_ece_ack = bbr_precise_ece_ack ? 1 : 0; ++ bbr->params.extra_acked_in_startup = bbr_extra_acked_in_startup ? 1 : 0; ++ bbr->params.probe_rtt_cwnd_gain = min(0xFFU, bbr_probe_rtt_cwnd_gain); ++ bbr->params.probe_rtt_win_ms = ++ min(0x3FFFU, ++ min_t(u32, bbr_probe_rtt_win_ms, ++ bbr->params.min_rtt_win_sec * MSEC_PER_SEC)); ++ for (i = 0; i < CYCLE_LEN; i++) ++ bbr->params.pacing_gain[i] = min(0x3FF, bbr_pacing_gain[i]); ++ bbr->params.usage_based_cwnd = bbr_usage_based_cwnd ? 
1 : 0; ++ bbr->params.tso_rtt_shift = min(0xFU, bbr_tso_rtt_shift); ++ ++ bbr->debug.snd_isn = tp->snd_una; ++ bbr->debug.target_cwnd = 0; ++ bbr->debug.undo = 0; ++ ++ bbr->init_cwnd = min(0x7FU, tp->snd_cwnd); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = 0; ++ bbr->prev_ca_state = TCP_CA_Open; ++ bbr->packet_conservation = 0; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; ++ bbr->full_bw_cnt = 0; ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ bbr->mode = BBR_STARTUP; ++ bbr->debug.rs_bw = 0; ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++} ++ ++static u32 bbr_sndbuf_expand(struct sock *sk) ++{ ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; ++} ++ ++/* __________________________________________________________________________ ++ * ++ * Functions new to BBR v2 ("bbr") congestion control are below here. ++ * __________________________________________________________________________ ++ */ ++ ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr2_take_bw_hi_sample(struct sock *sk, u32 bw) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); ++} ++ ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr2_advance_bw_hi_filter(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} ++ ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr2_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); ++ ++ return min(bdp, tcp_sk(sk)->snd_cwnd); ++} ++ ++static bool bbr2_is_probing_bandwidth(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr2_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr2_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->full_bw_reached = 1; ++ bbr->inflight_hi = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. 
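++ * For example, with the module parameter defaults set later in this file
++ * (ecn_thresh = BBR_UNIT / 2, i.e. a 50% CE mark rate in BBR_SCALE fixed
++ * point, and full_ecn_cnt = 2), an ECN-eligible flow that sees at least
++ * half of its delivered packets CE-marked for two consecutive round
++ * trips exits STARTUP and pins inflight_hi to the current estimated BDP.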
*/ ++static void bbr2_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr->params.full_ecn_cnt || !bbr->params.ecn_thresh) ++ return; ++ ++ if (ce_ratio >= bbr->params.ecn_thresh) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr->params.full_ecn_cnt) { ++ bbr->debug.event = 'E'; /* ECN caused STARTUP exit */ ++ bbr2_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++static void bbr2_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ ++ if (bbr->params.ecn_factor == 0) ++ return; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_ecn_enable && ++ (bbr->min_rtt_us <= bbr->params.ecn_max_rtt_us || ++ !bbr->params.ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ gain = bbr->params.ecn_alpha_gain; ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr2_check_ecn_too_high_in_startup(sk, ce_ratio); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr2_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tp->snd_cwnd / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++ bbr->debug.event = 'G'; /* Grow inflight_hi slope */ ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr2_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tp->snd_cwnd < bbr->inflight_hi) { ++ bbr->bw_probe_up_acks = 0; /* don't accmulate unused credits */ ++ return; /* not fully using inflight_hi, so don't grow it */ ++ } ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->debug.event = 'I'; /* Increment inflight_hi */ ++ } ++ ++ if (bbr->round_start) ++ bbr2_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? 
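++ * With the defaults below (loss_thresh = BBR_UNIT * 2 / 100 = 5, ~2%,
++ * and ecn_thresh = BBR_UNIT / 2, 50%), an skb sent with
++ * tx_in_flight = 100 has a loss budget of (100 * 5) >> BBR_SCALE = 1
++ * packet, so two or more losses out of those 100 count as "too high",
++ * as do 50 or more CE marks out of 100 delivered packets.
++ *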
++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which ++ * uses it to notice when loss/ECN rates suggest inflight is too high. ++ */ ++static bool bbr2_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr->params.loss_thresh >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) ++ return true; ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && bbr->params.ecn_thresh) { ++ ecn_thresh = (u64)rs->delivered * bbr->params.ecn_thresh >> ++ BBR_SCALE; ++ if (rs->delivered_ce >= ecn_thresh) ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr2_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh = bbr->params.loss_thresh; ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (WARN_ONCE(inflight_prev < 0, ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg)) ++ return ~0U; ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ON_ONCE(lost_prev < 0)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. ++ */ ++static u32 bbr2_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr->params.inflight_headroom; ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr->params.cwnd_min_target); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). 
++ */ ++static void bbr2_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr2_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr->params.cwnd_min_target); ++ tp->snd_cwnd = min(cap, tp->snd_cwnd); ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. ++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr2_adapt_lower_bounds(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut, ecn_inflight_lo, beta; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr2_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && bbr->ecn_eligible && bbr->params.ecn_factor) { ++ /* Reduce inflight to (1 - alpha*ecn_factor). */ ++ ecn_cut = (BBR_UNIT - ++ ((bbr->ecn_alpha * bbr->params.ecn_factor) >> ++ BBR_SCALE)); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tp->snd_cwnd; ++ ecn_inflight_lo = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++ } else { ++ ecn_inflight_lo = ~0U; ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ /* Reduce bw and inflight to (1 - beta). */ ++ if (bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tp->snd_cwnd; ++ beta = bbr->params.beta; ++ bbr->bw_lo = ++ max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ bbr->inflight_lo = ++ max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss or ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. 
++ */ ++static void bbr2_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr2_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. ++ */ ++static void bbr2_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr2_take_bw_hi_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ /* Update rate and volume of delivered data from latest round trip: */ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (before(rs->prior_delivered, bbr->loss_round_delivered)) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr->loss_round_delivered = tp->delivered; /* mark round trip */ ++ bbr->loss_round_start = 1; ++ bbr2_adapt_lower_bounds(sk); ++ ++ /* Update windowed "latest" (single-round-trip) filters. */ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr2_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 inflight, rounds, reno_gain, reno_rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = bbr->params.bw_probe_max_rounds; ++ ++ reno_gain = bbr->params.bw_probe_reno_gain; ++ if (reno_gain) { ++ inflight = bbr2_target_inflight(sk); ++ reno_rounds = ((u64)inflight * reno_gain) >> BBR_SCALE; ++ rounds = min(rounds, reno_rounds); ++ } ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. 
++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr2_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr->params.bw_probe_rand_rounds); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr->params.bw_probe_base_us + ++ get_random_u32_below(bbr->params.bw_probe_rand_us); ++} ++ ++static void bbr2_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. ++ */ ++static void bbr2_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr2_reset_lower_bounds(sk); ++ if (bbr->inflight_hi != ~0U) ++ bbr->inflight_hi += bbr->params.refill_add_inc; ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr2_start_bw_probe_up(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr2_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. 
++ */ ++static void bbr2_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr2_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr2_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr2_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr2_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr->params.beta; ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ bbr->debug.event = 'L'; /* Loss/ECN too high */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr2_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr2_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. ++ */ ++static bool bbr2_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr2_advance_bw_hi_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. 
++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr->debug.event = 'R'; /* reprobe */ ++ bbr2_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ ++ if (bbr2_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr2_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ if (bbr->inflight_hi == ~0U) /* no excess queue signals yet? */ ++ return false; ++ ++ /* To be resilient to random loss, we must raise inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ bbr->debug.event = 'U'; /* raise up inflight_hi */ ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr2_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr2_check_time_to_probe_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case a burst of cross-traffic has ceased and freed up bw, ++ * or in case we are sharing with multiplicatively probing traffic). ++ */ ++ if (bbr->params.ecn_reprobe_gain && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ bbr->debug.event = 'A'; /* *A*ll clear to probe *A*gain */ ++ /* Calculate n so that when bbr2_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr->params.ecn_reprobe_gain) >> BBR_SCALE)); ++ bbr2_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr2_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr2_is_reno_coexistence_probe_time(sk)) { ++ bbr2_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr2_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_under_bdp, is_long_enough; ++ ++ /* Always need to pull inflight down to leave headroom in queue. */ ++ if (inflight > bbr2_inflight_with_headroom(sk)) ++ return false; ++ ++ is_under_bdp = inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++ if (bbr->params.drain_to_target) ++ return is_under_bdp; ++ ++ is_long_enough = bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us); ++ return is_under_bdp || is_long_enough; ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr2_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_risky = false, is_queuing = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. 
*/ ++ if (bbr2_adapt_upper_bounds(sk, rs)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr2_check_time_to_probe_bw(sk)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr2_start_bw_probe_up(sk); ++ } ++ break; ++ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (checked here) ++ * (2) We have probed for at least 1*min_rtt_us, and the ++ * estimated queue is high enough (inflight > 1.25 * estimated_bdp). ++ * (checked here) ++ * (3) Loss filter says loss rate is "too high". ++ * (checked in bbr_is_inflight_too_high()) ++ * (4) ECN filter says ECN mark rate is "too high". ++ * (checked in bbr_is_inflight_too_high()) ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_risky = true; ++ bbr->debug.event = 'D'; /* D for danger */ ++ } else if (bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us) && ++ inflight >= ++ bbr_inflight(sk, bw, ++ bbr->params.bw_probe_pif_gain)) { ++ is_queuing = true; ++ bbr->debug.event = 'Q'; /* building Queue */ ++ } ++ if (is_risky || is_queuing) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr2_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. 
++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr2_check_time_to_probe_bw(sk)) ++ return; /* already decided state transition */ ++ if (bbr2_check_time_to_cruise(sk, inflight, bw)) ++ bbr2_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr2_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr2_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr2_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr2_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr2_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr->params.full_loss_cnt && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr->params.full_loss_cnt && ++ bbr2_is_inflight_too_high(sk, rs)) { ++ bbr->debug.event = 'P'; /* Packet loss caused STARTUP exit */ ++ bbr2_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* If we are done draining, advance into steady state operation in PROBE_BW. */ ++static void bbr2_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_check_drain(sk, rs, ctx)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr2_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr2_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr2_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr2_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs); ++ bbr2_check_drain(sk, rs, ctx); ++ bbr2_update_cycle_phase(sk, rs); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. 
++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. ++ */ ++static bool bbr2_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr->params.fast_path && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr2_check_drain(sk, rs, ctx); ++ bbr2_update_cycle_phase(sk, rs); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) ++ return true; ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++static void bbr2_main(struct sock *sk, const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw; ++ ++ bbr->debug.event = '.'; /* init to default NOP (no event yet) */ ++ ++ bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ bbr2_update_ecn_alpha(sk); ++ } ++ ++ bbr->ecn_in_round |= rs->is_ece; ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ ++ if (bbr2_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr2_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tp->snd_cwnd, &ctx); ++ bbr2_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++ ++ bbr_debug(sk, rs->acked_sacked, rs, &ctx); ++} ++ ++/* Module parameters that are settable by TCP_CONGESTION_PARAMS are declared ++ * down here, so that the algorithm functions that use the parameters must use ++ * the per-socket parameters; if they accidentally use the global version ++ * then there will be a compile error. ++ * TODO(ncardwell): move all per-socket parameters down to this section. ++ */ ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. Max allwed value is 255. ++ */ ++static u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE. ++ * Max allowed value is 255. ++ */ ++static u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; /* 1/16 = 6.25% */ ++ ++/* The initial value for the ecn_alpha state variable. Default and max ++ * BBR_UNIT (256), representing 1.0. This allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. 
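++ * On each round trip with newly delivered data, bbr2_update_ecn_alpha()
++ * moves ecn_alpha toward the observed CE ratio with gain ecn_alpha_gain
++ * (1/16 by default): ecn_alpha = (15/16) * ecn_alpha + (1/16) * ce_ratio,
++ * all in BBR_SCALE fixed point; when not probing for bandwidth,
++ * inflight_lo is then reduced to (1 - ecn_factor * ecn_alpha) of its value.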
++ */ ++static u32 bbr_ecn_alpha_init = BBR_UNIT; /* 1.0, to respond quickly */ ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. Max allwed value is 255. ++ */ ++static u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. Max allowed is 255. ++ */ ++static u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ * Max allowed for the parameter is 524287 (0x7ffff) us, ~524 ms. ++ */ ++static u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then use a multiplicative increase to quickly reprobe bw by ++ * starting inflight probing at the given multiple of inflight_hi. ++ * Default for this experimental knob is 0 (disabled). ++ * Planned value for experiments: BBR_UNIT * 1 / 2 = 128, representing 0.5. ++ */ ++static u32 bbr_ecn_reprobe_gain; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. Max allowed value is 15 (0xF). ++ */ ++static u32 bbr_full_loss_cnt = 8; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. Max allowed value is 3. ++ */ ++static u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* Multiplier to get target inflight (as multiple of BDP) for PROBE_UP phase. ++ * Default is 1.25x, as in BBR v1. Max allowed is 511. ++ */ ++static u32 bbr_bw_probe_pif_gain = BBR_UNIT * 5 / 4; ++ ++/* Multiplier to get Reno-style probe epoch duration as: k * BDP round trips. ++ * If zero, disables this BBR v2 Reno-style BDP-scaled coexistence mechanism. ++ * Max allowed is 511. ++ */ ++static u32 bbr_bw_probe_reno_gain = BBR_UNIT; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ * Max value is 15. ++ */ ++static u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Undo the model changes made in loss recovery if recovery was spurious? */ ++static bool bbr_undo = true; ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? 
*/ ++static bool bbr_fast_path = true; /* default: enabled */ ++ ++/* Use fast ack mode ? */ ++static int bbr_fast_ack_mode = 1; /* default: rwnd check off */ ++ ++/* How much to additively increase inflight_hi when entering REFILL? */ ++static u32 bbr_refill_add_inc; /* default: disabled */ ++ ++module_param_named(beta, bbr_beta, uint, 0644); ++module_param_named(ecn_alpha_gain, bbr_ecn_alpha_gain, uint, 0644); ++module_param_named(ecn_alpha_init, bbr_ecn_alpha_init, uint, 0644); ++module_param_named(ecn_factor, bbr_ecn_factor, uint, 0644); ++module_param_named(ecn_thresh, bbr_ecn_thresh, uint, 0644); ++module_param_named(ecn_max_rtt_us, bbr_ecn_max_rtt_us, uint, 0644); ++module_param_named(ecn_reprobe_gain, bbr_ecn_reprobe_gain, uint, 0644); ++module_param_named(loss_thresh, bbr_loss_thresh, uint, 0664); ++module_param_named(full_loss_cnt, bbr_full_loss_cnt, uint, 0664); ++module_param_named(full_ecn_cnt, bbr_full_ecn_cnt, uint, 0664); ++module_param_named(inflight_headroom, bbr_inflight_headroom, uint, 0664); ++module_param_named(bw_probe_pif_gain, bbr_bw_probe_pif_gain, uint, 0664); ++module_param_named(bw_probe_reno_gain, bbr_bw_probe_reno_gain, uint, 0664); ++module_param_named(bw_probe_max_rounds, bbr_bw_probe_max_rounds, uint, 0664); ++module_param_named(bw_probe_rand_rounds, bbr_bw_probe_rand_rounds, uint, 0664); ++module_param_named(bw_probe_base_us, bbr_bw_probe_base_us, uint, 0664); ++module_param_named(bw_probe_rand_us, bbr_bw_probe_rand_us, uint, 0664); ++module_param_named(undo, bbr_undo, bool, 0664); ++module_param_named(fast_path, bbr_fast_path, bool, 0664); ++module_param_named(fast_ack_mode, bbr_fast_ack_mode, uint, 0664); ++module_param_named(refill_add_inc, bbr_refill_add_inc, uint, 0664); ++ ++static void bbr2_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_init(sk); /* run shared init code for v1 and v2 */ ++ ++ /* BBR v2 parameters: */ ++ bbr->params.beta = min_t(u32, 0xFFU, bbr_beta); ++ bbr->params.ecn_alpha_gain = min_t(u32, 0xFFU, bbr_ecn_alpha_gain); ++ bbr->params.ecn_alpha_init = min_t(u32, BBR_UNIT, bbr_ecn_alpha_init); ++ bbr->params.ecn_factor = min_t(u32, 0xFFU, bbr_ecn_factor); ++ bbr->params.ecn_thresh = min_t(u32, 0xFFU, bbr_ecn_thresh); ++ bbr->params.ecn_max_rtt_us = min_t(u32, 0x7ffffU, bbr_ecn_max_rtt_us); ++ bbr->params.ecn_reprobe_gain = min_t(u32, 0x1FF, bbr_ecn_reprobe_gain); ++ bbr->params.loss_thresh = min_t(u32, 0xFFU, bbr_loss_thresh); ++ bbr->params.full_loss_cnt = min_t(u32, 0xFU, bbr_full_loss_cnt); ++ bbr->params.full_ecn_cnt = min_t(u32, 0x3U, bbr_full_ecn_cnt); ++ bbr->params.inflight_headroom = ++ min_t(u32, 0xFFU, bbr_inflight_headroom); ++ bbr->params.bw_probe_pif_gain = ++ min_t(u32, 0x1FFU, bbr_bw_probe_pif_gain); ++ bbr->params.bw_probe_reno_gain = ++ min_t(u32, 0x1FFU, bbr_bw_probe_reno_gain); ++ bbr->params.bw_probe_max_rounds = ++ min_t(u32, 0xFFU, bbr_bw_probe_max_rounds); ++ bbr->params.bw_probe_rand_rounds = ++ min_t(u32, 0xFU, bbr_bw_probe_rand_rounds); ++ bbr->params.bw_probe_base_us = ++ min_t(u32, (1 << 26) - 1, bbr_bw_probe_base_us); ++ bbr->params.bw_probe_rand_us = ++ min_t(u32, (1 << 26) - 1, bbr_bw_probe_rand_us); ++ bbr->params.undo = bbr_undo; ++ bbr->params.fast_path = bbr_fast_path ? 
1 : 0; ++ bbr->params.refill_add_inc = min_t(u32, 0x3U, bbr_refill_add_inc); ++ ++ /* BBR v2 state: */ ++ bbr->initialized = 1; ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 0; ++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr2_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr->params.ecn_alpha_init; ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ ++ tp->fast_ack_mode = min_t(u32, 0x2U, bbr_fast_ack_mode); ++ ++ if ((tp->ecn_flags & TCP_ECN_OK) && bbr_ecn_enable) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++static void bbr2_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs; ++ ++ /* Capture "current" data over the full round trip of loss, ++ * to have a better chance to see the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ memset(&rs, 0, sizeof(rs)); ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr2_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr2_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr2_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. */ ++static u32 bbr2_undo_cwnd(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->debug.undo = 1; ++ bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ bbr->full_bw_cnt = 0; ++ bbr->loss_in_round = 0; ++ ++ if (!bbr->params.undo) ++ return tp->snd_cwnd; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ ++static u32 bbr2_ssthresh(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. 
*/ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; ++ return tcp_sk(sk)->snd_ssthresh; ++} ++ ++static enum tcp_bbr2_phase bbr2_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR2_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR2_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR2_PHASE_PROBE_RTT; ++ default: ++ return BBR2_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR2_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR2_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR2_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR2_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR2_PHASE_INVALID; ++ } ++} ++ ++static size_t bbr2_get_info(struct sock *sk, u32 ext, int *attr, ++ union tcp_cc_info *info) ++{ ++ if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || ++ ext & (1 << (INET_DIAG_VEGASINFO - 1))) { ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? ++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ ++ memset(&info->bbr2, 0, sizeof(info->bbr2)); ++ info->bbr2.bbr_bw_lsb = (u32)bw; ++ info->bbr2.bbr_bw_msb = (u32)(bw >> 32); ++ info->bbr2.bbr_min_rtt = bbr->min_rtt_us; ++ info->bbr2.bbr_pacing_gain = bbr->pacing_gain; ++ info->bbr2.bbr_cwnd_gain = bbr->cwnd_gain; ++ info->bbr2.bbr_bw_hi_lsb = (u32)bw_hi; ++ info->bbr2.bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ info->bbr2.bbr_bw_lo_lsb = (u32)bw_lo; ++ info->bbr2.bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ info->bbr2.bbr_mode = bbr->mode; ++ info->bbr2.bbr_phase = (__u8)bbr2_get_phase(bbr); ++ info->bbr2.bbr_version = (__u8)2; ++ info->bbr2.bbr_inflight_lo = bbr->inflight_lo; ++ info->bbr2.bbr_inflight_hi = bbr->inflight_hi; ++ info->bbr2.bbr_extra_acked = bbr_extra_acked(sk); ++ *attr = INET_DIAG_BBRINFO; ++ return sizeof(info->bbr2); ++ } ++ return 0; ++} ++ ++static void bbr2_set_state(struct sock *sk, u8 new_state) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (new_state == TCP_CA_Loss) { ++ struct rate_sample rs = { .losses = 1 }; ++ struct bbr_context ctx = { 0 }; ++ ++ bbr->prev_ca_state = TCP_CA_Loss; ++ bbr->full_bw = 0; ++ if (!bbr2_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tp->snd_cwnd, bbr->prior_cwnd); ++ } ++ bbr_debug(sk, 0, &rs, &ctx); ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++ } ++} ++ ++static struct tcp_congestion_ops tcp_bbr2_cong_ops __read_mostly = { ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, ++ .name = "bbr2", ++ .owner = THIS_MODULE, ++ .init = bbr2_init, ++ .cong_control = bbr2_main, ++ .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr2_skb_marked_lost, ++ .undo_cwnd = bbr2_undo_cwnd, ++ .cwnd_event = bbr_cwnd_event, ++ .ssthresh = bbr2_ssthresh, ++ .tso_segs = bbr_tso_segs, ++ .get_info = bbr2_get_info, ++ .set_state = bbr2_set_state, ++}; ++ ++static int __init bbr_register(void) ++{ ++ BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); ++ return tcp_register_congestion_control(&tcp_bbr2_cong_ops); ++} ++ ++static 
void __exit bbr_unregister(void) ++{ ++ tcp_unregister_congestion_control(&tcp_bbr2_cong_ops); ++} ++ ++module_init(bbr_register); ++module_exit(bbr_unregister); ++ ++MODULE_AUTHOR("Van Jacobson "); ++MODULE_AUTHOR("Neal Cardwell "); ++MODULE_AUTHOR("Yuchung Cheng "); ++MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++ ++MODULE_LICENSE("Dual BSD/GPL"); ++MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index d3cae40749e8..0f268f2ff2e9 100644 +--- a/net/ipv4/tcp_cong.c ++++ b/net/ipv4/tcp_cong.c +@@ -189,6 +189,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index cc072d2cfcd8..754e0212c951 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -349,7 +349,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -360,7 +360,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1079,7 +1079,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1460,6 +1465,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3813,6 +3829,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. 
+@@ -3911,6 +3928,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -5521,13 +5539,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... */ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 71d01cf3c13e..0da3da9e56db 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -375,7 +375,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1530,7 +1531,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int nsize, old_factor; ++ int nsize, old_factor, inflight_prev; + long limit; + int nlen; + u8 flags; +@@ -1607,6 +1608,15 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ /* Set buff tx.in_flight as if buff were sent by itself. */ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (WARN_ONCE(inflight_prev < 0, ++ "inconsistent: tx.in_flight: %u old_factor: %d", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor)) ++ inflight_prev = 0; ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -1990,13 +2000,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs)); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2632,6 +2641,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..a8b4c9504570 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; + rs->last_end_seq = scb->end_seq; ++ rs->tx_in_flight = scb->tx.in_flight; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. 
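The tcp_rate.c hunks above switch the rate-sampling interval math to 32-bit timestamp deltas (the tcp_stamp32_us_delta call sites replacing tcp_stamp_us_delta). The self-contained sketch below models one plausible wrap-safe, zero-saturating 32-bit delta of the kind such a helper needs; it is an illustrative userspace model under a two's-complement assumption, not the kernel helper itself.

#include <stdint.h>
#include <stdio.h>

/* Sketch: subtract in unsigned 32-bit space (so clock wraparound cancels
 * out), then reinterpret as signed and clamp negative results to zero.
 * Assumes two's-complement representation, as on all Linux targets. */
static uint32_t stamp32_us_delta(uint32_t t1, uint32_t t0)
{
    int32_t d = (int32_t)(t1 - t0);

    return d > 0 ? (uint32_t)d : 0;
}

int main(void)
{
    uint32_t t0 = 0xFFFFFFF0u;  /* just before the 32-bit microsecond clock wraps */
    uint32_t t1 = 0x00000010u;  /* just after the wrap */

    printf("delta across wrap: %u us\n", stamp32_us_delta(t1, t0)); /* 32 */
    printf("reordered stamps:  %u us\n", stamp32_us_delta(t0, t1)); /* 0  */
    return 0;
}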
+ */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index cb79127f45c3..70e4de876a7f 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -605,6 +605,7 @@ void tcp_write_timer_handler(struct sock *sk) + return; + } + ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.39.2 + +From 4b786f8ae226132e5faf03acd49e1ea6ae5e8d9a Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 13 Feb 2023 09:23:53 +0100 +Subject: [PATCH 02/15] bfq + +Signed-off-by: Peter Jung +--- + block/bfq-cgroup.c | 101 ++++--- + block/bfq-iosched.c | 629 ++++++++++++++++++++++++++++-------------- + block/bfq-iosched.h | 144 +++++++--- + block/bfq-wf2q.c | 2 +- + block/blk-cgroup.c | 122 ++++---- + block/blk-cgroup.h | 10 +- + block/blk-iocost.c | 58 ++-- + block/blk-iolatency.c | 39 ++- + block/blk-rq-qos.h | 2 +- + block/blk-throttle.c | 16 +- + block/blk.h | 6 - + 11 files changed, 743 insertions(+), 386 deletions(-) + +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c +index 0fbde0fc0628..59929dfd559b 100644 +--- a/block/bfq-cgroup.c ++++ b/block/bfq-cgroup.c +@@ -706,12 +706,52 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bfq_activate_bfqq(bfqd, bfqq); + } + +- if (!bfqd->in_service_queue && !bfqd->rq_in_driver) ++ if (!bfqd->in_service_queue && !bfqd->tot_rq_in_driver) + bfq_schedule_dispatch(bfqd); + /* release extra ref taken above, bfqq may happen to be freed now */ + bfq_put_queue(bfqq); + } + ++static void bfq_sync_bfqq_move(struct bfq_data *bfqd, ++ struct bfq_queue *sync_bfqq, ++ struct bfq_io_cq *bic, ++ struct bfq_group *bfqg, ++ unsigned int act_idx) ++{ ++ struct bfq_queue *bfqq; ++ ++ if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { ++ /* We are the only user of this bfqq, just move it */ ++ if (sync_bfqq->entity.sched_data != &bfqg->sched_data) ++ bfq_bfqq_move(bfqd, sync_bfqq, bfqg); ++ return; ++ } ++ ++ /* ++ * The queue was merged to a different queue. Check ++ * that the merge chain still belongs to the same ++ * cgroup. ++ */ ++ for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) ++ if (bfqq->entity.sched_data != &bfqg->sched_data) ++ break; ++ if (bfqq) { ++ /* ++ * Some queue changed cgroup so the merge is not valid ++ * anymore. We cannot easily just cancel the merge (by ++ * clearing new_bfqq) as there may be other processes ++ * using this queue and holding refs to all queues ++ * below sync_bfqq->new_bfqq. Similarly if the merge ++ * already happened, we need to detach from bfqq now ++ * so that we cannot merge bio to a request from the ++ * old cgroup. ++ */ ++ bfq_put_cooperator(sync_bfqq); ++ bic_set_bfqq(bic, NULL, true, act_idx); ++ bfq_release_process_ref(bfqd, sync_bfqq); ++ } ++} ++ + /** + * __bfq_bic_change_cgroup - move @bic to @bfqg. + * @bfqd: the queue descriptor. 
+@@ -726,53 +766,20 @@ static void __bfq_bic_change_cgroup(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct bfq_group *bfqg) + { +- struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false); +- struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true); +- struct bfq_entity *entity; ++ unsigned int act_idx; + +- if (async_bfqq) { +- entity = &async_bfqq->entity; ++ for (act_idx = 0; act_idx < bfqd->num_actuators; act_idx++) { ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false, act_idx); ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true, act_idx); + +- if (entity->sched_data != &bfqg->sched_data) { +- bic_set_bfqq(bic, NULL, false); ++ if (async_bfqq && ++ async_bfqq->entity.sched_data != &bfqg->sched_data) { ++ bic_set_bfqq(bic, NULL, false, act_idx); + bfq_release_process_ref(bfqd, async_bfqq); + } +- } + +- if (sync_bfqq) { +- if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { +- /* We are the only user of this bfqq, just move it */ +- if (sync_bfqq->entity.sched_data != &bfqg->sched_data) +- bfq_bfqq_move(bfqd, sync_bfqq, bfqg); +- } else { +- struct bfq_queue *bfqq; +- +- /* +- * The queue was merged to a different queue. Check +- * that the merge chain still belongs to the same +- * cgroup. +- */ +- for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) +- if (bfqq->entity.sched_data != +- &bfqg->sched_data) +- break; +- if (bfqq) { +- /* +- * Some queue changed cgroup so the merge is +- * not valid anymore. We cannot easily just +- * cancel the merge (by clearing new_bfqq) as +- * there may be other processes using this +- * queue and holding refs to all queues below +- * sync_bfqq->new_bfqq. Similarly if the merge +- * already happened, we need to detach from +- * bfqq now so that we cannot merge bio to a +- * request from the old cgroup. 
+- */ +- bfq_put_cooperator(sync_bfqq); +- bic_set_bfqq(bic, NULL, true); +- bfq_release_process_ref(bfqd, sync_bfqq); +- } +- } ++ if (sync_bfqq) ++ bfq_sync_bfqq_move(bfqd, sync_bfqq, bic, bfqg, act_idx); + } + } + +@@ -1106,9 +1113,11 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, + struct bfq_group *bfqg; + u64 v; + +- ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, buf, &ctx); ++ blkg_conf_init(&ctx, buf); ++ ++ ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, &ctx); + if (ret) +- return ret; ++ goto out; + + if (sscanf(ctx.body, "%llu", &v) == 1) { + /* require "default" on dfl */ +@@ -1130,7 +1139,7 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, + ret = 0; + } + out: +- blkg_conf_finish(&ctx); ++ blkg_conf_exit(&ctx); + return ret ?: nbytes; + } + +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c +index 380e9bda2e57..c330ff5fde4c 100644 +--- a/block/bfq-iosched.c ++++ b/block/bfq-iosched.c +@@ -377,20 +377,23 @@ static const unsigned long bfq_late_stable_merging = 600; + #define RQ_BIC(rq) ((struct bfq_io_cq *)((rq)->elv.priv[0])) + #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) + +-struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) ++struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, ++ unsigned int actuator_idx) + { +- return bic->bfqq[is_sync]; ++ if (is_sync) ++ return bic->bfqq[1][actuator_idx]; ++ ++ return bic->bfqq[0][actuator_idx]; + } + + static void bfq_put_stable_ref(struct bfq_queue *bfqq); + +-void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) ++void bic_set_bfqq(struct bfq_io_cq *bic, ++ struct bfq_queue *bfqq, ++ bool is_sync, ++ unsigned int actuator_idx) + { +- struct bfq_queue *old_bfqq = bic->bfqq[is_sync]; +- +- /* Clear bic pointer if bfqq is detached from this bic */ +- if (old_bfqq && old_bfqq->bic == bic) +- old_bfqq->bic = NULL; ++ struct bfq_queue *old_bfqq = bic->bfqq[is_sync][actuator_idx]; + + /* + * If bfqq != NULL, then a non-stable queue merge between +@@ -405,9 +408,18 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) + * we cancel the stable merge if + * bic->stable_merge_bfqq == bfqq. + */ +- bic->bfqq[is_sync] = bfqq; ++ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[actuator_idx]; ++ ++ /* Clear bic pointer if bfqq is detached from this bic */ ++ if (old_bfqq && old_bfqq->bic == bic) ++ old_bfqq->bic = NULL; + +- if (bfqq && bic->stable_merge_bfqq == bfqq) { ++ if (is_sync) ++ bic->bfqq[1][actuator_idx] = bfqq; ++ else ++ bic->bfqq[0][actuator_idx] = bfqq; ++ ++ if (bfqq && bfqq_data->stable_merge_bfqq == bfqq) { + /* + * Actually, these same instructions are executed also + * in bfq_setup_cooperator, in case of abort or actual +@@ -416,9 +428,9 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) + * did so, we would nest even more complexity in this + * function. + */ +- bfq_put_stable_ref(bic->stable_merge_bfqq); ++ bfq_put_stable_ref(bfqq_data->stable_merge_bfqq); + +- bic->stable_merge_bfqq = NULL; ++ bfqq_data->stable_merge_bfqq = NULL; + } + } + +@@ -678,9 +690,9 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) + { + struct bfq_data *bfqd = data->q->elevator->elevator_data; + struct bfq_io_cq *bic = bfq_bic_lookup(data->q); +- struct bfq_queue *bfqq = bic ? 
bic_to_bfqq(bic, op_is_sync(opf)) : NULL; + int depth; + unsigned limit = data->q->nr_requests; ++ unsigned int act_idx; + + /* Sync reads have full depth available */ + if (op_is_sync(opf) && !op_is_write(opf)) { +@@ -690,14 +702,21 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) + limit = (limit * depth) >> bfqd->full_depth_shift; + } + +- /* +- * Does queue (or any parent entity) exceed number of requests that +- * should be available to it? Heavily limit depth so that it cannot +- * consume more available requests and thus starve other entities. +- */ +- if (bfqq && bfqq_request_over_limit(bfqq, limit)) +- depth = 1; ++ for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { ++ struct bfq_queue *bfqq = ++ bic_to_bfqq(bic, op_is_sync(opf), act_idx); + ++ /* ++ * Does queue (or any parent entity) exceed number of ++ * requests that should be available to it? Heavily ++ * limit depth so that it cannot consume more ++ * available requests and thus starve other entities. ++ */ ++ if (bfqq && bfqq_request_over_limit(bfqq, limit)) { ++ depth = 1; ++ break; ++ } ++ } + bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", + __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth); + if (depth) +@@ -1074,9 +1093,6 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) + { + u64 dur; + +- if (bfqd->bfq_wr_max_time > 0) +- return bfqd->bfq_wr_max_time; +- + dur = bfqd->rate_dur_prod; + do_div(dur, bfqd->peak_rate); + +@@ -1118,36 +1134,39 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, + { + unsigned int old_wr_coeff = 1; + bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); ++ unsigned int a_idx = bfqq->actuator_idx; ++ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; + +- if (bic->saved_has_short_ttime) ++ if (bfqq_data->saved_has_short_ttime) + bfq_mark_bfqq_has_short_ttime(bfqq); + else + bfq_clear_bfqq_has_short_ttime(bfqq); + +- if (bic->saved_IO_bound) ++ if (bfqq_data->saved_IO_bound) + bfq_mark_bfqq_IO_bound(bfqq); + else + bfq_clear_bfqq_IO_bound(bfqq); + +- bfqq->last_serv_time_ns = bic->saved_last_serv_time_ns; +- bfqq->inject_limit = bic->saved_inject_limit; +- bfqq->decrease_time_jif = bic->saved_decrease_time_jif; ++ bfqq->last_serv_time_ns = bfqq_data->saved_last_serv_time_ns; ++ bfqq->inject_limit = bfqq_data->saved_inject_limit; ++ bfqq->decrease_time_jif = bfqq_data->saved_decrease_time_jif; + +- bfqq->entity.new_weight = bic->saved_weight; +- bfqq->ttime = bic->saved_ttime; +- bfqq->io_start_time = bic->saved_io_start_time; +- bfqq->tot_idle_time = bic->saved_tot_idle_time; ++ bfqq->entity.new_weight = bfqq_data->saved_weight; ++ bfqq->ttime = bfqq_data->saved_ttime; ++ bfqq->io_start_time = bfqq_data->saved_io_start_time; ++ bfqq->tot_idle_time = bfqq_data->saved_tot_idle_time; + /* + * Restore weight coefficient only if low_latency is on + */ + if (bfqd->low_latency) { + old_wr_coeff = bfqq->wr_coeff; +- bfqq->wr_coeff = bic->saved_wr_coeff; ++ bfqq->wr_coeff = bfqq_data->saved_wr_coeff; + } +- bfqq->service_from_wr = bic->saved_service_from_wr; +- bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; +- bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; +- bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; ++ bfqq->service_from_wr = bfqq_data->saved_service_from_wr; ++ bfqq->wr_start_at_switch_to_srt = ++ bfqq_data->saved_wr_start_at_switch_to_srt; ++ bfqq->last_wr_start_finish = bfqq_data->saved_last_wr_start_finish; ++ bfqq->wr_cur_max_time = 
bfqq_data->saved_wr_cur_max_time; + + if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || + time_is_before_jiffies(bfqq->last_wr_start_finish + +@@ -1766,6 +1785,33 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, + return bfqq_weight > in_serv_weight; + } + ++/* ++ * Get the index of the actuator that will serve bio. ++ */ ++static unsigned int bfq_actuator_index(struct bfq_data *bfqd, struct bio *bio) ++{ ++ unsigned int i; ++ sector_t end; ++ ++ /* no search needed if one or zero ranges present */ ++ if (bfqd->num_actuators == 1) ++ return 0; ++ ++ /* bio_end_sector(bio) gives the sector after the last one */ ++ end = bio_end_sector(bio) - 1; ++ ++ for (i = 0; i < bfqd->num_actuators; i++) { ++ if (end >= bfqd->sector[i] && ++ end < bfqd->sector[i] + bfqd->nr_sectors[i]) ++ return i; ++ } ++ ++ WARN_ONCE(true, ++ "bfq_actuator_index: bio sector out of ranges: end=%llu\n", ++ end); ++ return 0; ++} ++ + static bool bfq_better_to_idle(struct bfq_queue *bfqq); + + static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, +@@ -1785,7 +1831,9 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, + arrived_in_time = ktime_get_ns() <= + bfqq->ttime.last_end_request + + bfqd->bfq_slice_idle * 3; +- ++ unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio); ++ bool bfqq_non_merged_or_stably_merged = ++ bfqq->bic || RQ_BIC(rq)->bfqq_data[act_idx].stably_merged; + + /* + * bfqq deserves to be weight-raised if: +@@ -1819,9 +1867,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, + */ + wr_or_deserves_wr = bfqd->low_latency && + (bfqq->wr_coeff > 1 || +- (bfq_bfqq_sync(bfqq) && +- (bfqq->bic || RQ_BIC(rq)->stably_merged) && +- (*interactive || soft_rt))); ++ (bfq_bfqq_sync(bfqq) && bfqq_non_merged_or_stably_merged && ++ (*interactive || soft_rt))); + + /* + * Using the last flag, update budget and check whether bfqq +@@ -2098,7 +2145,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, + * We reset waker detection logic also if too much time has passed + * since the first detection. If wakeups are rare, pointless idling + * doesn't hurt throughput that much. The condition below makes sure +- * we do not uselessly idle blocking waker in more than 1/64 cases. ++ * we do not uselessly idle blocking waker in more than 1/64 cases. + */ + if (bfqd->last_completed_rq_bfqq != + bfqq->tentative_waker_bfqq || +@@ -2209,9 +2256,9 @@ static void bfq_add_request(struct request *rq) + * elapsed. + */ + if (bfqq == bfqd->in_service_queue && +- (bfqd->rq_in_driver == 0 || ++ (bfqd->tot_rq_in_driver == 0 || + (bfqq->last_serv_time_ns > 0 && +- bfqd->rqs_injected && bfqd->rq_in_driver > 0)) && ++ bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) && + time_is_before_eq_jiffies(bfqq->decrease_time_jif + + msecs_to_jiffies(10))) { + bfqd->last_empty_occupied_ns = ktime_get_ns(); +@@ -2235,7 +2282,7 @@ static void bfq_add_request(struct request *rq) + * will be set in case injection is performed + * on bfqq before rq is completed). 
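bfq_actuator_index() in the hunk above resolves which actuator serves a bio by testing the bio's last sector against per-actuator [sector, sector + nr_sectors) ranges, with a fast path for single-actuator drives and a fall-back to actuator 0 for out-of-range I/O. The sketch below replays that lookup in plain userspace C; the actuator_map struct, the MAX_ACTUATORS bound and the sample geometry are simplifications made up for the example.

#include <stdio.h>

typedef unsigned long long sector_t;

#define MAX_ACTUATORS 8  /* assumed static bound, mirroring the patch's array size */

struct actuator_map {
    unsigned int num;
    sector_t start[MAX_ACTUATORS];
    sector_t len[MAX_ACTUATORS];
};

/* Pick the actuator whose LBA range contains the last sector of the request. */
static unsigned int actuator_index(const struct actuator_map *m, sector_t last_sector)
{
    unsigned int i;

    if (m->num <= 1)
        return 0;  /* single-actuator drives skip the search */

    for (i = 0; i < m->num; i++)
        if (last_sector >= m->start[i] &&
            last_sector < m->start[i] + m->len[i])
            return i;

    return 0;  /* out-of-range I/O falls back to actuator 0 */
}

int main(void)
{
    struct actuator_map m = {
        .num   = 2,
        .start = { 0, 1000000 },
        .len   = { 1000000, 1000000 },
    };

    printf("sector  999999 -> actuator %u\n", actuator_index(&m, 999999));
    printf("sector 1500000 -> actuator %u\n", actuator_index(&m, 1500000));
    return 0;
}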
+ */ +- if (bfqd->rq_in_driver == 0) ++ if (bfqd->tot_rq_in_driver == 0) + bfqd->rqs_injected = false; + } + } +@@ -2418,7 +2465,8 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, + */ + bfq_bic_update_cgroup(bic, bio); + +- bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); ++ bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf), ++ bfq_actuator_index(bfqd, bio)); + } else { + bfqd->bio_bfqq = NULL; + } +@@ -2584,24 +2632,29 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) + void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg) + { +- int i, j; ++ int i, j, k; + +- for (i = 0; i < 2; i++) +- for (j = 0; j < IOPRIO_NR_LEVELS; j++) +- if (bfqg->async_bfqq[i][j]) +- bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); +- if (bfqg->async_idle_bfqq) +- bfq_bfqq_end_wr(bfqg->async_idle_bfqq); ++ for (k = 0; k < bfqd->num_actuators; k++) { ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_NR_LEVELS; j++) ++ if (bfqg->async_bfqq[i][j][k]) ++ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j][k]); ++ if (bfqg->async_idle_bfqq[k]) ++ bfq_bfqq_end_wr(bfqg->async_idle_bfqq[k]); ++ } + } + + static void bfq_end_wr(struct bfq_data *bfqd) + { + struct bfq_queue *bfqq; ++ int i; + + spin_lock_irq(&bfqd->lock); + +- list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) +- bfq_bfqq_end_wr(bfqq); ++ for (i = 0; i < bfqd->num_actuators; i++) { ++ list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list) ++ bfq_bfqq_end_wr(bfqq); ++ } + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + bfq_end_wr_async(bfqd); +@@ -2794,6 +2847,35 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, + static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, + struct bfq_queue *bfqq); + ++static struct bfq_queue * ++bfq_setup_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_queue *stable_merge_bfqq, ++ struct bfq_iocq_bfqq_data *bfqq_data) ++{ ++ int proc_ref = min(bfqq_process_refs(bfqq), ++ bfqq_process_refs(stable_merge_bfqq)); ++ struct bfq_queue *new_bfqq; ++ ++ if (idling_boosts_thr_without_issues(bfqd, bfqq) || ++ proc_ref == 0) ++ return NULL; ++ ++ /* next function will take at least one ref */ ++ new_bfqq = bfq_setup_merge(bfqq, stable_merge_bfqq); ++ ++ if (new_bfqq) { ++ bfqq_data->stably_merged = true; ++ if (new_bfqq->bic) { ++ unsigned int new_a_idx = new_bfqq->actuator_idx; ++ struct bfq_iocq_bfqq_data *new_bfqq_data = ++ &new_bfqq->bic->bfqq_data[new_a_idx]; ++ ++ new_bfqq_data->stably_merged = true; ++ } ++ } ++ return new_bfqq; ++} ++ + /* + * Attempt to schedule a merge of bfqq with the currently in-service + * queue or with a close queue among the scheduled queues. 
Return +@@ -2819,6 +2901,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + void *io_struct, bool request, struct bfq_io_cq *bic) + { + struct bfq_queue *in_service_bfqq, *new_bfqq; ++ unsigned int a_idx = bfqq->actuator_idx; ++ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; + + /* if a merge has already been setup, then proceed with that first */ + if (bfqq->new_bfqq) +@@ -2840,37 +2924,23 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + * stable merging) also if bic is associated with a + * sync queue, but this bfqq is async + */ +- if (bfq_bfqq_sync(bfqq) && bic->stable_merge_bfqq && ++ if (bfq_bfqq_sync(bfqq) && bfqq_data->stable_merge_bfqq && + !bfq_bfqq_just_created(bfqq) && + time_is_before_jiffies(bfqq->split_time + + msecs_to_jiffies(bfq_late_stable_merging)) && + time_is_before_jiffies(bfqq->creation_time + + msecs_to_jiffies(bfq_late_stable_merging))) { + struct bfq_queue *stable_merge_bfqq = +- bic->stable_merge_bfqq; +- int proc_ref = min(bfqq_process_refs(bfqq), +- bfqq_process_refs(stable_merge_bfqq)); ++ bfqq_data->stable_merge_bfqq; + + /* deschedule stable merge, because done or aborted here */ + bfq_put_stable_ref(stable_merge_bfqq); + +- bic->stable_merge_bfqq = NULL; +- +- if (!idling_boosts_thr_without_issues(bfqd, bfqq) && +- proc_ref > 0) { +- /* next function will take at least one ref */ +- struct bfq_queue *new_bfqq = +- bfq_setup_merge(bfqq, stable_merge_bfqq); +- +- if (new_bfqq) { +- bic->stably_merged = true; +- if (new_bfqq->bic) +- new_bfqq->bic->stably_merged = +- true; +- } +- return new_bfqq; +- } else +- return NULL; ++ bfqq_data->stable_merge_bfqq = NULL; ++ ++ return bfq_setup_stable_merge(bfqd, bfqq, ++ stable_merge_bfqq, ++ bfqq_data); + } + } + +@@ -2965,6 +3035,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + static void bfq_bfqq_save_state(struct bfq_queue *bfqq) + { + struct bfq_io_cq *bic = bfqq->bic; ++ unsigned int a_idx = bfqq->actuator_idx; ++ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; + + /* + * If !bfqq->bic, the queue is already shared or its requests +@@ -2974,18 +3046,21 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) + if (!bic) + return; + +- bic->saved_last_serv_time_ns = bfqq->last_serv_time_ns; +- bic->saved_inject_limit = bfqq->inject_limit; +- bic->saved_decrease_time_jif = bfqq->decrease_time_jif; +- +- bic->saved_weight = bfqq->entity.orig_weight; +- bic->saved_ttime = bfqq->ttime; +- bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); +- bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); +- bic->saved_io_start_time = bfqq->io_start_time; +- bic->saved_tot_idle_time = bfqq->tot_idle_time; +- bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); +- bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); ++ bfqq_data->saved_last_serv_time_ns = bfqq->last_serv_time_ns; ++ bfqq_data->saved_inject_limit = bfqq->inject_limit; ++ bfqq_data->saved_decrease_time_jif = bfqq->decrease_time_jif; ++ ++ bfqq_data->saved_weight = bfqq->entity.orig_weight; ++ bfqq_data->saved_ttime = bfqq->ttime; ++ bfqq_data->saved_has_short_ttime = ++ bfq_bfqq_has_short_ttime(bfqq); ++ bfqq_data->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); ++ bfqq_data->saved_io_start_time = bfqq->io_start_time; ++ bfqq_data->saved_tot_idle_time = bfqq->tot_idle_time; ++ bfqq_data->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); ++ bfqq_data->was_in_burst_list = ++ !hlist_unhashed(&bfqq->burst_list_node); ++ + if 
(unlikely(bfq_bfqq_just_created(bfqq) && + !bfq_bfqq_in_large_burst(bfqq) && + bfqq->bfqd->low_latency)) { +@@ -2998,17 +3073,21 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) + * to bfqq, so that to avoid that bfqq unjustly fails + * to enjoy weight raising if split soon. + */ +- bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; +- bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now(); +- bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); +- bic->saved_last_wr_start_finish = jiffies; ++ bfqq_data->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; ++ bfqq_data->saved_wr_start_at_switch_to_srt = ++ bfq_smallest_from_now(); ++ bfqq_data->saved_wr_cur_max_time = ++ bfq_wr_duration(bfqq->bfqd); ++ bfqq_data->saved_last_wr_start_finish = jiffies; + } else { +- bic->saved_wr_coeff = bfqq->wr_coeff; +- bic->saved_wr_start_at_switch_to_srt = ++ bfqq_data->saved_wr_coeff = bfqq->wr_coeff; ++ bfqq_data->saved_wr_start_at_switch_to_srt = + bfqq->wr_start_at_switch_to_srt; +- bic->saved_service_from_wr = bfqq->service_from_wr; +- bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; +- bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; ++ bfqq_data->saved_service_from_wr = ++ bfqq->service_from_wr; ++ bfqq_data->saved_last_wr_start_finish = ++ bfqq->last_wr_start_finish; ++ bfqq_data->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + } + } + +@@ -3114,7 +3193,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + /* + * Merge queues (that is, let bic redirect its requests to new_bfqq) + */ +- bic_set_bfqq(bic, new_bfqq, true); ++ bic_set_bfqq(bic, new_bfqq, true, bfqq->actuator_idx); + bfq_mark_bfqq_coop(new_bfqq); + /* + * new_bfqq now belongs to at least two bics (it is a shared queue): +@@ -3532,13 +3611,13 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) + * - start a new observation interval with this dispatch + */ + if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && +- bfqd->rq_in_driver == 0) ++ bfqd->tot_rq_in_driver == 0) + goto update_rate_and_reset; + + /* Update sampling information */ + bfqd->peak_rate_samples++; + +- if ((bfqd->rq_in_driver > 0 || ++ if ((bfqd->tot_rq_in_driver > 0 || + now_ns - bfqd->last_completion < BFQ_MIN_TT) + && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) + bfqd->sequential_samples++; +@@ -3803,10 +3882,8 @@ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, + return false; + + return (bfqq->wr_coeff > 1 && +- (bfqd->wr_busy_queues < +- tot_busy_queues || +- bfqd->rq_in_driver >= +- bfqq->dispatched + 4)) || ++ (bfqd->wr_busy_queues < tot_busy_queues || ++ bfqd->tot_rq_in_driver >= bfqq->dispatched + 4)) || + bfq_asymmetric_scenario(bfqd, bfqq) || + tot_busy_queues == 1; + } +@@ -4072,8 +4149,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + * function to evaluate the I/O speed of a process. + */ + static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, +- bool compensate, enum bfqq_expiration reason, +- unsigned long *delta_ms) ++ bool compensate, unsigned long *delta_ms) + { + ktime_t delta_ktime; + u32 delta_usecs; +@@ -4269,7 +4345,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, + /* + * Check whether the process is slow (see bfq_bfqq_is_slow). 
+ */ +- slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); ++ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, &delta); + + /* + * As above explained, charge slow (typically seeky) and +@@ -4577,6 +4653,8 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) + { + struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue; + unsigned int limit = in_serv_bfqq->inject_limit; ++ int i; ++ + /* + * If + * - bfqq is not weight-raised and therefore does not carry +@@ -4608,7 +4686,7 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) + ) + limit = 1; + +- if (bfqd->rq_in_driver >= limit) ++ if (bfqd->tot_rq_in_driver >= limit) + return NULL; + + /* +@@ -4623,11 +4701,12 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) + * (and re-added only if it gets new requests, but then it + * is assigned again enough budget for its new backlog). + */ +- list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) +- if (!RB_EMPTY_ROOT(&bfqq->sort_list) && +- (in_serv_always_inject || bfqq->wr_coeff > 1) && +- bfq_serv_to_charge(bfqq->next_rq, bfqq) <= +- bfq_bfqq_budget_left(bfqq)) { ++ for (i = 0; i < bfqd->num_actuators; i++) { ++ list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list) ++ if (!RB_EMPTY_ROOT(&bfqq->sort_list) && ++ (in_serv_always_inject || bfqq->wr_coeff > 1) && ++ bfq_serv_to_charge(bfqq->next_rq, bfqq) <= ++ bfq_bfqq_budget_left(bfqq)) { + /* + * Allow for only one large in-flight request + * on non-rotational devices, for the +@@ -4647,27 +4726,80 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) + */ + if (blk_queue_nonrot(bfqd->queue) && + blk_rq_sectors(bfqq->next_rq) >= +- BFQQ_SECT_THR_NONROT) +- limit = min_t(unsigned int, 1, limit); +- else +- limit = in_serv_bfqq->inject_limit; +- +- if (bfqd->rq_in_driver < limit) { ++ BFQQ_SECT_THR_NONROT && ++ bfqd->tot_rq_in_driver >= 1) ++ continue; ++ else { + bfqd->rqs_injected = true; + return bfqq; + } + } ++ } ++ ++ return NULL; ++} ++ ++static struct bfq_queue * ++bfq_find_active_bfqq_for_actuator(struct bfq_data *bfqd, int idx) ++{ ++ struct bfq_queue *bfqq; ++ ++ if (bfqd->in_service_queue && ++ bfqd->in_service_queue->actuator_idx == idx) ++ return bfqd->in_service_queue; ++ ++ list_for_each_entry(bfqq, &bfqd->active_list[idx], bfqq_list) { ++ if (!RB_EMPTY_ROOT(&bfqq->sort_list) && ++ bfq_serv_to_charge(bfqq->next_rq, bfqq) <= ++ bfq_bfqq_budget_left(bfqq)) { ++ return bfqq; ++ } ++ } + + return NULL; + } + ++/* ++ * Perform a linear scan of each actuator, until an actuator is found ++ * for which the following three conditions hold: the load of the ++ * actuator is below the threshold (see comments on ++ * actuator_load_threshold for details) and lower than that of the ++ * next actuator (comments on this extra condition below), and there ++ * is a queue that contains I/O for that actuator. On success, return ++ * that queue. ++ * ++ * Performing a plain linear scan entails a prioritization among ++ * actuators. The extra condition above breaks this prioritization and ++ * tends to distribute injection uniformly across actuators. 
++ */ ++static struct bfq_queue * ++bfq_find_bfqq_for_underused_actuator(struct bfq_data *bfqd) ++{ ++ int i; ++ ++ for (i = 0 ; i < bfqd->num_actuators; i++) { ++ if (bfqd->rq_in_driver[i] < bfqd->actuator_load_threshold && ++ (i == bfqd->num_actuators - 1 || ++ bfqd->rq_in_driver[i] < bfqd->rq_in_driver[i+1])) { ++ struct bfq_queue *bfqq = ++ bfq_find_active_bfqq_for_actuator(bfqd, i); ++ ++ if (bfqq) ++ return bfqq; ++ } ++ } ++ ++ return NULL; ++} ++ ++ + /* + * Select a queue for service. If we have a current queue in service, + * check whether to continue servicing it, or retrieve and set a new one. + */ + static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + { +- struct bfq_queue *bfqq; ++ struct bfq_queue *bfqq, *inject_bfqq; + struct request *next_rq; + enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT; + +@@ -4689,6 +4821,15 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + goto expire; + + check_queue: ++ /* ++ * If some actuator is underutilized, but the in-service ++ * queue does not contain I/O for that actuator, then try to ++ * inject I/O for that actuator. ++ */ ++ inject_bfqq = bfq_find_bfqq_for_underused_actuator(bfqd); ++ if (inject_bfqq && inject_bfqq != bfqq) ++ return inject_bfqq; ++ + /* + * This loop is rarely executed more than once. Even when it + * happens, it is much more convenient to re-execute this loop +@@ -4748,11 +4889,8 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + */ + if (bfq_bfqq_wait_request(bfqq) || + (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { +- struct bfq_queue *async_bfqq = +- bfqq->bic && bfqq->bic->bfqq[0] && +- bfq_bfqq_busy(bfqq->bic->bfqq[0]) && +- bfqq->bic->bfqq[0]->next_rq ? +- bfqq->bic->bfqq[0] : NULL; ++ unsigned int act_idx = bfqq->actuator_idx; ++ struct bfq_queue *async_bfqq = NULL; + struct bfq_queue *blocked_bfqq = + !hlist_empty(&bfqq->woken_list) ? + container_of(bfqq->woken_list.first, +@@ -4760,6 +4898,10 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + woken_list_node) + : NULL; + ++ if (bfqq->bic && bfqq->bic->bfqq[0][act_idx] && ++ bfq_bfqq_busy(bfqq->bic->bfqq[0][act_idx]) && ++ bfqq->bic->bfqq[0][act_idx]->next_rq) ++ async_bfqq = bfqq->bic->bfqq[0][act_idx]; + /* + * The next four mutually-exclusive ifs decide + * whether to try injection, and choose the queue to +@@ -4844,7 +4986,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic && + bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= + bfq_bfqq_budget_left(async_bfqq)) +- bfqq = bfqq->bic->bfqq[0]; ++ bfqq = async_bfqq; + else if (bfqq->waker_bfqq && + bfq_bfqq_busy(bfqq->waker_bfqq) && + bfqq->waker_bfqq->next_rq && +@@ -4975,7 +5117,7 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, + bfq_dispatch_remove(bfqd->queue, rq); + + if (bfqq != bfqd->in_service_queue) +- goto return_rq; ++ return rq; + + /* + * If weight raising has to terminate for bfqq, then next +@@ -4995,12 +5137,9 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, + * belongs to CLASS_IDLE and other queues are waiting for + * service. 
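The comment block above spells out how per-actuator injection picks its target: scan the actuators in order and stop at the first one whose in-driver load is both below the threshold and not higher than the next actuator's, so injection spreads across actuators instead of always favouring index 0. The self-contained sketch below exercises just that selection rule; the two-actuator setup and the threshold of 4 (the default the patch installs later as actuator_load_threshold) are assumptions for the demo.

#include <stdio.h>

#define NUM_ACTUATORS  2
#define LOAD_THRESHOLD 4  /* assumed default, matching the value set at init time */

/* Return the first actuator that is underutilized per the rule above,
 * or -1 when every actuator already has enough queued I/O. */
static int find_underused_actuator(const int rq_in_driver[NUM_ACTUATORS])
{
    int i;

    for (i = 0; i < NUM_ACTUATORS; i++) {
        if (rq_in_driver[i] < LOAD_THRESHOLD &&
            (i == NUM_ACTUATORS - 1 ||
             rq_in_driver[i] < rq_in_driver[i + 1]))
            return i;
    }
    return -1;
}

int main(void)
{
    int even_load[NUM_ACTUATORS] = { 2, 2 };  /* equal load: only the last one qualifies */
    int skewed[NUM_ACTUATORS]    = { 1, 3 };  /* actuator 0 is the less loaded one */
    int full[NUM_ACTUATORS]      = { 4, 4 };  /* both at the threshold: nothing qualifies */

    printf("even -> %d\n", find_underused_actuator(even_load)); /* 1  */
    printf("skew -> %d\n", find_underused_actuator(skewed));    /* 0  */
    printf("full -> %d\n", find_underused_actuator(full));      /* -1 */
    return 0;
}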
+ */ +- if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq))) +- goto return_rq; +- +- bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); ++ if (bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)) ++ bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); + +-return_rq: + return rq; + } + +@@ -5043,11 +5182,11 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + + /* + * We exploit the bfq_finish_requeue_request hook to +- * decrement rq_in_driver, but ++ * decrement tot_rq_in_driver, but + * bfq_finish_requeue_request will not be invoked on + * this request. So, to avoid unbalance, just start +- * this request, without incrementing rq_in_driver. As +- * a negative consequence, rq_in_driver is deceptively ++ * this request, without incrementing tot_rq_in_driver. As ++ * a negative consequence, tot_rq_in_driver is deceptively + * lower than it should be while this request is in + * service. This may cause bfq_schedule_dispatch to be + * invoked uselessly. +@@ -5056,7 +5195,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + * bfq_finish_requeue_request hook, if defined, is + * probably invoked also on this request. So, by + * exploiting this hook, we could 1) increment +- * rq_in_driver here, and 2) decrement it in ++ * tot_rq_in_driver here, and 2) decrement it in + * bfq_finish_requeue_request. Such a solution would + * let the value of the counter be always accurate, + * but it would entail using an extra interface +@@ -5085,7 +5224,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + * Of course, serving one request at a time may cause loss of + * throughput. + */ +- if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) ++ if (bfqd->strict_guarantees && bfqd->tot_rq_in_driver > 0) + goto exit; + + bfqq = bfq_select_queue(bfqd); +@@ -5096,7 +5235,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + + if (rq) { + inc_in_driver_start_rq: +- bfqd->rq_in_driver++; ++ bfqd->rq_in_driver[bfqq->actuator_idx]++; ++ bfqd->tot_rq_in_driver++; + start_rq: + rq->rq_flags |= RQF_STARTED; + } +@@ -5283,8 +5423,6 @@ void bfq_put_cooperator(struct bfq_queue *bfqq) + */ + __bfqq = bfqq->new_bfqq; + while (__bfqq) { +- if (__bfqq == bfqq) +- break; + next = __bfqq->new_bfqq; + bfq_put_queue(__bfqq); + __bfqq = next; +@@ -5305,48 +5443,55 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) + bfq_release_process_ref(bfqd, bfqq); + } + +-static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) ++static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, ++ unsigned int actuator_idx) + { +- struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); ++ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, actuator_idx); + struct bfq_data *bfqd; + + if (bfqq) + bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ + + if (bfqq && bfqd) { +- unsigned long flags; +- +- spin_lock_irqsave(&bfqd->lock, flags); +- bic_set_bfqq(bic, NULL, is_sync); ++ bic_set_bfqq(bic, NULL, is_sync, actuator_idx); + bfq_exit_bfqq(bfqd, bfqq); +- spin_unlock_irqrestore(&bfqd->lock, flags); + } + } + + static void bfq_exit_icq(struct io_cq *icq) + { + struct bfq_io_cq *bic = icq_to_bic(icq); ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ unsigned long flags; ++ unsigned int act_idx; ++ /* ++ * If bfqd and thus bfqd->num_actuators is not available any ++ * longer, then cycle over all possible per-actuator bfqqs in ++ * next loop. 
We rely on bic being zeroed on creation, and ++ * therefore on its unused per-actuator fields being NULL. ++ */ ++ unsigned int num_actuators = BFQ_MAX_ACTUATORS; ++ struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data; + +- if (bic->stable_merge_bfqq) { +- struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd; ++ /* ++ * bfqd is NULL if scheduler already exited, and in that case ++ * this is the last time these queues are accessed. ++ */ ++ if (bfqd) { ++ spin_lock_irqsave(&bfqd->lock, flags); ++ num_actuators = bfqd->num_actuators; ++ } + +- /* +- * bfqd is NULL if scheduler already exited, and in +- * that case this is the last time bfqq is accessed. +- */ +- if (bfqd) { +- unsigned long flags; ++ for (act_idx = 0; act_idx < num_actuators; act_idx++) { ++ if (bfqq_data[act_idx].stable_merge_bfqq) ++ bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq); + +- spin_lock_irqsave(&bfqd->lock, flags); +- bfq_put_stable_ref(bic->stable_merge_bfqq); +- spin_unlock_irqrestore(&bfqd->lock, flags); +- } else { +- bfq_put_stable_ref(bic->stable_merge_bfqq); +- } ++ bfq_exit_icq_bfqq(bic, true, act_idx); ++ bfq_exit_icq_bfqq(bic, false, act_idx); + } + +- bfq_exit_icq_bfqq(bic, true); +- bfq_exit_icq_bfqq(bic, false); ++ if (bfqd) ++ spin_unlock_irqrestore(&bfqd->lock, flags); + } + + /* +@@ -5423,25 +5568,27 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) + + bic->ioprio = ioprio; + +- bfqq = bic_to_bfqq(bic, false); ++ bfqq = bic_to_bfqq(bic, false, bfq_actuator_index(bfqd, bio)); + if (bfqq) { + struct bfq_queue *old_bfqq = bfqq; + + bfqq = bfq_get_queue(bfqd, bio, false, bic, true); +- bic_set_bfqq(bic, bfqq, false); ++ bic_set_bfqq(bic, bfqq, false, bfq_actuator_index(bfqd, bio)); + bfq_release_process_ref(bfqd, old_bfqq); + } + +- bfqq = bic_to_bfqq(bic, true); ++ bfqq = bic_to_bfqq(bic, true, bfq_actuator_index(bfqd, bio)); + if (bfqq) + bfq_set_next_ioprio_data(bfqq, bic); + } + + static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, +- struct bfq_io_cq *bic, pid_t pid, int is_sync) ++ struct bfq_io_cq *bic, pid_t pid, int is_sync, ++ unsigned int act_idx) + { + u64 now_ns = ktime_get_ns(); + ++ bfqq->actuator_idx = act_idx; + RB_CLEAR_NODE(&bfqq->entity.rb_node); + INIT_LIST_HEAD(&bfqq->fifo); + INIT_HLIST_NODE(&bfqq->burst_list_node); +@@ -5501,22 +5648,24 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + + /* first request is almost certainly seeky */ + bfqq->seek_history = 1; ++ ++ bfqq->decrease_time_jif = jiffies; + } + + static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, + struct bfq_group *bfqg, +- int ioprio_class, int ioprio) ++ int ioprio_class, int ioprio, int act_idx) + { + switch (ioprio_class) { + case IOPRIO_CLASS_RT: +- return &bfqg->async_bfqq[0][ioprio]; ++ return &bfqg->async_bfqq[0][ioprio][act_idx]; + case IOPRIO_CLASS_NONE: + ioprio = IOPRIO_BE_NORM; + fallthrough; + case IOPRIO_CLASS_BE: +- return &bfqg->async_bfqq[1][ioprio]; ++ return &bfqg->async_bfqq[1][ioprio][act_idx]; + case IOPRIO_CLASS_IDLE: +- return &bfqg->async_idle_bfqq; ++ return &bfqg->async_idle_bfqq[act_idx]; + default: + return NULL; + } +@@ -5527,6 +5676,7 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_io_cq *bic, + struct bfq_queue *last_bfqq_created) + { ++ unsigned int a_idx = last_bfqq_created->actuator_idx; + struct bfq_queue *new_bfqq = + bfq_setup_merge(bfqq, last_bfqq_created); + +@@ -5534,8 +5684,8 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, 
struct bfq_queue *bfqq, + return bfqq; + + if (new_bfqq->bic) +- new_bfqq->bic->stably_merged = true; +- bic->stably_merged = true; ++ new_bfqq->bic->bfqq_data[a_idx].stably_merged = true; ++ bic->bfqq_data[a_idx].stably_merged = true; + + /* + * Reusing merge functions. This implies that +@@ -5610,9 +5760,13 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, + * it has been set already, but too long ago, then move it + * forward to bfqq. Finally, move also if bfqq belongs to a + * different group than last_bfqq_created, or if bfqq has a +- * different ioprio or ioprio_class. If none of these +- * conditions holds true, then try an early stable merge or +- * schedule a delayed stable merge. ++ * different ioprio, ioprio_class or actuator_idx. If none of ++ * these conditions holds true, then try an early stable merge ++ * or schedule a delayed stable merge. As for the condition on ++ * actuator_idx, the reason is that, if queues associated with ++ * different actuators are merged, then control is lost on ++ * each actuator. Therefore some actuator may be ++ * underutilized, and throughput may decrease. + * + * A delayed merge is scheduled (instead of performing an + * early merge), in case bfqq might soon prove to be more +@@ -5630,7 +5784,8 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, + bfqq->creation_time) || + bfqq->entity.parent != last_bfqq_created->entity.parent || + bfqq->ioprio != last_bfqq_created->ioprio || +- bfqq->ioprio_class != last_bfqq_created->ioprio_class) ++ bfqq->ioprio_class != last_bfqq_created->ioprio_class || ++ bfqq->actuator_idx != last_bfqq_created->actuator_idx) + *source_bfqq = bfqq; + else if (time_after_eq(last_bfqq_created->creation_time + + bfqd->bfq_burst_interval, +@@ -5660,7 +5815,8 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, + /* + * Record the bfqq to merge to. + */ +- bic->stable_merge_bfqq = last_bfqq_created; ++ bic->bfqq_data[last_bfqq_created->actuator_idx].stable_merge_bfqq = ++ last_bfqq_created; + } + } + +@@ -5682,7 +5838,8 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + bfqg = bfq_bio_bfqg(bfqd, bio); + if (!is_sync) { + async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, +- ioprio); ++ ioprio, ++ bfq_actuator_index(bfqd, bio)); + bfqq = *async_bfqq; + if (bfqq) + goto out; +@@ -5694,7 +5851,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + + if (bfqq) { + bfq_init_bfqq(bfqd, bfqq, bic, current->pid, +- is_sync); ++ is_sync, bfq_actuator_index(bfqd, bio)); + bfq_init_entity(&bfqq->entity, bfqg); + bfq_log_bfqq(bfqd, bfqq, "allocated"); + } else { +@@ -6009,7 +6166,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) + * then complete the merge and redirect it to + * new_bfqq. + */ +- if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) ++ if (bic_to_bfqq(RQ_BIC(rq), true, ++ bfq_actuator_index(bfqd, rq->bio)) == bfqq) + bfq_merge_bfqqs(bfqd, RQ_BIC(rq), + bfqq, new_bfqq); + +@@ -6147,7 +6305,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) + struct bfq_queue *bfqq = bfqd->in_service_queue; + + bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, +- bfqd->rq_in_driver); ++ bfqd->tot_rq_in_driver); + + if (bfqd->hw_tag == 1) + return; +@@ -6158,7 +6316,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) + * sum is not exact, as it's not taking into account deactivated + * requests. 
+ */ +- if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) ++ if (bfqd->tot_rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) + return; + + /* +@@ -6169,7 +6327,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) + if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && + bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < + BFQ_HW_QUEUE_THRESHOLD && +- bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) ++ bfqd->tot_rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) + return; + + if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) +@@ -6190,7 +6348,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) + + bfq_update_hw_tag(bfqd); + +- bfqd->rq_in_driver--; ++ bfqd->rq_in_driver[bfqq->actuator_idx]--; ++ bfqd->tot_rq_in_driver--; + bfqq->dispatched--; + + if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { +@@ -6310,7 +6469,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) + BFQQE_NO_MORE_REQUESTS); + } + +- if (!bfqd->rq_in_driver) ++ if (!bfqd->tot_rq_in_driver) + bfq_schedule_dispatch(bfqd); + } + +@@ -6441,13 +6600,13 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, + * conditions to do it, or we can lower the last base value + * computed. + * +- * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O ++ * NOTE: (bfqd->tot_rq_in_driver == 1) means that there is no I/O + * request in flight, because this function is in the code + * path that handles the completion of a request of bfqq, and, + * in particular, this function is executed before +- * bfqd->rq_in_driver is decremented in such a code path. ++ * bfqd->tot_rq_in_driver is decremented in such a code path. + */ +- if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) || ++ if ((bfqq->last_serv_time_ns == 0 && bfqd->tot_rq_in_driver == 1) || + tot_time_ns < bfqq->last_serv_time_ns) { + if (bfqq->last_serv_time_ns == 0) { + /* +@@ -6457,7 +6616,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, + bfqq->inject_limit = max_t(unsigned int, 1, old_limit); + } + bfqq->last_serv_time_ns = tot_time_ns; +- } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1) ++ } else if (!bfqd->rqs_injected && bfqd->tot_rq_in_driver == 1) + /* + * No I/O injected and no request still in service in + * the drive: these are the exact conditions for +@@ -6564,7 +6723,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) + return bfqq; + } + +- bic_set_bfqq(bic, NULL, true); ++ bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx); + + bfq_put_cooperator(bfqq); + +@@ -6578,7 +6737,9 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, + bool split, bool is_sync, + bool *new_queue) + { +- struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); ++ unsigned int act_idx = bfq_actuator_index(bfqd, bio); ++ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx); ++ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[act_idx]; + + if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) + return bfqq; +@@ -6590,14 +6751,14 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, + bfq_put_queue(bfqq); + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split); + +- bic_set_bfqq(bic, bfqq, is_sync); ++ bic_set_bfqq(bic, bfqq, is_sync, act_idx); + if (split && is_sync) { +- if ((bic->was_in_burst_list && bfqd->large_burst) || +- bic->saved_in_large_burst) ++ if ((bfqq_data->was_in_burst_list && bfqd->large_burst) || ++ bfqq_data->saved_in_large_burst) + bfq_mark_bfqq_in_large_burst(bfqq); + else { + 
bfq_clear_bfqq_in_large_burst(bfqq); +- if (bic->was_in_burst_list) ++ if (bfqq_data->was_in_burst_list) + /* + * If bfqq was in the current + * burst list before being +@@ -6686,19 +6847,20 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) + struct bfq_queue *bfqq; + bool new_queue = false; + bool bfqq_already_existing = false, split = false; ++ unsigned int a_idx = bfq_actuator_index(bfqd, bio); + + if (unlikely(!rq->elv.icq)) + return NULL; + + /* +- * Assuming that elv.priv[1] is set only if everything is set ++ * Assuming that RQ_BFQQ(rq) is set only if everything is set + * for this rq. This holds true, because this function is + * invoked only for insertion or merging, and, after such + * events, a request cannot be manipulated any longer before + * being removed from bfq. + */ +- if (rq->elv.priv[1]) +- return rq->elv.priv[1]; ++ if (RQ_BFQQ(rq)) ++ return RQ_BFQQ(rq); + + bic = icq_to_bic(rq->elv.icq); + +@@ -6712,12 +6874,13 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) + if (likely(!new_queue)) { + /* If the queue was seeky for too long, break it apart. */ + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) && +- !bic->stably_merged) { ++ !bic->bfqq_data[a_idx].stably_merged) { + struct bfq_queue *old_bfqq = bfqq; + + /* Update bic before losing reference to bfqq */ + if (bfq_bfqq_in_large_burst(bfqq)) +- bic->saved_in_large_burst = true; ++ bic->bfqq_data[a_idx].saved_in_large_burst = ++ true; + + bfqq = bfq_split_bfqq(bic, bfqq); + split = true; +@@ -6900,13 +7063,15 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, + */ + void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) + { +- int i, j; ++ int i, j, k; + +- for (i = 0; i < 2; i++) +- for (j = 0; j < IOPRIO_NR_LEVELS; j++) +- __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); ++ for (k = 0; k < bfqd->num_actuators; k++) { ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_NR_LEVELS; j++) ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j][k]); + +- __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq[k]); ++ } + } + + /* +@@ -7018,6 +7183,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + { + struct bfq_data *bfqd; + struct elevator_queue *eq; ++ unsigned int i; ++ struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges; + + eq = elevator_alloc(q, e); + if (!eq) +@@ -7038,8 +7205,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. + * Grab a permanent reference to it, so that the normal code flow + * will not attempt to free it. ++ * Set zero as actuator index: we will pretend that ++ * all I/O requests are for the same actuator. + */ +- bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0, 0); + bfqd->oom_bfqq.ref++; + bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; + bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; +@@ -7058,6 +7227,39 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + + bfqd->queue = q; + ++ bfqd->num_actuators = 1; ++ /* ++ * If the disk supports multiple actuators, copy independent ++ * access ranges from the request queue structure. ++ */ ++ spin_lock_irq(&q->queue_lock); ++ if (ia_ranges) { ++ /* ++ * Check if the disk ia_ranges size exceeds the current bfq ++ * actuator limit. 
++ */ ++ if (ia_ranges->nr_ia_ranges > BFQ_MAX_ACTUATORS) { ++ pr_crit("nr_ia_ranges higher than act limit: iars=%d, max=%d.\n", ++ ia_ranges->nr_ia_ranges, BFQ_MAX_ACTUATORS); ++ pr_crit("Falling back to single actuator mode.\n"); ++ } else { ++ bfqd->num_actuators = ia_ranges->nr_ia_ranges; ++ ++ for (i = 0; i < bfqd->num_actuators; i++) { ++ bfqd->sector[i] = ia_ranges->ia_range[i].sector; ++ bfqd->nr_sectors[i] = ++ ia_ranges->ia_range[i].nr_sectors; ++ } ++ } ++ } ++ ++ /* Otherwise use single-actuator dev info */ ++ if (bfqd->num_actuators == 1) { ++ bfqd->sector[0] = 0; ++ bfqd->nr_sectors[0] = get_capacity(q->disk); ++ } ++ spin_unlock_irq(&q->queue_lock); ++ + INIT_LIST_HEAD(&bfqd->dispatch); + + hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, +@@ -7069,7 +7271,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + bfqd->num_groups_with_pending_reqs = 0; + #endif + +- INIT_LIST_HEAD(&bfqd->active_list); ++ INIT_LIST_HEAD(&bfqd->active_list[0]); ++ INIT_LIST_HEAD(&bfqd->active_list[1]); + INIT_LIST_HEAD(&bfqd->idle_list); + INIT_HLIST_HEAD(&bfqd->burst_list); + +@@ -7095,7 +7298,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + */ + bfqd->bfq_wr_coeff = 30; + bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); +- bfqd->bfq_wr_max_time = 0; + bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); + bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); + bfqd->bfq_wr_max_softrt_rate = 7000; /* +@@ -7114,6 +7316,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; + bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; + ++ /* see comments on the definition of next field inside bfq_data */ ++ bfqd->actuator_load_threshold = 4; ++ + spin_lock_init(&bfqd->lock); + + /* +@@ -7412,6 +7617,7 @@ MODULE_ALIAS("bfq-iosched"); + static int __init bfq_init(void) + { + int ret; ++ char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.2"; + + #ifdef CONFIG_BFQ_GROUP_IOSCHED + ret = blkcg_policy_register(&blkcg_policy_bfq); +@@ -7443,6 +7649,11 @@ static int __init bfq_init(void) + if (ret) + goto slab_kill; + ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ strcat(msg, " (with cgroups support)"); ++#endif ++ pr_info("%s", msg); ++ + return 0; + + slab_kill: +diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h +index 466e4865ace6..75cc6a324267 100644 +--- a/block/bfq-iosched.h ++++ b/block/bfq-iosched.h +@@ -33,6 +33,14 @@ + */ + #define BFQ_SOFTRT_WEIGHT_FACTOR 100 + ++/* ++ * Maximum number of actuators supported. This constant is used simply ++ * to define the size of the static array that will contain ++ * per-actuator data. The current value is hopefully a good upper ++ * bound to the possible number of actuators of any actual drive. ++ */ ++#define BFQ_MAX_ACTUATORS 8 ++ + struct bfq_entity; + + /** +@@ -227,12 +235,14 @@ struct bfq_ttime { + * struct bfq_queue - leaf schedulable entity. + * + * A bfq_queue is a leaf request queue; it can be associated with an +- * io_context or more, if it is async or shared between cooperating +- * processes. @cgroup holds a reference to the cgroup, to be sure that it +- * does not disappear while a bfqq still references it (mostly to avoid +- * races between request issuing and task migration followed by cgroup +- * destruction). +- * All the fields are protected by the queue lock of the containing bfqd. ++ * io_context or more, if it is async or shared between cooperating ++ * processes. 
Besides, it contains I/O requests for only one actuator ++ * (an io_context is associated with a different bfq_queue for each ++ * actuator it generates I/O for). @cgroup holds a reference to the ++ * cgroup, to be sure that it does not disappear while a bfqq still ++ * references it (mostly to avoid races between request issuing and ++ * task migration followed by cgroup destruction). All the fields are ++ * protected by the queue lock of the containing bfqd. + */ + struct bfq_queue { + /* reference counter */ +@@ -397,24 +407,18 @@ struct bfq_queue { + * the woken queues when this queue exits. + */ + struct hlist_head woken_list; ++ ++ /* index of the actuator this queue is associated with */ ++ unsigned int actuator_idx; + }; + + /** +- * struct bfq_io_cq - per (request_queue, io_context) structure. +- */ +-struct bfq_io_cq { +- /* associated io_cq structure */ +- struct io_cq icq; /* must be the first member */ +- /* array of two process queues, the sync and the async */ +- struct bfq_queue *bfqq[2]; +- /* per (request_queue, blkcg) ioprio */ +- int ioprio; +-#ifdef CONFIG_BFQ_GROUP_IOSCHED +- uint64_t blkcg_serial_nr; /* the current blkcg serial */ +-#endif ++* struct bfq_data - bfqq data unique and persistent for associated bfq_io_cq ++*/ ++struct bfq_iocq_bfqq_data { + /* + * Snapshot of the has_short_time flag before merging; taken +- * to remember its value while the queue is merged, so as to ++ * to remember its values while the queue is merged, so as to + * be able to restore it in case of split. + */ + bool saved_has_short_ttime; +@@ -428,7 +432,7 @@ struct bfq_io_cq { + u64 saved_tot_idle_time; + + /* +- * Same purpose as the previous fields for the value of the ++ * Same purpose as the previous fields for the values of the + * field keeping the queue's belonging to a large burst + */ + bool saved_in_large_burst; +@@ -466,6 +470,38 @@ struct bfq_io_cq { + struct bfq_queue *stable_merge_bfqq; + + bool stably_merged; /* non splittable if true */ ++}; ++ ++/** ++ * struct bfq_io_cq - per (request_queue, io_context) structure. ++ */ ++struct bfq_io_cq { ++ /* associated io_cq structure */ ++ struct io_cq icq; /* must be the first member */ ++ /* ++ * Matrix of associated process queues: first row for async ++ * queues, second row sync queues. Each row contains one ++ * column for each actuator. An I/O request generated by the ++ * process is inserted into the queue pointed by bfqq[i][j] if ++ * the request is to be served by the j-th actuator of the ++ * drive, where i==0 or i==1, depending on whether the request ++ * is async or sync. So there is a distinct queue for each ++ * actuator. ++ */ ++ struct bfq_queue *bfqq[2][BFQ_MAX_ACTUATORS]; ++ /* per (request_queue, blkcg) ioprio */ ++ int ioprio; ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ uint64_t blkcg_serial_nr; /* the current blkcg serial */ ++#endif ++ ++ /* ++ * Persistent data for associated synchronous process queues ++ * (one queue per actuator, see field bfqq above). In ++ * particular, each of these queues may undergo a merge. 
++ */ ++ struct bfq_iocq_bfqq_data bfqq_data[BFQ_MAX_ACTUATORS]; ++ + unsigned int requests; /* Number of requests this process has in flight */ + }; + +@@ -554,7 +590,12 @@ struct bfq_data { + /* number of queued requests */ + int queued; + /* number of requests dispatched and waiting for completion */ +- int rq_in_driver; ++ int tot_rq_in_driver; ++ /* ++ * number of requests dispatched and waiting for completion ++ * for each actuator ++ */ ++ int rq_in_driver[BFQ_MAX_ACTUATORS]; + + /* true if the device is non rotational and performs queueing */ + bool nonrot_with_queueing; +@@ -648,8 +689,13 @@ struct bfq_data { + /* maximum budget allotted to a bfq_queue before rescheduling */ + int bfq_max_budget; + +- /* list of all the bfq_queues active on the device */ +- struct list_head active_list; ++ /* ++ * List of all the bfq_queues active for a specific actuator ++ * on the device. Keeping active queues separate on a ++ * per-actuator basis helps implementing per-actuator ++ * injection more efficiently. ++ */ ++ struct list_head active_list[BFQ_MAX_ACTUATORS]; + /* list of all the bfq_queues idle on the device */ + struct list_head idle_list; + +@@ -723,8 +769,6 @@ struct bfq_data { + * is multiplied. + */ + unsigned int bfq_wr_coeff; +- /* maximum duration of a weight-raising period (jiffies) */ +- unsigned int bfq_wr_max_time; + + /* Maximum weight-raising duration for soft real-time processes */ + unsigned int bfq_wr_rt_max_time; +@@ -772,6 +816,42 @@ struct bfq_data { + */ + unsigned int word_depths[2][2]; + unsigned int full_depth_shift; ++ ++ /* ++ * Number of independent actuators. This is equal to 1 in ++ * case of single-actuator drives. ++ */ ++ unsigned int num_actuators; ++ /* ++ * Disk independent access ranges for each actuator ++ * in this device. ++ */ ++ sector_t sector[BFQ_MAX_ACTUATORS]; ++ sector_t nr_sectors[BFQ_MAX_ACTUATORS]; ++ struct blk_independent_access_range ia_ranges[BFQ_MAX_ACTUATORS]; ++ ++ /* ++ * If the number of I/O requests queued in the device for a ++ * given actuator is below next threshold, then the actuator ++ * is deemed as underutilized. If this condition is found to ++ * hold for some actuator upon a dispatch, but (i) the ++ * in-service queue does not contain I/O for that actuator, ++ * while (ii) some other queue does contain I/O for that ++ * actuator, then the head I/O request of the latter queue is ++ * returned (injected), instead of the head request of the ++ * currently in-service queue. ++ * ++ * We set the threshold, empirically, to the minimum possible ++ * value for which an actuator is fully utilized, or close to ++ * be fully utilized. By doing so, injected I/O 'steals' as ++ * few drive-queue slots as possibile to the in-service ++ * queue. This reduces as much as possible the probability ++ * that the service of I/O from the in-service bfq_queue gets ++ * delayed because of slot exhaustion, i.e., because all the ++ * slots of the drive queue are filled with I/O injected from ++ * other queues (NCQ provides for 32 slots). 
++ */ ++ unsigned int actuator_load_threshold; + }; + + enum bfqq_state_flags { +@@ -937,8 +1017,8 @@ struct bfq_group { + + struct bfq_data *bfqd; + +- struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; +- struct bfq_queue *async_idle_bfqq; ++ struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; ++ struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; + + struct bfq_entity *my_entity; + +@@ -955,8 +1035,8 @@ struct bfq_group { + struct bfq_entity entity; + struct bfq_sched_data sched_data; + +- struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; +- struct bfq_queue *async_idle_bfqq; ++ struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; ++ struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; + + struct rb_root rq_pos_tree; + }; +@@ -969,8 +1049,10 @@ struct bfq_group { + + extern const int bfq_timeout; + +-struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync); +-void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync); ++struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, ++ unsigned int actuator_idx); ++void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync, ++ unsigned int actuator_idx); + struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); + void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); + void bfq_weights_tree_add(struct bfq_queue *bfqq); +diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c +index ea4c3d757fdd..7941b6f07391 100644 +--- a/block/bfq-wf2q.c ++++ b/block/bfq-wf2q.c +@@ -493,7 +493,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, + bfq_update_active_tree(node); + + if (bfqq) +- list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list[bfqq->actuator_idx]); + + bfq_inc_active_entities(entity); + } +diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c +index 9ac1efb053e0..4272599a3f08 100644 +--- a/block/blk-cgroup.c ++++ b/block/blk-cgroup.c +@@ -33,7 +33,6 @@ + #include "blk-cgroup.h" + #include "blk-ioprio.h" + #include "blk-throttle.h" +-#include "blk-rq-qos.h" + + /* + * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. +@@ -626,69 +625,93 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) + EXPORT_SYMBOL_GPL(__blkg_prfill_u64); + + /** +- * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update +- * @inputp: input string pointer ++ * blkg_conf_init - initialize a blkg_conf_ctx ++ * @ctx: blkg_conf_ctx to initialize ++ * @input: input string ++ * ++ * Initialize @ctx which can be used to parse blkg config input string @input. ++ * Once initialized, @ctx can be used with blkg_conf_open_bdev() and ++ * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit(). ++ */ ++void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input) ++{ ++ *ctx = (struct blkg_conf_ctx){ .input = input }; ++} ++EXPORT_SYMBOL_GPL(blkg_conf_init); ++ ++/** ++ * blkg_conf_open_bdev - parse and open bdev for per-blkg config update ++ * @ctx: blkg_conf_ctx initialized with blkg_conf_init() + * +- * Parse the device node prefix part, MAJ:MIN, of per-blkg config update +- * from @input and get and return the matching bdev. *@inputp is +- * updated to point past the device node prefix. Returns an ERR_PTR() +- * value on error. ++ * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from ++ * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is ++ * set to point past the device node prefix. 
+ * +- * Use this function iff blkg_conf_prep() can't be used for some reason. ++ * This function may be called multiple times on @ctx and the extra calls become ++ * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function ++ * explicitly if bdev access is needed without resolving the blkcg / policy part ++ * of @ctx->input. Returns -errno on error. + */ +-struct block_device *blkcg_conf_open_bdev(char **inputp) ++int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) + { +- char *input = *inputp; ++ char *input = ctx->input; + unsigned int major, minor; + struct block_device *bdev; + int key_len; + ++ if (ctx->bdev) ++ return 0; ++ + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) +- return ERR_PTR(-EINVAL); ++ return -EINVAL; + + input += key_len; + if (!isspace(*input)) +- return ERR_PTR(-EINVAL); ++ return -EINVAL; + input = skip_spaces(input); + + bdev = blkdev_get_no_open(MKDEV(major, minor)); + if (!bdev) +- return ERR_PTR(-ENODEV); ++ return -ENODEV; + if (bdev_is_partition(bdev)) { + blkdev_put_no_open(bdev); +- return ERR_PTR(-ENODEV); ++ return -ENODEV; + } + +- *inputp = input; +- return bdev; ++ ctx->body = input; ++ ctx->bdev = bdev; ++ return 0; + } + + /** + * blkg_conf_prep - parse and prepare for per-blkg config update + * @blkcg: target block cgroup + * @pol: target policy +- * @input: input string +- * @ctx: blkg_conf_ctx to be filled ++ * @ctx: blkg_conf_ctx initialized with blkg_conf_init() ++ * ++ * Parse per-blkg config update from @ctx->input and initialize @ctx ++ * accordingly. On success, @ctx->body points to the part of @ctx->input ++ * following MAJ:MIN, @ctx->bdev points to the target block device and ++ * @ctx->blkg to the blkg being configured. + * +- * Parse per-blkg config update from @input and initialize @ctx with the +- * result. @ctx->blkg points to the blkg to be updated and @ctx->body the +- * part of @input following MAJ:MIN. This function returns with RCU read +- * lock and queue lock held and must be paired with blkg_conf_finish(). ++ * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this ++ * function returns with queue lock held and must be followed by ++ * blkg_conf_exit(). + */ + int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, +- char *input, struct blkg_conf_ctx *ctx) +- __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock) ++ struct blkg_conf_ctx *ctx) ++ __acquires(&bdev->bd_queue->queue_lock) + { +- struct block_device *bdev; + struct gendisk *disk; + struct request_queue *q; + struct blkcg_gq *blkg; + int ret; + +- bdev = blkcg_conf_open_bdev(&input); +- if (IS_ERR(bdev)) +- return PTR_ERR(bdev); +- disk = bdev->bd_disk; ++ ret = blkg_conf_open_bdev(ctx); ++ if (ret) ++ return ret; ++ ++ disk = ctx->bdev->bd_disk; + q = disk->queue; + + /* +@@ -699,7 +722,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + if (ret) + goto fail; + +- rcu_read_lock(); + spin_lock_irq(&q->queue_lock); + + if (!blkcg_policy_enabled(q, pol)) { +@@ -728,7 +750,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + + /* Drop locks to do new blkg allocation with GFP_KERNEL. 
*/ + spin_unlock_irq(&q->queue_lock); +- rcu_read_unlock(); + + new_blkg = blkg_alloc(pos, disk, GFP_KERNEL); + if (unlikely(!new_blkg)) { +@@ -742,7 +763,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + goto fail_exit_queue; + } + +- rcu_read_lock(); + spin_lock_irq(&q->queue_lock); + + if (!blkcg_policy_enabled(q, pol)) { +@@ -769,20 +789,16 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + } + success: + blk_queue_exit(q); +- ctx->bdev = bdev; + ctx->blkg = blkg; +- ctx->body = input; + return 0; + + fail_preloaded: + radix_tree_preload_end(); + fail_unlock: + spin_unlock_irq(&q->queue_lock); +- rcu_read_unlock(); + fail_exit_queue: + blk_queue_exit(q); + fail: +- blkdev_put_no_open(bdev); + /* + * If queue was bypassing, we should retry. Do so after a + * short msleep(). It isn't strictly necessary but queue +@@ -798,20 +814,27 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + EXPORT_SYMBOL_GPL(blkg_conf_prep); + + /** +- * blkg_conf_finish - finish up per-blkg config update +- * @ctx: blkg_conf_ctx initialized by blkg_conf_prep() ++ * blkg_conf_exit - clean up per-blkg config update ++ * @ctx: blkg_conf_ctx initialized with blkg_conf_init() + * +- * Finish up after per-blkg config update. This function must be paired +- * with blkg_conf_prep(). ++ * Clean up after per-blkg config update. This function must be called on all ++ * blkg_conf_ctx's initialized with blkg_conf_init(). + */ +-void blkg_conf_finish(struct blkg_conf_ctx *ctx) +- __releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu) ++void blkg_conf_exit(struct blkg_conf_ctx *ctx) ++ __releases(&ctx->bdev->bd_queue->queue_lock) + { +- spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); +- rcu_read_unlock(); +- blkdev_put_no_open(ctx->bdev); ++ if (ctx->blkg) { ++ spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); ++ ctx->blkg = NULL; ++ } ++ ++ if (ctx->bdev) { ++ blkdev_put_no_open(ctx->bdev); ++ ctx->body = NULL; ++ ctx->bdev = NULL; ++ } + } +-EXPORT_SYMBOL_GPL(blkg_conf_finish); ++EXPORT_SYMBOL_GPL(blkg_conf_exit); + + static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) + { +@@ -1300,14 +1323,8 @@ int blkcg_init_disk(struct gendisk *disk) + if (ret) + goto err_ioprio_exit; + +- ret = blk_iolatency_init(disk); +- if (ret) +- goto err_throtl_exit; +- + return 0; + +-err_throtl_exit: +- blk_throtl_exit(disk); + err_ioprio_exit: + blk_ioprio_exit(disk); + err_destroy_all: +@@ -1323,7 +1340,6 @@ int blkcg_init_disk(struct gendisk *disk) + void blkcg_exit_disk(struct gendisk *disk) + { + blkg_destroy_all(disk); +- rq_qos_exit(disk->queue); + blk_throtl_exit(disk); + } + +diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h +index 1e94e404eaa8..fe09e8b4c2a8 100644 +--- a/block/blk-cgroup.h ++++ b/block/blk-cgroup.h +@@ -208,15 +208,17 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, + u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); + + struct blkg_conf_ctx { ++ char *input; ++ char *body; + struct block_device *bdev; + struct blkcg_gq *blkg; +- char *body; + }; + +-struct block_device *blkcg_conf_open_bdev(char **inputp); ++void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); ++int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); + int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, +- char *input, struct blkg_conf_ctx *ctx); +-void blkg_conf_finish(struct blkg_conf_ctx *ctx); ++ struct blkg_conf_ctx *ctx); ++void 
blkg_conf_exit(struct blkg_conf_ctx *ctx); + + /** + * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg +diff --git a/block/blk-iocost.c b/block/blk-iocost.c +index 6955605629e4..22a3639a7a05 100644 +--- a/block/blk-iocost.c ++++ b/block/blk-iocost.c +@@ -3091,9 +3091,11 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, + return nbytes; + } + +- ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx); ++ blkg_conf_init(&ctx, buf); ++ ++ ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx); + if (ret) +- return ret; ++ goto err; + + iocg = blkg_to_iocg(ctx.blkg); + +@@ -3112,12 +3114,14 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, + weight_updated(iocg, &now); + spin_unlock(&iocg->ioc->lock); + +- blkg_conf_finish(&ctx); ++ blkg_conf_exit(&ctx); + return nbytes; + + einval: +- blkg_conf_finish(&ctx); +- return -EINVAL; ++ ret = -EINVAL; ++err: ++ blkg_conf_exit(&ctx); ++ return ret; + } + + static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, +@@ -3172,19 +3176,22 @@ static const match_table_t qos_tokens = { + static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, + size_t nbytes, loff_t off) + { +- struct block_device *bdev; ++ struct blkg_conf_ctx ctx; + struct gendisk *disk; + struct ioc *ioc; + u32 qos[NR_QOS_PARAMS]; + bool enable, user; +- char *p; ++ char *body, *p; + int ret; + +- bdev = blkcg_conf_open_bdev(&input); +- if (IS_ERR(bdev)) +- return PTR_ERR(bdev); ++ blkg_conf_init(&ctx, input); + +- disk = bdev->bd_disk; ++ ret = blkg_conf_open_bdev(&ctx); ++ if (ret) ++ goto err; ++ ++ body = ctx.body; ++ disk = ctx.bdev->bd_disk; + ioc = q_to_ioc(disk->queue); + if (!ioc) { + ret = blk_iocost_init(disk); +@@ -3201,7 +3208,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, + enable = ioc->enabled; + user = ioc->user_qos_params; + +- while ((p = strsep(&input, " \t\n"))) { ++ while ((p = strsep(&body, " \t\n"))) { + substring_t args[MAX_OPT_ARGS]; + char buf[32]; + int tok; +@@ -3290,7 +3297,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, + blk_mq_unquiesce_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue); + +- blkdev_put_no_open(bdev); ++ blkg_conf_exit(&ctx); + return nbytes; + einval: + spin_unlock_irq(&ioc->lock); +@@ -3300,7 +3307,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, + + ret = -EINVAL; + err: +- blkdev_put_no_open(bdev); ++ blkg_conf_exit(&ctx); + return ret; + } + +@@ -3351,22 +3358,25 @@ static const match_table_t i_lcoef_tokens = { + static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, + size_t nbytes, loff_t off) + { +- struct block_device *bdev; ++ struct blkg_conf_ctx ctx; + struct request_queue *q; + struct ioc *ioc; + u64 u[NR_I_LCOEFS]; + bool user; +- char *p; ++ char *body, *p; + int ret; + +- bdev = blkcg_conf_open_bdev(&input); +- if (IS_ERR(bdev)) +- return PTR_ERR(bdev); ++ blkg_conf_init(&ctx, input); ++ ++ ret = blkg_conf_open_bdev(&ctx); ++ if (ret) ++ goto err; + +- q = bdev_get_queue(bdev); ++ body = ctx.body; ++ q = bdev_get_queue(ctx.bdev); + ioc = q_to_ioc(q); + if (!ioc) { +- ret = blk_iocost_init(bdev->bd_disk); ++ ret = blk_iocost_init(ctx.bdev->bd_disk); + if (ret) + goto err; + ioc = q_to_ioc(q); +@@ -3379,7 +3389,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, + memcpy(u, ioc->params.i_lcoefs, sizeof(u)); + user = ioc->user_cost_model; + +- while ((p = strsep(&input, " 
\t\n"))) { ++ while ((p = strsep(&body, " \t\n"))) { + substring_t args[MAX_OPT_ARGS]; + char buf[32]; + int tok; +@@ -3426,7 +3436,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + +- blkdev_put_no_open(bdev); ++ blkg_conf_exit(&ctx); + return nbytes; + + einval: +@@ -3437,7 +3447,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, + + ret = -EINVAL; + err: +- blkdev_put_no_open(bdev); ++ blkg_conf_exit(&ctx); + return ret; + } + +diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c +index ecdc10741836..3484393dbc4a 100644 +--- a/block/blk-iolatency.c ++++ b/block/blk-iolatency.c +@@ -755,7 +755,7 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) + } + } + +-int blk_iolatency_init(struct gendisk *disk) ++static int blk_iolatency_init(struct gendisk *disk) + { + struct request_queue *q = disk->queue; + struct blk_iolatency *blkiolat; +@@ -830,6 +830,29 @@ static void iolatency_clear_scaling(struct blkcg_gq *blkg) + } + } + ++static int blk_iolatency_try_init(struct blkg_conf_ctx *ctx) ++{ ++ static DEFINE_MUTEX(init_mutex); ++ int ret; ++ ++ ret = blkg_conf_open_bdev(ctx); ++ if (ret) ++ return ret; ++ ++ /* ++ * blk_iolatency_init() may fail after rq_qos_add() succeeds which can ++ * confuse iolat_rq_qos() test. Make the test and init atomic. ++ */ ++ mutex_lock(&init_mutex); ++ ++ if (!iolat_rq_qos(ctx->bdev->bd_queue)) ++ ret = blk_iolatency_init(ctx->bdev->bd_disk); ++ ++ mutex_unlock(&init_mutex); ++ ++ return ret; ++} ++ + static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) + { +@@ -842,9 +865,15 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, + u64 oldval; + int ret; + +- ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx); ++ blkg_conf_init(&ctx, buf); ++ ++ ret = blk_iolatency_try_init(&ctx); + if (ret) +- return ret; ++ goto out; ++ ++ ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, &ctx); ++ if (ret) ++ goto out; + + iolat = blkg_to_lat(ctx.blkg); + p = ctx.body; +@@ -880,7 +909,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, + iolatency_clear_scaling(blkg); + ret = 0; + out: +- blkg_conf_finish(&ctx); ++ blkg_conf_exit(&ctx); + return ret ?: nbytes; + } + +@@ -974,7 +1003,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) + { + struct iolatency_grp *iolat = pd_to_lat(pd); + struct blkcg_gq *blkg = lat_to_blkg(iolat); +- struct rq_qos *rqos = blkcg_rq_qos(blkg->q); ++ struct rq_qos *rqos = iolat_rq_qos(blkg->q); + struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); + u64 now = ktime_to_ns(ktime_get()); + int cpu; +diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h +index 1ef1f7d4bc3c..27f004fae66b 100644 +--- a/block/blk-rq-qos.h ++++ b/block/blk-rq-qos.h +@@ -74,7 +74,7 @@ static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) + return rq_qos_id(q, RQ_QOS_WBT); + } + +-static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) ++static inline struct rq_qos *iolat_rq_qos(struct request_queue *q) + { + return rq_qos_id(q, RQ_QOS_LATENCY); + } +diff --git a/block/blk-throttle.c b/block/blk-throttle.c +index 6fb5a2f9e1ee..75841d1d9bf4 100644 +--- a/block/blk-throttle.c ++++ b/block/blk-throttle.c +@@ -1369,9 +1369,11 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, + int ret; + u64 v; + +- ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); ++ 
blkg_conf_init(&ctx, buf); ++ ++ ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); + if (ret) +- return ret; ++ goto out_finish; + + ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) != 1) +@@ -1390,7 +1392,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, + tg_conf_updated(tg, false); + ret = 0; + out_finish: +- blkg_conf_finish(&ctx); ++ blkg_conf_exit(&ctx); + return ret ?: nbytes; + } + +@@ -1562,9 +1564,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, + int ret; + int index = of_cft(of)->private; + +- ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); ++ blkg_conf_init(&ctx, buf); ++ ++ ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); + if (ret) +- return ret; ++ goto out_finish; + + tg = blkg_to_tg(ctx.blkg); + tg_update_carryover(tg); +@@ -1663,7 +1667,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, + tg->td->limit_valid[LIMIT_LOW]); + ret = 0; + out_finish: +- blkg_conf_finish(&ctx); ++ blkg_conf_exit(&ctx); + return ret ?: nbytes; + } + +diff --git a/block/blk.h b/block/blk.h +index 4c3b3325219a..78f1706cddca 100644 +--- a/block/blk.h ++++ b/block/blk.h +@@ -392,12 +392,6 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, + return bio; + } + +-#ifdef CONFIG_BLK_CGROUP_IOLATENCY +-int blk_iolatency_init(struct gendisk *disk); +-#else +-static inline int blk_iolatency_init(struct gendisk *disk) { return 0; }; +-#endif +- + #ifdef CONFIG_BLK_DEV_ZONED + void disk_free_zone_bitmaps(struct gendisk *disk); + void disk_clear_zone_settings(struct gendisk *disk); +-- +2.39.2 + +From f5846f885c52570685c30c97eae68dbebe7639b3 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 13 Feb 2023 11:26:20 +0100 +Subject: [PATCH 03/15] bitmap + +Signed-off-by: Peter Jung +--- + include/linux/bitmap.h | 46 ++++++------- + include/linux/cpumask.h | 144 +++++++++++++++++++-------------------- + include/linux/find.h | 40 +++++------ + include/linux/nodemask.h | 86 +++++++++++------------ + 4 files changed, 158 insertions(+), 158 deletions(-) + +diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h +index 7d6d73b78147..40e53a2ecc0d 100644 +--- a/include/linux/bitmap.h ++++ b/include/linux/bitmap.h +@@ -189,7 +189,7 @@ unsigned long bitmap_find_next_zero_area_off(unsigned long *map, + * the bit offset of all zero areas this function finds is multiples of that + * power of 2. A @align_mask of 0 means no alignment is required. 
+ */ +-static inline unsigned long ++static __always_inline unsigned long + bitmap_find_next_zero_area(unsigned long *map, + unsigned long size, + unsigned long start, +@@ -237,7 +237,7 @@ extern int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp, + #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) + #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) + +-static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) ++static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits) + { + unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + +@@ -247,7 +247,7 @@ static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) + memset(dst, 0, len); + } + +-static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) ++static __always_inline void bitmap_fill(unsigned long *dst, unsigned int nbits) + { + unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + +@@ -257,7 +257,7 @@ static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) + memset(dst, 0xff, len); + } + +-static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, ++static __always_inline void bitmap_copy(unsigned long *dst, const unsigned long *src, + unsigned int nbits) + { + unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); +@@ -271,7 +271,7 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, + /* + * Copy bitmap and clear tail bits in last word. + */ +-static inline void bitmap_copy_clear_tail(unsigned long *dst, ++static __always_inline void bitmap_copy_clear_tail(unsigned long *dst, + const unsigned long *src, unsigned int nbits) + { + bitmap_copy(dst, src, nbits); +@@ -317,7 +317,7 @@ void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits); + bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits)) + #endif + +-static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, ++static __always_inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -325,7 +325,7 @@ static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, + return __bitmap_and(dst, src1, src2, nbits); + } + +-static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, ++static __always_inline void bitmap_or(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -334,7 +334,7 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, + __bitmap_or(dst, src1, src2, nbits); + } + +-static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, ++static __always_inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -343,7 +343,7 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, + __bitmap_xor(dst, src1, src2, nbits); + } + +-static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, ++static __always_inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -351,7 +351,7 @@ static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, + return __bitmap_andnot(dst, 
src1, src2, nbits); + } + +-static inline void bitmap_complement(unsigned long *dst, const unsigned long *src, ++static __always_inline void bitmap_complement(unsigned long *dst, const unsigned long *src, + unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -367,7 +367,7 @@ static inline void bitmap_complement(unsigned long *dst, const unsigned long *sr + #endif + #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1) + +-static inline bool bitmap_equal(const unsigned long *src1, ++static __always_inline bool bitmap_equal(const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -387,7 +387,7 @@ static inline bool bitmap_equal(const unsigned long *src1, + * + * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise + */ +-static inline bool bitmap_or_equal(const unsigned long *src1, ++static __always_inline bool bitmap_or_equal(const unsigned long *src1, + const unsigned long *src2, + const unsigned long *src3, + unsigned int nbits) +@@ -398,7 +398,7 @@ static inline bool bitmap_or_equal(const unsigned long *src1, + return !(((*src1 | *src2) ^ *src3) & BITMAP_LAST_WORD_MASK(nbits)); + } + +-static inline bool bitmap_intersects(const unsigned long *src1, ++static __always_inline bool bitmap_intersects(const unsigned long *src1, + const unsigned long *src2, + unsigned int nbits) + { +@@ -408,7 +408,7 @@ static inline bool bitmap_intersects(const unsigned long *src1, + return __bitmap_intersects(src1, src2, nbits); + } + +-static inline bool bitmap_subset(const unsigned long *src1, ++static __always_inline bool bitmap_subset(const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -417,7 +417,7 @@ static inline bool bitmap_subset(const unsigned long *src1, + return __bitmap_subset(src1, src2, nbits); + } + +-static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) ++static __always_inline bool bitmap_empty(const unsigned long *src, unsigned int nbits) + { + if (small_const_nbits(nbits)) + return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); +@@ -425,7 +425,7 @@ static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) + return find_first_bit(src, nbits) == nbits; + } + +-static inline bool bitmap_full(const unsigned long *src, unsigned int nbits) ++static __always_inline bool bitmap_full(const unsigned long *src, unsigned int nbits) + { + if (small_const_nbits(nbits)) + return ! 
(~(*src) & BITMAP_LAST_WORD_MASK(nbits)); +@@ -482,7 +482,7 @@ static __always_inline void bitmap_clear(unsigned long *map, unsigned int start, + __bitmap_clear(map, start, nbits); + } + +-static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, ++static __always_inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -491,7 +491,7 @@ static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *s + __bitmap_shift_right(dst, src, shift, nbits); + } + +-static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, ++static __always_inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -500,7 +500,7 @@ static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *sr + __bitmap_shift_left(dst, src, shift, nbits); + } + +-static inline void bitmap_replace(unsigned long *dst, ++static __always_inline void bitmap_replace(unsigned long *dst, + const unsigned long *old, + const unsigned long *new, + const unsigned long *mask, +@@ -512,7 +512,7 @@ static inline void bitmap_replace(unsigned long *dst, + __bitmap_replace(dst, old, new, mask, nbits); + } + +-static inline void bitmap_next_set_region(unsigned long *bitmap, ++static __always_inline void bitmap_next_set_region(unsigned long *bitmap, + unsigned int *rs, unsigned int *re, + unsigned int end) + { +@@ -563,7 +563,7 @@ static inline void bitmap_next_set_region(unsigned long *bitmap, + * That is ``(u32 *)(&val)[0]`` gets the upper 32 bits, + * but we expect the lower 32-bits of u64. + */ +-static inline void bitmap_from_u64(unsigned long *dst, u64 mask) ++static __always_inline void bitmap_from_u64(unsigned long *dst, u64 mask) + { + bitmap_from_arr64(dst, &mask, 64); + } +@@ -576,7 +576,7 @@ static inline void bitmap_from_u64(unsigned long *dst, u64 mask) + * Returns the 8-bit value located at the @start bit offset within the @src + * memory region. + */ +-static inline unsigned long bitmap_get_value8(const unsigned long *map, ++static __always_inline unsigned long bitmap_get_value8(const unsigned long *map, + unsigned long start) + { + const size_t index = BIT_WORD(start); +@@ -591,7 +591,7 @@ static inline unsigned long bitmap_get_value8(const unsigned long *map, + * @value: the 8-bit value; values wider than 8 bits may clobber bitmap + * @start: bit offset of the 8-bit value; must be a multiple of 8 + */ +-static inline void bitmap_set_value8(unsigned long *map, unsigned long value, ++static __always_inline void bitmap_set_value8(unsigned long *map, unsigned long value, + unsigned long start) + { + const size_t index = BIT_WORD(start); +diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h +index c2aa0aa26b45..9543b22d6dc2 100644 +--- a/include/linux/cpumask.h ++++ b/include/linux/cpumask.h +@@ -41,7 +41,7 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; + extern unsigned int nr_cpu_ids; + #endif + +-static inline void set_nr_cpu_ids(unsigned int nr) ++static __always_inline void set_nr_cpu_ids(unsigned int nr) + { + #if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) + WARN_ON(nr != nr_cpu_ids); +@@ -124,7 +124,7 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu) + * + * Returns >= nr_cpu_ids if no cpus set. 
+ */ +-static inline unsigned int cpumask_first(const struct cpumask *srcp) ++static __always_inline unsigned int cpumask_first(const struct cpumask *srcp) + { + return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); + } +@@ -135,7 +135,7 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) + * + * Returns >= nr_cpu_ids if all cpus are set. + */ +-static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) ++static __always_inline unsigned int cpumask_first_zero(const struct cpumask *srcp) + { + return find_first_zero_bit(cpumask_bits(srcp), nr_cpumask_bits); + } +@@ -147,7 +147,7 @@ static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) + * + * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). + */ +-static inline ++static __always_inline + unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) + { + return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); +@@ -159,7 +159,7 @@ unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask + * + * Returns >= nr_cpumask_bits if no CPUs set. + */ +-static inline unsigned int cpumask_last(const struct cpumask *srcp) ++static __always_inline unsigned int cpumask_last(const struct cpumask *srcp) + { + return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits); + } +@@ -171,7 +171,7 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp) + * + * Returns >= nr_cpu_ids if no further cpus set. + */ +-static inline ++static __always_inline + unsigned int cpumask_next(int n, const struct cpumask *srcp) + { + /* -1 is a legal arg here. */ +@@ -187,7 +187,7 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp) + * + * Returns >= nr_cpu_ids if no further cpus unset. + */ +-static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) ++static __always_inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) + { + /* -1 is a legal arg here. */ + if (n != -1) +@@ -197,18 +197,18 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) + + #if NR_CPUS == 1 + /* Uniprocessor: there is only one valid CPU */ +-static inline unsigned int cpumask_local_spread(unsigned int i, int node) ++static __always_inline unsigned int cpumask_local_spread(unsigned int i, int node) + { + return 0; + } + +-static inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, ++static __always_inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, + const struct cpumask *src2p) + { + return cpumask_first_and(src1p, src2p); + } + +-static inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) ++static __always_inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) + { + return cpumask_first(srcp); + } +@@ -227,7 +227,7 @@ unsigned int cpumask_any_distribute(const struct cpumask *srcp); + * + * Returns >= nr_cpu_ids if no further cpus set in both. 
+ */ +-static inline ++static __always_inline + unsigned int cpumask_next_and(int n, const struct cpumask *src1p, + const struct cpumask *src2p) + { +@@ -259,7 +259,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, + for_each_clear_bit(cpu, cpumask_bits(mask), nr_cpumask_bits) + + #if NR_CPUS == 1 +-static inline ++static __always_inline + unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) + { + cpumask_check(start); +@@ -335,7 +335,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta + * Often used to find any cpu but smp_processor_id() in a mask. + * Returns >= nr_cpu_ids if no cpus set. + */ +-static inline ++static __always_inline + unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) + { + unsigned int i; +@@ -354,7 +354,7 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) + * + * Returns >= nr_cpu_ids if such cpu doesn't exist. + */ +-static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) ++static __always_inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) + { + return find_nth_bit(cpumask_bits(srcp), nr_cpumask_bits, cpumask_check(cpu)); + } +@@ -367,7 +367,7 @@ static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *s + * + * Returns >= nr_cpu_ids if such cpu doesn't exist. + */ +-static inline ++static __always_inline + unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, + const struct cpumask *srcp2) + { +@@ -383,7 +383,7 @@ unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, + * + * Returns >= nr_cpu_ids if such cpu doesn't exist. + */ +-static inline ++static __always_inline + unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1, + const struct cpumask *srcp2) + { +@@ -476,7 +476,7 @@ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask * + * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask + * @dstp: the cpumask pointer + */ +-static inline void cpumask_setall(struct cpumask *dstp) ++static __always_inline void cpumask_setall(struct cpumask *dstp) + { + bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); + } +@@ -485,7 +485,7 @@ static inline void cpumask_setall(struct cpumask *dstp) + * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask + * @dstp: the cpumask pointer + */ +-static inline void cpumask_clear(struct cpumask *dstp) ++static __always_inline void cpumask_clear(struct cpumask *dstp) + { + bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits); + } +@@ -498,7 +498,7 @@ static inline void cpumask_clear(struct cpumask *dstp) + * + * If *@dstp is empty, returns false, else returns true + */ +-static inline bool cpumask_and(struct cpumask *dstp, ++static __always_inline bool cpumask_and(struct cpumask *dstp, + const struct cpumask *src1p, + const struct cpumask *src2p) + { +@@ -512,7 +512,7 @@ static inline bool cpumask_and(struct cpumask *dstp, + * @src1p: the first input + * @src2p: the second input + */ +-static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, ++static __always_inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, + const struct cpumask *src2p) + { + bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), +@@ -525,7 +525,7 @@ static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, + * @src1p: the first input + * @src2p: the second input + */ +-static inline void 
cpumask_xor(struct cpumask *dstp, ++static __always_inline void cpumask_xor(struct cpumask *dstp, + const struct cpumask *src1p, + const struct cpumask *src2p) + { +@@ -541,7 +541,7 @@ static inline void cpumask_xor(struct cpumask *dstp, + * + * If *@dstp is empty, returns false, else returns true + */ +-static inline bool cpumask_andnot(struct cpumask *dstp, ++static __always_inline bool cpumask_andnot(struct cpumask *dstp, + const struct cpumask *src1p, + const struct cpumask *src2p) + { +@@ -554,7 +554,7 @@ static inline bool cpumask_andnot(struct cpumask *dstp, + * @dstp: the cpumask result + * @srcp: the input to invert + */ +-static inline void cpumask_complement(struct cpumask *dstp, ++static __always_inline void cpumask_complement(struct cpumask *dstp, + const struct cpumask *srcp) + { + bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp), +@@ -566,7 +566,7 @@ static inline void cpumask_complement(struct cpumask *dstp, + * @src1p: the first input + * @src2p: the second input + */ +-static inline bool cpumask_equal(const struct cpumask *src1p, ++static __always_inline bool cpumask_equal(const struct cpumask *src1p, + const struct cpumask *src2p) + { + return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), +@@ -579,7 +579,7 @@ static inline bool cpumask_equal(const struct cpumask *src1p, + * @src2p: the second input + * @src3p: the third input + */ +-static inline bool cpumask_or_equal(const struct cpumask *src1p, ++static __always_inline bool cpumask_or_equal(const struct cpumask *src1p, + const struct cpumask *src2p, + const struct cpumask *src3p) + { +@@ -592,7 +592,7 @@ static inline bool cpumask_or_equal(const struct cpumask *src1p, + * @src1p: the first input + * @src2p: the second input + */ +-static inline bool cpumask_intersects(const struct cpumask *src1p, ++static __always_inline bool cpumask_intersects(const struct cpumask *src1p, + const struct cpumask *src2p) + { + return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p), +@@ -606,7 +606,7 @@ static inline bool cpumask_intersects(const struct cpumask *src1p, + * + * Returns true if *@src1p is a subset of *@src2p, else returns false + */ +-static inline bool cpumask_subset(const struct cpumask *src1p, ++static __always_inline bool cpumask_subset(const struct cpumask *src1p, + const struct cpumask *src2p) + { + return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), +@@ -617,7 +617,7 @@ static inline bool cpumask_subset(const struct cpumask *src1p, + * cpumask_empty - *srcp == 0 + * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear. + */ +-static inline bool cpumask_empty(const struct cpumask *srcp) ++static __always_inline bool cpumask_empty(const struct cpumask *srcp) + { + return bitmap_empty(cpumask_bits(srcp), nr_cpumask_bits); + } +@@ -626,7 +626,7 @@ static inline bool cpumask_empty(const struct cpumask *srcp) + * cpumask_full - *srcp == 0xFFFFFFFF... + * @srcp: the cpumask to that all cpus < nr_cpu_ids are set. + */ +-static inline bool cpumask_full(const struct cpumask *srcp) ++static __always_inline bool cpumask_full(const struct cpumask *srcp) + { + return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits); + } +@@ -635,7 +635,7 @@ static inline bool cpumask_full(const struct cpumask *srcp) + * cpumask_weight - Count of bits in *srcp + * @srcp: the cpumask to count bits (< nr_cpu_ids) in. 
+ */ +-static inline unsigned int cpumask_weight(const struct cpumask *srcp) ++static __always_inline unsigned int cpumask_weight(const struct cpumask *srcp) + { + return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits); + } +@@ -645,7 +645,7 @@ static inline unsigned int cpumask_weight(const struct cpumask *srcp) + * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. + * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. + */ +-static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, ++static __always_inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, + const struct cpumask *srcp2) + { + return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); +@@ -657,7 +657,7 @@ static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, + * @srcp: the input to shift + * @n: the number of bits to shift by + */ +-static inline void cpumask_shift_right(struct cpumask *dstp, ++static __always_inline void cpumask_shift_right(struct cpumask *dstp, + const struct cpumask *srcp, int n) + { + bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, +@@ -670,7 +670,7 @@ static inline void cpumask_shift_right(struct cpumask *dstp, + * @srcp: the input to shift + * @n: the number of bits to shift by + */ +-static inline void cpumask_shift_left(struct cpumask *dstp, ++static __always_inline void cpumask_shift_left(struct cpumask *dstp, + const struct cpumask *srcp, int n) + { + bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, +@@ -682,7 +682,7 @@ static inline void cpumask_shift_left(struct cpumask *dstp, + * @dstp: the result + * @srcp: the input cpumask + */ +-static inline void cpumask_copy(struct cpumask *dstp, ++static __always_inline void cpumask_copy(struct cpumask *dstp, + const struct cpumask *srcp) + { + bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits); +@@ -719,7 +719,7 @@ static inline void cpumask_copy(struct cpumask *dstp, + * + * Returns -errno, or 0 for success. + */ +-static inline int cpumask_parse_user(const char __user *buf, int len, ++static __always_inline int cpumask_parse_user(const char __user *buf, int len, + struct cpumask *dstp) + { + return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); +@@ -733,7 +733,7 @@ static inline int cpumask_parse_user(const char __user *buf, int len, + * + * Returns -errno, or 0 for success. + */ +-static inline int cpumask_parselist_user(const char __user *buf, int len, ++static __always_inline int cpumask_parselist_user(const char __user *buf, int len, + struct cpumask *dstp) + { + return bitmap_parselist_user(buf, len, cpumask_bits(dstp), +@@ -747,7 +747,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, + * + * Returns -errno, or 0 for success. + */ +-static inline int cpumask_parse(const char *buf, struct cpumask *dstp) ++static __always_inline int cpumask_parse(const char *buf, struct cpumask *dstp) + { + return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits); + } +@@ -759,7 +759,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) + * + * Returns -errno, or 0 for success. 
+ */ +-static inline int cpulist_parse(const char *buf, struct cpumask *dstp) ++static __always_inline int cpulist_parse(const char *buf, struct cpumask *dstp) + { + return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); + } +@@ -767,7 +767,7 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) + /** + * cpumask_size - size to allocate for a 'struct cpumask' in bytes + */ +-static inline unsigned int cpumask_size(void) ++static __always_inline unsigned int cpumask_size(void) + { + return BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long); + } +@@ -820,7 +820,7 @@ typedef struct cpumask *cpumask_var_t; + + bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); + +-static inline ++static __always_inline + bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) + { + return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node); +@@ -836,13 +836,13 @@ bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) + * + * See alloc_cpumask_var_node. + */ +-static inline ++static __always_inline + bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) + { + return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE); + } + +-static inline ++static __always_inline + bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) + { + return alloc_cpumask_var(mask, flags | __GFP_ZERO); +@@ -852,7 +852,7 @@ void alloc_bootmem_cpumask_var(cpumask_var_t *mask); + void free_cpumask_var(cpumask_var_t mask); + void free_bootmem_cpumask_var(cpumask_var_t mask); + +-static inline bool cpumask_available(cpumask_var_t mask) ++static __always_inline bool cpumask_available(cpumask_var_t mask) + { + return mask != NULL; + } +@@ -863,43 +863,43 @@ typedef struct cpumask cpumask_var_t[1]; + #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) + #define __cpumask_var_read_mostly + +-static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) ++static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) + { + return true; + } + +-static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, ++static __always_inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, + int node) + { + return true; + } + +-static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) ++static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) + { + cpumask_clear(*mask); + return true; + } + +-static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, ++static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, + int node) + { + cpumask_clear(*mask); + return true; + } + +-static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) ++static __always_inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) + { + } + +-static inline void free_cpumask_var(cpumask_var_t mask) ++static __always_inline void free_cpumask_var(cpumask_var_t mask) + { + } + +-static inline void free_bootmem_cpumask_var(cpumask_var_t mask) ++static __always_inline void free_bootmem_cpumask_var(cpumask_var_t mask) + { + } + +-static inline bool cpumask_available(cpumask_var_t mask) ++static __always_inline bool cpumask_available(cpumask_var_t mask) + { + return true; + } +@@ -929,12 +929,12 @@ void init_cpu_present(const struct cpumask *src); + void init_cpu_possible(const struct cpumask *src); + void init_cpu_online(const struct cpumask *src); + +-static inline void reset_cpu_possible_mask(void) ++static __always_inline void reset_cpu_possible_mask(void) + 
{ + bitmap_zero(cpumask_bits(&__cpu_possible_mask), NR_CPUS); + } + +-static inline void ++static __always_inline void + set_cpu_possible(unsigned int cpu, bool possible) + { + if (possible) +@@ -943,7 +943,7 @@ set_cpu_possible(unsigned int cpu, bool possible) + cpumask_clear_cpu(cpu, &__cpu_possible_mask); + } + +-static inline void ++static __always_inline void + set_cpu_present(unsigned int cpu, bool present) + { + if (present) +@@ -954,7 +954,7 @@ set_cpu_present(unsigned int cpu, bool present) + + void set_cpu_online(unsigned int cpu, bool online); + +-static inline void ++static __always_inline void + set_cpu_active(unsigned int cpu, bool active) + { + if (active) +@@ -963,7 +963,7 @@ set_cpu_active(unsigned int cpu, bool active) + cpumask_clear_cpu(cpu, &__cpu_active_mask); + } + +-static inline void ++static __always_inline void + set_cpu_dying(unsigned int cpu, bool dying) + { + if (dying) +@@ -986,7 +986,7 @@ set_cpu_dying(unsigned int cpu, bool dying) + ((struct cpumask *)(1 ? (bitmap) \ + : (void *)sizeof(__check_is_bitmap(bitmap)))) + +-static inline int __check_is_bitmap(const unsigned long *bitmap) ++static __always_inline int __check_is_bitmap(const unsigned long *bitmap) + { + return 1; + } +@@ -1001,7 +1001,7 @@ static inline int __check_is_bitmap(const unsigned long *bitmap) + extern const unsigned long + cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; + +-static inline const struct cpumask *get_cpu_mask(unsigned int cpu) ++static __always_inline const struct cpumask *get_cpu_mask(unsigned int cpu) + { + const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; + p -= cpu / BITS_PER_LONG; +@@ -1017,7 +1017,7 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu) + * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held + * region. 
+ */ +-static inline unsigned int num_online_cpus(void) ++static __always_inline unsigned int num_online_cpus(void) + { + return atomic_read(&__num_online_cpus); + } +@@ -1025,27 +1025,27 @@ static inline unsigned int num_online_cpus(void) + #define num_present_cpus() cpumask_weight(cpu_present_mask) + #define num_active_cpus() cpumask_weight(cpu_active_mask) + +-static inline bool cpu_online(unsigned int cpu) ++static __always_inline bool cpu_online(unsigned int cpu) + { + return cpumask_test_cpu(cpu, cpu_online_mask); + } + +-static inline bool cpu_possible(unsigned int cpu) ++static __always_inline bool cpu_possible(unsigned int cpu) + { + return cpumask_test_cpu(cpu, cpu_possible_mask); + } + +-static inline bool cpu_present(unsigned int cpu) ++static __always_inline bool cpu_present(unsigned int cpu) + { + return cpumask_test_cpu(cpu, cpu_present_mask); + } + +-static inline bool cpu_active(unsigned int cpu) ++static __always_inline bool cpu_active(unsigned int cpu) + { + return cpumask_test_cpu(cpu, cpu_active_mask); + } + +-static inline bool cpu_dying(unsigned int cpu) ++static __always_inline bool cpu_dying(unsigned int cpu) + { + return cpumask_test_cpu(cpu, cpu_dying_mask); + } +@@ -1057,27 +1057,27 @@ static inline bool cpu_dying(unsigned int cpu) + #define num_present_cpus() 1U + #define num_active_cpus() 1U + +-static inline bool cpu_online(unsigned int cpu) ++static __always_inline bool cpu_online(unsigned int cpu) + { + return cpu == 0; + } + +-static inline bool cpu_possible(unsigned int cpu) ++static __always_inline bool cpu_possible(unsigned int cpu) + { + return cpu == 0; + } + +-static inline bool cpu_present(unsigned int cpu) ++static __always_inline bool cpu_present(unsigned int cpu) + { + return cpu == 0; + } + +-static inline bool cpu_active(unsigned int cpu) ++static __always_inline bool cpu_active(unsigned int cpu) + { + return cpu == 0; + } + +-static inline bool cpu_dying(unsigned int cpu) ++static __always_inline bool cpu_dying(unsigned int cpu) + { + return false; + } +@@ -1111,7 +1111,7 @@ static inline bool cpu_dying(unsigned int cpu) + * Returns the length of the (null-terminated) @buf string, zero if + * nothing is copied. + */ +-static inline ssize_t ++static __always_inline ssize_t + cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) + { + return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask), +@@ -1134,7 +1134,7 @@ cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) + * Returns the length of how many bytes have been copied, excluding + * terminating '\0'. + */ +-static inline ssize_t ++static __always_inline ssize_t + cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, + loff_t off, size_t count) + { +@@ -1149,7 +1149,7 @@ cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, + * Everything is same with the above cpumap_print_bitmask_to_buf() + * except the print format. + */ +-static inline ssize_t ++static __always_inline ssize_t + cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, + loff_t off, size_t count) + { +diff --git a/include/linux/find.h b/include/linux/find.h +index ccaf61a0f5fd..db2f2851601d 100644 +--- a/include/linux/find.h ++++ b/include/linux/find.h +@@ -45,7 +45,7 @@ unsigned long _find_next_bit_le(const unsigned long *addr, unsigned + * Returns the bit number for the next set bit + * If no bits are set, returns @size. 
+ */ +-static inline ++static __always_inline + unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) + { +@@ -74,7 +74,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_next_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size, + unsigned long offset) +@@ -105,7 +105,7 @@ unsigned long find_next_and_bit(const unsigned long *addr1, + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_next_andnot_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size, + unsigned long offset) +@@ -134,7 +134,7 @@ unsigned long find_next_andnot_bit(const unsigned long *addr1, + * Returns the bit number of the next zero bit + * If no bits are zero, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) + { +@@ -161,7 +161,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, + * Returns the bit number of the first set bit. + * If no bits are set, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_first_bit(const unsigned long *addr, unsigned long size) + { + if (small_const_nbits(size)) { +@@ -187,7 +187,7 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) + * Returns the bit number of the N'th set bit. + * If no such, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) + { + if (n >= size) +@@ -212,7 +212,7 @@ unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsign + * Returns the bit number of the N'th set bit. + * If no such, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long size, unsigned long n) + { +@@ -239,7 +239,7 @@ unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long * + * Returns the bit number of the N'th set bit. + * If no such, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long size, unsigned long n) + { +@@ -265,7 +265,7 @@ unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned lon + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size) +@@ -289,7 +289,7 @@ unsigned long find_first_and_bit(const unsigned long *addr1, + * Returns the bit number of the first cleared bit. + * If no bits are zero, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) + { + if (small_const_nbits(size)) { +@@ -310,7 +310,7 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) + * + * Returns the bit number of the last set bit, or size. 
+ */ +-static inline ++static __always_inline + unsigned long find_last_bit(const unsigned long *addr, unsigned long size) + { + if (small_const_nbits(size)) { +@@ -333,7 +333,7 @@ unsigned long find_last_bit(const unsigned long *addr, unsigned long size) + * Returns the bit number for the next set bit, or first set bit up to @offset + * If no bits are set, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_next_and_bit_wrap(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size, unsigned long offset) +@@ -356,7 +356,7 @@ unsigned long find_next_and_bit_wrap(const unsigned long *addr1, + * Returns the bit number for the next set bit, or first set bit up to @offset + * If no bits are set, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_next_bit_wrap(const unsigned long *addr, + unsigned long size, unsigned long offset) + { +@@ -373,7 +373,7 @@ unsigned long find_next_bit_wrap(const unsigned long *addr, + * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing + * before using it alone. + */ +-static inline ++static __always_inline + unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, + unsigned long start, unsigned long n) + { +@@ -414,19 +414,19 @@ extern unsigned long find_next_clump8(unsigned long *clump, + + #if defined(__LITTLE_ENDIAN) + +-static inline unsigned long find_next_zero_bit_le(const void *addr, ++static __always_inline unsigned long find_next_zero_bit_le(const void *addr, + unsigned long size, unsigned long offset) + { + return find_next_zero_bit(addr, size, offset); + } + +-static inline unsigned long find_next_bit_le(const void *addr, ++static __always_inline unsigned long find_next_bit_le(const void *addr, + unsigned long size, unsigned long offset) + { + return find_next_bit(addr, size, offset); + } + +-static inline unsigned long find_first_zero_bit_le(const void *addr, ++static __always_inline unsigned long find_first_zero_bit_le(const void *addr, + unsigned long size) + { + return find_first_zero_bit(addr, size); +@@ -435,7 +435,7 @@ static inline unsigned long find_first_zero_bit_le(const void *addr, + #elif defined(__BIG_ENDIAN) + + #ifndef find_next_zero_bit_le +-static inline ++static __always_inline + unsigned long find_next_zero_bit_le(const void *addr, unsigned + long size, unsigned long offset) + { +@@ -454,7 +454,7 @@ unsigned long find_next_zero_bit_le(const void *addr, unsigned + #endif + + #ifndef find_first_zero_bit_le +-static inline ++static __always_inline + unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) + { + if (small_const_nbits(size)) { +@@ -468,7 +468,7 @@ unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) + #endif + + #ifndef find_next_bit_le +-static inline ++static __always_inline + unsigned long find_next_bit_le(const void *addr, unsigned + long size, unsigned long offset) + { +diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h +index bb0ee80526b2..8c04254c5284 100644 +--- a/include/linux/nodemask.h ++++ b/include/linux/nodemask.h +@@ -107,11 +107,11 @@ extern nodemask_t _unused_nodemask_arg_; + */ + #define nodemask_pr_args(maskp) __nodemask_pr_numnodes(maskp), \ + __nodemask_pr_bits(maskp) +-static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) ++static __always_inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) + { + return m ? 
MAX_NUMNODES : 0; + } +-static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) ++static __always_inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) + { + return m ? m->bits : NULL; + } +@@ -132,19 +132,19 @@ static __always_inline void __node_set(int node, volatile nodemask_t *dstp) + } + + #define node_clear(node, dst) __node_clear((node), &(dst)) +-static inline void __node_clear(int node, volatile nodemask_t *dstp) ++static __always_inline void __node_clear(int node, volatile nodemask_t *dstp) + { + clear_bit(node, dstp->bits); + } + + #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) +-static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) ++static __always_inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) + { + bitmap_fill(dstp->bits, nbits); + } + + #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) +-static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) ++static __always_inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) + { + bitmap_zero(dstp->bits, nbits); + } +@@ -154,14 +154,14 @@ static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) + + #define node_test_and_set(node, nodemask) \ + __node_test_and_set((node), &(nodemask)) +-static inline bool __node_test_and_set(int node, nodemask_t *addr) ++static __always_inline bool __node_test_and_set(int node, nodemask_t *addr) + { + return test_and_set_bit(node, addr->bits); + } + + #define nodes_and(dst, src1, src2) \ + __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) +-static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, ++static __always_inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, + const nodemask_t *src2p, unsigned int nbits) + { + bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); +@@ -169,7 +169,7 @@ static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, + + #define nodes_or(dst, src1, src2) \ + __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) +-static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, ++static __always_inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, + const nodemask_t *src2p, unsigned int nbits) + { + bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); +@@ -177,7 +177,7 @@ static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, + + #define nodes_xor(dst, src1, src2) \ + __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) +-static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, ++static __always_inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, + const nodemask_t *src2p, unsigned int nbits) + { + bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); +@@ -185,7 +185,7 @@ static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, + + #define nodes_andnot(dst, src1, src2) \ + __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) +-static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, ++static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, + const nodemask_t *src2p, unsigned int nbits) + { + bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); +@@ -193,7 +193,7 @@ static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, + + #define nodes_complement(dst, src) \ + __nodes_complement(&(dst), &(src), MAX_NUMNODES) +-static inline void __nodes_complement(nodemask_t *dstp, ++static __always_inline void 
__nodes_complement(nodemask_t *dstp, + const nodemask_t *srcp, unsigned int nbits) + { + bitmap_complement(dstp->bits, srcp->bits, nbits); +@@ -201,7 +201,7 @@ static inline void __nodes_complement(nodemask_t *dstp, + + #define nodes_equal(src1, src2) \ + __nodes_equal(&(src1), &(src2), MAX_NUMNODES) +-static inline bool __nodes_equal(const nodemask_t *src1p, ++static __always_inline bool __nodes_equal(const nodemask_t *src1p, + const nodemask_t *src2p, unsigned int nbits) + { + return bitmap_equal(src1p->bits, src2p->bits, nbits); +@@ -209,7 +209,7 @@ static inline bool __nodes_equal(const nodemask_t *src1p, + + #define nodes_intersects(src1, src2) \ + __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) +-static inline bool __nodes_intersects(const nodemask_t *src1p, ++static __always_inline bool __nodes_intersects(const nodemask_t *src1p, + const nodemask_t *src2p, unsigned int nbits) + { + return bitmap_intersects(src1p->bits, src2p->bits, nbits); +@@ -217,33 +217,33 @@ static inline bool __nodes_intersects(const nodemask_t *src1p, + + #define nodes_subset(src1, src2) \ + __nodes_subset(&(src1), &(src2), MAX_NUMNODES) +-static inline bool __nodes_subset(const nodemask_t *src1p, ++static __always_inline bool __nodes_subset(const nodemask_t *src1p, + const nodemask_t *src2p, unsigned int nbits) + { + return bitmap_subset(src1p->bits, src2p->bits, nbits); + } + + #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) +-static inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) ++static __always_inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) + { + return bitmap_empty(srcp->bits, nbits); + } + + #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) +-static inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) ++static __always_inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) + { + return bitmap_full(srcp->bits, nbits); + } + + #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) +-static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) ++static __always_inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) + { + return bitmap_weight(srcp->bits, nbits); + } + + #define nodes_shift_right(dst, src, n) \ + __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES) +-static inline void __nodes_shift_right(nodemask_t *dstp, ++static __always_inline void __nodes_shift_right(nodemask_t *dstp, + const nodemask_t *srcp, int n, int nbits) + { + bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); +@@ -251,7 +251,7 @@ static inline void __nodes_shift_right(nodemask_t *dstp, + + #define nodes_shift_left(dst, src, n) \ + __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES) +-static inline void __nodes_shift_left(nodemask_t *dstp, ++static __always_inline void __nodes_shift_left(nodemask_t *dstp, + const nodemask_t *srcp, int n, int nbits) + { + bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); +@@ -261,13 +261,13 @@ static inline void __nodes_shift_left(nodemask_t *dstp, + > MAX_NUMNODES, then the silly min_ts could be dropped. 
*/ + + #define first_node(src) __first_node(&(src)) +-static inline unsigned int __first_node(const nodemask_t *srcp) ++static __always_inline unsigned int __first_node(const nodemask_t *srcp) + { + return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); + } + + #define next_node(n, src) __next_node((n), &(src)) +-static inline unsigned int __next_node(int n, const nodemask_t *srcp) ++static __always_inline unsigned int __next_node(int n, const nodemask_t *srcp) + { + return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); + } +@@ -277,7 +277,7 @@ static inline unsigned int __next_node(int n, const nodemask_t *srcp) + * the first node in src if needed. Returns MAX_NUMNODES if src is empty. + */ + #define next_node_in(n, src) __next_node_in((n), &(src)) +-static inline unsigned int __next_node_in(int node, const nodemask_t *srcp) ++static __always_inline unsigned int __next_node_in(int node, const nodemask_t *srcp) + { + unsigned int ret = __next_node(node, srcp); + +@@ -286,7 +286,7 @@ static inline unsigned int __next_node_in(int node, const nodemask_t *srcp) + return ret; + } + +-static inline void init_nodemask_of_node(nodemask_t *mask, int node) ++static __always_inline void init_nodemask_of_node(nodemask_t *mask, int node) + { + nodes_clear(*mask); + node_set(node, *mask); +@@ -304,7 +304,7 @@ static inline void init_nodemask_of_node(nodemask_t *mask, int node) + }) + + #define first_unset_node(mask) __first_unset_node(&(mask)) +-static inline unsigned int __first_unset_node(const nodemask_t *maskp) ++static __always_inline unsigned int __first_unset_node(const nodemask_t *maskp) + { + return min_t(unsigned int, MAX_NUMNODES, + find_first_zero_bit(maskp->bits, MAX_NUMNODES)); +@@ -338,21 +338,21 @@ static inline unsigned int __first_unset_node(const nodemask_t *maskp) + + #define nodemask_parse_user(ubuf, ulen, dst) \ + __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES) +-static inline int __nodemask_parse_user(const char __user *buf, int len, ++static __always_inline int __nodemask_parse_user(const char __user *buf, int len, + nodemask_t *dstp, int nbits) + { + return bitmap_parse_user(buf, len, dstp->bits, nbits); + } + + #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES) +-static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) ++static __always_inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) + { + return bitmap_parselist(buf, dstp->bits, nbits); + } + + #define node_remap(oldbit, old, new) \ + __node_remap((oldbit), &(old), &(new), MAX_NUMNODES) +-static inline int __node_remap(int oldbit, ++static __always_inline int __node_remap(int oldbit, + const nodemask_t *oldp, const nodemask_t *newp, int nbits) + { + return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits); +@@ -360,7 +360,7 @@ static inline int __node_remap(int oldbit, + + #define nodes_remap(dst, src, old, new) \ + __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES) +-static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, ++static __always_inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, + const nodemask_t *oldp, const nodemask_t *newp, int nbits) + { + bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits); +@@ -368,7 +368,7 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, + + #define nodes_onto(dst, orig, relmap) \ + __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES) +-static 
inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, ++static __always_inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, + const nodemask_t *relmapp, int nbits) + { + bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits); +@@ -376,7 +376,7 @@ static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, + + #define nodes_fold(dst, orig, sz) \ + __nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES) +-static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, ++static __always_inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, + int sz, int nbits) + { + bitmap_fold(dstp->bits, origp->bits, sz, nbits); +@@ -418,22 +418,22 @@ enum node_states { + extern nodemask_t node_states[NR_NODE_STATES]; + + #if MAX_NUMNODES > 1 +-static inline int node_state(int node, enum node_states state) ++static __always_inline int node_state(int node, enum node_states state) + { + return node_isset(node, node_states[state]); + } + +-static inline void node_set_state(int node, enum node_states state) ++static __always_inline void node_set_state(int node, enum node_states state) + { + __node_set(node, &node_states[state]); + } + +-static inline void node_clear_state(int node, enum node_states state) ++static __always_inline void node_clear_state(int node, enum node_states state) + { + __node_clear(node, &node_states[state]); + } + +-static inline int num_node_state(enum node_states state) ++static __always_inline int num_node_state(enum node_states state) + { + return nodes_weight(node_states[state]); + } +@@ -443,11 +443,11 @@ static inline int num_node_state(enum node_states state) + + #define first_online_node first_node(node_states[N_ONLINE]) + #define first_memory_node first_node(node_states[N_MEMORY]) +-static inline unsigned int next_online_node(int nid) ++static __always_inline unsigned int next_online_node(int nid) + { + return next_node(nid, node_states[N_ONLINE]); + } +-static inline unsigned int next_memory_node(int nid) ++static __always_inline unsigned int next_memory_node(int nid) + { + return next_node(nid, node_states[N_MEMORY]); + } +@@ -455,13 +455,13 @@ static inline unsigned int next_memory_node(int nid) + extern unsigned int nr_node_ids; + extern unsigned int nr_online_nodes; + +-static inline void node_set_online(int nid) ++static __always_inline void node_set_online(int nid) + { + node_set_state(nid, N_ONLINE); + nr_online_nodes = num_node_state(N_ONLINE); + } + +-static inline void node_set_offline(int nid) ++static __always_inline void node_set_offline(int nid) + { + node_clear_state(nid, N_ONLINE); + nr_online_nodes = num_node_state(N_ONLINE); +@@ -469,20 +469,20 @@ static inline void node_set_offline(int nid) + + #else + +-static inline int node_state(int node, enum node_states state) ++static __always_inline int node_state(int node, enum node_states state) + { + return node == 0; + } + +-static inline void node_set_state(int node, enum node_states state) ++static __always_inline void node_set_state(int node, enum node_states state) + { + } + +-static inline void node_clear_state(int node, enum node_states state) ++static __always_inline void node_clear_state(int node, enum node_states state) + { + } + +-static inline int num_node_state(enum node_states state) ++static __always_inline int num_node_state(enum node_states state) + { + return 1; + } +@@ -502,7 +502,7 @@ static inline int num_node_state(enum node_states state) + + #endif + +-static inline int node_random(const nodemask_t *maskp) ++static 
__always_inline int node_random(const nodemask_t *maskp) + { + #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) + int w, bit; +-- +2.39.2 + +From 0e3205aac37cde833a7cc71dd35595de9f88a5b8 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Tue, 21 Feb 2023 10:26:39 +0100 +Subject: [PATCH 04/15] cachy + +Signed-off-by: Peter Jung +--- + .gitignore | 1 + + .../admin-guide/kernel-parameters.txt | 11 +- + Documentation/dontdiff | 1 + + Makefile | 8 +- + arch/arc/configs/axs101_defconfig | 1 + + arch/arc/configs/axs103_defconfig | 1 + + arch/arc/configs/axs103_smp_defconfig | 1 + + arch/arc/configs/haps_hs_defconfig | 1 + + arch/arc/configs/haps_hs_smp_defconfig | 1 + + arch/arc/configs/hsdk_defconfig | 1 + + arch/arc/configs/nsim_700_defconfig | 1 + + arch/arc/configs/nsimosci_defconfig | 1 + + arch/arc/configs/nsimosci_hs_defconfig | 1 + + arch/arc/configs/nsimosci_hs_smp_defconfig | 1 + + arch/arc/configs/tb10x_defconfig | 1 + + arch/arc/configs/vdk_hs38_defconfig | 1 + + arch/arc/configs/vdk_hs38_smp_defconfig | 1 + + arch/x86/Kconfig.cpu | 416 ++++++++++- + arch/x86/Makefile | 45 +- + arch/x86/Makefile.postlink | 41 ++ + arch/x86/boot/compressed/.gitignore | 1 - + arch/x86/boot/compressed/Makefile | 10 +- + arch/x86/include/asm/vermagic.h | 72 ++ + drivers/Makefile | 15 +- + drivers/i2c/busses/Kconfig | 9 + + drivers/i2c/busses/Makefile | 1 + + drivers/i2c/busses/i2c-nct6775.c | 647 ++++++++++++++++++ + drivers/i2c/busses/i2c-piix4.c | 4 +- + drivers/md/dm-crypt.c | 5 + + drivers/pci/quirks.c | 101 +++ + include/linux/pagemap.h | 2 +- + include/linux/user_namespace.h | 4 + + include/net/netns/ipv4.h | 1 + + include/trace/events/tcp.h | 7 + + init/Kconfig | 39 ++ + kernel/Kconfig.hz | 24 + + kernel/fork.c | 14 + + kernel/module/Kconfig | 25 + + kernel/rcu/Kconfig | 4 +- + kernel/rcu/rcutorture.c | 2 +- + kernel/rcu/tree.c | 6 +- + kernel/rcu/tree_nocb.h | 4 +- + kernel/rcu/tree_plugin.h | 4 +- + kernel/sched/fair.c | 20 +- + kernel/sysctl.c | 12 + + kernel/user_namespace.c | 7 + + lib/string.c | 62 +- + mm/Kconfig | 2 +- + mm/compaction.c | 4 + + mm/page-writeback.c | 8 + + mm/swap.c | 5 + + mm/vmpressure.c | 4 + + mm/vmscan.c | 8 + + net/ipv4/sysctl_net_ipv4.c | 7 + + net/ipv4/tcp_input.c | 36 + + net/ipv4/tcp_ipv4.c | 2 + + scripts/Makefile.lib | 13 +- + scripts/Makefile.modinst | 7 +- + 58 files changed, 1660 insertions(+), 74 deletions(-) + create mode 100644 arch/x86/Makefile.postlink + create mode 100644 drivers/i2c/busses/i2c-nct6775.c + +diff --git a/.gitignore b/.gitignore +index 20dce5c3b9e0..466c23de56ce 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -63,6 +63,7 @@ modules.order + /vmlinux + /vmlinux.32 + /vmlinux.map ++/vmlinux.relocs + /vmlinux.symvers + /vmlinux-gdb.py + /vmlinuz +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 6cfa6e3996cf..9595abf34974 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4178,6 +4178,15 @@ + nomsi [MSI] If the PCI_MSI kernel config parameter is + enabled, this kernel boot option can be used to + disable the use of MSI interrupts system-wide. ++ pcie_acs_override = ++ [PCIE] Override missing PCIe ACS support for: ++ downstream ++ All downstream ports - full ACS capabilities ++ multfunction ++ All multifunction devices - multifunction ACS subset ++ id:nnnn:nnnn ++ Specfic device - full ACS capabilities ++ Specified as vid:did (vendor/device ID) in hex + noioapicquirk [APIC] Disable all boot interrupt quirks. 
+ Safety option to keep boot IRQs enabled. This + should never be necessary. +@@ -4751,7 +4760,7 @@ + overwritten. + + rcutree.kthread_prio= [KNL,BOOT] +- Set the SCHED_FIFO priority of the RCU per-CPU ++ Set the SCHED_RR priority of the RCU per-CPU + kthreads (rcuc/N). This value is also used for + the priority of the RCU boost threads (rcub/N) + and for the RCU grace-period kthreads (rcu_bh, +diff --git a/Documentation/dontdiff b/Documentation/dontdiff +index 352ff53a2306..7c210744d84c 100644 +--- a/Documentation/dontdiff ++++ b/Documentation/dontdiff +@@ -255,6 +255,7 @@ vmlinux.aout + vmlinux.bin.all + vmlinux.lds + vmlinux.map ++vmlinux.relocs + vmlinux.symvers + vmlinuz + voffset.h +diff --git a/Makefile b/Makefile +index 3f6628780eb2..335e93ed017f 100644 +--- a/Makefile ++++ b/Makefile +@@ -834,6 +834,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) + ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE + KBUILD_CFLAGS += -O2 + KBUILD_RUSTFLAGS += -Copt-level=2 ++else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 ++KBUILD_CFLAGS += -O3 ++KBUILD_RUSTFLAGS += -Copt-level=3 + else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE + KBUILD_CFLAGS += -Os + KBUILD_RUSTFLAGS += -Copt-level=s +@@ -1075,11 +1078,6 @@ KBUILD_CFLAGS += -fno-strict-overflow + # Make sure -fstack-check isn't enabled (like gentoo apparently did) + KBUILD_CFLAGS += -fno-stack-check + +-# conserve stack if available +-ifdef CONFIG_CC_IS_GCC +-KBUILD_CFLAGS += -fconserve-stack +-endif +- + # Prohibit date/time macros, which would make the build non-deterministic + KBUILD_CFLAGS += -Werror=date-time + +diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig +index 81764160451f..2c15d3bf747a 100644 +--- a/arch/arc/configs/axs101_defconfig ++++ b/arch/arc/configs/axs101_defconfig +@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y + # CONFIG_VM_EVENT_COUNTERS is not set +diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig +index d5181275490e..7d868e148d9a 100644 +--- a/arch/arc/configs/axs103_defconfig ++++ b/arch/arc/configs/axs103_defconfig +@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y + # CONFIG_VM_EVENT_COUNTERS is not set +diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig +index 2f336d99a8cf..777a9f21eb6b 100644 +--- a/arch/arc/configs/axs103_smp_defconfig ++++ b/arch/arc/configs/axs103_smp_defconfig +@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y + # CONFIG_VM_EVENT_COUNTERS is not set +diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig +index 899b2fd5c71d..bda15a876849 100644 +--- a/arch/arc/configs/haps_hs_defconfig ++++ b/arch/arc/configs/haps_hs_defconfig +@@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_EXPERT=y + CONFIG_PERF_EVENTS=y + # CONFIG_COMPAT_BRK is not set +diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig +index 0d32aac8069f..dbd74fea69aa 
100644 +--- a/arch/arc/configs/haps_hs_smp_defconfig ++++ b/arch/arc/configs/haps_hs_smp_defconfig +@@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y + # CONFIG_VM_EVENT_COUNTERS is not set +diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig +index d18378d2c2a6..2396ca417182 100644 +--- a/arch/arc/configs/hsdk_defconfig ++++ b/arch/arc/configs/hsdk_defconfig +@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y + CONFIG_BLK_DEV_RAM=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y + # CONFIG_VM_EVENT_COUNTERS is not set +diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig +index 3e9829775992..5044609540cc 100644 +--- a/arch/arc/configs/nsim_700_defconfig ++++ b/arch/arc/configs/nsim_700_defconfig +@@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_KALLSYMS_ALL=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y +diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig +index 502c87f351c8..748c809d1c4c 100644 +--- a/arch/arc/configs/nsimosci_defconfig ++++ b/arch/arc/configs/nsimosci_defconfig +@@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_KALLSYMS_ALL=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y +diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig +index f721cc3997d0..205c32b0074c 100644 +--- a/arch/arc/configs/nsimosci_hs_defconfig ++++ b/arch/arc/configs/nsimosci_hs_defconfig +@@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_KALLSYMS_ALL=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y +diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig +index 1419fc946a08..2477b7c80977 100644 +--- a/arch/arc/configs/nsimosci_hs_smp_defconfig ++++ b/arch/arc/configs/nsimosci_hs_smp_defconfig +@@ -8,6 +8,7 @@ CONFIG_IKCONFIG_PROC=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_PERF_EVENTS=y + # CONFIG_COMPAT_BRK is not set + CONFIG_KPROBES=y +diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig +index 6f0d2be9d926..cf02ad0fc210 100644 +--- a/arch/arc/configs/tb10x_defconfig ++++ b/arch/arc/configs/tb10x_defconfig +@@ -14,6 +14,7 @@ CONFIG_INITRAMFS_SOURCE="../tb10x-rootfs.cpio" + CONFIG_INITRAMFS_ROOT_UID=2100 + CONFIG_INITRAMFS_ROOT_GID=501 + # CONFIG_RD_GZIP is not set ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_KALLSYMS_ALL=y + # CONFIG_AIO is not set + CONFIG_EMBEDDED=y +diff --git a/arch/arc/configs/vdk_hs38_defconfig b/arch/arc/configs/vdk_hs38_defconfig +index d3ef189c75f8..922b1b24f518 100644 +--- a/arch/arc/configs/vdk_hs38_defconfig ++++ b/arch/arc/configs/vdk_hs38_defconfig +@@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y + # CONFIG_VM_EVENT_COUNTERS is not set 
+diff --git a/arch/arc/configs/vdk_hs38_smp_defconfig b/arch/arc/configs/vdk_hs38_smp_defconfig +index 944b347025fd..ed64319f7eb2 100644 +--- a/arch/arc/configs/vdk_hs38_smp_defconfig ++++ b/arch/arc/configs/vdk_hs38_smp_defconfig +@@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y + # CONFIG_VM_EVENT_COUNTERS is not set +diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu +index 542377cd419d..08d887d1220d 100644 +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -157,7 +157,7 @@ config MPENTIUM4 + + + config MK6 +- bool "K6/K6-II/K6-III" ++ bool "AMD K6/K6-II/K6-III" + depends on X86_32 + help + Select this for an AMD K6-family processor. Enables use of +@@ -165,7 +165,7 @@ config MK6 + flags to GCC. + + config MK7 +- bool "Athlon/Duron/K7" ++ bool "AMD Athlon/Duron/K7" + depends on X86_32 + help + Select this for an AMD Athlon K7-family processor. Enables use of +@@ -173,12 +173,106 @@ config MK7 + flags to GCC. + + config MK8 +- bool "Opteron/Athlon64/Hammer/K8" ++ bool "AMD Opteron/Athlon64/Hammer/K8" + help + Select this for an AMD Opteron or Athlon64 Hammer-family processor. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + ++config MK8SSE3 ++ bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" ++ help ++ Select this for improved AMD Opteron or Athlon64 Hammer-family processors. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MK10 ++ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" ++ help ++ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, ++ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MBARCELONA ++ bool "AMD Barcelona" ++ help ++ Select this for AMD Family 10h Barcelona processors. ++ ++ Enables -march=barcelona ++ ++config MBOBCAT ++ bool "AMD Bobcat" ++ help ++ Select this for AMD Family 14h Bobcat processors. ++ ++ Enables -march=btver1 ++ ++config MJAGUAR ++ bool "AMD Jaguar" ++ help ++ Select this for AMD Family 16h Jaguar processors. ++ ++ Enables -march=btver2 ++ ++config MBULLDOZER ++ bool "AMD Bulldozer" ++ help ++ Select this for AMD Family 15h Bulldozer processors. ++ ++ Enables -march=bdver1 ++ ++config MPILEDRIVER ++ bool "AMD Piledriver" ++ help ++ Select this for AMD Family 15h Piledriver processors. ++ ++ Enables -march=bdver2 ++ ++config MSTEAMROLLER ++ bool "AMD Steamroller" ++ help ++ Select this for AMD Family 15h Steamroller processors. ++ ++ Enables -march=bdver3 ++ ++config MEXCAVATOR ++ bool "AMD Excavator" ++ help ++ Select this for AMD Family 15h Excavator processors. ++ ++ Enables -march=bdver4 ++ ++config MZEN ++ bool "AMD Zen" ++ help ++ Select this for AMD Family 17h Zen processors. ++ ++ Enables -march=znver1 ++ ++config MZEN2 ++ bool "AMD Zen 2" ++ help ++ Select this for AMD Family 17h Zen 2 processors. ++ ++ Enables -march=znver2 ++ ++config MZEN3 ++ bool "AMD Zen 3" ++ depends on (CC_IS_GCC && GCC_VERSION >= 100300) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ help ++ Select this for AMD Family 19h Zen 3 processors. ++ ++ Enables -march=znver3 ++ ++config MZEN4 ++ bool "AMD Zen 4" ++ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) ++ help ++ Select this for AMD Family 19h Zen 4 processors. 
++ ++ Enables -march=znver4 ++ + config MCRUSOE + bool "Crusoe" + depends on X86_32 +@@ -270,7 +364,7 @@ config MPSC + in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. + + config MCORE2 +- bool "Core 2/newer Xeon" ++ bool "Intel Core 2" + help + + Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and +@@ -278,6 +372,8 @@ config MCORE2 + family in /proc/cpuinfo. Newer ones have 6 and older ones 15 + (not a typo) + ++ Enables -march=core2 ++ + config MATOM + bool "Intel Atom" + help +@@ -287,6 +383,202 @@ config MATOM + accordingly optimized code. Use a recent GCC with specific Atom + support in order to fully benefit from selecting this option. + ++config MNEHALEM ++ bool "Intel Nehalem" ++ select X86_P6_NOP ++ help ++ ++ Select this for 1st Gen Core processors in the Nehalem family. ++ ++ Enables -march=nehalem ++ ++config MWESTMERE ++ bool "Intel Westmere" ++ select X86_P6_NOP ++ help ++ ++ Select this for the Intel Westmere formerly Nehalem-C family. ++ ++ Enables -march=westmere ++ ++config MSILVERMONT ++ bool "Intel Silvermont" ++ select X86_P6_NOP ++ help ++ ++ Select this for the Intel Silvermont platform. ++ ++ Enables -march=silvermont ++ ++config MGOLDMONT ++ bool "Intel Goldmont" ++ select X86_P6_NOP ++ help ++ ++ Select this for the Intel Goldmont platform including Apollo Lake and Denverton. ++ ++ Enables -march=goldmont ++ ++config MGOLDMONTPLUS ++ bool "Intel Goldmont Plus" ++ select X86_P6_NOP ++ help ++ ++ Select this for the Intel Goldmont Plus platform including Gemini Lake. ++ ++ Enables -march=goldmont-plus ++ ++config MSANDYBRIDGE ++ bool "Intel Sandy Bridge" ++ select X86_P6_NOP ++ help ++ ++ Select this for 2nd Gen Core processors in the Sandy Bridge family. ++ ++ Enables -march=sandybridge ++ ++config MIVYBRIDGE ++ bool "Intel Ivy Bridge" ++ select X86_P6_NOP ++ help ++ ++ Select this for 3rd Gen Core processors in the Ivy Bridge family. ++ ++ Enables -march=ivybridge ++ ++config MHASWELL ++ bool "Intel Haswell" ++ select X86_P6_NOP ++ help ++ ++ Select this for 4th Gen Core processors in the Haswell family. ++ ++ Enables -march=haswell ++ ++config MBROADWELL ++ bool "Intel Broadwell" ++ select X86_P6_NOP ++ help ++ ++ Select this for 5th Gen Core processors in the Broadwell family. ++ ++ Enables -march=broadwell ++ ++config MSKYLAKE ++ bool "Intel Skylake" ++ select X86_P6_NOP ++ help ++ ++ Select this for 6th Gen Core processors in the Skylake family. ++ ++ Enables -march=skylake ++ ++config MSKYLAKEX ++ bool "Intel Skylake X" ++ select X86_P6_NOP ++ help ++ ++ Select this for 6th Gen Core processors in the Skylake X family. ++ ++ Enables -march=skylake-avx512 ++ ++config MCANNONLAKE ++ bool "Intel Cannon Lake" ++ select X86_P6_NOP ++ help ++ ++ Select this for 8th Gen Core processors ++ ++ Enables -march=cannonlake ++ ++config MICELAKE ++ bool "Intel Ice Lake" ++ select X86_P6_NOP ++ help ++ ++ Select this for 10th Gen Core processors in the Ice Lake family. ++ ++ Enables -march=icelake-client ++ ++config MCASCADELAKE ++ bool "Intel Cascade Lake" ++ select X86_P6_NOP ++ help ++ ++ Select this for Xeon processors in the Cascade Lake family. ++ ++ Enables -march=cascadelake ++ ++config MCOOPERLAKE ++ bool "Intel Cooper Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) ++ select X86_P6_NOP ++ help ++ ++ Select this for Xeon processors in the Cooper Lake family. 
++ ++ Enables -march=cooperlake ++ ++config MTIGERLAKE ++ bool "Intel Tiger Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) ++ select X86_P6_NOP ++ help ++ ++ Select this for third-generation 10 nm process processors in the Tiger Lake family. ++ ++ Enables -march=tigerlake ++ ++config MSAPPHIRERAPIDS ++ bool "Intel Sapphire Rapids" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ select X86_P6_NOP ++ help ++ ++ Select this for third-generation 10 nm process processors in the Sapphire Rapids family. ++ ++ Enables -march=sapphirerapids ++ ++config MROCKETLAKE ++ bool "Intel Rocket Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ select X86_P6_NOP ++ help ++ ++ Select this for eleventh-generation processors in the Rocket Lake family. ++ ++ Enables -march=rocketlake ++ ++config MALDERLAKE ++ bool "Intel Alder Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ select X86_P6_NOP ++ help ++ ++ Select this for twelfth-generation processors in the Alder Lake family. ++ ++ Enables -march=alderlake ++ ++config MRAPTORLAKE ++ bool "Intel Raptor Lake" ++ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) ++ select X86_P6_NOP ++ help ++ ++ Select this for thirteenth-generation processors in the Raptor Lake family. ++ ++ Enables -march=raptorlake ++ ++config MMETEORLAKE ++ bool "Intel Meteor Lake" ++ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) ++ select X86_P6_NOP ++ help ++ ++ Select this for fourteenth-generation processors in the Meteor Lake family. ++ ++ Enables -march=meteorlake ++ + config GENERIC_CPU + bool "Generic-x86-64" + depends on X86_64 +@@ -294,6 +586,50 @@ config GENERIC_CPU + Generic x86-64 CPU. + Run equally well on all x86-64 CPUs. + ++config GENERIC_CPU2 ++ bool "Generic-x86-64-v2" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ depends on X86_64 ++ help ++ Generic x86-64 CPU. ++ Run equally well on all x86-64 CPUs with min support of x86-64-v2. ++ ++config GENERIC_CPU3 ++ bool "Generic-x86-64-v3" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ depends on X86_64 ++ help ++ Generic x86-64-v3 CPU with v3 instructions. ++ Run equally well on all x86-64 CPUs with min support of x86-64-v3. ++ ++config GENERIC_CPU4 ++ bool "Generic-x86-64-v4" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ depends on X86_64 ++ help ++ Generic x86-64 CPU with v4 instructions. ++ Run equally well on all x86-64 CPUs with min support of x86-64-v4. ++ ++config MNATIVE_INTEL ++ bool "Intel-Native optimizations autodetected by the compiler" ++ help ++ ++ Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects ++ the optimum settings to use based on your processor. Do NOT use this ++ for AMD CPUs. Intel Only! ++ ++ Enables -march=native ++ ++config MNATIVE_AMD ++ bool "AMD-Native optimizations autodetected by the compiler" ++ help ++ ++ Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects ++ the optimum settings to use based on your processor. Do NOT use this ++ for Intel CPUs. AMD Only! 
++ ++ Enables -march=native ++ + endchoice + + config X86_GENERIC +@@ -318,9 +654,17 @@ config X86_INTERNODE_CACHE_SHIFT + config X86_L1_CACHE_SHIFT + int + default "7" if MPENTIUM4 || MPSC +- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU ++ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || MK8SSE3 || MK10 \ ++ || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \ ++ || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \ ++ || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \ ++ || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ ++ || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE \ ++ || MNATIVE_INTEL || MNATIVE_AMD || X86_GENERIC || GENERIC_CPU || GENERIC_CPU2 || GENERIC_CPU3 \ ++ || GENERIC_CPU4 + default "4" if MELAN || M486SX || M486 || MGEODEGX1 +- default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX ++ default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII \ ++ || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX + + config X86_F00F_BUG + def_bool y +@@ -332,15 +676,27 @@ config X86_INVD_BUG + + config X86_ALIGNMENT_16 + def_bool y +- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1 ++ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC \ ++ || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1 + + config X86_INTEL_USERCOPY + def_bool y +- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 ++ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC \ ++ || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ ++ || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \ ++ || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \ ++ || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL + + config X86_USE_PPRO_CHECKSUM + def_bool y +- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM ++ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \ ++ || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX \ ++ || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \ ++ || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \ ++ || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE \ ++ || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE \ ++ || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \ ++ || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD + + # + # P6_NOPs are a relatively minor optimization that require a family >= +@@ -356,32 +712,62 @@ config X86_USE_PPRO_CHECKSUM + config X86_P6_NOP + def_bool y 
+ depends on X86_64 +- depends on (MCORE2 || MPENTIUM4 || MPSC) ++ depends on (MCORE2 || MPENTIUM4 || MPSC || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ ++ || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE \ ++ || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE \ ++ || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL) + + config X86_TSC + def_bool y +- depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 ++ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \ ++ || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 \ ++ || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \ ++ || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \ ++ || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL \ ++ || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ ++ || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL \ ++ || MNATIVE_AMD) || X86_64 + + config X86_CMPXCHG64 + def_bool y +- depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 ++ depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ ++ || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 || MK8SSE3 || MK10 \ ++ || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN \ ++ || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS \ ++ || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE \ ++ || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \ ++ || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD + + # this should be set for all -march=.. options where the compiler + # generates cmov. 
+ config X86_CMOV + def_bool y +- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) ++ depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ ++ || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX || MK8SSE3 || MK10 \ ++ || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR \ ++ || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ ++ || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \ ++ || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \ ++ || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD) + + config X86_MINIMUM_CPU_FAMILY + int + default "64" if X86_64 +- default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8) ++ default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ ++ || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8 || MK8SSE3 \ ++ || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \ ++ || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \ ++ || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \ ++ || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ ++ || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MRAPTORLAKE \ ++ || MNATIVE_INTEL || MNATIVE_AMD) + default "5" if X86_32 && X86_CMPXCHG64 + default "4" + + config X86_DEBUGCTLMSR + def_bool y +- depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486SX || M486) && !UML ++ depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 \ ++ || M486SX || M486) && !UML + + config IA32_FEAT_CTL + def_bool y +diff --git a/arch/x86/Makefile b/arch/x86/Makefile +index 73ed982d4100..cb4c6620b34a 100644 +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -67,7 +67,7 @@ export BITS + # + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 + # +-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx ++KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -O3 -fno-tree-vectorize + KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 + + ifeq ($(CONFIG_X86_KERNEL_IBT),y) +@@ -151,8 +151,47 @@ else + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) + cflags-$(CONFIG_MK8) += -march=k8 + cflags-$(CONFIG_MPSC) += -march=nocona +- cflags-$(CONFIG_MCORE2) += -march=core2 +- cflags-$(CONFIG_MATOM) += -march=atom ++ cflags-$(CONFIG_MK8SSE3) += -march=k8-sse3 ++ cflags-$(CONFIG_MK10) += -march=amdfam10 ++ cflags-$(CONFIG_MBARCELONA) += -march=barcelona ++ cflags-$(CONFIG_MBOBCAT) += -march=btver1 ++ cflags-$(CONFIG_MJAGUAR) += -march=btver2 ++ cflags-$(CONFIG_MBULLDOZER) += -march=bdver1 ++ cflags-$(CONFIG_MPILEDRIVER) += -march=bdver2 -mno-tbm ++ cflags-$(CONFIG_MSTEAMROLLER) += -march=bdver3 -mno-tbm ++ cflags-$(CONFIG_MEXCAVATOR) += -march=bdver4 -mno-tbm ++ cflags-$(CONFIG_MZEN) += -march=znver1 ++ cflags-$(CONFIG_MZEN2) += -march=znver2 ++ cflags-$(CONFIG_MZEN3) += -march=znver3 ++ 
cflags-$(CONFIG_MZEN4) += -march=znver4 ++ cflags-$(CONFIG_MNATIVE_INTEL) += -march=native ++ cflags-$(CONFIG_MNATIVE_AMD) += -march=native ++ cflags-$(CONFIG_MATOM) += -march=bonnell ++ cflags-$(CONFIG_MCORE2) += -march=core2 ++ cflags-$(CONFIG_MNEHALEM) += -march=nehalem ++ cflags-$(CONFIG_MWESTMERE) += -march=westmere ++ cflags-$(CONFIG_MSILVERMONT) += -march=silvermont ++ cflags-$(CONFIG_MGOLDMONT) += -march=goldmont ++ cflags-$(CONFIG_MGOLDMONTPLUS) += -march=goldmont-plus ++ cflags-$(CONFIG_MSANDYBRIDGE) += -march=sandybridge ++ cflags-$(CONFIG_MIVYBRIDGE) += -march=ivybridge ++ cflags-$(CONFIG_MHASWELL) += -march=haswell ++ cflags-$(CONFIG_MBROADWELL) += -march=broadwell ++ cflags-$(CONFIG_MSKYLAKE) += -march=skylake ++ cflags-$(CONFIG_MSKYLAKEX) += -march=skylake-avx512 ++ cflags-$(CONFIG_MCANNONLAKE) += -march=cannonlake ++ cflags-$(CONFIG_MICELAKE) += -march=icelake-client ++ cflags-$(CONFIG_MCASCADELAKE) += -march=cascadelake ++ cflags-$(CONFIG_MCOOPERLAKE) += -march=cooperlake ++ cflags-$(CONFIG_MTIGERLAKE) += -march=tigerlake ++ cflags-$(CONFIG_MSAPPHIRERAPIDS) += -march=sapphirerapids ++ cflags-$(CONFIG_MROCKETLAKE) += -march=rocketlake ++ cflags-$(CONFIG_MALDERLAKE) += -march=alderlake ++ cflags-$(CONFIG_MRAPTORLAKE) += -march=raptorlake ++ cflags-$(CONFIG_MMETEORLAKE) += -march=meteorlake ++ cflags-$(CONFIG_GENERIC_CPU2) += -march=x86-64-v2 ++ cflags-$(CONFIG_GENERIC_CPU3) += -march=x86-64-v3 ++ cflags-$(CONFIG_GENERIC_CPU4) += -march=x86-64-v4 + cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic + KBUILD_CFLAGS += $(cflags-y) + +diff --git a/arch/x86/Makefile.postlink b/arch/x86/Makefile.postlink +new file mode 100644 +index 000000000000..b38ffa4defb3 +--- /dev/null ++++ b/arch/x86/Makefile.postlink +@@ -0,0 +1,41 @@ ++# SPDX-License-Identifier: GPL-2.0 ++# =========================================================================== ++# Post-link x86 pass ++# =========================================================================== ++# ++# 1. Separate relocations from vmlinux into vmlinux.relocs. ++# 2. Strip relocations from vmlinux. 
++ ++PHONY := __archpost ++__archpost: ++ ++-include include/config/auto.conf ++include scripts/Kbuild.include ++ ++CMD_RELOCS = arch/x86/tools/relocs ++quiet_cmd_relocs = RELOCS $@.relocs ++ cmd_relocs = $(CMD_RELOCS) $@ > $@.relocs;$(CMD_RELOCS) --abs-relocs $@ ++ ++quiet_cmd_strip_relocs = RSTRIP $@ ++ cmd_strip_relocs = $(OBJCOPY) --remove-section='.rel.*' --remove-section='.rel__*' --remove-section='.rela.*' --remove-section='.rela__*' $@ ++ ++# `@true` prevents complaint when there is nothing to be done ++ ++vmlinux: FORCE ++ @true ++ifeq ($(CONFIG_X86_NEED_RELOCS),y) ++ $(call cmd,relocs) ++ $(call cmd,strip_relocs) ++endif ++ ++%.ko: FORCE ++ @true ++ ++clean: ++ @rm -f vmlinux.relocs ++ ++PHONY += FORCE clean ++ ++FORCE: ++ ++.PHONY: $(PHONY) +diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore +index 25805199a506..b2968175fc27 100644 +--- a/arch/x86/boot/compressed/.gitignore ++++ b/arch/x86/boot/compressed/.gitignore +@@ -1,7 +1,6 @@ + # SPDX-License-Identifier: GPL-2.0-only + relocs + vmlinux.bin.all +-vmlinux.relocs + vmlinux.lds + mkpiggy + piggy.S +diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile +index 1acff356d97a..d995595394bb 100644 +--- a/arch/x86/boot/compressed/Makefile ++++ b/arch/x86/boot/compressed/Makefile +@@ -121,14 +121,12 @@ $(obj)/vmlinux.bin: vmlinux FORCE + + targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs + +-CMD_RELOCS = arch/x86/tools/relocs +-quiet_cmd_relocs = RELOCS $@ +- cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< +-$(obj)/vmlinux.relocs: vmlinux FORCE +- $(call if_changed,relocs) ++# vmlinux.relocs is created by the vmlinux postlink step. ++vmlinux.relocs: vmlinux ++ @true + + vmlinux.bin.all-y := $(obj)/vmlinux.bin +-vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs ++vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += vmlinux.relocs + + $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE + $(call if_changed,gzip) +diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h +index 75884d2cdec3..18021e8c0c28 100644 +--- a/arch/x86/include/asm/vermagic.h ++++ b/arch/x86/include/asm/vermagic.h +@@ -17,6 +17,52 @@ + #define MODULE_PROC_FAMILY "586MMX " + #elif defined CONFIG_MCORE2 + #define MODULE_PROC_FAMILY "CORE2 " ++#elif defined CONFIG_MNATIVE_INTEL ++#define MODULE_PROC_FAMILY "NATIVE_INTEL " ++#elif defined CONFIG_MNATIVE_AMD ++#define MODULE_PROC_FAMILY "NATIVE_AMD " ++#elif defined CONFIG_MNEHALEM ++#define MODULE_PROC_FAMILY "NEHALEM " ++#elif defined CONFIG_MWESTMERE ++#define MODULE_PROC_FAMILY "WESTMERE " ++#elif defined CONFIG_MSILVERMONT ++#define MODULE_PROC_FAMILY "SILVERMONT " ++#elif defined CONFIG_MGOLDMONT ++#define MODULE_PROC_FAMILY "GOLDMONT " ++#elif defined CONFIG_MGOLDMONTPLUS ++#define MODULE_PROC_FAMILY "GOLDMONTPLUS " ++#elif defined CONFIG_MSANDYBRIDGE ++#define MODULE_PROC_FAMILY "SANDYBRIDGE " ++#elif defined CONFIG_MIVYBRIDGE ++#define MODULE_PROC_FAMILY "IVYBRIDGE " ++#elif defined CONFIG_MHASWELL ++#define MODULE_PROC_FAMILY "HASWELL " ++#elif defined CONFIG_MBROADWELL ++#define MODULE_PROC_FAMILY "BROADWELL " ++#elif defined CONFIG_MSKYLAKE ++#define MODULE_PROC_FAMILY "SKYLAKE " ++#elif defined CONFIG_MSKYLAKEX ++#define MODULE_PROC_FAMILY "SKYLAKEX " ++#elif defined CONFIG_MCANNONLAKE ++#define MODULE_PROC_FAMILY "CANNONLAKE " ++#elif defined CONFIG_MICELAKE ++#define MODULE_PROC_FAMILY "ICELAKE " ++#elif defined CONFIG_MCASCADELAKE ++#define MODULE_PROC_FAMILY 
"CASCADELAKE " ++#elif defined CONFIG_MCOOPERLAKE ++#define MODULE_PROC_FAMILY "COOPERLAKE " ++#elif defined CONFIG_MTIGERLAKE ++#define MODULE_PROC_FAMILY "TIGERLAKE " ++#elif defined CONFIG_MSAPPHIRERAPIDS ++#define MODULE_PROC_FAMILY "SAPPHIRERAPIDS " ++#elif defined CONFIG_ROCKETLAKE ++#define MODULE_PROC_FAMILY "ROCKETLAKE " ++#elif defined CONFIG_MALDERLAKE ++#define MODULE_PROC_FAMILY "ALDERLAKE " ++#elif defined CONFIG_MRAPTORLAKE ++#define MODULE_PROC_FAMILY "RAPTORLAKE " ++#elif defined CONFIG_MMETEORLAKE ++#define MODULE_PROC_FAMILY "METEORLAKE " + #elif defined CONFIG_MATOM + #define MODULE_PROC_FAMILY "ATOM " + #elif defined CONFIG_M686 +@@ -35,6 +81,32 @@ + #define MODULE_PROC_FAMILY "K7 " + #elif defined CONFIG_MK8 + #define MODULE_PROC_FAMILY "K8 " ++#elif defined CONFIG_MK8SSE3 ++#define MODULE_PROC_FAMILY "K8SSE3 " ++#elif defined CONFIG_MK10 ++#define MODULE_PROC_FAMILY "K10 " ++#elif defined CONFIG_MBARCELONA ++#define MODULE_PROC_FAMILY "BARCELONA " ++#elif defined CONFIG_MBOBCAT ++#define MODULE_PROC_FAMILY "BOBCAT " ++#elif defined CONFIG_MBULLDOZER ++#define MODULE_PROC_FAMILY "BULLDOZER " ++#elif defined CONFIG_MPILEDRIVER ++#define MODULE_PROC_FAMILY "PILEDRIVER " ++#elif defined CONFIG_MSTEAMROLLER ++#define MODULE_PROC_FAMILY "STEAMROLLER " ++#elif defined CONFIG_MJAGUAR ++#define MODULE_PROC_FAMILY "JAGUAR " ++#elif defined CONFIG_MEXCAVATOR ++#define MODULE_PROC_FAMILY "EXCAVATOR " ++#elif defined CONFIG_MZEN ++#define MODULE_PROC_FAMILY "ZEN " ++#elif defined CONFIG_MZEN2 ++#define MODULE_PROC_FAMILY "ZEN2 " ++#elif defined CONFIG_MZEN3 ++#define MODULE_PROC_FAMILY "ZEN3 " ++#elif defined CONFIG_MZEN4 ++#define MODULE_PROC_FAMILY "ZEN4 " + #elif defined CONFIG_MELAN + #define MODULE_PROC_FAMILY "ELAN " + #elif defined CONFIG_MCRUSOE +diff --git a/drivers/Makefile b/drivers/Makefile +index bdf1c66141c9..1e1a0832fb48 100644 +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -59,15 +59,8 @@ obj-y += char/ + # iommu/ comes before gpu as gpu are using iommu controllers + obj-y += iommu/ + +-# gpu/ comes after char for AGP vs DRM startup and after iommu +-obj-y += gpu/ +- + obj-$(CONFIG_CONNECTOR) += connector/ + +-# i810fb and intelfb depend on char/agp/ +-obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +-obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ +- + obj-$(CONFIG_PARPORT) += parport/ + obj-y += base/ block/ misc/ mfd/ nfc/ + obj-$(CONFIG_LIBNVDIMM) += nvdimm/ +@@ -79,6 +72,14 @@ obj-y += macintosh/ + obj-y += scsi/ + obj-y += nvme/ + obj-$(CONFIG_ATA) += ata/ ++ ++# gpu/ comes after char for AGP vs DRM startup and after iommu ++obj-y += gpu/ ++ ++# i810fb and intelfb depend on char/agp/ ++obj-$(CONFIG_FB_I810) += video/fbdev/i810/ ++obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ ++ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ +diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig +index a7bfddf08fa7..c9a5fa597950 100644 +--- a/drivers/i2c/busses/Kconfig ++++ b/drivers/i2c/busses/Kconfig +@@ -229,6 +229,15 @@ config I2C_CHT_WC + combined with a FUSB302 Type-C port-controller as such it is advised + to also select CONFIG_TYPEC_FUSB302=m. + ++config I2C_NCT6775 ++ tristate "Nuvoton NCT6775 and compatible SMBus controller" ++ help ++ If you say yes to this option, support will be included for the ++ Nuvoton NCT6775 and compatible SMBus controllers. ++ ++ This driver can also be built as a module. If so, the module ++ will be called i2c-nct6775. 
++ + config I2C_NFORCE2 + tristate "Nvidia nForce2, nForce3 and nForce4" + depends on PCI +diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile +index e73cdb1d2b5a..052ccd05c13c 100644 +--- a/drivers/i2c/busses/Makefile ++++ b/drivers/i2c/busses/Makefile +@@ -20,6 +20,7 @@ obj-$(CONFIG_I2C_CHT_WC) += i2c-cht-wc.o + obj-$(CONFIG_I2C_I801) += i2c-i801.o + obj-$(CONFIG_I2C_ISCH) += i2c-isch.o + obj-$(CONFIG_I2C_ISMT) += i2c-ismt.o ++obj-$(CONFIG_I2C_NCT6775) += i2c-nct6775.o + obj-$(CONFIG_I2C_NFORCE2) += i2c-nforce2.o + obj-$(CONFIG_I2C_NFORCE2_S4985) += i2c-nforce2-s4985.o + obj-$(CONFIG_I2C_NVIDIA_GPU) += i2c-nvidia-gpu.o +diff --git a/drivers/i2c/busses/i2c-nct6775.c b/drivers/i2c/busses/i2c-nct6775.c +new file mode 100644 +index 000000000000..0462f0952043 +--- /dev/null ++++ b/drivers/i2c/busses/i2c-nct6775.c +@@ -0,0 +1,647 @@ ++/* ++ * i2c-nct6775 - Driver for the SMBus master functionality of ++ * Nuvoton NCT677x Super-I/O chips ++ * ++ * Copyright (C) 2019 Adam Honse ++ * ++ * Derived from nct6775 hwmon driver ++ * Copyright (C) 2012 Guenter Roeck ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define DRVNAME "i2c-nct6775" ++ ++/* Nuvoton SMBus address offsets */ ++#define SMBHSTDAT (0 + nuvoton_nct6793d_smba) ++#define SMBBLKSZ (1 + nuvoton_nct6793d_smba) ++#define SMBHSTCMD (2 + nuvoton_nct6793d_smba) ++#define SMBHSTIDX (3 + nuvoton_nct6793d_smba) //Index field is the Command field on other controllers ++#define SMBHSTCTL (4 + nuvoton_nct6793d_smba) ++#define SMBHSTADD (5 + nuvoton_nct6793d_smba) ++#define SMBHSTERR (9 + nuvoton_nct6793d_smba) ++#define SMBHSTSTS (0xE + nuvoton_nct6793d_smba) ++ ++/* Command register */ ++#define NCT6793D_READ_BYTE 0 ++#define NCT6793D_READ_WORD 1 ++#define NCT6793D_READ_BLOCK 2 ++#define NCT6793D_BLOCK_WRITE_READ_PROC_CALL 3 ++#define NCT6793D_PROC_CALL 4 ++#define NCT6793D_WRITE_BYTE 8 ++#define NCT6793D_WRITE_WORD 9 ++#define NCT6793D_WRITE_BLOCK 10 ++ ++/* Control register */ ++#define NCT6793D_MANUAL_START 128 ++#define NCT6793D_SOFT_RESET 64 ++ ++/* Error register */ ++#define NCT6793D_NO_ACK 32 ++ ++/* Status register */ ++#define NCT6793D_FIFO_EMPTY 1 ++#define NCT6793D_FIFO_FULL 2 ++#define NCT6793D_MANUAL_ACTIVE 4 ++ ++#define NCT6775_LD_SMBUS 0x0B ++ ++/* Other settings */ ++#define MAX_RETRIES 400 ++ ++enum kinds { nct6106, nct6775, nct6776, nct6779, nct6791, nct6792, nct6793, ++ nct6795, nct6796, nct6798 }; ++ ++struct nct6775_sio_data { ++ int sioreg; ++ enum kinds kind; ++}; ++ ++/* used to set data->name = nct6775_device_names[data->sio_kind] */ ++static const char * const nct6775_device_names[] = { ++ "nct6106", ++ "nct6775", ++ "nct6776", ++ "nct6779", ++ "nct6791", ++ "nct6792", ++ "nct6793", ++ "nct6795", ++ "nct6796", ++ "nct6798", ++}; ++ ++static const char * const nct6775_sio_names[] __initconst = { ++ "NCT6106D", ++ "NCT6775F", ++ "NCT6776D/F", ++ "NCT6779D", ++ "NCT6791D", ++ "NCT6792D", ++ "NCT6793D", ++ "NCT6795D", ++ "NCT6796D", ++ "NCT6798D", ++}; ++ ++#define SIO_REG_LDSEL 0x07 /* Logical device select */ ++#define SIO_REG_DEVID 0x20 /* Device ID (2 bytes) */ ++#define SIO_REG_SMBA 0x62 /* SMBus base address register */ ++ ++#define SIO_NCT6106_ID 0xc450 ++#define SIO_NCT6775_ID 0xb470 ++#define SIO_NCT6776_ID 0xc330 ++#define SIO_NCT6779_ID 0xc560 ++#define SIO_NCT6791_ID 0xc800 ++#define SIO_NCT6792_ID 0xc910 ++#define SIO_NCT6793_ID 0xd120 ++#define SIO_NCT6795_ID 0xd350 ++#define SIO_NCT6796_ID 0xd420 ++#define SIO_NCT6798_ID 0xd428 ++#define SIO_ID_MASK 0xFFF0 ++ ++static inline void ++superio_outb(int ioreg, int reg, int val) ++{ ++ outb(reg, ioreg); ++ outb(val, ioreg + 1); ++} ++ ++static inline int ++superio_inb(int ioreg, int reg) ++{ ++ outb(reg, ioreg); ++ return inb(ioreg + 1); ++} ++ ++static inline void ++superio_select(int ioreg, int ld) ++{ ++ outb(SIO_REG_LDSEL, ioreg); ++ outb(ld, ioreg + 1); ++} ++ ++static inline int ++superio_enter(int ioreg) ++{ ++ /* ++ * Try to reserve and for exclusive access. 
++ */ ++ if (!request_muxed_region(ioreg, 2, DRVNAME)) ++ return -EBUSY; ++ ++ outb(0x87, ioreg); ++ outb(0x87, ioreg); ++ ++ return 0; ++} ++ ++static inline void ++superio_exit(int ioreg) ++{ ++ outb(0xaa, ioreg); ++ outb(0x02, ioreg); ++ outb(0x02, ioreg + 1); ++ release_region(ioreg, 2); ++} ++ ++/* ++ * ISA constants ++ */ ++ ++#define IOREGION_ALIGNMENT (~7) ++#define IOREGION_LENGTH 2 ++#define ADDR_REG_OFFSET 0 ++#define DATA_REG_OFFSET 1 ++ ++#define NCT6775_REG_BANK 0x4E ++#define NCT6775_REG_CONFIG 0x40 ++ ++static struct i2c_adapter *nct6775_adapter; ++ ++struct i2c_nct6775_adapdata { ++ unsigned short smba; ++}; ++ ++/* Return negative errno on error. */ ++static s32 nct6775_access(struct i2c_adapter * adap, u16 addr, ++ unsigned short flags, char read_write, ++ u8 command, int size, union i2c_smbus_data * data) ++{ ++ struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap); ++ unsigned short nuvoton_nct6793d_smba = adapdata->smba; ++ int i, len, cnt; ++ union i2c_smbus_data tmp_data; ++ int timeout = 0; ++ ++ tmp_data.word = 0; ++ cnt = 0; ++ len = 0; ++ ++ outb_p(NCT6793D_SOFT_RESET, SMBHSTCTL); ++ ++ switch (size) { ++ case I2C_SMBUS_QUICK: ++ outb_p((addr << 1) | read_write, ++ SMBHSTADD); ++ break; ++ case I2C_SMBUS_BYTE_DATA: ++ tmp_data.byte = data->byte; ++ case I2C_SMBUS_BYTE: ++ outb_p((addr << 1) | read_write, ++ SMBHSTADD); ++ outb_p(command, SMBHSTIDX); ++ if (read_write == I2C_SMBUS_WRITE) { ++ outb_p(tmp_data.byte, SMBHSTDAT); ++ outb_p(NCT6793D_WRITE_BYTE, SMBHSTCMD); ++ } ++ else { ++ outb_p(NCT6793D_READ_BYTE, SMBHSTCMD); ++ } ++ break; ++ case I2C_SMBUS_WORD_DATA: ++ outb_p((addr << 1) | read_write, ++ SMBHSTADD); ++ outb_p(command, SMBHSTIDX); ++ if (read_write == I2C_SMBUS_WRITE) { ++ outb_p(data->word & 0xff, SMBHSTDAT); ++ outb_p((data->word & 0xff00) >> 8, SMBHSTDAT); ++ outb_p(NCT6793D_WRITE_WORD, SMBHSTCMD); ++ } ++ else { ++ outb_p(NCT6793D_READ_WORD, SMBHSTCMD); ++ } ++ break; ++ case I2C_SMBUS_BLOCK_DATA: ++ outb_p((addr << 1) | read_write, ++ SMBHSTADD); ++ outb_p(command, SMBHSTIDX); ++ if (read_write == I2C_SMBUS_WRITE) { ++ len = data->block[0]; ++ if (len == 0 || len > I2C_SMBUS_BLOCK_MAX) ++ return -EINVAL; ++ outb_p(len, SMBBLKSZ); ++ ++ cnt = 1; ++ if (len >= 4) { ++ for (i = cnt; i <= 4; i++) { ++ outb_p(data->block[i], SMBHSTDAT); ++ } ++ ++ len -= 4; ++ cnt += 4; ++ } ++ else { ++ for (i = cnt; i <= len; i++ ) { ++ outb_p(data->block[i], SMBHSTDAT); ++ } ++ ++ len = 0; ++ } ++ ++ outb_p(NCT6793D_WRITE_BLOCK, SMBHSTCMD); ++ } ++ else { ++ return -ENOTSUPP; ++ } ++ break; ++ default: ++ dev_warn(&adap->dev, "Unsupported transaction %d\n", size); ++ return -EOPNOTSUPP; ++ } ++ ++ outb_p(NCT6793D_MANUAL_START, SMBHSTCTL); ++ ++ while ((size == I2C_SMBUS_BLOCK_DATA) && (len > 0)) { ++ if (read_write == I2C_SMBUS_WRITE) { ++ timeout = 0; ++ while ((inb_p(SMBHSTSTS) & NCT6793D_FIFO_EMPTY) == 0) ++ { ++ if(timeout > MAX_RETRIES) ++ { ++ return -ETIMEDOUT; ++ } ++ usleep_range(250, 500); ++ timeout++; ++ } ++ ++ //Load more bytes into FIFO ++ if (len >= 4) { ++ for (i = cnt; i <= (cnt + 4); i++) { ++ outb_p(data->block[i], SMBHSTDAT); ++ } ++ ++ len -= 4; ++ cnt += 4; ++ } ++ else { ++ for (i = cnt; i <= (cnt + len); i++) { ++ outb_p(data->block[i], SMBHSTDAT); ++ } ++ ++ len = 0; ++ } ++ } ++ else { ++ return -ENOTSUPP; ++ } ++ ++ } ++ ++ //wait for manual mode to complete ++ timeout = 0; ++ while ((inb_p(SMBHSTSTS) & NCT6793D_MANUAL_ACTIVE) != 0) ++ { ++ if(timeout > MAX_RETRIES) ++ { ++ return -ETIMEDOUT; ++ } ++ usleep_range(250, 500); 
++ timeout++; ++ } ++ ++ if ((inb_p(SMBHSTERR) & NCT6793D_NO_ACK) != 0) { ++ return -ENXIO; ++ } ++ else if ((read_write == I2C_SMBUS_WRITE) || (size == I2C_SMBUS_QUICK)) { ++ return 0; ++ } ++ ++ switch (size) { ++ case I2C_SMBUS_QUICK: ++ case I2C_SMBUS_BYTE_DATA: ++ data->byte = inb_p(SMBHSTDAT); ++ break; ++ case I2C_SMBUS_WORD_DATA: ++ data->word = inb_p(SMBHSTDAT) + (inb_p(SMBHSTDAT) << 8); ++ break; ++ } ++ return 0; ++} ++ ++static u32 nct6775_func(struct i2c_adapter *adapter) ++{ ++ return I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE | ++ I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA | ++ I2C_FUNC_SMBUS_BLOCK_DATA; ++} ++ ++static const struct i2c_algorithm smbus_algorithm = { ++ .smbus_xfer = nct6775_access, ++ .functionality = nct6775_func, ++}; ++ ++static int nct6775_add_adapter(unsigned short smba, const char *name, struct i2c_adapter **padap) ++{ ++ struct i2c_adapter *adap; ++ struct i2c_nct6775_adapdata *adapdata; ++ int retval; ++ ++ adap = kzalloc(sizeof(*adap), GFP_KERNEL); ++ if (adap == NULL) { ++ return -ENOMEM; ++ } ++ ++ adap->owner = THIS_MODULE; ++ adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; ++ adap->algo = &smbus_algorithm; ++ ++ adapdata = kzalloc(sizeof(*adapdata), GFP_KERNEL); ++ if (adapdata == NULL) { ++ kfree(adap); ++ return -ENOMEM; ++ } ++ ++ adapdata->smba = smba; ++ ++ snprintf(adap->name, sizeof(adap->name), ++ "SMBus NCT67xx adapter%s at %04x", name, smba); ++ ++ i2c_set_adapdata(adap, adapdata); ++ ++ retval = i2c_add_adapter(adap); ++ if (retval) { ++ kfree(adapdata); ++ kfree(adap); ++ return retval; ++ } ++ ++ *padap = adap; ++ return 0; ++} ++ ++static void nct6775_remove_adapter(struct i2c_adapter *adap) ++{ ++ struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap); ++ ++ if (adapdata->smba) { ++ i2c_del_adapter(adap); ++ kfree(adapdata); ++ kfree(adap); ++ } ++} ++ ++//static SIMPLE_DEV_PM_OPS(nct6775_dev_pm_ops, nct6775_suspend, nct6775_resume); ++ ++/* ++ * when Super-I/O functions move to a separate file, the Super-I/O ++ * bus will manage the lifetime of the device and this module will only keep ++ * track of the nct6775 driver. 
But since we use platform_device_alloc(), we ++ * must keep track of the device ++ */ ++static struct platform_device *pdev[2]; ++ ++static int nct6775_probe(struct platform_device *pdev) ++{ ++ struct device *dev = &pdev->dev; ++ struct nct6775_sio_data *sio_data = dev_get_platdata(dev); ++ struct resource *res; ++ ++ res = platform_get_resource(pdev, IORESOURCE_IO, 0); ++ if (!devm_request_region(&pdev->dev, res->start, IOREGION_LENGTH, ++ DRVNAME)) ++ return -EBUSY; ++ ++ switch (sio_data->kind) { ++ case nct6791: ++ case nct6792: ++ case nct6793: ++ case nct6795: ++ case nct6796: ++ case nct6798: ++ nct6775_add_adapter(res->start, "", &nct6775_adapter); ++ break; ++ default: ++ return -ENODEV; ++ } ++ ++ return 0; ++} ++/* ++static void nct6791_enable_io_mapping(int sioaddr) ++{ ++ int val; ++ ++ val = superio_inb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE); ++ if (val & 0x10) { ++ pr_info("Enabling hardware monitor logical device mappings.\n"); ++ superio_outb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE, ++ val & ~0x10); ++ } ++}*/ ++ ++static struct platform_driver i2c_nct6775_driver = { ++ .driver = { ++ .name = DRVNAME, ++// .pm = &nct6775_dev_pm_ops, ++ }, ++ .probe = nct6775_probe, ++}; ++ ++static void __exit i2c_nct6775_exit(void) ++{ ++ int i; ++ ++ if(nct6775_adapter) ++ nct6775_remove_adapter(nct6775_adapter); ++ ++ for (i = 0; i < ARRAY_SIZE(pdev); i++) { ++ if (pdev[i]) ++ platform_device_unregister(pdev[i]); ++ } ++ platform_driver_unregister(&i2c_nct6775_driver); ++} ++ ++/* nct6775_find() looks for a '627 in the Super-I/O config space */ ++static int __init nct6775_find(int sioaddr, struct nct6775_sio_data *sio_data) ++{ ++ u16 val; ++ int err; ++ int addr; ++ ++ err = superio_enter(sioaddr); ++ if (err) ++ return err; ++ ++ val = (superio_inb(sioaddr, SIO_REG_DEVID) << 8) | ++ superio_inb(sioaddr, SIO_REG_DEVID + 1); ++ ++ switch (val & SIO_ID_MASK) { ++ case SIO_NCT6106_ID: ++ sio_data->kind = nct6106; ++ break; ++ case SIO_NCT6775_ID: ++ sio_data->kind = nct6775; ++ break; ++ case SIO_NCT6776_ID: ++ sio_data->kind = nct6776; ++ break; ++ case SIO_NCT6779_ID: ++ sio_data->kind = nct6779; ++ break; ++ case SIO_NCT6791_ID: ++ sio_data->kind = nct6791; ++ break; ++ case SIO_NCT6792_ID: ++ sio_data->kind = nct6792; ++ break; ++ case SIO_NCT6793_ID: ++ sio_data->kind = nct6793; ++ break; ++ case SIO_NCT6795_ID: ++ sio_data->kind = nct6795; ++ break; ++ case SIO_NCT6796_ID: ++ sio_data->kind = nct6796; ++ break; ++ case SIO_NCT6798_ID: ++ sio_data->kind = nct6798; ++ break; ++ default: ++ if (val != 0xffff) ++ pr_debug("unsupported chip ID: 0x%04x\n", val); ++ superio_exit(sioaddr); ++ return -ENODEV; ++ } ++ ++ /* We have a known chip, find the SMBus I/O address */ ++ superio_select(sioaddr, NCT6775_LD_SMBUS); ++ val = (superio_inb(sioaddr, SIO_REG_SMBA) << 8) ++ | superio_inb(sioaddr, SIO_REG_SMBA + 1); ++ addr = val & IOREGION_ALIGNMENT; ++ if (addr == 0) { ++ pr_err("Refusing to enable a Super-I/O device with a base I/O port 0\n"); ++ superio_exit(sioaddr); ++ return -ENODEV; ++ } ++ ++ //if (sio_data->kind == nct6791 || sio_data->kind == nct6792 || ++ // sio_data->kind == nct6793 || sio_data->kind == nct6795 || ++ // sio_data->kind == nct6796) ++ // nct6791_enable_io_mapping(sioaddr); ++ ++ superio_exit(sioaddr); ++ pr_info("Found %s or compatible chip at %#x:%#x\n", ++ nct6775_sio_names[sio_data->kind], sioaddr, addr); ++ sio_data->sioreg = sioaddr; ++ ++ return addr; ++} ++ ++static int __init i2c_nct6775_init(void) ++{ ++ int i, err; ++ bool found = false; ++ int 
address; ++ struct resource res; ++ struct nct6775_sio_data sio_data; ++ int sioaddr[2] = { 0x2e, 0x4e }; ++ ++ err = platform_driver_register(&i2c_nct6775_driver); ++ if (err) ++ return err; ++ ++ /* ++ * initialize sio_data->kind and sio_data->sioreg. ++ * ++ * when Super-I/O functions move to a separate file, the Super-I/O ++ * driver will probe 0x2e and 0x4e and auto-detect the presence of a ++ * nct6775 hardware monitor, and call probe() ++ */ ++ for (i = 0; i < ARRAY_SIZE(pdev); i++) { ++ address = nct6775_find(sioaddr[i], &sio_data); ++ if (address <= 0) ++ continue; ++ ++ found = true; ++ ++ pdev[i] = platform_device_alloc(DRVNAME, address); ++ if (!pdev[i]) { ++ err = -ENOMEM; ++ goto exit_device_unregister; ++ } ++ ++ err = platform_device_add_data(pdev[i], &sio_data, ++ sizeof(struct nct6775_sio_data)); ++ if (err) ++ goto exit_device_put; ++ ++ memset(&res, 0, sizeof(res)); ++ res.name = DRVNAME; ++ res.start = address; ++ res.end = address + IOREGION_LENGTH - 1; ++ res.flags = IORESOURCE_IO; ++ ++ err = acpi_check_resource_conflict(&res); ++ if (err) { ++ platform_device_put(pdev[i]); ++ pdev[i] = NULL; ++ continue; ++ } ++ ++ err = platform_device_add_resources(pdev[i], &res, 1); ++ if (err) ++ goto exit_device_put; ++ ++ /* platform_device_add calls probe() */ ++ err = platform_device_add(pdev[i]); ++ if (err) ++ goto exit_device_put; ++ } ++ if (!found) { ++ err = -ENODEV; ++ goto exit_unregister; ++ } ++ ++ return 0; ++ ++exit_device_put: ++ platform_device_put(pdev[i]); ++exit_device_unregister: ++ while (--i >= 0) { ++ if (pdev[i]) ++ platform_device_unregister(pdev[i]); ++ } ++exit_unregister: ++ platform_driver_unregister(&i2c_nct6775_driver); ++ return err; ++} ++ ++MODULE_AUTHOR("Adam Honse "); ++MODULE_DESCRIPTION("SMBus driver for NCT6775F and compatible chips"); ++MODULE_LICENSE("GPL"); ++ ++module_init(i2c_nct6775_init); ++module_exit(i2c_nct6775_exit); +diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c +index 809fbd014cd6..d54b35b147ee 100644 +--- a/drivers/i2c/busses/i2c-piix4.c ++++ b/drivers/i2c/busses/i2c-piix4.c +@@ -568,11 +568,11 @@ static int piix4_transaction(struct i2c_adapter *piix4_adapter) + if (srvrworks_csb5_delay) /* Extra delay for SERVERWORKS_CSB5 */ + usleep_range(2000, 2100); + else +- usleep_range(250, 500); ++ usleep_range(25, 50); + + while ((++timeout < MAX_TIMEOUT) && + ((temp = inb_p(SMBHSTSTS)) & 0x01)) +- usleep_range(250, 500); ++ usleep_range(25, 50); + + /* If the SMBus is still busy, we give up */ + if (timeout == MAX_TIMEOUT) { +diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c +index 2653516bcdef..973fe8f80051 100644 +--- a/drivers/md/dm-crypt.c ++++ b/drivers/md/dm-crypt.c +@@ -3207,6 +3207,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) + goto bad; + } + ++#ifdef CONFIG_CACHY ++ set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); ++ set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); ++#endif ++ + ret = crypt_ctr_cipher(ti, argv[0], argv[1]); + if (ret < 0) + goto bad; +diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c +index 285acc4aaccc..492e88a99c07 100644 +--- a/drivers/pci/quirks.c ++++ b/drivers/pci/quirks.c +@@ -3612,6 +3612,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) + dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; + } + ++static bool acs_on_downstream; ++static bool acs_on_multifunction; ++ ++#define NUM_ACS_IDS 16 ++struct acs_on_id { ++ unsigned short vendor; ++ unsigned short device; ++}; ++static struct acs_on_id 
acs_on_ids[NUM_ACS_IDS]; ++static u8 max_acs_id; ++ ++static __init int pcie_acs_override_setup(char *p) ++{ ++ if (!p) ++ return -EINVAL; ++ ++ while (*p) { ++ if (!strncmp(p, "downstream", 10)) ++ acs_on_downstream = true; ++ if (!strncmp(p, "multifunction", 13)) ++ acs_on_multifunction = true; ++ if (!strncmp(p, "id:", 3)) { ++ char opt[5]; ++ int ret; ++ long val; ++ ++ if (max_acs_id >= NUM_ACS_IDS - 1) { ++ pr_warn("Out of PCIe ACS override slots (%d)\n", ++ NUM_ACS_IDS); ++ goto next; ++ } ++ ++ p += 3; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].vendor = val; ++ ++ p += strcspn(p, ":"); ++ if (*p != ':') { ++ pr_warn("PCIe ACS invalid ID\n"); ++ goto next; ++ } ++ ++ p++; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].device = val; ++ max_acs_id++; ++ } ++next: ++ p += strcspn(p, ","); ++ if (*p == ',') ++ p++; ++ } ++ ++ if (acs_on_downstream || acs_on_multifunction || max_acs_id) ++ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); ++ ++ return 0; ++} ++early_param("pcie_acs_override", pcie_acs_override_setup); ++ ++static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) ++{ ++ int i; ++ ++ /* Never override ACS for legacy devices or devices with ACS caps */ ++ if (!pci_is_pcie(dev) || ++ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) ++ return -ENOTTY; ++ ++ for (i = 0; i < max_acs_id; i++) ++ if (acs_on_ids[i].vendor == dev->vendor && ++ acs_on_ids[i].device == dev->device) ++ return 1; ++ ++ switch (pci_pcie_type(dev)) { ++ case PCI_EXP_TYPE_DOWNSTREAM: ++ case PCI_EXP_TYPE_ROOT_PORT: ++ if (acs_on_downstream) ++ return 1; ++ break; ++ case PCI_EXP_TYPE_ENDPOINT: ++ case PCI_EXP_TYPE_UPSTREAM: ++ case PCI_EXP_TYPE_LEG_END: ++ case PCI_EXP_TYPE_RC_END: ++ if (acs_on_multifunction && dev->multifunction) ++ return 1; ++ } ++ ++ return -ENOTTY; ++} + /* + * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be + * prevented for those affected devices. 
+@@ -4980,6 +5080,7 @@ static const struct pci_dev_acs_enabled { + { PCI_VENDOR_ID_NXP, 0x8d9b, pci_quirk_nxp_rp_acs }, + /* Zhaoxin Root/Downstream Ports */ + { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, ++ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, + { 0 } + }; + +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index 29e1f9e76eb6..a7852e22101f 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -1178,7 +1178,7 @@ struct readahead_control { + ._index = i, \ + } + +-#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) ++#define VM_READAHEAD_PAGES (SZ_8M / PAGE_SIZE) + + void page_cache_ra_unbounded(struct readahead_control *, + unsigned long nr_to_read, unsigned long lookahead_count); +diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h +index 45f09bec02c4..87b20e2ee274 100644 +--- a/include/linux/user_namespace.h ++++ b/include/linux/user_namespace.h +@@ -148,6 +148,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, + + #ifdef CONFIG_USER_NS + ++extern int unprivileged_userns_clone; ++ + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) + { + if (ns) +@@ -181,6 +183,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns); + struct ns_common *ns_get_owner(struct ns_common *ns); + #else + ++#define unprivileged_userns_clone 0 ++ + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) + { + return &init_user_ns; +diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h +index db762e35aca9..0336791656eb 100644 +--- a/include/net/netns/ipv4.h ++++ b/include/net/netns/ipv4.h +@@ -194,6 +194,7 @@ struct netns_ipv4 { + int sysctl_udp_rmem_min; + + u8 sysctl_fib_notify_on_flag_change; ++ unsigned int sysctl_tcp_collapse_max_bytes; + + #ifdef CONFIG_NET_L3_MASTER_DEV + u8 sysctl_udp_l3mdev_accept; +diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h +index 901b440238d5..7026df84a0f6 100644 +--- a/include/trace/events/tcp.h ++++ b/include/trace/events/tcp.h +@@ -187,6 +187,13 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, + TP_ARGS(sk) + ); + ++DEFINE_EVENT(tcp_event_sk, tcp_collapse_max_bytes_exceeded, ++ ++ TP_PROTO(struct sock *sk), ++ ++ TP_ARGS(sk) ++); ++ + TRACE_EVENT(tcp_retransmit_synack, + + TP_PROTO(const struct sock *sk, const struct request_sock *req), +diff --git a/init/Kconfig b/init/Kconfig +index 44e90b28a30f..748a9491ca12 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -123,6 +123,10 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config CACHY ++ bool "Some kernel tweaks by CachyOS" ++ default y ++ + config BROKEN + bool + +@@ -348,6 +352,19 @@ config KERNEL_UNCOMPRESSED + + endchoice + ++menu "ZSTD compression options" ++ depends on KERNEL_ZSTD ++ ++config ZSTD_COMPRESSION_LEVEL ++ int "Compression level (1-22)" ++ range 1 22 ++ default "22" ++ help ++ Choose a compression level for zstd kernel compression. ++ Default is 22, which is the maximum. ++ ++endmenu ++ + config DEFAULT_INIT + string "Default init path" + default "" +@@ -1253,6 +1270,22 @@ config USER_NS + + If unsure, say N. + ++config USER_NS_UNPRIVILEGED ++ bool "Allow unprivileged users to create namespaces" ++ default y ++ depends on USER_NS ++ help ++ When disabled, unprivileged users will not be able to create ++ new namespaces. 
Allowing users to create their own namespaces ++ has been part of several recent local privilege escalation ++ exploits, so if you need user namespaces but are ++ paranoid^Wsecurity-conscious you want to disable this. ++ ++ This setting can be overridden at runtime via the ++ kernel.unprivileged_userns_clone sysctl. ++ ++ If unsure, say Y. ++ + config PID_NS + bool "PID Namespaces" + default y +@@ -1420,6 +1453,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE + with the "-O2" compiler flag for best performance and most + helpful compile-time warnings. + ++config CC_OPTIMIZE_FOR_PERFORMANCE_O3 ++ bool "Optimize more for performance (-O3)" ++ help ++ Choosing this option will pass "-O3" to your compiler to optimize ++ the kernel yet more for performance. ++ + config CC_OPTIMIZE_FOR_SIZE + bool "Optimize for size (-Os)" + help +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 38ef6d06888e..0f78364efd4f 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -40,6 +40,27 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with good smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ ++ config HZ_600 ++ bool "600 HZ" ++ help ++ 600 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with good smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with good smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -53,6 +74,9 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 ++ default 600 if HZ_600 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK +diff --git a/kernel/fork.c b/kernel/fork.c +index 9f7fe3541897..068062cdf5a3 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -98,6 +98,10 @@ + #include + #include + ++#ifdef CONFIG_USER_NS ++#include ++#endif ++ + #include + #include + #include +@@ -2030,6 +2034,10 @@ static __latent_entropy struct task_struct *copy_process( + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + ++ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) ++ if (!capable(CAP_SYS_ADMIN)) ++ return ERR_PTR(-EPERM); ++ + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. 
+@@ -3180,6 +3188,12 @@ int ksys_unshare(unsigned long unshare_flags) + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + ++ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto bad_unshare_out; ++ } ++ + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; +diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig +index 424b3bc58f3f..ecf2798c5ccf 100644 +--- a/kernel/module/Kconfig ++++ b/kernel/module/Kconfig +@@ -219,6 +219,31 @@ config MODULE_COMPRESS_ZSTD + + endchoice + ++menu "ZSTD module compression options" ++ depends on MODULE_COMPRESS_ZSTD ++ ++config MODULE_COMPRESS_ZSTD_LEVEL ++ int "Compression level (1-19)" ++ range 1 19 ++ default 9 ++ help ++ Compression level used by zstd for compressing modules. ++ ++config MODULE_COMPRESS_ZSTD_ULTRA ++ bool "Enable ZSTD ultra compression" ++ help ++ Compress modules with ZSTD using the highest possible compression. ++ ++config MODULE_COMPRESS_ZSTD_LEVEL_ULTRA ++ int "Compression level (20-22)" ++ depends on MODULE_COMPRESS_ZSTD_ULTRA ++ range 20 22 ++ default 20 ++ help ++ Ultra compression level used by zstd for compressing modules. ++ ++endmenu ++ + config MODULE_DECOMPRESS + bool "Support in-kernel module decompression" + depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD +diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig +index ab62074174c3..f1f909bdc30d 100644 +--- a/kernel/rcu/Kconfig ++++ b/kernel/rcu/Kconfig +@@ -280,9 +280,9 @@ config RCU_NOCB_CPU_CB_BOOST + depends on RCU_NOCB_CPU && RCU_BOOST + default y if PREEMPT_RT + help +- Use this option to invoke offloaded callbacks as SCHED_FIFO ++ Use this option to invoke offloaded callbacks as SCHED_RR + to avoid starvation by heavy SCHED_OTHER background load. +- Of course, running as SCHED_FIFO during callback floods will ++ Of course, running as SCHED_RR during callback floods will + cause the rcuo[ps] kthreads to monopolize the CPU for hundreds + of milliseconds or more. Therefore, when enabling this option, + it is your responsibility to ensure that latency-sensitive +diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c +index 634df26a2c27..8c54871cc0a0 100644 +--- a/kernel/rcu/rcutorture.c ++++ b/kernel/rcu/rcutorture.c +@@ -2406,7 +2406,7 @@ static int rcutorture_booster_init(unsigned int cpu) + t = per_cpu(ksoftirqd, cpu); + WARN_ON_ONCE(!t); + sp.sched_priority = 2; +- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); ++ sched_setscheduler_nocheck(t, SCHED_RR, &sp); + } + + /* Don't allow time recalculation while creating a new task. 
*/ +diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c +index cf34a961821a..80cf9824d461 100644 +--- a/kernel/rcu/tree.c ++++ b/kernel/rcu/tree.c +@@ -4443,8 +4443,8 @@ static void __init rcu_start_exp_gp_kworkers(void) + return; + } + +- sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); +- sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, ++ sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_RR, ¶m); ++ sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_RR, + ¶m); + } + +@@ -4482,7 +4482,7 @@ static int __init rcu_spawn_gp_kthread(void) + return 0; + if (kthread_prio) { + sp.sched_priority = kthread_prio; +- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); ++ sched_setscheduler_nocheck(t, SCHED_RR, &sp); + } + rnp = rcu_get_root(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); +diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h +index 9e1c8caec5ce..dd39c50ae099 100644 +--- a/kernel/rcu/tree_nocb.h ++++ b/kernel/rcu/tree_nocb.h +@@ -1465,7 +1465,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) + } + WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); + if (kthread_prio) +- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); ++ sched_setscheduler_nocheck(t, SCHED_RR, &sp); + } + mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); + +@@ -1476,7 +1476,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) + goto end; + + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio) +- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); ++ sched_setscheduler_nocheck(t, SCHED_RR, &sp); + + WRITE_ONCE(rdp->nocb_cb_kthread, t); + WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); +diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h +index 7b0fe741a088..77ad9e033358 100644 +--- a/kernel/rcu/tree_plugin.h ++++ b/kernel/rcu/tree_plugin.h +@@ -1007,7 +1007,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) + struct sched_param sp; + + sp.sched_priority = kthread_prio; +- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); ++ sched_setscheduler_nocheck(current, SCHED_RR, &sp); + #endif /* #ifdef CONFIG_RCU_BOOST */ + + WRITE_ONCE(rdp->rcuc_activity, jiffies); +@@ -1206,7 +1206,7 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) + rnp->boost_kthread_task = t; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + sp.sched_priority = kthread_prio; +- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); ++ sched_setscheduler_nocheck(t, SCHED_RR, &sp); + wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. 
*/ + + out: +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 0f8736991427..86a988c830ef 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -69,9 +69,13 @@ + * + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_CACHY ++unsigned int sysctl_sched_latency = 3000000ULL; ++static unsigned int normalized_sysctl_sched_latency = 3000000ULL; ++#else + unsigned int sysctl_sched_latency = 6000000ULL; + static unsigned int normalized_sysctl_sched_latency = 6000000ULL; +- ++#endif + /* + * The initial- and re-scaling of tunables is configurable + * +@@ -90,8 +94,13 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; + * + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_CACHY ++unsigned int sysctl_sched_min_granularity = 400000ULL; ++static unsigned int normalized_sysctl_sched_min_granularity = 400000ULL; ++#else + unsigned int sysctl_sched_min_granularity = 750000ULL; + static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; ++#endif + + /* + * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks. +@@ -121,8 +130,13 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; + * + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_CACHY ++unsigned int sysctl_sched_wakeup_granularity = 500000UL; ++static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; ++#else + unsigned int sysctl_sched_wakeup_granularity = 1000000UL; + static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; ++#endif + + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + +@@ -175,8 +189,12 @@ int __weak arch_asym_cpu_priority(int cpu) + * + * (default: 5 msec, units: microseconds) + */ ++#ifdef CONFIG_CACHY ++static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; ++#else + static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; + #endif ++#endif + + #ifdef CONFIG_NUMA_BALANCING + /* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 137d4abe3eda..98e2d9cc8491 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -96,6 +96,9 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); + #ifdef CONFIG_PERF_EVENTS + static const int six_hundred_forty_kb = 640 * 1024; + #endif ++#ifdef CONFIG_USER_NS ++#include ++#endif + + + static const int ngroups_max = NGROUPS_MAX; +@@ -1640,6 +1643,15 @@ static struct ctl_table kern_table[] = { + .mode = 0644, + .proc_handler = proc_dointvec, + }, ++#ifdef CONFIG_USER_NS ++ { ++ .procname = "unprivileged_userns_clone", ++ .data = &unprivileged_userns_clone, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 54211dbd516c..16ca0c151629 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -22,6 +22,13 @@ + #include + #include + ++/* sysctl */ ++#ifdef CONFIG_USER_NS_UNPRIVILEGED ++int unprivileged_userns_clone = 1; ++#else ++int unprivileged_userns_clone; ++#endif ++ + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); + +diff --git a/lib/string.c b/lib/string.c +index 4fb566ea610f..4746a98b153e 100644 +--- a/lib/string.c ++++ b/lib/string.c +@@ -792,24 +792,61 @@ char *strnstr(const char *s1, const char *s2, size_t len) + EXPORT_SYMBOL(strnstr); + #endif + ++#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 ++ ++#define MEMCHR_MASK_GEN(mask) (mask *= 0x0101010101010101ULL) ++ ++#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) ++ ++#define MEMCHR_MASK_GEN(mask) \ ++ do { \ ++ mask *= 0x01010101; \ ++ mask |= mask << 32; \ ++ } while (0) ++ ++#else ++ ++#define MEMCHR_MASK_GEN(mask) \ ++ do { \ ++ mask |= mask << 8; \ ++ mask |= mask << 16; \ ++ mask |= mask << 32; \ ++ } while (0) ++ ++#endif ++ + #ifndef __HAVE_ARCH_MEMCHR + /** + * memchr - Find a character in an area of memory. +- * @s: The memory area ++ * @p: The memory area + * @c: The byte to search for +- * @n: The size of the area. ++ * @length: The size of the area. 
+ * + * returns the address of the first occurrence of @c, or %NULL + * if @c is not found + */ +-void *memchr(const void *s, int c, size_t n) ++void *memchr(const void *p, int c, unsigned long length) + { +- const unsigned char *p = s; +- while (n-- != 0) { +- if ((unsigned char)c == *p++) { +- return (void *)(p - 1); ++ u64 mask, val; ++ const void *end = p + length; ++ ++ c &= 0xff; ++ if (p <= end - 8) { ++ mask = c; ++ MEMCHR_MASK_GEN(mask); ++ ++ for (; p <= end - 8; p += 8) { ++ val = *(u64 *)p ^ mask; ++ if ((val + 0xfefefefefefefeffu) & ++ (~val & 0x8080808080808080u)) ++ break; + } + } ++ ++ for (; p < end; p++) ++ if (*(unsigned char *)p == c) ++ return (void *)p; ++ + return NULL; + } + EXPORT_SYMBOL(memchr); +@@ -845,16 +882,7 @@ void *memchr_inv(const void *start, int c, size_t bytes) + return check_bytes8(start, value, bytes); + + value64 = value; +-#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 +- value64 *= 0x0101010101010101ULL; +-#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) +- value64 *= 0x01010101; +- value64 |= value64 << 32; +-#else +- value64 |= value64 << 8; +- value64 |= value64 << 16; +- value64 |= value64 << 32; +-#endif ++ MEMCHR_MASK_GEN(value64); + + prefix = (unsigned long)start % 8; + if (prefix) { +diff --git a/mm/Kconfig b/mm/Kconfig +index ff7b209dec05..bf317c39ed2d 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -602,7 +602,7 @@ config COMPACTION + config COMPACT_UNEVICTABLE_DEFAULT + int + depends on COMPACTION +- default 0 if PREEMPT_RT ++ default 0 if PREEMPT_RT || CACHY + default 1 + + # +diff --git a/mm/compaction.c b/mm/compaction.c +index 8238e83385a7..d0b16a5b30f7 100644 +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -2717,7 +2717,11 @@ static void compact_nodes(void) + * aggressively the kernel should compact memory in the + * background. It takes values in the range [0, 100]. 
+ */ ++#ifdef CONFIG_CACHY ++unsigned int __read_mostly sysctl_compaction_proactiveness; ++#else + unsigned int __read_mostly sysctl_compaction_proactiveness = 20; ++#endif + + int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index ad608ef2a243..178cfd5490b1 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -71,7 +71,11 @@ static long ratelimit_pages = 32; + /* + * Start background writeback (via writeback threads) at this percentage + */ ++#ifdef CONFIG_CACHY ++static int dirty_background_ratio = 5; ++#else + static int dirty_background_ratio = 10; ++#endif + + /* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of +@@ -99,7 +103,11 @@ static unsigned long vm_dirty_bytes; + /* + * The interval between `kupdate'-style writebacks + */ ++#ifdef CONFIG_CACHY ++unsigned int dirty_writeback_interval = 10 * 100; /* centiseconds */ ++#else + unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ ++#endif + + EXPORT_SYMBOL_GPL(dirty_writeback_interval); + +diff --git a/mm/swap.c b/mm/swap.c +index 70e2063ef43a..79ab9b1c3910 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -1134,6 +1134,10 @@ EXPORT_SYMBOL(pagevec_lookup_range_tag); + */ + void __init swap_setup(void) + { ++#ifdef CONFIG_CACHY ++ /* Only swap-in pages requested, avoid readahead */ ++ page_cluster = 0; ++#else + unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); + + /* Use a smaller cluster for small-memory machines */ +@@ -1145,4 +1149,5 @@ void __init swap_setup(void) + * Right now other parts of the system means that we + * _really_ don't want to cluster much more + */ ++#endif + } +diff --git a/mm/vmpressure.c b/mm/vmpressure.c +index b52644771cc4..11a4b0e3b583 100644 +--- a/mm/vmpressure.c ++++ b/mm/vmpressure.c +@@ -43,7 +43,11 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; + * essence, they are percents: the higher the value, the more number + * unsuccessful reclaims there were. + */ ++#ifdef CONFIG_CACHY ++static const unsigned int vmpressure_level_med = 65; ++#else + static const unsigned int vmpressure_level_med = 60; ++#endif + static const unsigned int vmpressure_level_critical = 95; + + /* +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 5b7b8d4f5297..160acbbdf111 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -190,7 +190,11 @@ struct scan_control { + /* + * From 0 .. 200. Higher means more swappy. 
+ */ ++#ifdef CONFIG_CACHY ++int vm_swappiness = 20; ++#else + int vm_swappiness = 60; ++#endif + + static void set_task_reclaim_state(struct task_struct *task, + struct reclaim_state *rs) +@@ -4559,7 +4563,11 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned + } + + /* to protect the working set of the last N jiffies */ ++#ifdef CONFIG_CACHY ++static unsigned long lru_gen_min_ttl __read_mostly = HZ; ++#else + static unsigned long lru_gen_min_ttl __read_mostly; ++#endif + + static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + { +diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c +index 0d0cc4ef2b85..544104f9f4b3 100644 +--- a/net/ipv4/sysctl_net_ipv4.c ++++ b/net/ipv4/sysctl_net_ipv4.c +@@ -1467,6 +1467,13 @@ static struct ctl_table ipv4_net_table[] = { + .extra1 = SYSCTL_ZERO, + .extra2 = &tcp_plb_max_cong_thresh, + }, ++ { ++ .procname = "tcp_collapse_max_bytes", ++ .data = &init_net.ipv4.sysctl_tcp_collapse_max_bytes, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = proc_douintvec_minmax, ++ }, + { } + }; + +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 754e0212c951..b6d7faeb737a 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -5414,6 +5414,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb) + static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) + { + struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); + + NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); + +@@ -5425,6 +5426,39 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + ++ /* For context and additional information about this patch, see the ++ * blog post at ++ * ++ * sysctl: net.ipv4.tcp_collapse_max_bytes ++ * ++ * If tcp_collapse_max_bytes is non-zero, attempt to collapse the ++ * queue to free up memory if the current amount of memory allocated ++ * is less than tcp_collapse_max_bytes. Otherwise, the packet is ++ * dropped without attempting to collapse the queue. ++ * ++ * If tcp_collapse_max_bytes is zero, this feature is disabled ++ * and the default Linux behavior is used. The default Linux ++ * behavior is to always perform the attempt to collapse the ++ * queue to free up memory. ++ * ++ * When the receive queue is small, we want to collapse the ++ * queue. There are two reasons for this: (a) the latency of ++ * performing the collapse will be small on a small queue, and ++ * (b) we want to avoid sending a congestion signal (via a ++ * packet drop) to the sender when the receive queue is small. ++ * ++ * The result is that we avoid latency spikes caused by the ++ * time it takes to perform the collapse logic when the receive ++ * queue is large and full, while preserving existing behavior ++ * and performance for all other cases. 
++ */ ++ if (net->ipv4.sysctl_tcp_collapse_max_bytes && ++ (atomic_read(&sk->sk_rmem_alloc) > net->ipv4.sysctl_tcp_collapse_max_bytes)) { ++ /* We are dropping the packet */ ++ trace_tcp_collapse_max_bytes_exceeded(sk); ++ goto do_not_collapse; ++ } ++ + tcp_collapse_ofo_queue(sk); + if (!skb_queue_empty(&sk->sk_receive_queue)) + tcp_collapse(sk, &sk->sk_receive_queue, NULL, +@@ -5443,6 +5477,8 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + ++do_not_collapse: ++ + /* If we are really being abused, tell the caller to silently + * drop receive data on the floor. It will get retransmitted + * and hopefully then we'll have sufficient space. +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c +index 8320d0ecb13a..37a09cd767a1 100644 +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -3274,6 +3274,8 @@ static int __net_init tcp_sk_init(struct net *net) + else + net->ipv4.tcp_congestion_control = &tcp_reno; + ++ net->ipv4.sysctl_tcp_collapse_max_bytes = 0; ++ + return 0; + } + +diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib +index 4a4a5f67c1a6..993e4578c0f2 100644 +--- a/scripts/Makefile.lib ++++ b/scripts/Makefile.lib +@@ -557,14 +557,21 @@ quiet_cmd_xzmisc = XZMISC $@ + # decompression is used, like initramfs decompression, zstd22 should likely not + # be used because it would require zstd to allocate a 128 MB buffer. + ++ifdef CONFIG_ZSTD_COMPRESSION_LEVEL ++zstd_comp_val := $(CONFIG_ZSTD_COMPRESSION_LEVEL) ++ifeq ($(shell test $(zstd_comp_val) -gt 19; echo $$?),0) ++zstd_comp_val += --ultra ++endif ++endif ++ + quiet_cmd_zstd = ZSTD $@ +- cmd_zstd = cat $(real-prereqs) | $(ZSTD) -19 > $@ ++ cmd_zstd = cat $(real-prereqs) | $(ZSTD) -T0 -19 > $@ + + quiet_cmd_zstd22 = ZSTD22 $@ +- cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -22 --ultra > $@ ++ cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -T0 -22 --ultra > $@ + + quiet_cmd_zstd22_with_size = ZSTD22 $@ +- cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -22 --ultra; $(size_append); } > $@ ++ cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -T0 -$(zstd_comp_val); $(size_append); } > $@ + + # ASM offsets + # --------------------------------------------------------------------------- +diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst +index 4815a8e32227..6a3c36713045 100644 +--- a/scripts/Makefile.modinst ++++ b/scripts/Makefile.modinst +@@ -100,8 +100,13 @@ quiet_cmd_gzip = GZIP $@ + cmd_gzip = $(KGZIP) -n -f $< + quiet_cmd_xz = XZ $@ + cmd_xz = $(XZ) --lzma2=dict=2MiB -f $< ++ifdef CONFIG_MODULE_COMPRESS_ZSTD_ULTRA + quiet_cmd_zstd = ZSTD $@ +- cmd_zstd = $(ZSTD) -T0 --rm -f -q $< ++ cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL_ULTRA) --ultra --zstd=wlog=21 -T0 --rm -f -q $< ++else ++quiet_cmd_zstd = ZSTD $@ ++ cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL) --zstd=wlog=21 -T0 --rm -f -q $< ++endif + + $(dst)/%.ko.gz: $(dst)/%.ko FORCE + $(call cmd,gzip) +-- +2.39.2 + +From e80cb8174e11427fa2c9a98d05cf11552767b940 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sun, 15 Jan 2023 16:51:11 +0100 +Subject: [PATCH 05/15] clr + +Signed-off-by: Peter Jung +--- + arch/x86/kernel/tsc.c | 3 ++ + arch/x86/mm/fault.c | 4 +- + drivers/cpufreq/intel_pstate.c | 7 ++++ + drivers/idle/intel_idle.c | 50 ++++++++++++------------ + drivers/input/serio/i8042.c | 10 ++--- + drivers/net/dummy.c | 2 +- + drivers/pci/pci.c | 2 +- + drivers/powercap/intel_rapl_common.c | 2 +- + 
drivers/thermal/intel/intel_powerclamp.c | 10 +++++ + fs/xattr.c | 15 +++---- + include/linux/jbd2.h | 2 +- + include/linux/wait.h | 2 + + include/uapi/linux/if_bonding.h | 2 +- + init/do_mounts.c | 16 +++++++- + kernel/locking/rwsem.c | 4 +- + kernel/sched/wait.c | 24 ++++++++++++ + kernel/watchdog.c | 2 +- + lib/raid6/algos.c | 4 +- + mm/ksm.c | 11 ++++-- + net/ipv4/inet_connection_sock.c | 2 +- + net/ipv4/tcp.c | 4 +- + 21 files changed, 123 insertions(+), 55 deletions(-) + +diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c +index a78e73da4a74..bab8a98080cf 100644 +--- a/arch/x86/kernel/tsc.c ++++ b/arch/x86/kernel/tsc.c +@@ -1569,6 +1569,9 @@ unsigned long calibrate_delay_is_known(void) + if (!constant_tsc || !mask) + return 0; + ++ if (cpu != 0) ++ return cpu_data(0).loops_per_jiffy; ++ + sibling = cpumask_any_but(mask, cpu); + if (sibling < nr_cpu_ids) + return cpu_data(sibling).loops_per_jiffy; +diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c +index 7b0d4ab894c8..1a14f52added 100644 +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -799,9 +799,9 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, + if (!printk_ratelimit()) + return; + +- printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", ++ printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx cpu %i", + loglvl, tsk->comm, task_pid_nr(tsk), address, +- (void *)regs->ip, (void *)regs->sp, error_code); ++ (void *)regs->ip, (void *)regs->sp, error_code, raw_smp_processor_id()); + + print_vma_addr(KERN_CONT " in ", regs->ip); + +diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c +index fd73d6d2b808..0c0071ab3966 100644 +--- a/drivers/cpufreq/intel_pstate.c ++++ b/drivers/cpufreq/intel_pstate.c +@@ -366,6 +366,13 @@ static void intel_pstate_set_itmt_prio(int cpu) + * update them at any time after it has been called. + */ + sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu); ++ /* ++ * On some systems with overclocking enabled, CPPC.highest_perf is hardcoded to 0xff. ++ * In this case we can't use CPPC.highest_perf to enable ITMT. ++ * In this case we can look at MSR_HWP_CAPABILITIES bits [8:0] to decide. 
++ */ ++ if (cppc_perf.highest_perf == 0xff) ++ cppc_perf.highest_perf = HWP_HIGHEST_PERF(READ_ONCE(all_cpu_data[cpu]->hwp_cap_cached)); + + if (max_highest_perf <= min_highest_perf) { + if (cppc_perf.highest_perf > max_highest_perf) +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index cfeb24d40d37..8d1945afa973 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -578,7 +578,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -586,7 +586,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 33, +- .target_residency = 100, ++ .target_residency = 900, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -594,7 +594,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -602,7 +602,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 1500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -610,7 +610,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -618,7 +618,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 5000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -626,7 +626,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -646,7 +646,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -654,7 +654,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 40, +- .target_residency = 100, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -662,7 +662,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -670,7 +670,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 
0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -678,7 +678,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 4000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -686,7 +686,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 7000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -694,7 +694,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -715,7 +715,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -723,7 +723,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 70, +- .target_residency = 100, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -731,7 +731,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, + .exit_latency = 85, +- .target_residency = 200, ++ .target_residency = 600, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -739,7 +739,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x33", + .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, + .exit_latency = 124, +- .target_residency = 800, ++ .target_residency = 3000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -747,7 +747,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, + .exit_latency = 200, +- .target_residency = 800, ++ .target_residency = 3200, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -755,7 +755,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, + .exit_latency = 480, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -763,7 +763,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, + .exit_latency = 890, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -784,7 +784,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, +- 
.target_residency = 20, ++ .target_residency = 300, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -813,7 +813,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 4, +- .target_residency = 4, ++ .target_residency = 40, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -821,7 +821,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 170, +- .target_residency = 600, ++ .target_residency = 900, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -987,7 +987,7 @@ static struct cpuidle_state spr_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 2, +- .target_residency = 4, ++ .target_residency = 40, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c +index 6dac7c1853a5..fab04cd8a7a0 100644 +--- a/drivers/input/serio/i8042.c ++++ b/drivers/input/serio/i8042.c +@@ -621,7 +621,7 @@ static int i8042_enable_kbd_port(void) + if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { + i8042_ctr &= ~I8042_CTR_KBDINT; + i8042_ctr |= I8042_CTR_KBDDIS; +- pr_err("Failed to enable KBD port\n"); ++ pr_info("Failed to enable KBD port\n"); + return -EIO; + } + +@@ -640,7 +640,7 @@ static int i8042_enable_aux_port(void) + if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { + i8042_ctr &= ~I8042_CTR_AUXINT; + i8042_ctr |= I8042_CTR_AUXDIS; +- pr_err("Failed to enable AUX port\n"); ++ pr_info("Failed to enable AUX port\n"); + return -EIO; + } + +@@ -732,7 +732,7 @@ static int i8042_check_mux(void) + i8042_ctr &= ~I8042_CTR_AUXINT; + + if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { +- pr_err("Failed to disable AUX port, can't use MUX\n"); ++ pr_info("Failed to disable AUX port, can't use MUX\n"); + return -EIO; + } + +@@ -955,7 +955,7 @@ static int i8042_controller_selftest(void) + do { + + if (i8042_command(¶m, I8042_CMD_CTL_TEST)) { +- pr_err("i8042 controller selftest timeout\n"); ++ pr_info("i8042 controller selftest timeout\n"); + return -ENODEV; + } + +@@ -977,7 +977,7 @@ static int i8042_controller_selftest(void) + pr_info("giving up on controller selftest, continuing anyway...\n"); + return 0; + #else +- pr_err("i8042 controller selftest failed\n"); ++ pr_info("i8042 controller selftest failed\n"); + return -EIO; + #endif + } +diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c +index c4b1b0aa438a..06b00f7a8eab 100644 +--- a/drivers/net/dummy.c ++++ b/drivers/net/dummy.c +@@ -43,7 +43,7 @@ + + #define DRV_NAME "dummy" + +-static int numdummies = 1; ++static int numdummies = 0; + + /* fake multicast ability */ + static void set_multicast_list(struct net_device *dev) +diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c +index 5641786bd020..0ef504e909db 100644 +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -62,7 +62,7 @@ struct pci_pme_device { + struct pci_dev *dev; + }; + +-#define PME_TIMEOUT 1000 /* How long between PME checks */ ++#define PME_TIMEOUT 4000 /* How long between PME checks */ + + static void pci_dev_d3_sleep(struct pci_dev *dev) + { +diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c +index 26d00b1853b4..3e239d6548b5 100644 +--- a/drivers/powercap/intel_rapl_common.c ++++ 
b/drivers/powercap/intel_rapl_common.c +@@ -1518,7 +1518,7 @@ static int __init rapl_init(void) + + id = x86_match_cpu(rapl_ids); + if (!id) { +- pr_err("driver does not support CPU family %d model %d\n", ++ pr_info("driver does not support CPU family %d model %d\n", + boot_cpu_data.x86, boot_cpu_data.x86_model); + + return -ENODEV; +diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c +index b80e25ec1261..187b4ee6e9f5 100644 +--- a/drivers/thermal/intel/intel_powerclamp.c ++++ b/drivers/thermal/intel/intel_powerclamp.c +@@ -627,6 +627,11 @@ static const struct thermal_cooling_device_ops powerclamp_cooling_ops = { + .set_cur_state = powerclamp_set_cur_state, + }; + ++static const struct x86_cpu_id amd_cpu[] = { ++ { X86_VENDOR_AMD }, ++ {}, ++}; ++ + static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = { + X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL), + {} +@@ -636,6 +641,11 @@ MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids); + static int __init powerclamp_probe(void) + { + ++ if (x86_match_cpu(amd_cpu)){ ++ pr_info("Intel PowerClamp does not support AMD CPUs\n"); ++ return -ENODEV; ++ } ++ + if (!x86_match_cpu(intel_powerclamp_ids)) { + pr_err("CPU does not support MWAIT\n"); + return -ENODEV; +diff --git a/fs/xattr.c b/fs/xattr.c +index adab9a70b536..4ada829a3b1b 100644 +--- a/fs/xattr.c ++++ b/fs/xattr.c +@@ -139,16 +139,17 @@ xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, + } + + /* +- * In the user.* namespace, only regular files and directories can have +- * extended attributes. For sticky directories, only the owner and +- * privileged users can write attributes. ++ * In the user.* namespace, only regular files, symbolic links, and ++ * directories can have extended attributes. For symbolic links and ++ * sticky directories, only the owner and privileged users can write ++ * attributes. + */ + if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { +- if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) ++ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode)) + return (mask & MAY_WRITE) ? -EPERM : -ENODATA; +- if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && +- (mask & MAY_WRITE) && +- !inode_owner_or_capable(mnt_userns, inode)) ++ if (((S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX)) ++ || S_ISLNK(inode->i_mode)) && (mask & MAY_WRITE) ++ && !inode_owner_or_capable(mnt_userns, inode)) + return -EPERM; + } + +diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h +index 2170e0cc279d..e8fa79f5bb34 100644 +--- a/include/linux/jbd2.h ++++ b/include/linux/jbd2.h +@@ -45,7 +45,7 @@ + /* + * The default maximum commit age, in seconds. 
+ */ +-#define JBD2_DEFAULT_MAX_COMMIT_AGE 5 ++#define JBD2_DEFAULT_MAX_COMMIT_AGE 30 + + #ifdef CONFIG_JBD2_DEBUG + /* +diff --git a/include/linux/wait.h b/include/linux/wait.h +index a0307b516b09..edc21128f387 100644 +--- a/include/linux/wait.h ++++ b/include/linux/wait.h +@@ -165,6 +165,7 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) + + extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); ++extern void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + +@@ -1192,6 +1193,7 @@ do { \ + */ + void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); + bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); ++void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); + long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); + void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); +diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h +index d174914a837d..bf8e2af101a3 100644 +--- a/include/uapi/linux/if_bonding.h ++++ b/include/uapi/linux/if_bonding.h +@@ -82,7 +82,7 @@ + #define BOND_STATE_ACTIVE 0 /* link is active */ + #define BOND_STATE_BACKUP 1 /* link is backup */ + +-#define BOND_DEFAULT_MAX_BONDS 1 /* Default maximum number of devices to support */ ++#define BOND_DEFAULT_MAX_BONDS 0 /* Default maximum number of devices to support */ + + #define BOND_DEFAULT_TX_QUEUES 16 /* Default number of tx queues per device */ + +diff --git a/init/do_mounts.c b/init/do_mounts.c +index 811e94daf0a8..06fef7f97c02 100644 +--- a/init/do_mounts.c ++++ b/init/do_mounts.c +@@ -283,8 +283,18 @@ dev_t name_to_dev_t(const char *name) + if (strcmp(name, "/dev/ram") == 0) + return Root_RAM0; + #ifdef CONFIG_BLOCK +- if (strncmp(name, "PARTUUID=", 9) == 0) +- return devt_from_partuuid(name + 9); ++ if (strncmp(name, "PARTUUID=", 9) == 0) { ++ dev_t res; ++ int needtowait = 40<<1; ++ res = devt_from_partuuid(name + 9); ++ while (!res && needtowait) { ++ /* waiting 0.5 sec */ ++ msleep(500); ++ res = devt_from_partuuid(name + 9); ++ needtowait--; ++ } ++ return res; ++ } + if (strncmp(name, "PARTLABEL=", 10) == 0) + return devt_from_partlabel(name + 10); + if (strncmp(name, "/dev/", 5) == 0) +@@ -612,7 +622,9 @@ void __init prepare_namespace(void) + * For example, it is not atypical to wait 5 seconds here + * for the touchpad of a laptop to initialize. 
+ */ ++ async_synchronize_full(); + wait_for_device_probe(); ++ async_synchronize_full(); + + md_run_setup(); + +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index 44873594de03..fe62d59f2bdc 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -755,6 +755,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; ++ int i = 0; + + lockdep_assert_preemption_disabled(); + +@@ -791,7 +792,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) + break; + } + +- cpu_relax(); ++ if (i++ > 1000) ++ cpu_relax(); + } + + return state; +diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c +index 133b74730738..1647fb8662eb 100644 +--- a/kernel/sched/wait.c ++++ b/kernel/sched/wait.c +@@ -47,6 +47,17 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_ + } + EXPORT_SYMBOL_GPL(add_wait_queue_priority); + ++void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) ++{ ++ unsigned long flags; ++ ++ wq_entry->flags |= WQ_FLAG_EXCLUSIVE; ++ spin_lock_irqsave(&wq_head->lock, flags); ++ __add_wait_queue(wq_head, wq_entry); ++ spin_unlock_irqrestore(&wq_head->lock, flags); ++} ++EXPORT_SYMBOL(add_wait_queue_exclusive_lifo); ++ + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) + { + unsigned long flags; +@@ -293,6 +304,19 @@ prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_ent + } + EXPORT_SYMBOL(prepare_to_wait_exclusive); + ++void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) ++{ ++ unsigned long flags; ++ ++ wq_entry->flags |= WQ_FLAG_EXCLUSIVE; ++ spin_lock_irqsave(&wq_head->lock, flags); ++ if (list_empty(&wq_entry->entry)) ++ __add_wait_queue(wq_head, wq_entry); ++ set_current_state(state); ++ spin_unlock_irqrestore(&wq_head->lock, flags); ++} ++EXPORT_SYMBOL(prepare_to_wait_exclusive_lifo); ++ + void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) + { + wq_entry->flags = flags; +diff --git a/kernel/watchdog.c b/kernel/watchdog.c +index 8e61f21e7e33..be1439d38f26 100644 +--- a/kernel/watchdog.c ++++ b/kernel/watchdog.c +@@ -41,7 +41,7 @@ unsigned long __read_mostly watchdog_enabled; + int __read_mostly watchdog_user_enabled = 1; + int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; + int __read_mostly soft_watchdog_user_enabled = 1; +-int __read_mostly watchdog_thresh = 10; ++int __read_mostly watchdog_thresh = 40; + static int __read_mostly nmi_watchdog_available; + + struct cpumask watchdog_cpumask __read_mostly; +diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c +index a22a05c9af8a..a70bcbbd1673 100644 +--- a/lib/raid6/algos.c ++++ b/lib/raid6/algos.c +@@ -126,8 +126,10 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void) + + for (best = NULL, algo = raid6_recov_algos; *algo; algo++) + if (!best || (*algo)->priority > best->priority) +- if (!(*algo)->valid || (*algo)->valid()) ++ if (!(*algo)->valid || (*algo)->valid()) { + best = *algo; ++ break; ++ } + + if (best) { + raid6_2data_recov = best->data2; +diff --git a/mm/ksm.c b/mm/ksm.c +index addf490da146..a92c9594a2d3 100644 +--- a/mm/ksm.c ++++ b/mm/ksm.c +@@ -2454,9 +2454,14 @@ static int ksm_scan_thread(void *nothing) + + if (ksmd_should_run()) { + sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); +- wait_event_interruptible_timeout(ksm_iter_wait, +- sleep_ms != 
READ_ONCE(ksm_thread_sleep_millisecs), +- msecs_to_jiffies(sleep_ms)); ++ if (sleep_ms >= 1000) ++ wait_event_interruptible_timeout(ksm_iter_wait, ++ sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), ++ msecs_to_jiffies(round_jiffies_relative(sleep_ms))); ++ else ++ wait_event_interruptible_timeout(ksm_iter_wait, ++ sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), ++ msecs_to_jiffies(sleep_ms)); + } else { + wait_event_freezable(ksm_thread_wait, + ksmd_should_run() || kthread_should_stop()); +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c +index f2c43f67187d..9885bfb429a2 100644 +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -606,7 +606,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) + * having to remove and re-insert us on the wait queue. + */ + for (;;) { +- prepare_to_wait_exclusive(sk_sleep(sk), &wait, ++ prepare_to_wait_exclusive_lifo(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + release_sock(sk); + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index e9e8040d6491..f9b56123b3b8 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -4815,8 +4815,8 @@ void __init tcp_init(void) + tcp_init_mem(); + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); +- max_wshare = min(4UL*1024*1024, limit); +- max_rshare = min(6UL*1024*1024, limit); ++ max_wshare = min(16UL*1024*1024, limit); ++ max_rshare = min(16UL*1024*1024, limit); + + init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; + init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; +-- +2.39.2 + +From 952f0ec42e0dddee76cb525f4cca1fe60e910b95 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Tue, 21 Feb 2023 10:27:37 +0100 +Subject: [PATCH 06/15] fixes + +Signed-off-by: Peter Jung +--- + Documentation/ABI/stable/sysfs-block | 10 + + .../testing/sysfs-class-led-trigger-blkdev | 78 ++ + Documentation/admin-guide/mm/ksm.rst | 7 + + Documentation/leds/index.rst | 1 + + Documentation/leds/ledtrig-blkdev.rst | 158 +++ + arch/x86/boot/compressed/Makefile | 2 +- + arch/x86/kernel/acpi/boot.c | 19 +- + arch/x86/mm/tlb.c | 2 +- + drivers/acpi/acpica/Makefile | 2 +- + drivers/bluetooth/btusb.c | 9 + + drivers/char/tpm/tpm-chip.c | 62 +- + drivers/char/tpm/tpm.h | 73 + + drivers/hwmon/nct6775-core.c | 2 +- + drivers/leds/trigger/Kconfig | 9 + + drivers/leds/trigger/Makefile | 1 + + drivers/leds/trigger/ledtrig-blkdev.c | 1220 +++++++++++++++++ + drivers/md/dm.c | 2 + + fs/eventpoll.c | 2 +- + fs/proc/base.c | 1 + + include/linux/mm_types.h | 7 +- + include/linux/pageblock-flags.h | 2 +- + kernel/kthread.c | 5 + + kernel/padata.c | 4 +- + lib/string.c | 10 +- + lib/zstd/decompress/huf_decompress.c | 2 +- + mm/compaction.c | 75 +- + mm/internal.h | 6 +- + mm/ksm.c | 185 ++- + mm/page_alloc.c | 22 +- + mm/z3fold.c | 2 - + mm/zsmalloc.c | 3 - + scripts/Kconfig.include | 2 +- + scripts/Makefile.compiler | 8 +- + scripts/Makefile.vmlinux_o | 2 +- + scripts/as-version.sh | 2 +- + security/Kconfig.hardening | 3 + + sound/pci/hda/cs35l41_hda.c | 2 +- + .../selftests/vm/ksm_functional_tests.c | 96 +- + 38 files changed, 1992 insertions(+), 106 deletions(-) + create mode 100644 Documentation/ABI/testing/sysfs-class-led-trigger-blkdev + create mode 100644 Documentation/leds/ledtrig-blkdev.rst + create mode 100644 drivers/leds/trigger/ledtrig-blkdev.c + +diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block +index 
cd14ecb3c9a5..853cb2601242 100644 +--- a/Documentation/ABI/stable/sysfs-block ++++ b/Documentation/ABI/stable/sysfs-block +@@ -101,6 +101,16 @@ Description: + devices that support receiving integrity metadata. + + ++What: /sys/block//linked_leds ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Directory that contains symbolic links to all LEDs that ++ are associated with (linked to) this block device by the ++ blkdev LED trigger. Only present when at least one LED ++ is linked. (See Documentation/leds/ledtrig-blkdev.rst.) ++ ++ + What: /sys/block///alignment_offset + Date: April 2009 + Contact: Martin K. Petersen +diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev +new file mode 100644 +index 000000000000..45275eb0bad3 +--- /dev/null ++++ b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev +@@ -0,0 +1,78 @@ ++What: /sys/class/leds//blink_time ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Time (in milliseconds) that the LED will be on during a single ++ "blink". ++ ++What: /sys/class/leds//check_interval ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Interval (in milliseconds) between checks of the block devices ++ linked to this LED. The LED will be blinked if the correct type ++ of activity (see blink_on_{read,write,discard,flush} attributes) ++ has occurred on any of the linked devices since the previous ++ check. ++ ++What: /sys/class/leds//blink_on_read ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Boolean that determines whether the LED will blink in response ++ to read activity on any of its linked block devices. ++ ++What: /sys/class/leds//blink_on_write ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Boolean that determines whether the LED will blink in response ++ to write activity on any of its linked block devices. ++ ++What: /sys/class/leds//blink_on_discard ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Boolean that determines whether the LED will blink in response ++ to discard activity on any of its linked block devices. ++ ++What: /sys/class/leds//blink_on_flush ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Boolean that determines whether the LED will blink in response ++ to cache flush activity on any of its linked block devices. ++ ++What: /sys/class/leds//link_dev_by_path ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Associate a block device with this LED by writing the path to ++ the device special file (e.g. /dev/sda) to this attribute. ++ Symbolic links are followed. ++ ++What: /sys/class/leds//unlink_dev_by_path ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Remove the association between this LED and a block device by ++ writing the path to the device special file (e.g. /dev/sda) to ++ this attribute. Symbolic links are followed. ++ ++What: /sys/class/leds//unlink_dev_by_name ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Remove the association between this LED and a block device by ++ writing the kernel name of the device (e.g. sda) to this ++ attribute. ++ ++What: /sys/class/leds//linked_devices ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Directory containing links to all block devices that are ++ associated with this LED. (Note that the names of the ++ symbolic links in this directory are *kernel* names, which ++ may not match the device special file paths written to ++ link_device and unlink_device.) 
+diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst +index fb6ba2002a4b..f160f9487a90 100644 +--- a/Documentation/admin-guide/mm/ksm.rst ++++ b/Documentation/admin-guide/mm/ksm.rst +@@ -173,6 +173,13 @@ stable_node_chains + the number of KSM pages that hit the ``max_page_sharing`` limit + stable_node_dups + number of duplicated KSM pages ++zero_pages_sharing ++ how many empty pages are sharing kernel zero page(s) instead of ++ with each other as it would happen normally. Only effective when ++ enabling ``use_zero_pages`` knob. ++ ++When enabling ``use_zero_pages``, the sum of ``pages_sharing`` + ++``zero_pages_sharing`` represents how much really saved by KSM. + + A high ratio of ``pages_sharing`` to ``pages_shared`` indicates good + sharing, but a high ratio of ``pages_unshared`` to ``pages_sharing`` +diff --git a/Documentation/leds/index.rst b/Documentation/leds/index.rst +index e5d63b940045..e3c24e468cbc 100644 +--- a/Documentation/leds/index.rst ++++ b/Documentation/leds/index.rst +@@ -10,6 +10,7 @@ LEDs + leds-class + leds-class-flash + leds-class-multicolor ++ ledtrig-blkdev + ledtrig-oneshot + ledtrig-transient + ledtrig-usbport +diff --git a/Documentation/leds/ledtrig-blkdev.rst b/Documentation/leds/ledtrig-blkdev.rst +new file mode 100644 +index 000000000000..9ff5b99de451 +--- /dev/null ++++ b/Documentation/leds/ledtrig-blkdev.rst +@@ -0,0 +1,158 @@ ++.. SPDX-License-Identifier: GPL-2.0 ++ ++================================= ++Block Device (blkdev) LED Trigger ++================================= ++ ++Available when ``CONFIG_LEDS_TRIGGER_BLKDEV=y`` or ++``CONFIG_LEDS_TRIGGER_BLKDEV=m``. ++ ++See also: ++ ++* ``Documentation/ABI/testing/sysfs-class-led-trigger-blkdev`` ++* ``Documentation/ABI/stable/sysfs-block`` (``/sys/block//linked_leds``) ++ ++Overview ++======== ++ ++.. note:: ++ The examples below use ```` to refer to the name of a ++ system-specific LED. If no suitable LED is available on a test ++ system (in a virtual machine, for example), it is possible to ++ use a userspace LED. (See ``Documentation/leds/uleds.rst``.) ++ ++Verify that the ``blkdev`` LED trigger is available:: ++ ++ # grep blkdev /sys/class/leds//trigger ++ ... rfkill-none blkdev ++ ++(If the previous command produces no output, you may need to load the trigger ++module - ``modprobe ledtrig_blkdev``. If the module is not available, check ++the value of ``CONFIG_LEDS_TRIGGER_BLKDEV`` in your kernel configuration.) ++ ++Associate the LED with the ``blkdev`` LED trigger:: ++ ++ # echo blkdev > /sys/class/leds//trigger ++ ++ # cat /sys/class/leds//trigger ++ ... rfkill-none [blkdev] ++ ++Note that several new device attributes are available in the ++``/sys/class/leds/`` directory. ++ ++* ``link_dev_by_path``, ``unlink_dev_by_path``, and ``unlink_dev_by_name`` are ++ used to manage the set of block devices associated with this LED. The LED ++ will blink when activity occurs on any of its linked devices. ++ ++* ``blink_on_read``, ``blink_on_write``, ``blink_on_discard``, and ++ ``blink_on_flush`` are boolean values that determine whether the LED will ++ blink when a particular type of activity is detected on one of its linked ++ block devices. ++ ++* ``blink_time`` is the duration (in milliseconds) of each blink of this LED. ++ (The minimum value is 10 milliseconds.) ++ ++* ``check_interval`` is the frequency (in milliseconds) with which block devices ++ linked to this LED will be checked for activity and the LED blinked (if the ++ correct type of activity has occurred). 
++ ++* The ``linked_devices`` directory will contain a symbolic link to every device ++ that is associated with this LED. ++ ++Link a block device to the LED:: ++ ++ # echo /dev/sda > /sys/class/leds//link_dev_by_path ++ ++ # ls /sys/class/leds//linked_devices ++ sda ++ ++(The value written to ``link_dev_by_path`` must be the path of the device ++special file, such as ``/dev/sda``, that represents the block device - or the ++path of a symbolic link to such a device special file.) ++ ++Activity on the device will now cause the LED to blink. The duration of each ++blink (in milliseconds) can be adjusted by setting ++``/sys/class/leds//blink_time``. (But see **check_interval and ++blink_time** below.) ++ ++Associate a second device with the LED:: ++ ++ # echo /dev/sdb > /sys/class/leds//link_dev_by_path ++ ++ # ls /sys/class/leds//linked_devices ++ sda sdb ++ ++When a block device is linked to one or more LEDs, the LEDs are linked from ++the device's ``linked_leds`` directory:: ++ ++ # ls /sys/class/block/sd{a,b}/linked_leds ++ /sys/class/block/sda/linked_leds: ++ ++ ++ /sys/class/block/sdb/linked_leds: ++ ++ ++(The ``linked_leds`` directory only exists when the block device is linked to ++at least one LED.) ++ ++``check_interval`` and ``blink_time`` ++===================================== ++ ++* By default, linked block devices are checked for activity every 100 ++ milliseconds. This frequency can be changed for an LED via the ++ ``/sys/class/leds//check_interval`` attribute. (The minimum value is 25 ++ milliseconds.) ++ ++* All block devices associated with an LED are checked for activity every ++ ``check_interval`` milliseconds, and a blink is triggered if the correct type ++ of activity (as determined by the LED's ``blink_on_*`` attributes) is ++ detected. The duration of an LED's blink is determined by its ``blink_time`` ++ attribute. Thus (when the correct type of activity is detected), the LED will ++ be on for ``blink_time`` milliseconds and off for ++ ``check_interval - blink_time`` milliseconds. ++ ++* The LED subsystem ignores new blink requests for an LED that is already in ++ in the process of blinking, so setting a ``blink_time`` greater than or equal ++ to ``check_interval`` will cause some blinks to be missed. ++ ++* Because of processing times, scheduling latencies, etc., avoiding missed ++ blinks actually requires a difference of at least a few milliseconds between ++ the ``blink_time`` and ``check_interval``. The required difference is likely ++ to vary from system to system. As a reference, a Thecus N5550 NAS requires a ++ difference of 7 milliseconds (e.g. ``check_interval == 100``, ++ ``blink_time == 93``). ++ ++* The default values (``check_interval == 100``, ``blink_time == 75``) cause the ++ LED associated with a continuously active device to blink rapidly. For a more ++ "always on" effect, increase the ``blink_time`` (but not too much; see the ++ previous bullet). 
++ ++Other Notes ++=========== ++ ++* Many (possibly all) types of block devices work with this trigger, including: ++ ++ * SCSI (including SATA and USB) hard disk drives and SSDs ++ * SCSI (including SATA and USB) optical drives ++ * NVMe SSDs ++ * SD cards ++ * loopback block devices (``/dev/loop*``) ++ * device mapper devices, such as LVM logical volumes ++ * MD RAID devices ++ * zRAM compressed RAM-disks ++ * partitions on block devices that support them ++ ++* The names of the symbolic links in ``/sys/class/leds//linked_devices`` ++ are **kernel** names, which may not match the paths used for ++ ``link_dev_by_path`` and ``unlink_dev_by_path``. This is most likely when a ++ symbolic link is used to refer to the device (as is common with logical ++ volumes), but it can be true for any device, because nothing prevents the ++ creation of device special files with arbitrary names (e.g. ++ ``sudo mknod /foo b 8 0``). ++ ++ Kernel names can be used to unlink block devices from LEDs by writing them to ++ the LED's ``unlink_dev_by_name`` attribute. ++ ++* The ``blkdev`` LED trigger supports many-to-many device/LED associations. ++ A device can be associated with multiple LEDs, and an LED can be associated ++ with multiple devices. +diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile +index d995595394bb..19d1fb601796 100644 +--- a/arch/x86/boot/compressed/Makefile ++++ b/arch/x86/boot/compressed/Makefile +@@ -50,7 +50,7 @@ KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) + KBUILD_CFLAGS += -fno-asynchronous-unwind-tables + KBUILD_CFLAGS += -D__DISABLE_EXPORTS + # Disable relocation relaxation in case the link is not PIE. +-KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) ++KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no) + KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h + + # sev.c indirectly inludes inat-table.h which is generated during +diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c +index 907cc98b1938..518bda50068c 100644 +--- a/arch/x86/kernel/acpi/boot.c ++++ b/arch/x86/kernel/acpi/boot.c +@@ -188,6 +188,17 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) + return cpu; + } + ++static bool __init acpi_is_processor_usable(u32 lapic_flags) ++{ ++ if (lapic_flags & ACPI_MADT_ENABLED) ++ return true; ++ ++ if (acpi_support_online_capable && (lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) ++ return true; ++ ++ return false; ++} ++ + static int __init + acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) + { +@@ -212,6 +223,10 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) + if (apic_id == 0xffffffff) + return 0; + ++ /* don't register processors that cannot be onlined */ ++ if (!acpi_is_processor_usable(processor->lapic_flags)) ++ return 0; ++ + /* + * We need to register disabled CPU as well to permit + * counting disabled CPUs. 
This allows us to size +@@ -250,9 +265,7 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) + return 0; + + /* don't register processors that can not be onlined */ +- if (acpi_support_online_capable && +- !(processor->lapic_flags & ACPI_MADT_ENABLED) && +- !(processor->lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) ++ if (!acpi_is_processor_usable(processor->lapic_flags)) + return 0; + + /* +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index c1e31e9a85d7..92d73ccede70 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -1205,7 +1205,7 @@ void __flush_tlb_all(void) + */ + VM_WARN_ON_ONCE(preemptible()); + +- if (boot_cpu_has(X86_FEATURE_PGE)) { ++ if (cpu_feature_enabled(X86_FEATURE_PGE)) { + __flush_tlb_global(); + } else { + /* +diff --git a/drivers/acpi/acpica/Makefile b/drivers/acpi/acpica/Makefile +index 9e0d95d76fff..30f3fc13c29d 100644 +--- a/drivers/acpi/acpica/Makefile ++++ b/drivers/acpi/acpica/Makefile +@@ -3,7 +3,7 @@ + # Makefile for ACPICA Core interpreter + # + +-ccflags-y := -Os -D_LINUX -DBUILDING_ACPICA ++ccflags-y := -D_LINUX -DBUILDING_ACPICA + ccflags-$(CONFIG_ACPI_DEBUG) += -DACPI_DEBUG_OUTPUT + + # use acpi.o to put all files here into acpi.o modparam namespace +diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c +index 2ad4efdd9e40..afd2f08ffe30 100644 +--- a/drivers/bluetooth/btusb.c ++++ b/drivers/bluetooth/btusb.c +@@ -64,6 +64,7 @@ static struct usb_driver btusb_driver; + #define BTUSB_INTEL_BROKEN_SHUTDOWN_LED BIT(24) + #define BTUSB_INTEL_BROKEN_INITIAL_NCMD BIT(25) + #define BTUSB_INTEL_NO_WBS_SUPPORT BIT(26) ++#define BTUSB_ACTIONS_SEMI BIT(27) + + static const struct usb_device_id btusb_table[] = { + /* Generic Bluetooth USB device */ +@@ -677,6 +678,9 @@ static const struct usb_device_id blacklist_table[] = { + { USB_DEVICE(0x0cb5, 0xc547), .driver_info = BTUSB_REALTEK | + BTUSB_WIDEBAND_SPEECH }, + ++ /* Actions Semiconductor ATS2851 based devices */ ++ { USB_DEVICE(0x10d7, 0xb012), .driver_info = BTUSB_ACTIONS_SEMI }, ++ + /* Silicon Wave based devices */ + { USB_DEVICE(0x0c10, 0x0000), .driver_info = BTUSB_SWAVE }, + +@@ -4098,6 +4102,11 @@ static int btusb_probe(struct usb_interface *intf, + set_bit(BTUSB_USE_ALT3_FOR_WBS, &data->flags); + } + ++ if (id->driver_info & BTUSB_ACTIONS_SEMI) { ++ /* Support is advertised, but not implemented */ ++ set_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks); ++ } ++ + if (!reset) + set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks); + +diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c +index 741d8f3e8fb3..348dd5705fbb 100644 +--- a/drivers/char/tpm/tpm-chip.c ++++ b/drivers/char/tpm/tpm-chip.c +@@ -512,6 +512,65 @@ static int tpm_add_legacy_sysfs(struct tpm_chip *chip) + return 0; + } + ++static bool tpm_is_rng_defective(struct tpm_chip *chip) ++{ ++ int ret; ++ u64 version; ++ u32 val1, val2; ++ ++ /* No known-broken TPM1 chips. 
*/ ++ if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) ++ return false; ++ ++ ret = tpm_request_locality(chip); ++ if (ret) ++ return false; ++ ++ /* Some AMD fTPM versions may cause stutter */ ++ ret = tpm2_get_tpm_pt(chip, TPM2_PT_MANUFACTURER, &val1, NULL); ++ if (ret) ++ goto release; ++ if (val1 != 0x414D4400U /* AMD */) { ++ ret = -ENODEV; ++ goto release; ++ } ++ ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_1, &val1, NULL); ++ if (ret) ++ goto release; ++ ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_2, &val2, NULL); ++ if (ret) ++ goto release; ++ ++release: ++ tpm_relinquish_locality(chip); ++ ++ if (ret) ++ return false; ++ ++ version = ((u64)val1 << 32) | val2; ++ /* ++ * Fixes for stutter as described in ++ * https://www.amd.com/en/support/kb/faq/pa-410 ++ * are available in two series of fTPM firmware: ++ * 6.x.y.z series: 6.0.18.6 + ++ * 3.x.y.z series: 3.57.x.5 + ++ */ ++ if ((version >> 48) == 6) { ++ if (version >= 0x0006000000180006ULL) ++ return false; ++ } else if ((version >> 48) == 3) { ++ if (version >= 0x0003005700000005ULL) ++ return false; ++ } else { ++ return false; ++ } ++ dev_warn(&chip->dev, ++ "AMD fTPM version 0x%llx causes system stutter; hwrng disabled\n", ++ version); ++ ++ return true; ++} ++ + static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) + { + struct tpm_chip *chip = container_of(rng, struct tpm_chip, hwrng); +@@ -521,7 +580,8 @@ static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) + + static int tpm_add_hwrng(struct tpm_chip *chip) + { +- if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM) || tpm_is_firmware_upgrade(chip)) ++ if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM) || tpm_is_firmware_upgrade(chip) || ++ tpm_is_rng_defective(chip)) + return 0; + + snprintf(chip->hwrng_name, sizeof(chip->hwrng_name), +diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h +index 24ee4e1cc452..830014a26609 100644 +--- a/drivers/char/tpm/tpm.h ++++ b/drivers/char/tpm/tpm.h +@@ -150,6 +150,79 @@ enum tpm_sub_capabilities { + TPM_CAP_PROP_TIS_DURATION = 0x120, + }; + ++enum tpm2_pt_props { ++ TPM2_PT_NONE = 0x00000000, ++ TPM2_PT_GROUP = 0x00000100, ++ TPM2_PT_FIXED = TPM2_PT_GROUP * 1, ++ TPM2_PT_FAMILY_INDICATOR = TPM2_PT_FIXED + 0, ++ TPM2_PT_LEVEL = TPM2_PT_FIXED + 1, ++ TPM2_PT_REVISION = TPM2_PT_FIXED + 2, ++ TPM2_PT_DAY_OF_YEAR = TPM2_PT_FIXED + 3, ++ TPM2_PT_YEAR = TPM2_PT_FIXED + 4, ++ TPM2_PT_MANUFACTURER = TPM2_PT_FIXED + 5, ++ TPM2_PT_VENDOR_STRING_1 = TPM2_PT_FIXED + 6, ++ TPM2_PT_VENDOR_STRING_2 = TPM2_PT_FIXED + 7, ++ TPM2_PT_VENDOR_STRING_3 = TPM2_PT_FIXED + 8, ++ TPM2_PT_VENDOR_STRING_4 = TPM2_PT_FIXED + 9, ++ TPM2_PT_VENDOR_TPM_TYPE = TPM2_PT_FIXED + 10, ++ TPM2_PT_FIRMWARE_VERSION_1 = TPM2_PT_FIXED + 11, ++ TPM2_PT_FIRMWARE_VERSION_2 = TPM2_PT_FIXED + 12, ++ TPM2_PT_INPUT_BUFFER = TPM2_PT_FIXED + 13, ++ TPM2_PT_HR_TRANSIENT_MIN = TPM2_PT_FIXED + 14, ++ TPM2_PT_HR_PERSISTENT_MIN = TPM2_PT_FIXED + 15, ++ TPM2_PT_HR_LOADED_MIN = TPM2_PT_FIXED + 16, ++ TPM2_PT_ACTIVE_SESSIONS_MAX = TPM2_PT_FIXED + 17, ++ TPM2_PT_PCR_COUNT = TPM2_PT_FIXED + 18, ++ TPM2_PT_PCR_SELECT_MIN = TPM2_PT_FIXED + 19, ++ TPM2_PT_CONTEXT_GAP_MAX = TPM2_PT_FIXED + 20, ++ TPM2_PT_NV_COUNTERS_MAX = TPM2_PT_FIXED + 22, ++ TPM2_PT_NV_INDEX_MAX = TPM2_PT_FIXED + 23, ++ TPM2_PT_MEMORY = TPM2_PT_FIXED + 24, ++ TPM2_PT_CLOCK_UPDATE = TPM2_PT_FIXED + 25, ++ TPM2_PT_CONTEXT_HASH = TPM2_PT_FIXED + 26, ++ TPM2_PT_CONTEXT_SYM = TPM2_PT_FIXED + 27, ++ TPM2_PT_CONTEXT_SYM_SIZE = TPM2_PT_FIXED + 28, ++ TPM2_PT_ORDERLY_COUNT = TPM2_PT_FIXED + 29, 
++ TPM2_PT_MAX_COMMAND_SIZE = TPM2_PT_FIXED + 30, ++ TPM2_PT_MAX_RESPONSE_SIZE = TPM2_PT_FIXED + 31, ++ TPM2_PT_MAX_DIGEST = TPM2_PT_FIXED + 32, ++ TPM2_PT_MAX_OBJECT_CONTEXT = TPM2_PT_FIXED + 33, ++ TPM2_PT_MAX_SESSION_CONTEXT = TPM2_PT_FIXED + 34, ++ TPM2_PT_PS_FAMILY_INDICATOR = TPM2_PT_FIXED + 35, ++ TPM2_PT_PS_LEVEL = TPM2_PT_FIXED + 36, ++ TPM2_PT_PS_REVISION = TPM2_PT_FIXED + 37, ++ TPM2_PT_PS_DAY_OF_YEAR = TPM2_PT_FIXED + 38, ++ TPM2_PT_PS_YEAR = TPM2_PT_FIXED + 39, ++ TPM2_PT_SPLIT_MAX = TPM2_PT_FIXED + 40, ++ TPM2_PT_TOTAL_COMMANDS = TPM2_PT_FIXED + 41, ++ TPM2_PT_LIBRARY_COMMANDS = TPM2_PT_FIXED + 42, ++ TPM2_PT_VENDOR_COMMANDS = TPM2_PT_FIXED + 43, ++ TPM2_PT_NV_BUFFER_MAX = TPM2_PT_FIXED + 44, ++ TPM2_PT_MODES = TPM2_PT_FIXED + 45, ++ TPM2_PT_MAX_CAP_BUFFER = TPM2_PT_FIXED + 46, ++ TPM2_PT_VAR = TPM2_PT_GROUP * 2, ++ TPM2_PT_PERMANENT = TPM2_PT_VAR + 0, ++ TPM2_PT_STARTUP_CLEAR = TPM2_PT_VAR + 1, ++ TPM2_PT_HR_NV_INDEX = TPM2_PT_VAR + 2, ++ TPM2_PT_HR_LOADED = TPM2_PT_VAR + 3, ++ TPM2_PT_HR_LOADED_AVAIL = TPM2_PT_VAR + 4, ++ TPM2_PT_HR_ACTIVE = TPM2_PT_VAR + 5, ++ TPM2_PT_HR_ACTIVE_AVAIL = TPM2_PT_VAR + 6, ++ TPM2_PT_HR_TRANSIENT_AVAIL = TPM2_PT_VAR + 7, ++ TPM2_PT_HR_PERSISTENT = TPM2_PT_VAR + 8, ++ TPM2_PT_HR_PERSISTENT_AVAIL = TPM2_PT_VAR + 9, ++ TPM2_PT_NV_COUNTERS = TPM2_PT_VAR + 10, ++ TPM2_PT_NV_COUNTERS_AVAIL = TPM2_PT_VAR + 11, ++ TPM2_PT_ALGORITHM_SET = TPM2_PT_VAR + 12, ++ TPM2_PT_LOADED_CURVES = TPM2_PT_VAR + 13, ++ TPM2_PT_LOCKOUT_COUNTER = TPM2_PT_VAR + 14, ++ TPM2_PT_MAX_AUTH_FAIL = TPM2_PT_VAR + 15, ++ TPM2_PT_LOCKOUT_INTERVAL = TPM2_PT_VAR + 16, ++ TPM2_PT_LOCKOUT_RECOVERY = TPM2_PT_VAR + 17, ++ TPM2_PT_NV_WRITE_RECOVERY = TPM2_PT_VAR + 18, ++ TPM2_PT_AUDIT_COUNTER_0 = TPM2_PT_VAR + 19, ++ TPM2_PT_AUDIT_COUNTER_1 = TPM2_PT_VAR + 20, ++}; + + /* 128 bytes is an arbitrary cap. This could be as large as TPM_BUFSIZE - 18 + * bytes, but 128 is still a relatively large number of random bytes and +diff --git a/drivers/hwmon/nct6775-core.c b/drivers/hwmon/nct6775-core.c +index da9ec6983e13..c54233f0369b 100644 +--- a/drivers/hwmon/nct6775-core.c ++++ b/drivers/hwmon/nct6775-core.c +@@ -1150,7 +1150,7 @@ static int nct6775_write_fan_div(struct nct6775_data *data, int nr) + if (err) + return err; + reg &= 0x70 >> oddshift; +- reg |= data->fan_div[nr] & (0x7 << oddshift); ++ reg |= (data->fan_div[nr] & 0x7) << oddshift; + return nct6775_write_value(data, fandiv_reg, reg); + } + +diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig +index dc6816d36d06..bda249068182 100644 +--- a/drivers/leds/trigger/Kconfig ++++ b/drivers/leds/trigger/Kconfig +@@ -154,4 +154,13 @@ config LEDS_TRIGGER_TTY + + When build as a module this driver will be called ledtrig-tty. + ++config LEDS_TRIGGER_BLKDEV ++ tristate "LED Trigger for block devices" ++ depends on BLOCK ++ help ++ The blkdev LED trigger allows LEDs to be controlled by block device ++ activity (reads and writes). ++ ++ See Documentation/leds/ledtrig-blkdev.rst. 
++ + endif # LEDS_TRIGGERS +diff --git a/drivers/leds/trigger/Makefile b/drivers/leds/trigger/Makefile +index 25c4db97cdd4..d53bab5d93f1 100644 +--- a/drivers/leds/trigger/Makefile ++++ b/drivers/leds/trigger/Makefile +@@ -16,3 +16,4 @@ obj-$(CONFIG_LEDS_TRIGGER_NETDEV) += ledtrig-netdev.o + obj-$(CONFIG_LEDS_TRIGGER_PATTERN) += ledtrig-pattern.o + obj-$(CONFIG_LEDS_TRIGGER_AUDIO) += ledtrig-audio.o + obj-$(CONFIG_LEDS_TRIGGER_TTY) += ledtrig-tty.o ++obj-$(CONFIG_LEDS_TRIGGER_BLKDEV) += ledtrig-blkdev.o +diff --git a/drivers/leds/trigger/ledtrig-blkdev.c b/drivers/leds/trigger/ledtrig-blkdev.c +new file mode 100644 +index 000000000000..8614e308fadc +--- /dev/null ++++ b/drivers/leds/trigger/ledtrig-blkdev.c +@@ -0,0 +1,1220 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++ ++/* ++ * Block device LED trigger ++ * ++ * Copyright 2021-2022 Ian Pilcher ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++/** ++ * DOC: Overview ++ * ++ * The ``blkdev`` LED trigger works by periodically checking the activity ++ * counters of block devices that have been linked to one or more LEDs and ++ * blinking those LED(s) if the correct type of activity has occurred. The ++ * periodic check is scheduled with the Linux kernel's deferred work facility. ++ * ++ * Trigger-specific data about block devices and LEDs is stored in two data ++ * structures --- &struct blkdev_trig_bdev (a "BTB") and &struct blkdev_trig_led ++ * (a "BTL"). Each structure contains a &struct xarray that holds links to any ++ * linked devices of the other type. I.e. &blkdev_trig_bdev.linked_btls ++ * contains links to all BTLs whose LEDs have been linked to the BTB's block ++ * device, and &blkdev_trig_led.linked_btbs contains links to all BTBs whose ++ * block devices have been linked to the BTL's LED. Thus, a block device can ++ * be linked to more than one LED, and an LED can be linked to more than one ++ * block device. ++ */ ++ ++/* Default, minimum & maximum blink duration (milliseconds) */ ++#define BLKDEV_TRIG_BLINK_DEF 75 ++#define BLKDEV_TRIG_BLINK_MIN 10 ++#define BLKDEV_TRIG_BLINK_MAX 86400000 /* 24 hours */ ++ ++/* Default, minimum & maximum activity check interval (milliseconds) */ ++#define BLKDEV_TRIG_CHECK_DEF 100 ++#define BLKDEV_TRIG_CHECK_MIN 25 ++#define BLKDEV_TRIG_CHECK_MAX 86400000 /* 24 hours */ ++ ++/* ++ * If blkdev_trig_check() can't lock the mutex, how long to wait before trying ++ * again (milliseconds) ++ */ ++#define BLKDEV_TRIG_CHECK_RETRY 5 ++ ++/* Mode argument for calls to blkdev_get_by_path() and blkdev_put() */ ++#define BLKDEV_TRIG_FMODE 0 ++ ++/** ++ * struct blkdev_trig_bdev - Trigger-specific data about a block device. ++ * @last_checked: Time (in jiffies) at which the trigger last checked this ++ * block device for activity. ++ * @last_activity: Time (in jiffies) at which the trigger last detected ++ * activity of each type. ++ * @ios: Activity counter values for each type, corresponding to ++ * the timestamps in &last_activity. ++ * @index: &xarray index, so the BTB can be included in one or more ++ * &blkdev_trig_led.linked_btbs. ++ * @bdev: The block device. ++ * @linked_btls: The BTLs that represent the LEDs linked to the BTB's ++ * block device. ++ * ++ * Every block device linked to at least one LED gets a "BTB." A BTB is created ++ * when a block device that is not currently linked to any LEDs is linked to an ++ * LED. 
++ * ++ * A BTB is freed when one of the following occurs: ++ * ++ * * The number of LEDs linked to the block device becomes zero, because it has ++ * been unlinked from its last LED using the trigger's &sysfs interface. ++ * ++ * * The number of LEDs linked to the block device becomes zero, because the ++ * last LED to which it was linked has been disassociated from the trigger ++ * (which happens automatically if the LED device is removed from the system). ++ * ++ * * The BTB's block device is removed from the system. To accomodate this ++ * scenario, BTB's are created as device resources, so that the release ++ * function will be called by the driver core when the device is removed. ++ */ ++struct blkdev_trig_bdev { ++ unsigned long last_checked; ++ unsigned long last_activity[NR_STAT_GROUPS]; ++ unsigned long ios[NR_STAT_GROUPS]; ++ unsigned long index; ++ struct block_device *bdev; ++ struct xarray linked_btls; ++}; ++ ++/** ++ * struct blkdev_trig_led - Trigger-specific data about an LED. ++ * @last_checked: Time (in jiffies) at which the trigger last checked the ++ * the block devices linked to this LED for activity. ++ * @index: &xarray index, so the BTL can be included in one or more ++ * &blkdev_trig_bdev.linked_btls. ++ * @mode: Bitmask for types of block device activity that will ++ * cause this LED to blink --- reads, writes, discards, ++ * etc. ++ * @led: The LED device. ++ * @blink_msec: Duration of a blink (milliseconds). ++ * @check_jiffies: Frequency with which block devices linked to this LED ++ * should be checked for activity (jiffies). ++ * @linked_btbs: The BTBs that represent the block devices linked to the ++ * BTL's LED. ++ * @all_btls_node: The BTL's node in the module's list of all BTLs. ++ * ++ * Every LED associated with the block device trigger gets a "BTL." A BTL is ++ * created when the trigger is "activated" on an LED (usually by writing ++ * ``blkdev`` to the LED's &sysfs &trigger attribute). A BTL is freed wnen its ++ * LED is disassociated from the trigger, either through the trigger's &sysfs ++ * interface or because the LED device is removed from the system. 
++ */ ++struct blkdev_trig_led { ++ unsigned long last_checked; ++ unsigned long index; ++ unsigned long mode; /* must be ulong for atomic bit ops */ ++ struct led_classdev *led; ++ unsigned int blink_msec; ++ unsigned int check_jiffies; ++ struct xarray linked_btbs; ++ struct hlist_node all_btls_node; ++}; ++ ++/* Protects everything except atomic LED attributes */ ++static DEFINE_MUTEX(blkdev_trig_mutex); ++ ++/* BTB device resource release function */ ++static void blkdev_trig_btb_release(struct device *dev, void *res); ++ ++/* Index for next BTB or BTL */ ++static unsigned long blkdev_trig_next_index; ++ ++/* All LEDs associated with the trigger */ ++static HLIST_HEAD(blkdev_trig_all_btls); ++ ++/* Delayed work to periodically check for activity & blink LEDs */ ++static void blkdev_trig_check(struct work_struct *work); ++static DECLARE_DELAYED_WORK(blkdev_trig_work, blkdev_trig_check); ++ ++/* When is the delayed work scheduled to run next (jiffies) */ ++static unsigned long blkdev_trig_next_check; ++ ++/* Total number of BTB-to-BTL links */ ++static unsigned int blkdev_trig_link_count; ++ ++/* Empty sysfs attribute list for next 2 declarations */ ++static struct attribute *blkdev_trig_attrs_empty[] = { NULL }; ++ ++/* linked_leds sysfs directory for block devs linked to 1 or more LEDs */ ++static const struct attribute_group blkdev_trig_linked_leds = { ++ .name = "linked_leds", ++ .attrs = blkdev_trig_attrs_empty, ++}; ++ ++/* linked_devices sysfs directory for each LED associated with the trigger */ ++static const struct attribute_group blkdev_trig_linked_devs = { ++ .name = "linked_devices", ++ .attrs = blkdev_trig_attrs_empty, ++}; ++ ++ ++/* ++ * ++ * Delayed work to check for activity & blink LEDs ++ * ++ */ ++ ++/** ++ * blkdev_trig_blink() - Blink an LED, if the correct type of activity has ++ * occurred on the block device. ++ * @btl: The BTL that represents the LED ++ * @btb: The BTB that represents the block device ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ * Return: &true if the LED is blinked, &false if not. ++ */ ++static bool blkdev_trig_blink(const struct blkdev_trig_led *btl, ++ const struct blkdev_trig_bdev *btb) ++{ ++ unsigned long mode, mask, delay_on, delay_off; ++ enum stat_group i; ++ ++ mode = READ_ONCE(btl->mode); ++ ++ for (i = STAT_READ, mask = 1; i <= STAT_FLUSH; ++i, mask <<= 1) { ++ ++ if (!(mode & mask)) ++ continue; ++ ++ if (time_before_eq(btb->last_activity[i], btl->last_checked)) ++ continue; ++ ++ delay_on = READ_ONCE(btl->blink_msec); ++ delay_off = 1; /* 0 leaves LED turned on */ ++ ++ led_blink_set_oneshot(btl->led, &delay_on, &delay_off, 0); ++ return true; ++ } ++ ++ return false; ++} ++ ++/** ++ * blkdev_trig_update_btb() - Update a BTB's activity counters and timestamps. ++ * @btb: The BTB ++ * @now: Timestamp (in jiffies) ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_update_btb(struct blkdev_trig_bdev *btb, ++ unsigned long now) ++{ ++ unsigned long new_ios; ++ enum stat_group i; ++ ++ for (i = STAT_READ; i <= STAT_FLUSH; ++i) { ++ ++ new_ios = part_stat_read(btb->bdev, ios[i]); ++ ++ if (new_ios != btb->ios[i]) { ++ btb->ios[i] = new_ios; ++ btb->last_activity[i] = now; ++ } ++ } ++ ++ btb->last_checked = now; ++} ++ ++/** ++ * blkdev_trig_check() - Check linked devices for activity and blink LEDs. ++ * @work: Delayed work (&blkdev_trig_work) ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. 
++ */ ++static void blkdev_trig_check(struct work_struct *work) ++{ ++ struct blkdev_trig_led *btl; ++ struct blkdev_trig_bdev *btb; ++ unsigned long index, delay, now, led_check, led_delay; ++ bool blinked; ++ ++ if (!mutex_trylock(&blkdev_trig_mutex)) { ++ delay = msecs_to_jiffies(BLKDEV_TRIG_CHECK_RETRY); ++ goto exit_reschedule; ++ } ++ ++ now = jiffies; ++ delay = ULONG_MAX; ++ ++ hlist_for_each_entry (btl, &blkdev_trig_all_btls, all_btls_node) { ++ ++ led_check = btl->last_checked + btl->check_jiffies; ++ ++ if (time_before_eq(led_check, now)) { ++ ++ blinked = false; ++ ++ xa_for_each (&btl->linked_btbs, index, btb) { ++ ++ if (btb->last_checked != now) ++ blkdev_trig_update_btb(btb, now); ++ if (!blinked) ++ blinked = blkdev_trig_blink(btl, btb); ++ } ++ ++ btl->last_checked = now; ++ led_delay = btl->check_jiffies; ++ ++ } else { ++ led_delay = led_check - now; ++ } ++ ++ if (led_delay < delay) ++ delay = led_delay; ++ } ++ ++ mutex_unlock(&blkdev_trig_mutex); ++ ++exit_reschedule: ++ WARN_ON_ONCE(delay == ULONG_MAX); ++ WARN_ON_ONCE(!schedule_delayed_work(&blkdev_trig_work, delay)); ++} ++ ++/** ++ * blkdev_trig_sched_led() - Set the schedule of the delayed work when a new ++ * LED is added to the schedule. ++ * @btl: The BTL that represents the LED ++ * ++ * Called when the number of block devices to which an LED is linked becomes ++ * non-zero. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_sched_led(const struct blkdev_trig_led *btl) ++{ ++ unsigned long delay = READ_ONCE(btl->check_jiffies); ++ unsigned long check_by = jiffies + delay; ++ ++ /* ++ * If no other LED-to-block device links exist, simply schedule the ++ * delayed work according to this LED's check_interval attribute ++ * (check_jiffies). ++ */ ++ if (blkdev_trig_link_count == 0) { ++ WARN_ON(!schedule_delayed_work(&blkdev_trig_work, delay)); ++ blkdev_trig_next_check = check_by; ++ return; ++ } ++ ++ /* ++ * If the next check is already scheduled to occur soon enough to ++ * accomodate this LED's check_interval, the schedule doesn't need ++ * to be changed. ++ */ ++ if (time_after_eq(check_by, blkdev_trig_next_check)) ++ return; ++ ++ /* ++ * Modify the schedule, so that the delayed work runs soon enough for ++ * this LED. ++ */ ++ WARN_ON(!mod_delayed_work(system_wq, &blkdev_trig_work, delay)); ++ blkdev_trig_next_check = check_by; ++} ++ ++ ++/* ++ * ++ * Linking and unlinking LEDs and block devices ++ * ++ */ ++ ++/** ++ * blkdev_trig_link() - Link a block device to an LED. ++ * @btl: The BTL that represents the LED ++ * @btb: The BTB that represents the block device ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ * Return: &0 on success, negative &errno on error. 
++ */ ++static int blkdev_trig_link(struct blkdev_trig_led *btl, ++ struct blkdev_trig_bdev *btb) ++{ ++ bool led_first_link; ++ int err; ++ ++ led_first_link = xa_empty(&btl->linked_btbs); ++ ++ err = xa_insert(&btb->linked_btls, btl->index, btl, GFP_KERNEL); ++ if (err) ++ return err; ++ ++ err = xa_insert(&btl->linked_btbs, btb->index, btb, GFP_KERNEL); ++ if (err) ++ goto error_erase_btl; ++ ++ /* Create /sys/class/block//linked_leds/ symlink */ ++ err = sysfs_add_link_to_group(bdev_kobj(btb->bdev), ++ blkdev_trig_linked_leds.name, ++ &btl->led->dev->kobj, btl->led->name); ++ if (err) ++ goto error_erase_btb; ++ ++ /* Create /sys/class/leds//linked_devices/ symlink */ ++ err = sysfs_add_link_to_group(&btl->led->dev->kobj, ++ blkdev_trig_linked_devs.name, ++ bdev_kobj(btb->bdev), ++ dev_name(&btb->bdev->bd_device)); ++ if (err) ++ goto error_remove_symlink; ++ ++ /* ++ * If this is the first block device linked to this LED, the delayed ++ * work schedule may need to be changed. ++ */ ++ if (led_first_link) ++ blkdev_trig_sched_led(btl); ++ ++ ++blkdev_trig_link_count; ++ ++ return 0; ++ ++error_remove_symlink: ++ sysfs_remove_link_from_group(bdev_kobj(btb->bdev), ++ blkdev_trig_linked_leds.name, ++ btl->led->name); ++error_erase_btb: ++ xa_erase(&btl->linked_btbs, btb->index); ++error_erase_btl: ++ xa_erase(&btb->linked_btls, btl->index); ++ return err; ++} ++ ++/** ++ * blkdev_trig_put_btb() - Remove and free a BTB, if it is no longer needed. ++ * @btb: The BTB ++ * ++ * Does nothing if the BTB (block device) is still linked to at least one LED. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_put_btb(struct blkdev_trig_bdev *btb) ++{ ++ struct block_device *bdev = btb->bdev; ++ int err; ++ ++ if (xa_empty(&btb->linked_btls)) { ++ ++ sysfs_remove_group(bdev_kobj(bdev), &blkdev_trig_linked_leds); ++ err = devres_destroy(&bdev->bd_device, blkdev_trig_btb_release, ++ NULL, NULL); ++ WARN_ON(err); ++ } ++} ++ ++/** ++ * _blkdev_trig_unlink_always() - Perform the unconditionally required steps of ++ * unlinking a block device from an LED. ++ * @btl: The BTL that represents the LED ++ * @btb: The BTB that represents the block device ++ * ++ * When a block device is unlinked from an LED, certain steps must be performed ++ * only if the block device is **not** being released. This function performs ++ * those steps that are **always** required, whether or not the block device is ++ * being released. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void _blkdev_trig_unlink_always(struct blkdev_trig_led *btl, ++ struct blkdev_trig_bdev *btb) ++{ ++ --blkdev_trig_link_count; ++ ++ if (blkdev_trig_link_count == 0) ++ WARN_ON(!cancel_delayed_work_sync(&blkdev_trig_work)); ++ ++ xa_erase(&btb->linked_btls, btl->index); ++ xa_erase(&btl->linked_btbs, btb->index); ++ ++ /* Remove /sys/class/leds//linked_devices/ symlink */ ++ sysfs_remove_link_from_group(&btl->led->dev->kobj, ++ blkdev_trig_linked_devs.name, ++ dev_name(&btb->bdev->bd_device)); ++} ++ ++/** ++ * blkdev_trig_unlink_norelease() - Unlink an LED from a block device that is ++ * **not** being released. ++ * @btl: The BTL that represents the LED. ++ * @btb: The BTB that represents the block device. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. 
++ */ ++static void blkdev_trig_unlink_norelease(struct blkdev_trig_led *btl, ++ struct blkdev_trig_bdev *btb) ++{ ++ _blkdev_trig_unlink_always(btl, btb); ++ ++ /* Remove /sys/class/block//linked_leds/ symlink */ ++ sysfs_remove_link_from_group(bdev_kobj(btb->bdev), ++ blkdev_trig_linked_leds.name, ++ btl->led->name); ++ ++ blkdev_trig_put_btb(btb); ++} ++ ++/** ++ * blkdev_trig_unlink_release() - Unlink an LED from a block device that is ++ * being released. ++ * @btl: The BTL that represents the LED ++ * @btb: The BTB that represents the block device ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_unlink_release(struct blkdev_trig_led *btl, ++ struct blkdev_trig_bdev *btb) ++{ ++ _blkdev_trig_unlink_always(btl, btb); ++ ++ /* ++ * If the BTB is being released, the driver core has already removed the ++ * device's attribute groups, and the BTB will be freed automatically, ++ * so there's nothing else to do. ++ */ ++} ++ ++ ++/* ++ * ++ * BTB creation ++ * ++ */ ++ ++/** ++ * blkdev_trig_btb_release() - BTB device resource release function. ++ * @dev: The block device ++ * @res: The BTB ++ * ++ * Called by the driver core when a block device with a BTB is removed. ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_btb_release(struct device *dev, void *res) ++{ ++ struct blkdev_trig_bdev *btb = res; ++ struct blkdev_trig_led *btl; ++ unsigned long index; ++ ++ mutex_lock(&blkdev_trig_mutex); ++ ++ xa_for_each (&btb->linked_btls, index, btl) ++ blkdev_trig_unlink_release(btl, btb); ++ ++ mutex_unlock(&blkdev_trig_mutex); ++} ++ ++/** ++ * blkdev_trig_get_bdev() - Get a block device by path. ++ * @path: The value written to an LED's &link_dev_by_path or ++ * &unlink_dev_by_path attribute, which should be the path to a ++ * special file that represents a block device ++ * @len: The number of characters in &path (not including its ++ * terminating null) ++ * ++ * The caller must call blkdev_put() when finished with the device. ++ * ++ * Context: Process context. ++ * Return: The block device, or an error pointer. ++ */ ++static struct block_device *blkdev_trig_get_bdev(const char *path, size_t len) ++{ ++ struct block_device *bdev; ++ char *buf; ++ ++ buf = kmemdup(path, len + 1, GFP_KERNEL); /* +1 to include null */ ++ if (buf == NULL) ++ return ERR_PTR(-ENOMEM); ++ ++ bdev = blkdev_get_by_path(strim(buf), BLKDEV_TRIG_FMODE, THIS_MODULE); ++ kfree(buf); ++ return bdev; ++} ++ ++/** ++ * blkdev_trig_get_btb() - Find or create the BTB for a block device. ++ * @path: The value written to an LED's &link_dev_by_path attribute, ++ * which should be the path to a special file that represents a ++ * block device ++ * @len: The number of characters in &path ++ * ++ * If a new BTB is created, because the block device was not previously linked ++ * to any LEDs, the block device's &linked_leds &sysfs directory is created. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ * Return: Pointer to the BTB, error pointer on error. 
++ */ ++static struct blkdev_trig_bdev *blkdev_trig_get_btb(const char *path, ++ size_t len) ++{ ++ struct block_device *bdev; ++ struct blkdev_trig_bdev *btb; ++ int err; ++ ++ bdev = blkdev_trig_get_bdev(path, len); ++ if (IS_ERR(bdev)) ++ return ERR_CAST(bdev); ++ ++ btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release, ++ NULL, NULL); ++ if (btb != NULL) { ++ err = 0; ++ goto exit_put_bdev; ++ } ++ ++ if (blkdev_trig_next_index == ULONG_MAX) { ++ err = -EOVERFLOW; ++ goto exit_put_bdev; ++ } ++ ++ btb = devres_alloc(blkdev_trig_btb_release, sizeof(*btb), GFP_KERNEL); ++ if (btb == NULL) { ++ err = -ENOMEM; ++ goto exit_put_bdev; ++ } ++ ++ err = sysfs_create_group(bdev_kobj(bdev), &blkdev_trig_linked_leds); ++ if (err) ++ goto exit_free_btb; ++ ++ btb->index = blkdev_trig_next_index++; ++ btb->bdev = bdev; ++ xa_init(&btb->linked_btls); ++ ++ /* Populate BTB activity counters */ ++ blkdev_trig_update_btb(btb, jiffies); ++ ++ devres_add(&bdev->bd_device, btb); ++ ++exit_free_btb: ++ if (err) ++ devres_free(btb); ++exit_put_bdev: ++ blkdev_put(bdev, BLKDEV_TRIG_FMODE); ++ return err ? ERR_PTR(err) : btb; ++} ++ ++ ++/* ++ * ++ * Activating and deactivating the trigger on an LED ++ * ++ */ ++ ++/** ++ * blkdev_trig_activate() - Called by the LEDs subsystem when an LED is ++ * associated with the trigger. ++ * @led: The LED ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ * Return: &0 on success, negative &errno on error. ++ */ ++static int blkdev_trig_activate(struct led_classdev *led) ++{ ++ struct blkdev_trig_led *btl; ++ int err; ++ ++ btl = kzalloc(sizeof(*btl), GFP_KERNEL); ++ if (btl == NULL) ++ return -ENOMEM; ++ ++ err = mutex_lock_interruptible(&blkdev_trig_mutex); ++ if (err) ++ goto exit_free; ++ ++ if (blkdev_trig_next_index == ULONG_MAX) { ++ err = -EOVERFLOW; ++ goto exit_unlock; ++ } ++ ++ btl->index = blkdev_trig_next_index++; ++ btl->last_checked = jiffies; ++ btl->mode = -1; /* set all bits */ ++ btl->led = led; ++ btl->blink_msec = BLKDEV_TRIG_BLINK_DEF; ++ btl->check_jiffies = msecs_to_jiffies(BLKDEV_TRIG_CHECK_DEF); ++ xa_init(&btl->linked_btbs); ++ ++ hlist_add_head(&btl->all_btls_node, &blkdev_trig_all_btls); ++ led_set_trigger_data(led, btl); ++ ++exit_unlock: ++ mutex_unlock(&blkdev_trig_mutex); ++exit_free: ++ if (err) ++ kfree(btl); ++ return err; ++} ++ ++/** ++ * blkdev_trig_deactivate() - Called by the the LEDs subsystem when an LED is ++ * disassociated from the trigger. ++ * @led: The LED ++ * ++ * The LEDs subsystem also calls this function when an LED associated with the ++ * trigger is removed or when the trigger is unregistered (if the module is ++ * unloaded). ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_deactivate(struct led_classdev *led) ++{ ++ struct blkdev_trig_led *btl = led_get_trigger_data(led); ++ struct blkdev_trig_bdev *btb; ++ unsigned long index; ++ ++ mutex_lock(&blkdev_trig_mutex); ++ ++ xa_for_each (&btl->linked_btbs, index, btb) ++ blkdev_trig_unlink_norelease(btl, btb); ++ ++ hlist_del(&btl->all_btls_node); ++ kfree(btl); ++ ++ mutex_unlock(&blkdev_trig_mutex); ++} ++ ++ ++/* ++ * ++ * Link-related attribute store functions ++ * ++ */ ++ ++/** ++ * link_dev_by_path_store() - &link_dev_by_path device attribute store function. 
++ * @dev: The LED device ++ * @attr: The &link_dev_by_path attribute (&dev_attr_link_dev_by_path) ++ * @buf: The value written to the attribute, which should be the path to ++ * a special file that represents a block device to be linked to ++ * the LED (e.g. ``/dev/sda``) ++ * @count: The number of characters in &buf ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t link_dev_by_path_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ struct blkdev_trig_bdev *btb; ++ int err; ++ ++ err = mutex_lock_interruptible(&blkdev_trig_mutex); ++ if (err) ++ return err; ++ ++ btb = blkdev_trig_get_btb(buf, count); ++ if (IS_ERR(btb)) { ++ err = PTR_ERR(btb); ++ goto exit_unlock; ++ } ++ ++ if (xa_load(&btb->linked_btls, btl->index) != NULL) { ++ err = -EEXIST; ++ goto exit_put_btb; ++ } ++ ++ err = blkdev_trig_link(btl, btb); ++ ++exit_put_btb: ++ if (err) ++ blkdev_trig_put_btb(btb); ++exit_unlock: ++ mutex_unlock(&blkdev_trig_mutex); ++ return err ? : count; ++} ++ ++/** ++ * unlink_dev_by_path_store() - &unlink_dev_by_path device attribute store ++ * function. ++ * @dev: The LED device ++ * @attr: The &unlink_dev_by_path attribute (&dev_attr_unlink_dev_by_path) ++ * @buf: The value written to the attribute, which should be the path to ++ * a special file that represents a block device to be unlinked ++ * from the LED (e.g. ``/dev/sda``) ++ * @count: The number of characters in &buf ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t unlink_dev_by_path_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ struct block_device *bdev; ++ struct blkdev_trig_bdev *btb; ++ int err; ++ ++ bdev = blkdev_trig_get_bdev(buf, count); ++ if (IS_ERR(bdev)) ++ return PTR_ERR(bdev); ++ ++ err = mutex_lock_interruptible(&blkdev_trig_mutex); ++ if (err) ++ goto exit_put_bdev; ++ ++ btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release, ++ NULL, NULL); ++ if (btb == NULL) { ++ err = -EUNATCH; /* bdev isn't linked to any LED */ ++ goto exit_unlock; ++ } ++ ++ if (xa_load(&btb->linked_btls, btl->index) == NULL) { ++ err = -EUNATCH; /* bdev isn't linked to this LED */ ++ goto exit_unlock; ++ } ++ ++ blkdev_trig_unlink_norelease(btl, btb); ++ ++exit_unlock: ++ mutex_unlock(&blkdev_trig_mutex); ++exit_put_bdev: ++ blkdev_put(bdev, BLKDEV_TRIG_FMODE); ++ return err ? : count; ++} ++ ++/** ++ * unlink_dev_by_name_store() - &unlink_dev_by_name device attribute store ++ * function. ++ * @dev: The LED device ++ * @attr: The &unlink_dev_by_name attribute (&dev_attr_unlink_dev_by_name) ++ * @buf: The value written to the attribute, which should be the kernel ++ * name of a block device to be unlinked from the LED (e.g. ++ * ``sda``) ++ * @count: The number of characters in &buf ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ * Return: &count on success, negative &errno on error. 
++ */ ++static ssize_t unlink_dev_by_name_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ struct blkdev_trig_bdev *btb; ++ unsigned long index; ++ int err; ++ ++ err = mutex_lock_interruptible(&blkdev_trig_mutex); ++ if (err) ++ return err; ++ ++ err = -EUNATCH; ++ ++ xa_for_each (&btl->linked_btbs, index, btb) { ++ ++ if (sysfs_streq(dev_name(&btb->bdev->bd_device), buf)) { ++ blkdev_trig_unlink_norelease(btl, btb); ++ err = 0; ++ break; ++ } ++ } ++ ++ mutex_unlock(&blkdev_trig_mutex); ++ return err ? : count; ++} ++ ++ ++/* ++ * ++ * Atomic attribute show & store functions ++ * ++ */ ++ ++/** ++ * blink_time_show() - &blink_time device attribute show function. ++ * @dev: The LED device ++ * @attr: The &blink_time attribute (&dev_attr_blink_time) ++ * @buf: Output buffer ++ * ++ * Writes the value of &blkdev_trig_led.blink_msec to &buf. ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. ++ */ ++static ssize_t blink_time_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ const struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ ++ return sprintf(buf, "%u\n", READ_ONCE(btl->blink_msec)); ++} ++ ++/** ++ * blink_time_store() - &blink_time device attribute store function. ++ * @dev: The LED device ++ * @attr: The &blink_time attribute (&dev_attr_blink_time) ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * ++ * Sets &blkdev_trig_led.blink_msec to the value in &buf. ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t blink_time_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ unsigned int value; ++ int err; ++ ++ err = kstrtouint(buf, 0, &value); ++ if (err) ++ return err; ++ ++ if (value < BLKDEV_TRIG_BLINK_MIN || value > BLKDEV_TRIG_BLINK_MAX) ++ return -ERANGE; ++ ++ WRITE_ONCE(btl->blink_msec, value); ++ return count; ++} ++ ++/** ++ * check_interval_show() - &check_interval device attribute show function. ++ * @dev: The LED device ++ * @attr: The &check_interval attribute (&dev_attr_check_interval) ++ * @buf: Output buffer ++ * ++ * Writes the value of &blkdev_trig_led.check_jiffies (converted to ++ * milliseconds) to &buf. ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. ++ */ ++static ssize_t check_interval_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ ++ return sprintf(buf, "%u\n", ++ jiffies_to_msecs(READ_ONCE(btl->check_jiffies))); ++} ++ ++/** ++ * check_interval_store() - &check_interval device attribute store function ++ * @dev: The LED device ++ * @attr: The &check_interval attribute (&dev_attr_check_interval) ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * ++ * Sets &blkdev_trig_led.check_jiffies to the value in &buf (after converting ++ * from milliseconds). ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. 
++ */
++static ssize_t check_interval_store(struct device *dev,
++				    struct device_attribute *attr,
++				    const char *buf, size_t count)
++{
++	struct blkdev_trig_led *led = led_trigger_get_drvdata(dev);
++	unsigned int value;
++	int err;
++
++	err = kstrtouint(buf, 0, &value);
++	if (err)
++		return err;
++
++	if (value < BLKDEV_TRIG_CHECK_MIN || value > BLKDEV_TRIG_CHECK_MAX)
++		return -ERANGE;
++
++	WRITE_ONCE(led->check_jiffies, msecs_to_jiffies(value));
++
++	return count;
++}
++
++/**
++ * blkdev_trig_mode_show() - Helper for boolean attribute show functions.
++ * @led: The LED
++ * @buf: Output buffer
++ * @bit: Which bit to show
++ *
++ * Context: Process context.
++ * Return: The number of characters written to &buf.
++ */
++static int blkdev_trig_mode_show(const struct blkdev_trig_led *led, char *buf,
++				 enum stat_group bit)
++{
++	return sprintf(buf, READ_ONCE(led->mode) & (1 << bit) ? "Y\n" : "N\n");
++}
++
++/**
++ * blkdev_trig_mode_store() - Helper for boolean attribute store functions.
++ * @led: The LED
++ * @buf: The new value (as written to the &sysfs attribute)
++ * @count: The number of characters in &buf
++ * @bit: Which bit to set
++ *
++ * Context: Process context.
++ * Return: &count on success, negative &errno on error.
++ */
++static int blkdev_trig_mode_store(struct blkdev_trig_led *led,
++				  const char *buf, size_t count,
++				  enum stat_group bit)
++{
++	bool set;
++	int err;
++
++	err = kstrtobool(buf, &set);
++	if (err)
++		return err;
++
++	if (set)
++		set_bit(bit, &led->mode);
++	else
++		clear_bit(bit, &led->mode);
++
++	return count;
++}
++
++/**
++ * blink_on_read_show() - &blink_on_read device attribute show function.
++ * @dev: The LED device
++ * @attr: The &blink_on_read attribute (&dev_attr_blink_on_read)
++ * @buf: Output buffer
++ *
++ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_READ bit in
++ * &blkdev_trig_led.mode is set or cleared.
++ *
++ * Context: Process context.
++ * Return: The number of characters written to &buf.
++ */
++static ssize_t blink_on_read_show(struct device *dev,
++				  struct device_attribute *attr, char *buf)
++{
++	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
++				     buf, STAT_READ);
++}
++
++/**
++ * blink_on_read_store() - &blink_on_read device attribute store function.
++ * @dev: The LED device
++ * @attr: The &blink_on_read attribute (&dev_attr_blink_on_read)
++ * @buf: The new value (as written to the &sysfs attribute)
++ * @count: The number of characters in &buf
++ *
++ * Sets the &STAT_READ bit in &blkdev_trig_led.mode to the value in &buf
++ * (interpreted as a boolean).
++ *
++ * Context: Process context.
++ * Return: &count on success, negative &errno on error.
++ */
++static ssize_t blink_on_read_store(struct device *dev,
++				   struct device_attribute *attr,
++				   const char *buf, size_t count)
++{
++	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
++				      buf, count, STAT_READ);
++}
++
++/**
++ * blink_on_write_show() - &blink_on_write device attribute show function.
++ * @dev: The LED device
++ * @attr: The &blink_on_write attribute (&dev_attr_blink_on_write)
++ * @buf: Output buffer
++ *
++ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_WRITE bit
++ * in &blkdev_trig_led.mode is set or cleared.
++ *
++ * Context: Process context.
++ * Return: The number of characters written to &buf.
++ */
++static ssize_t blink_on_write_show(struct device *dev,
++				   struct device_attribute *attr, char *buf)
++{
++	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
++				     buf, STAT_WRITE);
++}
++
++/**
++ * blink_on_write_store() - &blink_on_write device attribute store function.
++ * @dev: The LED device
++ * @attr: The &blink_on_write attribute (&dev_attr_blink_on_write)
++ * @buf: The new value (as written to the &sysfs attribute)
++ * @count: The number of characters in &buf
++ *
++ * Sets the &STAT_WRITE bit in &blkdev_trig_led.mode to the value in &buf
++ * (interpreted as a boolean).
++ *
++ * Context: Process context.
++ * Return: &count on success, negative &errno on error.
++ */
++static ssize_t blink_on_write_store(struct device *dev,
++				    struct device_attribute *attr,
++				    const char *buf, size_t count)
++{
++	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
++				      buf, count, STAT_WRITE);
++}
++
++/**
++ * blink_on_flush_show() - &blink_on_flush device attribute show function.
++ * @dev: The LED device
++ * @attr: The &blink_on_flush attribute (&dev_attr_blink_on_flush)
++ * @buf: Output buffer
++ *
++ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_FLUSH bit in
++ * &blkdev_trig_led.mode is set or cleared.
++ *
++ * Context: Process context.
++ * Return: The number of characters written to &buf.
++ */
++static ssize_t blink_on_flush_show(struct device *dev,
++				   struct device_attribute *attr, char *buf)
++{
++	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
++				     buf, STAT_FLUSH);
++}
++
++/**
++ * blink_on_flush_store() - &blink_on_flush device attribute store function.
++ * @dev: The LED device
++ * @attr: The &blink_on_flush attribute (&dev_attr_blink_on_flush)
++ * @buf: The new value (as written to the &sysfs attribute)
++ * @count: The number of characters in &buf
++ *
++ * Sets the &STAT_FLUSH bit in &blkdev_trig_led.mode to the value in &buf
++ * (interpreted as a boolean).
++ *
++ * Context: Process context.
++ * Return: &count on success, negative &errno on error.
++ */
++static ssize_t blink_on_flush_store(struct device *dev,
++				    struct device_attribute *attr,
++				    const char *buf, size_t count)
++{
++	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
++				      buf, count, STAT_FLUSH);
++}
++
++/**
++ * blink_on_discard_show() - &blink_on_discard device attribute show function.
++ * @dev: The LED device
++ * @attr: The &blink_on_discard attribute (&dev_attr_blink_on_discard)
++ * @buf: Output buffer
++ *
++ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_DISCARD bit in
++ * &blkdev_trig_led.mode is set or cleared.
++ *
++ * Context: Process context.
++ * Return: The number of characters written to &buf.
++ */
++static ssize_t blink_on_discard_show(struct device *dev,
++				     struct device_attribute *attr, char *buf)
++{
++	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
++				     buf, STAT_DISCARD);
++}
++
++/**
++ * blink_on_discard_store() - &blink_on_discard device attribute store function.
++ * @dev: The LED device
++ * @attr: The &blink_on_discard attribute (&dev_attr_blink_on_discard)
++ * @buf: The new value (as written to the &sysfs attribute)
++ * @count: The number of characters in &buf
++ *
++ * Sets the &STAT_DISCARD bit in &blkdev_trig_led.mode to the value in &buf
++ * (interpreted as a boolean).
++ *
++ * Context: Process context.
++ * Return: &count on success, negative &errno on error.
++ */ ++static ssize_t blink_on_discard_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), ++ buf, count, STAT_DISCARD); ++} ++ ++/* Device attributes */ ++static DEVICE_ATTR_WO(link_dev_by_path); ++static DEVICE_ATTR_WO(unlink_dev_by_path); ++static DEVICE_ATTR_WO(unlink_dev_by_name); ++static DEVICE_ATTR_RW(blink_time); ++static DEVICE_ATTR_RW(check_interval); ++static DEVICE_ATTR_RW(blink_on_read); ++static DEVICE_ATTR_RW(blink_on_write); ++static DEVICE_ATTR_RW(blink_on_flush); ++static DEVICE_ATTR_RW(blink_on_discard); ++ ++/* Device attributes in LED directory (/sys/class/leds//...) */ ++static struct attribute *blkdev_trig_attrs[] = { ++ &dev_attr_link_dev_by_path.attr, ++ &dev_attr_unlink_dev_by_path.attr, ++ &dev_attr_unlink_dev_by_name.attr, ++ &dev_attr_blink_time.attr, ++ &dev_attr_check_interval.attr, ++ &dev_attr_blink_on_read.attr, ++ &dev_attr_blink_on_write.attr, ++ &dev_attr_blink_on_flush.attr, ++ &dev_attr_blink_on_discard.attr, ++ NULL ++}; ++ ++/* Unnamed attribute group == no subdirectory */ ++static const struct attribute_group blkdev_trig_attr_group = { ++ .attrs = blkdev_trig_attrs, ++}; ++ ++/* Attribute groups for the trigger */ ++static const struct attribute_group *blkdev_trig_attr_groups[] = { ++ &blkdev_trig_attr_group, /* /sys/class/leds//... */ ++ &blkdev_trig_linked_devs, /* /sys/class/leds//linked_devices/ */ ++ NULL ++}; ++ ++/* Trigger registration data */ ++static struct led_trigger blkdev_trig_trigger = { ++ .name = "blkdev", ++ .activate = blkdev_trig_activate, ++ .deactivate = blkdev_trig_deactivate, ++ .groups = blkdev_trig_attr_groups, ++}; ++ ++/** ++ * blkdev_trig_init() - Block device LED trigger initialization. ++ * ++ * Registers the ``blkdev`` LED trigger. ++ * ++ * Return: &0 on success, negative &errno on failure. ++ */ ++static int __init blkdev_trig_init(void) ++{ ++ return led_trigger_register(&blkdev_trig_trigger); ++} ++module_init(blkdev_trig_init); ++ ++/** ++ * blkdev_trig_exit() - Block device LED trigger module exit. ++ * ++ * Unregisters the ``blkdev`` LED trigger. 
++ */ ++static void __exit blkdev_trig_exit(void) ++{ ++ led_trigger_unregister(&blkdev_trig_trigger); ++} ++module_exit(blkdev_trig_exit); ++ ++MODULE_DESCRIPTION("Block device LED trigger"); ++MODULE_AUTHOR("Ian Pilcher "); ++MODULE_LICENSE("GPL v2"); +diff --git a/drivers/md/dm.c b/drivers/md/dm.c +index b424a6ee27ba..df3fe80824bc 100644 +--- a/drivers/md/dm.c ++++ b/drivers/md/dm.c +@@ -1008,6 +1008,7 @@ static void dm_wq_requeue_work(struct work_struct *work) + io->next = NULL; + __dm_io_complete(io, false); + io = next; ++ cond_resched(); + } + } + +@@ -2569,6 +2570,7 @@ static void dm_wq_work(struct work_struct *work) + break; + + submit_bio_noacct(bio); ++ cond_resched(); + } + } + +diff --git a/fs/eventpoll.c b/fs/eventpoll.c +index 64659b110973..8b5ca9f8f4bb 100644 +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -1760,7 +1760,7 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, + { + int ret = default_wake_function(wq_entry, mode, sync, key); + +- list_del_init(&wq_entry->entry); ++ list_del_init_careful(&wq_entry->entry); + return ret; + } + +diff --git a/fs/proc/base.c b/fs/proc/base.c +index 9e479d7d202b..ac9ebe972be0 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -3207,6 +3207,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, + mm = get_task_mm(task); + if (mm) { + seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); ++ seq_printf(m, "zero_pages_sharing %lu\n", mm->ksm_zero_pages_sharing); + mmput(mm); + } + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 9757067c3053..d853e1c8a581 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -776,7 +776,7 @@ struct mm_struct { + #ifdef CONFIG_KSM + /* + * Represent how many pages of this process are involved in KSM +- * merging. ++ * merging (not including ksm_zero_pages_sharing). + */ + unsigned long ksm_merging_pages; + /* +@@ -784,6 +784,11 @@ struct mm_struct { + * including merged and not merged. + */ + unsigned long ksm_rmap_items; ++ /* ++ * Represent how many empty pages are merged with kernel zero ++ * pages when enabling KSM use_zero_pages. ++ */ ++ unsigned long ksm_zero_pages_sharing; + #endif + #ifdef CONFIG_LRU_GEN + struct { +diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h +index 5f1ae07d724b..97cda629c9e9 100644 +--- a/include/linux/pageblock-flags.h ++++ b/include/linux/pageblock-flags.h +@@ -48,7 +48,7 @@ extern unsigned int pageblock_order; + #else /* CONFIG_HUGETLB_PAGE */ + + /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ +-#define pageblock_order (MAX_ORDER-1) ++#define pageblock_order PAGE_ALLOC_COSTLY_ORDER + + #endif /* CONFIG_HUGETLB_PAGE */ + +diff --git a/kernel/kthread.c b/kernel/kthread.c +index f97fd01a2932..7e6751b29101 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -1382,6 +1382,10 @@ EXPORT_SYMBOL_GPL(kthread_flush_worker); + * Flush and destroy @worker. The simple flush is enough because the kthread + * worker API is used only in trivial scenarios. There are no multi-step state + * machines needed. ++ * ++ * Note that this function is not responsible for handling delayed work, so ++ * caller should be responsible for queuing or canceling all delayed work items ++ * before invoke this function. 
+ */ + void kthread_destroy_worker(struct kthread_worker *worker) + { +@@ -1393,6 +1397,7 @@ void kthread_destroy_worker(struct kthread_worker *worker) + + kthread_flush_worker(worker); + kthread_stop(task); ++ WARN_ON(!list_empty(&worker->delayed_work_list)); + WARN_ON(!list_empty(&worker->work_list)); + kfree(worker); + } +diff --git a/kernel/padata.c b/kernel/padata.c +index e007b8a4b738..7c80301ab084 100644 +--- a/kernel/padata.c ++++ b/kernel/padata.c +@@ -45,7 +45,7 @@ struct padata_mt_job_state { + }; + + static void padata_free_pd(struct parallel_data *pd); +-static void __init padata_mt_helper(struct work_struct *work); ++static void padata_mt_helper(struct work_struct *work); + + static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) + { +@@ -438,7 +438,7 @@ static int padata_setup_cpumasks(struct padata_instance *pinst) + return err; + } + +-static void __init padata_mt_helper(struct work_struct *w) ++static void padata_mt_helper(struct work_struct *w) + { + struct padata_work *pw = container_of(w, struct padata_work, pw_work); + struct padata_mt_job_state *ps = pw->pw_data; +diff --git a/lib/string.c b/lib/string.c +index 4746a98b153e..6b7cf32b4e54 100644 +--- a/lib/string.c ++++ b/lib/string.c +@@ -480,13 +480,11 @@ EXPORT_SYMBOL(strcspn); + */ + char *strpbrk(const char *cs, const char *ct) + { +- const char *sc1, *sc2; ++ const char *sc; + +- for (sc1 = cs; *sc1 != '\0'; ++sc1) { +- for (sc2 = ct; *sc2 != '\0'; ++sc2) { +- if (*sc1 == *sc2) +- return (char *)sc1; +- } ++ for (sc = cs; *sc != '\0'; ++sc) { ++ if (strchr(ct, *sc)) ++ return (char *)sc; + } + return NULL; + } +diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c +index 89b269a641c7..60958afebc41 100644 +--- a/lib/zstd/decompress/huf_decompress.c ++++ b/lib/zstd/decompress/huf_decompress.c +@@ -985,7 +985,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 + + static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, +- const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight, ++ const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) + { + U32* const rankVal = rankValOrigin[0]; +diff --git a/mm/compaction.c b/mm/compaction.c +index d0b16a5b30f7..3613d7f174dc 100644 +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -122,7 +122,6 @@ bool PageMovable(struct page *page) + + return false; + } +-EXPORT_SYMBOL(PageMovable); + + void __SetPageMovable(struct page *page, const struct movable_operations *mops) + { +@@ -1102,12 +1101,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, + + /* + * Avoid isolating too much unless this block is being +- * rescanned (e.g. dirty/writeback pages, parallel allocation) ++ * fully scanned (e.g. dirty/writeback pages, parallel allocation) + * or a lock is contended. For contention, isolate quickly to + * potentially remove one source of contention. + */ + if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX && +- !cc->rescan && !cc->contended) { ++ !cc->finish_pageblock && !cc->contended) { + ++low_pfn; + break; + } +@@ -1172,14 +1171,14 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, + } + + /* +- * Updated the cached scanner pfn once the pageblock has been scanned ++ * Update the cached scanner pfn once the pageblock has been scanned. 
+ * Pages will either be migrated in which case there is no point + * scanning in the near future or migration failed in which case the + * failure reason may persist. The block is marked for skipping if + * there were no pages isolated in the block or if the block is + * rescanned twice in a row. + */ +- if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) { ++ if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) { + if (valid_page && !skip_updated) + set_pageblock_skip(valid_page); + update_cached_migrate(cc, low_pfn); +@@ -1762,6 +1761,13 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) + if (cc->ignore_skip_hint) + return pfn; + ++ /* ++ * If the pageblock should be finished then do not select a different ++ * pageblock. ++ */ ++ if (cc->finish_pageblock) ++ return pfn; ++ + /* + * If the migrate_pfn is not at the start of a zone or the start + * of a pageblock then assume this is a continuation of a previous +@@ -1839,7 +1845,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) + pfn = cc->zone->zone_start_pfn; + cc->fast_search_fail = 0; + found_block = true; +- set_pageblock_skip(freepage); + break; + } + } +@@ -2375,19 +2380,20 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) + unsigned long iteration_start_pfn = cc->migrate_pfn; + + /* +- * Avoid multiple rescans which can happen if a page cannot be +- * isolated (dirty/writeback in async mode) or if the migrated +- * pages are being allocated before the pageblock is cleared. +- * The first rescan will capture the entire pageblock for +- * migration. If it fails, it'll be marked skip and scanning +- * will proceed as normal. ++ * Avoid multiple rescans of the same pageblock which can ++ * happen if a page cannot be isolated (dirty/writeback in ++ * async mode) or if the migrated pages are being allocated ++ * before the pageblock is cleared. The first rescan will ++ * capture the entire pageblock for migration. If it fails, ++ * it'll be marked skip and scanning will proceed as normal. + */ +- cc->rescan = false; ++ cc->finish_pageblock = false; + if (pageblock_start_pfn(last_migrated_pfn) == + pageblock_start_pfn(iteration_start_pfn)) { +- cc->rescan = true; ++ cc->finish_pageblock = true; + } + ++rescan: + switch (isolate_migratepages(cc)) { + case ISOLATE_ABORT: + ret = COMPACT_CONTENDED; +@@ -2430,18 +2436,37 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) + goto out; + } + /* +- * We failed to migrate at least one page in the current +- * order-aligned block, so skip the rest of it. ++ * If an ASYNC or SYNC_LIGHT fails to migrate a page ++ * within the current order-aligned block, scan the ++ * remainder of the pageblock. This will mark the ++ * pageblock "skip" to avoid rescanning in the near ++ * future. This will isolate more pages than necessary ++ * for the request but avoid loops due to ++ * fast_find_migrateblock revisiting blocks that were ++ * recently partially scanned. + */ +- if (cc->direct_compaction && +- (cc->mode == MIGRATE_ASYNC)) { +- cc->migrate_pfn = block_end_pfn( +- cc->migrate_pfn - 1, cc->order); +- /* Draining pcplists is useless in this case */ +- last_migrated_pfn = 0; ++ if (cc->direct_compaction && !cc->finish_pageblock && ++ (cc->mode < MIGRATE_SYNC)) { ++ cc->finish_pageblock = true; ++ ++ /* ++ * Draining pcplists does not help THP if ++ * any page failed to migrate. Even after ++ * drain, the pageblock will not be free. 
++ */ ++ if (cc->order == COMPACTION_HPAGE_ORDER) ++ last_migrated_pfn = 0; ++ ++ goto rescan; + } + } + ++ /* Stop if a page has been captured */ ++ if (capc && capc->page) { ++ ret = COMPACT_SUCCESS; ++ break; ++ } ++ + check_drain: + /* + * Has the migration scanner moved away from the previous +@@ -2460,12 +2485,6 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) + last_migrated_pfn = 0; + } + } +- +- /* Stop if a page has been captured */ +- if (capc && capc->page) { +- ret = COMPACT_SUCCESS; +- break; +- } + } + + out: +diff --git a/mm/internal.h b/mm/internal.h +index bcf75a8b032d..21466d0ab22f 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -422,7 +422,11 @@ struct compact_control { + bool proactive_compaction; /* kcompactd proactive compaction */ + bool whole_zone; /* Whole zone should/has been scanned */ + bool contended; /* Signal lock contention */ +- bool rescan; /* Rescanning the same pageblock */ ++ bool finish_pageblock; /* Scan the remainder of a pageblock. Used ++ * when there are potentially transient ++ * isolation or migration failures to ++ * ensure forward progress. ++ */ + bool alloc_contig; /* alloc_contig_range allocation */ + }; + +diff --git a/mm/ksm.c b/mm/ksm.c +index a92c9594a2d3..c267b92b837b 100644 +--- a/mm/ksm.c ++++ b/mm/ksm.c +@@ -214,6 +214,7 @@ struct ksm_rmap_item { + #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ + #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ + #define STABLE_FLAG 0x200 /* is listed from the stable tree */ ++#define ZERO_PAGE_FLAG 0x400 /* is zero page placed by KSM */ + + /* The stable and unstable tree heads */ + static struct rb_root one_stable_tree[1] = { RB_ROOT }; +@@ -275,6 +276,9 @@ static unsigned int zero_checksum __read_mostly; + /* Whether to merge empty (zeroed) pages with actual zero pages */ + static bool ksm_use_zero_pages __read_mostly; + ++/* The number of zero pages placed by KSM use_zero_pages */ ++static unsigned long ksm_zero_pages_sharing; ++ + #ifdef CONFIG_NUMA + /* Zeroed when merging across nodes is not allowed */ + static unsigned int ksm_merge_across_nodes = 1; +@@ -420,6 +424,11 @@ static inline bool ksm_test_exit(struct mm_struct *mm) + return atomic_read(&mm->mm_users) == 0; + } + ++enum break_ksm_pmd_entry_return_flag { ++ HAVE_KSM_PAGE = 1, ++ HAVE_ZERO_PAGE ++}; ++ + static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, + struct mm_walk *walk) + { +@@ -427,6 +436,7 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex + spinlock_t *ptl; + pte_t *pte; + int ret; ++ bool is_zero_page = false; + + if (pmd_leaf(*pmd) || !pmd_present(*pmd)) + return 0; +@@ -434,6 +444,8 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (pte_present(*pte)) { + page = vm_normal_page(walk->vma, addr, *pte); ++ if (!page) ++ is_zero_page = is_zero_pfn(pte_pfn(*pte)); + } else if (!pte_none(*pte)) { + swp_entry_t entry = pte_to_swp_entry(*pte); + +@@ -444,7 +456,14 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex + if (is_migration_entry(entry)) + page = pfn_swap_entry_to_page(entry); + } +- ret = page && PageKsm(page); ++ ++ if (page && PageKsm(page)) ++ ret = HAVE_KSM_PAGE; ++ else if (is_zero_page) ++ ret = HAVE_ZERO_PAGE; ++ else ++ ret = 0; ++ + pte_unmap_unlock(pte, ptl); + return ret; + } +@@ -466,19 +485,22 @@ static const struct mm_walk_ops break_ksm_ops = { + * of 
the process that owns 'vma'. We also do not want to enforce + * protection keys here anyway. + */ +-static int break_ksm(struct vm_area_struct *vma, unsigned long addr) ++static int break_ksm(struct vm_area_struct *vma, unsigned long addr, ++ bool unshare_zero_page) + { + vm_fault_t ret = 0; + + do { +- int ksm_page; ++ int walk_result; + + cond_resched(); +- ksm_page = walk_page_range_vma(vma, addr, addr + 1, ++ walk_result = walk_page_range_vma(vma, addr, addr + 1, + &break_ksm_ops, NULL); +- if (WARN_ON_ONCE(ksm_page < 0)) +- return ksm_page; +- if (!ksm_page) ++ if (WARN_ON_ONCE(walk_result < 0)) ++ return walk_result; ++ if (!walk_result) ++ return 0; ++ if (walk_result == HAVE_ZERO_PAGE && !unshare_zero_page) + return 0; + ret = handle_mm_fault(vma, addr, + FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, +@@ -539,7 +561,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item) + mmap_read_lock(mm); + vma = find_mergeable_vma(mm, addr); + if (vma) +- break_ksm(vma, addr); ++ break_ksm(vma, addr, false); + mmap_read_unlock(mm); + } + +@@ -764,6 +786,33 @@ static struct page *get_ksm_page(struct ksm_stable_node *stable_node, + return NULL; + } + ++/* ++ * Cleaning the rmap_item's ZERO_PAGE_FLAG ++ * This function will be called when unshare or writing on zero pages. ++ */ ++static inline void clean_rmap_item_zero_flag(struct ksm_rmap_item *rmap_item) ++{ ++ if (rmap_item->address & ZERO_PAGE_FLAG) { ++ ksm_zero_pages_sharing--; ++ rmap_item->mm->ksm_zero_pages_sharing--; ++ rmap_item->address &= PAGE_MASK; ++ } ++} ++ ++/* Only called when rmap_item is going to be freed */ ++static inline void unshare_zero_pages(struct ksm_rmap_item *rmap_item) ++{ ++ struct vm_area_struct *vma; ++ ++ if (rmap_item->address & ZERO_PAGE_FLAG) { ++ vma = vma_lookup(rmap_item->mm, rmap_item->address); ++ if (vma && !ksm_test_exit(rmap_item->mm)) ++ break_ksm(vma, rmap_item->address, true); ++ } ++ /* Put at last. */ ++ clean_rmap_item_zero_flag(rmap_item); ++} ++ + /* + * Removing rmap_item from stable or unstable tree. + * This function will clean the information from the stable/unstable tree. +@@ -824,6 +873,7 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) + struct ksm_rmap_item *rmap_item = *rmap_list; + *rmap_list = rmap_item->rmap_list; + remove_rmap_item_from_tree(rmap_item); ++ unshare_zero_pages(rmap_item); + free_rmap_item(rmap_item); + } + } +@@ -853,7 +903,7 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, + if (signal_pending(current)) + err = -ERESTARTSYS; + else +- err = break_ksm(vma, addr); ++ err = break_ksm(vma, addr, false); + } + return err; + } +@@ -2044,6 +2094,42 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, + rmap_item->mm->ksm_merging_pages++; + } + ++static int try_to_merge_with_kernel_zero_page(struct ksm_rmap_item *rmap_item, ++ struct page *page) ++{ ++ struct mm_struct *mm = rmap_item->mm; ++ int err = 0; ++ ++ /* ++ * It should not take ZERO_PAGE_FLAG because on one hand, ++ * get_next_rmap_item don't return zero pages' rmap_item. 
++ * On the other hand, even if zero page was writen as ++ * anonymous page, rmap_item has been cleaned after ++ * stable_tree_search ++ */ ++ if (!WARN_ON_ONCE(rmap_item->address & ZERO_PAGE_FLAG)) { ++ struct vm_area_struct *vma; ++ ++ mmap_read_lock(mm); ++ vma = find_mergeable_vma(mm, rmap_item->address); ++ if (vma) { ++ err = try_to_merge_one_page(vma, page, ++ ZERO_PAGE(rmap_item->address)); ++ if (!err) { ++ rmap_item->address |= ZERO_PAGE_FLAG; ++ ksm_zero_pages_sharing++; ++ rmap_item->mm->ksm_zero_pages_sharing++; ++ } ++ } else { ++ /* If the vma is out of date, we do not need to continue. */ ++ err = 0; ++ } ++ mmap_read_unlock(mm); ++ } ++ ++ return err; ++} ++ + /* + * cmp_and_merge_page - first see if page can be merged into the stable tree; + * if not, compare checksum to previous and if it's the same, see if page can +@@ -2055,7 +2141,6 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, + */ + static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) + { +- struct mm_struct *mm = rmap_item->mm; + struct ksm_rmap_item *tree_rmap_item; + struct page *tree_page = NULL; + struct ksm_stable_node *stable_node; +@@ -2092,6 +2177,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite + } + + remove_rmap_item_from_tree(rmap_item); ++ clean_rmap_item_zero_flag(rmap_item); + + if (kpage) { + if (PTR_ERR(kpage) == -EBUSY) +@@ -2128,29 +2214,16 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite + * Same checksum as an empty page. We attempt to merge it with the + * appropriate zero page if the user enabled this via sysfs. + */ +- if (ksm_use_zero_pages && (checksum == zero_checksum)) { +- struct vm_area_struct *vma; +- +- mmap_read_lock(mm); +- vma = find_mergeable_vma(mm, rmap_item->address); +- if (vma) { +- err = try_to_merge_one_page(vma, page, +- ZERO_PAGE(rmap_item->address)); +- } else { ++ if (ksm_use_zero_pages) { ++ if (checksum == zero_checksum) + /* +- * If the vma is out of date, we do not need to +- * continue. ++ * In case of failure, the page was not really empty, so we ++ * need to continue. Otherwise we're done. + */ +- err = 0; +- } +- mmap_read_unlock(mm); +- /* +- * In case of failure, the page was not really empty, so we +- * need to continue. Otherwise we're done. +- */ +- if (!err) +- return; ++ if (!try_to_merge_with_kernel_zero_page(rmap_item, page)) ++ return; + } ++ + tree_rmap_item = + unstable_tree_search_insert(rmap_item, page, &tree_page); + if (tree_rmap_item) { +@@ -2214,23 +2287,39 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite + } + } + +-static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, +- struct ksm_rmap_item **rmap_list, +- unsigned long addr) ++static struct ksm_rmap_item *try_to_get_old_rmap_item(unsigned long addr, ++ struct ksm_rmap_item **rmap_list) + { +- struct ksm_rmap_item *rmap_item; +- + while (*rmap_list) { +- rmap_item = *rmap_list; ++ struct ksm_rmap_item *rmap_item = *rmap_list; ++ + if ((rmap_item->address & PAGE_MASK) == addr) + return rmap_item; + if (rmap_item->address > addr) + break; + *rmap_list = rmap_item->rmap_list; ++ /* ++ * If we end up here, the VMA is MADV_UNMERGEABLE or its page ++ * is ineligible or discarded, e.g. MADV_DONTNEED. 
++ */ + remove_rmap_item_from_tree(rmap_item); ++ unshare_zero_pages(rmap_item); + free_rmap_item(rmap_item); + } + ++ return NULL; ++} ++ ++static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, ++ struct ksm_rmap_item **rmap_list, ++ unsigned long addr) ++{ ++ struct ksm_rmap_item *rmap_item; ++ ++ rmap_item = try_to_get_old_rmap_item(addr, rmap_list); ++ if (rmap_item) ++ return rmap_item; ++ + rmap_item = alloc_rmap_item(); + if (rmap_item) { + /* It has already been zeroed */ +@@ -2337,6 +2426,22 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) + } + if (is_zone_device_page(*page)) + goto next_page; ++ if (is_zero_pfn(page_to_pfn(*page))) { ++ /* ++ * To monitor ksm zero pages which becomes non-anonymous, ++ * we have to save each rmap_item of zero pages by ++ * try_to_get_old_rmap_item() walking on ++ * ksm_scan.rmap_list, otherwise their rmap_items will be ++ * freed by the next turn of get_next_rmap_item(). The ++ * function get_next_rmap_item() will free all "skipped" ++ * rmap_items because it thinks its areas as UNMERGEABLE. ++ */ ++ rmap_item = try_to_get_old_rmap_item(ksm_scan.address, ++ ksm_scan.rmap_list); ++ if (rmap_item && (rmap_item->address & ZERO_PAGE_FLAG)) ++ ksm_scan.rmap_list = &rmap_item->rmap_list; ++ goto next_page; ++ } + if (PageAnon(*page)) { + flush_anon_page(vma, *page, ksm_scan.address); + flush_dcache_page(*page); +@@ -3138,6 +3243,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj, + } + KSM_ATTR_RO(pages_volatile); + ++static ssize_t zero_pages_sharing_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sysfs_emit(buf, "%ld\n", ksm_zero_pages_sharing); ++} ++KSM_ATTR_RO(zero_pages_sharing); ++ + static ssize_t stable_node_dups_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) + { +@@ -3193,6 +3305,7 @@ static struct attribute *ksm_attrs[] = { + &pages_sharing_attr.attr, + &pages_unshared_attr.attr, + &pages_volatile_attr.attr, ++ &zero_pages_sharing_attr.attr, + &full_scans_attr.attr, + #ifdef CONFIG_NUMA + &merge_across_nodes_attr.attr, +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 3bb3484563ed..3aec9a6a9cb7 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -3119,6 +3119,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + { + unsigned long flags; + int i, allocated = 0; ++ struct list_head *prev_tail = list->prev; ++ struct page *pos, *n; + + spin_lock_irqsave(&zone->lock, flags); + for (i = 0; i < count; ++i) { +@@ -3127,9 +3129,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + if (unlikely(page == NULL)) + break; + +- if (unlikely(check_pcp_refill(page, order))) +- continue; +- + /* + * Split buddy pages returned by expand() are received here in + * physical page order. The page is added to the tail of +@@ -3141,7 +3140,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + * pages are ordered properly. + */ + list_add_tail(&page->pcp_list, list); +- allocated++; + if (is_migrate_cma(get_pcppage_migratetype(page))) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, + -(1 << order)); +@@ -3155,6 +3153,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + */ + __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); + spin_unlock_irqrestore(&zone->lock, flags); ++ ++ /* ++ * Pages are appended to the pcp list without checking to reduce the ++ * time holding the zone lock. 
Checking the appended pages happens right ++ * after the critical section while still holding the pcp lock. ++ */ ++ pos = list_first_entry(prev_tail, struct page, pcp_list); ++ list_for_each_entry_safe_from(pos, n, list, pcp_list) { ++ if (unlikely(check_pcp_refill(pos, order))) { ++ list_del(&pos->pcp_list); ++ continue; ++ } ++ ++ allocated++; ++ } ++ + return allocated; + } + +diff --git a/mm/z3fold.c b/mm/z3fold.c +index a4de0c317ac7..0cef845d397b 100644 +--- a/mm/z3fold.c ++++ b/mm/z3fold.c +@@ -1450,7 +1450,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) + struct z3fold_header *zhdr; + struct z3fold_pool *pool; + +- VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(PageIsolated(page), page); + + if (test_bit(PAGE_HEADLESS, &page->private)) +@@ -1490,7 +1489,6 @@ static int z3fold_page_migrate(struct page *newpage, struct page *page, + struct z3fold_header *zhdr, *new_zhdr; + struct z3fold_pool *pool; + +- VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); +diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c +index 702bc3fd687a..9d27d9b00bce 100644 +--- a/mm/zsmalloc.c ++++ b/mm/zsmalloc.c +@@ -2056,7 +2056,6 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) + * Page is locked so zspage couldn't be destroyed. For detail, look at + * lock_zspage in free_zspage. + */ +- VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(PageIsolated(page), page); + + zspage = get_zspage(page); +@@ -2088,7 +2087,6 @@ static int zs_page_migrate(struct page *newpage, struct page *page, + if (mode == MIGRATE_SYNC_NO_COPY) + return -EINVAL; + +- VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + /* The page is locked, so this pointer must remain valid */ +@@ -2153,7 +2151,6 @@ static void zs_page_putback(struct page *page) + { + struct zspage *zspage; + +- VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + zspage = get_zspage(page); +diff --git a/scripts/Kconfig.include b/scripts/Kconfig.include +index 274125307ebd..5a84b6443875 100644 +--- a/scripts/Kconfig.include ++++ b/scripts/Kconfig.include +@@ -33,7 +33,7 @@ ld-option = $(success,$(LD) -v $(1)) + + # $(as-instr,) + # Return y if the assembler supports , n otherwise +-as-instr = $(success,printf "%b\n" "$(1)" | $(CC) $(CLANG_FLAGS) -c -x assembler -o /dev/null -) ++as-instr = $(success,printf "%b\n" "$(1)" | $(CC) $(CLANG_FLAGS) -c -x assembler-with-cpp -o /dev/null -) + + # check if $(CC) and $(LD) exist + $(error-if,$(failure,command -v $(CC)),C compiler '$(CC)' not found) +diff --git a/scripts/Makefile.compiler b/scripts/Makefile.compiler +index 3d8adfd34af1..7aa1fbc4aafe 100644 +--- a/scripts/Makefile.compiler ++++ b/scripts/Makefile.compiler +@@ -29,16 +29,16 @@ try-run = $(shell set -e; \ + fi) + + # as-option +-# Usage: cflags-y += $(call as-option,-Wa$(comma)-isa=foo,) ++# Usage: aflags-y += $(call as-option,-Wa$(comma)-isa=foo,) + + as-option = $(call try-run,\ +- $(CC) $(KBUILD_CFLAGS) $(1) -c -x assembler /dev/null -o "$$TMP",$(1),$(2)) ++ $(CC) -Werror $(KBUILD_AFLAGS) $(1) -c -x assembler-with-cpp /dev/null -o "$$TMP",$(1),$(2)) + + # as-instr +-# Usage: cflags-y += $(call as-instr,instr,option1,option2) ++# Usage: aflags-y += $(call as-instr,instr,option1,option2) + + as-instr = $(call try-run,\ +- printf "%b\n" "$(1)" | $(CC) $(KBUILD_AFLAGS) -c -x 
assembler -o "$$TMP" -,$(2),$(3)) ++ printf "%b\n" "$(1)" | $(CC) -Werror $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3)) + + # __cc-option + # Usage: MY_CFLAGS += $(call __cc-option,$(CC),$(MY_CFLAGS),-march=winchip-c6,-march=i586) +diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o +index 0edfdb40364b..ae52d3b3f063 100644 +--- a/scripts/Makefile.vmlinux_o ++++ b/scripts/Makefile.vmlinux_o +@@ -19,7 +19,7 @@ quiet_cmd_gen_initcalls_lds = GEN $@ + + .tmp_initcalls.lds: $(srctree)/scripts/generate_initcall_order.pl \ + vmlinux.a $(KBUILD_VMLINUX_LIBS) FORCE +- $(call if_changed,gen_initcalls_lds) ++ +$(call if_changed,gen_initcalls_lds) + + targets := .tmp_initcalls.lds + +diff --git a/scripts/as-version.sh b/scripts/as-version.sh +index 1a21495e9ff0..af717476152d 100755 +--- a/scripts/as-version.sh ++++ b/scripts/as-version.sh +@@ -45,7 +45,7 @@ orig_args="$@" + # Get the first line of the --version output. + IFS=' + ' +-set -- $(LC_ALL=C "$@" -Wa,--version -c -x assembler /dev/null -o /dev/null 2>/dev/null) ++set -- $(LC_ALL=C "$@" -Wa,--version -c -x assembler-with-cpp /dev/null -o /dev/null 2>/dev/null) + + # Split the line on spaces. + IFS=' ' +diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening +index 53baa95cb644..0f295961e773 100644 +--- a/security/Kconfig.hardening ++++ b/security/Kconfig.hardening +@@ -281,6 +281,9 @@ endmenu + + config CC_HAS_RANDSTRUCT + def_bool $(cc-option,-frandomize-layout-seed-file=/dev/null) ++ # Randstruct was first added in Clang 15, but it isn't safe to use until ++ # Clang 16 due to https://github.com/llvm/llvm-project/issues/60349 ++ depends on !CC_IS_CLANG || CLANG_VERSION >= 160000 + + choice + prompt "Randomize layout of sensitive kernel structures" +diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c +index f7815ee24f83..e94b0a6b96df 100644 +--- a/sound/pci/hda/cs35l41_hda.c ++++ b/sound/pci/hda/cs35l41_hda.c +@@ -1240,7 +1240,7 @@ static int cs35l41_no_acpi_dsd(struct cs35l41_hda *cs35l41, struct device *physd + + if (strncmp(hid, "CLSA0100", 8) == 0) { + hw_cfg->bst_type = CS35L41_EXT_BOOST_NO_VSPK_SWITCH; +- } else if (strncmp(hid, "CLSA0101", 8) == 0) { ++ } else if (strncmp(hid, "CLSA0101", 8) == 0 || strncmp(hid, "CSC3551", 7) == 0) { + hw_cfg->bst_type = CS35L41_EXT_BOOST; + hw_cfg->gpio1.func = CS35l41_VSPK_SWITCH; + hw_cfg->gpio1.valid = true; +diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/vm/ksm_functional_tests.c +index b11b7e5115dc..3033cd6ed3b4 100644 +--- a/tools/testing/selftests/vm/ksm_functional_tests.c ++++ b/tools/testing/selftests/vm/ksm_functional_tests.c +@@ -24,9 +24,12 @@ + + #define KiB 1024u + #define MiB (1024 * KiB) ++#define PageSize (4 * KiB) + + static int ksm_fd; + static int ksm_full_scans_fd; ++static int ksm_zero_pages_fd; ++static int ksm_use_zero_pages_fd; + static int pagemap_fd; + static size_t pagesize; + +@@ -57,6 +60,21 @@ static bool range_maps_duplicates(char *addr, unsigned long size) + return false; + } + ++static long ksm_get_zero_pages(void) ++{ ++ char buf[20]; ++ ssize_t read_size; ++ unsigned long ksm_zero_pages; ++ ++ read_size = pread(ksm_zero_pages_fd, buf, sizeof(buf) - 1, 0); ++ if (read_size < 0) ++ return -errno; ++ buf[read_size] = 0; ++ ksm_zero_pages = strtol(buf, NULL, 10); ++ ++ return ksm_zero_pages; ++} ++ + static long ksm_get_full_scans(void) + { + char buf[10]; +@@ -70,15 +88,12 @@ static long ksm_get_full_scans(void) + return strtol(buf, NULL, 10); + } + +-static int 
ksm_merge(void) ++static int wait_two_full_scans(void) + { + long start_scans, end_scans; + +- /* Wait for two full scans such that any possible merging happened. */ + start_scans = ksm_get_full_scans(); + if (start_scans < 0) +- return start_scans; +- if (write(ksm_fd, "1", 1) != 1) + return -errno; + do { + end_scans = ksm_get_full_scans(); +@@ -89,6 +104,34 @@ static int ksm_merge(void) + return 0; + } + ++static inline int ksm_merge(void) ++{ ++ /* Wait for two full scans such that any possible merging happened. */ ++ if (write(ksm_fd, "1", 1) != 1) ++ return -errno; ++ ++ return wait_two_full_scans(); ++} ++ ++static int unmerge_zero_page(char *start, unsigned long size) ++{ ++ int ret; ++ ++ ret = madvise(start, size, MADV_UNMERGEABLE); ++ if (ret) { ++ ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); ++ return ret; ++ } ++ ++ /* ++ * Wait for two full scans such that any possible unmerging of zero ++ * pages happened. Why? Because the unmerge action of zero pages is not ++ * done in the context of madvise(), but in the context of ++ * unshare_zero_pages() of the ksmd thread. ++ */ ++ return wait_two_full_scans(); ++} ++ + static char *mmap_and_merge_range(char val, unsigned long size) + { + char *map; +@@ -146,6 +189,48 @@ static void test_unmerge(void) + munmap(map, size); + } + ++static void test_unmerge_zero_pages(void) ++{ ++ const unsigned int size = 2 * MiB; ++ char *map; ++ unsigned long pages_expected; ++ ++ ksft_print_msg("[RUN] %s\n", __func__); ++ ++ /* Confirm the interfaces*/ ++ if (ksm_zero_pages_fd < 0) { ++ ksft_test_result_skip("open(\"/sys/kernel/mm/ksm/zero_pages_sharing\") failed\n"); ++ return; ++ } ++ if (ksm_use_zero_pages_fd < 0) { ++ ksft_test_result_skip("open \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); ++ return; ++ } ++ if (write(ksm_use_zero_pages_fd, "1", 1) != 1) { ++ ksft_test_result_skip("write \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); ++ return; ++ } ++ ++ /* Mmap zero pages*/ ++ map = mmap_and_merge_range(0x00, size); ++ if (map == MAP_FAILED) ++ return; ++ ++ if (unmerge_zero_page(map + size / 2, size / 2)) ++ goto unmap; ++ ++ /* Check if zero_pages_sharing can be update correctly when unmerge */ ++ pages_expected = (size / 2) / PageSize; ++ ksft_test_result(pages_expected == ksm_get_zero_pages(), ++ "zero page count react to unmerge\n"); ++ ++ /* Check if ksm zero pages are really unmerged */ ++ ksft_test_result(!range_maps_duplicates(map + size / 2, size / 2), ++ "KSM zero pages were unmerged\n"); ++unmap: ++ munmap(map, size); ++} ++ + static void test_unmerge_discarded(void) + { + const unsigned int size = 2 * MiB; +@@ -264,8 +349,11 @@ int main(int argc, char **argv) + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); ++ ksm_zero_pages_fd = open("/sys/kernel/mm/ksm/zero_pages_sharing", O_RDONLY); ++ ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR); + + test_unmerge(); ++ test_unmerge_zero_pages(); + test_unmerge_discarded(); + #ifdef __NR_userfaultfd + test_unmerge_uffd_wp(); +-- +2.39.2 + +From d1c5ae2d043a5ae09cbe88ad8f21e4753ced9418 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 13 Feb 2023 11:27:09 +0100 +Subject: [PATCH 07/15] fs-patches + +Signed-off-by: Peter Jung +--- + Documentation/admin-guide/xfs.rst | 2 +- + block/blk-merge.c | 3 +- + fs/btrfs/Makefile | 6 +- + fs/btrfs/backref.c | 33 +- + fs/btrfs/bio.c | 557 +++++++++++++++++++++--- + fs/btrfs/bio.h | 67 +-- + fs/btrfs/block-group.c | 273 
++++++++++-- + fs/btrfs/block-group.h | 24 +- + fs/btrfs/btrfs_inode.h | 22 +- + fs/btrfs/compression.c | 276 ++---------- + fs/btrfs/compression.h | 3 - + fs/btrfs/ctree.c | 62 ++- + fs/btrfs/ctree.h | 15 + + fs/btrfs/defrag.c | 4 +- + fs/btrfs/delayed-ref.c | 24 +- + fs/btrfs/delayed-ref.h | 2 +- + fs/btrfs/discard.c | 41 +- + fs/btrfs/disk-io.c | 225 +--------- + fs/btrfs/disk-io.h | 14 +- + fs/btrfs/extent-io-tree.c | 10 +- + fs/btrfs/extent-io-tree.h | 1 - + fs/btrfs/extent-tree.c | 181 +++----- + fs/btrfs/extent-tree.h | 81 ++++ + fs/btrfs/extent_io.c | 582 +++---------------------- + fs/btrfs/extent_io.h | 36 +- + fs/btrfs/file-item.c | 72 ++-- + fs/btrfs/file-item.h | 8 +- + fs/btrfs/file.c | 2 +- + fs/btrfs/free-space-tree.c | 2 +- + fs/btrfs/fs.c | 4 + + fs/btrfs/fs.h | 11 +- + fs/btrfs/inode.c | 641 ++++------------------------ + fs/btrfs/ioctl.c | 2 +- + fs/btrfs/lru_cache.c | 166 ++++++++ + fs/btrfs/lru_cache.h | 80 ++++ + fs/btrfs/lzo.c | 2 +- + fs/btrfs/messages.c | 30 -- + fs/btrfs/messages.h | 34 -- + fs/btrfs/ordered-data.c | 25 +- + fs/btrfs/ordered-data.h | 3 +- + fs/btrfs/qgroup.c | 2 +- + fs/btrfs/raid56.c | 334 ++++++--------- + fs/btrfs/raid56.h | 4 +- + fs/btrfs/relocation.c | 2 +- + fs/btrfs/scrub.c | 51 ++- + fs/btrfs/send.c | 684 ++++++++++++++++-------------- + fs/btrfs/super.c | 3 +- + fs/btrfs/sysfs.c | 41 +- + fs/btrfs/sysfs.h | 3 +- + fs/btrfs/tests/extent-map-tests.c | 2 +- + fs/btrfs/transaction.c | 34 ++ + fs/btrfs/transaction.h | 31 ++ + fs/btrfs/tree-log.c | 87 ++-- + fs/btrfs/tree-log.h | 9 +- + fs/btrfs/volumes.c | 116 ++--- + fs/btrfs/volumes.h | 18 - + fs/btrfs/zoned.c | 146 +++---- + fs/btrfs/zoned.h | 20 +- + fs/gfs2/bmap.c | 38 +- + fs/iomap/buffered-io.c | 91 ++-- + fs/iomap/direct-io.c | 10 +- + fs/xfs/libxfs/xfs_alloc.c | 32 +- + fs/xfs/libxfs/xfs_bmap.c | 32 +- + fs/xfs/libxfs/xfs_bmap.h | 5 +- + fs/xfs/libxfs/xfs_btree.c | 18 +- + fs/xfs/libxfs/xfs_refcount.c | 96 ++--- + fs/xfs/libxfs/xfs_refcount.h | 4 +- + fs/xfs/libxfs/xfs_rmap.c | 50 ++- + fs/xfs/libxfs/xfs_rmap.h | 6 +- + fs/xfs/xfs_bmap_item.c | 137 +++--- + fs/xfs/xfs_error.c | 2 +- + fs/xfs/xfs_error.h | 12 +- + fs/xfs/xfs_extfree_item.c | 99 +++-- + fs/xfs/xfs_globals.c | 3 +- + fs/xfs/xfs_iomap.c | 4 +- + fs/xfs/xfs_refcount_item.c | 110 +++-- + fs/xfs/xfs_rmap_item.c | 142 +++---- + fs/xfs/xfs_sysfs.c | 12 +- + fs/xfs/xfs_sysfs.h | 10 +- + fs/xfs/xfs_trace.h | 15 +- + include/linux/bio.h | 4 + + include/linux/iomap.h | 30 +- + include/trace/events/btrfs.h | 127 +++++- + 83 files changed, 2936 insertions(+), 3366 deletions(-) + create mode 100644 fs/btrfs/lru_cache.c + create mode 100644 fs/btrfs/lru_cache.h + +diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst +index 8de008c0c5ad..e2561416391c 100644 +--- a/Documentation/admin-guide/xfs.rst ++++ b/Documentation/admin-guide/xfs.rst +@@ -296,7 +296,7 @@ The following sysctls are available for the XFS filesystem: + XFS_ERRLEVEL_LOW: 1 + XFS_ERRLEVEL_HIGH: 5 + +- fs.xfs.panic_mask (Min: 0 Default: 0 Max: 256) ++ fs.xfs.panic_mask (Min: 0 Default: 0 Max: 511) + Causes certain error conditions to call BUG(). 
Value is a bitmask; + OR together the tags which represent errors which should cause panics: + +diff --git a/block/blk-merge.c b/block/blk-merge.c +index b7c193d67185..64bf7d9dd8e8 100644 +--- a/block/blk-merge.c ++++ b/block/blk-merge.c +@@ -276,7 +276,7 @@ static bool bvec_split_segs(const struct queue_limits *lim, + * responsible for ensuring that @bs is only destroyed after processing of the + * split bio has finished. + */ +-static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, ++struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, struct bio_set *bs, unsigned max_bytes) + { + struct bio_vec bv, bvprv, *bvprvp = NULL; +@@ -336,6 +336,7 @@ static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, + bio_clear_polled(bio); + return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs); + } ++EXPORT_SYMBOL_GPL(bio_split_rw); + + /** + * __bio_split_to_limits - split a bio to fit the queue limits +diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile +index 555c962fdad6..90d53209755b 100644 +--- a/fs/btrfs/Makefile ++++ b/fs/btrfs/Makefile +@@ -11,7 +11,8 @@ condflags := \ + $(call cc-option, -Wunused-but-set-variable) \ + $(call cc-option, -Wunused-const-variable) \ + $(call cc-option, -Wpacked-not-aligned) \ +- $(call cc-option, -Wstringop-truncation) ++ $(call cc-option, -Wstringop-truncation) \ ++ $(call cc-option, -Wmaybe-uninitialized) + subdir-ccflags-y += $(condflags) + # The following turn off the warnings enabled by -Wextra + subdir-ccflags-y += -Wno-missing-field-initializers +@@ -31,7 +32,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ + uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ + block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ +- subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o ++ subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \ ++ lru_cache.o + + btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o + btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o +diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c +index 46851511b661..90e40d5ceccd 100644 +--- a/fs/btrfs/backref.c ++++ b/fs/btrfs/backref.c +@@ -1252,8 +1252,12 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct + struct btrfs_root *root, + u64 bytenr, int level, bool *is_shared) + { ++ const struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_backref_shared_cache_entry *entry; + ++ if (!current->journal_info) ++ lockdep_assert_held(&fs_info->commit_root_sem); ++ + if (!ctx->use_path_cache) + return false; + +@@ -1288,7 +1292,7 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct + * could be a snapshot sharing this extent buffer. 
+ */ + if (entry->is_shared && +- entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) ++ entry->gen != btrfs_get_last_root_drop_gen(fs_info)) + return false; + + *is_shared = entry->is_shared; +@@ -1318,9 +1322,13 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx + struct btrfs_root *root, + u64 bytenr, int level, bool is_shared) + { ++ const struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_backref_shared_cache_entry *entry; + u64 gen; + ++ if (!current->journal_info) ++ lockdep_assert_held(&fs_info->commit_root_sem); ++ + if (!ctx->use_path_cache) + return; + +@@ -1336,7 +1344,7 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx + ASSERT(level >= 0); + + if (is_shared) +- gen = btrfs_get_last_root_drop_gen(root->fs_info); ++ gen = btrfs_get_last_root_drop_gen(fs_info); + else + gen = btrfs_root_last_snapshot(&root->root_item); + +@@ -1864,6 +1872,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, + .have_delayed_delete_refs = false, + }; + int level; ++ bool leaf_cached; ++ bool leaf_is_shared; + + for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) { + if (ctx->prev_extents_cache[i].bytenr == bytenr) +@@ -1885,6 +1895,23 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, + walk_ctx.time_seq = elem.seq; + } + ++ ctx->use_path_cache = true; ++ ++ /* ++ * We may have previously determined that the current leaf is shared. ++ * If it is, then we have a data extent that is shared due to a shared ++ * subtree (caused by snapshotting) and we don't need to check for data ++ * backrefs. If the leaf is not shared, then we must do backref walking ++ * to determine if the data extent is shared through reflinks. ++ */ ++ leaf_cached = lookup_backref_shared_cache(ctx, root, ++ ctx->curr_leaf_bytenr, 0, ++ &leaf_is_shared); ++ if (leaf_cached && leaf_is_shared) { ++ ret = 1; ++ goto out_trans; ++ } ++ + walk_ctx.ignore_extent_item_pos = true; + walk_ctx.trans = trans; + walk_ctx.fs_info = fs_info; +@@ -1893,7 +1920,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, + /* -1 means we are in the bytenr of the data extent. */ + level = -1; + ULIST_ITER_INIT(&uiter); +- ctx->use_path_cache = true; + while (1) { + bool is_shared; + bool cached; +@@ -1964,6 +1990,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, + ctx->prev_extents_cache_slot = slot; + } + ++out_trans: + if (trans) { + btrfs_put_tree_mod_seq(fs_info, &elem); + btrfs_end_transaction(trans); +diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c +index 8affc88b0e0a..d8b90f95b157 100644 +--- a/fs/btrfs/bio.c ++++ b/fs/btrfs/bio.c +@@ -14,19 +14,31 @@ + #include "dev-replace.h" + #include "rcu-string.h" + #include "zoned.h" ++#include "file-item.h" + + static struct bio_set btrfs_bioset; ++static struct bio_set btrfs_clone_bioset; ++static struct bio_set btrfs_repair_bioset; ++static mempool_t btrfs_failed_bio_pool; ++ ++struct btrfs_failed_bio { ++ struct btrfs_bio *bbio; ++ int num_copies; ++ atomic_t repair_count; ++}; + + /* + * Initialize a btrfs_bio structure. This skips the embedded bio itself as it + * is already initialized by the block layer. 
+ */ +-static inline void btrfs_bio_init(struct btrfs_bio *bbio, +- btrfs_bio_end_io_t end_io, void *private) ++void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, ++ btrfs_bio_end_io_t end_io, void *private) + { + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); ++ bbio->inode = inode; + bbio->end_io = end_io; + bbio->private = private; ++ atomic_set(&bbio->pending_ios, 1); + } + + /* +@@ -37,32 +49,235 @@ static inline void btrfs_bio_init(struct btrfs_bio *bbio, + * a mempool. + */ + struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, ++ struct btrfs_inode *inode, + btrfs_bio_end_io_t end_io, void *private) + { + struct bio *bio; + + bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); +- btrfs_bio_init(btrfs_bio(bio), end_io, private); ++ btrfs_bio_init(btrfs_bio(bio), inode, end_io, private); + return bio; + } + +-struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, +- btrfs_bio_end_io_t end_io, void *private) ++static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, ++ struct bio *orig, u64 map_length, ++ bool use_append) + { ++ struct btrfs_bio *orig_bbio = btrfs_bio(orig); + struct bio *bio; +- struct btrfs_bio *bbio; + +- ASSERT(offset <= UINT_MAX && size <= UINT_MAX); ++ if (use_append) { ++ unsigned int nr_segs; ++ ++ bio = bio_split_rw(orig, &fs_info->limits, &nr_segs, ++ &btrfs_clone_bioset, map_length); ++ } else { ++ bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS, ++ &btrfs_clone_bioset); ++ } ++ btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio); + +- bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); +- bbio = btrfs_bio(bio); +- btrfs_bio_init(bbio, end_io, private); ++ btrfs_bio(bio)->file_offset = orig_bbio->file_offset; ++ if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED)) ++ orig_bbio->file_offset += map_length; + +- bio_trim(bio, offset >> 9, size >> 9); +- bbio->iter = bio->bi_iter; ++ atomic_inc(&orig_bbio->pending_ios); + return bio; + } + ++static void btrfs_orig_write_end_io(struct bio *bio); ++ ++static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, ++ struct btrfs_bio *orig_bbio) ++{ ++ /* ++ * For writes we tolerate nr_mirrors - 1 write failures, so we can't ++ * just blindly propagate a write failure here. Instead increment the ++ * error count in the original I/O context so that it is guaranteed to ++ * be larger than the error tolerance. 
++ */ ++ if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { ++ struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; ++ struct btrfs_io_context *orig_bioc = orig_stripe->bioc; ++ ++ atomic_add(orig_bioc->max_errors, &orig_bioc->error); ++ } else { ++ orig_bbio->bio.bi_status = bbio->bio.bi_status; ++ } ++} ++ ++static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) ++{ ++ if (bbio->bio.bi_pool == &btrfs_clone_bioset) { ++ struct btrfs_bio *orig_bbio = bbio->private; ++ ++ if (bbio->bio.bi_status) ++ btrfs_bbio_propagate_error(bbio, orig_bbio); ++ bio_put(&bbio->bio); ++ bbio = orig_bbio; ++ } ++ ++ if (atomic_dec_and_test(&bbio->pending_ios)) ++ bbio->end_io(bbio); ++} ++ ++static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) ++{ ++ if (cur_mirror == fbio->num_copies) ++ return cur_mirror + 1 - fbio->num_copies; ++ return cur_mirror + 1; ++} ++ ++static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) ++{ ++ if (cur_mirror == 1) ++ return fbio->num_copies; ++ return cur_mirror - 1; ++} ++ ++static void btrfs_repair_done(struct btrfs_failed_bio *fbio) ++{ ++ if (atomic_dec_and_test(&fbio->repair_count)) { ++ btrfs_orig_bbio_end_io(fbio->bbio); ++ mempool_free(fbio, &btrfs_failed_bio_pool); ++ } ++} ++ ++static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, ++ struct btrfs_device *dev) ++{ ++ struct btrfs_failed_bio *fbio = repair_bbio->private; ++ struct btrfs_inode *inode = repair_bbio->inode; ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); ++ int mirror = repair_bbio->mirror_num; ++ ++ if (repair_bbio->bio.bi_status || ++ !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { ++ bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); ++ repair_bbio->bio.bi_iter = repair_bbio->saved_iter; ++ ++ mirror = next_repair_mirror(fbio, mirror); ++ if (mirror == fbio->bbio->mirror_num) { ++ btrfs_debug(fs_info, "no mirror left"); ++ fbio->bbio->bio.bi_status = BLK_STS_IOERR; ++ goto done; ++ } ++ ++ btrfs_submit_bio(&repair_bbio->bio, mirror); ++ return; ++ } ++ ++ do { ++ mirror = prev_repair_mirror(fbio, mirror); ++ btrfs_repair_io_failure(fs_info, btrfs_ino(inode), ++ repair_bbio->file_offset, fs_info->sectorsize, ++ repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, ++ bv->bv_page, bv->bv_offset, mirror); ++ } while (mirror != fbio->bbio->mirror_num); ++ ++done: ++ btrfs_repair_done(fbio); ++ bio_put(&repair_bbio->bio); ++} ++ ++/* ++ * Try to kick off a repair read to the next available mirror for a bad sector. ++ * ++ * This primarily tries to recover good data to serve the actual read request, ++ * but also tries to write the good data back to the bad mirror(s) when a ++ * read succeeded to restore the redundancy. 
++ */ ++static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, ++ u32 bio_offset, ++ struct bio_vec *bv, ++ struct btrfs_failed_bio *fbio) ++{ ++ struct btrfs_inode *inode = failed_bbio->inode; ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ const u32 sectorsize = fs_info->sectorsize; ++ const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); ++ struct btrfs_bio *repair_bbio; ++ struct bio *repair_bio; ++ int num_copies; ++ int mirror; ++ ++ btrfs_debug(fs_info, "repair read error: read error at %llu", ++ failed_bbio->file_offset + bio_offset); ++ ++ num_copies = btrfs_num_copies(fs_info, logical, sectorsize); ++ if (num_copies == 1) { ++ btrfs_debug(fs_info, "no copy to repair from"); ++ failed_bbio->bio.bi_status = BLK_STS_IOERR; ++ return fbio; ++ } ++ ++ if (!fbio) { ++ fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS); ++ fbio->bbio = failed_bbio; ++ fbio->num_copies = num_copies; ++ atomic_set(&fbio->repair_count, 1); ++ } ++ ++ atomic_inc(&fbio->repair_count); ++ ++ repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, ++ &btrfs_repair_bioset); ++ repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; ++ bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); ++ ++ repair_bbio = btrfs_bio(repair_bio); ++ btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio); ++ repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; ++ ++ mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); ++ btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); ++ btrfs_submit_bio(repair_bio, mirror); ++ return fbio; ++} ++ ++static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev) ++{ ++ struct btrfs_inode *inode = bbio->inode; ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ u32 sectorsize = fs_info->sectorsize; ++ struct bvec_iter *iter = &bbio->saved_iter; ++ blk_status_t status = bbio->bio.bi_status; ++ struct btrfs_failed_bio *fbio = NULL; ++ u32 offset = 0; ++ ++ /* ++ * Hand off repair bios to the repair code as there is no upper level ++ * submitter for them. ++ */ ++ if (bbio->bio.bi_pool == &btrfs_repair_bioset) { ++ btrfs_end_repair_bio(bbio, dev); ++ return; ++ } ++ ++ /* Clear the I/O error. A failed repair will reset it. */ ++ bbio->bio.bi_status = BLK_STS_OK; ++ ++ while (iter->bi_size) { ++ struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter); ++ ++ bv.bv_len = min(bv.bv_len, sectorsize); ++ if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv)) ++ fbio = repair_one_sector(bbio, offset, &bv, fbio); ++ ++ bio_advance_iter_single(&bbio->bio, iter, sectorsize); ++ offset += sectorsize; ++ } ++ ++ if (bbio->csum != bbio->csum_inline) ++ kfree(bbio->csum); ++ ++ if (fbio) ++ btrfs_repair_done(fbio); ++ else ++ btrfs_orig_bbio_end_io(bbio); ++} ++ + static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) + { + if (!dev || !dev->bdev) +@@ -90,24 +305,31 @@ static void btrfs_end_bio_work(struct work_struct *work) + { + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + +- bbio->end_io(bbio); ++ /* Metadata reads are checked and repaired by the submitter. 
*/ ++ if (bbio->bio.bi_opf & REQ_META) ++ bbio->end_io(bbio); ++ else ++ btrfs_check_read_bio(bbio, bbio->bio.bi_private); + } + + static void btrfs_simple_end_io(struct bio *bio) + { +- struct btrfs_fs_info *fs_info = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); ++ struct btrfs_device *dev = bio->bi_private; ++ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + + btrfs_bio_counter_dec(fs_info); + + if (bio->bi_status) +- btrfs_log_dev_io_error(bio, bbio->device); ++ btrfs_log_dev_io_error(bio, dev); + + if (bio_op(bio) == REQ_OP_READ) { + INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); + queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); + } else { +- bbio->end_io(bbio); ++ if (bio_op(bio) == REQ_OP_ZONE_APPEND) ++ btrfs_record_physical_zoned(bbio); ++ btrfs_orig_bbio_end_io(bbio); + } + } + +@@ -118,7 +340,10 @@ static void btrfs_raid56_end_io(struct bio *bio) + + btrfs_bio_counter_dec(bioc->fs_info); + bbio->mirror_num = bioc->mirror_num; +- bbio->end_io(bbio); ++ if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META)) ++ btrfs_check_read_bio(bbio, NULL); ++ else ++ btrfs_orig_bbio_end_io(bbio); + + btrfs_put_bioc(bioc); + } +@@ -145,7 +370,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) + else + bio->bi_status = BLK_STS_OK; + +- bbio->end_io(bbio); ++ btrfs_orig_bbio_end_io(bbio); + btrfs_put_bioc(bioc); + } + +@@ -181,16 +406,10 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; ++ u64 zone_start = round_down(physical, dev->fs_info->zone_size); + +- if (btrfs_dev_is_sequential(dev, physical)) { +- u64 zone_start = round_down(physical, +- dev->fs_info->zone_size); +- +- bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; +- } else { +- bio->bi_opf &= ~REQ_OP_ZONE_APPEND; +- bio->bi_opf |= REQ_OP_WRITE; +- } ++ ASSERT(btrfs_dev_is_sequential(dev, physical)); ++ bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; + } + btrfs_debug_in_rcu(dev->fs_info, + "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", +@@ -224,41 +443,21 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) + btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); + } + +-void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) ++static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, ++ struct btrfs_io_stripe *smap, int mirror_num) + { +- u64 logical = bio->bi_iter.bi_sector << 9; +- u64 length = bio->bi_iter.bi_size; +- u64 map_length = length; +- struct btrfs_io_context *bioc = NULL; +- struct btrfs_io_stripe smap; +- int ret; +- +- btrfs_bio_counter_inc_blocked(fs_info); +- ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, +- &bioc, &smap, &mirror_num, 1); +- if (ret) { +- btrfs_bio_counter_dec(fs_info); +- btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); +- return; +- } +- +- if (map_length < length) { +- btrfs_crit(fs_info, +- "mapping failed logical %llu bio len %llu len %llu", +- logical, length, map_length); +- BUG(); +- } ++ /* Do not leak our private flag into the block layer. */ ++ bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED; + + if (!bioc) { +- /* Single mirror read/write fast path */ ++ /* Single mirror read/write fast path. 
*/ + btrfs_bio(bio)->mirror_num = mirror_num; +- btrfs_bio(bio)->device = smap.dev; +- bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; +- bio->bi_private = fs_info; ++ bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; ++ bio->bi_private = smap->dev; + bio->bi_end_io = btrfs_simple_end_io; +- btrfs_submit_dev_bio(smap.dev, bio); ++ btrfs_submit_dev_bio(smap->dev, bio); + } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { +- /* Parity RAID write or read recovery */ ++ /* Parity RAID write or read recovery. */ + bio->bi_private = bioc; + bio->bi_end_io = btrfs_raid56_end_io; + if (bio_op(bio) == REQ_OP_READ) +@@ -266,16 +465,233 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror + else + raid56_parity_write(bio, bioc); + } else { +- /* Write to multiple mirrors */ ++ /* Write to multiple mirrors. */ + int total_devs = bioc->num_stripes; +- int dev_nr; + + bioc->orig_bio = bio; +- for (dev_nr = 0; dev_nr < total_devs; dev_nr++) ++ for (int dev_nr = 0; dev_nr < total_devs; dev_nr++) + btrfs_submit_mirrored_bio(bioc, dev_nr); + } + } + ++static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) ++{ ++ if (bbio->bio.bi_opf & REQ_META) ++ return btree_csum_one_bio(bbio); ++ return btrfs_csum_one_bio(bbio); ++} ++ ++/* ++ * Async submit bios are used to offload expensive checksumming onto the worker ++ * threads. ++ */ ++struct async_submit_bio { ++ struct btrfs_bio *bbio; ++ struct btrfs_io_context *bioc; ++ struct btrfs_io_stripe smap; ++ int mirror_num; ++ struct btrfs_work work; ++}; ++ ++/* ++ * In order to insert checksums into the metadata in large chunks, we wait ++ * until bio submission time. All the pages in the bio are checksummed and ++ * sums are attached onto the ordered extent record. ++ * ++ * At IO completion time the csums attached on the ordered extent record are ++ * inserted into the btree. ++ */ ++static void run_one_async_start(struct btrfs_work *work) ++{ ++ struct async_submit_bio *async = ++ container_of(work, struct async_submit_bio, work); ++ blk_status_t ret; ++ ++ ret = btrfs_bio_csum(async->bbio); ++ if (ret) ++ async->bbio->bio.bi_status = ret; ++} ++ ++/* ++ * In order to insert checksums into the metadata in large chunks, we wait ++ * until bio submission time. All the pages in the bio are checksummed and ++ * sums are attached onto the ordered extent record. ++ * ++ * At IO completion time the csums attached on the ordered extent record are ++ * inserted into the tree. ++ */ ++static void run_one_async_done(struct btrfs_work *work) ++{ ++ struct async_submit_bio *async = ++ container_of(work, struct async_submit_bio, work); ++ struct bio *bio = &async->bbio->bio; ++ ++ /* If an error occurred we just want to clean up the bio and move on. */ ++ if (bio->bi_status) { ++ btrfs_orig_bbio_end_io(async->bbio); ++ return; ++ } ++ ++ /* ++ * All of the bios that pass through here are from async helpers. ++ * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. ++ * This changes nothing when cgroups aren't in use. ++ */ ++ bio->bi_opf |= REQ_CGROUP_PUNT; ++ __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); ++} ++ ++static void run_one_async_free(struct btrfs_work *work) ++{ ++ kfree(container_of(work, struct async_submit_bio, work)); ++} ++ ++static bool should_async_write(struct btrfs_bio *bbio) ++{ ++ /* ++ * If the I/O is not issued by fsync and friends, (->sync_writers != 0), ++ * then try to defer the submission to a workqueue to parallelize the ++ * checksum calculation. 
++ */ ++ if (atomic_read(&bbio->inode->sync_writers)) ++ return false; ++ ++ /* ++ * Submit metadata writes synchronously if the checksum implementation ++ * is fast, or we are on a zoned device that wants I/O to be submitted ++ * in order. ++ */ ++ if (bbio->bio.bi_opf & REQ_META) { ++ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; ++ ++ if (btrfs_is_zoned(fs_info)) ++ return false; ++ if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) ++ return false; ++ } ++ ++ return true; ++} ++ ++/* ++ * Submit bio to an async queue. ++ * ++ * Return true if the work has been succesfuly submitted, else false. ++ */ ++static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, ++ struct btrfs_io_context *bioc, ++ struct btrfs_io_stripe *smap, int mirror_num) ++{ ++ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; ++ struct async_submit_bio *async; ++ ++ async = kmalloc(sizeof(*async), GFP_NOFS); ++ if (!async) ++ return false; ++ ++ async->bbio = bbio; ++ async->bioc = bioc; ++ async->smap = *smap; ++ async->mirror_num = mirror_num; ++ ++ btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, ++ run_one_async_free); ++ if (op_is_sync(bbio->bio.bi_opf)) ++ btrfs_queue_work(fs_info->hipri_workers, &async->work); ++ else ++ btrfs_queue_work(fs_info->workers, &async->work); ++ return true; ++} ++ ++static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) ++{ ++ struct btrfs_bio *bbio = btrfs_bio(bio); ++ struct btrfs_inode *inode = bbio->inode; ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ struct btrfs_bio *orig_bbio = bbio; ++ u64 logical = bio->bi_iter.bi_sector << 9; ++ u64 length = bio->bi_iter.bi_size; ++ u64 map_length = length; ++ bool use_append = btrfs_use_zone_append(bbio); ++ struct btrfs_io_context *bioc = NULL; ++ struct btrfs_io_stripe smap; ++ blk_status_t ret; ++ int error; ++ ++ btrfs_bio_counter_inc_blocked(fs_info); ++ error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, ++ &bioc, &smap, &mirror_num, 1); ++ if (error) { ++ ret = errno_to_blk_status(error); ++ goto fail; ++ } ++ ++ map_length = min(map_length, length); ++ if (use_append) ++ map_length = min(map_length, fs_info->max_zone_append_size); ++ ++ if (map_length < length) { ++ bio = btrfs_split_bio(fs_info, bio, map_length, use_append); ++ bbio = btrfs_bio(bio); ++ } ++ ++ /* ++ * Save the iter for the end_io handler and preload the checksums for ++ * data reads. ++ */ ++ if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) { ++ bbio->saved_iter = bio->bi_iter; ++ ret = btrfs_lookup_bio_sums(bbio); ++ if (ret) ++ goto fail_put_bio; ++ } ++ ++ if (btrfs_op(bio) == BTRFS_MAP_WRITE) { ++ if (use_append) { ++ bio->bi_opf &= ~REQ_OP_WRITE; ++ bio->bi_opf |= REQ_OP_ZONE_APPEND; ++ ret = btrfs_extract_ordered_extent(btrfs_bio(bio)); ++ if (ret) ++ goto fail_put_bio; ++ } ++ ++ /* ++ * Csum items for reloc roots have already been cloned at this ++ * point, so they are handled as part of the no-checksum case. 
++ */ ++ if (!(inode->flags & BTRFS_INODE_NODATASUM) && ++ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && ++ !btrfs_is_data_reloc_root(inode->root)) { ++ if (should_async_write(bbio) && ++ btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) ++ goto done; ++ ++ ret = btrfs_bio_csum(bbio); ++ if (ret) ++ goto fail_put_bio; ++ } ++ } ++ ++ __btrfs_submit_bio(bio, bioc, &smap, mirror_num); ++done: ++ return map_length == length; ++ ++fail_put_bio: ++ if (map_length < length) ++ bio_put(bio); ++fail: ++ btrfs_bio_counter_dec(fs_info); ++ btrfs_bio_end_io(orig_bbio, ret); ++ /* Do not submit another chunk */ ++ return true; ++} ++ ++void btrfs_submit_bio(struct bio *bio, int mirror_num) ++{ ++ while (!btrfs_submit_chunk(bio, mirror_num)) ++ ; ++} ++ + /* + * Submit a repair write. + * +@@ -283,7 +699,7 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror + * RAID setup. Here we only want to write the one bad copy, so we do the + * mapping ourselves and submit the bio directly. + * +- * The I/O is issued sychronously to block the repair read completion from ++ * The I/O is issued synchronously to block the repair read completion from + * freeing the bio. + */ + int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, +@@ -381,10 +797,31 @@ int __init btrfs_bioset_init(void) + offsetof(struct btrfs_bio, bio), + BIOSET_NEED_BVECS)) + return -ENOMEM; ++ if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, ++ offsetof(struct btrfs_bio, bio), 0)) ++ goto out_free_bioset; ++ if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, ++ offsetof(struct btrfs_bio, bio), ++ BIOSET_NEED_BVECS)) ++ goto out_free_clone_bioset; ++ if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, ++ sizeof(struct btrfs_failed_bio))) ++ goto out_free_repair_bioset; + return 0; ++ ++out_free_repair_bioset: ++ bioset_exit(&btrfs_repair_bioset); ++out_free_clone_bioset: ++ bioset_exit(&btrfs_clone_bioset); ++out_free_bioset: ++ bioset_exit(&btrfs_bioset); ++ return -ENOMEM; + } + + void __cold btrfs_bioset_exit(void) + { ++ mempool_exit(&btrfs_failed_bio_pool); ++ bioset_exit(&btrfs_repair_bioset); ++ bioset_exit(&btrfs_clone_bioset); + bioset_exit(&btrfs_bioset); + } +diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h +index b12f84b3b341..873ff85817f0 100644 +--- a/fs/btrfs/bio.h ++++ b/fs/btrfs/bio.h +@@ -26,32 +26,23 @@ struct btrfs_fs_info; + typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); + + /* +- * Additional info to pass along bio. +- * +- * Mostly for btrfs specific features like csum and mirror_num. ++ * Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and ++ * passed to btrfs_submit_bio for mapping to the physical devices. + */ + struct btrfs_bio { +- unsigned int mirror_num:7; +- +- /* +- * Extra indicator for metadata bios. +- * For some btrfs bios they use pages without a mapping, thus +- * we can not rely on page->mapping->host to determine if +- * it's a metadata bio. +- */ +- unsigned int is_metadata:1; +- struct bvec_iter iter; +- +- /* for direct I/O */ ++ /* Inode and offset into it that this I/O operates on. */ ++ struct btrfs_inode *inode; + u64 file_offset; + +- /* @device is for stripe IO submission. */ +- struct btrfs_device *device; + union { +- /* For data checksum verification. */ ++ /* ++ * Data checksumming and original I/O information for internal ++ * use in the btrfs_submit_bio machinery. 
++ */ + struct { + u8 *csum; + u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; ++ struct bvec_iter saved_iter; + }; + + /* For metadata parentness verification. */ +@@ -62,7 +53,9 @@ struct btrfs_bio { + btrfs_bio_end_io_t end_io; + void *private; + +- /* For read end I/O handling */ ++ /* For internal use in read end I/O handling */ ++ unsigned int mirror_num; ++ atomic_t pending_ios; + struct work_struct end_io_work; + + /* +@@ -80,11 +73,11 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) + int __init btrfs_bioset_init(void); + void __cold btrfs_bioset_exit(void); + ++void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, ++ btrfs_bio_end_io_t end_io, void *private); + struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, ++ struct btrfs_inode *inode, + btrfs_bio_end_io_t end_io, void *private); +-struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, +- btrfs_bio_end_io_t end_io, void *private); +- + + static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) + { +@@ -92,34 +85,10 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) + bbio->end_io(bbio); + } + +-static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) +-{ +- if (bbio->is_metadata) +- return; +- if (bbio->csum != bbio->csum_inline) { +- kfree(bbio->csum); +- bbio->csum = NULL; +- } +-} ++/* Bio only refers to one ordered extent. */ ++#define REQ_BTRFS_ONE_ORDERED REQ_DRV + +-/* +- * Iterate through a btrfs_bio (@bbio) on a per-sector basis. +- * +- * bvl - struct bio_vec +- * bbio - struct btrfs_bio +- * iters - struct bvec_iter +- * bio_offset - unsigned int +- */ +-#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ +- for ((iter) = (bbio)->iter, (bio_offset) = 0; \ +- (iter).bi_size && \ +- (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ +- (bio_offset) += fs_info->sectorsize, \ +- bio_advance_iter_single(&(bbio)->bio, &(iter), \ +- (fs_info)->sectorsize)) +- +-void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, +- int mirror_num); ++void btrfs_submit_bio(struct bio *bio, int mirror_num); + int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num); +diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c +index 708d843daa72..5b10401d803b 100644 +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -1,5 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0 + ++#include + #include + #include "misc.h" + #include "ctree.h" +@@ -539,6 +540,153 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end + return total_added; + } + ++/* ++ * Get an arbitrary extent item index / max_index through the block group ++ * ++ * @block_group the block group to sample from ++ * @index: the integral step through the block group to grab from ++ * @max_index: the granularity of the sampling ++ * @key: return value parameter for the item we find ++ * ++ * Pre-conditions on indices: ++ * 0 <= index <= max_index ++ * 0 < max_index ++ * ++ * Returns: 0 on success, 1 if the search didn't yield a useful item, negative ++ * error code on error. 
++ */ ++static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, ++ struct btrfs_block_group *block_group, ++ int index, int max_index, ++ struct btrfs_key *key) ++{ ++ struct btrfs_fs_info *fs_info = block_group->fs_info; ++ struct btrfs_root *extent_root; ++ int ret = 0; ++ u64 search_offset; ++ u64 search_end = block_group->start + block_group->length; ++ struct btrfs_path *path; ++ ++ ASSERT(index >= 0); ++ ASSERT(index <= max_index); ++ ASSERT(max_index > 0); ++ lockdep_assert_held(&caching_ctl->mutex); ++ lockdep_assert_held_read(&fs_info->commit_root_sem); ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, ++ BTRFS_SUPER_INFO_OFFSET)); ++ ++ path->skip_locking = 1; ++ path->search_commit_root = 1; ++ path->reada = READA_FORWARD; ++ ++ search_offset = index * div_u64(block_group->length, max_index); ++ key->objectid = block_group->start + search_offset; ++ key->type = BTRFS_EXTENT_ITEM_KEY; ++ key->offset = 0; ++ ++ while (1) { ++ ret = btrfs_search_forward(extent_root, key, path, 0); ++ if (ret != 0) ++ goto out; ++ /* Success; sampled an extent item in the block group */ ++ if (key->type == BTRFS_EXTENT_ITEM_KEY && ++ key->objectid >= block_group->start && ++ key->objectid + key->offset <= search_end) ++ goto out; ++ ++ /* We can't possibly find a valid extent item anymore */ ++ if (key->objectid >= search_end) { ++ ret = 1; ++ break; ++ } ++ if (key->type < BTRFS_EXTENT_ITEM_KEY) ++ key->type = BTRFS_EXTENT_ITEM_KEY; ++ else ++ key->objectid++; ++ btrfs_release_path(path); ++ up_read(&fs_info->commit_root_sem); ++ mutex_unlock(&caching_ctl->mutex); ++ cond_resched(); ++ mutex_lock(&caching_ctl->mutex); ++ down_read(&fs_info->commit_root_sem); ++ } ++out: ++ lockdep_assert_held(&caching_ctl->mutex); ++ lockdep_assert_held_read(&fs_info->commit_root_sem); ++ btrfs_free_path(path); ++ return ret; ++} ++ ++/* ++ * Best effort attempt to compute a block group's size class while caching it. ++ * ++ * @block_group: the block group we are caching ++ * ++ * We cannot infer the size class while adding free space extents, because that ++ * logic doesn't care about contiguous file extents (it doesn't differentiate ++ * between a 100M extent and 100 contiguous 1M extents). So we need to read the ++ * file extent items. Reading all of them is quite wasteful, because usually ++ * only a handful are enough to give a good answer. Therefore, we just grab 5 of ++ * them at even steps through the block group and pick the smallest size class ++ * we see. Since size class is best effort, and not guaranteed in general, ++ * inaccuracy is acceptable. ++ * ++ * To be more explicit about why this algorithm makes sense: ++ * ++ * If we are caching in a block group from disk, then there are three major cases ++ * to consider: ++ * 1. the block group is well behaved and all extents in it are the same size ++ * class. ++ * 2. the block group is mostly one size class with rare exceptions for last ++ * ditch allocations ++ * 3. the block group was populated before size classes and can have a totally ++ * arbitrary mix of size classes. ++ * ++ * In case 1, looking at any extent in the block group will yield the correct ++ * result. For the mixed cases, taking the minimum size class seems like a good ++ * approximation, since gaps from frees will be usable to the size class. For ++ * 2., a small handful of file extents is likely to yield the right answer. 
For ++ * 3, we can either read every file extent, or admit that this is best effort ++ * anyway and try to stay fast. ++ * ++ * Returns: 0 on success, negative error code on error. ++ */ ++static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl, ++ struct btrfs_block_group *block_group) ++{ ++ struct btrfs_key key; ++ int i; ++ u64 min_size = block_group->length; ++ enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; ++ int ret; ++ ++ if (!btrfs_block_group_should_use_size_class(block_group)) ++ return 0; ++ ++ for (i = 0; i < 5; ++i) { ++ ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); ++ if (ret < 0) ++ goto out; ++ if (ret > 0) ++ continue; ++ min_size = min_t(u64, min_size, key.offset); ++ size_class = btrfs_calc_block_group_size_class(min_size); ++ } ++ if (size_class != BTRFS_BG_SZ_NONE) { ++ spin_lock(&block_group->lock); ++ block_group->size_class = size_class; ++ spin_unlock(&block_group->lock); ++ } ++ ++out: ++ return ret; ++} ++ + static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) + { + struct btrfs_block_group *block_group = caching_ctl->block_group; +@@ -683,6 +831,7 @@ static noinline void caching_thread(struct btrfs_work *work) + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + ++ load_block_group_size_class(caching_ctl, block_group); + if (btrfs_test_opt(fs_info, SPACE_CACHE)) { + ret = load_free_space_cache(block_group); + if (ret == 1) { +@@ -1816,7 +1965,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) + * + * @fs_info: the filesystem + * @chunk_start: logical address of block group +- * @bdev: physical device to resolve, can be NULL to indicate any device + * @physical: physical address to map to logical addresses + * @logical: return array of logical addresses which map to @physical + * @naddrs: length of @logical +@@ -1827,8 +1975,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) + * block copies. 
+ */ + int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, +- struct block_device *bdev, u64 physical, u64 **logical, +- int *naddrs, int *stripe_len) ++ u64 physical, u64 **logical, int *naddrs, int *stripe_len) + { + struct extent_map *em; + struct map_lookup *map; +@@ -1868,9 +2015,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + data_stripe_length)) + continue; + +- if (bdev && map->stripes[i].dev->bdev != bdev) +- continue; +- + stripe_nr = physical - map->stripes[i].physical; + stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); + +@@ -1927,7 +2071,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); +- ret = btrfs_rmap_block(fs_info, cache->start, NULL, ++ ret = btrfs_rmap_block(fs_info, cache->start, + bytenr, &logical, &nr, &stripe_len); + if (ret) + return ret; +@@ -3330,7 +3474,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, + spin_unlock(&info->delalloc_root_lock); + + while (total) { +- bool reclaim; ++ bool reclaim = false; + + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) { +@@ -3379,6 +3523,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, + cache->space_info->disk_used -= num_bytes * factor; + + reclaim = should_reclaim_block_group(cache, num_bytes); ++ + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + +@@ -3433,32 +3578,42 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, + * reservation and return -EAGAIN, otherwise this function always succeeds. + */ + int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, +- u64 ram_bytes, u64 num_bytes, int delalloc) ++ u64 ram_bytes, u64 num_bytes, int delalloc, ++ bool force_wrong_size_class) + { + struct btrfs_space_info *space_info = cache->space_info; ++ enum btrfs_block_group_size_class size_class; + int ret = 0; + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (cache->ro) { + ret = -EAGAIN; +- } else { +- cache->reserved += num_bytes; +- space_info->bytes_reserved += num_bytes; +- trace_btrfs_space_reservation(cache->fs_info, "space_info", +- space_info->flags, num_bytes, 1); +- btrfs_space_info_update_bytes_may_use(cache->fs_info, +- space_info, -ram_bytes); +- if (delalloc) +- cache->delalloc_bytes += num_bytes; ++ goto out; ++ } + +- /* +- * Compression can use less space than we reserved, so wake +- * tickets if that happens +- */ +- if (num_bytes < ram_bytes) +- btrfs_try_granting_tickets(cache->fs_info, space_info); ++ if (btrfs_block_group_should_use_size_class(cache)) { ++ size_class = btrfs_calc_block_group_size_class(num_bytes); ++ ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); ++ if (ret) ++ goto out; + } ++ cache->reserved += num_bytes; ++ space_info->bytes_reserved += num_bytes; ++ trace_btrfs_space_reservation(cache->fs_info, "space_info", ++ space_info->flags, num_bytes, 1); ++ btrfs_space_info_update_bytes_may_use(cache->fs_info, ++ space_info, -ram_bytes); ++ if (delalloc) ++ cache->delalloc_bytes += num_bytes; ++ ++ /* ++ * Compression can use less space than we reserved, so wake tickets if ++ * that happens. 
++ */ ++ if (num_bytes < ram_bytes) ++ btrfs_try_granting_tickets(cache->fs_info, space_info); ++out: + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + return ret; +@@ -4218,3 +4373,73 @@ void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount + bg->swap_extents -= amount; + spin_unlock(&bg->lock); + } ++ ++enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size) ++{ ++ if (size <= SZ_128K) ++ return BTRFS_BG_SZ_SMALL; ++ if (size <= SZ_8M) ++ return BTRFS_BG_SZ_MEDIUM; ++ return BTRFS_BG_SZ_LARGE; ++} ++ ++/* ++ * Handle a block group allocating an extent in a size class ++ * ++ * @bg: The block group we allocated in. ++ * @size_class: The size class of the allocation. ++ * @force_wrong_size_class: Whether we are desperate enough to allow ++ * mismatched size classes. ++ * ++ * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the ++ * case of a race that leads to the wrong size class without ++ * force_wrong_size_class set. ++ * ++ * find_free_extent will skip block groups with a mismatched size class until ++ * it really needs to avoid ENOSPC. In that case it will set ++ * force_wrong_size_class. However, if a block group is newly allocated and ++ * doesn't yet have a size class, then it is possible for two allocations of ++ * different sizes to race and both try to use it. The loser is caught here and ++ * has to retry. ++ */ ++int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, ++ enum btrfs_block_group_size_class size_class, ++ bool force_wrong_size_class) ++{ ++ ASSERT(size_class != BTRFS_BG_SZ_NONE); ++ ++ /* The new allocation is in the right size class, do nothing */ ++ if (bg->size_class == size_class) ++ return 0; ++ /* ++ * The new allocation is in a mismatched size class. ++ * This means one of two things: ++ * ++ * 1. Two tasks in find_free_extent for different size_classes raced ++ * and hit the same empty block_group. Make the loser try again. ++ * 2. A call to find_free_extent got desperate enough to set ++ * 'force_wrong_slab'. Don't change the size_class, but allow the ++ * allocation. ++ */ ++ if (bg->size_class != BTRFS_BG_SZ_NONE) { ++ if (force_wrong_size_class) ++ return 0; ++ return -EAGAIN; ++ } ++ /* ++ * The happy new block group case: the new allocation is the first ++ * one in the block_group so we set size_class. ++ */ ++ bg->size_class = size_class; ++ ++ return 0; ++} ++ ++bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) ++{ ++ if (btrfs_is_zoned(bg->fs_info)) ++ return false; ++ if (!btrfs_is_block_group_data_only(bg)) ++ return false; ++ return true; ++} +diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h +index a02ea76fd6cf..6e4a0b429ac3 100644 +--- a/fs/btrfs/block-group.h ++++ b/fs/btrfs/block-group.h +@@ -12,6 +12,17 @@ enum btrfs_disk_cache_state { + BTRFS_DC_SETUP, + }; + ++enum btrfs_block_group_size_class { ++ /* Unset */ ++ BTRFS_BG_SZ_NONE, ++ /* 0 < size <= 128K */ ++ BTRFS_BG_SZ_SMALL, ++ /* 128K < size <= 8M */ ++ BTRFS_BG_SZ_MEDIUM, ++ /* 8M < size < BG_LENGTH */ ++ BTRFS_BG_SZ_LARGE, ++}; ++ + /* + * This describes the state of the block_group for async discard. 
This is due + * to the two pass nature of it where extent discarding is prioritized over +@@ -233,6 +244,7 @@ struct btrfs_block_group { + struct list_head active_bg_list; + struct work_struct zone_finish_work; + struct extent_buffer *last_eb; ++ enum btrfs_block_group_size_class size_class; + }; + + static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) +@@ -302,7 +314,8 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); + int btrfs_update_block_group(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, bool alloc); + int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, +- u64 ram_bytes, u64 num_bytes, int delalloc); ++ u64 ram_bytes, u64 num_bytes, int delalloc, ++ bool force_wrong_size_class); + void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, + u64 num_bytes, int delalloc); + int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, +@@ -315,8 +328,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); + void btrfs_put_block_group_cache(struct btrfs_fs_info *info); + int btrfs_free_block_groups(struct btrfs_fs_info *info); + int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, +- struct block_device *bdev, u64 physical, u64 **logical, +- int *naddrs, int *stripe_len); ++ u64 physical, u64 **logical, int *naddrs, int *stripe_len); + + static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) + { +@@ -346,4 +358,10 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); + bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg); + void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount); + ++enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); ++int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, ++ enum btrfs_block_group_size_class size_class, ++ bool force_wrong_size_class); ++bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg); ++ + #endif /* BTRFS_BLOCK_GROUP_H */ +diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h +index 195c09e20609..49a92aa65de1 100644 +--- a/fs/btrfs/btrfs_inode.h ++++ b/fs/btrfs/btrfs_inode.h +@@ -93,12 +93,6 @@ struct btrfs_inode { + /* the io_tree does range state (DIRTY, LOCKED etc) */ + struct extent_io_tree io_tree; + +- /* special utility tree used to record which mirrors have already been +- * tried when checksums fail for a given block +- */ +- struct rb_root io_failure_tree; +- spinlock_t io_failure_lock; +- + /* + * Keep track of where the inode has extent items mapped in order to + * make sure the i_size adjustments are accurate +@@ -411,21 +405,11 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, + #define CSUM_FMT "0x%*phN" + #define CSUM_FMT_VALUE(size, bytes) size, bytes + +-void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); +-void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, +- int mirror_num, enum btrfs_compression_type compress_type); +-void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); +-blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); +-blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, +- struct bio *bio, +- u64 dio_file_offset); + int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected); +-int btrfs_check_data_csum(struct 
btrfs_inode *inode, struct btrfs_bio *bbio, +- u32 bio_offset, struct page *page, u32 pgoff); +-unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, +- u32 bio_offset, struct page *page, +- u64 start, u64 end); ++blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio); ++bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, ++ u32 bio_offset, struct bio_vec *bv); + noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes, bool nowait, bool strict); +diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c +index 5122ca79f7ea..f42f31f22d13 100644 +--- a/fs/btrfs/compression.c ++++ b/fs/btrfs/compression.c +@@ -141,12 +141,15 @@ static int compression_decompress(int type, struct list_head *ws, + + static int btrfs_decompress_bio(struct compressed_bio *cb); + +-static void finish_compressed_bio_read(struct compressed_bio *cb) ++static void end_compressed_bio_read(struct btrfs_bio *bbio) + { ++ struct compressed_bio *cb = bbio->private; + unsigned int index; + struct page *page; + +- if (cb->status == BLK_STS_OK) ++ if (bbio->bio.bi_status) ++ cb->status = bbio->bio.bi_status; ++ else + cb->status = errno_to_blk_status(btrfs_decompress_bio(cb)); + + /* Release the compressed pages */ +@@ -162,54 +165,6 @@ static void finish_compressed_bio_read(struct compressed_bio *cb) + /* Finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +-} +- +-/* +- * Verify the checksums and kick off repair if needed on the uncompressed data +- * before decompressing it into the original bio and freeing the uncompressed +- * pages. +- */ +-static void end_compressed_bio_read(struct btrfs_bio *bbio) +-{ +- struct compressed_bio *cb = bbio->private; +- struct inode *inode = cb->inode; +- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +- struct btrfs_inode *bi = BTRFS_I(inode); +- bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) && +- !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); +- blk_status_t status = bbio->bio.bi_status; +- struct bvec_iter iter; +- struct bio_vec bv; +- u32 offset; +- +- btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { +- u64 start = bbio->file_offset + offset; +- +- if (!status && +- (!csum || !btrfs_check_data_csum(bi, bbio, offset, +- bv.bv_page, bv.bv_offset))) { +- btrfs_clean_io_failure(bi, start, bv.bv_page, +- bv.bv_offset); +- } else { +- int ret; +- +- refcount_inc(&cb->pending_ios); +- ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, +- bv.bv_page, bv.bv_offset, +- true); +- if (ret) { +- refcount_dec(&cb->pending_ios); +- status = errno_to_blk_status(ret); +- } +- } +- } +- +- if (status) +- cb->status = status; +- +- if (refcount_dec_and_test(&cb->pending_ios)) +- finish_compressed_bio_read(cb); +- btrfs_bio_free_csum(bbio); + bio_put(&bbio->bio); + } + +@@ -303,68 +258,12 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) + static void end_compressed_bio_write(struct btrfs_bio *bbio) + { + struct compressed_bio *cb = bbio->private; +- +- if (bbio->bio.bi_status) +- cb->status = bbio->bio.bi_status; +- +- if (refcount_dec_and_test(&cb->pending_ios)) { +- struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); +- +- btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio); +- queue_work(fs_info->compressed_write_workers, &cb->write_end_work); +- } +- bio_put(&bbio->bio); +-} +- +-/* +- * Allocate a compressed_bio, which will be used to read/write on-disk +- * (aka, compressed) * 
data. +- * +- * @cb: The compressed_bio structure, which records all the needed +- * information to bind the compressed data to the uncompressed +- * page cache. +- * @disk_byten: The logical bytenr where the compressed data will be read +- * from or written to. +- * @endio_func: The endio function to call after the IO for compressed data +- * is finished. +- * @next_stripe_start: Return value of logical bytenr of where next stripe starts. +- * Let the caller know to only fill the bio up to the stripe +- * boundary. +- */ +- +- +-static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, +- blk_opf_t opf, +- btrfs_bio_end_io_t endio_func, +- u64 *next_stripe_start) +-{ + struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); +- struct btrfs_io_geometry geom; +- struct extent_map *em; +- struct bio *bio; +- int ret; + +- bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb); +- bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; ++ cb->status = bbio->bio.bi_status; ++ queue_work(fs_info->compressed_write_workers, &cb->write_end_work); + +- em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize); +- if (IS_ERR(em)) { +- bio_put(bio); +- return ERR_CAST(em); +- } +- +- if (bio_op(bio) == REQ_OP_ZONE_APPEND) +- bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev); +- +- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom); +- free_extent_map(em); +- if (ret < 0) { +- bio_put(bio); +- return ERR_PTR(ret); +- } +- *next_stripe_start = disk_bytenr + geom.len; +- refcount_inc(&cb->pending_ios); +- return bio; ++ bio_put(&bbio->bio); + } + + /* +@@ -389,18 +288,13 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, + struct bio *bio = NULL; + struct compressed_bio *cb; + u64 cur_disk_bytenr = disk_start; +- u64 next_stripe_start; + blk_status_t ret = BLK_STS_OK; +- int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; +- const bool use_append = btrfs_use_zone_append(inode, disk_start); +- const enum req_op bio_op = use_append ? 
REQ_OP_ZONE_APPEND : REQ_OP_WRITE; + + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(len, fs_info->sectorsize)); + cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); + if (!cb) + return BLK_STS_RESOURCE; +- refcount_set(&cb->pending_ios, 1); + cb->status = BLK_STS_OK; + cb->inode = &inode->vfs_inode; + cb->start = start; +@@ -411,8 +305,16 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, + INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); + cb->nr_pages = nr_pages; + +- if (blkcg_css) ++ if (blkcg_css) { + kthread_associate_blkcg(blkcg_css); ++ write_flags |= REQ_CGROUP_PUNT; ++ } ++ ++ write_flags |= REQ_BTRFS_ONE_ORDERED; ++ bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags, ++ BTRFS_I(cb->inode), end_compressed_bio_write, cb); ++ bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT; ++ btrfs_bio(bio)->file_offset = start; + + while (cur_disk_bytenr < disk_start + compressed_len) { + u64 offset = cur_disk_bytenr - disk_start; +@@ -420,77 +322,30 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, + unsigned int real_size; + unsigned int added; + struct page *page = compressed_pages[index]; +- bool submit = false; +- +- /* Allocate new bio if submitted or not yet allocated */ +- if (!bio) { +- bio = alloc_compressed_bio(cb, cur_disk_bytenr, +- bio_op | write_flags, end_compressed_bio_write, +- &next_stripe_start); +- if (IS_ERR(bio)) { +- ret = errno_to_blk_status(PTR_ERR(bio)); +- break; +- } +- if (blkcg_css) +- bio->bi_opf |= REQ_CGROUP_PUNT; +- } +- /* +- * We should never reach next_stripe_start start as we will +- * submit comp_bio when reach the boundary immediately. +- */ +- ASSERT(cur_disk_bytenr != next_stripe_start); + + /* + * We have various limits on the real read size: +- * - stripe boundary + * - page boundary + * - compressed length boundary + */ +- real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr); +- real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); ++ real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, real_size, compressed_len - offset); + ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); + +- if (use_append) +- added = bio_add_zone_append_page(bio, page, real_size, +- offset_in_page(offset)); +- else +- added = bio_add_page(bio, page, real_size, +- offset_in_page(offset)); +- /* Reached zoned boundary */ +- if (added == 0) +- submit = true; +- ++ added = bio_add_page(bio, page, real_size, offset_in_page(offset)); ++ /* ++ * Maximum compressed extent is smaller than bio size limit, ++ * thus bio_add_page() should always success. ++ */ ++ ASSERT(added == real_size); + cur_disk_bytenr += added; +- /* Reached stripe boundary */ +- if (cur_disk_bytenr == next_stripe_start) +- submit = true; +- +- /* Finished the range */ +- if (cur_disk_bytenr == disk_start + compressed_len) +- submit = true; +- +- if (submit) { +- if (!skip_sum) { +- ret = btrfs_csum_one_bio(inode, bio, start, true); +- if (ret) { +- btrfs_bio_end_io(btrfs_bio(bio), ret); +- break; +- } +- } +- +- ASSERT(bio->bi_iter.bi_size); +- btrfs_submit_bio(fs_info, bio, 0); +- bio = NULL; +- } +- cond_resched(); + } + ++ /* Finished the range. 
*/ ++ ASSERT(bio->bi_iter.bi_size); ++ btrfs_submit_bio(bio, 0); + if (blkcg_css) + kthread_associate_blkcg(NULL); +- +- if (refcount_dec_and_test(&cb->pending_ios)) +- finish_compressed_bio_write(cb); + return ret; + } + +@@ -667,10 +522,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + struct extent_map_tree *em_tree; + struct compressed_bio *cb; + unsigned int compressed_len; +- struct bio *comp_bio = NULL; ++ struct bio *comp_bio; + const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 cur_disk_byte = disk_bytenr; +- u64 next_stripe_start; + u64 file_offset; + u64 em_len; + u64 em_start; +@@ -703,7 +557,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + goto out; + } + +- refcount_set(&cb->pending_ios, 1); + cb->status = BLK_STS_OK; + cb->inode = inode; + +@@ -737,37 +590,23 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + /* include any pages we added in add_ra-bio_pages */ + cb->len = bio->bi_iter.bi_size; + ++ comp_bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, BTRFS_I(cb->inode), ++ end_compressed_bio_read, cb); ++ comp_bio->bi_iter.bi_sector = (cur_disk_byte >> SECTOR_SHIFT); ++ + while (cur_disk_byte < disk_bytenr + compressed_len) { + u64 offset = cur_disk_byte - disk_bytenr; + unsigned int index = offset >> PAGE_SHIFT; + unsigned int real_size; + unsigned int added; + struct page *page = cb->compressed_pages[index]; +- bool submit = false; +- +- /* Allocate new bio if submitted or not yet allocated */ +- if (!comp_bio) { +- comp_bio = alloc_compressed_bio(cb, cur_disk_byte, +- REQ_OP_READ, end_compressed_bio_read, +- &next_stripe_start); +- if (IS_ERR(comp_bio)) { +- cb->status = errno_to_blk_status(PTR_ERR(comp_bio)); +- break; +- } +- } +- /* +- * We should never reach next_stripe_start start as we will +- * submit comp_bio when reach the boundary immediately. +- */ +- ASSERT(cur_disk_byte != next_stripe_start); ++ + /* + * We have various limit on the real read size: +- * - stripe boundary + * - page boundary + * - compressed length boundary + */ +- real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte); +- real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); ++ real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, real_size, compressed_len - offset); + ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); + +@@ -778,45 +617,20 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + */ + ASSERT(added == real_size); + cur_disk_byte += added; +- +- /* Reached stripe boundary, need to submit */ +- if (cur_disk_byte == next_stripe_start) +- submit = true; +- +- /* Has finished the range, need to submit */ +- if (cur_disk_byte == disk_bytenr + compressed_len) +- submit = true; +- +- if (submit) { +- /* Save the original iter for read repair */ +- if (bio_op(comp_bio) == REQ_OP_READ) +- btrfs_bio(comp_bio)->iter = comp_bio->bi_iter; +- +- /* +- * Save the initial offset of this chunk, as there +- * is no direct correlation between compressed pages and +- * the original file offset. The field is only used for +- * priting error messages. 
+- */ +- btrfs_bio(comp_bio)->file_offset = file_offset; +- +- ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL); +- if (ret) { +- btrfs_bio_end_io(btrfs_bio(comp_bio), ret); +- break; +- } +- +- ASSERT(comp_bio->bi_iter.bi_size); +- btrfs_submit_bio(fs_info, comp_bio, mirror_num); +- comp_bio = NULL; +- } + } + + if (memstall) + psi_memstall_leave(&pflags); + +- if (refcount_dec_and_test(&cb->pending_ios)) +- finish_compressed_bio_read(cb); ++ /* ++ * Stash the initial offset of this chunk, as there is no direct ++ * correlation between compressed pages and the original file offset. ++ * The field is only used for printing error messages anyway. ++ */ ++ btrfs_bio(comp_bio)->file_offset = file_offset; ++ ++ ASSERT(comp_bio->bi_iter.bi_size); ++ btrfs_submit_bio(comp_bio, mirror_num); + return; + + fail: +@@ -1609,7 +1423,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, + index_end = end >> PAGE_SHIFT; + + /* Don't miss unaligned end */ +- if (!IS_ALIGNED(end, PAGE_SIZE)) ++ if (!PAGE_ALIGNED(end)) + index_end++; + + curr_sample_pos = 0; +@@ -1642,7 +1456,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, + * + * For now is's a naive and optimistic 'return true', we'll extend the logic to + * quickly (compared to direct compression) detect data characteristics +- * (compressible/uncompressible) to avoid wasting CPU time on uncompressible ++ * (compressible/incompressible) to avoid wasting CPU time on incompressible + * data. + * + * The following types of analysis can be performed: +diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h +index 6209d40a1e08..a5e3377db9ad 100644 +--- a/fs/btrfs/compression.h ++++ b/fs/btrfs/compression.h +@@ -31,9 +31,6 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); + #define BTRFS_ZLIB_DEFAULT_LEVEL 3 + + struct compressed_bio { +- /* Number of outstanding bios */ +- refcount_t pending_ios; +- + /* Number of compressed pages in the array */ + unsigned int nr_pages; + +diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c +index 4754c9101a4c..a5b6bb54545f 100644 +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -484,7 +484,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, + if (ret) + return ret; + } +- btrfs_clean_tree_block(buf); ++ btrfs_clear_buffer_dirty(trans, buf); + *last_ref = 1; + } + return 0; +@@ -853,8 +853,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, + /* + * Search for a key in the given extent_buffer. + * +- * The lower boundary for the search is specified by the slot number @low. Use a +- * value of 0 to search over the whole extent buffer. ++ * The lower boundary for the search is specified by the slot number @first_slot. ++ * Use a value of 0 to search over the whole extent buffer. + * + * The slot in the extent buffer is returned via @slot. If the key exists in the + * extent buffer, then @slot will point to the slot where the key is, otherwise +@@ -863,18 +863,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, + * Slot may point to the total number of items (i.e. one position beyond the last + * key) if the key is bigger than the last key in the extent buffer. 
+ */ +-static noinline int generic_bin_search(struct extent_buffer *eb, int low, +- const struct btrfs_key *key, int *slot) ++int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, ++ const struct btrfs_key *key, int *slot) + { + unsigned long p; + int item_size; +- int high = btrfs_header_nritems(eb); ++ /* ++ * Use unsigned types for the low and high slots, so that we get a more ++ * efficient division in the search loop below. ++ */ ++ u32 low = first_slot; ++ u32 high = btrfs_header_nritems(eb); + int ret; + const int key_size = sizeof(struct btrfs_disk_key); + +- if (low > high) { ++ if (unlikely(low > high)) { + btrfs_err(eb->fs_info, +- "%s: low (%d) > high (%d) eb %llu owner %llu level %d", ++ "%s: low (%u) > high (%u) eb %llu owner %llu level %d", + __func__, low, high, eb->start, + btrfs_header_owner(eb), btrfs_header_level(eb)); + return -EINVAL; +@@ -925,16 +930,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb, int low, + return 1; + } + +-/* +- * Simple binary search on an extent buffer. Works for both leaves and nodes, and +- * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). +- */ +-int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, +- int *slot) +-{ +- return generic_bin_search(eb, 0, key, slot); +-} +- + static void root_add_used(struct btrfs_root *root, u32 size) + { + spin_lock(&root->accounting_lock); +@@ -1054,7 +1049,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, + + path->locks[level] = 0; + path->nodes[level] = NULL; +- btrfs_clean_tree_block(mid); ++ btrfs_clear_buffer_dirty(trans, mid); + btrfs_tree_unlock(mid); + /* once for the path */ + free_extent_buffer(mid); +@@ -1115,7 +1110,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, + if (wret < 0 && wret != -ENOSPC) + ret = wret; + if (btrfs_header_nritems(right) == 0) { +- btrfs_clean_tree_block(right); ++ btrfs_clear_buffer_dirty(trans, right); + btrfs_tree_unlock(right); + del_ptr(root, path, level + 1, pslot + 1); + root_sub_used(root, right->len); +@@ -1161,7 +1156,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, + BUG_ON(wret == 1); + } + if (btrfs_header_nritems(mid) == 0) { +- btrfs_clean_tree_block(mid); ++ btrfs_clear_buffer_dirty(trans, mid); + btrfs_tree_unlock(mid); + del_ptr(root, path, level + 1, pslot); + root_sub_used(root, mid->len); +@@ -1869,7 +1864,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb, + return 0; + } + +- return generic_bin_search(eb, search_low_slot, key, slot); ++ return btrfs_generic_bin_search(eb, search_low_slot, key, slot); + } + + static int search_leaf(struct btrfs_trans_handle *trans, +@@ -3041,7 +3036,8 @@ noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) + * min slot controls the lowest index we're willing to push to the + * right. 
We'll push up to and including min_slot, but no lower + */ +-static noinline int __push_leaf_right(struct btrfs_path *path, ++static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, ++ struct btrfs_path *path, + int data_size, int empty, + struct extent_buffer *right, + int free_space, u32 left_nritems, +@@ -3139,7 +3135,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, + if (left_nritems) + btrfs_mark_buffer_dirty(left); + else +- btrfs_clean_tree_block(left); ++ btrfs_clear_buffer_dirty(trans, left); + + btrfs_mark_buffer_dirty(right); + +@@ -3151,7 +3147,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, + if (path->slots[0] >= left_nritems) { + path->slots[0] -= left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) +- btrfs_clean_tree_block(path->nodes[0]); ++ btrfs_clear_buffer_dirty(trans, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; +@@ -3243,8 +3239,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + return 0; + } + +- return __push_leaf_right(path, min_data_size, empty, +- right, free_space, left_nritems, min_slot); ++ return __push_leaf_right(trans, path, min_data_size, empty, right, ++ free_space, left_nritems, min_slot); + out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); +@@ -3259,7 +3255,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the + * items + */ +-static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, ++static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, ++ struct btrfs_path *path, int data_size, + int empty, struct extent_buffer *left, + int free_space, u32 right_nritems, + u32 max_slot) +@@ -3363,7 +3360,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, + if (right_nritems) + btrfs_mark_buffer_dirty(right); + else +- btrfs_clean_tree_block(right); ++ btrfs_clear_buffer_dirty(trans, right); + + btrfs_item_key(right, &disk_key, 0); + fixup_low_keys(path, &disk_key, 1); +@@ -3449,9 +3446,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + ret = -EUCLEAN; + goto out; + } +- return __push_leaf_left(path, min_data_size, +- empty, left, free_space, right_nritems, +- max_slot); ++ return __push_leaf_left(trans, path, min_data_size, empty, left, ++ free_space, right_nritems, max_slot); + out: + btrfs_tree_unlock(left); + free_extent_buffer(left); +@@ -4400,7 +4396,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + if (leaf == root->node) { + btrfs_set_header_level(leaf, 0); + } else { +- btrfs_clean_tree_block(leaf); ++ btrfs_clear_buffer_dirty(trans, leaf); + btrfs_del_leaf(trans, root, path, leaf); + } + } else { +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h +index 6965703a81b6..97897107fab5 100644 +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -507,6 +507,21 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); + /* ctree.c */ + int __init btrfs_ctree_init(void); + void __cold btrfs_ctree_exit(void); ++ ++int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, ++ const struct btrfs_key *key, int *slot); ++ ++/* ++ * Simple binary search on an extent buffer. Works for both leaves and nodes, and ++ * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). 
++ */ ++static inline int btrfs_bin_search(struct extent_buffer *eb, ++ const struct btrfs_key *key, ++ int *slot) ++{ ++ return btrfs_generic_bin_search(eb, 0, key, slot); ++} ++ + int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, + int *slot); + int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); +diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c +index d81b764a7644..8065341d831a 100644 +--- a/fs/btrfs/defrag.c ++++ b/fs/btrfs/defrag.c +@@ -765,7 +765,7 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t i + break; + + unlock_page(page); +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + lock_page(page); + /* +@@ -999,7 +999,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, + } + + #define CLUSTER_SIZE (SZ_256K) +-static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); ++static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); + + /* + * Defrag one contiguous target range. +diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c +index 573ebab886e2..886ffb232eac 100644 +--- a/fs/btrfs/delayed-ref.c ++++ b/fs/btrfs/delayed-ref.c +@@ -437,8 +437,7 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, + return 0; + } + +-static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, +- struct btrfs_delayed_ref_root *delayed_refs, ++static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *ref) + { +@@ -452,8 +451,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, + atomic_dec(&delayed_refs->num_entries); + } + +-static bool merge_ref(struct btrfs_trans_handle *trans, +- struct btrfs_delayed_ref_root *delayed_refs, ++static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *ref, + u64 seq) +@@ -482,10 +480,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, + mod = -next->ref_mod; + } + +- drop_delayed_ref(trans, delayed_refs, head, next); ++ drop_delayed_ref(delayed_refs, head, next); + ref->ref_mod += mod; + if (ref->ref_mod == 0) { +- drop_delayed_ref(trans, delayed_refs, head, ref); ++ drop_delayed_ref(delayed_refs, head, ref); + done = true; + } else { + /* +@@ -499,11 +497,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, + return done; + } + +-void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, ++void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) + { +- struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_node *ref; + struct rb_node *node; + u64 seq = 0; +@@ -524,7 +521,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); + if (seq && ref->seq >= seq) + continue; +- if (merge_ref(trans, delayed_refs, head, ref, seq)) ++ if (merge_ref(delayed_refs, head, ref, seq)) + goto again; + } + } +@@ -601,8 +598,7 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + * Return 0 for insert. + * Return >0 for merge. 
+ */ +-static int insert_delayed_ref(struct btrfs_trans_handle *trans, +- struct btrfs_delayed_ref_root *root, ++static int insert_delayed_ref(struct btrfs_delayed_ref_root *root, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *ref) + { +@@ -641,7 +637,7 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans, + + /* remove existing tail if its ref_mod is zero */ + if (exist->ref_mod == 0) +- drop_delayed_ref(trans, root, href, exist); ++ drop_delayed_ref(root, href, exist); + spin_unlock(&href->lock); + return ret; + inserted: +@@ -978,7 +974,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + head_ref = add_delayed_ref_head(trans, head_ref, record, + action, &qrecord_inserted); + +- ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); ++ ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + spin_unlock(&delayed_refs->lock); + + /* +@@ -1070,7 +1066,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + head_ref = add_delayed_ref_head(trans, head_ref, record, + action, &qrecord_inserted); + +- ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); ++ ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + spin_unlock(&delayed_refs->lock); + + /* +diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h +index d6304b690ec4..2eb34abf700f 100644 +--- a/fs/btrfs/delayed-ref.h ++++ b/fs/btrfs/delayed-ref.h +@@ -357,7 +357,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op); +-void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, ++void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); + +diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c +index ff2e524d9937..317aeff6c1da 100644 +--- a/fs/btrfs/discard.c ++++ b/fs/btrfs/discard.c +@@ -78,6 +78,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, + static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) + { ++ lockdep_assert_held(&discard_ctl->lock); + if (!btrfs_run_discard_work(discard_ctl)) + return; + +@@ -89,6 +90,8 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, + BTRFS_DISCARD_DELAY); + block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; + } ++ if (list_empty(&block_group->discard_list)) ++ btrfs_get_block_group(block_group); + + list_move_tail(&block_group->discard_list, + get_discard_list(discard_ctl, block_group)); +@@ -108,8 +111,12 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, + static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) + { ++ bool queued; ++ + spin_lock(&discard_ctl->lock); + ++ queued = !list_empty(&block_group->discard_list); ++ + if (!btrfs_run_discard_work(discard_ctl)) { + spin_unlock(&discard_ctl->lock); + return; +@@ -121,6 +128,8 @@ static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, + block_group->discard_eligible_time = (ktime_get_ns() + + BTRFS_DISCARD_UNUSED_DELAY); + block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; ++ if (!queued) ++ btrfs_get_block_group(block_group); + list_add_tail(&block_group->discard_list, + &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]); + +@@ 
-131,6 +140,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) + { + bool running = false; ++ bool queued = false; + + spin_lock(&discard_ctl->lock); + +@@ -140,7 +150,16 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, + } + + block_group->discard_eligible_time = 0; ++ queued = !list_empty(&block_group->discard_list); + list_del_init(&block_group->discard_list); ++ /* ++ * If the block group is currently running in the discard workfn, we ++ * don't want to deref it, since it's still being used by the workfn. ++ * The workfn will notice this case and deref the block group when it is ++ * finished. ++ */ ++ if (queued && !running) ++ btrfs_put_block_group(block_group); + + spin_unlock(&discard_ctl->lock); + +@@ -214,10 +233,12 @@ static struct btrfs_block_group *peek_discard_list( + if (block_group && now >= block_group->discard_eligible_time) { + if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && + block_group->used != 0) { +- if (btrfs_is_block_group_data_only(block_group)) ++ if (btrfs_is_block_group_data_only(block_group)) { + __add_to_discard_list(discard_ctl, block_group); +- else ++ } else { + list_del_init(&block_group->discard_list); ++ btrfs_put_block_group(block_group); ++ } + goto again; + } + if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { +@@ -511,6 +532,15 @@ static void btrfs_discard_workfn(struct work_struct *work) + spin_lock(&discard_ctl->lock); + discard_ctl->prev_discard = trimmed; + discard_ctl->prev_discard_time = now; ++ /* ++ * If the block group was removed from the discard list while it was ++ * running in this workfn, then we didn't deref it, since this function ++ * still owned that reference. But we set the discard_ctl->block_group ++ * back to NULL, so we can use that condition to know that now we need ++ * to deref the block_group. ++ */ ++ if (discard_ctl->block_group == NULL) ++ btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + __btrfs_discard_schedule_work(discard_ctl, now, false); + spin_unlock(&discard_ctl->lock); +@@ -651,8 +681,12 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) + list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, + bg_list) { + list_del_init(&block_group->bg_list); +- btrfs_put_block_group(block_group); + btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); ++ /* ++ * This put is for the get done by btrfs_mark_bg_unused. ++ * Queueing discard incremented it for discard's reference. ++ */ ++ btrfs_put_block_group(block_group); + } + spin_unlock(&fs_info->unused_bgs_lock); + } +@@ -683,6 +717,7 @@ static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) + if (block_group->used == 0) + btrfs_mark_bg_unused(block_group); + spin_lock(&discard_ctl->lock); ++ btrfs_put_block_group(block_group); + } + } + spin_unlock(&discard_ctl->lock); +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index 3aa04224315e..b53f0e30ce2b 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -78,23 +78,6 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) + crypto_free_shash(fs_info->csum_shash); + } + +-/* +- * async submit bios are used to offload expensive checksumming +- * onto the worker threads. They checksum file and metadata bios +- * just before they are sent down the IO stack. 
+- */ +-struct async_submit_bio { +- struct btrfs_inode *inode; +- struct bio *bio; +- enum btrfs_wq_submit_cmd submit_cmd; +- int mirror_num; +- +- /* Optional parameter for used by direct io */ +- u64 dio_file_offset; +- struct btrfs_work work; +- blk_status_t status; +-}; +- + /* + * Compute the csum of a btree block and store the result to provided buffer. + */ +@@ -455,6 +438,22 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec + return csum_one_extent_buffer(eb); + } + ++blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) ++{ ++ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; ++ struct bvec_iter iter; ++ struct bio_vec bv; ++ int ret = 0; ++ ++ bio_for_each_segment(bv, &bbio->bio, iter) { ++ ret = csum_dirty_buffer(fs_info, &bv); ++ if (ret) ++ break; ++ } ++ ++ return errno_to_blk_status(ret); ++} ++ + static int check_tree_block_fsid(struct extent_buffer *eb) + { + struct btrfs_fs_info *fs_info = eb->fs_info; +@@ -700,172 +699,6 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, + return ret; + } + +-static void run_one_async_start(struct btrfs_work *work) +-{ +- struct async_submit_bio *async; +- blk_status_t ret; +- +- async = container_of(work, struct async_submit_bio, work); +- switch (async->submit_cmd) { +- case WQ_SUBMIT_METADATA: +- ret = btree_submit_bio_start(async->bio); +- break; +- case WQ_SUBMIT_DATA: +- ret = btrfs_submit_bio_start(async->inode, async->bio); +- break; +- case WQ_SUBMIT_DATA_DIO: +- ret = btrfs_submit_bio_start_direct_io(async->inode, +- async->bio, async->dio_file_offset); +- break; +- } +- if (ret) +- async->status = ret; +-} +- +-/* +- * In order to insert checksums into the metadata in large chunks, we wait +- * until bio submission time. All the pages in the bio are checksummed and +- * sums are attached onto the ordered extent record. +- * +- * At IO completion time the csums attached on the ordered extent record are +- * inserted into the tree. +- */ +-static void run_one_async_done(struct btrfs_work *work) +-{ +- struct async_submit_bio *async = +- container_of(work, struct async_submit_bio, work); +- struct btrfs_inode *inode = async->inode; +- struct btrfs_bio *bbio = btrfs_bio(async->bio); +- +- /* If an error occurred we just want to clean up the bio and move on */ +- if (async->status) { +- btrfs_bio_end_io(bbio, async->status); +- return; +- } +- +- /* +- * All of the bios that pass through here are from async helpers. +- * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. +- * This changes nothing when cgroups aren't in use. +- */ +- async->bio->bi_opf |= REQ_CGROUP_PUNT; +- btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num); +-} +- +-static void run_one_async_free(struct btrfs_work *work) +-{ +- struct async_submit_bio *async; +- +- async = container_of(work, struct async_submit_bio, work); +- kfree(async); +-} +- +-/* +- * Submit bio to an async queue. 
+- * +- * Retrun: +- * - true if the work has been succesfuly submitted +- * - false in case of error +- */ +-bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, +- u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) +-{ +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct async_submit_bio *async; +- +- async = kmalloc(sizeof(*async), GFP_NOFS); +- if (!async) +- return false; +- +- async->inode = inode; +- async->bio = bio; +- async->mirror_num = mirror_num; +- async->submit_cmd = cmd; +- +- btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, +- run_one_async_free); +- +- async->dio_file_offset = dio_file_offset; +- +- async->status = 0; +- +- if (op_is_sync(bio->bi_opf)) +- btrfs_queue_work(fs_info->hipri_workers, &async->work); +- else +- btrfs_queue_work(fs_info->workers, &async->work); +- return true; +-} +- +-static blk_status_t btree_csum_one_bio(struct bio *bio) +-{ +- struct bio_vec *bvec; +- struct btrfs_root *root; +- int ret = 0; +- struct bvec_iter_all iter_all; +- +- ASSERT(!bio_flagged(bio, BIO_CLONED)); +- bio_for_each_segment_all(bvec, bio, iter_all) { +- root = BTRFS_I(bvec->bv_page->mapping->host)->root; +- ret = csum_dirty_buffer(root->fs_info, bvec); +- if (ret) +- break; +- } +- +- return errno_to_blk_status(ret); +-} +- +-blk_status_t btree_submit_bio_start(struct bio *bio) +-{ +- /* +- * when we're called for a write, we're already in the async +- * submission context. Just jump into btrfs_submit_bio. +- */ +- return btree_csum_one_bio(bio); +-} +- +-static bool should_async_write(struct btrfs_fs_info *fs_info, +- struct btrfs_inode *bi) +-{ +- if (btrfs_is_zoned(fs_info)) +- return false; +- if (atomic_read(&bi->sync_writers)) +- return false; +- if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) +- return false; +- return true; +-} +- +-void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) +-{ +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct btrfs_bio *bbio = btrfs_bio(bio); +- blk_status_t ret; +- +- bio->bi_opf |= REQ_META; +- bbio->is_metadata = 1; +- +- if (btrfs_op(bio) != BTRFS_MAP_WRITE) { +- btrfs_submit_bio(fs_info, bio, mirror_num); +- return; +- } +- +- /* +- * Kthread helpers are used to submit writes so that checksumming can +- * happen in parallel across all CPUs. 
+- */ +- if (should_async_write(fs_info, inode) && +- btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) +- return; +- +- ret = btree_csum_one_bio(bio); +- if (ret) { +- btrfs_bio_end_io(bbio, ret); +- return; +- } +- +- btrfs_submit_bio(fs_info, bio, mirror_num); +-} +- + #ifdef CONFIG_MIGRATION + static int btree_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) +@@ -1035,22 +868,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, + + } + +-void btrfs_clean_tree_block(struct extent_buffer *buf) +-{ +- struct btrfs_fs_info *fs_info = buf->fs_info; +- if (btrfs_header_generation(buf) == +- fs_info->running_transaction->transid) { +- btrfs_assert_tree_write_locked(buf); +- +- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { +- percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, +- -buf->len, +- fs_info->dirty_metadata_batch); +- clear_extent_buffer_dirty(buf); +- } +- } +-} +- + static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, + u64 objectid) + { +@@ -1910,6 +1727,9 @@ static int cleaner_kthread(void *arg) + goto sleep; + } + ++ if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags)) ++ btrfs_sysfs_feature_update(fs_info); ++ + btrfs_run_delayed_iputs(fs_info); + + again = btrfs_clean_one_deleted_snapshot(fs_info); +@@ -5159,11 +4979,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, + start += fs_info->nodesize; + if (!eb) + continue; ++ ++ btrfs_tree_lock(eb); + wait_on_extent_buffer_writeback(eb); ++ btrfs_clear_buffer_dirty(NULL, eb); ++ btrfs_tree_unlock(eb); + +- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, +- &eb->bflags)) +- clear_extent_buffer_dirty(eb); + free_extent_buffer_stale(eb); + } + } +diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h +index f2f295eb6103..4d5772330110 100644 +--- a/fs/btrfs/disk-io.h ++++ b/fs/btrfs/disk-io.h +@@ -39,7 +39,8 @@ struct extent_buffer *btrfs_find_create_tree_block( + struct btrfs_fs_info *fs_info, + u64 bytenr, u64 owner_root, + int level); +-void btrfs_clean_tree_block(struct extent_buffer *buf); ++void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, ++ struct extent_buffer *buf); + void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); + int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); + int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, +@@ -86,7 +87,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, + int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, + struct page *page, u64 start, u64 end, + int mirror); +-void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); + #endif +@@ -114,15 +114,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, + int btrfs_read_extent_buffer(struct extent_buffer *buf, + struct btrfs_tree_parent_check *check); + +-enum btrfs_wq_submit_cmd { +- WQ_SUBMIT_METADATA, +- WQ_SUBMIT_DATA, +- WQ_SUBMIT_DATA_DIO, +-}; +- +-bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, +- u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); +-blk_status_t btree_submit_bio_start(struct bio *bio); ++blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); + int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + int 
btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, +diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c +index 3c7766dfaa69..29a225836e28 100644 +--- a/fs/btrfs/extent-io-tree.c ++++ b/fs/btrfs/extent-io-tree.c +@@ -972,8 +972,8 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + { + struct extent_state *state; + struct extent_state *prealloc = NULL; +- struct rb_node **p; +- struct rb_node *parent; ++ struct rb_node **p = NULL; ++ struct rb_node *parent = NULL; + int err = 0; + u64 last_start; + u64 last_end; +@@ -1218,8 +1218,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + { + struct extent_state *state; + struct extent_state *prealloc = NULL; +- struct rb_node **p; +- struct rb_node *parent; ++ struct rb_node **p = NULL; ++ struct rb_node *parent = NULL; + int err = 0; + u64 last_start; + u64 last_end; +@@ -1625,7 +1625,7 @@ u64 count_range_bits(struct extent_io_tree *tree, + } + + /* +- * Searche a range in the state tree for a given mask. If 'filled' == 1, this ++ * Search a range in the state tree for a given mask. If 'filled' == 1, this + * returns 1 only if every extent in the tree has the bits set. Otherwise, 1 + * is returned if any bit in the range is found set. + */ +diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h +index e3eeec380844..21766e49ec02 100644 +--- a/fs/btrfs/extent-io-tree.h ++++ b/fs/btrfs/extent-io-tree.h +@@ -6,7 +6,6 @@ + #include "misc.h" + + struct extent_changeset; +-struct io_failure_record; + + /* Bits for the extent state */ + enum { +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 72ba13b027a9..824c657f59e8 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -16,7 +16,8 @@ + #include + #include + #include +-#include "misc.h" ++#include "ctree.h" ++#include "extent-tree.h" + #include "tree-log.h" + #include "disk-io.h" + #include "print-tree.h" +@@ -31,14 +32,12 @@ + #include "space-info.h" + #include "block-rsv.h" + #include "delalloc-space.h" +-#include "block-group.h" + #include "discard.h" + #include "rcu-string.h" + #include "zoned.h" + #include "dev-replace.h" + #include "fs.h" + #include "accessors.h" +-#include "extent-tree.h" + #include "root-tree.h" + #include "file-item.h" + #include "orphan.h" +@@ -1966,7 +1965,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, + cond_resched(); + + spin_lock(&locked_ref->lock); +- btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); ++ btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); + } + + return 0; +@@ -2013,7 +2012,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + * insert_inline_extent_backref()). 
+ */ + spin_lock(&locked_ref->lock); +- btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); ++ btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); + + ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, + &actual_count); +@@ -3385,7 +3384,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) + enum btrfs_loop_type { + LOOP_CACHING_NOWAIT, + LOOP_CACHING_WAIT, ++ LOOP_UNSET_SIZE_CLASS, + LOOP_ALLOC_CHUNK, ++ LOOP_WRONG_SIZE_CLASS, + LOOP_NO_EMPTY_SIZE, + }; + +@@ -3453,81 +3454,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, + btrfs_put_block_group(cache); + } + +-enum btrfs_extent_allocation_policy { +- BTRFS_EXTENT_ALLOC_CLUSTERED, +- BTRFS_EXTENT_ALLOC_ZONED, +-}; +- +-/* +- * Structure used internally for find_free_extent() function. Wraps needed +- * parameters. +- */ +-struct find_free_extent_ctl { +- /* Basic allocation info */ +- u64 ram_bytes; +- u64 num_bytes; +- u64 min_alloc_size; +- u64 empty_size; +- u64 flags; +- int delalloc; +- +- /* Where to start the search inside the bg */ +- u64 search_start; +- +- /* For clustered allocation */ +- u64 empty_cluster; +- struct btrfs_free_cluster *last_ptr; +- bool use_cluster; +- +- bool have_caching_bg; +- bool orig_have_caching_bg; +- +- /* Allocation is called for tree-log */ +- bool for_treelog; +- +- /* Allocation is called for data relocation */ +- bool for_data_reloc; +- +- /* RAID index, converted from flags */ +- int index; +- +- /* +- * Current loop number, check find_free_extent_update_loop() for details +- */ +- int loop; +- +- /* +- * Whether we're refilling a cluster, if true we need to re-search +- * current block group but don't try to refill the cluster again. +- */ +- bool retry_clustered; +- +- /* +- * Whether we're updating free space cache, if true we need to re-search +- * current block group but don't try updating free space cache again. +- */ +- bool retry_unclustered; +- +- /* If current block group is cached */ +- int cached; +- +- /* Max contiguous hole found */ +- u64 max_extent_size; +- +- /* Total free space from free space cache, not always contiguous */ +- u64 total_free_space; +- +- /* Found result */ +- u64 found_offset; +- +- /* Hint where to start looking for an empty space */ +- u64 hint_byte; +- +- /* Allocation policy */ +- enum btrfs_extent_allocation_policy policy; +-}; +- +- + /* + * Helper function for find_free_extent(). 
+ * +@@ -3559,8 +3485,7 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, + if (offset) { + /* We have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); +- trace_btrfs_reserve_extent_cluster(cluster_bg, +- ffe_ctl->search_start, ffe_ctl->num_bytes); ++ trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); + *cluster_bg_ret = cluster_bg; + ffe_ctl->found_offset = offset; + return 0; +@@ -3610,10 +3535,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, + if (offset) { + /* We found one, proceed */ + spin_unlock(&last_ptr->refill_lock); +- trace_btrfs_reserve_extent_cluster(bg, +- ffe_ctl->search_start, +- ffe_ctl->num_bytes); + ffe_ctl->found_offset = offset; ++ trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); + return 0; + } + } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && +@@ -4028,24 +3951,6 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, + } + } + +-static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) +-{ +- switch (ffe_ctl->policy) { +- case BTRFS_EXTENT_ALLOC_CLUSTERED: +- /* +- * If we can't allocate a new chunk we've already looped through +- * at least once, move on to the NO_EMPTY_SIZE case. +- */ +- ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; +- return 0; +- case BTRFS_EXTENT_ALLOC_ZONED: +- /* Give up here */ +- return -ENOSPC; +- default: +- BUG(); +- } +-} +- + /* + * Return >0 means caller needs to re-search for free extent + * Return 0 means we have the needed free extent. +@@ -4079,31 +3984,28 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, + * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking + * caching kthreads as we move along + * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching ++ * LOOP_UNSET_SIZE_CLASS, allow unset size class + * LOOP_ALLOC_CHUNK, force a chunk allocation and try again + * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try + * again + */ + if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { + ffe_ctl->index = 0; +- if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { +- /* +- * We want to skip the LOOP_CACHING_WAIT step if we +- * don't have any uncached bgs and we've already done a +- * full search through. +- */ +- if (ffe_ctl->orig_have_caching_bg || !full_search) +- ffe_ctl->loop = LOOP_CACHING_WAIT; +- else +- ffe_ctl->loop = LOOP_ALLOC_CHUNK; +- } else { ++ /* ++ * We want to skip the LOOP_CACHING_WAIT step if we don't have ++ * any uncached bgs and we've already done a full search ++ * through. ++ */ ++ if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && ++ (!ffe_ctl->orig_have_caching_bg && full_search)) + ffe_ctl->loop++; +- } ++ ffe_ctl->loop++; + + if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { + struct btrfs_trans_handle *trans; + int exist = 0; + +- /*Check if allocation policy allows to create a new chunk */ ++ /* Check if allocation policy allows to create a new chunk */ + ret = can_allocate_chunk(fs_info, ffe_ctl); + if (ret) + return ret; +@@ -4123,8 +4025,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, + CHUNK_ALLOC_FORCE_FOR_EXTENT); + + /* Do not bail out on ENOSPC since we can do more. 
*/ +- if (ret == -ENOSPC) +- ret = chunk_allocation_failed(ffe_ctl); ++ if (ret == -ENOSPC) { ++ ret = 0; ++ ffe_ctl->loop++; ++ } + else if (ret < 0) + btrfs_abort_transaction(trans, ret); + else +@@ -4154,6 +4058,21 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, + return -ENOSPC; + } + ++static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, ++ struct btrfs_block_group *bg) ++{ ++ if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) ++ return true; ++ if (!btrfs_block_group_should_use_size_class(bg)) ++ return true; ++ if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) ++ return true; ++ if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && ++ bg->size_class == BTRFS_BG_SZ_NONE) ++ return true; ++ return ffe_ctl->size_class == bg->size_class; ++} ++ + static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, +@@ -4288,6 +4207,7 @@ static noinline int find_free_extent(struct btrfs_root *root, + ffe_ctl->total_free_space = 0; + ffe_ctl->found_offset = 0; + ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; ++ ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); + + if (btrfs_is_zoned(fs_info)) + ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; +@@ -4296,8 +4216,7 @@ static noinline int find_free_extent(struct btrfs_root *root, + ins->objectid = 0; + ins->offset = 0; + +- trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size, +- ffe_ctl->flags); ++ trace_find_free_extent(root, ffe_ctl); + + space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); + if (!space_info) { +@@ -4340,6 +4259,7 @@ static noinline int find_free_extent(struct btrfs_root *root, + block_group->flags); + btrfs_lock_block_group(block_group, + ffe_ctl->delalloc); ++ ffe_ctl->hinted = true; + goto have_block_group; + } + } else if (block_group) { +@@ -4347,6 +4267,7 @@ static noinline int find_free_extent(struct btrfs_root *root, + } + } + search: ++ trace_find_free_extent_search_loop(root, ffe_ctl); + ffe_ctl->have_caching_bg = false; + if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || + ffe_ctl->index == 0) +@@ -4356,6 +4277,7 @@ static noinline int find_free_extent(struct btrfs_root *root, + &space_info->block_groups[ffe_ctl->index], list) { + struct btrfs_block_group *bg_ret; + ++ ffe_ctl->hinted = false; + /* If the block group is read-only, we can skip it entirely. 
*/ + if (unlikely(block_group->ro)) { + if (ffe_ctl->for_treelog) +@@ -4397,6 +4319,7 @@ static noinline int find_free_extent(struct btrfs_root *root, + } + + have_block_group: ++ trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); + ffe_ctl->cached = btrfs_block_group_done(block_group); + if (unlikely(!ffe_ctl->cached)) { + ffe_ctl->have_caching_bg = true; +@@ -4421,6 +4344,9 @@ static noinline int find_free_extent(struct btrfs_root *root, + if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) + goto loop; + ++ if (!find_free_extent_check_size_class(ffe_ctl, block_group)) ++ goto loop; ++ + bg_ret = NULL; + ret = do_allocation(block_group, ffe_ctl, &bg_ret); + if (ret == 0) { +@@ -4455,7 +4381,8 @@ static noinline int find_free_extent(struct btrfs_root *root, + + ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, + ffe_ctl->num_bytes, +- ffe_ctl->delalloc); ++ ffe_ctl->delalloc, ++ ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); + if (ret == -EAGAIN) { + btrfs_add_free_space_unused(block_group, + ffe_ctl->found_offset, +@@ -4468,8 +4395,7 @@ static noinline int find_free_extent(struct btrfs_root *root, + ins->objectid = ffe_ctl->search_start; + ins->offset = ffe_ctl->num_bytes; + +- trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start, +- ffe_ctl->num_bytes); ++ trace_btrfs_reserve_extent(block_group, ffe_ctl); + btrfs_release_block_group(block_group, ffe_ctl->delalloc); + break; + loop: +@@ -4912,7 +4838,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, + btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); + + __btrfs_tree_lock(buf, nest); +- btrfs_clean_tree_block(buf); ++ btrfs_clear_buffer_dirty(trans, buf); + clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); + clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); + +@@ -5542,13 +5468,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + } + } + } +- /* make block locked assertion in btrfs_clean_tree_block happy */ +- if (!path->locks[level] && +- btrfs_header_generation(eb) == trans->transid) { ++ /* Make block locked assertion in btrfs_clear_buffer_dirty happy. 
*/ ++ if (!path->locks[level]) { + btrfs_tree_lock(eb); + path->locks[level] = BTRFS_WRITE_LOCK; + } +- btrfs_clean_tree_block(eb); ++ btrfs_clear_buffer_dirty(trans, eb); + } + + if (eb == root->node) { +diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h +index ae5425253603..0c958fc1b3b8 100644 +--- a/fs/btrfs/extent-tree.h ++++ b/fs/btrfs/extent-tree.h +@@ -3,6 +3,87 @@ + #ifndef BTRFS_EXTENT_TREE_H + #define BTRFS_EXTENT_TREE_H + ++#include "misc.h" ++#include "block-group.h" ++ ++struct btrfs_free_cluster; ++ ++enum btrfs_extent_allocation_policy { ++ BTRFS_EXTENT_ALLOC_CLUSTERED, ++ BTRFS_EXTENT_ALLOC_ZONED, ++}; ++ ++struct find_free_extent_ctl { ++ /* Basic allocation info */ ++ u64 ram_bytes; ++ u64 num_bytes; ++ u64 min_alloc_size; ++ u64 empty_size; ++ u64 flags; ++ int delalloc; ++ ++ /* Where to start the search inside the bg */ ++ u64 search_start; ++ ++ /* For clustered allocation */ ++ u64 empty_cluster; ++ struct btrfs_free_cluster *last_ptr; ++ bool use_cluster; ++ ++ bool have_caching_bg; ++ bool orig_have_caching_bg; ++ ++ /* Allocation is called for tree-log */ ++ bool for_treelog; ++ ++ /* Allocation is called for data relocation */ ++ bool for_data_reloc; ++ ++ /* RAID index, converted from flags */ ++ int index; ++ ++ /* ++ * Current loop number, check find_free_extent_update_loop() for details ++ */ ++ int loop; ++ ++ /* ++ * Whether we're refilling a cluster, if true we need to re-search ++ * current block group but don't try to refill the cluster again. ++ */ ++ bool retry_clustered; ++ ++ /* ++ * Whether we're updating free space cache, if true we need to re-search ++ * current block group but don't try updating free space cache again. ++ */ ++ bool retry_unclustered; ++ ++ /* If current block group is cached */ ++ int cached; ++ ++ /* Max contiguous hole found */ ++ u64 max_extent_size; ++ ++ /* Total free space from free space cache, not always contiguous */ ++ u64 total_free_space; ++ ++ /* Found result */ ++ u64 found_offset; ++ ++ /* Hint where to start looking for an empty space */ ++ u64 hint_byte; ++ ++ /* Allocation policy */ ++ enum btrfs_extent_allocation_policy policy; ++ ++ /* Whether or not the allocator is currently following a hint */ ++ bool hinted; ++ ++ /* Size class of block groups to prefer in early loops */ ++ enum btrfs_block_group_size_class size_class; ++}; ++ + enum btrfs_inline_ref_type { + BTRFS_REF_TYPE_INVALID, + BTRFS_REF_TYPE_BLOCK, +diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c +index 3bbf8703db2a..c25fa74d7615 100644 +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -36,6 +36,7 @@ + #include "file.h" + #include "dev-replace.h" + #include "super.h" ++#include "transaction.h" + + static struct kmem_cache *extent_buffer_cache; + +@@ -99,7 +100,6 @@ struct btrfs_bio_ctrl { + struct bio *bio; + int mirror_num; + enum btrfs_compression_type compress_type; +- u32 len_to_stripe_boundary; + u32 len_to_oe_boundary; + btrfs_bio_end_io_t end_io_func; + +@@ -126,7 +126,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) + { + struct bio *bio; + struct bio_vec *bv; +- struct btrfs_inode *inode; ++ struct inode *inode; + int mirror_num; + + if (!bio_ctrl->bio) +@@ -134,15 +134,13 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) + + bio = bio_ctrl->bio; + bv = bio_first_bvec_all(bio); +- inode = BTRFS_I(bv->bv_page->mapping->host); ++ inode = bv->bv_page->mapping->host; + mirror_num = bio_ctrl->mirror_num; + + /* Caller should ensure the bio has at least some range added */ + 
ASSERT(bio->bi_iter.bi_size); + +- btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; +- +- if (!is_data_inode(&inode->vfs_inode)) { ++ if (!is_data_inode(inode)) { + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { + /* + * For metadata read, we should have the parent_check, +@@ -153,14 +151,15 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) + bio_ctrl->parent_check, + sizeof(struct btrfs_tree_parent_check)); + } +- btrfs_submit_metadata_bio(inode, bio, mirror_num); +- } else if (btrfs_op(bio) == BTRFS_MAP_WRITE) { +- btrfs_submit_data_write_bio(inode, bio, mirror_num); +- } else { +- btrfs_submit_data_read_bio(inode, bio, mirror_num, +- bio_ctrl->compress_type); ++ bio->bi_opf |= REQ_META; + } + ++ if (btrfs_op(bio) == BTRFS_MAP_READ && ++ bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) ++ btrfs_submit_compressed_read(inode, bio, mirror_num); ++ else ++ btrfs_submit_bio(bio, mirror_num); ++ + /* The bio is owned by the end_io handler now */ + bio_ctrl->bio = NULL; + } +@@ -515,266 +514,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, + start, end, page_ops, NULL); + } + +-static int insert_failrec(struct btrfs_inode *inode, +- struct io_failure_record *failrec) +-{ +- struct rb_node *exist; +- +- spin_lock(&inode->io_failure_lock); +- exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr, +- &failrec->rb_node); +- spin_unlock(&inode->io_failure_lock); +- +- return (exist == NULL) ? 0 : -EEXIST; +-} +- +-static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start) +-{ +- struct rb_node *node; +- struct io_failure_record *failrec = ERR_PTR(-ENOENT); +- +- spin_lock(&inode->io_failure_lock); +- node = rb_simple_search(&inode->io_failure_tree, start); +- if (node) +- failrec = rb_entry(node, struct io_failure_record, rb_node); +- spin_unlock(&inode->io_failure_lock); +- return failrec; +-} +- +-static void free_io_failure(struct btrfs_inode *inode, +- struct io_failure_record *rec) +-{ +- spin_lock(&inode->io_failure_lock); +- rb_erase(&rec->rb_node, &inode->io_failure_tree); +- spin_unlock(&inode->io_failure_lock); +- +- kfree(rec); +-} +- +-static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) +-{ +- if (cur_mirror == failrec->num_copies) +- return cur_mirror + 1 - failrec->num_copies; +- return cur_mirror + 1; +-} +- +-static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) +-{ +- if (cur_mirror == 1) +- return failrec->num_copies; +- return cur_mirror - 1; +-} +- +-/* +- * each time an IO finishes, we do a fast check in the IO failure tree +- * to see if we need to process or clean up an io_failure_record +- */ +-int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, +- struct page *page, unsigned int pg_offset) +-{ +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct extent_io_tree *io_tree = &inode->io_tree; +- u64 ino = btrfs_ino(inode); +- u64 locked_start, locked_end; +- struct io_failure_record *failrec; +- int mirror; +- int ret; +- +- failrec = get_failrec(inode, start); +- if (IS_ERR(failrec)) +- return 0; +- +- BUG_ON(!failrec->this_mirror); +- +- if (sb_rdonly(fs_info->sb)) +- goto out; +- +- ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start, +- &locked_end, EXTENT_LOCKED, NULL); +- if (ret || locked_start > failrec->bytenr || +- locked_end < failrec->bytenr + failrec->len - 1) +- goto out; +- +- mirror = failrec->this_mirror; +- do { +- mirror = prev_mirror(failrec, mirror); +- 
btrfs_repair_io_failure(fs_info, ino, start, failrec->len, +- failrec->logical, page, pg_offset, mirror); +- } while (mirror != failrec->failed_mirror); +- +-out: +- free_io_failure(inode, failrec); +- return 0; +-} +- +-/* +- * Can be called when +- * - hold extent lock +- * - under ordered extent +- * - the inode is freeing +- */ +-void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) +-{ +- struct io_failure_record *failrec; +- struct rb_node *node, *next; +- +- if (RB_EMPTY_ROOT(&inode->io_failure_tree)) +- return; +- +- spin_lock(&inode->io_failure_lock); +- node = rb_simple_search_first(&inode->io_failure_tree, start); +- while (node) { +- failrec = rb_entry(node, struct io_failure_record, rb_node); +- if (failrec->bytenr > end) +- break; +- +- next = rb_next(node); +- rb_erase(&failrec->rb_node, &inode->io_failure_tree); +- kfree(failrec); +- +- node = next; +- } +- spin_unlock(&inode->io_failure_lock); +-} +- +-static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, +- struct btrfs_bio *bbio, +- unsigned int bio_offset) +-{ +- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +- u64 start = bbio->file_offset + bio_offset; +- struct io_failure_record *failrec; +- const u32 sectorsize = fs_info->sectorsize; +- int ret; +- +- failrec = get_failrec(BTRFS_I(inode), start); +- if (!IS_ERR(failrec)) { +- btrfs_debug(fs_info, +- "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", +- failrec->logical, failrec->bytenr, failrec->len); +- /* +- * when data can be on disk more than twice, add to failrec here +- * (e.g. with a list for failed_mirror) to make +- * clean_io_failure() clean all those errors at once. +- */ +- ASSERT(failrec->this_mirror == bbio->mirror_num); +- ASSERT(failrec->len == fs_info->sectorsize); +- return failrec; +- } +- +- failrec = kzalloc(sizeof(*failrec), GFP_NOFS); +- if (!failrec) +- return ERR_PTR(-ENOMEM); +- +- RB_CLEAR_NODE(&failrec->rb_node); +- failrec->bytenr = start; +- failrec->len = sectorsize; +- failrec->failed_mirror = bbio->mirror_num; +- failrec->this_mirror = bbio->mirror_num; +- failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset; +- +- btrfs_debug(fs_info, +- "new io failure record logical %llu start %llu", +- failrec->logical, start); +- +- failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize); +- if (failrec->num_copies == 1) { +- /* +- * We only have a single copy of the data, so don't bother with +- * all the retry and error correction code that follows. No +- * matter what the error is, it is very likely to persist. 
+- */ +- btrfs_debug(fs_info, +- "cannot repair logical %llu num_copies %d", +- failrec->logical, failrec->num_copies); +- kfree(failrec); +- return ERR_PTR(-EIO); +- } +- +- /* Set the bits in the private failure tree */ +- ret = insert_failrec(BTRFS_I(inode), failrec); +- if (ret) { +- kfree(failrec); +- return ERR_PTR(ret); +- } +- +- return failrec; +-} +- +-int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, +- u32 bio_offset, struct page *page, unsigned int pgoff, +- bool submit_buffered) +-{ +- u64 start = failed_bbio->file_offset + bio_offset; +- struct io_failure_record *failrec; +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct bio *failed_bio = &failed_bbio->bio; +- const int icsum = bio_offset >> fs_info->sectorsize_bits; +- struct bio *repair_bio; +- struct btrfs_bio *repair_bbio; +- +- btrfs_debug(fs_info, +- "repair read error: read error at %llu", start); +- +- BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); +- +- failrec = btrfs_get_io_failure_record(&inode->vfs_inode, failed_bbio, bio_offset); +- if (IS_ERR(failrec)) +- return PTR_ERR(failrec); +- +- /* +- * There are two premises: +- * a) deliver good data to the caller +- * b) correct the bad sectors on disk +- * +- * Since we're only doing repair for one sector, we only need to get +- * a good copy of the failed sector and if we succeed, we have setup +- * everything for btrfs_repair_io_failure to do the rest for us. +- */ +- failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); +- if (failrec->this_mirror == failrec->failed_mirror) { +- btrfs_debug(fs_info, +- "failed to repair num_copies %d this_mirror %d failed_mirror %d", +- failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); +- free_io_failure(inode, failrec); +- return -EIO; +- } +- +- repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io, +- failed_bbio->private); +- repair_bbio = btrfs_bio(repair_bio); +- repair_bbio->file_offset = start; +- repair_bio->bi_iter.bi_sector = failrec->logical >> 9; +- +- if (failed_bbio->csum) { +- const u32 csum_size = fs_info->csum_size; +- +- repair_bbio->csum = repair_bbio->csum_inline; +- memcpy(repair_bbio->csum, +- failed_bbio->csum + csum_size * icsum, csum_size); +- } +- +- bio_add_page(repair_bio, page, failrec->len, pgoff); +- repair_bbio->iter = repair_bio->bi_iter; +- +- btrfs_debug(fs_info, +- "repair read error: submitting new read to mirror %d", +- failrec->this_mirror); +- +- /* +- * At this point we have a bio, so any errors from bio submission will +- * be handled by the endio on the repair_bio, so we can't return an +- * error here. 
+- */ +- if (submit_buffered) +- btrfs_submit_data_read_bio(inode, repair_bio, +- failrec->this_mirror, 0); +- else +- btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); +- +- return BLK_STS_OK; +-} +- + static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) + { + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); +@@ -803,79 +542,6 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) + btrfs_subpage_end_reader(fs_info, page, start, len); + } + +-static void end_sector_io(struct page *page, u64 offset, bool uptodate) +-{ +- struct btrfs_inode *inode = BTRFS_I(page->mapping->host); +- const u32 sectorsize = inode->root->fs_info->sectorsize; +- +- end_page_read(page, uptodate, offset, sectorsize); +- unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL); +-} +- +-static void submit_data_read_repair(struct inode *inode, +- struct btrfs_bio *failed_bbio, +- u32 bio_offset, const struct bio_vec *bvec, +- unsigned int error_bitmap) +-{ +- const unsigned int pgoff = bvec->bv_offset; +- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +- struct page *page = bvec->bv_page; +- const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset; +- const u64 end = start + bvec->bv_len - 1; +- const u32 sectorsize = fs_info->sectorsize; +- const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; +- int i; +- +- BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE); +- +- /* This repair is only for data */ +- ASSERT(is_data_inode(inode)); +- +- /* We're here because we had some read errors or csum mismatch */ +- ASSERT(error_bitmap); +- +- /* +- * We only get called on buffered IO, thus page must be mapped and bio +- * must not be cloned. +- */ +- ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED)); +- +- /* Iterate through all the sectors in the range */ +- for (i = 0; i < nr_bits; i++) { +- const unsigned int offset = i * sectorsize; +- bool uptodate = false; +- int ret; +- +- if (!(error_bitmap & (1U << i))) { +- /* +- * This sector has no error, just end the page read +- * and unlock the range. +- */ +- uptodate = true; +- goto next; +- } +- +- ret = btrfs_repair_one_sector(BTRFS_I(inode), failed_bbio, +- bio_offset + offset, page, pgoff + offset, +- true); +- if (!ret) { +- /* +- * We have submitted the read repair, the page release +- * will be handled by the endio function of the +- * submitted repair bio. +- * Thus we don't need to do any thing here. +- */ +- continue; +- } +- /* +- * Continue on failed repair, otherwise the remaining sectors +- * will not be properly unlocked. 
+- */ +-next: +- end_sector_io(page, start + offset, uptodate); +- } +-} +- + /* lots and lots of room for performance fixes in the end_bio funcs */ + + void end_extent_writepage(struct page *page, int err, u64 start, u64 end) +@@ -919,7 +585,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) + u64 start; + u64 end; + struct bvec_iter_all iter_all; +- bool first_bvec = true; + + ASSERT(!bio_flagged(bio, BIO_CLONED)); + bio_for_each_segment_all(bvec, bio, iter_all) { +@@ -941,11 +606,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) + start = page_offset(page) + bvec->bv_offset; + end = start + bvec->bv_len - 1; + +- if (first_bvec) { +- btrfs_record_physical_zoned(inode, start, bio); +- first_bvec = false; +- } +- + end_extent_writepage(page, error, start, end); + + btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); +@@ -1093,8 +753,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) + struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; +- unsigned int error_bitmap = (unsigned int)-1; +- bool repair = false; + u64 start; + u64 end; + u32 len; +@@ -1126,25 +784,14 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) + len = bvec->bv_len; + + mirror = bbio->mirror_num; +- if (likely(uptodate)) { +- if (is_data_inode(inode)) { +- error_bitmap = btrfs_verify_data_csum(bbio, +- bio_offset, page, start, end); +- if (error_bitmap) +- uptodate = false; +- } else { +- if (btrfs_validate_metadata_buffer(bbio, +- page, start, end, mirror)) +- uptodate = false; +- } +- } ++ if (uptodate && !is_data_inode(inode) && ++ btrfs_validate_metadata_buffer(bbio, page, start, end, mirror)) ++ uptodate = false; + + if (likely(uptodate)) { + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_SHIFT; + +- btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0); +- + /* + * Zero out the remaining part if this range straddles + * i_size. +@@ -1161,19 +808,7 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) + zero_user_segment(page, zero_start, + offset_in_page(end) + 1); + } +- } else if (is_data_inode(inode)) { +- /* +- * Only try to repair bios that actually made it to a +- * device. If the bio failed to be submitted mirror +- * is 0 and we need to fail it without retrying. +- * +- * This also includes the high level bios for compressed +- * extents - these never make it to a device and repair +- * is already handled on the lower compressed bio. +- */ +- if (mirror > 0) +- repair = true; +- } else { ++ } else if (!is_data_inode(inode)) { + struct extent_buffer *eb; + + eb = find_extent_buffer_readpage(fs_info, page, start); +@@ -1182,19 +817,10 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) + atomic_dec(&eb->io_pages); + } + +- if (repair) { +- /* +- * submit_data_read_repair() will handle all the good +- * and bad sectors, we just continue to the next bvec. +- */ +- submit_data_read_repair(inode, bbio, bio_offset, bvec, +- error_bitmap); +- } else { +- /* Update page status and unlock */ +- end_page_read(page, uptodate, start, len); +- endio_readpage_release_extent(&processed, BTRFS_I(inode), +- start, end, PageUptodate(page)); +- } ++ /* Update page status and unlock. 
*/ ++ end_page_read(page, uptodate, start, len); ++ endio_readpage_release_extent(&processed, BTRFS_I(inode), ++ start, end, PageUptodate(page)); + + ASSERT(bio_offset + len > bio_offset); + bio_offset += len; +@@ -1202,7 +828,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) + } + /* Release the last extent */ + endio_readpage_release_extent(&processed, NULL, 0, 0, false); +- btrfs_bio_free_csum(bbio); + bio_put(bio); + } + +@@ -1270,11 +895,10 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, + u32 real_size; + const sector_t sector = disk_bytenr >> SECTOR_SHIFT; + bool contig = false; +- int ret; + + ASSERT(bio); + /* The limit should be calculated when bio_ctrl->bio is allocated */ +- ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); ++ ASSERT(bio_ctrl->len_to_oe_boundary); + if (bio_ctrl->compress_type != compress_type) + return 0; + +@@ -1310,9 +934,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, + if (!contig) + return 0; + +- real_size = min(bio_ctrl->len_to_oe_boundary, +- bio_ctrl->len_to_stripe_boundary) - bio_size; +- real_size = min(real_size, size); ++ real_size = min(bio_ctrl->len_to_oe_boundary - bio_size, size); + + /* + * If real_size is 0, never call bio_add_*_page(), as even size is 0, +@@ -1321,82 +943,45 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, + if (real_size == 0) + return 0; + +- if (bio_op(bio) == REQ_OP_ZONE_APPEND) +- ret = bio_add_zone_append_page(bio, page, real_size, pg_offset); +- else +- ret = bio_add_page(bio, page, real_size, pg_offset); +- +- return ret; ++ return bio_add_page(bio, page, real_size, pg_offset); + } + +-static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, +- struct btrfs_inode *inode, u64 file_offset) ++static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, ++ struct btrfs_inode *inode, u64 file_offset) + { +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct btrfs_io_geometry geom; + struct btrfs_ordered_extent *ordered; +- struct extent_map *em; +- u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT); +- int ret; + + /* +- * Pages for compressed extent are never submitted to disk directly, +- * thus it has no real boundary, just set them to U32_MAX. +- * +- * The split happens for real compressed bio, which happens in +- * btrfs_submit_compressed_read/write(). ++ * Limit the extent to the ordered boundary for Zone Append. ++ * Compressed bios aren't submitted directly, so it doesn't apply to ++ * them. 
+ */ +- if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { +- bio_ctrl->len_to_oe_boundary = U32_MAX; +- bio_ctrl->len_to_stripe_boundary = U32_MAX; +- return 0; +- } +- em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); +- if (IS_ERR(em)) +- return PTR_ERR(em); +- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio), +- logical, &geom); +- free_extent_map(em); +- if (ret < 0) { +- return ret; +- } +- if (geom.len > U32_MAX) +- bio_ctrl->len_to_stripe_boundary = U32_MAX; +- else +- bio_ctrl->len_to_stripe_boundary = (u32)geom.len; +- +- if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { +- bio_ctrl->len_to_oe_boundary = U32_MAX; +- return 0; +- } +- +- /* Ordered extent not yet created, so we're good */ +- ordered = btrfs_lookup_ordered_extent(inode, file_offset); +- if (!ordered) { +- bio_ctrl->len_to_oe_boundary = U32_MAX; +- return 0; ++ if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && ++ btrfs_use_zone_append(btrfs_bio(bio_ctrl->bio))) { ++ ordered = btrfs_lookup_ordered_extent(inode, file_offset); ++ if (ordered) { ++ bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, ++ ordered->file_offset + ++ ordered->disk_num_bytes - file_offset); ++ btrfs_put_ordered_extent(ordered); ++ return; ++ } + } + +- bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, +- ordered->disk_bytenr + ordered->disk_num_bytes - logical); +- btrfs_put_ordered_extent(ordered); +- return 0; ++ bio_ctrl->len_to_oe_boundary = U32_MAX; + } + +-static int alloc_new_bio(struct btrfs_inode *inode, +- struct btrfs_bio_ctrl *bio_ctrl, +- struct writeback_control *wbc, +- blk_opf_t opf, +- u64 disk_bytenr, u32 offset, u64 file_offset, +- enum btrfs_compression_type compress_type) ++static void alloc_new_bio(struct btrfs_inode *inode, ++ struct btrfs_bio_ctrl *bio_ctrl, ++ struct writeback_control *wbc, blk_opf_t opf, ++ u64 disk_bytenr, u32 offset, u64 file_offset, ++ enum btrfs_compression_type compress_type) + { + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct bio *bio; +- int ret; + +- ASSERT(bio_ctrl->end_io_func); +- +- bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL); ++ bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, inode, bio_ctrl->end_io_func, ++ NULL); + /* + * For compressed page range, its disk_bytenr is always @disk_bytenr + * passed in, no matter if we have added any range into previous bio. +@@ -1405,48 +990,21 @@ static int alloc_new_bio(struct btrfs_inode *inode, + bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + else + bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; ++ btrfs_bio(bio)->file_offset = file_offset; + bio_ctrl->bio = bio; + bio_ctrl->compress_type = compress_type; +- ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); +- if (ret < 0) +- goto error; ++ calc_bio_boundaries(bio_ctrl, inode, file_offset); + + if (wbc) { + /* +- * For Zone append we need the correct block_device that we are +- * going to write to set in the bio to be able to respect the +- * hardware limitation. Look it up here: ++ * Pick the last added device to support cgroup writeback. For ++ * multi-device file systems this means blk-cgroup policies have ++ * to always be set on the last added/replaced device. ++ * This is a bit odd but has been like that for a long time. 
+ */ +- if (bio_op(bio) == REQ_OP_ZONE_APPEND) { +- struct btrfs_device *dev; +- +- dev = btrfs_zoned_get_device(fs_info, disk_bytenr, +- fs_info->sectorsize); +- if (IS_ERR(dev)) { +- ret = PTR_ERR(dev); +- goto error; +- } +- +- bio_set_dev(bio, dev->bdev); +- } else { +- /* +- * Otherwise pick the last added device to support +- * cgroup writeback. For multi-device file systems this +- * means blk-cgroup policies have to always be set on the +- * last added/replaced device. This is a bit odd but has +- * been like that for a long time. +- */ +- bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); +- } ++ bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); + wbc_init_bio(wbc, bio); +- } else { +- ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); + } +- return 0; +-error: +- bio_ctrl->bio = NULL; +- btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); +- return ret; + } + + /* +@@ -1472,7 +1030,6 @@ static int submit_extent_page(blk_opf_t opf, + enum btrfs_compression_type compress_type, + bool force_bio_submit) + { +- int ret = 0; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + unsigned int cur = pg_offset; + +@@ -1492,12 +1049,9 @@ static int submit_extent_page(blk_opf_t opf, + + /* Allocate new bio if needed */ + if (!bio_ctrl->bio) { +- ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, +- disk_bytenr, offset, +- page_offset(page) + cur, +- compress_type); +- if (ret < 0) +- return ret; ++ alloc_new_bio(inode, bio_ctrl, wbc, opf, disk_bytenr, ++ offset, page_offset(page) + cur, ++ compress_type); + } + /* + * We must go through btrfs_bio_add_page() to ensure each +@@ -2054,10 +1608,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, + * find_next_dirty_byte() are all exclusive + */ + iosize = min(min(em_end, end + 1), dirty_range_end) - cur; +- +- if (btrfs_use_zone_append(inode, em->block_start)) +- op = REQ_OP_ZONE_APPEND; +- + free_extent_map(em); + em = NULL; + +@@ -2360,13 +1910,6 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) + */ + mapping_set_error(page->mapping, -EIO); + +- /* +- * If we error out, we should add back the dirty_metadata_bytes +- * to make it consistent. +- */ +- percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, +- eb->len, fs_info->dirty_metadata_batch); +- + /* + * If writeback for a btree extent that doesn't belong to a log tree + * failed, increment the counter transaction->eb_write_errors. 
+@@ -4724,12 +4267,25 @@ static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) + WARN_ON(atomic_read(&eb->refs) == 0); + } + +-void clear_extent_buffer_dirty(const struct extent_buffer *eb) ++void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, ++ struct extent_buffer *eb) + { ++ struct btrfs_fs_info *fs_info = eb->fs_info; + int i; + int num_pages; + struct page *page; + ++ btrfs_assert_tree_write_locked(eb); ++ ++ if (trans && btrfs_header_generation(eb) != trans->transid) ++ return; ++ ++ if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) ++ return; ++ ++ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, ++ fs_info->dirty_metadata_batch); ++ + if (eb->fs_info->nodesize < PAGE_SIZE) + return clear_subpage_extent_buffer_dirty(eb); + +diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h +index a2c82448b2e0..4341ad978fb8 100644 +--- a/fs/btrfs/extent_io.h ++++ b/fs/btrfs/extent_io.h +@@ -11,6 +11,8 @@ + #include "ulist.h" + #include "misc.h" + ++struct btrfs_trans_handle; ++ + enum { + EXTENT_BUFFER_UPTODATE, + EXTENT_BUFFER_DIRTY, +@@ -60,11 +62,9 @@ enum { + #define BITMAP_LAST_BYTE_MASK(nbits) \ + (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) + +-struct btrfs_bio; + struct btrfs_root; + struct btrfs_inode; + struct btrfs_fs_info; +-struct io_failure_record; + struct extent_io_tree; + struct btrfs_tree_parent_check; + +@@ -262,7 +262,6 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star + void extent_buffer_bitmap_clear(const struct extent_buffer *eb, + unsigned long start, unsigned long pos, + unsigned long len); +-void clear_extent_buffer_dirty(const struct extent_buffer *eb); + bool set_extent_buffer_dirty(struct extent_buffer *eb); + void set_extent_buffer_uptodate(struct extent_buffer *eb); + void clear_extent_buffer_uptodate(struct extent_buffer *eb); +@@ -274,40 +273,13 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, + u32 bits_to_clear, unsigned long page_ops); + int extent_invalidate_folio(struct extent_io_tree *tree, + struct folio *folio, size_t offset); ++void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, ++ struct extent_buffer *buf); + + int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); + + void end_extent_writepage(struct page *page, int err, u64 start, u64 end); + +-/* +- * When IO fails, either with EIO or csum verification fails, we +- * try other mirrors that might have a good copy of the data. This +- * io_failure_record is used to record state as we go through all the +- * mirrors. If another mirror has good data, the sector is set up to date +- * and things continue. If a good mirror can't be found, the original +- * bio end_io callback is called to indicate things have failed. 
+- */ +-struct io_failure_record { +- /* Use rb_simple_node for search/insert */ +- struct { +- struct rb_node rb_node; +- u64 bytenr; +- }; +- struct page *page; +- u64 len; +- u64 logical; +- int this_mirror; +- int failed_mirror; +- int num_copies; +-}; +- +-int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, +- u32 bio_offset, struct page *page, unsigned int pgoff, +- bool submit_buffered); +-void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); +-int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, +- struct page *page, unsigned int pg_offset); +- + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + bool find_lock_delalloc_range(struct inode *inode, + struct page *locked_page, u64 *start, +diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c +index 5de73466b2ca..41c77a100853 100644 +--- a/fs/btrfs/file-item.c ++++ b/fs/btrfs/file-item.c +@@ -380,32 +380,25 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, + /* + * Lookup the checksum for the read bio in csum tree. + * +- * @inode: inode that the bio is for. +- * @bio: bio to look up. +- * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return +- * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If +- * NULL, the checksum buffer is allocated and returned in +- * btrfs_bio(bio)->csum instead. +- * + * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. + */ +-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst) ++blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) + { +- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; +- struct btrfs_bio *bbio = NULL; ++ struct btrfs_inode *inode = bbio->inode; ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ struct extent_io_tree *io_tree = &inode->io_tree; ++ struct bio *bio = &bbio->bio; + struct btrfs_path *path; + const u32 sectorsize = fs_info->sectorsize; + const u32 csum_size = fs_info->csum_size; + u32 orig_len = bio->bi_iter.bi_size; + u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 cur_disk_bytenr; +- u8 *csum; + const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; + int count = 0; + blk_status_t ret = BLK_STS_OK; + +- if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || ++ if ((inode->flags & BTRFS_INODE_NODATASUM) || + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) + return BLK_STS_OK; + +@@ -426,21 +419,14 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst + if (!path) + return BLK_STS_RESOURCE; + +- if (!dst) { +- bbio = btrfs_bio(bio); +- +- if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { +- bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); +- if (!bbio->csum) { +- btrfs_free_path(path); +- return BLK_STS_RESOURCE; +- } +- } else { +- bbio->csum = bbio->csum_inline; ++ if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { ++ bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); ++ if (!bbio->csum) { ++ btrfs_free_path(path); ++ return BLK_STS_RESOURCE; + } +- csum = bbio->csum; + } else { +- csum = dst; ++ bbio->csum = bbio->csum_inline; + } + + /* +@@ -456,7 +442,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst + * read from the commit root and sidestep a nasty deadlock + * between reading the free space cache and updating the csum tree. 
+ */ +- if (btrfs_is_free_space_inode(BTRFS_I(inode))) { ++ if (btrfs_is_free_space_inode(inode)) { + path->search_commit_root = 1; + path->skip_locking = 1; + } +@@ -479,14 +465,15 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst + ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX); + sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >> + fs_info->sectorsize_bits; +- csum_dst = csum + sector_offset * csum_size; ++ csum_dst = bbio->csum + sector_offset * csum_size; + + count = search_csum_tree(fs_info, path, cur_disk_bytenr, + search_len, csum_dst); + if (count < 0) { + ret = errno_to_blk_status(count); +- if (bbio) +- btrfs_bio_free_csum(bbio); ++ if (bbio->csum != bbio->csum_inline) ++ kfree(bbio->csum); ++ bbio->csum = NULL; + break; + } + +@@ -504,12 +491,13 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst + memset(csum_dst, 0, csum_size); + count = 1; + +- if (BTRFS_I(inode)->root->root_key.objectid == ++ if (inode->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + u64 file_offset; + int ret; + +- ret = search_file_offset_in_bio(bio, inode, ++ ret = search_file_offset_in_bio(bio, ++ &inode->vfs_inode, + cur_disk_bytenr, &file_offset); + if (ret) + set_extent_bits(io_tree, file_offset, +@@ -784,23 +772,16 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, + + /* + * Calculate checksums of the data contained inside a bio. +- * +- * @inode: Owner of the data inside the bio +- * @bio: Contains the data to be checksummed +- * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the +- * file offsets are determined from the page offsets in the bio. +- * Otherwise, this is the starting file offset of the bio vecs in +- * @bio, which must be contiguous. +- * @one_ordered: If true, @bio only refers to one ordered extent. 
+ */ +-blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, +- u64 offset, bool one_ordered) ++blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) + { ++ struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); ++ struct bio *bio = &bbio->bio; ++ u64 offset = bbio->file_offset; + struct btrfs_ordered_sum *sums; + struct btrfs_ordered_extent *ordered = NULL; +- const bool use_page_offsets = (offset == (u64)-1); + char *data; + struct bvec_iter iter; + struct bio_vec bvec; +@@ -828,9 +809,6 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, + shash->tfm = fs_info->csum_shash; + + bio_for_each_segment(bvec, bio, iter) { +- if (use_page_offsets) +- offset = page_offset(bvec.bv_page) + bvec.bv_offset; +- + if (!ordered) { + ordered = btrfs_lookup_ordered_extent(inode, offset); + /* +@@ -852,7 +830,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, + - 1); + + for (i = 0; i < blockcount; i++) { +- if (!one_ordered && ++ if (!(bio->bi_opf & REQ_BTRFS_ONE_ORDERED) && + !in_range(offset, ordered->file_offset, + ordered->num_bytes)) { + unsigned long bytes_left; +diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h +index 031225668434..cd7f2ae515c0 100644 +--- a/fs/btrfs/file-item.h ++++ b/fs/btrfs/file-item.h +@@ -38,7 +38,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) + + int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len); +-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); ++blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio); + int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 pos, + u64 num_bytes); +@@ -49,8 +49,10 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums); +-blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, +- u64 offset, bool one_ordered); ++blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio); ++int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, ++ struct list_head *list, int search_commit, ++ bool nowait); + int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); +diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c +index af046d22300e..5cc5a1faaef5 100644 +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -1017,7 +1017,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, + unlock_page(pages[i]); + put_page(pages[i]); + } +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + return -EAGAIN; + } +diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c +index c667e878ef1a..4d155a48ec59 100644 +--- a/fs/btrfs/free-space-tree.c ++++ b/fs/btrfs/free-space-tree.c +@@ -1283,7 +1283,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) + list_del(&free_space_root->dirty_list); + + btrfs_tree_lock(free_space_root->node); +- btrfs_clean_tree_block(free_space_root->node); ++ btrfs_clear_buffer_dirty(trans, free_space_root->node); + btrfs_tree_unlock(free_space_root->node); + btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), + free_space_root->node, 0, 
1); +diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c +index 5553e1f8afe8..31c1648bc0b4 100644 +--- a/fs/btrfs/fs.c ++++ b/fs/btrfs/fs.c +@@ -24,6 +24,7 @@ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + name, flag); + } + spin_unlock(&fs_info->super_lock); ++ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); + } + } + +@@ -46,6 +47,7 @@ void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + name, flag); + } + spin_unlock(&fs_info->super_lock); ++ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); + } + } + +@@ -68,6 +70,7 @@ void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + name, flag); + } + spin_unlock(&fs_info->super_lock); ++ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); + } + } + +@@ -90,5 +93,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + name, flag); + } + spin_unlock(&fs_info->super_lock); ++ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); + } + } +diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h +index 37b86acfcbcf..4c477eae6891 100644 +--- a/fs/btrfs/fs.h ++++ b/fs/btrfs/fs.h +@@ -3,6 +3,7 @@ + #ifndef BTRFS_FS_H + #define BTRFS_FS_H + ++#include + #include + #include + #include +@@ -125,6 +126,12 @@ enum { + */ + BTRFS_FS_NO_OVERCOMMIT, + ++ /* ++ * Indicate if we have some features changed, this is mostly for ++ * cleaner thread to update the sysfs interface. ++ */ ++ BTRFS_FS_FEATURE_CHANGED, ++ + #if BITS_PER_LONG == 32 + /* Indicate if we have error/warn message printed on 32bit systems */ + BTRFS_FS_32BIT_ERROR, +@@ -742,8 +749,10 @@ struct btrfs_fs_info { + */ + u64 zone_size; + +- /* Max size to emit ZONE_APPEND write command */ ++ /* Constraints for ZONE_APPEND commands: */ ++ struct queue_limits limits; + u64 max_zone_append_size; ++ + struct mutex zoned_meta_io_lock; + spinlock_t treelog_bg_lock; + u64 treelog_bg; +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 98a800b8bd43..44e9acc77a74 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -84,27 +84,12 @@ struct btrfs_dio_data { + }; + + struct btrfs_dio_private { +- struct btrfs_inode *inode; +- +- /* +- * Since DIO can use anonymous page, we cannot use page_offset() to +- * grab the file offset, thus need a dedicated member for file offset. +- */ ++ /* Range of I/O */ + u64 file_offset; +- /* Used for bio::bi_size */ + u32 bytes; + +- /* +- * References to this structure. There is one reference per in-flight +- * bio plus one while we're still setting up. +- */ +- refcount_t refs; +- +- /* Array of checksums */ +- u8 *csums; +- + /* This must be last */ +- struct bio bio; ++ struct btrfs_bio bbio; + }; + + static struct bio_set btrfs_dio_bioset; +@@ -228,7 +213,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, + { + unsigned long index = offset >> PAGE_SHIFT; + unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; +- u64 page_start, page_end; ++ u64 page_start = 0, page_end = 0; + struct page *page; + + if (locked_page) { +@@ -2535,19 +2520,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, + } + } + +-/* +- * in order to insert checksums into the metadata in large chunks, +- * we wait until bio submission time. All the pages in the bio are +- * checksummed and sums are attached onto the ordered extent record. 
+- * +- * At IO completion time the cums attached on the ordered extent record +- * are inserted into the btree +- */ +-blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio) +-{ +- return btrfs_csum_one_bio(inode, bio, (u64)-1, false); +-} +- + /* + * Split an extent_map at [start, start + len] + * +@@ -2663,19 +2635,19 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, + return ret; + } + +-static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, +- struct bio *bio, loff_t file_offset) ++blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio) + { ++ u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; ++ u64 len = bbio->bio.bi_iter.bi_size; ++ struct btrfs_inode *inode = bbio->inode; + struct btrfs_ordered_extent *ordered; +- u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 file_len; +- u64 len = bio->bi_iter.bi_size; + u64 end = start + len; + u64 ordered_end; + u64 pre, post; + int ret = 0; + +- ordered = btrfs_lookup_ordered_extent(inode, file_offset); ++ ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset); + if (WARN_ON_ONCE(!ordered)) + return BLK_STS_IOERR; + +@@ -2715,7 +2687,7 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, + ret = btrfs_split_ordered_extent(ordered, pre, post); + if (ret) + goto out; +- ret = split_zoned_em(inode, file_offset, file_len, pre, post); ++ ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post); + + out: + btrfs_put_ordered_extent(ordered); +@@ -2723,75 +2695,6 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, + return errno_to_blk_status(ret); + } + +-void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) +-{ +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- blk_status_t ret; +- +- if (bio_op(bio) == REQ_OP_ZONE_APPEND) { +- ret = extract_ordered_extent(inode, bio, +- page_offset(bio_first_bvec_all(bio)->bv_page)); +- if (ret) { +- btrfs_bio_end_io(btrfs_bio(bio), ret); +- return; +- } +- } +- +- /* +- * If we need to checksum, and the I/O is not issued by fsync and +- * friends, that is ->sync_writers != 0, defer the submission to a +- * workqueue to parallelize it. +- * +- * Csum items for reloc roots have already been cloned at this point, +- * so they are handled as part of the no-checksum case. +- */ +- if (!(inode->flags & BTRFS_INODE_NODATASUM) && +- !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && +- !btrfs_is_data_reloc_root(inode->root)) { +- if (!atomic_read(&inode->sync_writers) && +- btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) +- return; +- +- ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false); +- if (ret) { +- btrfs_bio_end_io(btrfs_bio(bio), ret); +- return; +- } +- } +- btrfs_submit_bio(fs_info, bio, mirror_num); +-} +- +-void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, +- int mirror_num, enum btrfs_compression_type compress_type) +-{ +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- blk_status_t ret; +- +- if (compress_type != BTRFS_COMPRESS_NONE) { +- /* +- * btrfs_submit_compressed_read will handle completing the bio +- * if there were any errors, so just return here. 
+- */ +- btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num); +- return; +- } +- +- /* Save the original iter for read repair */ +- btrfs_bio(bio)->iter = bio->bi_iter; +- +- /* +- * Lookup bio sums does extra checks around whether we need to csum or +- * not, which is why we ignore skip_sum here. +- */ +- ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); +- if (ret) { +- btrfs_bio_end_io(btrfs_bio(bio), ret); +- return; +- } +- +- btrfs_submit_bio(fs_info, bio, mirror_num); +-} +- + /* + * given a list of ordered sums record them in the inode. This happens + * at IO completion time based on sums calculated at bio submission time. +@@ -2969,7 +2872,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) + unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); + unlock_page(page); +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + goto again; + } +@@ -3259,15 +3162,13 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) + goto out; + } + +- /* A valid bdev implies a write on a sequential zone */ +- if (ordered_extent->bdev) { ++ /* A valid ->physical implies a write on a sequential zone. */ ++ if (ordered_extent->physical != (u64)-1) { + btrfs_rewrite_logical_zoned(ordered_extent); + btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); + } + +- btrfs_free_io_failure_record(inode, start, end); +- + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { + truncated = true; + logical_len = ordered_extent->truncated_len; +@@ -3474,109 +3375,55 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of + } + + /* +- * check_data_csum - verify checksum of one sector of uncompressed data +- * @inode: inode +- * @bbio: btrfs_bio which contains the csum ++ * Verify the checksum of a single data sector. ++ * ++ * @bbio: btrfs_io_bio which contains the csum ++ * @dev: device the sector is on + * @bio_offset: offset to the beginning of the bio (in bytes) +- * @page: page where is the data to be verified +- * @pgoff: offset inside the page ++ * @bv: bio_vec to check + * +- * The length of such check is always one sector size. ++ * Check if the checksum on a data block is valid. When a checksum mismatch is ++ * detected, report the error and fill the corrupted range with zero. + * +- * When csum mismatch is detected, we will also report the error and fill the +- * corrupted range with zero. (Thus it needs the extra parameters) ++ * Return %true if the sector is ok or had no checksum to start with, else %false. 
+ */ +-int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, +- u32 bio_offset, struct page *page, u32 pgoff) ++bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, ++ u32 bio_offset, struct bio_vec *bv) + { ++ struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; +- u32 len = fs_info->sectorsize; ++ u64 file_offset = bbio->file_offset + bio_offset; ++ u64 end = file_offset + bv->bv_len - 1; + u8 *csum_expected; + u8 csum[BTRFS_CSUM_SIZE]; + +- ASSERT(pgoff + len <= PAGE_SIZE); ++ ASSERT(bv->bv_len == fs_info->sectorsize); + +- csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); ++ if (!bbio->csum) ++ return true; + +- if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected)) ++ if (btrfs_is_data_reloc_root(inode->root) && ++ test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, ++ 1, NULL)) { ++ /* Skip the range without csum for data reloc inode */ ++ clear_extent_bits(&inode->io_tree, file_offset, end, ++ EXTENT_NODATASUM); ++ return true; ++ } ++ ++ csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); ++ if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, ++ csum_expected)) + goto zeroit; +- return 0; ++ return true; + + zeroit: +- btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset, +- csum, csum_expected, bbio->mirror_num); +- if (bbio->device) +- btrfs_dev_stat_inc_and_print(bbio->device, +- BTRFS_DEV_STAT_CORRUPTION_ERRS); +- memzero_page(page, pgoff, len); +- return -EIO; +-} +- +-/* +- * When reads are done, we need to check csums to verify the data is correct. +- * if there's a match, we allow the bio to finish. If not, the code in +- * extent_io.c will try to find good copies for us. +- * +- * @bio_offset: offset to the beginning of the bio (in bytes) +- * @start: file offset of the range start +- * @end: file offset of the range end (inclusive) +- * +- * Return a bitmap where bit set means a csum mismatch, and bit not set means +- * csum match. +- */ +-unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, +- u32 bio_offset, struct page *page, +- u64 start, u64 end) +-{ +- struct btrfs_inode *inode = BTRFS_I(page->mapping->host); +- struct btrfs_root *root = inode->root; +- struct btrfs_fs_info *fs_info = root->fs_info; +- struct extent_io_tree *io_tree = &inode->io_tree; +- const u32 sectorsize = root->fs_info->sectorsize; +- u32 pg_off; +- unsigned int result = 0; +- +- /* +- * This only happens for NODATASUM or compressed read. +- * Normally this should be covered by above check for compressed read +- * or the next check for NODATASUM. Just do a quicker exit here. 
+- */ +- if (bbio->csum == NULL) +- return 0; +- +- if (inode->flags & BTRFS_INODE_NODATASUM) +- return 0; +- +- if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) +- return 0; +- +- ASSERT(page_offset(page) <= start && +- end <= page_offset(page) + PAGE_SIZE - 1); +- for (pg_off = offset_in_page(start); +- pg_off < offset_in_page(end); +- pg_off += sectorsize, bio_offset += sectorsize) { +- u64 file_offset = pg_off + page_offset(page); +- int ret; +- +- if (btrfs_is_data_reloc_root(root) && +- test_range_bit(io_tree, file_offset, +- file_offset + sectorsize - 1, +- EXTENT_NODATASUM, 1, NULL)) { +- /* Skip the range without csum for data reloc inode */ +- clear_extent_bits(io_tree, file_offset, +- file_offset + sectorsize - 1, +- EXTENT_NODATASUM); +- continue; +- } +- ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); +- if (ret < 0) { +- const int nr_bit = (pg_off - offset_in_page(start)) >> +- root->fs_info->sectorsize_bits; +- +- result |= (1U << nr_bit); +- } +- } +- return result; ++ btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected, ++ bbio->mirror_num); ++ if (dev) ++ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); ++ memzero_bvec(bv); ++ return false; + } + + /* +@@ -4987,7 +4834,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, + unlock_extent(io_tree, block_start, block_end, &cached_state); + unlock_page(page); + put_page(page); +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + goto again; + } +@@ -5466,8 +5313,6 @@ void btrfs_evict_inode(struct inode *inode) + if (is_bad_inode(inode)) + goto no_delete; + +- btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); +- + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + goto no_delete; + +@@ -7392,7 +7237,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, + */ + if (writing || + test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + else + ret = nowait ? -EAGAIN : -ENOTBLK; + btrfs_put_ordered_extent(ordered); +@@ -7833,10 +7678,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + iomap->offset = start; + iomap->bdev = fs_info->fs_devices->latest_dev->bdev; + iomap->length = len; +- +- if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) +- iomap->flags |= IOMAP_F_ZONE_APPEND; +- + free_extent_map(em); + + return 0; +@@ -7888,267 +7729,47 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + return ret; + } + +-static void btrfs_dio_private_put(struct btrfs_dio_private *dip) +-{ +- /* +- * This implies a barrier so that stores to dio_bio->bi_status before +- * this and loads of dio_bio->bi_status after this are fully ordered. 
+- */ +- if (!refcount_dec_and_test(&dip->refs)) +- return; +- +- if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { +- btrfs_mark_ordered_io_finished(dip->inode, NULL, +- dip->file_offset, dip->bytes, +- !dip->bio.bi_status); +- } else { +- unlock_extent(&dip->inode->io_tree, +- dip->file_offset, +- dip->file_offset + dip->bytes - 1, NULL); +- } +- +- kfree(dip->csums); +- bio_endio(&dip->bio); +-} +- +-void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) +-{ +- struct btrfs_dio_private *dip = btrfs_bio(bio)->private; +- +- BUG_ON(bio_op(bio) == REQ_OP_WRITE); +- +- refcount_inc(&dip->refs); +- btrfs_submit_bio(inode->root->fs_info, bio, mirror_num); +-} +- +-static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, +- struct btrfs_bio *bbio, +- const bool uptodate) +-{ +- struct inode *inode = &dip->inode->vfs_inode; +- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; +- const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); +- blk_status_t err = BLK_STS_OK; +- struct bvec_iter iter; +- struct bio_vec bv; +- u32 offset; +- +- btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { +- u64 start = bbio->file_offset + offset; +- +- if (uptodate && +- (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset, +- bv.bv_page, bv.bv_offset))) { +- btrfs_clean_io_failure(BTRFS_I(inode), start, +- bv.bv_page, bv.bv_offset); +- } else { +- int ret; +- +- ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, +- bv.bv_page, bv.bv_offset, false); +- if (ret) +- err = errno_to_blk_status(ret); +- } +- } +- +- return err; +-} +- +-blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, +- struct bio *bio, +- u64 dio_file_offset) ++static void btrfs_dio_end_io(struct btrfs_bio *bbio) + { +- return btrfs_csum_one_bio(inode, bio, dio_file_offset, false); +-} +- +-static void btrfs_end_dio_bio(struct btrfs_bio *bbio) +-{ +- struct btrfs_dio_private *dip = bbio->private; ++ struct btrfs_dio_private *dip = ++ container_of(bbio, struct btrfs_dio_private, bbio); ++ struct btrfs_inode *inode = bbio->inode; + struct bio *bio = &bbio->bio; +- blk_status_t err = bio->bi_status; +- +- if (err) +- btrfs_warn(dip->inode->root->fs_info, +- "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", +- btrfs_ino(dip->inode), bio_op(bio), +- bio->bi_opf, bio->bi_iter.bi_sector, +- bio->bi_iter.bi_size, err); +- +- if (bio_op(bio) == REQ_OP_READ) +- err = btrfs_check_read_dio_bio(dip, bbio, !err); +- +- if (err) +- dip->bio.bi_status = err; +- +- btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio); +- +- bio_put(bio); +- btrfs_dio_private_put(dip); +-} + +-static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, +- u64 file_offset, int async_submit) +-{ +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct btrfs_dio_private *dip = btrfs_bio(bio)->private; +- blk_status_t ret; +- +- /* Save the original iter for read repair */ +- if (btrfs_op(bio) == BTRFS_MAP_READ) +- btrfs_bio(bio)->iter = bio->bi_iter; +- +- if (inode->flags & BTRFS_INODE_NODATASUM) +- goto map; ++ if (bio->bi_status) { ++ btrfs_warn(inode->root->fs_info, ++ "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", ++ btrfs_ino(inode), bio->bi_opf, ++ dip->file_offset, dip->bytes, bio->bi_status); ++ } + +- if (btrfs_op(bio) == BTRFS_MAP_WRITE) { +- /* Check btrfs_submit_data_write_bio() for async submit rules */ +- if (async_submit && !atomic_read(&inode->sync_writers) 
&& +- btrfs_wq_submit_bio(inode, bio, 0, file_offset, +- WQ_SUBMIT_DATA_DIO)) +- return; ++ if (btrfs_op(bio) == BTRFS_MAP_WRITE) ++ btrfs_mark_ordered_io_finished(inode, NULL, dip->file_offset, ++ dip->bytes, !bio->bi_status); ++ else ++ unlock_extent(&inode->io_tree, dip->file_offset, ++ dip->file_offset + dip->bytes - 1, NULL); + +- /* +- * If we aren't doing async submit, calculate the csum of the +- * bio now. +- */ +- ret = btrfs_csum_one_bio(inode, bio, file_offset, false); +- if (ret) { +- btrfs_bio_end_io(btrfs_bio(bio), ret); +- return; +- } +- } else { +- btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, +- file_offset - dip->file_offset); +- } +-map: +- btrfs_submit_bio(fs_info, bio, 0); ++ bbio->bio.bi_private = bbio->private; ++ iomap_dio_bio_end_io(bio); + } + +-static void btrfs_submit_direct(const struct iomap_iter *iter, +- struct bio *dio_bio, loff_t file_offset) ++static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, ++ loff_t file_offset) + { ++ struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_dio_private *dip = +- container_of(dio_bio, struct btrfs_dio_private, bio); +- struct inode *inode = iter->inode; +- const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); +- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +- const bool raid56 = (btrfs_data_alloc_profile(fs_info) & +- BTRFS_BLOCK_GROUP_RAID56_MASK); +- struct bio *bio; +- u64 start_sector; +- int async_submit = 0; +- u64 submit_len; +- u64 clone_offset = 0; +- u64 clone_len; +- u64 logical; +- int ret; +- blk_status_t status; +- struct btrfs_io_geometry geom; ++ container_of(bbio, struct btrfs_dio_private, bbio); + struct btrfs_dio_data *dio_data = iter->private; +- struct extent_map *em = NULL; +- +- dip->inode = BTRFS_I(inode); +- dip->file_offset = file_offset; +- dip->bytes = dio_bio->bi_iter.bi_size; +- refcount_set(&dip->refs, 1); +- dip->csums = NULL; +- +- if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { +- unsigned int nr_sectors = +- (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits); +- +- /* +- * Load the csums up front to reduce csum tree searches and +- * contention when submitting bios. +- */ +- status = BLK_STS_RESOURCE; +- dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); +- if (!dip->csums) +- goto out_err; +- +- status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); +- if (status != BLK_STS_OK) +- goto out_err; +- } +- +- start_sector = dio_bio->bi_iter.bi_sector; +- submit_len = dio_bio->bi_iter.bi_size; +- +- do { +- logical = start_sector << 9; +- em = btrfs_get_chunk_map(fs_info, logical, submit_len); +- if (IS_ERR(em)) { +- status = errno_to_blk_status(PTR_ERR(em)); +- em = NULL; +- goto out_err_em; +- } +- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), +- logical, &geom); +- if (ret) { +- status = errno_to_blk_status(ret); +- goto out_err_em; +- } + +- clone_len = min(submit_len, geom.len); +- ASSERT(clone_len <= UINT_MAX); ++ btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); ++ bbio->file_offset = file_offset; + +- /* +- * This will never fail as it's passing GPF_NOFS and +- * the allocation is backed by btrfs_bioset. 
+- */ +- bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len, +- btrfs_end_dio_bio, dip); +- btrfs_bio(bio)->file_offset = file_offset; +- +- if (bio_op(bio) == REQ_OP_ZONE_APPEND) { +- status = extract_ordered_extent(BTRFS_I(inode), bio, +- file_offset); +- if (status) { +- bio_put(bio); +- goto out_err; +- } +- } +- +- ASSERT(submit_len >= clone_len); +- submit_len -= clone_len; +- +- /* +- * Increase the count before we submit the bio so we know +- * the end IO handler won't happen before we increase the +- * count. Otherwise, the dip might get freed before we're +- * done setting it up. +- * +- * We transfer the initial reference to the last bio, so we +- * don't need to increment the reference count for the last one. +- */ +- if (submit_len > 0) { +- refcount_inc(&dip->refs); +- /* +- * If we are submitting more than one bio, submit them +- * all asynchronously. The exception is RAID 5 or 6, as +- * asynchronous checksums make it difficult to collect +- * full stripe writes. +- */ +- if (!raid56) +- async_submit = 1; +- } +- +- btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); +- +- dio_data->submitted += clone_len; +- clone_offset += clone_len; +- start_sector += clone_len >> 9; +- file_offset += clone_len; +- +- free_extent_map(em); +- } while (submit_len > 0); +- return; ++ dip->file_offset = file_offset; ++ dip->bytes = bio->bi_iter.bi_size; + +-out_err_em: +- free_extent_map(em); +-out_err: +- dio_bio->bi_status = status; +- btrfs_dio_private_put(dip); ++ dio_data->submitted += bio->bi_iter.bi_size; ++ btrfs_submit_bio(bio, 0); + } + + static const struct iomap_ops btrfs_dio_iomap_ops = { +@@ -8157,7 +7778,7 @@ static const struct iomap_ops btrfs_dio_iomap_ops = { + }; + + static const struct iomap_dio_ops btrfs_dio_ops = { +- .submit_io = btrfs_submit_direct, ++ .submit_io = btrfs_dio_submit_io, + .bio_set = &btrfs_dio_bioset, + }; + +@@ -8552,7 +8173,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) + unlock_extent(io_tree, page_start, page_end, &cached_state); + unlock_page(page); + up_read(&BTRFS_I(inode)->i_mmap_lock); +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + goto again; + } +@@ -8850,7 +8471,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) + ei->last_log_commit = 0; + + spin_lock_init(&ei->lock); +- spin_lock_init(&ei->io_failure_lock); + ei->outstanding_extents = 0; + if (sb->s_magic != BTRFS_TEST_MAGIC) + btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, +@@ -8870,7 +8490,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) + ei->io_tree.inode = ei; + extent_io_tree_init(fs_info, &ei->file_extent_tree, + IO_TREE_INODE_FILE_EXTENT); +- ei->io_failure_tree = RB_ROOT; + atomic_set(&ei->sync_writers, 0); + mutex_init(&ei->log_mutex); + btrfs_ordered_inode_tree_init(&ei->ordered_tree); +@@ -8994,7 +8613,7 @@ int __init btrfs_init_cachep(void) + goto fail; + + if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, +- offsetof(struct btrfs_dio_private, bio), ++ offsetof(struct btrfs_dio_private, bbio.bio), + BIOSET_NEED_BVECS)) + goto fail; + +@@ -10289,65 +9908,13 @@ struct btrfs_encoded_read_private { + wait_queue_head_t wait; + atomic_t pending; + blk_status_t status; +- bool skip_csum; + }; + +-static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, +- struct bio *bio, int mirror_num) +-{ +- struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private; +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- 
blk_status_t ret; +- +- if (!priv->skip_csum) { +- ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); +- if (ret) +- return ret; +- } +- +- atomic_inc(&priv->pending); +- btrfs_submit_bio(fs_info, bio, mirror_num); +- return BLK_STS_OK; +-} +- +-static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) +-{ +- const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); +- struct btrfs_encoded_read_private *priv = bbio->private; +- struct btrfs_inode *inode = priv->inode; +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- u32 sectorsize = fs_info->sectorsize; +- struct bio_vec *bvec; +- struct bvec_iter_all iter_all; +- u32 bio_offset = 0; +- +- if (priv->skip_csum || !uptodate) +- return bbio->bio.bi_status; +- +- bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { +- unsigned int i, nr_sectors, pgoff; +- +- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); +- pgoff = bvec->bv_offset; +- for (i = 0; i < nr_sectors; i++) { +- ASSERT(pgoff < PAGE_SIZE); +- if (btrfs_check_data_csum(inode, bbio, bio_offset, +- bvec->bv_page, pgoff)) +- return BLK_STS_IOERR; +- bio_offset += sectorsize; +- pgoff += sectorsize; +- } +- } +- return BLK_STS_OK; +-} +- + static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) + { + struct btrfs_encoded_read_private *priv = bbio->private; +- blk_status_t status; + +- status = btrfs_encoded_read_verify_csum(bbio); +- if (status) { ++ if (bbio->bio.bi_status) { + /* + * The memory barrier implied by the atomic_dec_return() here + * pairs with the memory barrier implied by the +@@ -10356,11 +9923,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) + * write is observed before the load of status in + * btrfs_encoded_read_regular_fill_pages(). + */ +- WRITE_ONCE(priv->status, status); ++ WRITE_ONCE(priv->status, bbio->bio.bi_status); + } + if (!atomic_dec_return(&priv->pending)) + wake_up(&priv->wait); +- btrfs_bio_free_csum(bbio); + bio_put(&bbio->bio); + } + +@@ -10368,47 +9934,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, struct page **pages) + { +- struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_encoded_read_private priv = { + .inode = inode, + .file_offset = file_offset, + .pending = ATOMIC_INIT(1), +- .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM), + }; + unsigned long i = 0; + u64 cur = 0; +- int ret; + + init_waitqueue_head(&priv.wait); +- /* +- * Submit bios for the extent, splitting due to bio or stripe limits as +- * necessary. +- */ ++ /* Submit bios for the extent, splitting due to bio limits as necessary. 
*/ + while (cur < disk_io_size) { +- struct extent_map *em; +- struct btrfs_io_geometry geom; + struct bio *bio = NULL; +- u64 remaining; ++ u64 remaining = disk_io_size - cur; + +- em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur, +- disk_io_size - cur); +- if (IS_ERR(em)) { +- ret = PTR_ERR(em); +- } else { +- ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ, +- disk_bytenr + cur, &geom); +- free_extent_map(em); +- } +- if (ret) { +- WRITE_ONCE(priv.status, errno_to_blk_status(ret)); +- break; +- } +- remaining = min(geom.len, disk_io_size - cur); + while (bio || remaining) { + size_t bytes = min_t(u64, remaining, PAGE_SIZE); + + if (!bio) { + bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, ++ inode, + btrfs_encoded_read_endio, + &priv); + bio->bi_iter.bi_sector = +@@ -10417,14 +9962,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + + if (!bytes || + bio_add_page(bio, pages[i], bytes, 0) < bytes) { +- blk_status_t status; +- +- status = submit_encoded_read_bio(inode, bio, 0); +- if (status) { +- WRITE_ONCE(priv.status, status); +- bio_put(bio); +- goto out; +- } ++ atomic_inc(&priv.pending); ++ btrfs_submit_bio(bio, 0); + bio = NULL; + continue; + } +@@ -10435,7 +9974,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + } + } + +-out: + if (atomic_dec_return(&priv.pending)) + io_wait_event(priv.wait, !atomic_read(&priv.pending)); + /* See btrfs_encoded_read_endio() for ordering. */ +@@ -10995,9 +10533,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, + return 0; + + max_pages = sis->max - bsi->nr_pages; +- first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; +- next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, +- PAGE_SIZE) >> PAGE_SHIFT; ++ first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; ++ next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; + + if (first_ppage >= next_ppage) + return 0; +diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c +index 7e348bd2ccde..8ea557e22252 100644 +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -707,7 +707,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, + * exists). + */ + btrfs_tree_lock(leaf); +- btrfs_clean_tree_block(leaf); ++ btrfs_clear_buffer_dirty(trans, leaf); + btrfs_tree_unlock(leaf); + btrfs_free_tree_block(trans, objectid, leaf, 0, 1); + free_extent_buffer(leaf); +diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c +new file mode 100644 +index 000000000000..0fe0ae54ac67 +--- /dev/null ++++ b/fs/btrfs/lru_cache.c +@@ -0,0 +1,166 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++#include "lru_cache.h" ++#include "messages.h" ++ ++/* ++ * Initialize a cache object. ++ * ++ * @cache: The cache. ++ * @max_size: Maximum size (number of entries) for the cache. ++ * Use 0 for unlimited size, it's the user's responsability to ++ * trim the cache in that case. ++ */ ++void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size) ++{ ++ INIT_LIST_HEAD(&cache->lru_list); ++ mt_init(&cache->entries); ++ cache->size = 0; ++ cache->max_size = max_size; ++} ++ ++static struct btrfs_lru_cache_entry *match_entry(struct list_head *head, u64 key, ++ u64 gen) ++{ ++ struct btrfs_lru_cache_entry *entry; ++ ++ list_for_each_entry(entry, head, list) { ++ if (entry->key == key && entry->gen == gen) ++ return entry; ++ } ++ ++ return NULL; ++} ++ ++/* ++ * Lookup for an entry in the cache. ++ * ++ * @cache: The cache. ++ * @key: The key of the entry we are looking for. 
++ * @gen: Generation associated to the key. ++ * ++ * Returns the entry associated with the key or NULL if none found. ++ */ ++struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, ++ u64 key, u64 gen) ++{ ++ struct list_head *head; ++ struct btrfs_lru_cache_entry *entry; ++ ++ head = mtree_load(&cache->entries, key); ++ if (!head) ++ return NULL; ++ ++ entry = match_entry(head, key, gen); ++ if (entry) ++ list_move_tail(&entry->lru_list, &cache->lru_list); ++ ++ return entry; ++} ++ ++/* ++ * Remove an entry from the cache. ++ * ++ * @cache: The cache to remove from. ++ * @entry: The entry to remove from the cache. ++ * ++ * Note: this also frees the memory used by the entry. ++ */ ++void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, ++ struct btrfs_lru_cache_entry *entry) ++{ ++ struct list_head *prev = entry->list.prev; ++ ++ ASSERT(cache->size > 0); ++ ASSERT(!mtree_empty(&cache->entries)); ++ ++ list_del(&entry->list); ++ list_del(&entry->lru_list); ++ ++ if (list_empty(prev)) { ++ struct list_head *head; ++ ++ /* ++ * If previous element in the list entry->list is now empty, it ++ * means it's a head entry not pointing to any cached entries, ++ * so remove it from the maple tree and free it. ++ */ ++ head = mtree_erase(&cache->entries, entry->key); ++ ASSERT(head == prev); ++ kfree(head); ++ } ++ ++ kfree(entry); ++ cache->size--; ++} ++ ++/* ++ * Store an entry in the cache. ++ * ++ * @cache: The cache. ++ * @entry: The entry to store. ++ * ++ * Returns 0 on success and < 0 on error. ++ */ ++int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, ++ struct btrfs_lru_cache_entry *new_entry, ++ gfp_t gfp) ++{ ++ const u64 key = new_entry->key; ++ struct list_head *head; ++ int ret; ++ ++ head = kmalloc(sizeof(*head), gfp); ++ if (!head) ++ return -ENOMEM; ++ ++ ret = mtree_insert(&cache->entries, key, head, gfp); ++ if (ret == 0) { ++ INIT_LIST_HEAD(head); ++ list_add_tail(&new_entry->list, head); ++ } else if (ret == -EEXIST) { ++ kfree(head); ++ head = mtree_load(&cache->entries, key); ++ ASSERT(head != NULL); ++ if (match_entry(head, key, new_entry->gen) != NULL) ++ return -EEXIST; ++ list_add_tail(&new_entry->list, head); ++ } else if (ret < 0) { ++ kfree(head); ++ return ret; ++ } ++ ++ if (cache->max_size > 0 && cache->size == cache->max_size) { ++ struct btrfs_lru_cache_entry *lru_entry; ++ ++ lru_entry = list_first_entry(&cache->lru_list, ++ struct btrfs_lru_cache_entry, ++ lru_list); ++ btrfs_lru_cache_remove(cache, lru_entry); ++ } ++ ++ list_add_tail(&new_entry->lru_list, &cache->lru_list); ++ cache->size++; ++ ++ return 0; ++} ++ ++/* ++ * Empty a cache. ++ * ++ * @cache: The cache to empty. ++ * ++ * Removes all entries from the cache. ++ */ ++void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache) ++{ ++ struct btrfs_lru_cache_entry *entry; ++ struct btrfs_lru_cache_entry *tmp; ++ ++ list_for_each_entry_safe(entry, tmp, &cache->lru_list, lru_list) ++ btrfs_lru_cache_remove(cache, entry); ++ ++ ASSERT(cache->size == 0); ++ ASSERT(mtree_empty(&cache->entries)); ++} +diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h +new file mode 100644 +index 000000000000..de3e18bce24a +--- /dev/null ++++ b/fs/btrfs/lru_cache.h +@@ -0,0 +1,80 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef BTRFS_LRU_CACHE_H ++#define BTRFS_LRU_CACHE_H ++ ++#include ++#include ++ ++/* ++ * A cache entry. This is meant to be embedded in a structure of a user of ++ * this module. Similar to how struct list_head and struct rb_node are used. 
++ * ++ * Note: it should be embedded as the first element in a struct (offset 0), and ++ * this module assumes it was allocated with kmalloc(), so it calls kfree() when ++ * it needs to free an entry. ++ */ ++struct btrfs_lru_cache_entry { ++ struct list_head lru_list; ++ u64 key; ++ /* ++ * Optional generation associated to a key. Use 0 if not needed/used. ++ * Entries with the same key and different generations are stored in a ++ * linked list, so use this only for cases where there's a small number ++ * of different generations. ++ */ ++ u64 gen; ++ /* ++ * The maple tree uses unsigned long type for the keys, which is 32 bits ++ * on 32 bits systems, and 64 bits on 64 bits systems. So if we want to ++ * use something like inode numbers as keys, which are always a u64, we ++ * have to deal with this in a special way - we store the key in the ++ * entry itself, as a u64, and the values inserted into the maple tree ++ * are linked lists of entries - so in case we are on a 64 bits system, ++ * that list always has a single entry, while on 32 bits systems it ++ * may have more than one, with each entry having the same value for ++ * their lower 32 bits of the u64 key. ++ */ ++ struct list_head list; ++}; ++ ++struct btrfs_lru_cache { ++ struct list_head lru_list; ++ struct maple_tree entries; ++ /* Number of entries stored in the cache. */ ++ unsigned int size; ++ /* Maximum number of entries the cache can have. */ ++ unsigned int max_size; ++}; ++ ++#define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \ ++ list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list) ++ ++static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache) ++{ ++ return cache->size; ++} ++ ++static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache) ++{ ++ return cache->size >= cache->max_size; ++} ++ ++static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( ++ struct btrfs_lru_cache *cache) ++{ ++ return list_first_entry_or_null(&cache->lru_list, ++ struct btrfs_lru_cache_entry, lru_list); ++} ++ ++void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size); ++struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, ++ u64 key, u64 gen); ++int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, ++ struct btrfs_lru_cache_entry *new_entry, ++ gfp_t gfp); ++void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, ++ struct btrfs_lru_cache_entry *entry); ++void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache); ++ ++#endif +diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c +index d5e78cbc8fbc..71f6d8302d50 100644 +--- a/fs/btrfs/lzo.c ++++ b/fs/btrfs/lzo.c +@@ -280,7 +280,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, + } + + /* Check if we have reached page boundary */ +- if (IS_ALIGNED(cur_in, PAGE_SIZE)) { ++ if (PAGE_ALIGNED(cur_in)) { + put_page(page_in); + page_in = NULL; + } +diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c +index 625bbbbb2608..fde5aaa6e7c9 100644 +--- a/fs/btrfs/messages.c ++++ b/fs/btrfs/messages.c +@@ -292,36 +292,6 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) + } + #endif + +-/* +- * We only mark the transaction aborted and then set the file system read-only. +- * This will prevent new transactions from starting or trying to join this +- * one. 
+- * +- * This means that error recovery at the call site is limited to freeing +- * any local memory allocations and passing the error code up without +- * further cleanup. The transaction should complete as it normally would +- * in the call path but will return -EIO. +- * +- * We'll complete the cleanup in btrfs_end_transaction and +- * btrfs_commit_transaction. +- */ +-__cold +-void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, +- const char *function, +- unsigned int line, int errno, bool first_hit) +-{ +- struct btrfs_fs_info *fs_info = trans->fs_info; +- +- WRITE_ONCE(trans->aborted, errno); +- WRITE_ONCE(trans->transaction->aborted, errno); +- if (first_hit && errno == -ENOSPC) +- btrfs_dump_space_info_for_trans_abort(fs_info); +- /* Wake up anybody who may be waiting on this transaction */ +- wake_up(&fs_info->transaction_wait); +- wake_up(&fs_info->transaction_blocked_wait); +- __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); +-} +- + /* + * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an + * alert, and either panics or BUGs, depending on mount options. +diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h +index 190af1f698d9..8c516ee58ff9 100644 +--- a/fs/btrfs/messages.h ++++ b/fs/btrfs/messages.h +@@ -6,7 +6,6 @@ + #include + + struct btrfs_fs_info; +-struct btrfs_trans_handle; + + static inline __printf(2, 3) __cold + void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +@@ -178,39 +177,6 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function + + const char * __attribute_const__ btrfs_decode_error(int errno); + +-__cold +-void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, +- const char *function, +- unsigned int line, int errno, bool first_hit); +- +-bool __cold abort_should_print_stack(int errno); +- +-/* +- * Call btrfs_abort_transaction as early as possible when an error condition is +- * detected, that way the exact stack trace is reported for some errors. +- */ +-#define btrfs_abort_transaction(trans, errno) \ +-do { \ +- bool first = false; \ +- /* Report first abort since mount */ \ +- if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ +- &((trans)->fs_info->fs_state))) { \ +- first = true; \ +- if (WARN(abort_should_print_stack(errno), \ +- KERN_ERR \ +- "BTRFS: Transaction aborted (error %d)\n", \ +- (errno))) { \ +- /* Stack trace printed. */ \ +- } else { \ +- btrfs_err((trans)->fs_info, \ +- "Transaction aborted (error %d)", \ +- (errno)); \ +- } \ +- } \ +- __btrfs_abort_transaction((trans), __func__, \ +- __LINE__, (errno), first); \ +-} while (0) +- + #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args) +diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c +index 57d8c72737e1..6c24b69e2d0a 100644 +--- a/fs/btrfs/ordered-data.c ++++ b/fs/btrfs/ordered-data.c +@@ -616,7 +616,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) + struct btrfs_ordered_extent *ordered; + + ordered = container_of(work, struct btrfs_ordered_extent, flush_work); +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + complete(&ordered->completion); + } + +@@ -716,13 +716,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, + } + + /* +- * Used to start IO or wait for a given ordered extent to finish. ++ * Start IO and wait for a given ordered extent to finish. 
+ * +- * If wait is one, this effectively waits on page writeback for all the pages +- * in the extent, and it waits on the io completion code to insert +- * metadata into the btree corresponding to the extent ++ * Wait on page writeback for all the pages in the extent and the IO completion ++ * code to insert metadata into the btree corresponding to the extent. + */ +-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) ++void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) + { + u64 start = entry->file_offset; + u64 end = start + entry->num_bytes - 1; +@@ -744,12 +743,10 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) + */ + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); +- if (wait) { +- if (!freespace_inode) +- btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); +- wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, +- &entry->flags)); +- } ++ ++ if (!freespace_inode) ++ btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); ++ wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); + } + + /* +@@ -800,7 +797,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) + btrfs_put_ordered_extent(ordered); + break; + } +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + end = ordered->file_offset; + /* + * If the ordered extent had an error save the error but don't +@@ -1061,7 +1058,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, + break; + } + unlock_extent(&inode->io_tree, start, end, cachedp); +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + } +diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h +index 89f82b78f590..eb40cb39f842 100644 +--- a/fs/btrfs/ordered-data.h ++++ b/fs/btrfs/ordered-data.h +@@ -157,7 +157,6 @@ struct btrfs_ordered_extent { + * command in a workqueue context + */ + u64 physical; +- struct block_device *bdev; + }; + + static inline void +@@ -187,7 +186,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum); + struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, + u64 file_offset); +-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); ++void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); + int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); + struct btrfs_ordered_extent * + btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); +diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c +index af97413abcf4..52a7d2fa2284 100644 +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -1304,7 +1304,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) + list_del("a_root->dirty_list); + + btrfs_tree_lock(quota_root->node); +- btrfs_clean_tree_block(quota_root->node); ++ btrfs_clear_buffer_dirty(trans, quota_root->node); + btrfs_tree_unlock(quota_root->node); + btrfs_free_tree_block(trans, btrfs_root_id(quota_root), + quota_root->node, 0, 1); +diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c +index ff4b1d583788..642828c1b299 100644 +--- a/fs/btrfs/raid56.c ++++ b/fs/btrfs/raid56.c +@@ -998,7 +998,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) + } + + /* +- * Return the total numer of errors found in the vertical stripe 
of @sector_nr. ++ * Return the total number of errors found in the vertical stripe of @sector_nr. + * + * @faila and @failb will also be updated to the first and second stripe + * number of the errors. +@@ -1183,7 +1183,15 @@ static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, + trace_info->stripe_nr = -1; + } + +-/* Generate PQ for one veritical stripe. */ ++static inline void bio_list_put(struct bio_list *bio_list) ++{ ++ struct bio *bio; ++ ++ while ((bio = bio_list_pop(bio_list))) ++ bio_put(bio); ++} ++ ++/* Generate PQ for one vertical stripe. */ + static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) + { + void **pointers = rbio->finish_pointers; +@@ -1228,7 +1236,6 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) + static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) + { +- struct bio *bio; + /* The total sector number inside the full stripe. */ + int total_sector_nr; + int sectornr; +@@ -1317,8 +1324,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, + + return 0; + error: +- while ((bio = bio_list_pop(bio_list))) +- bio_put(bio); ++ bio_list_put(bio_list); + return -EIO; + } + +@@ -1357,7 +1363,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) + } + + /* +- * For subpage case, we can no longer set page Uptodate directly for ++ * For subpage case, we can no longer set page Up-to-date directly for + * stripe_pages[], thus we need to locate the sector. + */ + static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, +@@ -1425,10 +1431,9 @@ static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bi + int total_sector_nr = get_bio_sector_nr(rbio, bio); + u32 bio_size = 0; + struct bio_vec *bvec; +- struct bvec_iter_all iter_all; + int i; + +- bio_for_each_segment_all(bvec, bio, iter_all) ++ bio_for_each_bvec_all(bvec, bio, i) + bio_size += bvec->bv_len; + + /* +@@ -1498,7 +1503,7 @@ static void raid_wait_read_end_io(struct bio *bio) + wake_up(&rbio->io_wait); + } + +-static void submit_read_bios(struct btrfs_raid_bio *rbio, ++static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) + { + struct bio *bio; +@@ -1515,41 +1520,8 @@ static void submit_read_bios(struct btrfs_raid_bio *rbio, + } + submit_bio(bio); + } +-} +- +-static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, +- struct bio_list *bio_list) +-{ +- struct bio *bio; +- int total_sector_nr; +- int ret = 0; +- +- ASSERT(bio_list_size(bio_list) == 0); +- +- /* +- * Build a list of bios to read all sectors (including data and P/Q). +- * +- * This behaviro is to compensate the later csum verification and +- * recovery. 
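One small change in this hunk that is easy to miss: rbio_update_error_bitmap() now sums the bio's size with bio_for_each_bvec_all() (walking the raw bio_vec array) instead of bio_for_each_segment_all() (which splits every bio_vec into single-page segments). The byte total is identical either way, so the cheaper raw walk is enough here. A hedged sketch of the new iteration, lifted out as a stand-alone helper purely for clarity (the helper name is made up):

static u32 sketch_bio_payload_size(struct bio *bio)
{
    struct bio_vec *bvec;
    u32 size = 0;
    int i;

    /* Walk the raw bio_vec array; no per-page splitting is needed here. */
    bio_for_each_bvec_all(bvec, bio, i)
        size += bvec->bv_len;

    return size;
}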
+- */ +- for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; +- total_sector_nr++) { +- struct sector_ptr *sector; +- int stripe = total_sector_nr / rbio->stripe_nsectors; +- int sectornr = total_sector_nr % rbio->stripe_nsectors; +- +- sector = rbio_stripe_sector(rbio, stripe, sectornr); +- ret = rbio_add_io_sector(rbio, bio_list, sector, +- stripe, sectornr, REQ_OP_READ); +- if (ret) +- goto cleanup; +- } +- return 0; + +-cleanup: +- while ((bio = bio_list_pop(bio_list))) +- bio_put(bio); +- return ret; ++ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + } + + static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) +@@ -1668,12 +1640,12 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) + struct btrfs_raid_bio *rbio; + struct btrfs_plug_cb *plug = NULL; + struct blk_plug_cb *cb; +- int ret = 0; + + rbio = alloc_rbio(fs_info, bioc); + if (IS_ERR(rbio)) { +- ret = PTR_ERR(rbio); +- goto fail; ++ bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); ++ bio_endio(bio); ++ return; + } + rbio->operation = BTRFS_RBIO_WRITE; + rbio_add_bio(rbio, bio); +@@ -1682,31 +1654,24 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) + * Don't plug on full rbios, just get them out the door + * as quickly as we can + */ +- if (rbio_is_full(rbio)) +- goto queue_rbio; +- +- cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); +- if (cb) { +- plug = container_of(cb, struct btrfs_plug_cb, cb); +- if (!plug->info) { +- plug->info = fs_info; +- INIT_LIST_HEAD(&plug->rbio_list); ++ if (!rbio_is_full(rbio)) { ++ cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); ++ if (cb) { ++ plug = container_of(cb, struct btrfs_plug_cb, cb); ++ if (!plug->info) { ++ plug->info = fs_info; ++ INIT_LIST_HEAD(&plug->rbio_list); ++ } ++ list_add_tail(&rbio->plug_list, &plug->rbio_list); ++ return; + } +- list_add_tail(&rbio->plug_list, &plug->rbio_list); +- return; + } +-queue_rbio: ++ + /* + * Either we don't have any existing plug, or we're doing a full stripe, +- * can queue the rmw work now. ++ * queue the rmw work now. + */ + start_async_work(rbio, rmw_rbio_work); +- +- return; +- +-fail: +- bio->bi_status = errno_to_blk_status(ret); +- bio_endio(bio); + } + + static int verify_one_sector(struct btrfs_raid_bio *rbio, +@@ -1773,7 +1738,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, + &failb); + /* +- * No errors in the veritical stripe, skip it. Can happen for recovery ++ * No errors in the vertical stripe, skip it. Can happen for recovery + * which only part of a stripe failed csum check. + */ + if (!found_errors) +@@ -1949,14 +1914,25 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) + return ret; + } + +-static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, +- struct bio_list *bio_list) ++static void recover_rbio(struct btrfs_raid_bio *rbio) + { +- struct bio *bio; ++ struct bio_list bio_list = BIO_EMPTY_LIST; + int total_sector_nr; + int ret = 0; + +- ASSERT(bio_list_size(bio_list) == 0); ++ /* ++ * Either we're doing recover for a read failure or degraded write, ++ * caller should have set error bitmap correctly. ++ */ ++ ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); ++ ++ /* For recovery, we need to read all sectors including P/Q. */ ++ ret = alloc_rbio_pages(rbio); ++ if (ret < 0) ++ goto out; ++ ++ index_rbio_pages(rbio); ++ + /* + * Read everything that hasn't failed. 
However this time we will + * not trust any cached sector. +@@ -1987,78 +1963,32 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, + } + + sector = rbio_stripe_sector(rbio, stripe, sectornr); +- ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, ++ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_READ); +- if (ret < 0) +- goto error; ++ if (ret < 0) { ++ bio_list_put(&bio_list); ++ goto out; ++ } + } +- return 0; +-error: +- while ((bio = bio_list_pop(bio_list))) +- bio_put(bio); +- +- return -EIO; +-} +- +-static int recover_rbio(struct btrfs_raid_bio *rbio) +-{ +- struct bio_list bio_list; +- struct bio *bio; +- int ret; +- +- /* +- * Either we're doing recover for a read failure or degraded write, +- * caller should have set error bitmap correctly. +- */ +- ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); +- bio_list_init(&bio_list); +- +- /* For recovery, we need to read all sectors including P/Q. */ +- ret = alloc_rbio_pages(rbio); +- if (ret < 0) +- goto out; +- +- index_rbio_pages(rbio); +- +- ret = recover_assemble_read_bios(rbio, &bio_list); +- if (ret < 0) +- goto out; +- +- submit_read_bios(rbio, &bio_list); +- wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + ++ submit_read_wait_bio_list(rbio, &bio_list); + ret = recover_sectors(rbio); +- + out: +- while ((bio = bio_list_pop(&bio_list))) +- bio_put(bio); +- +- return ret; ++ rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } + + static void recover_rbio_work(struct work_struct *work) + { + struct btrfs_raid_bio *rbio; +- int ret; + + rbio = container_of(work, struct btrfs_raid_bio, work); +- +- ret = lock_stripe_add(rbio); +- if (ret == 0) { +- ret = recover_rbio(rbio); +- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); +- } ++ if (!lock_stripe_add(rbio)) ++ recover_rbio(rbio); + } + + static void recover_rbio_work_locked(struct work_struct *work) + { +- struct btrfs_raid_bio *rbio; +- int ret; +- +- rbio = container_of(work, struct btrfs_raid_bio, work); +- +- ret = recover_rbio(rbio); +- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); ++ recover_rbio(container_of(work, struct btrfs_raid_bio, work)); + } + + static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) +@@ -2204,11 +2134,9 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) + + static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) + { +- struct bio_list bio_list; +- struct bio *bio; +- int ret; +- +- bio_list_init(&bio_list); ++ struct bio_list bio_list = BIO_EMPTY_LIST; ++ int total_sector_nr; ++ int ret = 0; + + /* + * Fill the data csums we need for data verification. We need to fill +@@ -2217,24 +2145,32 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) + */ + fill_data_csums(rbio); + +- ret = rmw_assemble_read_bios(rbio, &bio_list); +- if (ret < 0) +- goto out; ++ /* ++ * Build a list of bios to read all sectors (including data and P/Q). ++ * ++ * This behavior is to compensate the later csum verification and recovery. 
++ */ ++ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; ++ total_sector_nr++) { ++ struct sector_ptr *sector; ++ int stripe = total_sector_nr / rbio->stripe_nsectors; ++ int sectornr = total_sector_nr % rbio->stripe_nsectors; + +- submit_read_bios(rbio, &bio_list); +- wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); ++ sector = rbio_stripe_sector(rbio, stripe, sectornr); ++ ret = rbio_add_io_sector(rbio, &bio_list, sector, ++ stripe, sectornr, REQ_OP_READ); ++ if (ret) { ++ bio_list_put(&bio_list); ++ return ret; ++ } ++ } + + /* + * We may or may not have any corrupted sectors (including missing dev + * and csum mismatch), just let recover_sectors() to handle them all. + */ +- ret = recover_sectors(rbio); +- return ret; +-out: +- while ((bio = bio_list_pop(&bio_list))) +- bio_put(bio); +- +- return ret; ++ submit_read_wait_bio_list(rbio, &bio_list); ++ return recover_sectors(rbio); + } + + static void raid_wait_write_end_io(struct bio *bio) +@@ -2290,7 +2226,7 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) + return false; + } + +-static int rmw_rbio(struct btrfs_raid_bio *rbio) ++static void rmw_rbio(struct btrfs_raid_bio *rbio) + { + struct bio_list bio_list; + int sectornr; +@@ -2302,30 +2238,28 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) + */ + ret = alloc_rbio_parity_pages(rbio); + if (ret < 0) +- return ret; ++ goto out; + + /* + * Either full stripe write, or we have every data sector already + * cached, can go to write path immediately. + */ +- if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio)) +- goto write; +- +- /* +- * Now we're doing sub-stripe write, also need all data stripes to do +- * the full RMW. +- */ +- ret = alloc_rbio_data_pages(rbio); +- if (ret < 0) +- return ret; ++ if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { ++ /* ++ * Now we're doing sub-stripe write, also need all data stripes ++ * to do the full RMW. ++ */ ++ ret = alloc_rbio_data_pages(rbio); ++ if (ret < 0) ++ goto out; + +- index_rbio_pages(rbio); ++ index_rbio_pages(rbio); + +- ret = rmw_read_wait_recover(rbio); +- if (ret < 0) +- return ret; ++ ret = rmw_read_wait_recover(rbio); ++ if (ret < 0) ++ goto out; ++ } + +-write: + /* + * At this stage we're not allowed to add any new bios to the + * bio list any more, anyone else that wants to change this stripe +@@ -2356,7 +2290,7 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) + bio_list_init(&bio_list); + ret = rmw_assemble_write_bios(rbio, &bio_list); + if (ret < 0) +- return ret; ++ goto out; + + /* We should have at least one bio assembled. 
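The read loops in this area (rmw_read_wait_recover() above, and the recovery and scrub paths elsewhere in this patch) iterate a single flat total_sector_nr and split it into a (stripe, sectornr) pair with a division and a modulo. A tiny stand-alone program, using a made-up 3-stripe-by-4-sector geometry, shows the mapping:

#include <stdio.h>

int main(void)
{
    const int nr_stripes = 3;       /* e.g. 2 data stripes + 1 parity */
    const int stripe_nsectors = 4;  /* sectors per stripe */
    const int nr_sectors = nr_stripes * stripe_nsectors;
    int total_sector_nr;

    for (total_sector_nr = 0; total_sector_nr < nr_sectors; total_sector_nr++) {
        int stripe = total_sector_nr / stripe_nsectors;
        int sectornr = total_sector_nr % stripe_nsectors;

        printf("sector %2d -> stripe %d, sectornr %d\n",
               total_sector_nr, stripe, sectornr);
    }
    return 0;
}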
*/ + ASSERT(bio_list_size(&bio_list)); +@@ -2373,32 +2307,22 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) + break; + } + } +- return ret; ++out: ++ rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } + + static void rmw_rbio_work(struct work_struct *work) + { + struct btrfs_raid_bio *rbio; +- int ret; + + rbio = container_of(work, struct btrfs_raid_bio, work); +- +- ret = lock_stripe_add(rbio); +- if (ret == 0) { +- ret = rmw_rbio(rbio); +- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); +- } ++ if (lock_stripe_add(rbio) == 0) ++ rmw_rbio(rbio); + } + + static void rmw_rbio_work_locked(struct work_struct *work) + { +- struct btrfs_raid_bio *rbio; +- int ret; +- +- rbio = container_of(work, struct btrfs_raid_bio, work); +- +- ret = rmw_rbio(rbio); +- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); ++ rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); + } + + /* +@@ -2506,7 +2430,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) + struct sector_ptr p_sector = { 0 }; + struct sector_ptr q_sector = { 0 }; + struct bio_list bio_list; +- struct bio *bio; + int is_replace = 0; + int ret; + +@@ -2637,8 +2560,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) + return 0; + + cleanup: +- while ((bio = bio_list_pop(&bio_list))) +- bio_put(bio); ++ bio_list_put(&bio_list); + return ret; + } + +@@ -2733,15 +2655,12 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) + return ret; + } + +-static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, +- struct bio_list *bio_list) ++static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) + { +- struct bio *bio; ++ struct bio_list bio_list = BIO_EMPTY_LIST; + int total_sector_nr; + int ret = 0; + +- ASSERT(bio_list_size(bio_list) == 0); +- + /* Build a list of bios to read all the missing parts. */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { +@@ -2770,45 +2689,38 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, + if (sector->uptodate) + continue; + +- ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, ++ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_READ); +- if (ret) +- goto error; ++ if (ret) { ++ bio_list_put(&bio_list); ++ return ret; ++ } + } ++ ++ submit_read_wait_bio_list(rbio, &bio_list); + return 0; +-error: +- while ((bio = bio_list_pop(bio_list))) +- bio_put(bio); +- return ret; + } + +-static int scrub_rbio(struct btrfs_raid_bio *rbio) ++static void scrub_rbio(struct btrfs_raid_bio *rbio) + { + bool need_check = false; +- struct bio_list bio_list; + int sector_nr; + int ret; +- struct bio *bio; +- +- bio_list_init(&bio_list); + + ret = alloc_rbio_essential_pages(rbio); + if (ret) +- goto cleanup; ++ goto out; + + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); + +- ret = scrub_assemble_read_bios(rbio, &bio_list); ++ ret = scrub_assemble_read_bios(rbio); + if (ret < 0) +- goto cleanup; +- +- submit_read_bios(rbio, &bio_list); +- wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); ++ goto out; + + /* We may have some failures, recover the failed sectors first. */ + ret = recover_scrub_rbio(rbio); + if (ret < 0) +- goto cleanup; ++ goto out; + + /* + * We have every sector properly prepared. 
Can finish the scrub +@@ -2825,23 +2737,13 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) + break; + } + } +- return ret; +- +-cleanup: +- while ((bio = bio_list_pop(&bio_list))) +- bio_put(bio); +- +- return ret; ++out: ++ rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } + + static void scrub_rbio_work_locked(struct work_struct *work) + { +- struct btrfs_raid_bio *rbio; +- int ret; +- +- rbio = container_of(work, struct btrfs_raid_bio, work); +- ret = scrub_rbio(rbio); +- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); ++ scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); + } + + void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) +diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h +index 7c73a443939e..df0e0abdeb1f 100644 +--- a/fs/btrfs/raid56.h ++++ b/fs/btrfs/raid56.h +@@ -65,7 +65,7 @@ struct btrfs_raid_bio { + /* Number of data stripes (no p/q) */ + u8 nr_data; + +- /* Numer of all stripes (including P/Q) */ ++ /* Number of all stripes (including P/Q) */ + u8 real_stripes; + + /* How many pages there are for each stripe */ +@@ -132,7 +132,7 @@ struct btrfs_raid_bio { + + /* + * Checksum buffer if the rbio is for data. The buffer should cover +- * all data sectors (exlcuding P/Q sectors). ++ * all data sectors (excluding P/Q sectors). + */ + u8 *csum_buf; + +diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c +index 31ec4a7658ce..ef13a9d4e370 100644 +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -2825,7 +2825,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( + * + * Here we have to manually invalidate the range (i_size, PAGE_END + 1). + */ +- if (!IS_ALIGNED(i_size, PAGE_SIZE)) { ++ if (!PAGE_ALIGNED(i_size)) { + struct address_space *mapping = inode->vfs_inode.i_mapping; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; +diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c +index 52b346795f66..69c93ae333f6 100644 +--- a/fs/btrfs/scrub.c ++++ b/fs/btrfs/scrub.c +@@ -229,7 +229,7 @@ struct full_stripe_lock { + }; + + #ifndef CONFIG_64BIT +-/* This structure is for archtectures whose (void *) is smaller than u64 */ ++/* This structure is for architectures whose (void *) is smaller than u64 */ + struct scrub_page_private { + u64 logical; + }; +@@ -2053,20 +2053,33 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) + * a) don't have an extent buffer and + * b) the page is already kmapped + */ +- if (sblock->logical != btrfs_stack_header_bytenr(h)) ++ if (sblock->logical != btrfs_stack_header_bytenr(h)) { + sblock->header_error = 1; +- +- if (sector->generation != btrfs_stack_header_generation(h)) { +- sblock->header_error = 1; +- sblock->generation_error = 1; ++ btrfs_warn_rl(fs_info, ++ "tree block %llu mirror %u has bad bytenr, has %llu want %llu", ++ sblock->logical, sblock->mirror_num, ++ btrfs_stack_header_bytenr(h), ++ sblock->logical); ++ goto out; + } + +- if (!scrub_check_fsid(h->fsid, sector)) ++ if (!scrub_check_fsid(h->fsid, sector)) { + sblock->header_error = 1; ++ btrfs_warn_rl(fs_info, ++ "tree block %llu mirror %u has bad fsid, has %pU want %pU", ++ sblock->logical, sblock->mirror_num, ++ h->fsid, sblock->dev->fs_devices->fsid); ++ goto out; ++ } + +- if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, +- BTRFS_UUID_SIZE)) ++ if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { + sblock->header_error = 1; ++ btrfs_warn_rl(fs_info, ++ "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", ++ 
sblock->logical, sblock->mirror_num, ++ h->chunk_tree_uuid, fs_info->chunk_tree_uuid); ++ goto out; ++ } + + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); +@@ -2079,9 +2092,27 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) + } + + crypto_shash_final(shash, calculated_csum); +- if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) ++ if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) { + sblock->checksum_error = 1; ++ btrfs_warn_rl(fs_info, ++ "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, ++ sblock->logical, sblock->mirror_num, ++ CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), ++ CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); ++ goto out; ++ } ++ ++ if (sector->generation != btrfs_stack_header_generation(h)) { ++ sblock->header_error = 1; ++ sblock->generation_error = 1; ++ btrfs_warn_rl(fs_info, ++ "tree block %llu mirror %u has bad generation, has %llu want %llu", ++ sblock->logical, sblock->mirror_num, ++ btrfs_stack_header_generation(h), ++ sector->generation); ++ } + ++out: + return sblock->header_error || sblock->checksum_error; + } + +diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c +index d50182b6deec..e5c963bb873d 100644 +--- a/fs/btrfs/send.c ++++ b/fs/btrfs/send.c +@@ -32,6 +32,7 @@ + #include "file-item.h" + #include "ioctl.h" + #include "verity.h" ++#include "lru_cache.h" + + /* + * Maximum number of references an extent can have in order for us to attempt to +@@ -80,23 +81,23 @@ struct clone_root { + bool found_ref; + }; + +-#define SEND_CTX_MAX_NAME_CACHE_SIZE 128 +-#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) ++#define SEND_MAX_NAME_CACHE_SIZE 256 + + /* +- * Limit the root_ids array of struct backref_cache_entry to 12 elements. +- * This makes the size of a cache entry to be exactly 128 bytes on x86_64. ++ * Limit the root_ids array of struct backref_cache_entry to 17 elements. ++ * This makes the size of a cache entry to be exactly 192 bytes on x86_64, which ++ * can be satisfied from the kmalloc-192 slab, without wasting any space. + * The most common case is to have a single root for cloning, which corresponds +- * to the send root. Having the user specify more than 11 clone roots is not ++ * to the send root. Having the user specify more than 16 clone roots is not + * common, and in such rare cases we simply don't use caching if the number of +- * cloning roots that lead down to a leaf is more than 12. ++ * cloning roots that lead down to a leaf is more than 17. + */ +-#define SEND_MAX_BACKREF_CACHE_ROOTS 12 ++#define SEND_MAX_BACKREF_CACHE_ROOTS 17 + + /* + * Max number of entries in the cache. +- * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding +- * maple tree's internal nodes, is 16K. ++ * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding ++ * maple tree's internal nodes, is 24K. + */ + #define SEND_MAX_BACKREF_CACHE_SIZE 128 + +@@ -107,15 +108,31 @@ struct clone_root { + * x86_64). + */ + struct backref_cache_entry { +- /* List to link to the cache's lru list. */ +- struct list_head list; +- /* The key for this entry in the cache. */ +- u64 key; ++ struct btrfs_lru_cache_entry entry; + u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; + /* Number of valid elements in the root_ids array. */ + int num_roots; + }; + ++/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. 
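The static_assert just below this comment enforces the rule stated in lru_cache.h: the embedded btrfs_lru_cache_entry must sit at offset 0, because the cache calls kfree() on the entry pointer it holds, and that is only correct if the entry pointer equals the address returned by kmalloc() for the whole outer object. A small user-space demonstration of why offset 0 makes this safe (plain C with stand-in types; demo_entry is invented for illustration):

#include <assert.h>
#include <stddef.h>
#include <stdlib.h>

/* Minimal stand-ins for the kernel types, for illustration only. */
struct list_head { struct list_head *next, *prev; };

struct lru_cache_entry {
    struct list_head lru_list;
    unsigned long long key;
    unsigned long long gen;
    struct list_head list;
};

struct demo_entry {
    struct lru_cache_entry entry;    /* first member, so offset 0 */
    int payload;
};

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
    struct demo_entry *e = malloc(sizeof(*e));
    struct lru_cache_entry *raw;

    if (!e)
        return 1;
    raw = &e->entry;

    /* Offset 0 means the cache's entry pointer is the allocation pointer. */
    assert((void *)raw == (void *)e);
    assert(container_of(raw, struct demo_entry, entry) == e);

    free(raw);    /* releases the whole demo_entry, like kfree() in the cache */
    return 0;
}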
*/ ++static_assert(offsetof(struct backref_cache_entry, entry) == 0); ++ ++/* ++ * Max number of entries in the cache that stores directories that were already ++ * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses ++ * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but ++ * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). ++ */ ++#define SEND_MAX_DIR_CREATED_CACHE_SIZE 64 ++ ++/* ++ * Max number of entries in the cache that stores directories that were already ++ * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses ++ * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but ++ * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). ++ */ ++#define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64 ++ + struct send_ctx { + struct file *send_filp; + loff_t send_off; +@@ -174,9 +191,7 @@ struct send_ctx { + struct list_head new_refs; + struct list_head deleted_refs; + +- struct radix_tree_root name_cache; +- struct list_head name_cache_list; +- int name_cache_size; ++ struct btrfs_lru_cache name_cache; + + /* + * The inode we are currently processing. It's not NULL only when we +@@ -285,13 +300,11 @@ struct send_ctx { + struct rb_root rbtree_new_refs; + struct rb_root rbtree_deleted_refs; + +- struct { +- u64 last_reloc_trans; +- struct list_head lru_list; +- struct maple_tree entries; +- /* Number of entries stored in the cache. */ +- int size; +- } backref_cache; ++ struct btrfs_lru_cache backref_cache; ++ u64 backref_cache_last_reloc_trans; ++ ++ struct btrfs_lru_cache dir_created_cache; ++ struct btrfs_lru_cache dir_utimes_cache; + }; + + struct pending_dir_move { +@@ -321,21 +334,15 @@ struct orphan_dir_info { + u64 ino; + u64 gen; + u64 last_dir_index_offset; ++ u64 dir_high_seq_ino; + }; + + struct name_cache_entry { +- struct list_head list; + /* +- * radix_tree has only 32bit entries but we need to handle 64bit inums. +- * We use the lower 32bit of the 64bit inum to store it in the tree. If +- * more then one inum would fall into the same entry, we use radix_list +- * to store the additional entries. radix_list is also used to store +- * entries where two entries have the same inum but different +- * generations. ++ * The key in the entry is an inode number, and the generation matches ++ * the inode's generation. + */ +- struct list_head radix_list; +- u64 ino; +- u64 gen; ++ struct btrfs_lru_cache_entry entry; + u64 parent_ino; + u64 parent_gen; + int ret; +@@ -344,6 +351,9 @@ struct name_cache_entry { + char name[]; + }; + ++/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. 
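The slab arithmetic in the cache-size comments above can be checked directly: struct btrfs_lru_cache_entry is two struct list_head (16 bytes each on 64-bit) plus two u64, i.e. 48 bytes, which kmalloc() rounds up to the 64-byte slab, so a full cache of 64 raw entries costs 64 * 64 = 4096 bytes. A quick user-space check with a stand-in layout (LP64 assumed):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

/* Same layout as btrfs_lru_cache_entry on a 64-bit kernel. */
struct lru_cache_entry {
    struct list_head lru_list;    /* 16 bytes */
    unsigned long long key;       /*  8 bytes */
    unsigned long long gen;       /*  8 bytes */
    struct list_head list;        /* 16 bytes */
};

int main(void)
{
    printf("entry size: %zu bytes\n", sizeof(struct lru_cache_entry));  /* 48 */
    printf("rounded to kmalloc-64, 64 entries: %d bytes\n", 64 * 64);   /* 4096 */
    return 0;
}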
*/ ++static_assert(offsetof(struct name_cache_entry, entry) == 0); ++ + #define ADVANCE 1 + #define ADVANCE_ONLY_NEXT -1 + +@@ -956,14 +966,12 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, + static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) + { + int ret; +- struct btrfs_inode_info info; ++ struct btrfs_inode_info info = { 0 }; + +- if (!gen) +- return -EPERM; ++ ASSERT(gen); + + ret = get_inode_info(root, ino, &info); +- if (!ret) +- *gen = info.gen; ++ *gen = info.gen; + return ret; + } + +@@ -1388,19 +1396,6 @@ static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, + return 0; + } + +-static void empty_backref_cache(struct send_ctx *sctx) +-{ +- struct backref_cache_entry *entry; +- struct backref_cache_entry *tmp; +- +- list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list) +- kfree(entry); +- +- INIT_LIST_HEAD(&sctx->backref_cache.lru_list); +- mtree_destroy(&sctx->backref_cache.entries); +- sctx->backref_cache.size = 0; +-} +- + static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, + const u64 **root_ids_ret, int *root_count_ret) + { +@@ -1408,9 +1403,10 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, + struct send_ctx *sctx = bctx->sctx; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; ++ struct btrfs_lru_cache_entry *raw_entry; + struct backref_cache_entry *entry; + +- if (sctx->backref_cache.size == 0) ++ if (btrfs_lru_cache_size(&sctx->backref_cache) == 0) + return false; + + /* +@@ -1424,18 +1420,18 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, + * transaction handle or holding fs_info->commit_root_sem, so no need + * to take any lock here. + */ +- if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) { +- empty_backref_cache(sctx); ++ if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) { ++ btrfs_lru_cache_clear(&sctx->backref_cache); + return false; + } + +- entry = mtree_load(&sctx->backref_cache.entries, key); +- if (!entry) ++ raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0); ++ if (!raw_entry) + return false; + ++ entry = container_of(raw_entry, struct backref_cache_entry, entry); + *root_ids_ret = entry->root_ids; + *root_count_ret = entry->num_roots; +- list_move_tail(&entry->list, &sctx->backref_cache.lru_list); + + return true; + } +@@ -1461,7 +1457,8 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, + if (!new_entry) + return; + +- new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits; ++ new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits; ++ new_entry->entry.gen = 0; + new_entry->num_roots = 0; + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(root_ids, &uiter)) != NULL) { +@@ -1489,23 +1486,12 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, + * none of the roots is part of the list of roots from which we are + * allowed to clone. Cache the new entry as it's still useful to avoid + * backref walking to determine which roots have a path to the leaf. ++ * ++ * Also use GFP_NOFS because we're called while holding a transaction ++ * handle or while holding fs_info->commit_root_sem. 
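lru_cache.c itself is not part of this excerpt, so the following is a mental model only, not the real implementation: the header comment says the maple tree is indexed by the (possibly truncated) unsigned long key and each slot holds a linked list of entries carrying the full u64 key and generation, which is what lets btrfs_lru_cache_lookup(cache, key, gen) work on 32-bit systems. A lookup consistent with that description might be shaped roughly like this (sketch_lookup is invented; details such as the per-slot list head and the LRU update are assumptions):

static struct btrfs_lru_cache_entry *sketch_lookup(struct btrfs_lru_cache *cache,
                                                   u64 key, u64 gen)
{
    struct list_head *head;
    struct btrfs_lru_cache_entry *entry;

    /* The maple tree index is an unsigned long, so the key may be truncated. */
    head = mtree_load(&cache->entries, key);
    if (!head)
        return NULL;

    list_for_each_entry(entry, head, list) {
        if (entry->key == key && entry->gen == gen) {
            /* Move to the most-recently-used end of the LRU list. */
            list_move_tail(&entry->lru_list, &cache->lru_list);
            return entry;
        }
    }

    return NULL;
}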
+ */ +- +- if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) { +- struct backref_cache_entry *lru_entry; +- struct backref_cache_entry *mt_entry; +- +- lru_entry = list_first_entry(&sctx->backref_cache.lru_list, +- struct backref_cache_entry, list); +- mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key); +- ASSERT(mt_entry == lru_entry); +- list_del(&mt_entry->list); +- kfree(mt_entry); +- sctx->backref_cache.size--; +- } +- +- ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key, +- new_entry, GFP_NOFS); ++ ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry, ++ GFP_NOFS); + ASSERT(ret == 0 || ret == -ENOMEM); + if (ret) { + /* Caching is optional, no worries. */ +@@ -1513,17 +1499,13 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, + return; + } + +- list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list); +- + /* + * We are called from iterate_extent_inodes() while either holding a + * transaction handle or holding fs_info->commit_root_sem, so no need + * to take any lock here. + */ +- if (sctx->backref_cache.size == 0) +- sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans; +- +- sctx->backref_cache.size++; ++ if (btrfs_lru_cache_size(&sctx->backref_cache) == 1) ++ sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; + } + + static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, +@@ -1886,7 +1868,8 @@ enum inode_state { + inode_state_did_delete, + }; + +-static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) ++static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, ++ u64 *send_gen, u64 *parent_gen) + { + int ret; + int left_ret; +@@ -1900,6 +1883,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) + goto out; + left_ret = (info.nlink == 0) ? -ENOENT : ret; + left_gen = info.gen; ++ if (send_gen) ++ *send_gen = ((left_ret == -ENOENT) ? 0 : info.gen); + + if (!sctx->parent_root) { + right_ret = -ENOENT; +@@ -1909,6 +1894,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) + goto out; + right_ret = (info.nlink == 0) ? -ENOENT : ret; + right_gen = info.gen; ++ if (parent_gen) ++ *parent_gen = ((right_ret == -ENOENT) ? 
0 : info.gen); + } + + if (!left_ret && !right_ret) { +@@ -1953,14 +1940,15 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) + return ret; + } + +-static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) ++static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, ++ u64 *send_gen, u64 *parent_gen) + { + int ret; + + if (ino == BTRFS_FIRST_FREE_OBJECTID) + return 1; + +- ret = get_cur_inode_state(sctx, ino, gen); ++ ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); + if (ret < 0) + goto out; + +@@ -2121,43 +2109,36 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, + const char *name, int name_len, + u64 *who_ino, u64 *who_gen, u64 *who_mode) + { +- int ret = 0; +- u64 gen; ++ int ret; ++ u64 parent_root_dir_gen; + u64 other_inode = 0; + struct btrfs_inode_info info; + + if (!sctx->parent_root) +- goto out; ++ return 0; + +- ret = is_inode_existent(sctx, dir, dir_gen); ++ ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen); + if (ret <= 0) +- goto out; ++ return 0; + + /* + * If we have a parent root we need to verify that the parent dir was + * not deleted and then re-created, if it was then we have no overwrite + * and we can just unlink this entry. ++ * ++ * @parent_root_dir_gen was set to 0 if the inode does not exist in the ++ * parent root. + */ +- if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { +- ret = get_inode_gen(sctx->parent_root, dir, &gen); +- if (ret < 0 && ret != -ENOENT) +- goto out; +- if (ret) { +- ret = 0; +- goto out; +- } +- if (gen != dir_gen) +- goto out; +- } ++ if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID && ++ parent_root_dir_gen != dir_gen) ++ return 0; + + ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, + &other_inode); +- if (ret < 0 && ret != -ENOENT) +- goto out; +- if (ret) { +- ret = 0; +- goto out; +- } ++ if (ret == -ENOENT) ++ return 0; ++ else if (ret < 0) ++ return ret; + + /* + * Check if the overwritten ref was already processed. If yes, the ref +@@ -2168,18 +2149,15 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, + is_waiting_for_move(sctx, other_inode)) { + ret = get_inode_info(sctx->parent_root, other_inode, &info); + if (ret < 0) +- goto out; ++ return ret; + +- ret = 1; + *who_ino = other_inode; + *who_gen = info.gen; + *who_mode = info.mode; +- } else { +- ret = 0; ++ return 1; + } + +-out: +- return ret; ++ return 0; + } + + /* +@@ -2194,47 +2172,43 @@ static int did_overwrite_ref(struct send_ctx *sctx, + u64 ino, u64 ino_gen, + const char *name, int name_len) + { +- int ret = 0; +- u64 gen; ++ int ret; + u64 ow_inode; ++ u64 ow_gen = 0; ++ u64 send_root_dir_gen; + + if (!sctx->parent_root) +- goto out; ++ return 0; + +- ret = is_inode_existent(sctx, dir, dir_gen); ++ ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL); + if (ret <= 0) +- goto out; ++ return ret; + +- if (dir != BTRFS_FIRST_FREE_OBJECTID) { +- ret = get_inode_gen(sctx->send_root, dir, &gen); +- if (ret < 0 && ret != -ENOENT) +- goto out; +- if (ret) { +- ret = 0; +- goto out; +- } +- if (gen != dir_gen) +- goto out; +- } ++ /* ++ * @send_root_dir_gen was set to 0 if the inode does not exist in the ++ * send root. 
++ */ ++ if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen) ++ return 0; + + /* check if the ref was overwritten by another ref */ + ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, + &ow_inode); +- if (ret < 0 && ret != -ENOENT) +- goto out; +- if (ret) { ++ if (ret == -ENOENT) { + /* was never and will never be overwritten */ +- ret = 0; +- goto out; ++ return 0; ++ } else if (ret < 0) { ++ return ret; + } + +- ret = get_inode_gen(sctx->send_root, ow_inode, &gen); +- if (ret < 0) +- goto out; ++ if (ow_inode == ino) { ++ ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); ++ if (ret < 0) ++ return ret; + +- if (ow_inode == ino && gen == ino_gen) { +- ret = 0; +- goto out; ++ /* It's the same inode, so no overwrite happened. */ ++ if (ow_gen == ino_gen) ++ return 0; + } + + /* +@@ -2243,15 +2217,20 @@ static int did_overwrite_ref(struct send_ctx *sctx, + * inode 'ino' to be orphanized, therefore check if ow_inode matches + * the current inode being processed. + */ +- if ((ow_inode < sctx->send_progress) || +- (ino != sctx->cur_ino && ow_inode == sctx->cur_ino && +- gen == sctx->cur_inode_gen)) +- ret = 1; +- else +- ret = 0; ++ if (ow_inode < sctx->send_progress) ++ return 1; + +-out: +- return ret; ++ if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) { ++ if (ow_gen == 0) { ++ ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); ++ if (ret < 0) ++ return ret; ++ } ++ if (ow_gen == sctx->cur_inode_gen) ++ return 1; ++ } ++ ++ return 0; + } + + /* +@@ -2285,113 +2264,16 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) + return ret; + } + +-/* +- * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, +- * so we need to do some special handling in case we have clashes. This function +- * takes care of this with the help of name_cache_entry::radix_list. +- * In case of error, nce is kfreed. 
+- */ +-static int name_cache_insert(struct send_ctx *sctx, +- struct name_cache_entry *nce) ++static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx, ++ u64 ino, u64 gen) + { +- int ret = 0; +- struct list_head *nce_head; +- +- nce_head = radix_tree_lookup(&sctx->name_cache, +- (unsigned long)nce->ino); +- if (!nce_head) { +- nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); +- if (!nce_head) { +- kfree(nce); +- return -ENOMEM; +- } +- INIT_LIST_HEAD(nce_head); +- +- ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); +- if (ret < 0) { +- kfree(nce_head); +- kfree(nce); +- return ret; +- } +- } +- list_add_tail(&nce->radix_list, nce_head); +- list_add_tail(&nce->list, &sctx->name_cache_list); +- sctx->name_cache_size++; +- +- return ret; +-} ++ struct btrfs_lru_cache_entry *entry; + +-static void name_cache_delete(struct send_ctx *sctx, +- struct name_cache_entry *nce) +-{ +- struct list_head *nce_head; +- +- nce_head = radix_tree_lookup(&sctx->name_cache, +- (unsigned long)nce->ino); +- if (!nce_head) { +- btrfs_err(sctx->send_root->fs_info, +- "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", +- nce->ino, sctx->name_cache_size); +- } +- +- list_del(&nce->radix_list); +- list_del(&nce->list); +- sctx->name_cache_size--; +- +- /* +- * We may not get to the final release of nce_head if the lookup fails +- */ +- if (nce_head && list_empty(nce_head)) { +- radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); +- kfree(nce_head); +- } +-} +- +-static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, +- u64 ino, u64 gen) +-{ +- struct list_head *nce_head; +- struct name_cache_entry *cur; +- +- nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); +- if (!nce_head) ++ entry = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen); ++ if (!entry) + return NULL; + +- list_for_each_entry(cur, nce_head, radix_list) { +- if (cur->ino == ino && cur->gen == gen) +- return cur; +- } +- return NULL; +-} +- +-/* +- * Remove some entries from the beginning of name_cache_list. +- */ +-static void name_cache_clean_unused(struct send_ctx *sctx) +-{ +- struct name_cache_entry *nce; +- +- if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE) +- return; +- +- while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) { +- nce = list_entry(sctx->name_cache_list.next, +- struct name_cache_entry, list); +- name_cache_delete(sctx, nce); +- kfree(nce); +- } +-} +- +-static void name_cache_free(struct send_ctx *sctx) +-{ +- struct name_cache_entry *nce; +- +- while (!list_empty(&sctx->name_cache_list)) { +- nce = list_entry(sctx->name_cache_list.next, +- struct name_cache_entry, list); +- name_cache_delete(sctx, nce); +- kfree(nce); +- } ++ return container_of(entry, struct name_cache_entry, entry); + } + + /* +@@ -2410,7 +2292,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, + { + int ret; + int nce_ret; +- struct name_cache_entry *nce = NULL; ++ struct name_cache_entry *nce; + + /* + * First check if we already did a call to this function with the same +@@ -2420,17 +2302,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, + nce = name_cache_search(sctx, ino, gen); + if (nce) { + if (ino < sctx->send_progress && nce->need_later_update) { +- name_cache_delete(sctx, nce); +- kfree(nce); ++ btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry); + nce = NULL; + } else { +- /* +- * Removes the entry from the list and adds it back to +- * the end. 
This marks the entry as recently used so +- * that name_cache_clean_unused does not remove it. +- */ +- list_move_tail(&nce->list, &sctx->name_cache_list); +- + *parent_ino = nce->parent_ino; + *parent_gen = nce->parent_gen; + ret = fs_path_add(dest, nce->name, nce->name_len); +@@ -2446,7 +2320,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, + * This should only happen for the parent dir that we determine in + * record_new_ref_if_needed(). + */ +- ret = is_inode_existent(sctx, ino, gen); ++ ret = is_inode_existent(sctx, ino, gen, NULL, NULL); + if (ret < 0) + goto out; + +@@ -2497,8 +2371,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, + goto out; + } + +- nce->ino = ino; +- nce->gen = gen; ++ nce->entry.key = ino; ++ nce->entry.gen = gen; + nce->parent_ino = *parent_ino; + nce->parent_gen = *parent_gen; + nce->name_len = fs_path_len(dest); +@@ -2510,10 +2384,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, + else + nce->need_later_update = 1; + +- nce_ret = name_cache_insert(sctx, nce); +- if (nce_ret < 0) ++ nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); ++ if (nce_ret < 0) { ++ kfree(nce); + ret = nce_ret; +- name_cache_clean_unused(sctx); ++ } + + out: + return ret; +@@ -2883,6 +2758,63 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) + return ret; + } + ++/* ++ * If the cache is full, we can't remove entries from it and do a call to ++ * send_utimes() for each respective inode, because we might be finishing ++ * processing an inode that is a directory and it just got renamed, and existing ++ * entries in the cache may refer to inodes that have the directory in their ++ * full path - in which case we would generate outdated paths (pre-rename) ++ * for the inodes that the cache entries point to. Instead of prunning the ++ * cache when inserting, do it after we finish processing each inode at ++ * finish_inode_if_needed(). ++ */ ++static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) ++{ ++ struct btrfs_lru_cache_entry *entry; ++ int ret; ++ ++ entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen); ++ if (entry != NULL) ++ return 0; ++ ++ /* Caching is optional, don't fail if we can't allocate memory. */ ++ entry = kmalloc(sizeof(*entry), GFP_KERNEL); ++ if (!entry) ++ return send_utimes(sctx, dir, gen); ++ ++ entry->key = dir; ++ entry->gen = gen; ++ ++ ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL); ++ ASSERT(ret != -EEXIST); ++ if (ret) { ++ kfree(entry); ++ return send_utimes(sctx, dir, gen); ++ } ++ ++ return 0; ++} ++ ++static int trim_dir_utimes_cache(struct send_ctx *sctx) ++{ ++ while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) > ++ SEND_MAX_DIR_UTIMES_CACHE_SIZE) { ++ struct btrfs_lru_cache_entry *lru; ++ int ret; ++ ++ lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache); ++ ASSERT(lru != NULL); ++ ++ ret = send_utimes(sctx, lru->key, lru->gen); ++ if (ret) ++ return ret; ++ ++ btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru); ++ } ++ ++ return 0; ++} ++ + /* + * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have + * a valid path yet because we did not process the refs yet. So, the inode +@@ -2971,6 +2903,23 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) + return ret; + } + ++static void cache_dir_created(struct send_ctx *sctx, u64 dir) ++{ ++ struct btrfs_lru_cache_entry *entry; ++ int ret; ++ ++ /* Caching is optional, ignore any failures. 
*/ ++ entry = kmalloc(sizeof(*entry), GFP_KERNEL); ++ if (!entry) ++ return; ++ ++ entry->key = dir; ++ entry->gen = 0; ++ ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL); ++ if (ret < 0) ++ kfree(entry); ++} ++ + /* + * We need some special handling for inodes that get processed before the parent + * directory got created. See process_recorded_refs for details. +@@ -2986,6 +2935,9 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) + struct btrfs_key di_key; + struct btrfs_dir_item *di; + ++ if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0)) ++ return 1; ++ + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; +@@ -3009,6 +2961,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) + if (di_key.type != BTRFS_ROOT_ITEM_KEY && + di_key.objectid < sctx->send_progress) { + ret = 1; ++ cache_dir_created(sctx, dir); + break; + } + } +@@ -3038,7 +2991,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx) + return 0; + } + +- return send_create_inode(sctx, sctx->cur_ino); ++ ret = send_create_inode(sctx, sctx->cur_ino); ++ ++ if (ret == 0 && S_ISDIR(sctx->cur_inode_mode)) ++ cache_dir_created(sctx, sctx->cur_ino); ++ ++ return ret; + } + + struct recorded_ref { +@@ -3166,6 +3124,7 @@ static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx, + odi->ino = dir_ino; + odi->gen = dir_gen; + odi->last_dir_index_offset = 0; ++ odi->dir_high_seq_ino = 0; + + rb_link_node(&odi->node, parent, p); + rb_insert_color(&odi->node, &sctx->orphan_dirs); +@@ -3215,8 +3174,7 @@ static void free_orphan_dir_info(struct send_ctx *sctx, + * We check this by iterating all dir items and checking if the inode behind + * the dir item was already processed. + */ +-static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, +- u64 send_progress) ++static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen) + { + int ret = 0; + int iter_ret = 0; +@@ -3227,6 +3185,8 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, + struct btrfs_key loc; + struct btrfs_dir_item *di; + struct orphan_dir_info *odi = NULL; ++ u64 dir_high_seq_ino = 0; ++ u64 last_dir_index_offset = 0; + + /* + * Don't try to rmdir the top/root subvolume dir. +@@ -3234,17 +3194,62 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, + if (dir == BTRFS_FIRST_FREE_OBJECTID) + return 0; + ++ odi = get_orphan_dir_info(sctx, dir, dir_gen); ++ if (odi && sctx->cur_ino < odi->dir_high_seq_ino) ++ return 0; ++ + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + ++ if (!odi) { ++ /* ++ * Find the inode number associated with the last dir index ++ * entry. This is very likely the inode with the highest number ++ * of all inodes that have an entry in the directory. We can ++ * then use it to avoid future calls to can_rmdir(), when ++ * processing inodes with a lower number, from having to search ++ * the parent root b+tree for dir index keys. ++ */ ++ key.objectid = dir; ++ key.type = BTRFS_DIR_INDEX_KEY; ++ key.offset = (u64)-1; ++ ++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); ++ if (ret < 0) { ++ goto out; ++ } else if (ret > 0) { ++ /* Can't happen, the root is never empty. */ ++ ASSERT(path->slots[0] > 0); ++ if (WARN_ON(path->slots[0] == 0)) { ++ ret = -EUCLEAN; ++ goto out; ++ } ++ path->slots[0]--; ++ } ++ ++ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ++ if (key.objectid != dir || key.type != BTRFS_DIR_INDEX_KEY) { ++ /* No index keys, dir can be removed. 
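The block above uses a common btrfs b-tree idiom to find the highest DIR_INDEX key of a directory: search for offset (u64)-1, and when btrfs_search_slot() returns > 0 (no exact match), step back one slot to land on the last key that sorts before it. Pulled out as a stand-alone helper purely for illustration (sketch_find_last_dir_index is not part of the patch; can_rmdir() open-codes this logic):

static int sketch_find_last_dir_index(struct btrfs_root *root, u64 dir,
                                      struct btrfs_path *path,
                                      struct btrfs_key *found)
{
    struct btrfs_key key;
    int ret;

    key.objectid = dir;
    key.type = BTRFS_DIR_INDEX_KEY;
    key.offset = (u64)-1;    /* larger than any real dir index */

    ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    if (ret < 0)
        return ret;
    if (ret > 0) {
        /* Exact key not found: step back to the last key that sorts before it. */
        if (path->slots[0] == 0)
            return -ENOENT;
        path->slots[0]--;
    }

    btrfs_item_key_to_cpu(path->nodes[0], found, path->slots[0]);
    if (found->objectid != dir || found->type != BTRFS_DIR_INDEX_KEY)
        return -ENOENT;    /* the directory has no index keys at all */
    return 0;
}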
*/ ++ ret = 1; ++ goto out; ++ } ++ ++ di = btrfs_item_ptr(path->nodes[0], path->slots[0], ++ struct btrfs_dir_item); ++ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); ++ dir_high_seq_ino = loc.objectid; ++ if (sctx->cur_ino < dir_high_seq_ino) { ++ ret = 0; ++ goto out; ++ } ++ ++ btrfs_release_path(path); ++ } ++ + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; +- key.offset = 0; +- +- odi = get_orphan_dir_info(sctx, dir, dir_gen); +- if (odi) +- key.offset = odi->last_dir_index_offset; ++ key.offset = (odi ? odi->last_dir_index_offset : 0); + + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { + struct waiting_dir_move *dm; +@@ -3257,29 +3262,18 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, + struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + ++ dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid); ++ last_dir_index_offset = found_key.offset; ++ + dm = get_waiting_dir_move(sctx, loc.objectid); + if (dm) { +- odi = add_orphan_dir_info(sctx, dir, dir_gen); +- if (IS_ERR(odi)) { +- ret = PTR_ERR(odi); +- goto out; +- } +- odi->gen = dir_gen; +- odi->last_dir_index_offset = found_key.offset; + dm->rmdir_ino = dir; + dm->rmdir_gen = dir_gen; + ret = 0; + goto out; + } + +- if (loc.objectid > send_progress) { +- odi = add_orphan_dir_info(sctx, dir, dir_gen); +- if (IS_ERR(odi)) { +- ret = PTR_ERR(odi); +- goto out; +- } +- odi->gen = dir_gen; +- odi->last_dir_index_offset = found_key.offset; ++ if (loc.objectid > sctx->cur_ino) { + ret = 0; + goto out; + } +@@ -3294,7 +3288,22 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, + + out: + btrfs_free_path(path); +- return ret; ++ ++ if (ret) ++ return ret; ++ ++ if (!odi) { ++ odi = add_orphan_dir_info(sctx, dir, dir_gen); ++ if (IS_ERR(odi)) ++ return PTR_ERR(odi); ++ ++ odi->gen = dir_gen; ++ } ++ ++ odi->last_dir_index_offset = last_dir_index_offset; ++ odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino); ++ ++ return 0; + } + + static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) +@@ -3579,7 +3588,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) + } + gen = odi->gen; + +- ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino); ++ ret = can_rmdir(sctx, rmdir_ino, gen); + if (ret < 0) + goto out; + if (!ret) +@@ -3599,7 +3608,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) + } + + finish: +- ret = send_utimes(sctx, pm->ino, pm->gen); ++ ret = cache_dir_utimes(sctx, pm->ino, pm->gen); + if (ret < 0) + goto out; + +@@ -3619,7 +3628,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) + if (ret < 0) + goto out; + +- ret = send_utimes(sctx, cur->dir, cur->dir_gen); ++ ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + } +@@ -4242,7 +4251,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) + * "testdir_2". + */ + list_for_each_entry(cur, &sctx->new_refs, list) { +- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); ++ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); + if (ret < 0) + goto out; + if (ret == inode_state_will_create) +@@ -4288,12 +4297,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) + * the source path when performing its rename + * operation. 
+ */ +- if (is_waiting_for_move(sctx, ow_inode)) { +- wdm = get_waiting_dir_move(sctx, +- ow_inode); +- ASSERT(wdm); ++ wdm = get_waiting_dir_move(sctx, ow_inode); ++ if (wdm) + wdm->orphanized = true; +- } + + /* + * Make sure we clear our orphanized inode's +@@ -4306,10 +4312,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) + * and get instead the orphan name. + */ + nce = name_cache_search(sctx, ow_inode, ow_gen); +- if (nce) { +- name_cache_delete(sctx, nce); +- kfree(nce); +- } ++ if (nce) ++ btrfs_lru_cache_remove(&sctx->name_cache, ++ &nce->entry); + + /* + * ow_inode might currently be an ancestor of +@@ -4358,7 +4363,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) + * parent directory out of order. But we need to check if this + * did already happen before due to other refs in the same dir. + */ +- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); ++ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); + if (ret < 0) + goto out; + if (ret == inode_state_will_create) { +@@ -4388,6 +4393,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) + ret = send_create_inode(sctx, cur->dir); + if (ret < 0) + goto out; ++ cache_dir_created(sctx, cur->dir); + } + } + +@@ -4470,8 +4476,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) + * later, we do this check again and rmdir it then if possible. + * See the use of check_dirs for more details. + */ +- ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, +- sctx->cur_ino); ++ ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) + goto out; + if (ret) { +@@ -4564,20 +4569,18 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) + if (cur->dir > sctx->cur_ino) + continue; + +- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); ++ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); + if (ret < 0) + goto out; + + if (ret == inode_state_did_create || + ret == inode_state_no_change) { +- /* TODO delayed utimes */ +- ret = send_utimes(sctx, cur->dir, cur->dir_gen); ++ ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + } else if (ret == inode_state_did_delete && + cur->dir != last_dir_ino_rm) { +- ret = can_rmdir(sctx, cur->dir, cur->dir_gen, +- sctx->cur_ino); ++ ret = can_rmdir(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + if (ret) { +@@ -5635,7 +5638,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, + * boundary in the send buffer. This means that there may be a gap + * between the beginning of the command and the file data. + */ +- data_offset = ALIGN(sctx->send_size, PAGE_SIZE); ++ data_offset = PAGE_ALIGN(sctx->send_size); + if (data_offset > sctx->send_max_size || + sctx->send_max_size - data_offset < disk_num_bytes) { + ret = -EOVERFLOW; +@@ -5759,7 +5762,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, + sent += size; + } + +- if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { ++ if (sctx->clean_page_cache && PAGE_ALIGNED(end)) { + /* + * Always operate only on ranges that are a multiple of the page + * size. This is not only to prevent zeroing parts of a page in +@@ -6754,12 +6757,26 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) + * it's moved/renamed, therefore we don't need to do it here. 
+ */ + sctx->send_progress = sctx->cur_ino + 1; +- ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); ++ ++ /* ++ * If the current inode is a non-empty directory, delay issuing ++ * the utimes command for it, as it's very likely we have inodes ++ * with an higher number inside it. We want to issue the utimes ++ * command only after adding all dentries to it. ++ */ ++ if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0) ++ ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); ++ else ++ ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); ++ + if (ret < 0) + goto out; + } + + out: ++ if (!ret) ++ ret = trim_dir_utimes_cache(sctx); ++ + return ret; + } + +@@ -8044,6 +8061,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) + int clone_sources_to_rollback = 0; + size_t alloc_size; + int sort_clone_roots = 0; ++ struct btrfs_lru_cache_entry *entry; ++ struct btrfs_lru_cache_entry *tmp; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; +@@ -8094,11 +8113,22 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) + + INIT_LIST_HEAD(&sctx->new_refs); + INIT_LIST_HEAD(&sctx->deleted_refs); +- INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); +- INIT_LIST_HEAD(&sctx->name_cache_list); + +- INIT_LIST_HEAD(&sctx->backref_cache.lru_list); +- mt_init(&sctx->backref_cache.entries); ++ btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE); ++ btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE); ++ btrfs_lru_cache_init(&sctx->dir_created_cache, ++ SEND_MAX_DIR_CREATED_CACHE_SIZE); ++ /* ++ * This cache is periodically trimmed to a fixed size elsewhere, see ++ * cache_dir_utimes() and trim_dir_utimes_cache(). ++ */ ++ btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0); ++ ++ sctx->pending_dir_moves = RB_ROOT; ++ sctx->waiting_dir_moves = RB_ROOT; ++ sctx->orphan_dirs = RB_ROOT; ++ sctx->rbtree_new_refs = RB_ROOT; ++ sctx->rbtree_deleted_refs = RB_ROOT; + + sctx->flags = arg->flags; + +@@ -8165,12 +8195,6 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) + goto out; + } + +- sctx->pending_dir_moves = RB_ROOT; +- sctx->waiting_dir_moves = RB_ROOT; +- sctx->orphan_dirs = RB_ROOT; +- sctx->rbtree_new_refs = RB_ROOT; +- sctx->rbtree_deleted_refs = RB_ROOT; +- + sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), + arg->clone_sources_count + 1, + GFP_KERNEL); +@@ -8279,6 +8303,13 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) + if (ret < 0) + goto out; + ++ btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) { ++ ret = send_utimes(sctx, entry->key, entry->gen); ++ if (ret < 0) ++ goto out; ++ btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry); ++ } ++ + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { + ret = begin_cmd(sctx, BTRFS_SEND_C_END); + if (ret < 0) +@@ -8358,11 +8389,12 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) + kvfree(sctx->send_buf); + kvfree(sctx->verity_descriptor); + +- name_cache_free(sctx); +- + close_current_inode(sctx); + +- empty_backref_cache(sctx); ++ btrfs_lru_cache_clear(&sctx->name_cache); ++ btrfs_lru_cache_clear(&sctx->backref_cache); ++ btrfs_lru_cache_clear(&sctx->dir_created_cache); ++ btrfs_lru_cache_clear(&sctx->dir_utimes_cache); + + kfree(sctx); + } +diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c +index 433ce221dc5c..581845bc206a 100644 +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -58,6 +58,7 @@ + #include 
"scrub.h" + #include "verity.h" + #include "super.h" ++#include "extent-tree.h" + #define CREATE_TRACE_POINTS + #include + +@@ -2049,7 +2050,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) + } + + /* +- * Metadata in mixed block goup profiles are accounted in data ++ * Metadata in mixed block group profiles are accounted in data + */ + if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { + if (found->flags & BTRFS_BLOCK_GROUP_DATA) +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index 45615ce36498..8c5efa5813b3 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -702,7 +702,7 @@ static void release_raid_kobj(struct kobject *kobj) + kfree(to_raid_kobj(kobj)); + } + +-static struct kobj_type btrfs_raid_ktype = { ++static const struct kobj_type btrfs_raid_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = release_raid_kobj, + .default_groups = raid_groups, +@@ -900,7 +900,7 @@ static void space_info_release(struct kobject *kobj) + kfree(sinfo); + } + +-static struct kobj_type space_info_ktype = { ++static const struct kobj_type space_info_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = space_info_release, + .default_groups = space_info_groups, +@@ -1259,7 +1259,7 @@ static void btrfs_release_fsid_kobj(struct kobject *kobj) + complete(&fs_devs->kobj_unregister); + } + +-static struct kobj_type btrfs_ktype = { ++static const struct kobj_type btrfs_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = btrfs_release_fsid_kobj, + }; +@@ -1789,7 +1789,7 @@ static void btrfs_release_devid_kobj(struct kobject *kobj) + complete(&device->kobj_unregister); + } + +-static struct kobj_type devid_ktype = { ++static const struct kobj_type devid_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = devid_groups, + .release = btrfs_release_devid_kobj, +@@ -2103,7 +2103,7 @@ static void qgroups_release(struct kobject *kobj) + kfree(kobj); + } + +-static struct kobj_type qgroups_ktype = { ++static const struct kobj_type qgroups_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = qgroups_groups, + .release = qgroups_release, +@@ -2173,7 +2173,7 @@ static void qgroup_release(struct kobject *kobj) + memset(&qgroup->kobj, 0, sizeof(*kobj)); + } + +-static struct kobj_type qgroup_ktype = { ++static const struct kobj_type qgroup_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = qgroup_release, + .default_groups = qgroup_groups, +@@ -2272,36 +2272,23 @@ void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, + * Change per-fs features in /sys/fs/btrfs/UUID/features to match current + * values in superblock. Call after any changes to incompat/compat_ro flags + */ +-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, +- u64 bit, enum btrfs_feature_set set) ++void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info) + { +- struct btrfs_fs_devices *fs_devs; + struct kobject *fsid_kobj; +- u64 __maybe_unused features; +- int __maybe_unused ret; ++ int ret; + + if (!fs_info) + return; + +- /* +- * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not +- * safe when called from some contexts (eg. 
balance) +- */ +- features = get_features(fs_info, set); +- ASSERT(bit & supported_feature_masks[set]); +- +- fs_devs = fs_info->fs_devices; +- fsid_kobj = &fs_devs->fsid_kobj; +- ++ fsid_kobj = &fs_info->fs_devices->fsid_kobj; + if (!fsid_kobj->state_initialized) + return; + +- /* +- * FIXME: this is too heavy to update just one value, ideally we'd like +- * to use sysfs_update_group but some refactoring is needed first. +- */ +- sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); +- ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); ++ ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group); ++ if (ret < 0) ++ btrfs_warn(fs_info, ++ "failed to update /sys/fs/btrfs/%pU/features: %d", ++ fs_info->fs_devices->fsid, ret); + } + + int __init btrfs_init_sysfs(void) +diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h +index bacef43f7267..86c7eef12873 100644 +--- a/fs/btrfs/sysfs.h ++++ b/fs/btrfs/sysfs.h +@@ -19,8 +19,7 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device); + int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); + void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); + void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); +-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, +- u64 bit, enum btrfs_feature_set set); ++void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info); + void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); + + int __init btrfs_init_sysfs(void); +diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c +index c5b3a631bf4f..f2f2e11dac4c 100644 +--- a/fs/btrfs/tests/extent-map-tests.c ++++ b/fs/btrfs/tests/extent-map-tests.c +@@ -509,7 +509,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, + goto out_free; + } + +- ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1), ++ ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), + &logical, &out_ndaddrs, &out_stripe_len); + if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { + test_err("didn't rmap anything but expected %d", +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index b8c52e89688c..18329ebcb1cb 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -2464,6 +2464,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) + wake_up(&fs_info->transaction_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); + ++ /* If we have features changed, wake up the cleaner to update sysfs. */ ++ if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) && ++ fs_info->cleaner_kthread) ++ wake_up_process(fs_info->cleaner_kthread); ++ + ret = btrfs_write_and_wait_transaction(trans); + if (ret) { + btrfs_handle_fs_error(fs_info, ret, +@@ -2604,6 +2609,35 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) + return (ret < 0) ? 0 : 1; + } + ++/* ++ * We only mark the transaction aborted and then set the file system read-only. ++ * This will prevent new transactions from starting or trying to join this ++ * one. ++ * ++ * This means that error recovery at the call site is limited to freeing ++ * any local memory allocations and passing the error code up without ++ * further cleanup. The transaction should complete as it normally would ++ * in the call path but will return -EIO. ++ * ++ * We'll complete the cleanup in btrfs_end_transaction and ++ * btrfs_commit_transaction. 
++ */ ++void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, ++ const char *function, ++ unsigned int line, int errno, bool first_hit) ++{ ++ struct btrfs_fs_info *fs_info = trans->fs_info; ++ ++ WRITE_ONCE(trans->aborted, errno); ++ WRITE_ONCE(trans->transaction->aborted, errno); ++ if (first_hit && errno == -ENOSPC) ++ btrfs_dump_space_info_for_trans_abort(fs_info); ++ /* Wake up anybody who may be waiting on this transaction */ ++ wake_up(&fs_info->transaction_wait); ++ wake_up(&fs_info->transaction_blocked_wait); ++ __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); ++} ++ + int __init btrfs_transaction_init(void) + { + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", +diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h +index 97f6c39f59c8..fa728ab80826 100644 +--- a/fs/btrfs/transaction.h ++++ b/fs/btrfs/transaction.h +@@ -202,6 +202,34 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) + delayed_refs->qgroup_to_skip = 0; + } + ++bool __cold abort_should_print_stack(int errno); ++ ++/* ++ * Call btrfs_abort_transaction as early as possible when an error condition is ++ * detected, that way the exact stack trace is reported for some errors. ++ */ ++#define btrfs_abort_transaction(trans, errno) \ ++do { \ ++ bool first = false; \ ++ /* Report first abort since mount */ \ ++ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ ++ &((trans)->fs_info->fs_state))) { \ ++ first = true; \ ++ if (WARN(abort_should_print_stack(errno), \ ++ KERN_ERR \ ++ "BTRFS: Transaction aborted (error %d)\n", \ ++ (errno))) { \ ++ /* Stack trace printed. */ \ ++ } else { \ ++ btrfs_debug((trans)->fs_info, \ ++ "Transaction aborted (error %d)", \ ++ (errno)); \ ++ } \ ++ } \ ++ __btrfs_abort_transaction((trans), __func__, \ ++ __LINE__, (errno), first); \ ++} while (0) ++ + int btrfs_end_transaction(struct btrfs_trans_handle *trans); + struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + unsigned int num_items); +@@ -236,6 +264,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); + void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); ++void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, ++ const char *function, ++ unsigned int line, int errno, bool first_hit); + + int __init btrfs_transaction_init(void); + void __cold btrfs_transaction_exit(void); +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 58599189bd18..200cea6e49e5 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -279,12 +279,6 @@ void btrfs_end_log_trans(struct btrfs_root *root) + } + } + +-static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +-{ +- filemap_fdatawait_range(buf->pages[0]->mapping, +- buf->start, buf->start + buf->len - 1); +-} +- + /* + * the walk control struct is used to pass state down the chain when + * processing the log tree. 
The stage field tells us which part +@@ -2623,11 +2617,12 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + return ret; + } + ++ btrfs_tree_lock(next); ++ btrfs_clear_buffer_dirty(trans, next); ++ wait_on_extent_buffer_writeback(next); ++ btrfs_tree_unlock(next); ++ + if (trans) { +- btrfs_tree_lock(next); +- btrfs_clean_tree_block(next); +- btrfs_wait_tree_block_writeback(next); +- btrfs_tree_unlock(next); + ret = btrfs_pin_reserved_extent(trans, + bytenr, blocksize); + if (ret) { +@@ -2637,8 +2632,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + btrfs_redirty_list_add( + trans->transaction, next); + } else { +- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) +- clear_extent_buffer_dirty(next); + unaccount_log_buffer(fs_info, bytenr); + } + } +@@ -2693,11 +2686,12 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + + next = path->nodes[*level]; + ++ btrfs_tree_lock(next); ++ btrfs_clear_buffer_dirty(trans, next); ++ wait_on_extent_buffer_writeback(next); ++ btrfs_tree_unlock(next); ++ + if (trans) { +- btrfs_tree_lock(next); +- btrfs_clean_tree_block(next); +- btrfs_wait_tree_block_writeback(next); +- btrfs_tree_unlock(next); + ret = btrfs_pin_reserved_extent(trans, + path->nodes[*level]->start, + path->nodes[*level]->len); +@@ -2706,9 +2700,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + btrfs_redirty_list_add(trans->transaction, + next); + } else { +- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) +- clear_extent_buffer_dirty(next); +- + unaccount_log_buffer(fs_info, + path->nodes[*level]->start); + } +@@ -2776,19 +2767,18 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, + + next = path->nodes[orig_level]; + ++ btrfs_tree_lock(next); ++ btrfs_clear_buffer_dirty(trans, next); ++ wait_on_extent_buffer_writeback(next); ++ btrfs_tree_unlock(next); ++ + if (trans) { +- btrfs_tree_lock(next); +- btrfs_clean_tree_block(next); +- btrfs_wait_tree_block_writeback(next); +- btrfs_tree_unlock(next); + ret = btrfs_pin_reserved_extent(trans, + next->start, next->len); + if (ret) + goto out; + btrfs_redirty_list_add(trans->transaction, next); + } else { +- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) +- clear_extent_buffer_dirty(next); + unaccount_log_buffer(fs_info, next->start); + } + } +@@ -3652,11 +3642,10 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, + + /* + * If for some unexpected reason the last item's index is not greater +- * than the last index we logged, warn and return an error to fallback +- * to a transaction commit. ++ * than the last index we logged, warn and force a transaction commit. 
+ */ + if (WARN_ON(last_index <= inode->last_dir_index_offset)) +- ret = -EUCLEAN; ++ ret = BTRFS_LOG_FORCE_COMMIT; + else + inode->last_dir_index_offset = last_index; + out: +@@ -3794,7 +3783,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + struct btrfs_key min_key; + struct btrfs_root *root = inode->root; + struct btrfs_root *log = root->log_root; +- int err = 0; + int ret; + u64 last_old_dentry_offset = min_offset - 1; + u64 last_offset = (u64)-1; +@@ -3835,8 +3823,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + path->slots[0]); + if (tmp.type == BTRFS_DIR_INDEX_KEY) + last_old_dentry_offset = tmp.offset; +- } else if (ret < 0) { +- err = ret; ++ } else if (ret > 0) { ++ ret = 0; + } + + goto done; +@@ -3859,7 +3847,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + if (tmp.type == BTRFS_DIR_INDEX_KEY) + last_old_dentry_offset = tmp.offset; + } else if (ret < 0) { +- err = ret; + goto done; + } + +@@ -3881,12 +3868,15 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + */ + search: + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); +- if (ret > 0) ++ if (ret > 0) { + ret = btrfs_next_item(root, path); ++ if (ret > 0) { ++ /* There are no more keys in the inode's root. */ ++ ret = 0; ++ goto done; ++ } ++ } + if (ret < 0) +- err = ret; +- /* If ret is 1, there are no more keys in the inode's root. */ +- if (ret != 0) + goto done; + + /* +@@ -3897,8 +3887,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, + &last_old_dentry_offset); + if (ret != 0) { +- if (ret < 0) +- err = ret; ++ if (ret > 0) ++ ret = 0; + goto done; + } + path->slots[0] = btrfs_header_nritems(path->nodes[0]); +@@ -3909,10 +3899,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + */ + ret = btrfs_next_leaf(root, path); + if (ret) { +- if (ret == 1) ++ if (ret == 1) { + last_offset = (u64)-1; +- else +- err = ret; ++ ret = 0; ++ } + goto done; + } + btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); +@@ -3943,7 +3933,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + btrfs_release_path(path); + btrfs_release_path(dst_path); + +- if (err == 0) { ++ if (ret == 0) { + *last_offset_ret = last_offset; + /* + * In case the leaf was changed in the current transaction but +@@ -3954,15 +3944,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + * a range, last_old_dentry_offset is == to last_offset. + */ + ASSERT(last_old_dentry_offset <= last_offset); +- if (last_old_dentry_offset < last_offset) { ++ if (last_old_dentry_offset < last_offset) + ret = insert_dir_log_key(trans, log, path, ino, + last_old_dentry_offset + 1, + last_offset); +- if (ret) +- err = ret; +- } + } +- return err; ++ ++ return ret; + } + + /* +@@ -5604,10 +5592,8 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, + * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction + * commits. + */ +- if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) { +- btrfs_set_log_full_commit(trans); ++ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) + return BTRFS_LOG_FORCE_COMMIT; +- } + + inode = btrfs_iget(root->fs_info->sb, ino, root); + /* +@@ -6466,7 +6452,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, + * result in losing the file after a log replay. 
+ */ + if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { +- btrfs_set_log_full_commit(trans); + ret = BTRFS_LOG_FORCE_COMMIT; + goto out_unlock; + } +diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h +index 85cd24cb0540..bdeb5216718f 100644 +--- a/fs/btrfs/tree-log.h ++++ b/fs/btrfs/tree-log.h +@@ -13,8 +13,13 @@ + /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ + #define BTRFS_NO_LOG_SYNC 256 + +-/* We can't use the tree log for whatever reason, force a transaction commit */ +-#define BTRFS_LOG_FORCE_COMMIT (1) ++/* ++ * We can't use the tree log for whatever reason, force a transaction commit. ++ * We use a negative value because there are functions through the logging code ++ * that need to return an error (< 0 value), false (0) or true (1). Any negative ++ * value will do, as it will cause the log to be marked for a full sync. ++ */ ++#define BTRFS_LOG_FORCE_COMMIT (-(MAX_ERRNO + 1)) + + struct btrfs_log_ctx { + int log_ret; +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index df43093b7a46..7823168c08a6 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -728,7 +728,7 @@ static struct btrfs_fs_devices *find_fsid_reverted_metadata( + /* + * Handle the case where the scanned device is part of an fs whose last + * metadata UUID change reverted it to the original FSID. At the same +- * time * fs_devices was first created by another constitutent device ++ * time fs_devices was first created by another constituent device + * which didn't fully observe the operation. This results in an + * btrfs_fs_devices created with metadata/fsid different AND + * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the +@@ -6284,91 +6284,42 @@ static bool need_full_stripe(enum btrfs_map_op op) + return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); + } + +-/* +- * Calculate the geometry of a particular (address, len) tuple. This +- * information is used to calculate how big a particular bio can get before it +- * straddles a stripe. +- * +- * @fs_info: the filesystem +- * @em: mapping containing the logical extent +- * @op: type of operation - write or read +- * @logical: address that we want to figure out the geometry of +- * @io_geom: pointer used to return values +- * +- * Returns < 0 in case a chunk for the given logical address cannot be found, +- * usually shouldn't happen unless @logical is corrupted, 0 otherwise. +- */ +-int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, +- enum btrfs_map_op op, u64 logical, +- struct btrfs_io_geometry *io_geom) ++static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, ++ u64 offset, u64 *stripe_nr, u64 *stripe_offset, ++ u64 *full_stripe_start) + { +- struct map_lookup *map; +- u64 len; +- u64 offset; +- u64 stripe_offset; +- u64 stripe_nr; +- u32 stripe_len; +- u64 raid56_full_stripe_start = (u64)-1; +- int data_stripes; ++ u32 stripe_len = map->stripe_len; + + ASSERT(op != BTRFS_MAP_DISCARD); + +- map = em->map_lookup; +- /* Offset of this logical address in the chunk */ +- offset = logical - em->start; +- /* Len of a stripe in a chunk */ +- stripe_len = map->stripe_len; + /* +- * Stripe_nr is where this block falls in +- * stripe_offset is the offset of this block in its stripe. ++ * Stripe_nr is the stripe where this block falls. stripe_offset is ++ * the offset of this block in its stripe. 
+ */ +- stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset); +- ASSERT(stripe_offset < U32_MAX); ++ *stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset); ++ ASSERT(*stripe_offset < U32_MAX); + +- data_stripes = nr_data_stripes(map); ++ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ++ unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); + +- /* Only stripe based profiles needs to check against stripe length. */ +- if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { +- u64 max_len = stripe_len - stripe_offset; ++ *full_stripe_start = ++ div64_u64(offset, full_stripe_len) * full_stripe_len; + + /* +- * In case of raid56, we need to know the stripe aligned start ++ * For writes to RAID56, allow to write a full stripe set, but ++ * no straddling of stripe sets. + */ +- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { +- unsigned long full_stripe_len = stripe_len * data_stripes; +- raid56_full_stripe_start = offset; +- +- /* +- * Allow a write of a full stripe, but make sure we +- * don't allow straddling of stripes +- */ +- raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, +- full_stripe_len); +- raid56_full_stripe_start *= full_stripe_len; +- +- /* +- * For writes to RAID[56], allow a full stripeset across +- * all disks. For other RAID types and for RAID[56] +- * reads, just allow a single stripe (on a single disk). +- */ +- if (op == BTRFS_MAP_WRITE) { +- max_len = stripe_len * data_stripes - +- (offset - raid56_full_stripe_start); +- } +- } +- len = min_t(u64, em->len - offset, max_len); +- } else { +- len = em->len - offset; ++ if (op == BTRFS_MAP_WRITE) ++ return full_stripe_len - (offset - *full_stripe_start); + } + +- io_geom->len = len; +- io_geom->offset = offset; +- io_geom->stripe_len = stripe_len; +- io_geom->stripe_nr = stripe_nr; +- io_geom->stripe_offset = stripe_offset; +- io_geom->raid56_stripe_offset = raid56_full_stripe_start; +- +- return 0; ++ /* ++ * For other RAID types and for RAID56 reads, allow a single stripe (on ++ * a single disk). 
++ */ ++ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) ++ return stripe_len - *stripe_offset; ++ return U64_MAX; + } + + static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, +@@ -6387,6 +6338,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + { + struct extent_map *em; + struct map_lookup *map; ++ u64 map_offset; + u64 stripe_offset; + u64 stripe_nr; + u64 stripe_len; +@@ -6405,7 +6357,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + int patch_the_first_stripe_for_dev_replace = 0; + u64 physical_to_patch_in_first_stripe = 0; + u64 raid56_full_stripe_start = (u64)-1; +- struct btrfs_io_geometry geom; ++ u64 max_len; + + ASSERT(bioc_ret); + ASSERT(op != BTRFS_MAP_DISCARD); +@@ -6413,18 +6365,14 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + em = btrfs_get_chunk_map(fs_info, logical, *length); + ASSERT(!IS_ERR(em)); + +- ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); +- if (ret < 0) +- return ret; +- + map = em->map_lookup; +- +- *length = geom.len; +- stripe_len = geom.stripe_len; +- stripe_nr = geom.stripe_nr; +- stripe_offset = geom.stripe_offset; +- raid56_full_stripe_start = geom.raid56_stripe_offset; + data_stripes = nr_data_stripes(map); ++ stripe_len = map->stripe_len; ++ ++ map_offset = logical - em->start; ++ max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, ++ &stripe_offset, &raid56_full_stripe_start); ++ *length = min_t(u64, em->len - map_offset, max_len); + + down_read(&dev_replace->rwsem); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); +diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h +index 6b7a05f6cf82..7e51f2238f72 100644 +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -53,21 +53,6 @@ enum btrfs_raid_types { + BTRFS_NR_RAID_TYPES + }; + +-struct btrfs_io_geometry { +- /* remaining bytes before crossing a stripe */ +- u64 len; +- /* offset of logical address in chunk */ +- u64 offset; +- /* length of single IO stripe */ +- u32 stripe_len; +- /* offset of address in stripe */ +- u32 stripe_offset; +- /* number of stripe where address falls */ +- u64 stripe_nr; +- /* offset of raid56 stripe into the chunk */ +- u64 raid56_stripe_offset; +-}; +- + /* + * Use sequence counter to get consistent device stat data on + * 32-bit processors. 
+@@ -545,9 +530,6 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length_ret, + u32 *num_stripes); +-int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, +- enum btrfs_map_op op, u64 logical, +- struct btrfs_io_geometry *io_geom); + int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); + int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); + struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 1f503e8e42d4..f95b2c94d619 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -17,6 +17,7 @@ + #include "space-info.h" + #include "fs.h" + #include "accessors.h" ++#include "bio.h" + + /* Maximum number of zones to report per blkdev_report_zones() call */ + #define BTRFS_REPORT_NR_ZONES 4096 +@@ -160,7 +161,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, + */ + static inline u32 sb_zone_number(int shift, int mirror) + { +- u64 zone; ++ u64 zone = U64_MAX; + + ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); + switch (mirror) { +@@ -220,7 +221,6 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, + struct blk_zone *zones, unsigned int *nr_zones) + { + struct btrfs_zoned_device_info *zinfo = device->zone_info; +- u32 zno; + int ret; + + if (!*nr_zones) +@@ -235,6 +235,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, + /* Check cache */ + if (zinfo->zone_cache) { + unsigned int i; ++ u32 zno; + + ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); + zno = pos >> zinfo->zone_size_shift; +@@ -274,9 +275,12 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, + return -EIO; + + /* Populate cache */ +- if (zinfo->zone_cache) ++ if (zinfo->zone_cache) { ++ u32 zno = pos >> zinfo->zone_size_shift; ++ + memcpy(zinfo->zone_cache + zno, zones, + sizeof(*zinfo->zone_cache) * *nr_zones); ++ } + + return 0; + } +@@ -417,25 +421,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) + nr_sectors = bdev_nr_sectors(bdev); + zone_info->zone_size_shift = ilog2(zone_info->zone_size); + zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); +- /* +- * We limit max_zone_append_size also by max_segments * +- * PAGE_SIZE. Technically, we can have multiple pages per segment. But, +- * since btrfs adds the pages one by one to a bio, and btrfs cannot +- * increase the metadata reservation even if it increases the number of +- * extents, it is safe to stick with the limit. +- * +- * With the zoned emulation, we can have non-zoned device on the zoned +- * mode. In this case, we don't have a valid max zone append size. So, +- * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size. 
+- */ +- if (bdev_is_zoned(bdev)) { +- zone_info->max_zone_append_size = min_t(u64, +- (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, +- (u64)bdev_max_segments(bdev) << PAGE_SHIFT); +- } else { +- zone_info->max_zone_append_size = +- (u64)bdev_max_segments(bdev) << PAGE_SHIFT; +- } + if (!IS_ALIGNED(nr_sectors, zone_sectors)) + zone_info->nr_zones++; + +@@ -715,9 +700,9 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) + + int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) + { ++ struct queue_limits *lim = &fs_info->limits; + struct btrfs_device *device; + u64 zone_size = 0; +- u64 max_zone_append_size = 0; + int ret; + + /* +@@ -727,6 +712,8 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) + if (!btrfs_fs_incompat(fs_info, ZONED)) + return btrfs_check_for_zoned_device(fs_info); + ++ blk_set_stacking_limits(lim); ++ + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + struct btrfs_zoned_device_info *zone_info = device->zone_info; + +@@ -741,10 +728,17 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) + zone_info->zone_size, zone_size); + return -EINVAL; + } +- if (!max_zone_append_size || +- (zone_info->max_zone_append_size && +- zone_info->max_zone_append_size < max_zone_append_size)) +- max_zone_append_size = zone_info->max_zone_append_size; ++ ++ /* ++ * With the zoned emulation, we can have non-zoned device on the ++ * zoned mode. In this case, we don't have a valid max zone ++ * append size. ++ */ ++ if (bdev_is_zoned(device->bdev)) { ++ blk_stack_limits(lim, ++ &bdev_get_queue(device->bdev)->limits, ++ 0); ++ } + } + + /* +@@ -765,8 +759,18 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) + } + + fs_info->zone_size = zone_size; +- fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, +- fs_info->sectorsize); ++ /* ++ * Also limit max_zone_append_size by max_segments * PAGE_SIZE. ++ * Technically, we can have multiple pages per segment. But, since ++ * we add the pages one by one to a bio, and cannot increase the ++ * metadata reservation even if it increases the number of extents, it ++ * is safe to stick with the limit. ++ */ ++ fs_info->max_zone_append_size = ALIGN_DOWN( ++ min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT, ++ (u64)lim->max_sectors << SECTOR_SHIFT, ++ (u64)lim->max_segments << PAGE_SHIFT), ++ fs_info->sectorsize); + fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; + if (fs_info->max_zone_append_size < fs_info->max_extent_size) + fs_info->max_extent_size = fs_info->max_zone_append_size; +@@ -1623,8 +1627,10 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans) + spin_unlock(&trans->releasing_ebs_lock); + } + +-bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) ++bool btrfs_use_zone_append(struct btrfs_bio *bbio) + { ++ u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); ++ struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_block_group *cache; + bool ret = false; +@@ -1635,6 +1641,9 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) + if (!is_data_inode(&inode->vfs_inode)) + return false; + ++ if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) ++ return false; ++ + /* + * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the + * extent layout the relocation code has. 
+@@ -1657,22 +1666,16 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) + return ret; + } + +-void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, +- struct bio *bio) ++void btrfs_record_physical_zoned(struct btrfs_bio *bbio) + { ++ const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + struct btrfs_ordered_extent *ordered; +- const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + +- if (bio_op(bio) != REQ_OP_ZONE_APPEND) +- return; +- +- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset); ++ ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); + if (WARN_ON(!ordered)) + return; + + ordered->physical = physical; +- ordered->bdev = bio->bi_bdev; +- + btrfs_put_ordered_extent(ordered); + } + +@@ -1684,43 +1687,46 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) + struct extent_map *em; + struct btrfs_ordered_sum *sum; + u64 orig_logical = ordered->disk_bytenr; +- u64 *logical = NULL; +- int nr, stripe_len; ++ struct map_lookup *map; ++ u64 physical = ordered->physical; ++ u64 chunk_start_phys; ++ u64 logical; + +- /* Zoned devices should not have partitions. So, we can assume it is 0 */ +- ASSERT(!bdev_is_partition(ordered->bdev)); +- if (WARN_ON(!ordered->bdev)) ++ em = btrfs_get_chunk_map(fs_info, orig_logical, 1); ++ if (IS_ERR(em)) + return; ++ map = em->map_lookup; ++ chunk_start_phys = map->stripes[0].physical; + +- if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev, +- ordered->physical, &logical, &nr, +- &stripe_len))) +- goto out; +- +- WARN_ON(nr != 1); ++ if (WARN_ON_ONCE(map->num_stripes > 1) || ++ WARN_ON_ONCE((map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) || ++ WARN_ON_ONCE(physical < chunk_start_phys) || ++ WARN_ON_ONCE(physical > chunk_start_phys + em->orig_block_len)) { ++ free_extent_map(em); ++ return; ++ } ++ logical = em->start + (physical - map->stripes[0].physical); ++ free_extent_map(em); + +- if (orig_logical == *logical) +- goto out; ++ if (orig_logical == logical) ++ return; + +- ordered->disk_bytenr = *logical; ++ ordered->disk_bytenr = logical; + + em_tree = &inode->extent_tree; + write_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, ordered->file_offset, + ordered->num_bytes); +- em->block_start = *logical; ++ em->block_start = logical; + free_extent_map(em); + write_unlock(&em_tree->lock); + + list_for_each_entry(sum, &ordered->list, list) { +- if (*logical < orig_logical) +- sum->bytenr -= orig_logical - *logical; ++ if (logical < orig_logical) ++ sum->bytenr -= orig_logical - logical; + else +- sum->bytenr += *logical - orig_logical; ++ sum->bytenr += logical - orig_logical; + } +- +-out: +- kfree(logical); + } + + bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, +@@ -1845,26 +1851,6 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); + } + +-struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, +- u64 logical, u64 length) +-{ +- struct btrfs_device *device; +- struct extent_map *em; +- struct map_lookup *map; +- +- em = btrfs_get_chunk_map(fs_info, logical, length); +- if (IS_ERR(em)) +- return ERR_CAST(em); +- +- map = em->map_lookup; +- /* We only support single profile for now */ +- device = map->stripes[0].dev; +- +- free_extent_map(em); +- +- return device; +-} +- + /* + * Activate block group and underlying device zones + * +diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h 
+index f43990985d80..c0570d35fea2 100644 +--- a/fs/btrfs/zoned.h ++++ b/fs/btrfs/zoned.h +@@ -20,7 +20,6 @@ struct btrfs_zoned_device_info { + */ + u64 zone_size; + u8 zone_size_shift; +- u64 max_zone_append_size; + u32 nr_zones; + unsigned int max_active_zones; + atomic_t active_zones_left; +@@ -56,9 +55,8 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); + void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb); + void btrfs_free_redirty_list(struct btrfs_transaction *trans); +-bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start); +-void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, +- struct bio *bio); ++bool btrfs_use_zone_append(struct btrfs_bio *bbio); ++void btrfs_record_physical_zoned(struct btrfs_bio *bbio); + void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); + bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, +@@ -68,8 +66,6 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, + int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); + int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + u64 physical_start, u64 physical_pos); +-struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, +- u64 logical, u64 length); + bool btrfs_zone_activate(struct btrfs_block_group *block_group); + int btrfs_zone_finish(struct btrfs_block_group *block_group); + bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); +@@ -185,13 +181,12 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb) { } + static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } + +-static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) ++static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio) + { + return false; + } + +-static inline void btrfs_record_physical_zoned(struct inode *inode, +- u64 file_offset, struct bio *bio) ++static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) + { + } + +@@ -224,13 +219,6 @@ static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, + return -EOPNOTSUPP; + } + +-static inline struct btrfs_device *btrfs_zoned_get_device( +- struct btrfs_fs_info *fs_info, +- u64 logical, u64 length) +-{ +- return ERR_PTR(-EOPNOTSUPP); +-} +- + static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group) + { + return true; +diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c +index e7537fd305dd..e191ecfb1fde 100644 +--- a/fs/gfs2/bmap.c ++++ b/fs/gfs2/bmap.c +@@ -956,26 +956,40 @@ static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, + goto out; + } + +-static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos, +- unsigned len) ++static struct folio * ++gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len) + { ++ struct inode *inode = iter->inode; + unsigned int blockmask = i_blocksize(inode) - 1; + struct gfs2_sbd *sdp = GFS2_SB(inode); + unsigned int blocks; ++ struct folio *folio; ++ int status; + + blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits; +- return gfs2_trans_begin(sdp, RES_DINODE + blocks, 0); ++ status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0); ++ if (status) ++ return ERR_PTR(status); ++ ++ folio = iomap_get_folio(iter, pos); ++ if (IS_ERR(folio)) ++ gfs2_trans_end(sdp); ++ return folio; + } + +-static void 
gfs2_iomap_page_done(struct inode *inode, loff_t pos, +- unsigned copied, struct page *page) ++static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos, ++ unsigned copied, struct folio *folio) + { + struct gfs2_trans *tr = current->journal_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + +- if (page && !gfs2_is_stuffed(ip)) +- gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied); ++ if (!gfs2_is_stuffed(ip)) ++ gfs2_page_add_databufs(ip, &folio->page, offset_in_page(pos), ++ copied); ++ ++ folio_unlock(folio); ++ folio_put(folio); + + if (tr->tr_num_buf_new) + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); +@@ -983,9 +997,9 @@ static void gfs2_iomap_page_done(struct inode *inode, loff_t pos, + gfs2_trans_end(sdp); + } + +-static const struct iomap_page_ops gfs2_iomap_page_ops = { +- .page_prepare = gfs2_iomap_page_prepare, +- .page_done = gfs2_iomap_page_done, ++static const struct iomap_folio_ops gfs2_iomap_folio_ops = { ++ .get_folio = gfs2_iomap_get_folio, ++ .put_folio = gfs2_iomap_put_folio, + }; + + static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, +@@ -1061,7 +1075,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, + } + + if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip)) +- iomap->page_ops = &gfs2_iomap_page_ops; ++ iomap->folio_ops = &gfs2_iomap_folio_ops; + return 0; + + out_trans_end: +@@ -1277,7 +1291,7 @@ int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, + /* + * NOTE: Never call gfs2_block_zero_range with an open transaction because it + * uses iomap write to perform its actions, which begin their own transactions +- * (iomap_begin, page_prepare, etc.) ++ * (iomap_begin, get_folio, etc.) + */ + static int gfs2_block_zero_range(struct inode *inode, loff_t from, + unsigned int length) +diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c +index 356193e44cf0..d3c300563eb8 100644 +--- a/fs/iomap/buffered-io.c ++++ b/fs/iomap/buffered-io.c +@@ -457,6 +457,33 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) + } + EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); + ++/** ++ * iomap_get_folio - get a folio reference for writing ++ * @iter: iteration structure ++ * @pos: start offset of write ++ * ++ * Returns a locked reference to the folio at @pos, or an error pointer if the ++ * folio could not be obtained. 
++ */ ++struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) ++{ ++ unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; ++ struct folio *folio; ++ ++ if (iter->flags & IOMAP_NOWAIT) ++ fgp |= FGP_NOWAIT; ++ ++ folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, ++ fgp, mapping_gfp_mask(iter->inode->i_mapping)); ++ if (folio) ++ return folio; ++ ++ if (iter->flags & IOMAP_NOWAIT) ++ return ERR_PTR(-EAGAIN); ++ return ERR_PTR(-ENOMEM); ++} ++EXPORT_SYMBOL_GPL(iomap_get_folio); ++ + bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags) + { + trace_iomap_release_folio(folio->mapping->host, folio_pos(folio), +@@ -575,6 +602,30 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, + return 0; + } + ++static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos, ++ size_t len) ++{ ++ const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; ++ ++ if (folio_ops && folio_ops->get_folio) ++ return folio_ops->get_folio(iter, pos, len); ++ else ++ return iomap_get_folio(iter, pos); ++} ++ ++static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret, ++ struct folio *folio) ++{ ++ const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; ++ ++ if (folio_ops && folio_ops->put_folio) { ++ folio_ops->put_folio(iter->inode, pos, ret, folio); ++ } else { ++ folio_unlock(folio); ++ folio_put(folio); ++ } ++} ++ + static int iomap_write_begin_inline(const struct iomap_iter *iter, + struct folio *folio) + { +@@ -587,15 +638,11 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter, + static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, + size_t len, struct folio **foliop) + { +- const struct iomap_page_ops *page_ops = iter->iomap.page_ops; ++ const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + struct folio *folio; +- unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; + int status = 0; + +- if (iter->flags & IOMAP_NOWAIT) +- fgp |= FGP_NOWAIT; +- + BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); + if (srcmap != &iter->iomap) + BUG_ON(pos + len > srcmap->offset + srcmap->length); +@@ -606,18 +653,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, + if (!mapping_large_folio_support(iter->inode->i_mapping)) + len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); + +- if (page_ops && page_ops->page_prepare) { +- status = page_ops->page_prepare(iter->inode, pos, len); +- if (status) +- return status; +- } +- +- folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, +- fgp, mapping_gfp_mask(iter->inode->i_mapping)); +- if (!folio) { +- status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; +- goto out_no_page; +- } ++ folio = __iomap_get_folio(iter, pos, len); ++ if (IS_ERR(folio)) ++ return PTR_ERR(folio); + + /* + * Now we have a locked folio, before we do anything with it we need to +@@ -629,9 +667,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, + * could do the wrong thing here (zero a page range incorrectly or fail + * to zero) and corrupt data. 
+ */ +- if (page_ops && page_ops->iomap_valid) { +- bool iomap_valid = page_ops->iomap_valid(iter->inode, +- &iter->iomap); ++ if (folio_ops && folio_ops->iomap_valid) { ++ bool iomap_valid = folio_ops->iomap_valid(iter->inode, ++ &iter->iomap); + if (!iomap_valid) { + iter->iomap.flags |= IOMAP_F_STALE; + status = 0; +@@ -656,13 +694,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, + return 0; + + out_unlock: +- folio_unlock(folio); +- folio_put(folio); ++ __iomap_put_folio(iter, pos, 0, folio); + iomap_write_failed(iter->inode, pos, len); + +-out_no_page: +- if (page_ops && page_ops->page_done) +- page_ops->page_done(iter->inode, pos, 0, NULL); + return status; + } + +@@ -712,7 +746,6 @@ static size_t iomap_write_end_inline(const struct iomap_iter *iter, + static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, + size_t copied, struct folio *folio) + { +- const struct iomap_page_ops *page_ops = iter->iomap.page_ops; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t old_size = iter->inode->i_size; + size_t ret; +@@ -735,14 +768,10 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, + i_size_write(iter->inode, pos + ret); + iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; + } +- folio_unlock(folio); ++ __iomap_put_folio(iter, pos, ret, folio); + + if (old_size < pos) + pagecache_isize_extended(iter->inode, old_size, pos); +- if (page_ops && page_ops->page_done) +- page_ops->page_done(iter->inode, pos, ret, &folio->page); +- folio_put(folio); +- + if (ret < len) + iomap_write_failed(iter->inode, pos + ret, len - ret); + return ret; +diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c +index 9804714b1751..f771001574d0 100644 +--- a/fs/iomap/direct-io.c ++++ b/fs/iomap/direct-io.c +@@ -217,16 +217,10 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, + { + blk_opf_t opflags = REQ_SYNC | REQ_IDLE; + +- if (!(dio->flags & IOMAP_DIO_WRITE)) { +- WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND); ++ if (!(dio->flags & IOMAP_DIO_WRITE)) + return REQ_OP_READ; +- } +- +- if (iomap->flags & IOMAP_F_ZONE_APPEND) +- opflags |= REQ_OP_ZONE_APPEND; +- else +- opflags |= REQ_OP_WRITE; + ++ opflags |= REQ_OP_WRITE; + if (use_fua) + opflags |= REQ_FUA; + else +diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c +index 989cf341779b..f8ff81c3de76 100644 +--- a/fs/xfs/libxfs/xfs_alloc.c ++++ b/fs/xfs/libxfs/xfs_alloc.c +@@ -2472,20 +2472,20 @@ xfs_defer_agfl_block( + struct xfs_owner_info *oinfo) + { + struct xfs_mount *mp = tp->t_mountp; +- struct xfs_extent_free_item *new; /* new element */ ++ struct xfs_extent_free_item *xefi; + + ASSERT(xfs_extfree_item_cache != NULL); + ASSERT(oinfo != NULL); + +- new = kmem_cache_zalloc(xfs_extfree_item_cache, ++ xefi = kmem_cache_zalloc(xfs_extfree_item_cache, + GFP_KERNEL | __GFP_NOFAIL); +- new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); +- new->xefi_blockcount = 1; +- new->xefi_owner = oinfo->oi_owner; ++ xefi->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); ++ xefi->xefi_blockcount = 1; ++ xefi->xefi_owner = oinfo->oi_owner; + + trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); + +- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list); ++ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); + } + + /* +@@ -2500,7 +2500,7 @@ __xfs_free_extent_later( + const struct xfs_owner_info *oinfo, + bool skip_discard) + { +- struct xfs_extent_free_item *new; /* new element */ ++ struct xfs_extent_free_item *xefi; + 
#ifdef DEBUG + struct xfs_mount *mp = tp->t_mountp; + xfs_agnumber_t agno; +@@ -2519,27 +2519,27 @@ __xfs_free_extent_later( + #endif + ASSERT(xfs_extfree_item_cache != NULL); + +- new = kmem_cache_zalloc(xfs_extfree_item_cache, ++ xefi = kmem_cache_zalloc(xfs_extfree_item_cache, + GFP_KERNEL | __GFP_NOFAIL); +- new->xefi_startblock = bno; +- new->xefi_blockcount = (xfs_extlen_t)len; ++ xefi->xefi_startblock = bno; ++ xefi->xefi_blockcount = (xfs_extlen_t)len; + if (skip_discard) +- new->xefi_flags |= XFS_EFI_SKIP_DISCARD; ++ xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; + if (oinfo) { + ASSERT(oinfo->oi_offset == 0); + + if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) +- new->xefi_flags |= XFS_EFI_ATTR_FORK; ++ xefi->xefi_flags |= XFS_EFI_ATTR_FORK; + if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) +- new->xefi_flags |= XFS_EFI_BMBT_BLOCK; +- new->xefi_owner = oinfo->oi_owner; ++ xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK; ++ xefi->xefi_owner = oinfo->oi_owner; + } else { +- new->xefi_owner = XFS_RMAP_OWN_NULL; ++ xefi->xefi_owner = XFS_RMAP_OWN_NULL; + } + trace_xfs_bmap_free_defer(tp->t_mountp, + XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, + XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); +- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list); ++ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); + } + + #ifdef DEBUG +diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c +index 0d56a8d862e8..c8c65387136c 100644 +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -6146,39 +6146,37 @@ xfs_bmap_unmap_extent( + int + xfs_bmap_finish_one( + struct xfs_trans *tp, +- struct xfs_inode *ip, +- enum xfs_bmap_intent_type type, +- int whichfork, +- xfs_fileoff_t startoff, +- xfs_fsblock_t startblock, +- xfs_filblks_t *blockcount, +- xfs_exntst_t state) ++ struct xfs_bmap_intent *bi) + { ++ struct xfs_bmbt_irec *bmap = &bi->bi_bmap; + int error = 0; + + ASSERT(tp->t_firstblock == NULLFSBLOCK); + + trace_xfs_bmap_deferred(tp->t_mountp, +- XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, +- XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), +- ip->i_ino, whichfork, startoff, *blockcount, state); ++ XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock), ++ bi->bi_type, ++ XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock), ++ bi->bi_owner->i_ino, bi->bi_whichfork, ++ bmap->br_startoff, bmap->br_blockcount, ++ bmap->br_state); + +- if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK)) ++ if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK)) + return -EFSCORRUPTED; + + if (XFS_TEST_ERROR(false, tp->t_mountp, + XFS_ERRTAG_BMAP_FINISH_ONE)) + return -EIO; + +- switch (type) { ++ switch (bi->bi_type) { + case XFS_BMAP_MAP: +- error = xfs_bmapi_remap(tp, ip, startoff, *blockcount, +- startblock, 0); +- *blockcount = 0; ++ error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff, ++ bmap->br_blockcount, bmap->br_startblock, 0); ++ bmap->br_blockcount = 0; + break; + case XFS_BMAP_UNMAP: +- error = __xfs_bunmapi(tp, ip, startoff, blockcount, +- XFS_BMAPI_REMAP, 1); ++ error = __xfs_bunmapi(tp, bi->bi_owner, bmap->br_startoff, ++ &bmap->br_blockcount, XFS_BMAPI_REMAP, 1); + break; + default: + ASSERT(0); +diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h +index 16db95b11589..01c2df35c3e3 100644 +--- a/fs/xfs/libxfs/xfs_bmap.h ++++ b/fs/xfs/libxfs/xfs_bmap.h +@@ -234,10 +234,7 @@ struct xfs_bmap_intent { + struct xfs_bmbt_irec bi_bmap; + }; + +-int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip, +- enum xfs_bmap_intent_type type, int whichfork, +- xfs_fileoff_t 
startoff, xfs_fsblock_t startblock, +- xfs_filblks_t *blockcount, xfs_exntst_t state); ++int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi); + void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, + struct xfs_bmbt_irec *imap); + void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, +diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c +index 35f574421670..da8c769887fd 100644 +--- a/fs/xfs/libxfs/xfs_btree.c ++++ b/fs/xfs/libxfs/xfs_btree.c +@@ -2913,9 +2913,22 @@ xfs_btree_split_worker( + } + + /* +- * BMBT split requests often come in with little stack to work on. Push ++ * BMBT split requests often come in with little stack to work on so we push + * them off to a worker thread so there is lots of stack to use. For the other + * btree types, just call directly to avoid the context switch overhead here. ++ * ++ * Care must be taken here - the work queue rescuer thread introduces potential ++ * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new ++ * AGFs to allocate blocks. A task being run by the rescuer could attempt to ++ * lock an AGF that is already locked by a task queued to run by the rescuer, ++ * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to ++ * release it until the current thread it is running gains the lock. ++ * ++ * To avoid this issue, we only ever queue BMBT splits that don't have an AGF ++ * already locked to allocate from. The only place that doesn't hold an AGF ++ * locked is unwritten extent conversion at IO completion, but that has already ++ * been offloaded to a worker thread and hence has no stack consumption issues ++ * we have to worry about. + */ + STATIC int /* error */ + xfs_btree_split( +@@ -2929,7 +2942,8 @@ xfs_btree_split( + struct xfs_btree_split_args args; + DECLARE_COMPLETION_ONSTACK(done); + +- if (cur->bc_btnum != XFS_BTNUM_BMAP) ++ if (cur->bc_btnum != XFS_BTNUM_BMAP || ++ cur->bc_tp->t_firstblock == NULLFSBLOCK) + return __xfs_btree_split(cur, level, ptrp, key, curp, stat); + + args.cur = cur; +diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c +index 6f7ed9288fe4..bcf46aa0d08b 100644 +--- a/fs/xfs/libxfs/xfs_refcount.c ++++ b/fs/xfs/libxfs/xfs_refcount.c +@@ -1213,37 +1213,33 @@ xfs_refcount_adjust_extents( + STATIC int + xfs_refcount_adjust( + struct xfs_btree_cur *cur, +- xfs_agblock_t agbno, +- xfs_extlen_t aglen, +- xfs_agblock_t *new_agbno, +- xfs_extlen_t *new_aglen, ++ xfs_agblock_t *agbno, ++ xfs_extlen_t *aglen, + enum xfs_refc_adjust_op adj) + { + bool shape_changed; + int shape_changes = 0; + int error; + +- *new_agbno = agbno; +- *new_aglen = aglen; + if (adj == XFS_REFCOUNT_ADJUST_INCREASE) +- trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.pag->pag_agno, +- agbno, aglen); ++ trace_xfs_refcount_increase(cur->bc_mp, ++ cur->bc_ag.pag->pag_agno, *agbno, *aglen); + else +- trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.pag->pag_agno, +- agbno, aglen); ++ trace_xfs_refcount_decrease(cur->bc_mp, ++ cur->bc_ag.pag->pag_agno, *agbno, *aglen); + + /* + * Ensure that no rcextents cross the boundary of the adjustment range. 
+ */ + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, +- agbno, &shape_changed); ++ *agbno, &shape_changed); + if (error) + goto out_error; + if (shape_changed) + shape_changes++; + + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, +- agbno + aglen, &shape_changed); ++ *agbno + *aglen, &shape_changed); + if (error) + goto out_error; + if (shape_changed) +@@ -1253,7 +1249,7 @@ xfs_refcount_adjust( + * Try to merge with the left or right extents of the range. + */ + error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED, +- new_agbno, new_aglen, adj, &shape_changed); ++ agbno, aglen, adj, &shape_changed); + if (error) + goto out_error; + if (shape_changed) +@@ -1262,7 +1258,7 @@ xfs_refcount_adjust( + cur->bc_ag.refc.shape_changes++; + + /* Now that we've taken care of the ends, adjust the middle extents */ +- error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, adj); ++ error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj); + if (error) + goto out_error; + +@@ -1298,21 +1294,20 @@ xfs_refcount_finish_one_cleanup( + static inline int + xfs_refcount_continue_op( + struct xfs_btree_cur *cur, +- xfs_fsblock_t startblock, +- xfs_agblock_t new_agbno, +- xfs_extlen_t new_len, +- xfs_fsblock_t *new_fsbno) ++ struct xfs_refcount_intent *ri, ++ xfs_agblock_t new_agbno) + { + struct xfs_mount *mp = cur->bc_mp; + struct xfs_perag *pag = cur->bc_ag.pag; + +- if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len))) ++ if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, ++ ri->ri_blockcount))) + return -EFSCORRUPTED; + +- *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); ++ ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + +- ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len)); +- ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno)); ++ ASSERT(xfs_verify_fsbext(mp, ri->ri_startblock, ri->ri_blockcount)); ++ ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); + + return 0; + } +@@ -1327,11 +1322,7 @@ xfs_refcount_continue_op( + int + xfs_refcount_finish_one( + struct xfs_trans *tp, +- enum xfs_refcount_intent_type type, +- xfs_fsblock_t startblock, +- xfs_extlen_t blockcount, +- xfs_fsblock_t *new_fsb, +- xfs_extlen_t *new_len, ++ struct xfs_refcount_intent *ri, + struct xfs_btree_cur **pcur) + { + struct xfs_mount *mp = tp->t_mountp; +@@ -1339,17 +1330,16 @@ xfs_refcount_finish_one( + struct xfs_buf *agbp = NULL; + int error = 0; + xfs_agblock_t bno; +- xfs_agblock_t new_agbno; + unsigned long nr_ops = 0; + int shape_changes = 0; + struct xfs_perag *pag; + +- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); +- bno = XFS_FSB_TO_AGBNO(mp, startblock); ++ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); ++ bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock); + +- trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock), +- type, XFS_FSB_TO_AGBNO(mp, startblock), +- blockcount); ++ trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock), ++ ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock), ++ ri->ri_blockcount); + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) { + error = -EIO; +@@ -1380,42 +1370,42 @@ xfs_refcount_finish_one( + } + *pcur = rcur; + +- switch (type) { ++ switch (ri->ri_type) { + case XFS_REFCOUNT_INCREASE: +- error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, +- new_len, XFS_REFCOUNT_ADJUST_INCREASE); ++ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, ++ XFS_REFCOUNT_ADJUST_INCREASE); + if 
(error) + goto out_drop; +- if (*new_len > 0) +- error = xfs_refcount_continue_op(rcur, startblock, +- new_agbno, *new_len, new_fsb); ++ if (ri->ri_blockcount > 0) ++ error = xfs_refcount_continue_op(rcur, ri, bno); + break; + case XFS_REFCOUNT_DECREASE: +- error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, +- new_len, XFS_REFCOUNT_ADJUST_DECREASE); ++ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, ++ XFS_REFCOUNT_ADJUST_DECREASE); + if (error) + goto out_drop; +- if (*new_len > 0) +- error = xfs_refcount_continue_op(rcur, startblock, +- new_agbno, *new_len, new_fsb); ++ if (ri->ri_blockcount > 0) ++ error = xfs_refcount_continue_op(rcur, ri, bno); + break; + case XFS_REFCOUNT_ALLOC_COW: +- *new_fsb = startblock + blockcount; +- *new_len = 0; +- error = __xfs_refcount_cow_alloc(rcur, bno, blockcount); ++ error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount); ++ if (error) ++ goto out_drop; ++ ri->ri_blockcount = 0; + break; + case XFS_REFCOUNT_FREE_COW: +- *new_fsb = startblock + blockcount; +- *new_len = 0; +- error = __xfs_refcount_cow_free(rcur, bno, blockcount); ++ error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount); ++ if (error) ++ goto out_drop; ++ ri->ri_blockcount = 0; + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } +- if (!error && *new_len > 0) +- trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, type, +- bno, blockcount, new_agbno, *new_len); ++ if (!error && ri->ri_blockcount > 0) ++ trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, ++ ri->ri_type, bno, ri->ri_blockcount); + out_drop: + xfs_perag_put(pag); + return error; +diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h +index 452f30556f5a..c633477ce3ce 100644 +--- a/fs/xfs/libxfs/xfs_refcount.h ++++ b/fs/xfs/libxfs/xfs_refcount.h +@@ -75,9 +75,7 @@ void xfs_refcount_decrease_extent(struct xfs_trans *tp, + extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, + struct xfs_btree_cur *rcur, int error); + extern int xfs_refcount_finish_one(struct xfs_trans *tp, +- enum xfs_refcount_intent_type type, xfs_fsblock_t startblock, +- xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb, +- xfs_extlen_t *new_len, struct xfs_btree_cur **pcur); ++ struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); + + extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, + xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, +diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c +index b56aca1e7c66..df720041cd3d 100644 +--- a/fs/xfs/libxfs/xfs_rmap.c ++++ b/fs/xfs/libxfs/xfs_rmap.c +@@ -2390,13 +2390,7 @@ xfs_rmap_finish_one_cleanup( + int + xfs_rmap_finish_one( + struct xfs_trans *tp, +- enum xfs_rmap_intent_type type, +- uint64_t owner, +- int whichfork, +- xfs_fileoff_t startoff, +- xfs_fsblock_t startblock, +- xfs_filblks_t blockcount, +- xfs_exntst_t state, ++ struct xfs_rmap_intent *ri, + struct xfs_btree_cur **pcur) + { + struct xfs_mount *mp = tp->t_mountp; +@@ -2408,11 +2402,13 @@ xfs_rmap_finish_one( + xfs_agblock_t bno; + bool unwritten; + +- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); +- bno = XFS_FSB_TO_AGBNO(mp, startblock); ++ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock)); ++ bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock); + +- trace_xfs_rmap_deferred(mp, pag->pag_agno, type, bno, owner, whichfork, +- startoff, blockcount, state); ++ trace_xfs_rmap_deferred(mp, pag->pag_agno, ri->ri_type, bno, ++ ri->ri_owner, ri->ri_whichfork, ++ 
ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount, ++ ri->ri_bmap.br_state); + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) { + error = -EIO; +@@ -2448,35 +2444,37 @@ xfs_rmap_finish_one( + } + *pcur = rcur; + +- xfs_rmap_ino_owner(&oinfo, owner, whichfork, startoff); +- unwritten = state == XFS_EXT_UNWRITTEN; +- bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, startblock); ++ xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork, ++ ri->ri_bmap.br_startoff); ++ unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN; ++ bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock); + +- switch (type) { ++ switch (ri->ri_type) { + case XFS_RMAP_ALLOC: + case XFS_RMAP_MAP: +- error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo); ++ error = xfs_rmap_map(rcur, bno, ri->ri_bmap.br_blockcount, ++ unwritten, &oinfo); + break; + case XFS_RMAP_MAP_SHARED: +- error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten, +- &oinfo); ++ error = xfs_rmap_map_shared(rcur, bno, ++ ri->ri_bmap.br_blockcount, unwritten, &oinfo); + break; + case XFS_RMAP_FREE: + case XFS_RMAP_UNMAP: +- error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten, +- &oinfo); ++ error = xfs_rmap_unmap(rcur, bno, ri->ri_bmap.br_blockcount, ++ unwritten, &oinfo); + break; + case XFS_RMAP_UNMAP_SHARED: +- error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten, +- &oinfo); ++ error = xfs_rmap_unmap_shared(rcur, bno, ++ ri->ri_bmap.br_blockcount, unwritten, &oinfo); + break; + case XFS_RMAP_CONVERT: +- error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten, +- &oinfo); ++ error = xfs_rmap_convert(rcur, bno, ri->ri_bmap.br_blockcount, ++ !unwritten, &oinfo); + break; + case XFS_RMAP_CONVERT_SHARED: +- error = xfs_rmap_convert_shared(rcur, bno, blockcount, +- !unwritten, &oinfo); ++ error = xfs_rmap_convert_shared(rcur, bno, ++ ri->ri_bmap.br_blockcount, !unwritten, &oinfo); + break; + default: + ASSERT(0); +diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h +index 54741a591a17..2dac88cea28d 100644 +--- a/fs/xfs/libxfs/xfs_rmap.h ++++ b/fs/xfs/libxfs/xfs_rmap.h +@@ -179,10 +179,8 @@ void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, + + void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, + struct xfs_btree_cur *rcur, int error); +-int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, +- uint64_t owner, int whichfork, xfs_fileoff_t startoff, +- xfs_fsblock_t startblock, xfs_filblks_t blockcount, +- xfs_exntst_t state, struct xfs_btree_cur **pcur); ++int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri, ++ struct xfs_btree_cur **pcur); + + int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, + uint64_t owner, uint64_t offset, unsigned int flags, +diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c +index 41323da523d1..6e2f0013380a 100644 +--- a/fs/xfs/xfs_bmap_item.c ++++ b/fs/xfs/xfs_bmap_item.c +@@ -246,18 +246,11 @@ static int + xfs_trans_log_finish_bmap_update( + struct xfs_trans *tp, + struct xfs_bud_log_item *budp, +- enum xfs_bmap_intent_type type, +- struct xfs_inode *ip, +- int whichfork, +- xfs_fileoff_t startoff, +- xfs_fsblock_t startblock, +- xfs_filblks_t *blockcount, +- xfs_exntst_t state) ++ struct xfs_bmap_intent *bi) + { + int error; + +- error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff, +- startblock, blockcount, state); ++ error = xfs_bmap_finish_one(tp, bi); + + /* + * Mark the transaction dirty, even on error. 
This ensures the +@@ -290,24 +283,24 @@ xfs_bmap_update_diff_items( + /* Set the map extent flags for this mapping. */ + static void + xfs_trans_set_bmap_flags( +- struct xfs_map_extent *bmap, ++ struct xfs_map_extent *map, + enum xfs_bmap_intent_type type, + int whichfork, + xfs_exntst_t state) + { +- bmap->me_flags = 0; ++ map->me_flags = 0; + switch (type) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: +- bmap->me_flags = type; ++ map->me_flags = type; + break; + default: + ASSERT(0); + } + if (state == XFS_EXT_UNWRITTEN) +- bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; ++ map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; + if (whichfork == XFS_ATTR_FORK) +- bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; ++ map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; + } + + /* Log bmap updates in the intent item. */ +@@ -315,7 +308,7 @@ STATIC void + xfs_bmap_update_log_item( + struct xfs_trans *tp, + struct xfs_bui_log_item *buip, +- struct xfs_bmap_intent *bmap) ++ struct xfs_bmap_intent *bi) + { + uint next_extent; + struct xfs_map_extent *map; +@@ -331,12 +324,12 @@ xfs_bmap_update_log_item( + next_extent = atomic_inc_return(&buip->bui_next_extent) - 1; + ASSERT(next_extent < buip->bui_format.bui_nextents); + map = &buip->bui_format.bui_extents[next_extent]; +- map->me_owner = bmap->bi_owner->i_ino; +- map->me_startblock = bmap->bi_bmap.br_startblock; +- map->me_startoff = bmap->bi_bmap.br_startoff; +- map->me_len = bmap->bi_bmap.br_blockcount; +- xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork, +- bmap->bi_bmap.br_state); ++ map->me_owner = bi->bi_owner->i_ino; ++ map->me_startblock = bi->bi_bmap.br_startblock; ++ map->me_startoff = bi->bi_bmap.br_startoff; ++ map->me_len = bi->bi_bmap.br_blockcount; ++ xfs_trans_set_bmap_flags(map, bi->bi_type, bi->bi_whichfork, ++ bi->bi_bmap.br_state); + } + + static struct xfs_log_item * +@@ -348,15 +341,15 @@ xfs_bmap_update_create_intent( + { + struct xfs_mount *mp = tp->t_mountp; + struct xfs_bui_log_item *buip = xfs_bui_init(mp); +- struct xfs_bmap_intent *bmap; ++ struct xfs_bmap_intent *bi; + + ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); + + xfs_trans_add_item(tp, &buip->bui_item); + if (sort) + list_sort(mp, items, xfs_bmap_update_diff_items); +- list_for_each_entry(bmap, items, bi_list) +- xfs_bmap_update_log_item(tp, buip, bmap); ++ list_for_each_entry(bi, items, bi_list) ++ xfs_bmap_update_log_item(tp, buip, bi); + return &buip->bui_item; + } + +@@ -378,25 +371,17 @@ xfs_bmap_update_finish_item( + struct list_head *item, + struct xfs_btree_cur **state) + { +- struct xfs_bmap_intent *bmap; +- xfs_filblks_t count; ++ struct xfs_bmap_intent *bi; + int error; + +- bmap = container_of(item, struct xfs_bmap_intent, bi_list); +- count = bmap->bi_bmap.br_blockcount; +- error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), +- bmap->bi_type, +- bmap->bi_owner, bmap->bi_whichfork, +- bmap->bi_bmap.br_startoff, +- bmap->bi_bmap.br_startblock, +- &count, +- bmap->bi_bmap.br_state); +- if (!error && count > 0) { +- ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); +- bmap->bi_bmap.br_blockcount = count; ++ bi = container_of(item, struct xfs_bmap_intent, bi_list); ++ ++ error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi); ++ if (!error && bi->bi_bmap.br_blockcount > 0) { ++ ASSERT(bi->bi_type == XFS_BMAP_UNMAP); + return -EAGAIN; + } +- kmem_cache_free(xfs_bmap_intent_cache, bmap); ++ kmem_cache_free(xfs_bmap_intent_cache, bi); + return error; + } + +@@ -413,10 +398,10 @@ STATIC void + xfs_bmap_update_cancel_item( + struct list_head *item) + { +- 
struct xfs_bmap_intent *bmap; ++ struct xfs_bmap_intent *bi; + +- bmap = container_of(item, struct xfs_bmap_intent, bi_list); +- kmem_cache_free(xfs_bmap_intent_cache, bmap); ++ bi = container_of(item, struct xfs_bmap_intent, bi_list); ++ kmem_cache_free(xfs_bmap_intent_cache, bi); + } + + const struct xfs_defer_op_type xfs_bmap_update_defer_type = { +@@ -434,18 +419,18 @@ xfs_bui_validate( + struct xfs_mount *mp, + struct xfs_bui_log_item *buip) + { +- struct xfs_map_extent *bmap; ++ struct xfs_map_extent *map; + + /* Only one mapping operation per BUI... */ + if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) + return false; + +- bmap = &buip->bui_format.bui_extents[0]; ++ map = &buip->bui_format.bui_extents[0]; + +- if (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS) ++ if (map->me_flags & ~XFS_BMAP_EXTENT_FLAGS) + return false; + +- switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { ++ switch (map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: + break; +@@ -453,13 +438,13 @@ xfs_bui_validate( + return false; + } + +- if (!xfs_verify_ino(mp, bmap->me_owner)) ++ if (!xfs_verify_ino(mp, map->me_owner)) + return false; + +- if (!xfs_verify_fileext(mp, bmap->me_startoff, bmap->me_len)) ++ if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) + return false; + +- return xfs_verify_fsbext(mp, bmap->me_startblock, bmap->me_len); ++ return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); + } + + /* +@@ -471,17 +456,13 @@ xfs_bui_item_recover( + struct xfs_log_item *lip, + struct list_head *capture_list) + { +- struct xfs_bmbt_irec irec; ++ struct xfs_bmap_intent fake = { }; + struct xfs_bui_log_item *buip = BUI_ITEM(lip); + struct xfs_trans *tp; + struct xfs_inode *ip = NULL; + struct xfs_mount *mp = lip->li_log->l_mp; +- struct xfs_map_extent *bmap; ++ struct xfs_map_extent *map; + struct xfs_bud_log_item *budp; +- xfs_filblks_t count; +- xfs_exntst_t state; +- unsigned int bui_type; +- int whichfork; + int iext_delta; + int error = 0; + +@@ -491,14 +472,12 @@ xfs_bui_item_recover( + return -EFSCORRUPTED; + } + +- bmap = &buip->bui_format.bui_extents[0]; +- state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? +- XFS_EXT_UNWRITTEN : XFS_EXT_NORM; +- whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? ++ map = &buip->bui_format.bui_extents[0]; ++ fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? 
+ XFS_ATTR_FORK : XFS_DATA_FORK; +- bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; ++ fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + +- error = xlog_recover_iget(mp, bmap->me_owner, &ip); ++ error = xlog_recover_iget(mp, map->me_owner, &ip); + if (error) + return error; + +@@ -512,34 +491,34 @@ xfs_bui_item_recover( + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + +- if (bui_type == XFS_BMAP_MAP) ++ if (fake.bi_type == XFS_BMAP_MAP) + iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; + else + iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; + +- error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta); ++ error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, iext_delta); + if (error) + goto err_cancel; + +- count = bmap->me_len; +- error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, +- whichfork, bmap->me_startoff, bmap->me_startblock, +- &count, state); ++ fake.bi_owner = ip; ++ fake.bi_bmap.br_startblock = map->me_startblock; ++ fake.bi_bmap.br_startoff = map->me_startoff; ++ fake.bi_bmap.br_blockcount = map->me_len; ++ fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? ++ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; ++ ++ error = xfs_trans_log_finish_bmap_update(tp, budp, &fake); + if (error == -EFSCORRUPTED) +- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bmap, +- sizeof(*bmap)); ++ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, ++ sizeof(*map)); + if (error) + goto err_cancel; + +- if (count > 0) { +- ASSERT(bui_type == XFS_BMAP_UNMAP); +- irec.br_startblock = bmap->me_startblock; +- irec.br_blockcount = count; +- irec.br_startoff = bmap->me_startoff; +- irec.br_state = state; +- xfs_bmap_unmap_extent(tp, ip, &irec); ++ if (fake.bi_bmap.br_blockcount > 0) { ++ ASSERT(fake.bi_type == XFS_BMAP_UNMAP); ++ xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap); + } + + /* +@@ -579,18 +558,18 @@ xfs_bui_item_relog( + { + struct xfs_bud_log_item *budp; + struct xfs_bui_log_item *buip; +- struct xfs_map_extent *extp; ++ struct xfs_map_extent *map; + unsigned int count; + + count = BUI_ITEM(intent)->bui_format.bui_nextents; +- extp = BUI_ITEM(intent)->bui_format.bui_extents; ++ map = BUI_ITEM(intent)->bui_format.bui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); + + buip = xfs_bui_init(tp->t_mountp); +- memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp)); ++ memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map)); + atomic_set(&buip->bui_next_extent, count); + xfs_trans_add_item(tp, &buip->bui_item); + set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); +diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c +index ae082808cfed..b2cbbba3e15a 100644 +--- a/fs/xfs/xfs_error.c ++++ b/fs/xfs/xfs_error.c +@@ -228,7 +228,7 @@ static struct attribute *xfs_errortag_attrs[] = { + }; + ATTRIBUTE_GROUPS(xfs_errortag); + +-static struct kobj_type xfs_errortag_ktype = { ++static const struct kobj_type xfs_errortag_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_errortag_sysfs_ops, + .default_groups = xfs_errortag_groups, +diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h +index dbe6c37dc697..0b9c5ba8a598 100644 +--- a/fs/xfs/xfs_error.h ++++ b/fs/xfs/xfs_error.h +@@ -75,7 +75,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); + + /* + * XFS panic tags -- allow a call to xfs_alert_tag() be turned into +- * a panic by setting 
xfs_panic_mask in a sysctl. ++ * a panic by setting fs.xfs.panic_mask in a sysctl. + */ + #define XFS_NO_PTAG 0u + #define XFS_PTAG_IFLUSH (1u << 0) +@@ -88,6 +88,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); + #define XFS_PTAG_FSBLOCK_ZERO (1u << 7) + #define XFS_PTAG_VERIFIER_ERROR (1u << 8) + ++#define XFS_PTAG_MASK (XFS_PTAG_IFLUSH | \ ++ XFS_PTAG_LOGRES | \ ++ XFS_PTAG_AILDELETE | \ ++ XFS_PTAG_ERROR_REPORT | \ ++ XFS_PTAG_SHUTDOWN_CORRUPT | \ ++ XFS_PTAG_SHUTDOWN_IOERROR | \ ++ XFS_PTAG_SHUTDOWN_LOGERROR | \ ++ XFS_PTAG_FSBLOCK_ZERO | \ ++ XFS_PTAG_VERIFIER_ERROR) ++ + #define XFS_PTAG_STRINGS \ + { XFS_NO_PTAG, "none" }, \ + { XFS_PTAG_IFLUSH, "iflush" }, \ +diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c +index d5130d1fcfae..011b50469301 100644 +--- a/fs/xfs/xfs_extfree_item.c ++++ b/fs/xfs/xfs_extfree_item.c +@@ -345,23 +345,30 @@ static int + xfs_trans_free_extent( + struct xfs_trans *tp, + struct xfs_efd_log_item *efdp, +- xfs_fsblock_t start_block, +- xfs_extlen_t ext_len, +- const struct xfs_owner_info *oinfo, +- bool skip_discard) ++ struct xfs_extent_free_item *xefi) + { ++ struct xfs_owner_info oinfo = { }; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_extent *extp; + uint next_extent; +- xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block); ++ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, ++ xefi->xefi_startblock); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, +- start_block); ++ xefi->xefi_startblock); + int error; + +- trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); ++ oinfo.oi_owner = xefi->xefi_owner; ++ if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) ++ oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; ++ if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) ++ oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; ++ ++ trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ++ xefi->xefi_blockcount); + +- error = __xfs_free_extent(tp, start_block, ext_len, +- oinfo, XFS_AG_RESV_NONE, skip_discard); ++ error = __xfs_free_extent(tp, xefi->xefi_startblock, ++ xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE, ++ xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); + /* + * Mark the transaction dirty, even on error. 
This ensures the + * transaction is aborted, which: +@@ -375,8 +382,8 @@ xfs_trans_free_extent( + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); +- extp->ext_start = start_block; +- extp->ext_len = ext_len; ++ extp->ext_start = xefi->xefi_startblock; ++ extp->ext_len = xefi->xefi_blockcount; + efdp->efd_next_extent++; + + return error; +@@ -404,7 +411,7 @@ STATIC void + xfs_extent_free_log_item( + struct xfs_trans *tp, + struct xfs_efi_log_item *efip, +- struct xfs_extent_free_item *free) ++ struct xfs_extent_free_item *xefi) + { + uint next_extent; + struct xfs_extent *extp; +@@ -420,8 +427,8 @@ xfs_extent_free_log_item( + next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; + ASSERT(next_extent < efip->efi_format.efi_nextents); + extp = &efip->efi_format.efi_extents[next_extent]; +- extp->ext_start = free->xefi_startblock; +- extp->ext_len = free->xefi_blockcount; ++ extp->ext_start = xefi->xefi_startblock; ++ extp->ext_len = xefi->xefi_blockcount; + } + + static struct xfs_log_item * +@@ -433,15 +440,15 @@ xfs_extent_free_create_intent( + { + struct xfs_mount *mp = tp->t_mountp; + struct xfs_efi_log_item *efip = xfs_efi_init(mp, count); +- struct xfs_extent_free_item *free; ++ struct xfs_extent_free_item *xefi; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &efip->efi_item); + if (sort) + list_sort(mp, items, xfs_extent_free_diff_items); +- list_for_each_entry(free, items, xefi_list) +- xfs_extent_free_log_item(tp, efip, free); ++ list_for_each_entry(xefi, items, xefi_list) ++ xfs_extent_free_log_item(tp, efip, xefi); + return &efip->efi_item; + } + +@@ -463,21 +470,13 @@ xfs_extent_free_finish_item( + struct list_head *item, + struct xfs_btree_cur **state) + { +- struct xfs_owner_info oinfo = { }; +- struct xfs_extent_free_item *free; ++ struct xfs_extent_free_item *xefi; + int error; + +- free = container_of(item, struct xfs_extent_free_item, xefi_list); +- oinfo.oi_owner = free->xefi_owner; +- if (free->xefi_flags & XFS_EFI_ATTR_FORK) +- oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; +- if (free->xefi_flags & XFS_EFI_BMBT_BLOCK) +- oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; +- error = xfs_trans_free_extent(tp, EFD_ITEM(done), +- free->xefi_startblock, +- free->xefi_blockcount, +- &oinfo, free->xefi_flags & XFS_EFI_SKIP_DISCARD); +- kmem_cache_free(xfs_extfree_item_cache, free); ++ xefi = container_of(item, struct xfs_extent_free_item, xefi_list); ++ ++ error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); ++ kmem_cache_free(xfs_extfree_item_cache, xefi); + return error; + } + +@@ -494,10 +493,10 @@ STATIC void + xfs_extent_free_cancel_item( + struct list_head *item) + { +- struct xfs_extent_free_item *free; ++ struct xfs_extent_free_item *xefi; + +- free = container_of(item, struct xfs_extent_free_item, xefi_list); +- kmem_cache_free(xfs_extfree_item_cache, free); ++ xefi = container_of(item, struct xfs_extent_free_item, xefi_list); ++ kmem_cache_free(xfs_extfree_item_cache, xefi); + } + + const struct xfs_defer_op_type xfs_extent_free_defer_type = { +@@ -523,7 +522,7 @@ xfs_agfl_free_finish_item( + struct xfs_owner_info oinfo = { }; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_efd_log_item *efdp = EFD_ITEM(done); +- struct xfs_extent_free_item *free; ++ struct xfs_extent_free_item *xefi; + struct xfs_extent *extp; + struct xfs_buf *agbp; + int error; +@@ -532,13 +531,13 @@ xfs_agfl_free_finish_item( + uint next_extent; + struct xfs_perag *pag; + +- free = 
container_of(item, struct xfs_extent_free_item, xefi_list); +- ASSERT(free->xefi_blockcount == 1); +- agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock); +- agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock); +- oinfo.oi_owner = free->xefi_owner; ++ xefi = container_of(item, struct xfs_extent_free_item, xefi_list); ++ ASSERT(xefi->xefi_blockcount == 1); ++ agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock); ++ agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); ++ oinfo.oi_owner = xefi->xefi_owner; + +- trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount); ++ trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, xefi->xefi_blockcount); + + pag = xfs_perag_get(mp, agno); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); +@@ -559,11 +558,11 @@ xfs_agfl_free_finish_item( + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); +- extp->ext_start = free->xefi_startblock; +- extp->ext_len = free->xefi_blockcount; ++ extp->ext_start = xefi->xefi_startblock; ++ extp->ext_len = xefi->xefi_blockcount; + efdp->efd_next_extent++; + +- kmem_cache_free(xfs_extfree_item_cache, free); ++ kmem_cache_free(xfs_extfree_item_cache, xefi); + return error; + } + +@@ -599,7 +598,6 @@ xfs_efi_item_recover( + struct xfs_mount *mp = lip->li_log->l_mp; + struct xfs_efd_log_item *efdp; + struct xfs_trans *tp; +- struct xfs_extent *extp; + int i; + int error = 0; + +@@ -624,10 +622,17 @@ xfs_efi_item_recover( + efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); + + for (i = 0; i < efip->efi_format.efi_nextents; i++) { ++ struct xfs_extent_free_item fake = { ++ .xefi_owner = XFS_RMAP_OWN_UNKNOWN, ++ }; ++ struct xfs_extent *extp; ++ + extp = &efip->efi_format.efi_extents[i]; +- error = xfs_trans_free_extent(tp, efdp, extp->ext_start, +- extp->ext_len, +- &XFS_RMAP_OINFO_ANY_OWNER, false); ++ ++ fake.xefi_startblock = extp->ext_start; ++ fake.xefi_blockcount = extp->ext_len; ++ ++ error = xfs_trans_free_extent(tp, efdp, &fake); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + extp, sizeof(*extp)); +diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c +index 4d0a98f920ca..9edc1f2bc939 100644 +--- a/fs/xfs/xfs_globals.c ++++ b/fs/xfs/xfs_globals.c +@@ -4,6 +4,7 @@ + * All Rights Reserved. + */ + #include "xfs.h" ++#include "xfs_error.h" + + /* + * Tunable XFS parameters. 
xfs_params is required even when CONFIG_SYSCTL=n, +@@ -15,7 +16,7 @@ xfs_param_t xfs_params = { + /* MIN DFLT MAX */ + .sgid_inherit = { 0, 0, 1 }, + .symlink_mode = { 0, 0, 1 }, +- .panic_mask = { 0, 0, 256 }, ++ .panic_mask = { 0, 0, XFS_PTAG_MASK}, + .error_level = { 0, 3, 11 }, + .syncd_timer = { 1*100, 30*100, 7200*100}, + .stats_clear = { 0, 0, 1 }, +diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c +index fc1946f80a4a..69dbe7814128 100644 +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -83,7 +83,7 @@ xfs_iomap_valid( + return true; + } + +-static const struct iomap_page_ops xfs_iomap_page_ops = { ++static const struct iomap_folio_ops xfs_iomap_folio_ops = { + .iomap_valid = xfs_iomap_valid, + }; + +@@ -133,7 +133,7 @@ xfs_bmbt_to_iomap( + iomap->flags |= IOMAP_F_DIRTY; + + iomap->validity_cookie = sequence_cookie; +- iomap->page_ops = &xfs_iomap_page_ops; ++ iomap->folio_ops = &xfs_iomap_folio_ops; + return 0; + } + +diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c +index 858e3e9eb4a8..48d771a76add 100644 +--- a/fs/xfs/xfs_refcount_item.c ++++ b/fs/xfs/xfs_refcount_item.c +@@ -252,17 +252,12 @@ static int + xfs_trans_log_finish_refcount_update( + struct xfs_trans *tp, + struct xfs_cud_log_item *cudp, +- enum xfs_refcount_intent_type type, +- xfs_fsblock_t startblock, +- xfs_extlen_t blockcount, +- xfs_fsblock_t *new_fsb, +- xfs_extlen_t *new_len, ++ struct xfs_refcount_intent *ri, + struct xfs_btree_cur **pcur) + { + int error; + +- error = xfs_refcount_finish_one(tp, type, startblock, +- blockcount, new_fsb, new_len, pcur); ++ error = xfs_refcount_finish_one(tp, ri, pcur); + + /* + * Mark the transaction dirty, even on error. This ensures the +@@ -297,16 +292,16 @@ xfs_refcount_update_diff_items( + /* Set the phys extent flags for this reverse mapping. 
*/ + static void + xfs_trans_set_refcount_flags( +- struct xfs_phys_extent *refc, ++ struct xfs_phys_extent *pmap, + enum xfs_refcount_intent_type type) + { +- refc->pe_flags = 0; ++ pmap->pe_flags = 0; + switch (type) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: + case XFS_REFCOUNT_FREE_COW: +- refc->pe_flags |= type; ++ pmap->pe_flags |= type; + break; + default: + ASSERT(0); +@@ -318,10 +313,10 @@ STATIC void + xfs_refcount_update_log_item( + struct xfs_trans *tp, + struct xfs_cui_log_item *cuip, +- struct xfs_refcount_intent *refc) ++ struct xfs_refcount_intent *ri) + { + uint next_extent; +- struct xfs_phys_extent *ext; ++ struct xfs_phys_extent *pmap; + + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); +@@ -333,10 +328,10 @@ xfs_refcount_update_log_item( + */ + next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1; + ASSERT(next_extent < cuip->cui_format.cui_nextents); +- ext = &cuip->cui_format.cui_extents[next_extent]; +- ext->pe_startblock = refc->ri_startblock; +- ext->pe_len = refc->ri_blockcount; +- xfs_trans_set_refcount_flags(ext, refc->ri_type); ++ pmap = &cuip->cui_format.cui_extents[next_extent]; ++ pmap->pe_startblock = ri->ri_startblock; ++ pmap->pe_len = ri->ri_blockcount; ++ xfs_trans_set_refcount_flags(pmap, ri->ri_type); + } + + static struct xfs_log_item * +@@ -348,15 +343,15 @@ xfs_refcount_update_create_intent( + { + struct xfs_mount *mp = tp->t_mountp; + struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count); +- struct xfs_refcount_intent *refc; ++ struct xfs_refcount_intent *ri; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &cuip->cui_item); + if (sort) + list_sort(mp, items, xfs_refcount_update_diff_items); +- list_for_each_entry(refc, items, ri_list) +- xfs_refcount_update_log_item(tp, cuip, refc); ++ list_for_each_entry(ri, items, ri_list) ++ xfs_refcount_update_log_item(tp, cuip, ri); + return &cuip->cui_item; + } + +@@ -378,25 +373,20 @@ xfs_refcount_update_finish_item( + struct list_head *item, + struct xfs_btree_cur **state) + { +- struct xfs_refcount_intent *refc; +- xfs_fsblock_t new_fsb; +- xfs_extlen_t new_aglen; ++ struct xfs_refcount_intent *ri; + int error; + +- refc = container_of(item, struct xfs_refcount_intent, ri_list); +- error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), +- refc->ri_type, refc->ri_startblock, refc->ri_blockcount, +- &new_fsb, &new_aglen, state); ++ ri = container_of(item, struct xfs_refcount_intent, ri_list); ++ error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri, ++ state); + + /* Did we run out of reservation? Requeue what we didn't finish. 
*/ +- if (!error && new_aglen > 0) { +- ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE || +- refc->ri_type == XFS_REFCOUNT_DECREASE); +- refc->ri_startblock = new_fsb; +- refc->ri_blockcount = new_aglen; ++ if (!error && ri->ri_blockcount > 0) { ++ ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE || ++ ri->ri_type == XFS_REFCOUNT_DECREASE); + return -EAGAIN; + } +- kmem_cache_free(xfs_refcount_intent_cache, refc); ++ kmem_cache_free(xfs_refcount_intent_cache, ri); + return error; + } + +@@ -413,10 +403,10 @@ STATIC void + xfs_refcount_update_cancel_item( + struct list_head *item) + { +- struct xfs_refcount_intent *refc; ++ struct xfs_refcount_intent *ri; + +- refc = container_of(item, struct xfs_refcount_intent, ri_list); +- kmem_cache_free(xfs_refcount_intent_cache, refc); ++ ri = container_of(item, struct xfs_refcount_intent, ri_list); ++ kmem_cache_free(xfs_refcount_intent_cache, ri); + } + + const struct xfs_defer_op_type xfs_refcount_update_defer_type = { +@@ -433,15 +423,15 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { + static inline bool + xfs_cui_validate_phys( + struct xfs_mount *mp, +- struct xfs_phys_extent *refc) ++ struct xfs_phys_extent *pmap) + { + if (!xfs_has_reflink(mp)) + return false; + +- if (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS) ++ if (pmap->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS) + return false; + +- switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { ++ switch (pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: +@@ -451,7 +441,7 @@ xfs_cui_validate_phys( + return false; + } + +- return xfs_verify_fsbext(mp, refc->pe_startblock, refc->pe_len); ++ return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len); + } + + /* +@@ -463,18 +453,13 @@ xfs_cui_item_recover( + struct xfs_log_item *lip, + struct list_head *capture_list) + { +- struct xfs_bmbt_irec irec; + struct xfs_cui_log_item *cuip = CUI_ITEM(lip); +- struct xfs_phys_extent *refc; + struct xfs_cud_log_item *cudp; + struct xfs_trans *tp; + struct xfs_btree_cur *rcur = NULL; + struct xfs_mount *mp = lip->li_log->l_mp; +- xfs_fsblock_t new_fsb; +- xfs_extlen_t new_len; + unsigned int refc_type; + bool requeue_only = false; +- enum xfs_refcount_intent_type type; + int i; + int error = 0; + +@@ -513,14 +498,17 @@ xfs_cui_item_recover( + cudp = xfs_trans_get_cud(tp, cuip); + + for (i = 0; i < cuip->cui_format.cui_nextents; i++) { +- refc = &cuip->cui_format.cui_extents[i]; +- refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; ++ struct xfs_refcount_intent fake = { }; ++ struct xfs_phys_extent *pmap; ++ ++ pmap = &cuip->cui_format.cui_extents[i]; ++ refc_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; + switch (refc_type) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: + case XFS_REFCOUNT_FREE_COW: +- type = refc_type; ++ fake.ri_type = refc_type; + break; + default: + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, +@@ -529,13 +517,12 @@ xfs_cui_item_recover( + error = -EFSCORRUPTED; + goto abort_error; + } +- if (requeue_only) { +- new_fsb = refc->pe_startblock; +- new_len = refc->pe_len; +- } else ++ ++ fake.ri_startblock = pmap->pe_startblock; ++ fake.ri_blockcount = pmap->pe_len; ++ if (!requeue_only) + error = xfs_trans_log_finish_refcount_update(tp, cudp, +- type, refc->pe_startblock, refc->pe_len, +- &new_fsb, &new_len, &rcur); ++ &fake, &rcur); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, 
mp, + &cuip->cui_format, +@@ -544,10 +531,13 @@ xfs_cui_item_recover( + goto abort_error; + + /* Requeue what we didn't finish. */ +- if (new_len > 0) { +- irec.br_startblock = new_fsb; +- irec.br_blockcount = new_len; +- switch (type) { ++ if (fake.ri_blockcount > 0) { ++ struct xfs_bmbt_irec irec = { ++ .br_startblock = fake.ri_startblock, ++ .br_blockcount = fake.ri_blockcount, ++ }; ++ ++ switch (fake.ri_type) { + case XFS_REFCOUNT_INCREASE: + xfs_refcount_increase_extent(tp, &irec); + break; +@@ -596,18 +586,18 @@ xfs_cui_item_relog( + { + struct xfs_cud_log_item *cudp; + struct xfs_cui_log_item *cuip; +- struct xfs_phys_extent *extp; ++ struct xfs_phys_extent *pmap; + unsigned int count; + + count = CUI_ITEM(intent)->cui_format.cui_nextents; +- extp = CUI_ITEM(intent)->cui_format.cui_extents; ++ pmap = CUI_ITEM(intent)->cui_format.cui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); + + cuip = xfs_cui_init(tp->t_mountp, count); +- memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp)); ++ memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); + atomic_set(&cuip->cui_next_extent, count); + xfs_trans_add_item(tp, &cuip->cui_item); + set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); +diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c +index 534504ede1a3..a1619d67015f 100644 +--- a/fs/xfs/xfs_rmap_item.c ++++ b/fs/xfs/xfs_rmap_item.c +@@ -244,40 +244,40 @@ xfs_trans_get_rud( + /* Set the map extent flags for this reverse mapping. */ + static void + xfs_trans_set_rmap_flags( +- struct xfs_map_extent *rmap, ++ struct xfs_map_extent *map, + enum xfs_rmap_intent_type type, + int whichfork, + xfs_exntst_t state) + { +- rmap->me_flags = 0; ++ map->me_flags = 0; + if (state == XFS_EXT_UNWRITTEN) +- rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; ++ map->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; + if (whichfork == XFS_ATTR_FORK) +- rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; ++ map->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; + switch (type) { + case XFS_RMAP_MAP: +- rmap->me_flags |= XFS_RMAP_EXTENT_MAP; ++ map->me_flags |= XFS_RMAP_EXTENT_MAP; + break; + case XFS_RMAP_MAP_SHARED: +- rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; ++ map->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; + break; + case XFS_RMAP_UNMAP: +- rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP; ++ map->me_flags |= XFS_RMAP_EXTENT_UNMAP; + break; + case XFS_RMAP_UNMAP_SHARED: +- rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; ++ map->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; + break; + case XFS_RMAP_CONVERT: +- rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT; ++ map->me_flags |= XFS_RMAP_EXTENT_CONVERT; + break; + case XFS_RMAP_CONVERT_SHARED: +- rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; ++ map->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; + break; + case XFS_RMAP_ALLOC: +- rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC; ++ map->me_flags |= XFS_RMAP_EXTENT_ALLOC; + break; + case XFS_RMAP_FREE: +- rmap->me_flags |= XFS_RMAP_EXTENT_FREE; ++ map->me_flags |= XFS_RMAP_EXTENT_FREE; + break; + default: + ASSERT(0); +@@ -293,19 +293,12 @@ static int + xfs_trans_log_finish_rmap_update( + struct xfs_trans *tp, + struct xfs_rud_log_item *rudp, +- enum xfs_rmap_intent_type type, +- uint64_t owner, +- int whichfork, +- xfs_fileoff_t startoff, +- xfs_fsblock_t startblock, +- xfs_filblks_t blockcount, +- xfs_exntst_t state, ++ struct xfs_rmap_intent *ri, + struct xfs_btree_cur **pcur) + { + int error; + +- error = xfs_rmap_finish_one(tp, 
type, owner, whichfork, startoff, +- startblock, blockcount, state, pcur); ++ error = xfs_rmap_finish_one(tp, ri, pcur); + + /* + * Mark the transaction dirty, even on error. This ensures the +@@ -342,7 +335,7 @@ STATIC void + xfs_rmap_update_log_item( + struct xfs_trans *tp, + struct xfs_rui_log_item *ruip, +- struct xfs_rmap_intent *rmap) ++ struct xfs_rmap_intent *ri) + { + uint next_extent; + struct xfs_map_extent *map; +@@ -358,12 +351,12 @@ xfs_rmap_update_log_item( + next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1; + ASSERT(next_extent < ruip->rui_format.rui_nextents); + map = &ruip->rui_format.rui_extents[next_extent]; +- map->me_owner = rmap->ri_owner; +- map->me_startblock = rmap->ri_bmap.br_startblock; +- map->me_startoff = rmap->ri_bmap.br_startoff; +- map->me_len = rmap->ri_bmap.br_blockcount; +- xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork, +- rmap->ri_bmap.br_state); ++ map->me_owner = ri->ri_owner; ++ map->me_startblock = ri->ri_bmap.br_startblock; ++ map->me_startoff = ri->ri_bmap.br_startoff; ++ map->me_len = ri->ri_bmap.br_blockcount; ++ xfs_trans_set_rmap_flags(map, ri->ri_type, ri->ri_whichfork, ++ ri->ri_bmap.br_state); + } + + static struct xfs_log_item * +@@ -375,15 +368,15 @@ xfs_rmap_update_create_intent( + { + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count); +- struct xfs_rmap_intent *rmap; ++ struct xfs_rmap_intent *ri; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &ruip->rui_item); + if (sort) + list_sort(mp, items, xfs_rmap_update_diff_items); +- list_for_each_entry(rmap, items, ri_list) +- xfs_rmap_update_log_item(tp, ruip, rmap); ++ list_for_each_entry(ri, items, ri_list) ++ xfs_rmap_update_log_item(tp, ruip, ri); + return &ruip->rui_item; + } + +@@ -405,16 +398,14 @@ xfs_rmap_update_finish_item( + struct list_head *item, + struct xfs_btree_cur **state) + { +- struct xfs_rmap_intent *rmap; ++ struct xfs_rmap_intent *ri; + int error; + +- rmap = container_of(item, struct xfs_rmap_intent, ri_list); +- error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), +- rmap->ri_type, rmap->ri_owner, rmap->ri_whichfork, +- rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock, +- rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state, ++ ri = container_of(item, struct xfs_rmap_intent, ri_list); ++ ++ error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri, + state); +- kmem_cache_free(xfs_rmap_intent_cache, rmap); ++ kmem_cache_free(xfs_rmap_intent_cache, ri); + return error; + } + +@@ -431,10 +422,10 @@ STATIC void + xfs_rmap_update_cancel_item( + struct list_head *item) + { +- struct xfs_rmap_intent *rmap; ++ struct xfs_rmap_intent *ri; + +- rmap = container_of(item, struct xfs_rmap_intent, ri_list); +- kmem_cache_free(xfs_rmap_intent_cache, rmap); ++ ri = container_of(item, struct xfs_rmap_intent, ri_list); ++ kmem_cache_free(xfs_rmap_intent_cache, ri); + } + + const struct xfs_defer_op_type xfs_rmap_update_defer_type = { +@@ -451,15 +442,15 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = { + static inline bool + xfs_rui_validate_map( + struct xfs_mount *mp, +- struct xfs_map_extent *rmap) ++ struct xfs_map_extent *map) + { + if (!xfs_has_rmapbt(mp)) + return false; + +- if (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS) ++ if (map->me_flags & ~XFS_RMAP_EXTENT_FLAGS) + return false; + +- switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { ++ switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + case XFS_RMAP_EXTENT_MAP: + case XFS_RMAP_EXTENT_MAP_SHARED: + case 
XFS_RMAP_EXTENT_UNMAP: +@@ -473,14 +464,14 @@ xfs_rui_validate_map( + return false; + } + +- if (!XFS_RMAP_NON_INODE_OWNER(rmap->me_owner) && +- !xfs_verify_ino(mp, rmap->me_owner)) ++ if (!XFS_RMAP_NON_INODE_OWNER(map->me_owner) && ++ !xfs_verify_ino(mp, map->me_owner)) + return false; + +- if (!xfs_verify_fileext(mp, rmap->me_startoff, rmap->me_len)) ++ if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) + return false; + +- return xfs_verify_fsbext(mp, rmap->me_startblock, rmap->me_len); ++ return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); + } + + /* +@@ -493,15 +484,11 @@ xfs_rui_item_recover( + struct list_head *capture_list) + { + struct xfs_rui_log_item *ruip = RUI_ITEM(lip); +- struct xfs_map_extent *rmap; + struct xfs_rud_log_item *rudp; + struct xfs_trans *tp; + struct xfs_btree_cur *rcur = NULL; + struct xfs_mount *mp = lip->li_log->l_mp; +- enum xfs_rmap_intent_type type; +- xfs_exntst_t state; + int i; +- int whichfork; + int error = 0; + + /* +@@ -526,35 +513,34 @@ xfs_rui_item_recover( + rudp = xfs_trans_get_rud(tp, ruip); + + for (i = 0; i < ruip->rui_format.rui_nextents; i++) { +- rmap = &ruip->rui_format.rui_extents[i]; +- state = (rmap->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? +- XFS_EXT_UNWRITTEN : XFS_EXT_NORM; +- whichfork = (rmap->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? +- XFS_ATTR_FORK : XFS_DATA_FORK; +- switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { ++ struct xfs_rmap_intent fake = { }; ++ struct xfs_map_extent *map; ++ ++ map = &ruip->rui_format.rui_extents[i]; ++ switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + case XFS_RMAP_EXTENT_MAP: +- type = XFS_RMAP_MAP; ++ fake.ri_type = XFS_RMAP_MAP; + break; + case XFS_RMAP_EXTENT_MAP_SHARED: +- type = XFS_RMAP_MAP_SHARED; ++ fake.ri_type = XFS_RMAP_MAP_SHARED; + break; + case XFS_RMAP_EXTENT_UNMAP: +- type = XFS_RMAP_UNMAP; ++ fake.ri_type = XFS_RMAP_UNMAP; + break; + case XFS_RMAP_EXTENT_UNMAP_SHARED: +- type = XFS_RMAP_UNMAP_SHARED; ++ fake.ri_type = XFS_RMAP_UNMAP_SHARED; + break; + case XFS_RMAP_EXTENT_CONVERT: +- type = XFS_RMAP_CONVERT; ++ fake.ri_type = XFS_RMAP_CONVERT; + break; + case XFS_RMAP_EXTENT_CONVERT_SHARED: +- type = XFS_RMAP_CONVERT_SHARED; ++ fake.ri_type = XFS_RMAP_CONVERT_SHARED; + break; + case XFS_RMAP_EXTENT_ALLOC: +- type = XFS_RMAP_ALLOC; ++ fake.ri_type = XFS_RMAP_ALLOC; + break; + case XFS_RMAP_EXTENT_FREE: +- type = XFS_RMAP_FREE; ++ fake.ri_type = XFS_RMAP_FREE; + break; + default: + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, +@@ -563,13 +549,21 @@ xfs_rui_item_recover( + error = -EFSCORRUPTED; + goto abort_error; + } +- error = xfs_trans_log_finish_rmap_update(tp, rudp, type, +- rmap->me_owner, whichfork, +- rmap->me_startoff, rmap->me_startblock, +- rmap->me_len, state, &rcur); ++ ++ fake.ri_owner = map->me_owner; ++ fake.ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? ++ XFS_ATTR_FORK : XFS_DATA_FORK; ++ fake.ri_bmap.br_startblock = map->me_startblock; ++ fake.ri_bmap.br_startoff = map->me_startoff; ++ fake.ri_bmap.br_blockcount = map->me_len; ++ fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? 
++ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; ++ ++ error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake, ++ &rcur); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, +- rmap, sizeof(*rmap)); ++ map, sizeof(*map)); + if (error) + goto abort_error; + +@@ -600,18 +594,18 @@ xfs_rui_item_relog( + { + struct xfs_rud_log_item *rudp; + struct xfs_rui_log_item *ruip; +- struct xfs_map_extent *extp; ++ struct xfs_map_extent *map; + unsigned int count; + + count = RUI_ITEM(intent)->rui_format.rui_nextents; +- extp = RUI_ITEM(intent)->rui_format.rui_extents; ++ map = RUI_ITEM(intent)->rui_format.rui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); + + ruip = xfs_rui_init(tp->t_mountp, count); +- memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp)); ++ memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); + atomic_set(&ruip->rui_next_extent, count); + xfs_trans_add_item(tp, &ruip->rui_item); + set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); +diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c +index f7faf6e70d7f..a3c6b1548723 100644 +--- a/fs/xfs/xfs_sysfs.c ++++ b/fs/xfs/xfs_sysfs.c +@@ -69,7 +69,7 @@ static struct attribute *xfs_mp_attrs[] = { + }; + ATTRIBUTE_GROUPS(xfs_mp); + +-struct kobj_type xfs_mp_ktype = { ++const struct kobj_type xfs_mp_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_groups = xfs_mp_groups, +@@ -266,7 +266,7 @@ static struct attribute *xfs_dbg_attrs[] = { + }; + ATTRIBUTE_GROUPS(xfs_dbg); + +-struct kobj_type xfs_dbg_ktype = { ++const struct kobj_type xfs_dbg_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_groups = xfs_dbg_groups, +@@ -324,7 +324,7 @@ static struct attribute *xfs_stats_attrs[] = { + }; + ATTRIBUTE_GROUPS(xfs_stats); + +-struct kobj_type xfs_stats_ktype = { ++const struct kobj_type xfs_stats_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_groups = xfs_stats_groups, +@@ -410,7 +410,7 @@ static struct attribute *xfs_log_attrs[] = { + }; + ATTRIBUTE_GROUPS(xfs_log); + +-struct kobj_type xfs_log_ktype = { ++const struct kobj_type xfs_log_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_groups = xfs_log_groups, +@@ -564,13 +564,13 @@ static struct attribute *xfs_error_attrs[] = { + }; + ATTRIBUTE_GROUPS(xfs_error); + +-static struct kobj_type xfs_error_cfg_ktype = { ++static const struct kobj_type xfs_error_cfg_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_groups = xfs_error_groups, + }; + +-static struct kobj_type xfs_error_ktype = { ++static const struct kobj_type xfs_error_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + }; +diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h +index 513095e353a5..148893ebfdef 100644 +--- a/fs/xfs/xfs_sysfs.h ++++ b/fs/xfs/xfs_sysfs.h +@@ -7,10 +7,10 @@ + #ifndef __XFS_SYSFS_H__ + #define __XFS_SYSFS_H__ + +-extern struct kobj_type xfs_mp_ktype; /* xfs_mount */ +-extern struct kobj_type xfs_dbg_ktype; /* debug */ +-extern struct kobj_type xfs_log_ktype; /* xlog */ +-extern struct kobj_type xfs_stats_ktype; /* stats */ ++extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */ ++extern const struct kobj_type xfs_dbg_ktype; /* debug */ ++extern const struct kobj_type xfs_log_ktype; /* xlog */ ++extern const struct kobj_type xfs_stats_ktype; /* stats */ + + static inline 
struct xfs_kobj * + to_kobj(struct kobject *kobject) +@@ -28,7 +28,7 @@ xfs_sysfs_release(struct kobject *kobject) + static inline int + xfs_sysfs_init( + struct xfs_kobj *kobj, +- struct kobj_type *ktype, ++ const struct kobj_type *ktype, + struct xfs_kobj *parent_kobj, + const char *name) + { +diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h +index 421d1e504ac4..6b0e9ae7c513 100644 +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -3207,17 +3207,14 @@ DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred); + + TRACE_EVENT(xfs_refcount_finish_one_leftover, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, +- int type, xfs_agblock_t agbno, xfs_extlen_t len, +- xfs_agblock_t new_agbno, xfs_extlen_t new_len), +- TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len), ++ int type, xfs_agblock_t agbno, xfs_extlen_t len), ++ TP_ARGS(mp, agno, type, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, type) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) +- __field(xfs_agblock_t, new_agbno) +- __field(xfs_extlen_t, new_len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; +@@ -3225,17 +3222,13 @@ TRACE_EVENT(xfs_refcount_finish_one_leftover, + __entry->type = type; + __entry->agbno = agbno; + __entry->len = len; +- __entry->new_agbno = new_agbno; +- __entry->new_len = new_len; + ), +- TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x new_agbno 0x%x new_fsbcount 0x%x", ++ TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->agno, + __entry->agbno, +- __entry->len, +- __entry->new_agbno, +- __entry->new_len) ++ __entry->len) + ); + + /* simple inode-based error/%ip tracepoint class */ +diff --git a/include/linux/bio.h b/include/linux/bio.h +index c1da63f6c808..d766be7152e1 100644 +--- a/include/linux/bio.h ++++ b/include/linux/bio.h +@@ -12,6 +12,8 @@ + + #define BIO_MAX_VECS 256U + ++struct queue_limits; ++ + static inline unsigned int bio_max_segs(unsigned int nr_segs) + { + return min(nr_segs, BIO_MAX_VECS); +@@ -375,6 +377,8 @@ static inline void bip_set_seed(struct bio_integrity_payload *bip, + void bio_trim(struct bio *bio, sector_t offset, sector_t size); + extern struct bio *bio_split(struct bio *bio, int sectors, + gfp_t gfp, struct bio_set *bs); ++struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, ++ unsigned *segs, struct bio_set *bs, unsigned max_bytes); + + /** + * bio_next_split - get next @sectors from a bio, splitting if necessary +diff --git a/include/linux/iomap.h b/include/linux/iomap.h +index 0983dfc9a203..0f8123504e5e 100644 +--- a/include/linux/iomap.h ++++ b/include/linux/iomap.h +@@ -13,6 +13,7 @@ + struct address_space; + struct fiemap_extent_info; + struct inode; ++struct iomap_iter; + struct iomap_dio; + struct iomap_writepage_ctx; + struct iov_iter; +@@ -58,8 +59,7 @@ struct vm_fault; + #define IOMAP_F_SHARED (1U << 2) + #define IOMAP_F_MERGED (1U << 3) + #define IOMAP_F_BUFFER_HEAD (1U << 4) +-#define IOMAP_F_ZONE_APPEND (1U << 5) +-#define IOMAP_F_XATTR (1U << 6) ++#define IOMAP_F_XATTR (1U << 5) + + /* + * Flags set by the core iomap code during operations: +@@ -85,7 +85,7 @@ struct vm_fault; + */ + #define IOMAP_NULL_ADDR -1ULL /* addr is not valid */ + +-struct iomap_page_ops; ++struct iomap_folio_ops; + + struct iomap { + u64 addr; /* disk offset of mapping, bytes */ +@@ -97,7 +97,7 @@ struct iomap { + struct dax_device *dax_dev; /* dax_dev for dax operations 
*/ + void *inline_data; + void *private; /* filesystem private */ +- const struct iomap_page_ops *page_ops; ++ const struct iomap_folio_ops *folio_ops; + u64 validity_cookie; /* used with .iomap_valid() */ + }; + +@@ -125,19 +125,20 @@ static inline bool iomap_inline_data_valid(const struct iomap *iomap) + } + + /* +- * When a filesystem sets page_ops in an iomap mapping it returns, page_prepare +- * and page_done will be called for each page written to. This only applies to +- * buffered writes as unbuffered writes will not typically have pages ++ * When a filesystem sets folio_ops in an iomap mapping it returns, get_folio ++ * and put_folio will be called for each folio written to. This only applies ++ * to buffered writes as unbuffered writes will not typically have folios + * associated with them. + * +- * When page_prepare succeeds, page_done will always be called to do any +- * cleanup work necessary. In that page_done call, @page will be NULL if the +- * associated page could not be obtained. ++ * When get_folio succeeds, put_folio will always be called to do any ++ * cleanup work necessary. put_folio is responsible for unlocking and putting ++ * @folio. + */ +-struct iomap_page_ops { +- int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len); +- void (*page_done)(struct inode *inode, loff_t pos, unsigned copied, +- struct page *page); ++struct iomap_folio_ops { ++ struct folio *(*get_folio)(struct iomap_iter *iter, loff_t pos, ++ unsigned len); ++ void (*put_folio)(struct inode *inode, loff_t pos, unsigned copied, ++ struct folio *folio); + + /* + * Check that the cached iomap still maps correctly to the filesystem's +@@ -260,6 +261,7 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, + int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); + void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); + bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); ++struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos); + bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); + void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); + int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, +diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h +index 6548b5b5aa60..75d7d22c3a27 100644 +--- a/include/trace/events/btrfs.h ++++ b/include/trace/events/btrfs.h +@@ -32,6 +32,7 @@ struct prelim_ref; + struct btrfs_space_info; + struct btrfs_raid_bio; + struct raid56_bio_trace_info; ++struct find_free_extent_ctl; + + #define show_ref_type(type) \ + __print_symbolic(type, \ +@@ -1241,76 +1242,156 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, + + TRACE_EVENT(find_free_extent, + +- TP_PROTO(const struct btrfs_root *root, u64 num_bytes, +- u64 empty_size, u64 data), ++ TP_PROTO(const struct btrfs_root *root, ++ const struct find_free_extent_ctl *ffe_ctl), + +- TP_ARGS(root, num_bytes, empty_size, data), ++ TP_ARGS(root, ffe_ctl), + + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( u64, num_bytes ) + __field( u64, empty_size ) +- __field( u64, data ) ++ __field( u64, flags ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->root_key.objectid; +- __entry->num_bytes = num_bytes; +- __entry->empty_size = empty_size; +- __entry->data = data; ++ __entry->num_bytes = ffe_ctl->num_bytes; ++ __entry->empty_size = ffe_ctl->empty_size; ++ __entry->flags = ffe_ctl->flags; + ), + + 
TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)", + show_root_type(__entry->root_objectid), +- __entry->num_bytes, __entry->empty_size, __entry->data, +- __print_flags((unsigned long)__entry->data, "|", ++ __entry->num_bytes, __entry->empty_size, __entry->flags, ++ __print_flags((unsigned long)__entry->flags, "|", ++ BTRFS_GROUP_FLAGS)) ++); ++ ++TRACE_EVENT(find_free_extent_search_loop, ++ ++ TP_PROTO(const struct btrfs_root *root, ++ const struct find_free_extent_ctl *ffe_ctl), ++ ++ TP_ARGS(root, ffe_ctl), ++ ++ TP_STRUCT__entry_btrfs( ++ __field( u64, root_objectid ) ++ __field( u64, num_bytes ) ++ __field( u64, empty_size ) ++ __field( u64, flags ) ++ __field( u64, loop ) ++ ), ++ ++ TP_fast_assign_btrfs(root->fs_info, ++ __entry->root_objectid = root->root_key.objectid; ++ __entry->num_bytes = ffe_ctl->num_bytes; ++ __entry->empty_size = ffe_ctl->empty_size; ++ __entry->flags = ffe_ctl->flags; ++ __entry->loop = ffe_ctl->loop; ++ ), ++ ++ TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu", ++ show_root_type(__entry->root_objectid), ++ __entry->num_bytes, __entry->empty_size, __entry->flags, ++ __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), ++ __entry->loop) ++); ++ ++TRACE_EVENT(find_free_extent_have_block_group, ++ ++ TP_PROTO(const struct btrfs_root *root, ++ const struct find_free_extent_ctl *ffe_ctl, ++ const struct btrfs_block_group *block_group), ++ ++ TP_ARGS(root, ffe_ctl, block_group), ++ ++ TP_STRUCT__entry_btrfs( ++ __field( u64, root_objectid ) ++ __field( u64, num_bytes ) ++ __field( u64, empty_size ) ++ __field( u64, flags ) ++ __field( u64, loop ) ++ __field( bool, hinted ) ++ __field( u64, bg_start ) ++ __field( u64, bg_flags ) ++ ), ++ ++ TP_fast_assign_btrfs(root->fs_info, ++ __entry->root_objectid = root->root_key.objectid; ++ __entry->num_bytes = ffe_ctl->num_bytes; ++ __entry->empty_size = ffe_ctl->empty_size; ++ __entry->flags = ffe_ctl->flags; ++ __entry->loop = ffe_ctl->loop; ++ __entry->hinted = ffe_ctl->hinted; ++ __entry->bg_start = block_group->start; ++ __entry->bg_flags = block_group->flags; ++ ), ++ ++ TP_printk_btrfs( ++"root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu hinted=%d block_group=%llu bg_flags=%llu(%s)", ++ show_root_type(__entry->root_objectid), ++ __entry->num_bytes, __entry->empty_size, __entry->flags, ++ __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), ++ __entry->loop, __entry->hinted, ++ __entry->bg_start, __entry->bg_flags, ++ __print_flags((unsigned long)__entry->bg_flags, "|", + BTRFS_GROUP_FLAGS)) + ); + + DECLARE_EVENT_CLASS(btrfs__reserve_extent, + +- TP_PROTO(const struct btrfs_block_group *block_group, u64 start, +- u64 len), ++ TP_PROTO(const struct btrfs_block_group *block_group, ++ const struct find_free_extent_ctl *ffe_ctl), + +- TP_ARGS(block_group, start, len), ++ TP_ARGS(block_group, ffe_ctl), + + TP_STRUCT__entry_btrfs( + __field( u64, bg_objectid ) + __field( u64, flags ) ++ __field( int, bg_size_class ) + __field( u64, start ) + __field( u64, len ) ++ __field( u64, loop ) ++ __field( bool, hinted ) ++ __field( int, size_class ) + ), + + TP_fast_assign_btrfs(block_group->fs_info, + __entry->bg_objectid = block_group->start; + __entry->flags = block_group->flags; +- __entry->start = start; +- __entry->len = len; ++ __entry->bg_size_class = block_group->size_class; ++ __entry->start = ffe_ctl->search_start; ++ __entry->len = ffe_ctl->num_bytes; ++ __entry->loop = ffe_ctl->loop; ++ __entry->hinted = 
ffe_ctl->hinted; ++ __entry->size_class = ffe_ctl->size_class; + ), + +- TP_printk_btrfs("root=%llu(%s) block_group=%llu flags=%llu(%s) " +- "start=%llu len=%llu", ++ TP_printk_btrfs( ++"root=%llu(%s) block_group=%llu flags=%llu(%s) bg_size_class=%d start=%llu len=%llu loop=%llu hinted=%d size_class=%d", + show_root_type(BTRFS_EXTENT_TREE_OBJECTID), + __entry->bg_objectid, + __entry->flags, __print_flags((unsigned long)__entry->flags, + "|", BTRFS_GROUP_FLAGS), +- __entry->start, __entry->len) ++ __entry->bg_size_class, __entry->start, __entry->len, ++ __entry->loop, __entry->hinted, __entry->size_class) + ); + + DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, + +- TP_PROTO(const struct btrfs_block_group *block_group, u64 start, +- u64 len), ++ TP_PROTO(const struct btrfs_block_group *block_group, ++ const struct find_free_extent_ctl *ffe_ctl), + +- TP_ARGS(block_group, start, len) ++ TP_ARGS(block_group, ffe_ctl) + ); + + DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, + +- TP_PROTO(const struct btrfs_block_group *block_group, u64 start, +- u64 len), ++ TP_PROTO(const struct btrfs_block_group *block_group, ++ const struct find_free_extent_ctl *ffe_ctl), + +- TP_ARGS(block_group, start, len) ++ TP_ARGS(block_group, ffe_ctl) + ); + + TRACE_EVENT(btrfs_find_cluster, +-- +2.39.2 + +From 8ef75a5bf012b92f0642e7e288ce34cd247bc41e Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 17 Feb 2023 15:35:46 +0100 +Subject: [PATCH 08/15] Implement amd-pstate-epp and amd-pstate-guided driver + +Signed-off-by: Peter Jung +--- + .../admin-guide/kernel-parameters.txt | 33 +- + Documentation/admin-guide/pm/amd-pstate.rst | 95 ++- + drivers/acpi/cppc_acpi.c | 188 ++++- + drivers/cpufreq/amd-pstate.c | 794 +++++++++++++++++- + drivers/cpufreq/brcmstb-avs-cpufreq.c | 5 +- + drivers/cpufreq/cpufreq.c | 8 +- + drivers/cpufreq/davinci-cpufreq.c | 4 +- + drivers/cpufreq/mediatek-cpufreq-hw.c | 4 +- + drivers/cpufreq/omap-cpufreq.c | 4 +- + drivers/cpufreq/qcom-cpufreq-hw.c | 4 +- + include/acpi/cppc_acpi.h | 23 + + include/linux/amd-pstate.h | 34 + + include/linux/cpufreq.h | 2 +- + 13 files changed, 1139 insertions(+), 59 deletions(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 9595abf34974..f39b8f05392c 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -339,6 +339,29 @@ + This mode requires kvm-amd.avic=1. + (Default when IOMMU HW support is present.) + ++ amd_pstate= [X86] ++ disable ++ Do not enable amd_pstate as the default ++ scaling driver for the supported processors ++ passive ++ Use amd_pstate with passive mode as a scaling driver. ++ In this mode autonomous selection is disabled. ++ Driver requests a desired performance level and platform ++ tries to match the same performance level if it is ++ satisfied by guaranteed performance level. ++ active ++ Use amd_pstate_epp driver instance as the scaling driver, ++ driver provides a hint to the hardware if software wants ++ to bias toward performance (0x0) or energy efficiency (0xff) ++ to the CPPC firmware. then CPPC power algorithm will ++ calculate the runtime workload and adjust the realtime cores ++ frequency. ++ guided ++ Activate guided autonomous mode. Driver requests minimum and ++ maximum performance level and the platform autonomously ++ selects a performance level in this range and appropriate ++ to the current workload. 
++ + amijoy.map= [HW,JOY] Amiga joystick support + Map of devices attached to JOY0DAT and JOY1DAT + Format: , +@@ -7019,13 +7042,3 @@ + xmon commands. + off xmon is disabled. + +- amd_pstate= [X86] +- disable +- Do not enable amd_pstate as the default +- scaling driver for the supported processors +- passive +- Use amd_pstate as a scaling driver, driver requests a +- desired performance on this abstract scale and the power +- management firmware translates the requests into actual +- hardware states (core frequency, data fabric and memory +- clocks etc.) +diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst +index 5376d53faaa8..f24a90007e98 100644 +--- a/Documentation/admin-guide/pm/amd-pstate.rst ++++ b/Documentation/admin-guide/pm/amd-pstate.rst +@@ -230,8 +230,8 @@ with :c:macro:`MSR_AMD_CPPC_ENABLE` or ``cppc_set_enable``, it will respond + to the request from AMD P-States. + + +-User Space Interface in ``sysfs`` +-================================== ++User Space Interface in ``sysfs`` - Per-policy control ++====================================================== + + ``amd-pstate`` exposes several global attributes (files) in ``sysfs`` to + control its functionality at the system level. They are located in the +@@ -262,6 +262,25 @@ lowest non-linear performance in `AMD CPPC Performance Capability + `_.) + This attribute is read-only. + ++``energy_performance_available_preferences`` ++ ++A list of all the supported EPP preferences that could be used for ++``energy_performance_preference`` on this system. ++These profiles represent different hints that are provided ++to the low-level firmware about the user's desired energy vs efficiency ++tradeoff. ``default`` represents the epp value is set by platform ++firmware. This attribute is read-only. ++ ++``energy_performance_preference`` ++ ++The current energy performance preference can be read from this attribute. ++and user can change current preference according to energy or performance needs ++Please get all support profiles list from ++``energy_performance_available_preferences`` attribute, all the profiles are ++integer values defined between 0 to 255 when EPP feature is enabled by platform ++firmware, if EPP feature is disabled, driver will ignore the written value ++This attribute is read-write. ++ + Other performance and frequency values can be read back from + ``/sys/devices/system/cpu/cpuX/acpi_cppc/``, see :ref:`cppc_sysfs`. + +@@ -280,8 +299,35 @@ module which supports the new AMD P-States mechanism on most of the future AMD + platforms. The AMD P-States mechanism is the more performance and energy + efficiency frequency management method on AMD processors. + +-Kernel Module Options for ``amd-pstate`` +-========================================= ++ ++AMD Pstate Driver Operation Modes ++================================= ++ ++``amd_pstate`` CPPC has 3 operation modes: autonomous (active) mode, ++non-autonomous (passive) mode and guided autonomous (guided) mode. ++Active/passive/guided mode can be chosen by different kernel parameters. ++ ++- In autonomous mode, platform ignores the desired performance level request ++ and takes into account only the values set to the minimum, maximum and energy ++ performance preference registers. ++- In non-autonomous mode, platform gets desired performance level ++ from OS directly through Desired Performance Register. 
++- In guided-autonomous mode, platform sets operating performance level ++ autonomously according to the current workload and within the limits set by ++ OS through min and max performance registers. ++ ++Active Mode ++------------ ++ ++``amd_pstate=active`` ++ ++This is the low-level firmware control mode which is implemented by ``amd_pstate_epp`` ++driver with ``amd_pstate=active`` passed to the kernel in the command line. ++In this mode, ``amd_pstate_epp`` driver provides a hint to the hardware if software ++wants to bias toward performance (0x0) or energy efficiency (0xff) to the CPPC firmware. ++then CPPC power algorithm will calculate the runtime workload and adjust the realtime ++cores frequency according to the power supply and thermal, core voltage and some other ++hardware conditions. + + Passive Mode + ------------ +@@ -297,6 +343,47 @@ to the Performance Reduction Tolerance register. Above the nominal performance l + processor must provide at least nominal performance requested and go higher if current + operating conditions allow. + ++Guided Mode ++----------- ++ ++``amd_pstate=guided`` ++ ++If ``amd_pstate=guided`` is passed to kernel command line option then this mode ++is activated. In this mode, driver requests minimum and maximum performance ++level and the platform autonomously selects a performance level in this range ++and appropriate to the current workload. ++ ++User Space Interface in ``sysfs`` - General ++=========================================== ++ ++Global Attributes ++----------------- ++ ++``amd-pstate`` exposes several global attributes (files) in ``sysfs`` to ++control its functionality at the system level. They are located in the ++``/sys/devices/system/cpu/amd-pstate/`` directory and affect all CPUs. ++ ++``status`` ++ Operation mode of the driver: "active", "passive" or "disable". ++ ++ "active" ++ The driver is functional and in the ``active mode`` ++ ++ "passive" ++ The driver is functional and in the ``passive mode`` ++ ++ "guided" ++ The driver is functional and in the ``guided mode`` ++ ++ "disable" ++ The driver is unregistered and not functional now. ++ ++ This attribute can be written to in order to change the driver's ++ operation mode or to unregister it. The string written to it must be ++ one of the possible values of it and, if successful, writing one of ++ these values to the sysfs file will cause the driver to switch over ++ to the operation mode represented by that string - or to be ++ unregistered in the "disable" case. + + ``cpupower`` tool support for ``amd-pstate`` + =============================================== +diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c +index 0f17b1c32718..0efdbeed6ada 100644 +--- a/drivers/acpi/cppc_acpi.c ++++ b/drivers/acpi/cppc_acpi.c +@@ -1153,6 +1153,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) + return cppc_get_perf(cpunum, NOMINAL_PERF, nominal_perf); + } + ++/** ++ * cppc_get_epp_perf - Get the epp register value. ++ * @cpunum: CPU from which to get epp preference value. ++ * @epp_perf: Return address. ++ * ++ * Return: 0 for success, -EIO otherwise. ++ */ ++int cppc_get_epp_perf(int cpunum, u64 *epp_perf) ++{ ++ return cppc_get_perf(cpunum, ENERGY_PERF, epp_perf); ++} ++EXPORT_SYMBOL_GPL(cppc_get_epp_perf); ++ + /** + * cppc_get_perf_caps - Get a CPU's performance capabilities. + * @cpunum: CPU from which to get capabilities info. 
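The helper added above is paired with cppc_set_epp_perf() in the next hunk. A minimal in-kernel sketch of reading a CPU's EPP hint and writing it back unchanged, assuming a caller that already depends on CONFIG_ACPI_CPPC_LIB, with error handling trimmed:

  #include <linux/types.h>
  #include <acpi/cppc_acpi.h>

  /* Sketch only: read CPU 0's current EPP hint and write it back. */
  static int cppc_epp_roundtrip(void)
  {
          struct cppc_perf_ctrls ctrls = { 0 };
          u64 epp;
          int ret;

          ret = cppc_get_epp_perf(0, &epp);       /* helper added above */
          if (ret)
                  return ret;

          ctrls.energy_perf = epp;                /* 0x00 performance ... 0xff powersave */
          /* 'true' also writes the autonomous-selection enable register */
          return cppc_set_epp_perf(0, &ctrls, true);
  }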
+@@ -1365,6 +1378,157 @@ int cppc_get_perf_ctrs(int cpunum, struct cppc_perf_fb_ctrs *perf_fb_ctrs) + } + EXPORT_SYMBOL_GPL(cppc_get_perf_ctrs); + ++/* ++ * Set Energy Performance Preference Register value through ++ * Performance Controls Interface ++ */ ++int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) ++{ ++ int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); ++ struct cpc_register_resource *epp_set_reg; ++ struct cpc_register_resource *auto_sel_reg; ++ struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); ++ struct cppc_pcc_data *pcc_ss_data = NULL; ++ int ret; ++ ++ if (!cpc_desc) { ++ pr_debug("No CPC descriptor for CPU:%d\n", cpu); ++ return -ENODEV; ++ } ++ ++ auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE]; ++ epp_set_reg = &cpc_desc->cpc_regs[ENERGY_PERF]; ++ ++ if (CPC_IN_PCC(epp_set_reg) || CPC_IN_PCC(auto_sel_reg)) { ++ if (pcc_ss_id < 0) { ++ pr_debug("Invalid pcc_ss_id for CPU:%d\n", cpu); ++ return -ENODEV; ++ } ++ ++ if (CPC_SUPPORTED(auto_sel_reg)) { ++ ret = cpc_write(cpu, auto_sel_reg, enable); ++ if (ret) ++ return ret; ++ } ++ ++ if (CPC_SUPPORTED(epp_set_reg)) { ++ ret = cpc_write(cpu, epp_set_reg, perf_ctrls->energy_perf); ++ if (ret) ++ return ret; ++ } ++ ++ pcc_ss_data = pcc_data[pcc_ss_id]; ++ ++ down_write(&pcc_ss_data->pcc_lock); ++ /* after writing CPC, transfer the ownership of PCC to platform */ ++ ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE); ++ up_write(&pcc_ss_data->pcc_lock); ++ } else { ++ ret = -ENOTSUPP; ++ pr_debug("_CPC in PCC is not supported\n"); ++ } ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(cppc_set_epp_perf); ++ ++/* ++ * cppc_get_auto_sel_caps - Read autonomous selection register. ++ * @cpunum : CPU from which to read register. ++ * @perf_caps : struct where autonomous selection register value is updated. ++ */ ++int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps) ++{ ++ struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpunum); ++ struct cpc_register_resource *auto_sel_reg; ++ u64 auto_sel; ++ ++ if (!cpc_desc) { ++ pr_debug("No CPC descriptor for CPU:%d\n", cpunum); ++ return -ENODEV; ++ } ++ ++ auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE]; ++ ++ if (!CPC_SUPPORTED(auto_sel_reg)) ++ pr_warn_once("Autonomous mode is not unsupported!\n"); ++ ++ if (CPC_IN_PCC(auto_sel_reg)) { ++ int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpunum); ++ struct cppc_pcc_data *pcc_ss_data = NULL; ++ int ret = 0; ++ ++ if (pcc_ss_id < 0) ++ return -ENODEV; ++ ++ pcc_ss_data = pcc_data[pcc_ss_id]; ++ ++ down_write(&pcc_ss_data->pcc_lock); ++ ++ if (send_pcc_cmd(pcc_ss_id, CMD_READ) >= 0) { ++ cpc_read(cpunum, auto_sel_reg, &auto_sel); ++ perf_caps->auto_sel = (bool)auto_sel; ++ } else { ++ ret = -EIO; ++ } ++ ++ up_write(&pcc_ss_data->pcc_lock); ++ ++ return ret; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(cppc_get_auto_sel_caps); ++ ++/* ++ * cppc_set_auto_sel - Write autonomous selection register. ++ * @cpu : CPU to which to write register. ++ * @enable : the desired value of autonomous selection resiter to be updated. 
++ */ ++int cppc_set_auto_sel(int cpu, bool enable) ++{ ++ int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); ++ struct cpc_register_resource *auto_sel_reg; ++ struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); ++ struct cppc_pcc_data *pcc_ss_data = NULL; ++ int ret = -EINVAL; ++ ++ if (!cpc_desc) { ++ pr_debug("No CPC descriptor for CPU:%d\n", cpu); ++ return -ENODEV; ++ } ++ ++ auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE]; ++ ++ if (CPC_IN_PCC(auto_sel_reg)) { ++ if (pcc_ss_id < 0) { ++ pr_debug("Invalid pcc_ss_id\n"); ++ return -ENODEV; ++ } ++ ++ if (CPC_SUPPORTED(auto_sel_reg)) { ++ ret = cpc_write(cpu, auto_sel_reg, enable); ++ if (ret) ++ return ret; ++ } ++ ++ pcc_ss_data = pcc_data[pcc_ss_id]; ++ ++ down_write(&pcc_ss_data->pcc_lock); ++ /* after writing CPC, transfer the ownership of PCC to platform */ ++ ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE); ++ up_write(&pcc_ss_data->pcc_lock); ++ } else { ++ ret = -ENOTSUPP; ++ pr_debug("_CPC in PCC is not supported\n"); ++ } ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(cppc_set_auto_sel); ++ ++ + /** + * cppc_set_enable - Set to enable CPPC on the processor by writing the + * Continuous Performance Control package EnableRegister field. +@@ -1420,7 +1584,7 @@ EXPORT_SYMBOL_GPL(cppc_set_enable); + int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) + { + struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); +- struct cpc_register_resource *desired_reg; ++ struct cpc_register_resource *desired_reg, *min_perf_reg, *max_perf_reg; + int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); + struct cppc_pcc_data *pcc_ss_data = NULL; + int ret = 0; +@@ -1431,6 +1595,8 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) + } + + desired_reg = &cpc_desc->cpc_regs[DESIRED_PERF]; ++ min_perf_reg = &cpc_desc->cpc_regs[MIN_PERF]; ++ max_perf_reg = &cpc_desc->cpc_regs[MAX_PERF]; + + /* + * This is Phase-I where we want to write to CPC registers +@@ -1439,7 +1605,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) + * Since read_lock can be acquired by multiple CPUs simultaneously we + * achieve that goal here + */ +- if (CPC_IN_PCC(desired_reg)) { ++ if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg)) { + if (pcc_ss_id < 0) { + pr_debug("Invalid pcc_ss_id\n"); + return -ENODEV; +@@ -1462,13 +1628,19 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) + cpc_desc->write_cmd_status = 0; + } + +- /* +- * Skip writing MIN/MAX until Linux knows how to come up with +- * useful values. +- */ + cpc_write(cpu, desired_reg, perf_ctrls->desired_perf); + +- if (CPC_IN_PCC(desired_reg)) ++ /** ++ * Only write if min_perf and max_perf not zero. Some drivers pass zero ++ * value to min and max perf, but they don't mean to set the zero value, ++ * they just don't want to write to those registers. 
++ */ ++ if (perf_ctrls->min_perf) ++ cpc_write(cpu, min_perf_reg, perf_ctrls->min_perf); ++ if (perf_ctrls->max_perf) ++ cpc_write(cpu, max_perf_reg, perf_ctrls->max_perf); ++ ++ if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg)) + up_read(&pcc_ss_data->pcc_lock); /* END Phase-I */ + /* + * This is Phase-II where we transfer the ownership of PCC to Platform +@@ -1516,7 +1688,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) + * case during a CMD_READ and if there are pending writes it delivers + * the write command before servicing the read command + */ +- if (CPC_IN_PCC(desired_reg)) { ++ if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg)) { + if (down_write_trylock(&pcc_ss_data->pcc_lock)) {/* BEGIN Phase-II */ + /* Update only if there are pending write commands */ + if (pcc_ss_data->pending_pcc_write_cmd) +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index c17bd845f5fc..d4e60da7a544 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -59,8 +59,173 @@ + * we disable it by default to go acpi-cpufreq on these processors and add a + * module parameter to be able to enable it manually for debugging. + */ ++static struct cpufreq_driver *current_pstate_driver; + static struct cpufreq_driver amd_pstate_driver; +-static int cppc_load __initdata; ++static struct cpufreq_driver amd_pstate_epp_driver; ++static int cppc_state = AMD_PSTATE_DISABLE; ++struct kobject *amd_pstate_kobj; ++ ++/* ++ * AMD Energy Preference Performance (EPP) ++ * The EPP is used in the CCLK DPM controller to drive ++ * the frequency that a core is going to operate during ++ * short periods of activity. EPP values will be utilized for ++ * different OS profiles (balanced, performance, power savings) ++ * display strings corresponding to EPP index in the ++ * energy_perf_strings[] ++ * index String ++ *------------------------------------- ++ * 0 default ++ * 1 performance ++ * 2 balance_performance ++ * 3 balance_power ++ * 4 power ++ */ ++enum energy_perf_value_index { ++ EPP_INDEX_DEFAULT = 0, ++ EPP_INDEX_PERFORMANCE, ++ EPP_INDEX_BALANCE_PERFORMANCE, ++ EPP_INDEX_BALANCE_POWERSAVE, ++ EPP_INDEX_POWERSAVE, ++}; ++ ++static const char * const energy_perf_strings[] = { ++ [EPP_INDEX_DEFAULT] = "default", ++ [EPP_INDEX_PERFORMANCE] = "performance", ++ [EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance", ++ [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power", ++ [EPP_INDEX_POWERSAVE] = "power", ++ NULL ++}; ++ ++static unsigned int epp_values[] = { ++ [EPP_INDEX_DEFAULT] = 0, ++ [EPP_INDEX_PERFORMANCE] = AMD_CPPC_EPP_PERFORMANCE, ++ [EPP_INDEX_BALANCE_PERFORMANCE] = AMD_CPPC_EPP_BALANCE_PERFORMANCE, ++ [EPP_INDEX_BALANCE_POWERSAVE] = AMD_CPPC_EPP_BALANCE_POWERSAVE, ++ [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE, ++ }; ++ ++typedef int (*cppc_mode_transition_fn)(int); ++ ++static inline int get_mode_idx_from_str(const char *str, size_t size) ++{ ++ int i; ++ ++ for (i=0; i < AMD_PSTATE_MAX; i++) { ++ if (!strncmp(str, amd_pstate_mode_string[i], size)) ++ return i; ++ } ++ return -EINVAL; ++} ++ ++static DEFINE_MUTEX(amd_pstate_limits_lock); ++static DEFINE_MUTEX(amd_pstate_driver_lock); ++ ++static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) ++{ ++ u64 epp; ++ int ret; ++ ++ if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ if (!cppc_req_cached) { ++ epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, ++ &cppc_req_cached); ++ if (epp) ++ return epp; ++ } ++ 
epp = (cppc_req_cached >> 24) & 0xFF; ++ } else { ++ ret = cppc_get_epp_perf(cpudata->cpu, &epp); ++ if (ret < 0) { ++ pr_debug("Could not retrieve energy perf value (%d)\n", ret); ++ return -EIO; ++ } ++ } ++ ++ return (s16)(epp & 0xff); ++} ++ ++static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata) ++{ ++ s16 epp; ++ int index = -EINVAL; ++ ++ epp = amd_pstate_get_epp(cpudata, 0); ++ if (epp < 0) ++ return epp; ++ ++ switch (epp) { ++ case AMD_CPPC_EPP_PERFORMANCE: ++ index = EPP_INDEX_PERFORMANCE; ++ break; ++ case AMD_CPPC_EPP_BALANCE_PERFORMANCE: ++ index = EPP_INDEX_BALANCE_PERFORMANCE; ++ break; ++ case AMD_CPPC_EPP_BALANCE_POWERSAVE: ++ index = EPP_INDEX_BALANCE_POWERSAVE; ++ break; ++ case AMD_CPPC_EPP_POWERSAVE: ++ index = EPP_INDEX_POWERSAVE; ++ break; ++ default: ++ break; ++ } ++ ++ return index; ++} ++ ++static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) ++{ ++ int ret; ++ struct cppc_perf_ctrls perf_ctrls; ++ ++ if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ u64 value = READ_ONCE(cpudata->cppc_req_cached); ++ ++ value &= ~GENMASK_ULL(31, 24); ++ value |= (u64)epp << 24; ++ WRITE_ONCE(cpudata->cppc_req_cached, value); ++ ++ ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); ++ if (!ret) ++ cpudata->epp_cached = epp; ++ } else { ++ perf_ctrls.energy_perf = epp; ++ ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); ++ if (ret) { ++ pr_debug("failed to set energy perf value (%d)\n", ret); ++ return ret; ++ } ++ cpudata->epp_cached = epp; ++ } ++ ++ return ret; ++} ++ ++static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, ++ int pref_index) ++{ ++ int epp = -EINVAL; ++ int ret; ++ ++ if (!pref_index) { ++ pr_debug("EPP pref_index is invalid\n"); ++ return -EINVAL; ++ } ++ ++ if (epp == -EINVAL) ++ epp = epp_values[pref_index]; ++ ++ if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { ++ pr_debug("EPP cannot be set under performance policy\n"); ++ return -EBUSY; ++ } ++ ++ ret = amd_pstate_set_epp(cpudata, epp); ++ ++ return ret; ++} + + static inline int pstate_enable(bool enable) + { +@@ -70,11 +235,21 @@ static inline int pstate_enable(bool enable) + static int cppc_enable(bool enable) + { + int cpu, ret = 0; ++ struct cppc_perf_ctrls perf_ctrls; + + for_each_present_cpu(cpu) { + ret = cppc_set_enable(cpu, enable); + if (ret) + return ret; ++ ++ /* Enable autonomous mode for EPP */ ++ if (cppc_state == AMD_PSTATE_ACTIVE) { ++ /* Set desired perf as zero to allow EPP firmware control */ ++ perf_ctrls.desired_perf = 0; ++ ret = cppc_set_perf(cpu, &perf_ctrls); ++ if (ret) ++ return ret; ++ } + } + + return ret; +@@ -135,7 +310,22 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) + cppc_perf.lowest_nonlinear_perf); + WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf); + +- return 0; ++ if (cppc_state == AMD_PSTATE_ACTIVE) ++ return 0; ++ ++ ret = cppc_get_auto_sel_caps(cpudata->cpu, &cppc_perf); ++ if (ret) { ++ pr_warn("failed to get auto_sel, ret: %d\n", ret); ++ return 0; ++ } ++ ++ ret = cppc_set_auto_sel(cpudata->cpu, ++ (cppc_state == AMD_PSTATE_PASSIVE) ? 
0 : 1); ++ ++ if (ret) ++ pr_warn("failed to set auto_sel, ret: %d\n", ret); ++ ++ return ret; + } + + DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf); +@@ -212,12 +402,18 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) + } + + static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, +- u32 des_perf, u32 max_perf, bool fast_switch) ++ u32 des_perf, u32 max_perf, bool fast_switch, int gov_flags) + { + u64 prev = READ_ONCE(cpudata->cppc_req_cached); + u64 value = prev; + + des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); ++ ++ if ((cppc_state == AMD_PSTATE_GUIDED) && (gov_flags & CPUFREQ_GOV_DYNAMIC_SWITCHING)) { ++ min_perf = des_perf; ++ des_perf = 0; ++ } ++ + value &= ~AMD_CPPC_MIN_PERF(~0L); + value |= AMD_CPPC_MIN_PERF(min_perf); + +@@ -272,7 +468,7 @@ static int amd_pstate_target(struct cpufreq_policy *policy, + + cpufreq_freq_transition_begin(policy, &freqs); + amd_pstate_update(cpudata, min_perf, des_perf, +- max_perf, false); ++ max_perf, false, policy->governor->flags); + cpufreq_freq_transition_end(policy, &freqs, false); + + return 0; +@@ -306,7 +502,8 @@ static void amd_pstate_adjust_perf(unsigned int cpu, + if (max_perf < min_perf) + max_perf = min_perf; + +- amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true); ++ amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true, ++ policy->governor->flags); + cpufreq_cpu_put(policy); + } + +@@ -418,7 +615,7 @@ static void amd_pstate_boost_init(struct amd_cpudata *cpudata) + return; + + cpudata->boost_supported = true; +- amd_pstate_driver.boost_enabled = true; ++ current_pstate_driver->boost_enabled = true; + } + + static void amd_perf_ctl_reset(unsigned int cpu) +@@ -501,6 +698,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) + policy->driver_data = cpudata; + + amd_pstate_boost_init(cpudata); ++ if (!current_pstate_driver->adjust_perf) ++ current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; + + return 0; + +@@ -561,7 +760,7 @@ static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy, + if (max_freq < 0) + return max_freq; + +- return sprintf(&buf[0], "%u\n", max_freq); ++ return sysfs_emit(buf, "%u\n", max_freq); + } + + static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *policy, +@@ -574,7 +773,7 @@ static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *poli + if (freq < 0) + return freq; + +- return sprintf(&buf[0], "%u\n", freq); ++ return sysfs_emit(buf, "%u\n", freq); + } + + /* +@@ -589,13 +788,208 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, + + perf = READ_ONCE(cpudata->highest_perf); + +- return sprintf(&buf[0], "%u\n", perf); ++ return sysfs_emit(buf, "%u\n", perf); ++} ++ ++static ssize_t show_energy_performance_available_preferences( ++ struct cpufreq_policy *policy, char *buf) ++{ ++ int i = 0; ++ int offset = 0; ++ ++ while (energy_perf_strings[i] != NULL) ++ offset += sysfs_emit_at(buf, offset, "%s ", energy_perf_strings[i++]); ++ ++ sysfs_emit_at(buf, offset, "\n"); ++ ++ return offset; ++} ++ ++static ssize_t store_energy_performance_preference( ++ struct cpufreq_policy *policy, const char *buf, size_t count) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ char str_preference[21]; ++ ssize_t ret; ++ ++ ret = sscanf(buf, "%20s", str_preference); ++ if (ret != 1) ++ return -EINVAL; ++ ++ ret = match_string(energy_perf_strings, -1, str_preference); ++ if (ret < 0) ++ return -EINVAL; ++ ++ 
mutex_lock(&amd_pstate_limits_lock); ++ ret = amd_pstate_set_energy_pref_index(cpudata, ret); ++ mutex_unlock(&amd_pstate_limits_lock); ++ ++ return ret ?: count; ++} ++ ++static ssize_t show_energy_performance_preference( ++ struct cpufreq_policy *policy, char *buf) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ int preference; ++ ++ preference = amd_pstate_get_energy_pref_index(cpudata); ++ if (preference < 0) ++ return preference; ++ ++ return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]); ++} ++ ++static void amd_pstate_driver_cleanup(void) ++{ ++ amd_pstate_enable(false); ++ cppc_state = AMD_PSTATE_DISABLE; ++ current_pstate_driver = NULL; ++} ++ ++static int amd_pstate_register_driver(int mode) ++{ ++ int ret; ++ ++ if (mode == AMD_PSTATE_PASSIVE || mode == AMD_PSTATE_GUIDED) ++ current_pstate_driver = &amd_pstate_driver; ++ else if (mode == AMD_PSTATE_ACTIVE) ++ current_pstate_driver = &amd_pstate_epp_driver; ++ else ++ return -EINVAL; ++ ++ cppc_state = mode; ++ ret = cpufreq_register_driver(current_pstate_driver); ++ if (ret) { ++ amd_pstate_driver_cleanup(); ++ return ret; ++ } ++ return 0; ++} ++ ++static int amd_pstate_unregister_driver(int dummy) ++{ ++ cpufreq_unregister_driver(current_pstate_driver); ++ amd_pstate_driver_cleanup(); ++ return 0; ++} ++ ++static int amd_pstate_change_mode_without_dvr_change(int mode) ++{ ++ int cpu = 0; ++ ++ cppc_state = mode; ++ ++ if (boot_cpu_has(X86_FEATURE_CPPC) || cppc_state == AMD_PSTATE_ACTIVE) ++ return 0; ++ ++ for_each_present_cpu(cpu) { ++ cppc_set_auto_sel(cpu, (cppc_state == AMD_PSTATE_PASSIVE) ? 0 : 1); ++ } ++ ++ return 0; ++} ++ ++static int amd_pstate_change_driver_mode(int mode) ++{ ++ int ret; ++ ++ ret = amd_pstate_unregister_driver(0); ++ if (ret) ++ return ret; ++ ++ ret = amd_pstate_register_driver(mode); ++ if (ret) ++ return ret; ++ ++ return 0; ++} ++ ++cppc_mode_transition_fn mode_state_machine[AMD_PSTATE_MAX][AMD_PSTATE_MAX] = { ++ [AMD_PSTATE_DISABLE] = { ++ [AMD_PSTATE_DISABLE] = NULL, ++ [AMD_PSTATE_PASSIVE] = amd_pstate_register_driver, ++ [AMD_PSTATE_ACTIVE] = amd_pstate_register_driver, ++ [AMD_PSTATE_GUIDED] = amd_pstate_register_driver, ++ }, ++ [AMD_PSTATE_PASSIVE] = { ++ [AMD_PSTATE_DISABLE] = amd_pstate_unregister_driver, ++ [AMD_PSTATE_PASSIVE] = NULL, ++ [AMD_PSTATE_ACTIVE] = amd_pstate_change_driver_mode, ++ [AMD_PSTATE_GUIDED] = amd_pstate_change_mode_without_dvr_change, ++ }, ++ [AMD_PSTATE_ACTIVE] = { ++ [AMD_PSTATE_DISABLE] = amd_pstate_unregister_driver, ++ [AMD_PSTATE_PASSIVE] = amd_pstate_change_driver_mode, ++ [AMD_PSTATE_ACTIVE] = NULL, ++ [AMD_PSTATE_GUIDED] = amd_pstate_change_driver_mode, ++ }, ++ [AMD_PSTATE_GUIDED] = { ++ [AMD_PSTATE_DISABLE] = amd_pstate_unregister_driver, ++ [AMD_PSTATE_PASSIVE] = amd_pstate_change_mode_without_dvr_change, ++ [AMD_PSTATE_ACTIVE] = amd_pstate_change_driver_mode, ++ [AMD_PSTATE_GUIDED] = NULL, ++ }, ++}; ++ ++static ssize_t amd_pstate_show_status(char *buf) ++{ ++ if (!current_pstate_driver) ++ return sysfs_emit(buf, "disable\n"); ++ ++ return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]); ++} ++ ++static int amd_pstate_update_status(const char *buf, size_t size) ++{ ++ int mode_idx; ++ ++ if (size > strlen("passive") || size < strlen("active")) ++ return -EINVAL; ++ ++ mode_idx = get_mode_idx_from_str(buf, size); ++ ++ if (mode_idx < 0 || mode_idx >= AMD_PSTATE_MAX) ++ return -EINVAL; ++ ++ if (mode_state_machine[cppc_state][mode_idx]) ++ return mode_state_machine[cppc_state][mode_idx](mode_idx); ++ ++ return 0; 
++} ++ ++static ssize_t show_status(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ ssize_t ret; ++ ++ mutex_lock(&amd_pstate_driver_lock); ++ ret = amd_pstate_show_status(buf); ++ mutex_unlock(&amd_pstate_driver_lock); ++ ++ return ret; ++} ++ ++static ssize_t store_status(struct kobject *a, struct kobj_attribute *b, ++ const char *buf, size_t count) ++{ ++ char *p = memchr(buf, '\n', count); ++ int ret; ++ ++ mutex_lock(&amd_pstate_driver_lock); ++ ret = amd_pstate_update_status(buf, p ? p - buf : count); ++ mutex_unlock(&amd_pstate_driver_lock); ++ ++ return ret < 0 ? ret : count; + } + + cpufreq_freq_attr_ro(amd_pstate_max_freq); + cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); + + cpufreq_freq_attr_ro(amd_pstate_highest_perf); ++cpufreq_freq_attr_rw(energy_performance_preference); ++cpufreq_freq_attr_ro(energy_performance_available_preferences); ++define_one_global_rw(status); + + static struct freq_attr *amd_pstate_attr[] = { + &amd_pstate_max_freq, +@@ -604,6 +998,313 @@ static struct freq_attr *amd_pstate_attr[] = { + NULL, + }; + ++static struct freq_attr *amd_pstate_epp_attr[] = { ++ &amd_pstate_max_freq, ++ &amd_pstate_lowest_nonlinear_freq, ++ &amd_pstate_highest_perf, ++ &energy_performance_preference, ++ &energy_performance_available_preferences, ++ NULL, ++}; ++ ++static struct attribute *pstate_global_attributes[] = { ++ &status.attr, ++ NULL ++}; ++ ++static const struct attribute_group amd_pstate_global_attr_group = { ++ .attrs = pstate_global_attributes, ++}; ++ ++static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) ++{ ++ int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; ++ struct amd_cpudata *cpudata; ++ struct device *dev; ++ u64 value; ++ ++ /* ++ * Resetting PERF_CTL_MSR will put the CPU in P0 frequency, ++ * which is ideal for initialization process. ++ */ ++ amd_perf_ctl_reset(policy->cpu); ++ dev = get_cpu_device(policy->cpu); ++ if (!dev) ++ return -ENODEV; ++ ++ cpudata = kzalloc(sizeof(*cpudata), GFP_KERNEL); ++ if (!cpudata) ++ return -ENOMEM; ++ ++ cpudata->cpu = policy->cpu; ++ cpudata->epp_policy = 0; ++ ++ ret = amd_pstate_init_perf(cpudata); ++ if (ret) ++ goto free_cpudata1; ++ ++ min_freq = amd_get_min_freq(cpudata); ++ max_freq = amd_get_max_freq(cpudata); ++ nominal_freq = amd_get_nominal_freq(cpudata); ++ lowest_nonlinear_freq = amd_get_lowest_nonlinear_freq(cpudata); ++ if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) { ++ dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n", ++ min_freq, max_freq); ++ ret = -EINVAL; ++ goto free_cpudata1; ++ } ++ ++ policy->cpuinfo.min_freq = min_freq; ++ policy->cpuinfo.max_freq = max_freq; ++ /* It will be updated by governor */ ++ policy->cur = policy->cpuinfo.min_freq; ++ ++ /* Initial processor data capability frequencies */ ++ cpudata->max_freq = max_freq; ++ cpudata->min_freq = min_freq; ++ cpudata->nominal_freq = nominal_freq; ++ cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq; ++ ++ policy->driver_data = cpudata; ++ ++ cpudata->epp_cached = amd_pstate_get_epp(cpudata, 0); ++ ++ policy->min = policy->cpuinfo.min_freq; ++ policy->max = policy->cpuinfo.max_freq; ++ ++ /* ++ * Set the policy to powersave to provide a valid fallback value in case ++ * the default cpufreq governor is neither powersave nor performance. 
++ */ ++ policy->policy = CPUFREQ_POLICY_POWERSAVE; ++ ++ if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ policy->fast_switch_possible = true; ++ ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); ++ if (ret) ++ return ret; ++ WRITE_ONCE(cpudata->cppc_req_cached, value); ++ ++ ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &value); ++ if (ret) ++ return ret; ++ WRITE_ONCE(cpudata->cppc_cap1_cached, value); ++ } ++ amd_pstate_boost_init(cpudata); ++ ++ return 0; ++ ++free_cpudata1: ++ kfree(cpudata); ++ return ret; ++} ++ ++static int amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) ++{ ++ pr_debug("CPU %d exiting\n", policy->cpu); ++ policy->fast_switch_possible = false; ++ return 0; ++} ++ ++static void amd_pstate_epp_init(unsigned int cpu) ++{ ++ struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); ++ struct amd_cpudata *cpudata = policy->driver_data; ++ u32 max_perf, min_perf; ++ u64 value; ++ s16 epp; ++ ++ max_perf = READ_ONCE(cpudata->highest_perf); ++ min_perf = READ_ONCE(cpudata->lowest_perf); ++ ++ value = READ_ONCE(cpudata->cppc_req_cached); ++ ++ if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) ++ min_perf = max_perf; ++ ++ /* Initial min/max values for CPPC Performance Controls Register */ ++ value &= ~AMD_CPPC_MIN_PERF(~0L); ++ value |= AMD_CPPC_MIN_PERF(min_perf); ++ ++ value &= ~AMD_CPPC_MAX_PERF(~0L); ++ value |= AMD_CPPC_MAX_PERF(max_perf); ++ ++ /* CPPC EPP feature require to set zero to the desire perf bit */ ++ value &= ~AMD_CPPC_DES_PERF(~0L); ++ value |= AMD_CPPC_DES_PERF(0); ++ ++ if (cpudata->epp_policy == cpudata->policy) ++ goto skip_epp; ++ ++ cpudata->epp_policy = cpudata->policy; ++ ++ /* Get BIOS pre-defined epp value */ ++ epp = amd_pstate_get_epp(cpudata, value); ++ if (epp < 0) { ++ /** ++ * This return value can only be negative for shared_memory ++ * systems where EPP register read/write not supported. 
++ */ ++ goto skip_epp; ++ } ++ ++ if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) ++ epp = 0; ++ ++ /* Set initial EPP value */ ++ if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ value &= ~GENMASK_ULL(31, 24); ++ value |= (u64)epp << 24; ++ } ++ ++ WRITE_ONCE(cpudata->cppc_req_cached, value); ++ amd_pstate_set_epp(cpudata, epp); ++skip_epp: ++ cpufreq_cpu_put(policy); ++} ++ ++static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ ++ if (!policy->cpuinfo.max_freq) ++ return -ENODEV; ++ ++ pr_debug("set_policy: cpuinfo.max %u policy->max %u\n", ++ policy->cpuinfo.max_freq, policy->max); ++ ++ cpudata->policy = policy->policy; ++ ++ amd_pstate_epp_init(policy->cpu); ++ ++ return 0; ++} ++ ++static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) ++{ ++ struct cppc_perf_ctrls perf_ctrls; ++ u64 value, max_perf; ++ int ret; ++ ++ ret = amd_pstate_enable(true); ++ if (ret) ++ pr_err("failed to enable amd pstate during resume, return %d\n", ret); ++ ++ value = READ_ONCE(cpudata->cppc_req_cached); ++ max_perf = READ_ONCE(cpudata->highest_perf); ++ ++ if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); ++ } else { ++ perf_ctrls.max_perf = max_perf; ++ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached); ++ cppc_set_perf(cpudata->cpu, &perf_ctrls); ++ } ++} ++ ++static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ ++ pr_debug("AMD CPU Core %d going online\n", cpudata->cpu); ++ ++ if (cppc_state == AMD_PSTATE_ACTIVE) { ++ amd_pstate_epp_reenable(cpudata); ++ cpudata->suspended = false; ++ } ++ ++ return 0; ++} ++ ++static void amd_pstate_epp_offline(struct cpufreq_policy *policy) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ struct cppc_perf_ctrls perf_ctrls; ++ int min_perf; ++ u64 value; ++ ++ min_perf = READ_ONCE(cpudata->lowest_perf); ++ value = READ_ONCE(cpudata->cppc_req_cached); ++ ++ mutex_lock(&amd_pstate_limits_lock); ++ if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ cpudata->epp_policy = CPUFREQ_POLICY_UNKNOWN; ++ ++ /* Set max perf same as min perf */ ++ value &= ~AMD_CPPC_MAX_PERF(~0L); ++ value |= AMD_CPPC_MAX_PERF(min_perf); ++ value &= ~AMD_CPPC_MIN_PERF(~0L); ++ value |= AMD_CPPC_MIN_PERF(min_perf); ++ wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); ++ } else { ++ perf_ctrls.desired_perf = 0; ++ perf_ctrls.max_perf = min_perf; ++ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE); ++ cppc_set_perf(cpudata->cpu, &perf_ctrls); ++ } ++ mutex_unlock(&amd_pstate_limits_lock); ++} ++ ++static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ ++ pr_debug("AMD CPU Core %d going offline\n", cpudata->cpu); ++ ++ if (cpudata->suspended) ++ return 0; ++ ++ if (cppc_state == AMD_PSTATE_ACTIVE) ++ amd_pstate_epp_offline(policy); ++ ++ return 0; ++} ++ ++static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy) ++{ ++ cpufreq_verify_within_cpu_limits(policy); ++ pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min); ++ return 0; ++} ++ ++static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ int ret; ++ ++ /* avoid suspending when EPP is not enabled */ ++ if (cppc_state != AMD_PSTATE_ACTIVE) ++ return 0; ++ ++ /* set this flag to avoid setting core offline*/ ++ 
cpudata->suspended = true; ++ ++ /* disable CPPC in lowlevel firmware */ ++ ret = amd_pstate_enable(false); ++ if (ret) ++ pr_err("failed to suspend, return %d\n", ret); ++ ++ return 0; ++} ++ ++static int amd_pstate_epp_resume(struct cpufreq_policy *policy) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ ++ if (cpudata->suspended) { ++ mutex_lock(&amd_pstate_limits_lock); ++ ++ /* enable amd pstate from suspend state*/ ++ amd_pstate_epp_reenable(cpudata); ++ ++ mutex_unlock(&amd_pstate_limits_lock); ++ ++ cpudata->suspended = false; ++ } ++ ++ return 0; ++} ++ + static struct cpufreq_driver amd_pstate_driver = { + .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS, + .verify = amd_pstate_verify, +@@ -617,6 +1318,20 @@ static struct cpufreq_driver amd_pstate_driver = { + .attr = amd_pstate_attr, + }; + ++static struct cpufreq_driver amd_pstate_epp_driver = { ++ .flags = CPUFREQ_CONST_LOOPS, ++ .verify = amd_pstate_epp_verify_policy, ++ .setpolicy = amd_pstate_epp_set_policy, ++ .init = amd_pstate_epp_cpu_init, ++ .exit = amd_pstate_epp_cpu_exit, ++ .offline = amd_pstate_epp_cpu_offline, ++ .online = amd_pstate_epp_cpu_online, ++ .suspend = amd_pstate_epp_suspend, ++ .resume = amd_pstate_epp_resume, ++ .name = "amd_pstate_epp", ++ .attr = amd_pstate_epp_attr, ++}; ++ + static int __init amd_pstate_init(void) + { + int ret; +@@ -626,10 +1341,10 @@ static int __init amd_pstate_init(void) + /* + * by default the pstate driver is disabled to load + * enable the amd_pstate passive mode driver explicitly +- * with amd_pstate=passive in kernel command line ++ * with amd_pstate=passive or other modes in kernel command line + */ +- if (!cppc_load) { +- pr_debug("driver load is disabled, boot with amd_pstate=passive to enable this\n"); ++ if (cppc_state == AMD_PSTATE_DISABLE) { ++ pr_debug("driver load is disabled, boot with specific mode to enable this\n"); + return -ENODEV; + } + +@@ -645,7 +1360,8 @@ static int __init amd_pstate_init(void) + /* capability check */ + if (boot_cpu_has(X86_FEATURE_CPPC)) { + pr_debug("AMD CPPC MSR based functionality is supported\n"); +- amd_pstate_driver.adjust_perf = amd_pstate_adjust_perf; ++ if (cppc_state != AMD_PSTATE_ACTIVE) ++ current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; + } else { + pr_debug("AMD CPPC shared memory based functionality is supported\n"); + static_call_update(amd_pstate_enable, cppc_enable); +@@ -656,31 +1372,63 @@ static int __init amd_pstate_init(void) + /* enable amd pstate feature */ + ret = amd_pstate_enable(true); + if (ret) { +- pr_err("failed to enable amd-pstate with return %d\n", ret); ++ pr_err("failed to enable with return %d\n", ret); + return ret; + } + +- ret = cpufreq_register_driver(&amd_pstate_driver); ++ ret = cpufreq_register_driver(current_pstate_driver); + if (ret) +- pr_err("failed to register amd_pstate_driver with return %d\n", +- ret); ++ pr_err("failed to register with return %d\n", ret); ++ ++ amd_pstate_kobj = kobject_create_and_add("amd_pstate", &cpu_subsys.dev_root->kobj); ++ if (!amd_pstate_kobj) { ++ ret = -EINVAL; ++ pr_err("global sysfs registration failed.\n"); ++ goto kobject_free; ++ } + ++ ret = sysfs_create_group(amd_pstate_kobj, &amd_pstate_global_attr_group); ++ if (ret) { ++ pr_err("sysfs attribute export failed with error %d.\n", ret); ++ goto global_attr_free; ++ } ++ ++ return ret; ++ ++global_attr_free: ++ kobject_put(amd_pstate_kobj); ++kobject_free: ++ cpufreq_unregister_driver(current_pstate_driver); + return ret; + } + device_initcall(amd_pstate_init); + + 
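The per-policy attributes wired up above are ordinary sysfs files, so the EPP interface can be exercised from userspace without extra tooling. A hedged sketch; the cpu0 cpufreq path is an assumption based on the attribute names in this patch, and the write needs root:

  #include <stdio.h>

  /*
   * Sketch: list the EPP preferences exposed for CPU 0 and request
   * "balance_power". The path is an assumption.
   */
  #define EPP_DIR "/sys/devices/system/cpu/cpu0/cpufreq/"

  int main(void)
  {
          char buf[256];
          FILE *f = fopen(EPP_DIR "energy_performance_available_preferences", "r");

          if (f) {
                  if (fgets(buf, sizeof(buf), f))
                          printf("available: %s", buf);
                  fclose(f);
          }

          f = fopen(EPP_DIR "energy_performance_preference", "w");
          if (!f) {
                  perror("open preference");
                  return 1;
          }
          fputs("balance_power", f);
          if (fclose(f) != 0)     /* buffered write errors surface here */
                  perror("set preference");

          return 0;
  }

Per the store path above, writing "default" back is rejected with -EINVAL, since the driver reserves index 0 for the firmware-provided value, and a request other than "performance" is refused with -EBUSY while the policy is set to performance.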
static int __init amd_pstate_param(char *str) + { ++ size_t size; ++ int mode_idx; ++ + if (!str) + return -EINVAL; + +- if (!strcmp(str, "disable")) { +- cppc_load = 0; +- pr_info("driver is explicitly disabled\n"); +- } else if (!strcmp(str, "passive")) +- cppc_load = 1; ++ size = strlen(str); ++ mode_idx = get_mode_idx_from_str(str, size); + +- return 0; ++ if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) { ++ cppc_state = mode_idx; ++ if (cppc_state == AMD_PSTATE_DISABLE) ++ pr_info("driver is explicitly disabled\n"); ++ ++ if (cppc_state == AMD_PSTATE_ACTIVE) ++ current_pstate_driver = &amd_pstate_epp_driver; ++ ++ if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED) ++ current_pstate_driver = &amd_pstate_driver; ++ ++ return 0; ++ } ++ ++ return -EINVAL; + } + early_param("amd_pstate", amd_pstate_param); + +diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c +index 4153150e20db..ffea6402189d 100644 +--- a/drivers/cpufreq/brcmstb-avs-cpufreq.c ++++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c +@@ -751,10 +751,7 @@ static int brcm_avs_cpufreq_probe(struct platform_device *pdev) + + static int brcm_avs_cpufreq_remove(struct platform_device *pdev) + { +- int ret; +- +- ret = cpufreq_unregister_driver(&brcm_avs_driver); +- WARN_ON(ret); ++ cpufreq_unregister_driver(&brcm_avs_driver); + + brcm_avs_prepare_uninit(pdev); + +diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c +index 7e56a42750ea..85a0bea2dbf1 100644 +--- a/drivers/cpufreq/cpufreq.c ++++ b/drivers/cpufreq/cpufreq.c +@@ -2904,12 +2904,12 @@ EXPORT_SYMBOL_GPL(cpufreq_register_driver); + * Returns zero if successful, and -EINVAL if the cpufreq_driver is + * currently not initialised. + */ +-int cpufreq_unregister_driver(struct cpufreq_driver *driver) ++void cpufreq_unregister_driver(struct cpufreq_driver *driver) + { + unsigned long flags; + +- if (!cpufreq_driver || (driver != cpufreq_driver)) +- return -EINVAL; ++ if (WARN_ON(!cpufreq_driver || (driver != cpufreq_driver))) ++ return; + + pr_debug("unregistering driver %s\n", driver->name); + +@@ -2926,8 +2926,6 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) + + write_unlock_irqrestore(&cpufreq_driver_lock, flags); + cpus_read_unlock(); +- +- return 0; + } + EXPORT_SYMBOL_GPL(cpufreq_unregister_driver); + +diff --git a/drivers/cpufreq/davinci-cpufreq.c b/drivers/cpufreq/davinci-cpufreq.c +index 9e97f60f8199..2d23015e2abd 100644 +--- a/drivers/cpufreq/davinci-cpufreq.c ++++ b/drivers/cpufreq/davinci-cpufreq.c +@@ -138,7 +138,9 @@ static int __exit davinci_cpufreq_remove(struct platform_device *pdev) + if (cpufreq.asyncclk) + clk_put(cpufreq.asyncclk); + +- return cpufreq_unregister_driver(&davinci_driver); ++ cpufreq_unregister_driver(&davinci_driver); ++ ++ return 0; + } + + static struct platform_driver davinci_cpufreq_driver = { +diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c +index f80339779084..f21a9e3df53d 100644 +--- a/drivers/cpufreq/mediatek-cpufreq-hw.c ++++ b/drivers/cpufreq/mediatek-cpufreq-hw.c +@@ -317,7 +317,9 @@ static int mtk_cpufreq_hw_driver_probe(struct platform_device *pdev) + + static int mtk_cpufreq_hw_driver_remove(struct platform_device *pdev) + { +- return cpufreq_unregister_driver(&cpufreq_mtk_hw_driver); ++ cpufreq_unregister_driver(&cpufreq_mtk_hw_driver); ++ ++ return 0; + } + + static const struct of_device_id mtk_cpufreq_hw_match[] = { +diff --git a/drivers/cpufreq/omap-cpufreq.c 
b/drivers/cpufreq/omap-cpufreq.c +index 1b50df06c6bc..81649a1969b6 100644 +--- a/drivers/cpufreq/omap-cpufreq.c ++++ b/drivers/cpufreq/omap-cpufreq.c +@@ -184,7 +184,9 @@ static int omap_cpufreq_probe(struct platform_device *pdev) + + static int omap_cpufreq_remove(struct platform_device *pdev) + { +- return cpufreq_unregister_driver(&omap_driver); ++ cpufreq_unregister_driver(&omap_driver); ++ ++ return 0; + } + + static struct platform_driver omap_cpufreq_platdrv = { +diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c +index d3f55ca06ed3..2f581d2d617d 100644 +--- a/drivers/cpufreq/qcom-cpufreq-hw.c ++++ b/drivers/cpufreq/qcom-cpufreq-hw.c +@@ -770,7 +770,9 @@ static int qcom_cpufreq_hw_driver_probe(struct platform_device *pdev) + + static int qcom_cpufreq_hw_driver_remove(struct platform_device *pdev) + { +- return cpufreq_unregister_driver(&cpufreq_qcom_hw_driver); ++ cpufreq_unregister_driver(&cpufreq_qcom_hw_driver); ++ ++ return 0; + } + + static struct platform_driver qcom_cpufreq_hw_driver = { +diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h +index c5614444031f..6126c977ece0 100644 +--- a/include/acpi/cppc_acpi.h ++++ b/include/acpi/cppc_acpi.h +@@ -108,12 +108,15 @@ struct cppc_perf_caps { + u32 lowest_nonlinear_perf; + u32 lowest_freq; + u32 nominal_freq; ++ u32 energy_perf; ++ bool auto_sel; + }; + + struct cppc_perf_ctrls { + u32 max_perf; + u32 min_perf; + u32 desired_perf; ++ u32 energy_perf; + }; + + struct cppc_perf_fb_ctrs { +@@ -149,6 +152,10 @@ extern bool cpc_ffh_supported(void); + extern bool cpc_supported_by_cpu(void); + extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); + extern int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val); ++extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf); ++extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable); ++extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps); ++extern int cppc_set_auto_sel(int cpu, bool enable); + #else /* !CONFIG_ACPI_CPPC_LIB */ + static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) + { +@@ -202,6 +209,22 @@ static inline int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) + { + return -ENOTSUPP; + } ++static inline int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) ++{ ++ return -ENOTSUPP; ++} ++static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf) ++{ ++ return -ENOTSUPP; ++} ++static inline int cppc_set_auto_sel(int cpu, bool enable) ++{ ++ return -ENOTSUPP; ++} ++static inline int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps) ++{ ++ return -ENOTSUPP; ++} + #endif /* !CONFIG_ACPI_CPPC_LIB */ + + #endif /* _CPPC_ACPI_H*/ +diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h +index 1c4b8659f171..c10ebf8c42e6 100644 +--- a/include/linux/amd-pstate.h ++++ b/include/linux/amd-pstate.h +@@ -12,6 +12,11 @@ + + #include + ++#define AMD_CPPC_EPP_PERFORMANCE 0x00 ++#define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80 ++#define AMD_CPPC_EPP_BALANCE_POWERSAVE 0xBF ++#define AMD_CPPC_EPP_POWERSAVE 0xFF ++ + /********************************************************************* + * AMD P-state INTERFACE * + *********************************************************************/ +@@ -47,6 +52,10 @@ struct amd_aperf_mperf { + * @prev: Last Aperf/Mperf/tsc count value read from register + * @freq: current cpu frequency value + * @boost_supported: check whether the Processor or SBIOS supports boost 
mode ++ * @epp_policy: Last saved policy used to set energy-performance preference ++ * @epp_cached: Cached CPPC energy-performance preference value ++ * @policy: Cpufreq policy value ++ * @cppc_cap1_cached Cached MSR_AMD_CPPC_CAP1 register value + * + * The amd_cpudata is key private data for each CPU thread in AMD P-State, and + * represents all the attributes and goals that AMD P-State requests at runtime. +@@ -72,6 +81,31 @@ struct amd_cpudata { + + u64 freq; + bool boost_supported; ++ ++ /* EPP feature related attributes*/ ++ s16 epp_policy; ++ s16 epp_cached; ++ u32 policy; ++ u64 cppc_cap1_cached; ++ bool suspended; + }; + ++/* ++ * enum amd_pstate_mode - driver working mode of amd pstate ++ */ ++enum amd_pstate_mode { ++ AMD_PSTATE_DISABLE = 0, ++ AMD_PSTATE_PASSIVE, ++ AMD_PSTATE_ACTIVE, ++ AMD_PSTATE_GUIDED, ++ AMD_PSTATE_MAX, ++}; ++ ++static const char * const amd_pstate_mode_string[] = { ++ [AMD_PSTATE_DISABLE] = "disable", ++ [AMD_PSTATE_PASSIVE] = "passive", ++ [AMD_PSTATE_ACTIVE] = "active", ++ [AMD_PSTATE_GUIDED] = "guided", ++ NULL, ++}; + #endif /* _LINUX_AMD_PSTATE_H */ +diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h +index 6a94a6eaad27..65623233ab2f 100644 +--- a/include/linux/cpufreq.h ++++ b/include/linux/cpufreq.h +@@ -448,7 +448,7 @@ struct cpufreq_driver { + #define CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING BIT(6) + + int cpufreq_register_driver(struct cpufreq_driver *driver_data); +-int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); ++void cpufreq_unregister_driver(struct cpufreq_driver *driver_data); + + bool cpufreq_driver_test_flags(u16 flags); + const char *cpufreq_get_current_driver(void); +-- +2.39.2 + +From 087384681c8c010c8a826bc03b6aa7634f73a3bf Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sun, 22 Jan 2023 13:41:50 +0100 +Subject: [PATCH 09/15] ksm + +Signed-off-by: Peter Jung +--- + arch/alpha/kernel/syscalls/syscall.tbl | 1 + + arch/arm/tools/syscall.tbl | 1 + + arch/arm64/include/asm/unistd.h | 2 +- + arch/arm64/include/asm/unistd32.h | 2 + + arch/ia64/kernel/syscalls/syscall.tbl | 1 + + arch/m68k/kernel/syscalls/syscall.tbl | 1 + + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + + arch/parisc/kernel/syscalls/syscall.tbl | 1 + + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + + arch/s390/kernel/syscalls/syscall.tbl | 1 + + arch/sh/kernel/syscalls/syscall.tbl | 1 + + arch/sparc/kernel/syscalls/syscall.tbl | 1 + + arch/x86/entry/syscalls/syscall_32.tbl | 1 + + arch/x86/entry/syscalls/syscall_64.tbl | 1 + + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + + include/linux/ksm.h | 4 + + include/linux/syscalls.h | 1 + + include/uapi/asm-generic/unistd.h | 5 +- + kernel/sys_ni.c | 1 + + mm/ksm.c | 88 +++++++++------ + mm/madvise.c | 113 ++++++++++++++++++++ + 24 files changed, 198 insertions(+), 34 deletions(-) + +diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl +index 8ebacf37a8cf..c9d25f85d86d 100644 +--- a/arch/alpha/kernel/syscalls/syscall.tbl ++++ b/arch/alpha/kernel/syscalls/syscall.tbl +@@ -490,3 +490,4 @@ + 558 common process_mrelease sys_process_mrelease + 559 common futex_waitv sys_futex_waitv + 560 common set_mempolicy_home_node sys_ni_syscall ++561 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl +index ac964612d8b0..90933eabe115 100644 +--- a/arch/arm/tools/syscall.tbl ++++ 
b/arch/arm/tools/syscall.tbl +@@ -464,3 +464,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h +index 037feba03a51..64a514f90131 100644 +--- a/arch/arm64/include/asm/unistd.h ++++ b/arch/arm64/include/asm/unistd.h +@@ -39,7 +39,7 @@ + #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) + #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) + +-#define __NR_compat_syscalls 451 ++#define __NR_compat_syscalls 452 + #endif + + #define __ARCH_WANT_SYS_CLONE +diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h +index 604a2053d006..91f2bb7199af 100644 +--- a/arch/arm64/include/asm/unistd32.h ++++ b/arch/arm64/include/asm/unistd32.h +@@ -907,6 +907,8 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) + __SYSCALL(__NR_futex_waitv, sys_futex_waitv) + #define __NR_set_mempolicy_home_node 450 + __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) ++#define __NR_pmadv_ksm 451 ++__SYSCALL(__NR_pmadv_ksm, sys_pmadv_ksm) + + /* + * Please add new compat syscalls above this comment and update +diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl +index 72c929d9902b..0d5b1d14b2b5 100644 +--- a/arch/ia64/kernel/syscalls/syscall.tbl ++++ b/arch/ia64/kernel/syscalls/syscall.tbl +@@ -371,3 +371,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl +index b1f3940bc298..5ccf925567da 100644 +--- a/arch/m68k/kernel/syscalls/syscall.tbl ++++ b/arch/m68k/kernel/syscalls/syscall.tbl +@@ -450,3 +450,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl +index 820145e47350..6b76208597f3 100644 +--- a/arch/microblaze/kernel/syscalls/syscall.tbl ++++ b/arch/microblaze/kernel/syscalls/syscall.tbl +@@ -456,3 +456,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl +index 253ff994ed2e..e4aeedb17c38 100644 +--- a/arch/mips/kernel/syscalls/syscall_n32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n32.tbl +@@ -389,3 +389,4 @@ + 448 n32 process_mrelease sys_process_mrelease + 449 n32 futex_waitv sys_futex_waitv + 450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node ++451 n32 pmadv_ksm sys_pmadv_ksm +diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl +index 3f1886ad9d80..fe88db51efa0 100644 +--- a/arch/mips/kernel/syscalls/syscall_n64.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n64.tbl +@@ -365,3 +365,4 @@ + 448 n64 process_mrelease sys_process_mrelease + 449 n64 futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 n64 pmadv_ksm sys_pmadv_ksm +diff --git 
a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl +index 8f243e35a7b2..674cb940bd15 100644 +--- a/arch/mips/kernel/syscalls/syscall_o32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_o32.tbl +@@ -438,3 +438,4 @@ + 448 o32 process_mrelease sys_process_mrelease + 449 o32 futex_waitv sys_futex_waitv + 450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node ++451 o32 pmadv_ksm sys_pmadv_ksm +diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl +index 0e42fceb2d5e..5914aa460255 100644 +--- a/arch/parisc/kernel/syscalls/syscall.tbl ++++ b/arch/parisc/kernel/syscalls/syscall.tbl +@@ -448,3 +448,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl +index a0be127475b1..347894da4eb6 100644 +--- a/arch/powerpc/kernel/syscalls/syscall.tbl ++++ b/arch/powerpc/kernel/syscalls/syscall.tbl +@@ -537,3 +537,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl +index 799147658dee..1cd523748bd2 100644 +--- a/arch/s390/kernel/syscalls/syscall.tbl ++++ b/arch/s390/kernel/syscalls/syscall.tbl +@@ -453,3 +453,4 @@ + 448 common process_mrelease sys_process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm sys_pmadv_ksm +diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl +index 2de85c977f54..cfc75fa43eae 100644 +--- a/arch/sh/kernel/syscalls/syscall.tbl ++++ b/arch/sh/kernel/syscalls/syscall.tbl +@@ -453,3 +453,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl +index 4398cc6fb68d..d2c0a6426f6b 100644 +--- a/arch/sparc/kernel/syscalls/syscall.tbl ++++ b/arch/sparc/kernel/syscalls/syscall.tbl +@@ -496,3 +496,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl +index 320480a8db4f..331aaf1a782f 100644 +--- a/arch/x86/entry/syscalls/syscall_32.tbl ++++ b/arch/x86/entry/syscalls/syscall_32.tbl +@@ -455,3 +455,4 @@ + 448 i386 process_mrelease sys_process_mrelease + 449 i386 futex_waitv sys_futex_waitv + 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node ++451 i386 pmadv_ksm sys_pmadv_ksm +diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl +index c84d12608cd2..14902db4c01f 100644 +--- a/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/arch/x86/entry/syscalls/syscall_64.tbl +@@ -372,6 +372,7 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 
++451 common pmadv_ksm sys_pmadv_ksm + + # + # Due to a historical design error, certain syscalls are numbered differently +diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl +index 52c94ab5c205..1518e261d882 100644 +--- a/arch/xtensa/kernel/syscalls/syscall.tbl ++++ b/arch/xtensa/kernel/syscalls/syscall.tbl +@@ -421,3 +421,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/include/linux/ksm.h b/include/linux/ksm.h +index 7e232ba59b86..632a1a792ebb 100644 +--- a/include/linux/ksm.h ++++ b/include/linux/ksm.h +@@ -16,6 +16,10 @@ + #include + + #ifdef CONFIG_KSM ++int ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma, ++ unsigned long *vm_flags); ++int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start, ++ unsigned long end, unsigned long *vm_flags); + int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags); + int __ksm_enter(struct mm_struct *mm); +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index 33a0ee3bcb2e..62f14e800839 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -919,6 +919,7 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); + asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, + size_t vlen, int behavior, unsigned int flags); + asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); ++asmlinkage long sys_pmadv_ksm(int pidfd, int behavior, unsigned int flags); + asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, + unsigned long prot, unsigned long pgoff, + unsigned long flags); +diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h +index 45fa180cc56a..40f7e6d04af0 100644 +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -886,8 +886,11 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv) + #define __NR_set_mempolicy_home_node 450 + __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) + ++#define __NR_pmadv_ksm 451 ++__SYSCALL(__NR_pmadv_ksm, sys_pmadv_ksm) ++ + #undef __NR_syscalls +-#define __NR_syscalls 451 ++#define __NR_syscalls 452 + + /* + * 32 bit systems traditionally used different +diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c +index 860b2dcf3ac4..810e1fcaff94 100644 +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -292,6 +292,7 @@ COND_SYSCALL(mincore); + COND_SYSCALL(madvise); + COND_SYSCALL(process_madvise); + COND_SYSCALL(process_mrelease); ++COND_SYSCALL(pmadv_ksm); + COND_SYSCALL(remap_file_pages); + COND_SYSCALL(mbind); + COND_SYSCALL(get_mempolicy); +diff --git a/mm/ksm.c b/mm/ksm.c +index c267b92b837b..4474b7ac0cd6 100644 +--- a/mm/ksm.c ++++ b/mm/ksm.c +@@ -2575,54 +2575,78 @@ static int ksm_scan_thread(void *nothing) + return 0; + } + +-int ksm_madvise(struct vm_area_struct *vma, unsigned long start, +- unsigned long end, int advice, unsigned long *vm_flags) ++int ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma, ++ unsigned long *vm_flags) + { +- struct mm_struct *mm = vma->vm_mm; + int err; + +- switch (advice) { +- case MADV_MERGEABLE: +- /* +- * Be somewhat over-protective for now! 
+- */ +- if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | +- VM_PFNMAP | VM_IO | VM_DONTEXPAND | +- VM_HUGETLB | VM_MIXEDMAP)) +- return 0; /* just ignore the advice */ ++ /* ++ * Be somewhat over-protective for now! ++ */ ++ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | ++ VM_PFNMAP | VM_IO | VM_DONTEXPAND | ++ VM_HUGETLB | VM_MIXEDMAP)) ++ return 0; /* just ignore the advice */ + +- if (vma_is_dax(vma)) +- return 0; ++ if (vma_is_dax(vma)) ++ return 0; + + #ifdef VM_SAO + if (*vm_flags & VM_SAO) + return 0; + #endif + #ifdef VM_SPARC_ADI +- if (*vm_flags & VM_SPARC_ADI) +- return 0; ++ if (*vm_flags & VM_SPARC_ADI) ++ return 0; + #endif + +- if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { +- err = __ksm_enter(mm); +- if (err) +- return err; +- } ++ if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { ++ err = __ksm_enter(mm); ++ if (err) ++ return err; ++ } + +- *vm_flags |= VM_MERGEABLE; +- break; ++ *vm_flags |= VM_MERGEABLE; + +- case MADV_UNMERGEABLE: +- if (!(*vm_flags & VM_MERGEABLE)) +- return 0; /* just ignore the advice */ ++ return 0; ++} + +- if (vma->anon_vma) { +- err = unmerge_ksm_pages(vma, start, end); +- if (err) +- return err; +- } ++int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start, ++ unsigned long end, unsigned long *vm_flags) ++{ ++ int err; ++ ++ if (!(*vm_flags & VM_MERGEABLE)) ++ return 0; /* just ignore the advice */ ++ ++ if (vma->anon_vma) { ++ err = unmerge_ksm_pages(vma, start, end); ++ if (err) ++ return err; ++ } + +- *vm_flags &= ~VM_MERGEABLE; ++ *vm_flags &= ~VM_MERGEABLE; ++ ++ return 0; ++} ++ ++int ksm_madvise(struct vm_area_struct *vma, unsigned long start, ++ unsigned long end, int advice, unsigned long *vm_flags) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ int err; ++ ++ switch (advice) { ++ case MADV_MERGEABLE: ++ err = ksm_madvise_merge(mm, vma, vm_flags); ++ if (err) ++ return err; ++ break; ++ ++ case MADV_UNMERGEABLE: ++ err = ksm_madvise_unmerge(vma, start, end, vm_flags); ++ if (err) ++ return err; + break; + } + +diff --git a/mm/madvise.c b/mm/madvise.c +index b6ea204d4e23..0064dcafb812 100644 +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -1527,3 +1527,116 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, + out: + return ret; + } ++ ++SYSCALL_DEFINE3(pmadv_ksm, int, pidfd, int, behaviour, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ ssize_t ret; ++ struct pid *pid; ++ struct task_struct *task; ++ struct mm_struct *mm; ++ unsigned int f_flags; ++ struct vm_area_struct *vma; ++ struct vma_iterator vmi; ++ ++ if (flags != 0) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ switch (behaviour) { ++ case MADV_MERGEABLE: ++ case MADV_UNMERGEABLE: ++ break; ++ default: ++ ret = -EINVAL; ++ goto out; ++ break; ++ } ++ ++ pid = pidfd_get_pid(pidfd, &f_flags); ++ if (IS_ERR(pid)) { ++ ret = PTR_ERR(pid); ++ goto out; ++ } ++ ++ task = get_pid_task(pid, PIDTYPE_PID); ++ if (!task) { ++ ret = -ESRCH; ++ goto put_pid; ++ } ++ ++ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ ++ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); ++ if (IS_ERR_OR_NULL(mm)) { ++ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; ++ goto release_task; ++ } ++ ++ /* Require CAP_SYS_NICE for influencing process performance. 
*/ ++ if (!capable(CAP_SYS_NICE)) { ++ ret = -EPERM; ++ goto release_mm; ++ } ++ ++ if (mmap_write_lock_killable(mm)) { ++ ret = -EINTR; ++ goto release_mm; ++ } ++ ++ vma_iter_init(&vmi, mm, 0); ++ for_each_vma(vmi, vma) { ++ switch (behaviour) { ++ case MADV_MERGEABLE: ++ ret = ksm_madvise_merge(vma->vm_mm, vma, &vma->vm_flags); ++ break; ++ case MADV_UNMERGEABLE: ++ ret = ksm_madvise_unmerge(vma, vma->vm_start, vma->vm_end, &vma->vm_flags); ++ break; ++ default: ++ /* look, ma, no brain */ ++ break; ++ } ++ if (ret) ++ break; ++ } ++ ++ mmap_write_unlock(mm); ++ ++release_mm: ++ mmput(mm); ++release_task: ++ put_task_struct(task); ++put_pid: ++ put_pid(pid); ++out: ++ return ret; ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++#ifdef CONFIG_KSM ++static ssize_t ksm_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_pmadv_ksm); ++} ++static struct kobj_attribute pmadv_ksm_attr = __ATTR_RO(ksm); ++ ++static struct attribute *pmadv_sysfs_attrs[] = { ++ &pmadv_ksm_attr.attr, ++ NULL, ++}; ++ ++static const struct attribute_group pmadv_sysfs_attr_group = { ++ .attrs = pmadv_sysfs_attrs, ++ .name = "pmadv", ++}; ++ ++static int __init pmadv_sysfs_init(void) ++{ ++ return sysfs_create_group(kernel_kobj, &pmadv_sysfs_attr_group); ++} ++subsys_initcall(pmadv_sysfs_init); ++#endif /* CONFIG_KSM */ +-- +2.39.2 + +From 3fcdb0864bf3a1d90f3689ffa8acceec00a5926e Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 13 Feb 2023 09:25:47 +0100 +Subject: [PATCH 10/15] maple-lru + +Signed-off-by: Peter Jung +--- + Documentation/mm/multigen_lru.rst | 86 ++- + include/linux/fs.h | 2 + + include/linux/maple_tree.h | 6 - + include/linux/memcontrol.h | 10 + + include/linux/mm_inline.h | 19 +- + include/linux/mmzone.h | 122 +++- + lib/maple_tree.c | 113 ++- + mm/fadvise.c | 5 +- + mm/memcontrol.c | 12 + + mm/memory.c | 7 +- + mm/page_alloc.c | 1 + + mm/rmap.c | 42 +- + mm/vmscan.c | 1059 ++++++++++++++++++----------- + mm/workingset.c | 4 +- + tools/testing/radix-tree/maple.c | 18 +- + 15 files changed, 1002 insertions(+), 504 deletions(-) + +diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst +index d7062c6a8946..5f1f6ecbb79b 100644 +--- a/Documentation/mm/multigen_lru.rst ++++ b/Documentation/mm/multigen_lru.rst +@@ -89,15 +89,15 @@ variables are monotonically increasing. + + Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)`` + bits in order to fit into the gen counter in ``folio->flags``. Each +-truncated generation number is an index to ``lrugen->lists[]``. The ++truncated generation number is an index to ``lrugen->folios[]``. The + sliding window technique is used to track at least ``MIN_NR_GENS`` and + at most ``MAX_NR_GENS`` generations. The gen counter stores a value + within ``[1, MAX_NR_GENS]`` while a page is on one of +-``lrugen->lists[]``; otherwise it stores zero. ++``lrugen->folios[]``; otherwise it stores zero. + + Each generation is divided into multiple tiers. A page accessed ``N`` + times through file descriptors is in tier ``order_base_2(N)``. Unlike +-generations, tiers do not have dedicated ``lrugen->lists[]``. In ++generations, tiers do not have dedicated ``lrugen->folios[]``. In + contrast to moving across generations, which requires the LRU lock, + moving across tiers only involves atomic operations on + ``folio->flags`` and therefore has a negligible cost. A feedback loop +@@ -127,7 +127,7 @@ page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``. 
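To make the indexing arithmetic described earlier in this section concrete, the following is a small stand-alone C sketch of how a sequence number is truncated into a ``lrugen->folios[]`` index and how an access count maps to a tier. It is illustrative only: the values of ``MAX_NR_GENS`` and ``MAX_NR_TIERS`` are assumptions, ``order_base_2()`` is re-implemented naively, and the helpers are merely named after the kernel functions whose behaviour they mimic.

/*
 * Stand-alone sketch of the gen/tier indexing described above.
 * Assumed constants and naive helpers; not code from the patched headers.
 */
#include <stdio.h>

#define MAX_NR_GENS	4	/* assumed size of the generation window */
#define MAX_NR_TIERS	4	/* assumed number of tiers */

/* A truncated generation number is an index into lrugen->folios[]. */
static unsigned int lru_gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;
}

/* Smallest order such that (1 << order) >= n; 0 for n <= 1. */
static unsigned int order_base_2(unsigned int n)
{
	unsigned int order = 0;

	while ((1u << order) < n)
		order++;
	return order;
}

/* A page accessed N times through file descriptors is in tier order_base_2(N). */
static unsigned int tier_from_refs(unsigned int refs)
{
	unsigned int tier = order_base_2(refs);

	return tier < MAX_NR_TIERS ? tier : MAX_NR_TIERS - 1;
}

int main(void)
{
	unsigned long seq;
	unsigned int refs;

	for (seq = 7; seq < 11; seq++)
		printf("seq %lu -> folios[%u]\n", seq, lru_gen_from_seq(seq));

	for (refs = 1; refs <= 8; refs++)
		printf("%u reads -> tier %u\n", refs, tier_from_refs(refs));

	return 0;
}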
+ Eviction + -------- + The eviction consumes old generations. Given an ``lruvec``, it +-increments ``min_seq`` when ``lrugen->lists[]`` indexed by ++increments ``min_seq`` when ``lrugen->folios[]`` indexed by + ``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to + evict from, it first compares ``min_seq[]`` to select the older type. + If both types are equally old, it selects the one whose first tier has +@@ -141,9 +141,85 @@ loop has detected outlying refaults from the tier this page is in. To + this end, the feedback loop uses the first tier as the baseline, for + the reason stated earlier. + ++Working set protection ++---------------------- ++Each generation is timestamped at birth. If ``lru_gen_min_ttl`` is ++set, an ``lruvec`` is protected from the eviction when its oldest ++generation was born within ``lru_gen_min_ttl`` milliseconds. In other ++words, it prevents the working set of ``lru_gen_min_ttl`` milliseconds ++from getting evicted. The OOM killer is triggered if this working set ++cannot be kept in memory. ++ ++This time-based approach has the following advantages: ++ ++1. It is easier to configure because it is agnostic to applications ++ and memory sizes. ++2. It is more reliable because it is directly wired to the OOM killer. ++ ++Rmap/PT walk feedback ++--------------------- ++Searching the rmap for PTEs mapping each page on an LRU list (to test ++and clear the accessed bit) can be expensive because pages from ++different VMAs (PA space) are not cache friendly to the rmap (VA ++space). For workloads mostly using mapped pages, searching the rmap ++can incur the highest CPU cost in the reclaim path. ++ ++``lru_gen_look_around()`` exploits spatial locality to reduce the ++trips into the rmap. It scans the adjacent PTEs of a young PTE and ++promotes hot pages. If the scan was done cacheline efficiently, it ++adds the PMD entry pointing to the PTE table to the Bloom filter. This ++forms a feedback loop between the eviction and the aging. ++ ++Bloom Filters ++------------- ++Bloom filters are a space and memory efficient data structure for set ++membership test, i.e., test if an element is not in the set or may be ++in the set. ++ ++In the eviction path, specifically, in ``lru_gen_look_around()``, if a ++PMD has a sufficient number of hot pages, its address is placed in the ++filter. In the aging path, set membership means that the PTE range ++will be scanned for young pages. ++ ++Note that Bloom filters are probabilistic on set membership. If a test ++is false positive, the cost is an additional scan of a range of PTEs, ++which may yield hot pages anyway. Parameters of the filter itself can ++control the false positive rate in the limit. ++ ++Memcg LRU ++--------- ++An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs, ++since each node and memcg combination has an LRU of folios (see ++``mem_cgroup_lruvec()``). Its goal is to improve the scalability of ++global reclaim, which is critical to system-wide memory overcommit in ++data centers. Note that memcg LRU only applies to global reclaim. ++ ++The basic structure of an memcg LRU can be understood by an analogy to ++the active/inactive LRU (of folios): ++ ++1. It has the young and the old (generations), i.e., the counterparts ++ to the active and the inactive; ++2. The increment of ``max_seq`` triggers promotion, i.e., the ++ counterpart to activation; ++3. Other events trigger similar operations, e.g., offlining an memcg ++ triggers demotion, i.e., the counterpart to deactivation. 
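The following user-space C sketch models the old/young bookkeeping in the analogy above. The ``MEMCG_NR_GENS`` and ``MEMCG_NR_BINS`` values come from the definitions added later in this patch; everything else (the struct, the plain array standing in for the ``hlist_nulls`` lists, and the helper names) is a hypothetical illustration rather than the kernel implementation.

/*
 * Illustrative model only: an onlining memcg is placed into a random bin of
 * the old generation, and incrementing the per-node counter flips which
 * generation is "old". The array is a stand-in for hlist_nulls lists.
 */
#include <stdio.h>
#include <stdlib.h>

#define MEMCG_NR_GENS	2
#define MEMCG_NR_BINS	8

struct memcg_lru_model {
	unsigned long seq;			/* per-node memcg generation counter */
	unsigned long nr_memcgs[MEMCG_NR_GENS];
	int fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];	/* last memcg id added per bin */
};

/* The counter's remainder (mod MEMCG_NR_GENS) indexes the old generation. */
static int get_memcg_gen(unsigned long seq)
{
	return seq % MEMCG_NR_GENS;
}

/* Onlining adds a memcg to the tail of a random bin in the old generation. */
static void model_online_memcg(struct memcg_lru_model *lru, int memcg_id)
{
	int gen = get_memcg_gen(lru->seq);
	int bin = rand() % MEMCG_NR_BINS;

	lru->fifo[gen][bin] = memcg_id;		/* the real code appends to an hlist_nulls */
	lru->nr_memcgs[gen]++;
	printf("memcg %d -> generation %d (old), bin %d\n", memcg_id, gen, bin);
}

int main(void)
{
	struct memcg_lru_model lru = { .seq = 0 };

	model_online_memcg(&lru, 1);
	model_online_memcg(&lru, 2);

	/* Once every bin of the old generation is drained, seq is incremented
	 * and the roles of the two generations swap. */
	lru.seq++;
	model_online_memcg(&lru, 3);

	return 0;
}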
++ ++In terms of global reclaim, it has two distinct features: ++ ++1. Sharding, which allows each thread to start at a random memcg (in ++ the old generation) and improves parallelism; ++2. Eventual fairness, which allows direct reclaim to bail out at will ++ and reduces latency without affecting fairness over some time. ++ ++In terms of traversing memcgs during global reclaim, it improves the ++best-case complexity from O(n) to O(1) and does not affect the ++worst-case complexity O(n). Therefore, on average, it has a sublinear ++complexity. ++ + Summary + ------- +-The multi-gen LRU can be disassembled into the following parts: ++The multi-gen LRU (of folios) can be disassembled into the following ++parts: + + * Generations + * Rmap walks +diff --git a/include/linux/fs.h b/include/linux/fs.h +index c1769a2c5d70..d353c262d669 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -166,6 +166,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, + /* File supports DIRECT IO */ + #define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000) + ++#define FMODE_NOREUSE ((__force fmode_t)0x800000) ++ + /* File was opened by fanotify and shouldn't generate fanotify events */ + #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) + +diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h +index e594db58a0f1..815a27661517 100644 +--- a/include/linux/maple_tree.h ++++ b/include/linux/maple_tree.h +@@ -12,7 +12,6 @@ + #include + #include + /* #define CONFIG_MAPLE_RCU_DISABLED */ +-/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */ + + /* + * Allocated nodes are mutable until they have been inserted into the tree, +@@ -483,9 +482,6 @@ static inline bool mas_is_paused(struct ma_state *mas) + return mas->node == MAS_PAUSE; + } + +-void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas); +-void mas_dup_store(struct ma_state *mas, void *entry); +- + /* + * This finds an empty area from the highest address to the lowest. + * AKA "Topdown" version, +@@ -517,7 +513,6 @@ static inline void mas_reset(struct ma_state *mas) + * entry. + * + * Note: may return the zero entry. 
+- * + */ + #define mas_for_each(__mas, __entry, __max) \ + while (((__entry) = mas_find((__mas), (__max))) != NULL) +@@ -639,7 +634,6 @@ static inline void mt_set_in_rcu(struct maple_tree *mt) + } + + static inline unsigned int mt_height(const struct maple_tree *mt) +- + { + return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; + } +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index 85dc9b88ea37..8e0be0680005 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -794,6 +794,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) + percpu_ref_put(&objcg->refcnt); + } + ++static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) ++{ ++ return !memcg || css_tryget(&memcg->css); ++} ++ + static inline void mem_cgroup_put(struct mem_cgroup *memcg) + { + if (memcg) +@@ -1301,6 +1306,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) + { + } + ++static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) ++{ ++ return true; ++} ++ + static inline void mem_cgroup_put(struct mem_cgroup *memcg) + { + } +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index ff3f3f23f649..de1e622dd366 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -178,7 +178,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli + int zone = folio_zonenum(folio); + int delta = folio_nr_pages(folio); + enum lru_list lru = type * LRU_INACTIVE_FILE; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); +@@ -224,7 +224,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, + int gen = folio_lru_gen(folio); + int type = folio_is_file_lru(folio); + int zone = folio_zonenum(folio); +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); + +@@ -256,9 +256,9 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, + lru_gen_update_size(lruvec, folio, -1, gen); + /* for folio_rotate_reclaimable() */ + if (reclaiming) +- list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]); ++ list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); + else +- list_add(&folio->lru, &lrugen->lists[gen][type][zone]); ++ list_add(&folio->lru, &lrugen->folios[gen][type][zone]); + + return true; + } +@@ -577,4 +577,15 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, + #endif + } + ++static inline bool vma_has_recency(struct vm_area_struct *vma) ++{ ++ if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) ++ return false; ++ ++ if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) ++ return false; ++ ++ return true; ++} ++ + #endif +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index cd28a100d9e4..977be526c939 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -7,6 +7,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -312,7 +313,7 @@ enum lruvec_flags { + * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An + * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the + * corresponding generation. The gen counter in folio->flags stores gen+1 while +- * a page is on one of lrugen->lists[]. Otherwise it stores 0. 
++ * a page is on one of lrugen->folios[]. Otherwise it stores 0. + * + * A page is added to the youngest generation on faulting. The aging needs to + * check the accessed bit at least twice before handing this page over to the +@@ -324,8 +325,8 @@ enum lruvec_flags { + * rest of generations, if they exist, are considered inactive. See + * lru_gen_is_active(). + * +- * PG_active is always cleared while a page is on one of lrugen->lists[] so that +- * the aging needs not to worry about it. And it's set again when a page ++ * PG_active is always cleared while a page is on one of lrugen->folios[] so ++ * that the aging needs not to worry about it. And it's set again when a page + * considered active is isolated for non-reclaiming purposes, e.g., migration. + * See lru_gen_add_folio() and lru_gen_del_folio(). + * +@@ -404,7 +405,7 @@ enum { + * The number of pages in each generation is eventually consistent and therefore + * can be transiently negative when reset_batch_size() is pending. + */ +-struct lru_gen_struct { ++struct lru_gen_folio { + /* the aging increments the youngest generation number */ + unsigned long max_seq; + /* the eviction increments the oldest generation numbers */ +@@ -412,7 +413,7 @@ struct lru_gen_struct { + /* the birth time of each generation in jiffies */ + unsigned long timestamps[MAX_NR_GENS]; + /* the multi-gen LRU lists, lazily sorted on eviction */ +- struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the multi-gen LRU sizes, eventually consistent */ + long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the exponential moving average of refaulted */ +@@ -426,6 +427,14 @@ struct lru_gen_struct { + atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + /* whether the multi-gen LRU is enabled */ + bool enabled; ++#ifdef CONFIG_MEMCG ++ /* the memcg generation this lru_gen_folio belongs to */ ++ u8 gen; ++ /* the list segment this lru_gen_folio belongs to */ ++ u8 seg; ++ /* per-node lru_gen_folio list for global reclaim */ ++ struct hlist_nulls_node list; ++#endif + }; + + enum { +@@ -461,7 +470,7 @@ struct lru_gen_mm_state { + struct lru_gen_mm_walk { + /* the lruvec under reclaim */ + struct lruvec *lruvec; +- /* unstable max_seq from lru_gen_struct */ ++ /* unstable max_seq from lru_gen_folio */ + unsigned long max_seq; + /* the next address within an mm to scan */ + unsigned long next_addr; +@@ -479,12 +488,87 @@ void lru_gen_init_lruvec(struct lruvec *lruvec); + void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); + + #ifdef CONFIG_MEMCG ++ ++/* ++ * For each node, memcgs are divided into two generations: the old and the ++ * young. For each generation, memcgs are randomly sharded into multiple bins ++ * to improve scalability. For each bin, the hlist_nulls is virtually divided ++ * into three segments: the head, the tail and the default. ++ * ++ * An onlining memcg is added to the tail of a random bin in the old generation. ++ * The eviction starts at the head of a random bin in the old generation. The ++ * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes ++ * the old generation, is incremented when all its bins become empty. ++ * ++ * There are four operations: ++ * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its ++ * current generation (old or young) and updates its "seg" to "head"; ++ * 2. 
MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its ++ * current generation (old or young) and updates its "seg" to "tail"; ++ * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old ++ * generation, updates its "gen" to "old" and resets its "seg" to "default"; ++ * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the ++ * young generation, updates its "gen" to "young" and resets its "seg" to ++ * "default". ++ * ++ * The events that trigger the above operations are: ++ * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; ++ * 2. The first attempt to reclaim an memcg below low, which triggers ++ * MEMCG_LRU_TAIL; ++ * 3. The first attempt to reclaim an memcg below reclaimable size threshold, ++ * which triggers MEMCG_LRU_TAIL; ++ * 4. The second attempt to reclaim an memcg below reclaimable size threshold, ++ * which triggers MEMCG_LRU_YOUNG; ++ * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; ++ * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; ++ * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. ++ * ++ * Note that memcg LRU only applies to global reclaim, and the round-robin ++ * incrementing of their max_seq counters ensures the eventual fairness to all ++ * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). ++ */ ++#define MEMCG_NR_GENS 2 ++#define MEMCG_NR_BINS 8 ++ ++struct lru_gen_memcg { ++ /* the per-node memcg generation counter */ ++ unsigned long seq; ++ /* each memcg has one lru_gen_folio per node */ ++ unsigned long nr_memcgs[MEMCG_NR_GENS]; ++ /* per-node lru_gen_folio list for global reclaim */ ++ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; ++ /* protects the above */ ++ spinlock_t lock; ++}; ++ ++void lru_gen_init_pgdat(struct pglist_data *pgdat); ++ + void lru_gen_init_memcg(struct mem_cgroup *memcg); + void lru_gen_exit_memcg(struct mem_cgroup *memcg); +-#endif ++void lru_gen_online_memcg(struct mem_cgroup *memcg); ++void lru_gen_offline_memcg(struct mem_cgroup *memcg); ++void lru_gen_release_memcg(struct mem_cgroup *memcg); ++void lru_gen_soft_reclaim(struct lruvec *lruvec); ++ ++#else /* !CONFIG_MEMCG */ ++ ++#define MEMCG_NR_GENS 1 ++ ++struct lru_gen_memcg { ++}; ++ ++static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) ++{ ++} ++ ++#endif /* CONFIG_MEMCG */ + + #else /* !CONFIG_LRU_GEN */ + ++static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) ++{ ++} ++ + static inline void lru_gen_init_lruvec(struct lruvec *lruvec) + { + } +@@ -494,6 +578,7 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + } + + #ifdef CONFIG_MEMCG ++ + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) + { + } +@@ -501,7 +586,24 @@ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) + static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) + { + } +-#endif ++ ++static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) ++{ ++} ++ ++static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) ++{ ++} ++ ++static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) ++{ ++} ++ ++static inline void lru_gen_soft_reclaim(struct lruvec *lruvec) ++{ ++} ++ ++#endif /* CONFIG_MEMCG */ + + #endif /* CONFIG_LRU_GEN */ + +@@ -524,7 +626,7 @@ struct lruvec { + unsigned long flags; + #ifdef CONFIG_LRU_GEN + /* evictable pages divided into generations */ +- struct lru_gen_struct lrugen; ++ struct lru_gen_folio lrugen; + /* to 
concurrently iterate lru_gen_mm_list */ + struct lru_gen_mm_state mm_state; + #endif +@@ -1243,6 +1345,8 @@ typedef struct pglist_data { + #ifdef CONFIG_LRU_GEN + /* kswap mm walk data */ + struct lru_gen_mm_walk mm_walk; ++ /* lru_gen_folio list */ ++ struct lru_gen_memcg memcg_lru; + #endif + + CACHELINE_PADDING(_pad2_); +diff --git a/lib/maple_tree.c b/lib/maple_tree.c +index 5a976393c9ae..b95652b79b55 100644 +--- a/lib/maple_tree.c ++++ b/lib/maple_tree.c +@@ -149,13 +149,12 @@ struct maple_subtree_state { + /* Functions */ + static inline struct maple_node *mt_alloc_one(gfp_t gfp) + { +- return kmem_cache_alloc(maple_node_cache, gfp | __GFP_ZERO); ++ return kmem_cache_alloc(maple_node_cache, gfp); + } + + static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) + { +- return kmem_cache_alloc_bulk(maple_node_cache, gfp | __GFP_ZERO, size, +- nodes); ++ return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); + } + + static inline void mt_free_bulk(size_t size, void __rcu **nodes) +@@ -183,7 +182,6 @@ static void ma_free_rcu(struct maple_node *node) + call_rcu(&node->rcu, mt_free_rcu); + } + +- + static void mas_set_height(struct ma_state *mas) + { + unsigned int new_flags = mas->tree->ma_flags; +@@ -468,7 +466,7 @@ static inline + void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, + unsigned char slot) + { +- unsigned long val = (unsigned long) parent; ++ unsigned long val = (unsigned long)parent; + unsigned long shift; + unsigned long type; + enum maple_type p_type = mte_node_type(parent); +@@ -502,10 +500,9 @@ void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, + */ + static inline unsigned int mte_parent_slot(const struct maple_enode *enode) + { +- unsigned long val = (unsigned long) mte_to_node(enode)->parent; ++ unsigned long val = (unsigned long)mte_to_node(enode)->parent; + +- /* Root. */ +- if (val & 1) ++ if (val & MA_ROOT_PARENT) + return 0; + + /* +@@ -1128,9 +1125,10 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) + { + struct maple_alloc *ret, *node = mas->alloc; + unsigned long total = mas_allocated(mas); ++ unsigned int req = mas_alloc_req(mas); + + /* nothing or a request pending. */ +- if (unlikely(!total)) ++ if (WARN_ON(!total)) + return NULL; + + if (total == 1) { +@@ -1140,27 +1138,25 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) + goto single_node; + } + +- if (!node->node_count) { ++ if (node->node_count == 1) { + /* Single allocation in this node. 
*/ + mas->alloc = node->slot[0]; +- node->slot[0] = NULL; + mas->alloc->total = node->total - 1; + ret = node; + goto new_head; + } +- + node->total--; +- ret = node->slot[node->node_count]; +- node->slot[node->node_count--] = NULL; ++ ret = node->slot[--node->node_count]; ++ node->slot[node->node_count] = NULL; + + single_node: + new_head: +- ret->total = 0; +- ret->node_count = 0; +- if (ret->request_count) { +- mas_set_alloc_req(mas, ret->request_count + 1); +- ret->request_count = 0; ++ if (req) { ++ req++; ++ mas_set_alloc_req(mas, req); + } ++ ++ memset(ret, 0, sizeof(*ret)); + return (struct maple_node *)ret; + } + +@@ -1179,21 +1175,20 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) + unsigned long count; + unsigned int requested = mas_alloc_req(mas); + +- memset(reuse, 0, sizeof(*reuse)); + count = mas_allocated(mas); + +- if (count && (head->node_count < MAPLE_ALLOC_SLOTS - 1)) { +- if (head->slot[0]) +- head->node_count++; +- head->slot[head->node_count] = reuse; ++ reuse->request_count = 0; ++ reuse->node_count = 0; ++ if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) { ++ head->slot[head->node_count++] = reuse; + head->total++; + goto done; + } + + reuse->total = 1; + if ((head) && !((unsigned long)head & 0x1)) { +- head->request_count = 0; + reuse->slot[0] = head; ++ reuse->node_count = 1; + reuse->total += head->total; + } + +@@ -1212,7 +1207,6 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) + { + struct maple_alloc *node; + unsigned long allocated = mas_allocated(mas); +- unsigned long success = allocated; + unsigned int requested = mas_alloc_req(mas); + unsigned int count; + void **slots = NULL; +@@ -1228,24 +1222,29 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) + WARN_ON(!allocated); + } + +- if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS - 1) { ++ if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { + node = (struct maple_alloc *)mt_alloc_one(gfp); + if (!node) + goto nomem_one; + +- if (allocated) ++ if (allocated) { + node->slot[0] = mas->alloc; ++ node->node_count = 1; ++ } else { ++ node->node_count = 0; ++ } + +- success++; + mas->alloc = node; ++ node->total = ++allocated; + requested--; + } + + node = mas->alloc; ++ node->request_count = 0; + while (requested) { + max_req = MAPLE_ALLOC_SLOTS; +- if (node->slot[0]) { +- unsigned int offset = node->node_count + 1; ++ if (node->node_count) { ++ unsigned int offset = node->node_count; + + slots = (void **)&node->slot[offset]; + max_req -= offset; +@@ -1259,15 +1258,13 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) + goto nomem_bulk; + + node->node_count += count; +- /* zero indexed. 
*/ +- if (slots == (void **)&node->slot) +- node->node_count--; +- +- success += count; ++ allocated += count; + node = node->slot[0]; ++ node->node_count = 0; ++ node->request_count = 0; + requested -= count; + } +- mas->alloc->total = success; ++ mas->alloc->total = allocated; + return; + + nomem_bulk: +@@ -1276,10 +1273,8 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) + nomem_one: + mas_set_alloc_req(mas, requested); + if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) +- mas->alloc->total = success; ++ mas->alloc->total = allocated; + mas_set_err(mas, -ENOMEM); +- return; +- + } + + /* +@@ -1887,10 +1882,9 @@ static inline int mab_calc_split(struct ma_state *mas, + + /* Avoid ending a node on a NULL entry */ + split = mab_no_null_split(bn, split, slot_count); +- if (!(*mid_split)) +- return split; + +- *mid_split = mab_no_null_split(bn, *mid_split, slot_count); ++ if (unlikely(*mid_split)) ++ *mid_split = mab_no_null_split(bn, *mid_split, slot_count); + + return split; + } +@@ -2947,7 +2941,7 @@ static inline void *mtree_range_walk(struct ma_state *mas) + mas->min = prev_min; + mas->max = prev_max; + mas->node = last; +- return (void *) next; ++ return (void *)next; + + dead_node: + mas_reset(mas); +@@ -3467,7 +3461,6 @@ static inline bool mas_push_data(struct ma_state *mas, int height, + */ + static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) + { +- + struct maple_subtree_state mast; + int height = 0; + unsigned char mid_split, split = 0; +@@ -3893,7 +3886,7 @@ static inline void *mtree_lookup_walk(struct ma_state *mas) + goto dead_node; + } while (!ma_is_leaf(type)); + +- return (void *) next; ++ return (void *)next; + + dead_node: + mas_reset(mas); +@@ -4711,15 +4704,11 @@ static inline void *mas_next_nentry(struct ma_state *mas, + + static inline void mas_rewalk(struct ma_state *mas, unsigned long index) + { +- + retry: + mas_set(mas, index); + mas_state_walk(mas); + if (mas_is_start(mas)) + goto retry; +- +- return; +- + } + + /* +@@ -5590,8 +5579,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, + + /* + * mte_destroy_walk() - Free a tree or sub-tree. +- * @enode - the encoded maple node (maple_enode) to start +- * @mn - the tree to free - needed for node types. ++ * @enode: the encoded maple node (maple_enode) to start ++ * @mt: the tree to free - needed for node types. + * + * Must hold the write lock. 
+ */ +@@ -5620,7 +5609,6 @@ static void mas_wr_store_setup(struct ma_wr_state *wr_mas) + mas_reset(wr_mas->mas); + } + } +- + } + + /* Interface */ +@@ -5745,6 +5733,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) + void mas_destroy(struct ma_state *mas) + { + struct maple_alloc *node; ++ unsigned long total; + + /* + * When using mas_for_each() to insert an expected number of elements, +@@ -5767,14 +5756,20 @@ void mas_destroy(struct ma_state *mas) + } + mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); + +- while (mas->alloc && !((unsigned long)mas->alloc & 0x1)) { ++ total = mas_allocated(mas); ++ while (total) { + node = mas->alloc; + mas->alloc = node->slot[0]; +- if (node->node_count > 0) +- mt_free_bulk(node->node_count, +- (void __rcu **)&node->slot[1]); ++ if (node->node_count > 1) { ++ size_t count = node->node_count - 1; ++ ++ mt_free_bulk(count, (void __rcu **)&node->slot[1]); ++ total -= count; ++ } + kmem_cache_free(maple_node_cache, node); ++ total--; + } ++ + mas->alloc = NULL; + } + EXPORT_SYMBOL_GPL(mas_destroy); +@@ -6734,7 +6729,7 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, + + if (i < (MAPLE_RANGE64_SLOTS - 1)) + last = node->pivot[i]; +- else if (!node->slot[i] && max != mt_max[mte_node_type(entry)]) ++ else if (!node->slot[i] && max != mt_node_max(entry)) + break; + if (last == 0 && i > 0) + break; +@@ -6841,7 +6836,7 @@ void mt_dump(const struct maple_tree *mt) + if (!xa_is_node(entry)) + mt_dump_entry(entry, 0, 0, 0); + else if (entry) +- mt_dump_node(mt, entry, 0, mt_max[mte_node_type(entry)], 0); ++ mt_dump_node(mt, entry, 0, mt_node_max(entry), 0); + } + EXPORT_SYMBOL_GPL(mt_dump); + +diff --git a/mm/fadvise.c b/mm/fadvise.c +index bf04fec87f35..fb7c5f43fd2a 100644 +--- a/mm/fadvise.c ++++ b/mm/fadvise.c +@@ -80,7 +80,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) + case POSIX_FADV_NORMAL: + file->f_ra.ra_pages = bdi->ra_pages; + spin_lock(&file->f_lock); +- file->f_mode &= ~FMODE_RANDOM; ++ file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE); + spin_unlock(&file->f_lock); + break; + case POSIX_FADV_RANDOM: +@@ -107,6 +107,9 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) + force_page_cache_readahead(mapping, file, start_index, nrpages); + break; + case POSIX_FADV_NOREUSE: ++ spin_lock(&file->f_lock); ++ file->f_mode |= FMODE_NOREUSE; ++ spin_unlock(&file->f_lock); + break; + case POSIX_FADV_DONTNEED: + __filemap_fdatawrite_range(mapping, offset, endbyte, +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 73afff8062f9..7fe2f4f36cf4 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -477,6 +477,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) + struct mem_cgroup_per_node *mz; + struct mem_cgroup_tree_per_node *mctz; + ++ if (lru_gen_enabled()) { ++ if (soft_limit_excess(memcg)) ++ lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec); ++ return; ++ } ++ + mctz = soft_limit_tree.rb_tree_per_node[nid]; + if (!mctz) + return; +@@ -3526,6 +3532,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, + struct mem_cgroup_tree_per_node *mctz; + unsigned long excess; + ++ if (lru_gen_enabled()) ++ return 0; ++ + if (order > 0) + return 0; + +@@ -5382,6 +5391,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) + if (unlikely(mem_cgroup_is_root(memcg))) + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, + 2UL*HZ); ++ lru_gen_online_memcg(memcg); + return 0; + 
offline_kmem: + memcg_offline_kmem(memcg); +@@ -5413,6 +5423,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) + memcg_offline_kmem(memcg); + reparent_shrinker_deferred(memcg); + wb_memcg_offline(memcg); ++ lru_gen_offline_memcg(memcg); + + drain_all_stock(memcg); + +@@ -5424,6 +5435,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + invalidate_reclaim_iterators(memcg); ++ lru_gen_release_memcg(memcg); + } + + static void mem_cgroup_css_free(struct cgroup_subsys_state *css) +diff --git a/mm/memory.c b/mm/memory.c +index f526b9152bef..4ad62eba3cb7 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1392,8 +1392,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, + force_flush = 1; + } + } +- if (pte_young(ptent) && +- likely(!(vma->vm_flags & VM_SEQ_READ))) ++ if (pte_young(ptent) && likely(vma_has_recency(vma))) + mark_page_accessed(page); + } + rss[mm_counter(page)]--; +@@ -5140,8 +5139,8 @@ static inline void mm_account_fault(struct pt_regs *regs, + #ifdef CONFIG_LRU_GEN + static void lru_gen_enter_fault(struct vm_area_struct *vma) + { +- /* the LRU algorithm doesn't apply to sequential or random reads */ +- current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)); ++ /* the LRU algorithm only applies to accesses with recency */ ++ current->in_lru_fault = vma_has_recency(vma); + } + + static void lru_gen_exit_fault(void) +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 3aec9a6a9cb7..6658cbf43f5d 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -7943,6 +7943,7 @@ static void __init free_area_init_node(int nid) + pgdat_set_deferred_range(pgdat); + + free_area_init_core(pgdat); ++ lru_gen_init_pgdat(pgdat); + } + + static void __init free_area_init_memoryless_node(int nid) +diff --git a/mm/rmap.c b/mm/rmap.c +index b616870a09be..7b9205cb7d87 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -823,25 +823,14 @@ static bool folio_referenced_one(struct folio *folio, + } + + if (pvmw.pte) { +- if (lru_gen_enabled() && pte_young(*pvmw.pte) && +- !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) { ++ if (lru_gen_enabled() && pte_young(*pvmw.pte)) { + lru_gen_look_around(&pvmw); + referenced++; + } + + if (ptep_clear_flush_young_notify(vma, address, +- pvmw.pte)) { +- /* +- * Don't treat a reference through +- * a sequentially read mapping as such. +- * If the folio has been used in another mapping, +- * we will catch it; if this other mapping is +- * already gone, the unmap path will have set +- * the referenced flag or activated the folio. +- */ +- if (likely(!(vma->vm_flags & VM_SEQ_READ))) +- referenced++; +- } ++ pvmw.pte)) ++ referenced++; + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (pmdp_clear_flush_young_notify(vma, address, + pvmw.pmd)) +@@ -875,7 +864,20 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) + struct folio_referenced_arg *pra = arg; + struct mem_cgroup *memcg = pra->memcg; + +- if (!mm_match_cgroup(vma->vm_mm, memcg)) ++ /* ++ * Ignore references from this mapping if it has no recency. If the ++ * folio has been used in another mapping, we will catch it; if this ++ * other mapping is already gone, the unmap path will have set the ++ * referenced flag or activated the folio in zap_pte_range(). ++ */ ++ if (!vma_has_recency(vma)) ++ return true; ++ ++ /* ++ * If we are reclaiming on behalf of a cgroup, skip counting on behalf ++ * of references from different cgroups. 
++ */ ++ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) + return true; + + return false; +@@ -906,6 +908,7 @@ int folio_referenced(struct folio *folio, int is_locked, + .arg = (void *)&pra, + .anon_lock = folio_lock_anon_vma_read, + .try_lock = true, ++ .invalid_vma = invalid_folio_referenced_vma, + }; + + *vm_flags = 0; +@@ -921,15 +924,6 @@ int folio_referenced(struct folio *folio, int is_locked, + return 1; + } + +- /* +- * If we are reclaiming on behalf of a cgroup, skip +- * counting on behalf of references from different +- * cgroups +- */ +- if (memcg) { +- rwc.invalid_vma = invalid_folio_referenced_vma; +- } +- + rmap_walk(folio, &rwc); + *vm_flags = pra.vm_flags; + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 160acbbdf111..04a54656b6b7 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -55,6 +55,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -135,12 +137,6 @@ struct scan_control { + /* Always discard instead of demoting to lower tier memory */ + unsigned int no_demotion:1; + +-#ifdef CONFIG_LRU_GEN +- /* help kswapd make better choices among multiple memcgs */ +- unsigned int memcgs_need_aging:1; +- unsigned long last_reclaimed; +-#endif +- + /* Allocation order */ + s8 order; + +@@ -453,6 +449,11 @@ static bool cgroup_reclaim(struct scan_control *sc) + return sc->target_mem_cgroup; + } + ++static bool global_reclaim(struct scan_control *sc) ++{ ++ return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); ++} ++ + /** + * writeback_throttling_sane - is the usual dirty throttling mechanism available? + * @sc: scan_control in question +@@ -503,6 +504,11 @@ static bool cgroup_reclaim(struct scan_control *sc) + return false; + } + ++static bool global_reclaim(struct scan_control *sc) ++{ ++ return true; ++} ++ + static bool writeback_throttling_sane(struct scan_control *sc) + { + return true; +@@ -3184,6 +3190,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) + ++#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) ++#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) ++ + static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) + { + struct pglist_data *pgdat = NODE_DATA(nid); +@@ -3209,6 +3218,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + ++ if (!sc->may_swap) ++ return 0; ++ + if (!can_demote(pgdat->node_id, sc) && + mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) + return 0; +@@ -3223,12 +3235,104 @@ static int get_nr_gens(struct lruvec *lruvec, int type) + + static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) + { +- /* see the comment on lru_gen_struct */ ++ /* see the comment on lru_gen_folio */ + return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && + get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && + get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; + } + ++/****************************************************************************** ++ * Bloom filters ++ ******************************************************************************/ ++ ++/* ++ * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when ++ * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of ++ * bits in a bitmap, k is the number of hash functions and n is the number of ++ * inserted items. 
++ * ++ * Page table walkers use one of the two filters to reduce their search space. ++ * To get rid of non-leaf entries that no longer have enough leaf entries, the ++ * aging uses the double-buffering technique to flip to the other filter each ++ * time it produces a new generation. For non-leaf entries that have enough ++ * leaf entries, the aging carries them over to the next generation in ++ * walk_pmd_range(); the eviction also report them when walking the rmap ++ * in lru_gen_look_around(). ++ * ++ * For future optimizations: ++ * 1. It's not necessary to keep both filters all the time. The spare one can be ++ * freed after the RCU grace period and reallocated if needed again. ++ * 2. And when reallocating, it's worth scaling its size according to the number ++ * of inserted entries in the other filter, to reduce the memory overhead on ++ * small systems and false positives on large systems. ++ * 3. Jenkins' hash function is an alternative to Knuth's. ++ */ ++#define BLOOM_FILTER_SHIFT 15 ++ ++static inline int filter_gen_from_seq(unsigned long seq) ++{ ++ return seq % NR_BLOOM_FILTERS; ++} ++ ++static void get_item_key(void *item, int *key) ++{ ++ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); ++ ++ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); ++ ++ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); ++ key[1] = hash >> BLOOM_FILTER_SHIFT; ++} ++ ++static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) ++{ ++ int key[2]; ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = READ_ONCE(lruvec->mm_state.filters[gen]); ++ if (!filter) ++ return true; ++ ++ get_item_key(item, key); ++ ++ return test_bit(key[0], filter) && test_bit(key[1], filter); ++} ++ ++static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) ++{ ++ int key[2]; ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = READ_ONCE(lruvec->mm_state.filters[gen]); ++ if (!filter) ++ return; ++ ++ get_item_key(item, key); ++ ++ if (!test_bit(key[0], filter)) ++ set_bit(key[0], filter); ++ if (!test_bit(key[1], filter)) ++ set_bit(key[1], filter); ++} ++ ++static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) ++{ ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = lruvec->mm_state.filters[gen]; ++ if (filter) { ++ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); ++ return; ++ } ++ ++ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), ++ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); ++ WRITE_ONCE(lruvec->mm_state.filters[gen], filter); ++} ++ + /****************************************************************************** + * mm_struct list + ******************************************************************************/ +@@ -3348,94 +3452,6 @@ void lru_gen_migrate_mm(struct mm_struct *mm) + } + #endif + +-/* +- * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when +- * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of +- * bits in a bitmap, k is the number of hash functions and n is the number of +- * inserted items. +- * +- * Page table walkers use one of the two filters to reduce their search space. +- * To get rid of non-leaf entries that no longer have enough leaf entries, the +- * aging uses the double-buffering technique to flip to the other filter each +- * time it produces a new generation. 
For non-leaf entries that have enough +- * leaf entries, the aging carries them over to the next generation in +- * walk_pmd_range(); the eviction also report them when walking the rmap +- * in lru_gen_look_around(). +- * +- * For future optimizations: +- * 1. It's not necessary to keep both filters all the time. The spare one can be +- * freed after the RCU grace period and reallocated if needed again. +- * 2. And when reallocating, it's worth scaling its size according to the number +- * of inserted entries in the other filter, to reduce the memory overhead on +- * small systems and false positives on large systems. +- * 3. Jenkins' hash function is an alternative to Knuth's. +- */ +-#define BLOOM_FILTER_SHIFT 15 +- +-static inline int filter_gen_from_seq(unsigned long seq) +-{ +- return seq % NR_BLOOM_FILTERS; +-} +- +-static void get_item_key(void *item, int *key) +-{ +- u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); +- +- BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); +- +- key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); +- key[1] = hash >> BLOOM_FILTER_SHIFT; +-} +- +-static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) +-{ +- unsigned long *filter; +- int gen = filter_gen_from_seq(seq); +- +- filter = lruvec->mm_state.filters[gen]; +- if (filter) { +- bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); +- return; +- } +- +- filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), +- __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); +- WRITE_ONCE(lruvec->mm_state.filters[gen], filter); +-} +- +-static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +-{ +- int key[2]; +- unsigned long *filter; +- int gen = filter_gen_from_seq(seq); +- +- filter = READ_ONCE(lruvec->mm_state.filters[gen]); +- if (!filter) +- return; +- +- get_item_key(item, key); +- +- if (!test_bit(key[0], filter)) +- set_bit(key[0], filter); +- if (!test_bit(key[1], filter)) +- set_bit(key[1], filter); +-} +- +-static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +-{ +- int key[2]; +- unsigned long *filter; +- int gen = filter_gen_from_seq(seq); +- +- filter = READ_ONCE(lruvec->mm_state.filters[gen]); +- if (!filter) +- return true; +- +- get_item_key(item, key); +- +- return test_bit(key[0], filter) && test_bit(key[1], filter); +-} +- + static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) + { + int i; +@@ -3623,7 +3639,7 @@ struct ctrl_pos { + static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, + struct ctrl_pos *pos) + { +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + pos->refaulted = lrugen->avg_refaulted[type][tier] + +@@ -3638,7 +3654,7 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, + static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) + { + int hist, tier; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; + unsigned long seq = carryover ? 
lrugen->min_seq[type] : lrugen->max_seq + 1; + +@@ -3715,7 +3731,7 @@ static int folio_update_gen(struct folio *folio, int gen) + static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) + { + int type = folio_is_file_lru(folio); +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + +@@ -3760,7 +3776,7 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, + static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) + { + int gen, type, zone; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + walk->batched = 0; + +@@ -3793,7 +3809,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal + if (is_vm_hugetlb_page(vma)) + return true; + +- if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) ++ if (!vma_has_recency(vma)) ++ return true; ++ ++ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) + return true; + + if (vma == get_gate_vma(vma->vm_mm)) +@@ -3988,8 +4007,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, + } + + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) +-static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, +- struct mm_walk *args, unsigned long *bitmap, unsigned long *start) ++static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, ++ struct mm_walk *args, unsigned long *bitmap, unsigned long *first) + { + int i; + pmd_t *pmd; +@@ -4002,18 +4021,19 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area + VM_WARN_ON_ONCE(pud_leaf(*pud)); + + /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ +- if (*start == -1) { +- *start = next; ++ if (*first == -1) { ++ *first = addr; ++ bitmap_zero(bitmap, MIN_LRU_BATCH); + return; + } + +- i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); ++ i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first); + if (i && i <= MIN_LRU_BATCH) { + __set_bit(i - 1, bitmap); + return; + } + +- pmd = pmd_offset(pud, *start); ++ pmd = pmd_offset(pud, *first); + + ptl = pmd_lockptr(args->mm, pmd); + if (!spin_trylock(ptl)) +@@ -4024,15 +4044,16 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area + do { + unsigned long pfn; + struct folio *folio; +- unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start; ++ ++ /* don't round down the first address */ ++ addr = i ? 
(*first & PMD_MASK) + i * PMD_SIZE : *first; + + pfn = get_pmd_pfn(pmd[i], vma, addr); + if (pfn == -1) + goto next; + + if (!pmd_trans_huge(pmd[i])) { +- if (arch_has_hw_nonleaf_pmd_young() && +- get_cap(LRU_GEN_NONLEAF_YOUNG)) ++ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) + pmdp_test_and_clear_young(vma, addr, pmd + i); + goto next; + } +@@ -4061,12 +4082,11 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area + arch_leave_lazy_mmu_mode(); + spin_unlock(ptl); + done: +- *start = -1; +- bitmap_zero(bitmap, MIN_LRU_BATCH); ++ *first = -1; + } + #else +-static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, +- struct mm_walk *args, unsigned long *bitmap, unsigned long *start) ++static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, ++ struct mm_walk *args, unsigned long *bitmap, unsigned long *first) + { + } + #endif +@@ -4079,9 +4099,9 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, + unsigned long next; + unsigned long addr; + struct vm_area_struct *vma; +- unsigned long pos = -1; ++ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; ++ unsigned long first = -1; + struct lru_gen_mm_walk *walk = args->private; +- unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + + VM_WARN_ON_ONCE(pud_leaf(*pud)); + +@@ -4120,18 +4140,17 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + continue; + +- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); ++ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); + continue; + } + #endif + walk->mm_stats[MM_NONLEAF_TOTAL]++; + +- if (arch_has_hw_nonleaf_pmd_young() && +- get_cap(LRU_GEN_NONLEAF_YOUNG)) { ++ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) { + if (!pmd_young(val)) + continue; + +- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); ++ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); + } + + if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) +@@ -4148,7 +4167,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, + update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); + } + +- walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); ++ walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); + + if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) + goto restart; +@@ -4238,7 +4257,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ + } while (err == -EAGAIN); + } + +-static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) ++static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc) + { + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; + +@@ -4246,7 +4265,7 @@ static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) + VM_WARN_ON_ONCE(walk); + + walk = &pgdat->mm_walk; +- } else if (!pgdat && !walk) { ++ } else if (!walk && force_alloc) { + VM_WARN_ON_ONCE(current_is_kswapd()); + + walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); +@@ -4274,7 +4293,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) + { + int zone; + int remaining = MAX_LRU_BATCH; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + int new_gen, 
old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + + if (type == LRU_GEN_ANON && !can_swap) +@@ -4282,7 +4301,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) + + /* prevent cold/hot inversion if force_scan is true */ + for (zone = 0; zone < MAX_NR_ZONES; zone++) { +- struct list_head *head = &lrugen->lists[old_gen][type][zone]; ++ struct list_head *head = &lrugen->folios[old_gen][type][zone]; + + while (!list_empty(head)) { + struct folio *folio = lru_to_folio(head); +@@ -4293,7 +4312,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) + VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); + + new_gen = folio_inc_gen(lruvec, folio, false); +- list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]); ++ list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); + + if (!--remaining) + return false; +@@ -4310,7 +4329,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) + { + int gen, type, zone; + bool success = false; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + DEFINE_MIN_SEQ(lruvec); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); +@@ -4321,7 +4340,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) + gen = lru_gen_from_seq(min_seq[type]); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { +- if (!list_empty(&lrugen->lists[gen][type][zone])) ++ if (!list_empty(&lrugen->folios[gen][type][zone])) + goto next; + } + +@@ -4331,7 +4350,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) + ; + } + +- /* see the comment on lru_gen_struct */ ++ /* see the comment on lru_gen_folio */ + if (can_swap) { + min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); + min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); +@@ -4353,7 +4372,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) + { + int prev, next; + int type, zone; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + spin_lock_irq(&lruvec->lru_lock); + +@@ -4411,7 +4430,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + bool success; + struct lru_gen_mm_walk *walk; + struct mm_struct *mm = NULL; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); + +@@ -4427,12 +4446,12 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + * handful of PTEs. Spreading the work out over a period of time usually + * is less efficient, but it avoids bursty page faults. 
+ */ +- if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { ++ if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; + } + +- walk = set_mm_walk(NULL); ++ walk = set_mm_walk(NULL, true); + if (!walk) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; +@@ -4455,8 +4474,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + if (sc->priority <= DEF_PRIORITY - 2) + wait_event_killable(lruvec->mm_state.wait, + max_seq < READ_ONCE(lrugen->max_seq)); +- +- return max_seq < READ_ONCE(lrugen->max_seq); ++ return false; + } + + VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); +@@ -4469,97 +4487,56 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + return true; + } + +-static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, +- struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) ++/****************************************************************************** ++ * working set protection ++ ******************************************************************************/ ++ ++static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) + { + int gen, type, zone; +- unsigned long old = 0; +- unsigned long young = 0; + unsigned long total = 0; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ bool can_swap = get_swappiness(lruvec, sc); ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); + + for (type = !can_swap; type < ANON_AND_FILE; type++) { + unsigned long seq; + + for (seq = min_seq[type]; seq <= max_seq; seq++) { +- unsigned long size = 0; +- + gen = lru_gen_from_seq(seq); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) +- size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); +- +- total += size; +- if (seq == max_seq) +- young += size; +- else if (seq + MIN_NR_GENS == max_seq) +- old += size; ++ total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + } + } + +- /* try to scrape all its memory if this memcg was deleted */ +- *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; +- +- /* +- * The aging tries to be lazy to reduce the overhead, while the eviction +- * stalls when the number of generations reaches MIN_NR_GENS. Hence, the +- * ideal number of generations is MIN_NR_GENS+1. +- */ +- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) +- return true; +- if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) +- return false; +- +- /* +- * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) +- * of the total number of pages for each generation. A reasonable range +- * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The +- * aging cares about the upper bound of hot pages, while the eviction +- * cares about the lower bound of cold pages. +- */ +- if (young * MIN_NR_GENS > total) +- return true; +- if (old * (MIN_NR_GENS + 2) < total) +- return true; +- +- return false; ++ /* whether the size is big enough to be helpful */ ++ return mem_cgroup_online(memcg) ? 
(total >> sc->priority) : total; + } + +-static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) ++static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, ++ unsigned long min_ttl) + { +- bool need_aging; +- unsigned long nr_to_scan; +- int swappiness = get_swappiness(lruvec, sc); ++ int gen; ++ unsigned long birth; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); +- DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + +- VM_WARN_ON_ONCE(sc->memcg_low_reclaim); +- +- mem_cgroup_calculate_protection(NULL, memcg); ++ /* see the comment on lru_gen_folio */ ++ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); ++ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + +- if (mem_cgroup_below_min(NULL, memcg)) ++ if (time_is_after_jiffies(birth + min_ttl)) + return false; + +- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); +- +- if (min_ttl) { +- int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); +- unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); +- +- if (time_is_after_jiffies(birth + min_ttl)) +- return false; +- +- /* the size is likely too small to be helpful */ +- if (!nr_to_scan && sc->priority != DEF_PRIORITY) +- return false; +- } ++ if (!lruvec_is_sizable(lruvec, sc)) ++ return false; + +- if (need_aging) +- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); ++ mem_cgroup_calculate_protection(NULL, memcg); + +- return true; ++ return !mem_cgroup_below_min(NULL, memcg); + } + + /* to protect the working set of the last N jiffies */ +@@ -4572,46 +4549,30 @@ static unsigned long lru_gen_min_ttl __read_mostly; + static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + { + struct mem_cgroup *memcg; +- bool success = false; + unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); + + VM_WARN_ON_ONCE(!current_is_kswapd()); + +- sc->last_reclaimed = sc->nr_reclaimed; +- +- /* +- * To reduce the chance of going into the aging path, which can be +- * costly, optimistically skip it if the flag below was cleared in the +- * eviction path. This improves the overall performance when multiple +- * memcgs are available. +- */ +- if (!sc->memcgs_need_aging) { +- sc->memcgs_need_aging = true; ++ /* check the order to exclude compaction-induced reclaim */ ++ if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) + return; +- } +- +- set_mm_walk(pgdat); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + +- if (age_lruvec(lruvec, sc, min_ttl)) +- success = true; ++ if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) { ++ mem_cgroup_iter_break(NULL, memcg); ++ return; ++ } + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + +- clear_mm_walk(); +- +- /* check the order to exclude compaction-induced reclaim */ +- if (success || !min_ttl || sc->order) +- return; +- + /* + * The main goal is to OOM kill if every generation from all memcgs is + * younger than min_ttl. However, another possibility is all memcgs are +- * either below min or empty. ++ * either too small or below min. 
+ */ + if (mutex_trylock(&oom_lock)) { + struct oom_control oc = { +@@ -4624,6 +4585,10 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + } + } + ++/****************************************************************************** ++ * rmap/PT walk feedback ++ ******************************************************************************/ ++ + /* + * This function exploits spatial locality when shrink_folio_list() walks the + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If +@@ -4634,13 +4599,12 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + { + int i; +- pte_t *pte; + unsigned long start; + unsigned long end; +- unsigned long addr; + struct lru_gen_mm_walk *walk; + int young = 0; +- unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; ++ pte_t *pte = pvmw->pte; ++ unsigned long addr = pvmw->address; + struct folio *folio = pfn_folio(pvmw->pfn); + struct mem_cgroup *memcg = folio_memcg(folio); + struct pglist_data *pgdat = folio_pgdat(folio); +@@ -4657,25 +4621,28 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + /* avoid taking the LRU lock under the PTL when possible */ + walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; + +- start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); +- end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; ++ start = max(addr & PMD_MASK, pvmw->vma->vm_start); ++ end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; + + if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { +- if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) ++ if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) + end = start + MIN_LRU_BATCH * PAGE_SIZE; +- else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) ++ else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2) + start = end - MIN_LRU_BATCH * PAGE_SIZE; + else { +- start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; +- end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; ++ start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2; ++ end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2; + } + } + +- pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; ++ /* folio_update_gen() requires stable folio_memcg() */ ++ if (!mem_cgroup_trylock_pages(memcg)) ++ return; + +- rcu_read_lock(); + arch_enter_lazy_mmu_mode(); + ++ pte -= (addr - start) / PAGE_SIZE; ++ + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { + unsigned long pfn; + +@@ -4700,58 +4667,171 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + !folio_test_swapcache(folio))) + folio_mark_dirty(folio); + ++ if (walk) { ++ old_gen = folio_update_gen(folio, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ update_batch_size(walk, folio, old_gen, new_gen); ++ ++ continue; ++ } ++ + old_gen = folio_lru_gen(folio); + if (old_gen < 0) + folio_set_referenced(folio); + else if (old_gen != new_gen) +- __set_bit(i, bitmap); ++ folio_activate(folio); + } + + arch_leave_lazy_mmu_mode(); +- rcu_read_unlock(); ++ mem_cgroup_unlock_pages(); + + /* feedback from rmap walkers to page table walkers */ + if (suitable_to_scan(i, young)) + update_bloom_filter(lruvec, max_seq, pvmw->pmd); ++} + +- if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { +- for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { +- folio = pfn_folio(pte_pfn(pte[i])); +- folio_activate(folio); +- } +- return; 
++/****************************************************************************** ++ * memcg LRU ++ ******************************************************************************/ ++ ++/* see the comment on MEMCG_NR_GENS */ ++enum { ++ MEMCG_LRU_NOP, ++ MEMCG_LRU_HEAD, ++ MEMCG_LRU_TAIL, ++ MEMCG_LRU_OLD, ++ MEMCG_LRU_YOUNG, ++}; ++ ++#ifdef CONFIG_MEMCG ++ ++static int lru_gen_memcg_seg(struct lruvec *lruvec) ++{ ++ return READ_ONCE(lruvec->lrugen.seg); ++} ++ ++static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) ++{ ++ int seg; ++ int old, new; ++ int bin = get_random_u32_below(MEMCG_NR_BINS); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ spin_lock(&pgdat->memcg_lru.lock); ++ ++ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); ++ ++ seg = 0; ++ new = old = lruvec->lrugen.gen; ++ ++ /* see the comment on MEMCG_NR_GENS */ ++ if (op == MEMCG_LRU_HEAD) ++ seg = MEMCG_LRU_HEAD; ++ else if (op == MEMCG_LRU_TAIL) ++ seg = MEMCG_LRU_TAIL; ++ else if (op == MEMCG_LRU_OLD) ++ new = get_memcg_gen(pgdat->memcg_lru.seq); ++ else if (op == MEMCG_LRU_YOUNG) ++ new = get_memcg_gen(pgdat->memcg_lru.seq + 1); ++ else ++ VM_WARN_ON_ONCE(true); ++ ++ hlist_nulls_del_rcu(&lruvec->lrugen.list); ++ ++ if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) ++ hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); ++ else ++ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); ++ ++ pgdat->memcg_lru.nr_memcgs[old]--; ++ pgdat->memcg_lru.nr_memcgs[new]++; ++ ++ lruvec->lrugen.gen = new; ++ WRITE_ONCE(lruvec->lrugen.seg, seg); ++ ++ if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) ++ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); ++ ++ spin_unlock(&pgdat->memcg_lru.lock); ++} ++ ++void lru_gen_online_memcg(struct mem_cgroup *memcg) ++{ ++ int gen; ++ int nid; ++ int bin = get_random_u32_below(MEMCG_NR_BINS); ++ ++ for_each_node(nid) { ++ struct pglist_data *pgdat = NODE_DATA(nid); ++ struct lruvec *lruvec = get_lruvec(memcg, nid); ++ ++ spin_lock(&pgdat->memcg_lru.lock); ++ ++ VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); ++ ++ gen = get_memcg_gen(pgdat->memcg_lru.seq); ++ ++ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); ++ pgdat->memcg_lru.nr_memcgs[gen]++; ++ ++ lruvec->lrugen.gen = gen; ++ ++ spin_unlock(&pgdat->memcg_lru.lock); + } ++} + +- /* folio_update_gen() requires stable folio_memcg() */ +- if (!mem_cgroup_trylock_pages(memcg)) +- return; ++void lru_gen_offline_memcg(struct mem_cgroup *memcg) ++{ ++ int nid; + +- if (!walk) { +- spin_lock_irq(&lruvec->lru_lock); +- new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(memcg, nid); ++ ++ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); + } ++} + +- for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { +- folio = pfn_folio(pte_pfn(pte[i])); +- if (folio_memcg_rcu(folio) != memcg) +- continue; ++void lru_gen_release_memcg(struct mem_cgroup *memcg) ++{ ++ int gen; ++ int nid; + +- old_gen = folio_update_gen(folio, new_gen); +- if (old_gen < 0 || old_gen == new_gen) +- continue; ++ for_each_node(nid) { ++ struct pglist_data *pgdat = NODE_DATA(nid); ++ struct lruvec *lruvec = get_lruvec(memcg, nid); + +- if (walk) +- update_batch_size(walk, folio, old_gen, new_gen); +- else +- lru_gen_update_size(lruvec, folio, old_gen, new_gen); ++ spin_lock(&pgdat->memcg_lru.lock); ++ ++ 
VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); ++ ++ gen = lruvec->lrugen.gen; ++ ++ hlist_nulls_del_rcu(&lruvec->lrugen.list); ++ pgdat->memcg_lru.nr_memcgs[gen]--; ++ ++ if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) ++ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); ++ ++ spin_unlock(&pgdat->memcg_lru.lock); + } ++} ++ ++void lru_gen_soft_reclaim(struct lruvec *lruvec) ++{ ++ /* see the comment on MEMCG_NR_GENS */ ++ if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) ++ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); ++} + +- if (!walk) +- spin_unlock_irq(&lruvec->lru_lock); ++#else /* !CONFIG_MEMCG */ + +- mem_cgroup_unlock_pages(); ++static int lru_gen_memcg_seg(struct lruvec *lruvec) ++{ ++ return 0; + } + ++#endif ++ + /****************************************************************************** + * the eviction + ******************************************************************************/ +@@ -4765,7 +4845,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) + int delta = folio_nr_pages(folio); + int refs = folio_lru_refs(folio); + int tier = lru_tier_from_refs(refs); +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); + +@@ -4790,7 +4870,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) + + /* promoted */ + if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { +- list_move(&folio->lru, &lrugen->lists[gen][type][zone]); ++ list_move(&folio->lru, &lrugen->folios[gen][type][zone]); + return true; + } + +@@ -4799,7 +4879,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + gen = folio_inc_gen(lruvec, folio, false); +- list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]); ++ list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); + + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], + lrugen->protected[hist][type][tier - 1] + delta); +@@ -4811,7 +4891,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) + if (folio_test_locked(folio) || folio_test_writeback(folio) || + (type == LRU_GEN_FILE && folio_test_dirty(folio))) { + gen = folio_inc_gen(lruvec, folio, true); +- list_move(&folio->lru, &lrugen->lists[gen][type][zone]); ++ list_move(&folio->lru, &lrugen->folios[gen][type][zone]); + return true; + } + +@@ -4822,12 +4902,8 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca + { + bool success; + +- /* unmapping inhibited */ +- if (!sc->may_unmap && folio_mapped(folio)) +- return false; +- + /* swapping inhibited */ +- if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && ++ if (!(sc->gfp_mask & __GFP_IO) && + (folio_test_dirty(folio) || + (folio_test_anon(folio) && !folio_test_swapcache(folio)))) + return false; +@@ -4865,7 +4941,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, + int scanned = 0; + int isolated = 0; + int remaining = MAX_LRU_BATCH; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + VM_WARN_ON_ONCE(!list_empty(list)); +@@ -4878,7 +4954,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, + for (zone = sc->reclaim_idx; zone >= 0; zone--) { + LIST_HEAD(moved); + int skipped = 0; +- struct list_head *head = 
&lrugen->lists[gen][type][zone]; ++ struct list_head *head = &lrugen->folios[gen][type][zone]; + + while (!list_empty(head)) { + struct folio *folio = lru_to_folio(head); +@@ -4924,9 +5000,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, + __count_vm_events(PGSCAN_ANON + type, isolated); + + /* +- * There might not be eligible pages due to reclaim_idx, may_unmap and +- * may_writepage. Check the remaining to prevent livelock if it's not +- * making progress. ++ * There might not be eligible folios due to reclaim_idx. Check the ++ * remaining to prevent livelock if it's not making progress. + */ + return isolated || !remaining ? scanned : 0; + } +@@ -5021,8 +5096,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw + return scanned; + } + +-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, +- bool *need_swapping) ++static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) + { + int type; + int scanned; +@@ -5111,153 +5185,348 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap + goto retry; + } + +- if (need_swapping && type == LRU_GEN_ANON) +- *need_swapping = true; +- + return scanned; + } + ++static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, ++ struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) ++{ ++ int gen, type, zone; ++ unsigned long old = 0; ++ unsigned long young = 0; ++ unsigned long total = 0; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ /* whether this lruvec is completely out of cold folios */ ++ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { ++ *nr_to_scan = 0; ++ return true; ++ } ++ ++ for (type = !can_swap; type < ANON_AND_FILE; type++) { ++ unsigned long seq; ++ ++ for (seq = min_seq[type]; seq <= max_seq; seq++) { ++ unsigned long size = 0; ++ ++ gen = lru_gen_from_seq(seq); ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) ++ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); ++ ++ total += size; ++ if (seq == max_seq) ++ young += size; ++ else if (seq + MIN_NR_GENS == max_seq) ++ old += size; ++ } ++ } ++ ++ /* try to scrape all its memory if this memcg was deleted */ ++ *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; ++ ++ /* ++ * The aging tries to be lazy to reduce the overhead, while the eviction ++ * stalls when the number of generations reaches MIN_NR_GENS. Hence, the ++ * ideal number of generations is MIN_NR_GENS+1. ++ */ ++ if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) ++ return false; ++ ++ /* ++ * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) ++ * of the total number of pages for each generation. A reasonable range ++ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The ++ * aging cares about the upper bound of hot pages, while the eviction ++ * cares about the lower bound of cold pages. ++ */ ++ if (young * MIN_NR_GENS > total) ++ return true; ++ if (old * (MIN_NR_GENS + 2) < total) ++ return true; ++ ++ return false; ++} ++ + /* + * For future optimizations: + * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg + * reclaim. 
+ */ +-static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, +- bool can_swap, bool *need_aging) ++static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) + { + unsigned long nr_to_scan; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); +- DEFINE_MIN_SEQ(lruvec); + +- if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) || +- (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) && +- !sc->memcg_low_reclaim)) ++ if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) + return 0; + +- *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); +- if (!*need_aging) ++ if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) + return nr_to_scan; + + /* skip the aging path at the default priority */ + if (sc->priority == DEF_PRIORITY) +- goto done; ++ return nr_to_scan; + +- /* leave the work to lru_gen_age_node() */ +- if (current_is_kswapd()) +- return 0; ++ /* skip this lruvec as it's low on cold folios */ ++ return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0; ++} + +- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) +- return nr_to_scan; +-done: +- return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; ++static unsigned long get_nr_to_reclaim(struct scan_control *sc) ++{ ++ /* don't abort memcg reclaim to ensure fairness */ ++ if (!global_reclaim(sc)) ++ return -1; ++ ++ return max(sc->nr_to_reclaim, compact_gap(sc->order)); + } + +-static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, +- struct scan_control *sc, bool need_swapping) ++static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + { +- int i; +- DEFINE_MAX_SEQ(lruvec); ++ long nr_to_scan; ++ unsigned long scanned = 0; ++ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); ++ int swappiness = get_swappiness(lruvec, sc); + +- if (!current_is_kswapd()) { +- /* age each memcg at most once to ensure fairness */ +- if (max_seq - seq > 1) +- return true; ++ /* clean file folios are more likely to exist */ ++ if (swappiness && !(sc->gfp_mask & __GFP_IO)) ++ swappiness = 1; + +- /* over-swapping can increase allocation latency */ +- if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping) +- return true; ++ while (true) { ++ int delta; + +- /* give this thread a chance to exit and free its memory */ +- if (fatal_signal_pending(current)) { +- sc->nr_reclaimed += MIN_LRU_BATCH; +- return true; +- } ++ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); ++ if (nr_to_scan <= 0) ++ break; + +- if (cgroup_reclaim(sc)) +- return false; +- } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim) +- return false; ++ delta = evict_folios(lruvec, sc, swappiness); ++ if (!delta) ++ break; + +- /* keep scanning at low priorities to ensure fairness */ +- if (sc->priority > DEF_PRIORITY - 2) +- return false; ++ scanned += delta; ++ if (scanned >= nr_to_scan) ++ break; + +- /* +- * A minimum amount of work was done under global memory pressure. For +- * kswapd, it may be overshooting. For direct reclaim, the allocation +- * may succeed if all suitable zones are somewhat safe. In either case, +- * it's better to stop now, and restart later if necessary. 
+- */ +- for (i = 0; i <= sc->reclaim_idx; i++) { +- unsigned long wmark; +- struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; ++ if (sc->nr_reclaimed >= nr_to_reclaim) ++ break; + +- if (!managed_zone(zone)) ++ cond_resched(); ++ } ++ ++ /* whether try_to_inc_max_seq() was successful */ ++ return nr_to_scan < 0; ++} ++ ++static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ bool success; ++ unsigned long scanned = sc->nr_scanned; ++ unsigned long reclaimed = sc->nr_reclaimed; ++ int seg = lru_gen_memcg_seg(lruvec); ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ /* see the comment on MEMCG_NR_GENS */ ++ if (!lruvec_is_sizable(lruvec, sc)) ++ return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; ++ ++ mem_cgroup_calculate_protection(NULL, memcg); ++ ++ if (mem_cgroup_below_min(NULL, memcg)) ++ return MEMCG_LRU_YOUNG; ++ ++ if (mem_cgroup_below_low(NULL, memcg)) { ++ /* see the comment on MEMCG_NR_GENS */ ++ if (seg != MEMCG_LRU_TAIL) ++ return MEMCG_LRU_TAIL; ++ ++ memcg_memory_event(memcg, MEMCG_LOW); ++ } ++ ++ success = try_to_shrink_lruvec(lruvec, sc); ++ ++ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); ++ ++ if (!sc->proactive) ++ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, ++ sc->nr_reclaimed - reclaimed); ++ ++ sc->nr_reclaimed += current->reclaim_state->reclaimed_slab; ++ current->reclaim_state->reclaimed_slab = 0; ++ ++ return success ? MEMCG_LRU_YOUNG : 0; ++} ++ ++#ifdef CONFIG_MEMCG ++ ++static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ int op; ++ int gen; ++ int bin; ++ int first_bin; ++ struct lruvec *lruvec; ++ struct lru_gen_folio *lrugen; ++ struct mem_cgroup *memcg; ++ const struct hlist_nulls_node *pos; ++ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); ++ ++ bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); ++restart: ++ op = 0; ++ memcg = NULL; ++ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); ++ ++ rcu_read_lock(); ++ ++ hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { ++ if (op) ++ lru_gen_rotate_memcg(lruvec, op); ++ ++ mem_cgroup_put(memcg); ++ ++ lruvec = container_of(lrugen, struct lruvec, lrugen); ++ memcg = lruvec_memcg(lruvec); ++ ++ if (!mem_cgroup_tryget(memcg)) { ++ op = 0; ++ memcg = NULL; + continue; ++ } + +- wmark = current_is_kswapd() ? 
high_wmark_pages(zone) : low_wmark_pages(zone); +- if (wmark > zone_page_state(zone, NR_FREE_PAGES)) +- return false; ++ rcu_read_unlock(); ++ ++ op = shrink_one(lruvec, sc); ++ ++ rcu_read_lock(); ++ ++ if (sc->nr_reclaimed >= nr_to_reclaim) ++ break; + } + +- sc->nr_reclaimed += MIN_LRU_BATCH; ++ rcu_read_unlock(); + +- return true; ++ if (op) ++ lru_gen_rotate_memcg(lruvec, op); ++ ++ mem_cgroup_put(memcg); ++ ++ if (sc->nr_reclaimed >= nr_to_reclaim) ++ return; ++ ++ /* restart if raced with lru_gen_rotate_memcg() */ ++ if (gen != get_nulls_value(pos)) ++ goto restart; ++ ++ /* try the rest of the bins of the current generation */ ++ bin = get_memcg_bin(bin + 1); ++ if (bin != first_bin) ++ goto restart; + } + + static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + { + struct blk_plug plug; +- bool need_aging = false; +- bool need_swapping = false; +- unsigned long scanned = 0; +- unsigned long reclaimed = sc->nr_reclaimed; +- DEFINE_MAX_SEQ(lruvec); ++ ++ VM_WARN_ON_ONCE(global_reclaim(sc)); ++ VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); + + lru_add_drain(); + + blk_start_plug(&plug); + +- set_mm_walk(lruvec_pgdat(lruvec)); ++ set_mm_walk(NULL, sc->proactive); + +- while (true) { +- int delta; +- int swappiness; +- unsigned long nr_to_scan; ++ if (try_to_shrink_lruvec(lruvec, sc)) ++ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); + +- if (sc->may_swap) +- swappiness = get_swappiness(lruvec, sc); +- else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) +- swappiness = 1; +- else +- swappiness = 0; ++ clear_mm_walk(); + +- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging); +- if (!nr_to_scan) +- goto done; ++ blk_finish_plug(&plug); ++} + +- delta = evict_folios(lruvec, sc, swappiness, &need_swapping); +- if (!delta) +- goto done; ++#else /* !CONFIG_MEMCG */ + +- scanned += delta; +- if (scanned >= nr_to_scan) +- break; ++static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ BUILD_BUG(); ++} + +- if (should_abort_scan(lruvec, max_seq, sc, need_swapping)) +- break; ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ BUILD_BUG(); ++} + +- cond_resched(); +- } ++#endif ++ ++static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ int priority; ++ unsigned long reclaimable; ++ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); ++ ++ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) ++ return; ++ /* ++ * Determine the initial priority based on ((total / MEMCG_NR_GENS) >> ++ * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the ++ * estimated reclaimed_to_scanned_ratio = inactive / total. ++ */ ++ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); ++ if (get_swappiness(lruvec, sc)) ++ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); ++ ++ reclaimable /= MEMCG_NR_GENS; ++ ++ /* round down reclaimable and round up sc->nr_to_reclaim */ ++ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); ++ ++ sc->priority = clamp(priority, 0, DEF_PRIORITY); ++} ++ ++static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ struct blk_plug plug; ++ unsigned long reclaimed = sc->nr_reclaimed; ++ ++ VM_WARN_ON_ONCE(!global_reclaim(sc)); ++ ++ /* ++ * Unmapped clean folios are already prioritized. Scanning for more of ++ * them is likely futile and can cause high reclaim latency when there ++ * is a large number of memcgs. 
++ */ ++ if (!sc->may_writepage || !sc->may_unmap) ++ goto done; ++ ++ lru_add_drain(); ++ ++ blk_start_plug(&plug); ++ ++ set_mm_walk(pgdat, sc->proactive); ++ ++ set_initial_priority(pgdat, sc); ++ ++ if (current_is_kswapd()) ++ sc->nr_reclaimed = 0; ++ ++ if (mem_cgroup_disabled()) ++ shrink_one(&pgdat->__lruvec, sc); ++ else ++ shrink_many(pgdat, sc); ++ ++ if (current_is_kswapd()) ++ sc->nr_reclaimed += reclaimed; + +- /* see the comment in lru_gen_age_node() */ +- if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging) +- sc->memcgs_need_aging = false; +-done: + clear_mm_walk(); + + blk_finish_plug(&plug); ++done: ++ /* kswapd should never fail */ ++ pgdat->kswapd_failures = 0; + } + + /****************************************************************************** +@@ -5266,7 +5535,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + + static bool __maybe_unused state_is_valid(struct lruvec *lruvec) + { +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + if (lrugen->enabled) { + enum lru_list lru; +@@ -5279,7 +5548,7 @@ static bool __maybe_unused state_is_valid(struct lruvec *lruvec) + int gen, type, zone; + + for_each_gen_type_zone(gen, type, zone) { +- if (!list_empty(&lrugen->lists[gen][type][zone])) ++ if (!list_empty(&lrugen->folios[gen][type][zone])) + return false; + } + } +@@ -5324,7 +5593,7 @@ static bool drain_evictable(struct lruvec *lruvec) + int remaining = MAX_LRU_BATCH; + + for_each_gen_type_zone(gen, type, zone) { +- struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; ++ struct list_head *head = &lruvec->lrugen.folios[gen][type][zone]; + + while (!list_empty(head)) { + bool success; +@@ -5545,7 +5814,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, + int i; + int type, tier; + int hist = lru_hist_from_seq(seq); +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + seq_printf(m, " %10d", tier); +@@ -5595,7 +5864,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) + unsigned long seq; + bool full = !debugfs_real_fops(m->file)->write; + struct lruvec *lruvec = v; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + int nid = lruvec_pgdat(lruvec)->node_id; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); +@@ -5692,7 +5961,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co + if (sc->nr_reclaimed >= nr_to_reclaim) + return 0; + +- if (!evict_folios(lruvec, sc, swappiness, NULL)) ++ if (!evict_folios(lruvec, sc, swappiness)) + return 0; + + cond_resched(); +@@ -5713,11 +5982,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, + + if (!mem_cgroup_disabled()) { + rcu_read_lock(); ++ + memcg = mem_cgroup_from_id(memcg_id); +-#ifdef CONFIG_MEMCG +- if (memcg && !css_tryget(&memcg->css)) ++ if (!mem_cgroup_tryget(memcg)) + memcg = NULL; +-#endif ++ + rcu_read_unlock(); + + if (!memcg) +@@ -5777,7 +6046,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, + set_task_reclaim_state(current, &sc.reclaim_state); + flags = memalloc_noreclaim_save(); + blk_start_plug(&plug); +- if (!set_mm_walk(NULL)) { ++ if (!set_mm_walk(NULL, true)) { + err = -ENOMEM; + goto done; + } +@@ -5849,7 +6118,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) + { + int i; + int gen, type, 
zone; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + lrugen->max_seq = MIN_NR_GENS + 1; + lrugen->enabled = lru_gen_enabled(); +@@ -5858,13 +6127,26 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) + lrugen->timestamps[i] = jiffies; + + for_each_gen_type_zone(gen, type, zone) +- INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); ++ INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); + + lruvec->mm_state.seq = MIN_NR_GENS; + init_waitqueue_head(&lruvec->mm_state.wait); + } + + #ifdef CONFIG_MEMCG ++ ++void lru_gen_init_pgdat(struct pglist_data *pgdat) ++{ ++ int i, j; ++ ++ spin_lock_init(&pgdat->memcg_lru.lock); ++ ++ for (i = 0; i < MEMCG_NR_GENS; i++) { ++ for (j = 0; j < MEMCG_NR_BINS; j++) ++ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); ++ } ++} ++ + void lru_gen_init_memcg(struct mem_cgroup *memcg) + { + INIT_LIST_HEAD(&memcg->mm_list.fifo); +@@ -5876,19 +6158,25 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) + int i; + int nid; + ++ VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo)); ++ + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + ++ VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers); + VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, + sizeof(lruvec->lrugen.nr_pages))); + ++ lruvec->lrugen.list.next = LIST_POISON1; ++ + for (i = 0; i < NR_BLOOM_FILTERS; i++) { + bitmap_free(lruvec->mm_state.filters[i]); + lruvec->mm_state.filters[i] = NULL; + } + } + } +-#endif ++ ++#endif /* CONFIG_MEMCG */ + + static int __init init_lru_gen(void) + { +@@ -5915,6 +6203,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + { + } + ++static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++} ++ + #endif /* CONFIG_LRU_GEN */ + + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +@@ -5928,7 +6220,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + bool proportional_reclaim; + struct blk_plug plug; + +- if (lru_gen_enabled()) { ++ if (lru_gen_enabled() && !global_reclaim(sc)) { + lru_gen_shrink_lruvec(lruvec, sc); + return; + } +@@ -6171,6 +6463,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) + struct lruvec *target_lruvec; + bool reclaimable = false; + ++ if (lru_gen_enabled() && global_reclaim(sc)) { ++ lru_gen_shrink_node(pgdat, sc); ++ return; ++ } ++ + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + again: +diff --git a/mm/workingset.c b/mm/workingset.c +index 1a86645b7b3c..fd666584515c 100644 +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct folio *folio) + unsigned long token; + unsigned long min_seq; + struct lruvec *lruvec; +- struct lru_gen_struct *lrugen; ++ struct lru_gen_folio *lrugen; + int type = folio_is_file_lru(folio); + int delta = folio_nr_pages(folio); + int refs = folio_lru_refs(folio); +@@ -252,7 +252,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) + unsigned long token; + unsigned long min_seq; + struct lruvec *lruvec; +- struct lru_gen_struct *lrugen; ++ struct lru_gen_folio *lrugen; + struct mem_cgroup *memcg; + struct pglist_data *pgdat; + int type = folio_is_file_lru(folio); +diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c +index 81fa7ec2e66a..1f36bc1c5d36 100644 +--- a/tools/testing/radix-tree/maple.c ++++ b/tools/testing/radix-tree/maple.c +@@ -173,11 +173,11 @@ static noinline void 
check_new_node(struct maple_tree *mt) + + if (!MAPLE_32BIT) { + if (i >= 35) +- e = i - 35; ++ e = i - 34; + else if (i >= 5) +- e = i - 5; ++ e = i - 4; + else if (i >= 2) +- e = i - 2; ++ e = i - 1; + } else { + if (i >= 4) + e = i - 4; +@@ -305,17 +305,17 @@ static noinline void check_new_node(struct maple_tree *mt) + MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); + MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); +- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); ++ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); + + mn = mas_pop_node(&mas); /* get the next node. */ + MT_BUG_ON(mt, mn == NULL); + MT_BUG_ON(mt, not_empty(mn)); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS); +- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 2); ++ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + + mas_push_node(&mas, mn); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); +- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); ++ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); + + /* Check the limit of pop/push/pop */ + mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 2); /* Request */ +@@ -323,14 +323,14 @@ static noinline void check_new_node(struct maple_tree *mt) + MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); + MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); + MT_BUG_ON(mt, mas_alloc_req(&mas)); +- MT_BUG_ON(mt, mas.alloc->node_count); ++ MT_BUG_ON(mt, mas.alloc->node_count != 1); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, not_empty(mn)); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); +- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); ++ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); + mas_push_node(&mas, mn); +- MT_BUG_ON(mt, mas.alloc->node_count); ++ MT_BUG_ON(mt, mas.alloc->node_count != 1); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, not_empty(mn)); +-- +2.39.2 + +From d3f266dbba701440ba392ceaf1b4cad9194dcdc7 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 17 Feb 2023 13:41:20 +0100 +Subject: [PATCH 11/15] mm/kvm: lockless accessed bit harvest + +TLDR +==== +This patchset RCU-protects KVM page tables and compare-and-exchanges +KVM PTEs with the accessed bit set by hardware. It significantly +improves the performance of guests when the host is under heavy +memory pressure. + +ChromeOS has been using a similar approach [1] since mid 2021 and it +was proven successful on tens of millions devices. + +[1] https://crrev.com/c/2987928 + +Overview +======== +The goal of this patchset is to optimize the performance of guests +when the host memory is overcommitted. It focuses on the vast +majority of VMs that are not nested and run on hardware that sets the +accessed bit in KVM page tables. + +Note that nested VMs and hardware that does not support the accessed +bit are both out of scope. + +This patchset relies on two techniques, RCU and cmpxchg, to safely +test and clear the accessed bit without taking kvm->mmu_lock. The +former protects KVM page tables from being freed while the latter +clears the accessed bit atomically against both hardware and other +software page table walkers. + +A new MMU notifier API, mmu_notifier_test_clear_young(), is +introduced. It follows two design patterns: fallback and batching. 
+For any unsupported cases, it can optionally fall back to +mmu_notifier_ops->clear_young(). For a range of KVM PTEs, it can test +or test and clear their accessed bits according to a bitmap provided +by the caller. + +This patchset only applies mmu_notifier_test_clear_young() to MGLRU. +A follow-up patchset will apply it to /proc/PID/pagemap and +/prod/PID/clear_refs. + +Evaluation +========== +An existing selftest can quickly demonstrate the effectiveness of +this patchset. On a generic workstation equipped with 64 CPUs and +256GB DRAM: + + $ sudo max_guest_memory_test -c 64 -m 256 -s 256 + + MGLRU run2 + --------------- + Before ~600s + After ~50s + Off ~250s + + kswapd (MGLRU before) + 100.00% balance_pgdat + 100.00% shrink_node + 100.00% shrink_one + 99.97% try_to_shrink_lruvec + 99.06% evict_folios + 97.41% shrink_folio_list + 31.33% folio_referenced + 31.06% rmap_walk_file + 30.89% folio_referenced_one + 20.83% __mmu_notifier_clear_flush_young + 20.54% kvm_mmu_notifier_clear_flush_young + => 19.34% _raw_write_lock + + kswapd (MGLRU after) + 100.00% balance_pgdat + 100.00% shrink_node + 100.00% shrink_one + 99.97% try_to_shrink_lruvec + 99.51% evict_folios + 71.70% shrink_folio_list + 7.08% folio_referenced + 6.78% rmap_walk_file + 6.72% folio_referenced_one + 5.60% lru_gen_look_around + => 1.53% __mmu_notifier_test_clear_young + + kswapd (MGLRU off) + 100.00% balance_pgdat + 100.00% shrink_node + 99.92% shrink_lruvec + 69.95% shrink_folio_list + 19.35% folio_referenced + 18.37% rmap_walk_file + 17.88% folio_referenced_one + 13.20% __mmu_notifier_clear_flush_young + 11.64% kvm_mmu_notifier_clear_flush_young + => 9.93% _raw_write_lock + 26.23% shrink_active_list + 25.50% folio_referenced + 25.35% rmap_walk_file + 25.28% folio_referenced_one + 23.87% __mmu_notifier_clear_flush_young + 23.69% kvm_mmu_notifier_clear_flush_young + => 18.98% _raw_write_lock + +Comprehensive benchmarks are coming soon. 
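+
+A minimal sketch of the per-range harvesting loop described above. This is
+an editor-added illustration, not code from this patchset: pte_of() and
+ACCESSED_BIT are hypothetical stand-ins for the arch-specific lookups and
+accessed-bit masks, and only the RCU plus
+__test_and_change_bit()/cmpxchg64() idiom mirrors the real implementations
+further down.
+
+	static void test_clear_young_range(struct kvm_gfn_range *range,
+					   gfn_t lsb_gfn, unsigned long *bitmap)
+	{
+		gfn_t gfn;
+
+		/* RCU keeps the page tables from being freed under us */
+		rcu_read_lock();
+
+		for (gfn = range->start; gfn < range->end; gfn++) {
+			u64 *ptep = pte_of(gfn);	/* hypothetical PTE lookup */
+			u64 old, new;
+
+			if (!ptep)
+				continue;
+
+			old = READ_ONCE(*ptep);
+			new = old & ~ACCESSED_BIT;	/* hypothetical A-bit mask */
+			if (old == new)			/* not young, nothing to do */
+				continue;
+
+			/* report "young" to the caller via the shared bitmap... */
+			if (__test_and_change_bit(lsb_gfn - gfn, bitmap))
+				/* ...and clear the accessed bit atomically */
+				cmpxchg64(ptep, old, new);
+		}
+
+		rcu_read_unlock();
+	}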
+ +Yu Zhao (5): + mm/kvm: add mmu_notifier_test_clear_young() + kvm/x86: add kvm_arch_test_clear_young() + kvm/arm64: add kvm_arch_test_clear_young() + kvm/powerpc: add kvm_arch_test_clear_young() + mm: multi-gen LRU: use mmu_notifier_test_clear_young() + +Signed-off-by: Peter Jung +--- + arch/arm64/include/asm/kvm_host.h | 7 ++ + arch/arm64/include/asm/kvm_pgtable.h | 8 ++ + arch/arm64/include/asm/stage2_pgtable.h | 43 ++++++++ + arch/arm64/kvm/arm.c | 1 + + arch/arm64/kvm/hyp/pgtable.c | 51 ++-------- + arch/arm64/kvm/mmu.c | 77 +++++++++++++- + arch/powerpc/include/asm/kvm_host.h | 18 ++++ + arch/powerpc/include/asm/kvm_ppc.h | 14 +-- + arch/powerpc/kvm/book3s.c | 7 ++ + arch/powerpc/kvm/book3s.h | 2 + + arch/powerpc/kvm/book3s_64_mmu_radix.c | 78 ++++++++++++++- + arch/powerpc/kvm/book3s_hv.c | 10 +- + arch/x86/include/asm/kvm_host.h | 27 +++++ + arch/x86/kvm/mmu/spte.h | 12 --- + arch/x86/kvm/mmu/tdp_mmu.c | 41 ++++++++ + include/linux/kvm_host.h | 29 ++++++ + include/linux/mmu_notifier.h | 40 ++++++++ + include/linux/mmzone.h | 6 +- + mm/mmu_notifier.c | 26 +++++ + mm/rmap.c | 8 +- + mm/vmscan.c | 127 +++++++++++++++++++++--- + virt/kvm/kvm_main.c | 58 +++++++++++ + 22 files changed, 593 insertions(+), 97 deletions(-) + +diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h +index 35a159d131b5..572bcd321586 100644 +--- a/arch/arm64/include/asm/kvm_host.h ++++ b/arch/arm64/include/asm/kvm_host.h +@@ -1031,4 +1031,11 @@ static inline void kvm_hyp_reserve(void) { } + void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu); + bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu); + ++/* see the comments on the generic kvm_arch_has_test_clear_young() */ ++#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young ++static inline bool kvm_arch_has_test_clear_young(void) ++{ ++ return IS_ENABLED(CONFIG_KVM) && cpu_has_hw_af() && !is_protected_kvm_enabled(); ++} ++ + #endif /* __ARM64_KVM_HOST_H__ */ +diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h +index 63f81b27a4e3..8c9a04388c88 100644 +--- a/arch/arm64/include/asm/kvm_pgtable.h ++++ b/arch/arm64/include/asm/kvm_pgtable.h +@@ -105,6 +105,7 @@ static inline bool kvm_level_supports_block_mapping(u32 level) + * @put_page: Decrement the refcount on a page. When the + * refcount reaches 0 the page is automatically + * freed. ++ * @put_page_rcu: RCU variant of put_page(). + * @page_count: Return the refcount of a page. + * @phys_to_virt: Convert a physical address into a virtual + * address mapped in the current context. +@@ -122,6 +123,7 @@ struct kvm_pgtable_mm_ops { + void (*free_removed_table)(void *addr, u32 level); + void (*get_page)(void *addr); + void (*put_page)(void *addr); ++ void (*put_page_rcu)(void *addr); + int (*page_count)(void *addr); + void* (*phys_to_virt)(phys_addr_t phys); + phys_addr_t (*virt_to_phys)(void *addr); +@@ -188,6 +190,12 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, + * children. + * @KVM_PGTABLE_WALK_SHARED: Indicates the page-tables may be shared + * with other software walkers. ++ * ++ * kvm_arch_test_clear_young() is a special case. It relies on two ++ * techniques, RCU and cmpxchg, to safely test and clear the accessed ++ * bit without taking the MMU lock. The former protects KVM page tables ++ * from being freed while the latter clears the accessed bit atomically ++ * against both the hardware and other software page table walkers. 
+ */ + enum kvm_pgtable_walk_flags { + KVM_PGTABLE_WALK_LEAF = BIT(0), +diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h +index c8dca8ae359c..350437661d4b 100644 +--- a/arch/arm64/include/asm/stage2_pgtable.h ++++ b/arch/arm64/include/asm/stage2_pgtable.h +@@ -30,4 +30,47 @@ + */ + #define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) + ++#define KVM_PTE_TYPE BIT(1) ++#define KVM_PTE_TYPE_BLOCK 0 ++#define KVM_PTE_TYPE_PAGE 1 ++#define KVM_PTE_TYPE_TABLE 1 ++ ++#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) ++ ++#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) ++#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) ++#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 ++#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 ++#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) ++#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 ++#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) ++ ++#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) ++#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) ++#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) ++#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) ++#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 ++#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) ++ ++#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) ++ ++#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) ++ ++#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) ++ ++#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) ++ ++#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ ++ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ ++ KVM_PTE_LEAF_ATTR_HI_S2_XN) ++ ++#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) ++#define KVM_MAX_OWNER_ID 1 ++ ++/* ++ * Used to indicate a pte for which a 'break-before-make' sequence is in ++ * progress. ++ */ ++#define KVM_INVALID_PTE_LOCKED BIT(10) ++ + #endif /* __ARM64_S2_PGTABLE_H_ */ +diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c +index 9c5573bc4614..6770bc47f5c9 100644 +--- a/arch/arm64/kvm/arm.c ++++ b/arch/arm64/kvm/arm.c +@@ -191,6 +191,7 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) + */ + void kvm_arch_destroy_vm(struct kvm *kvm) + { ++ kvm_free_stage2_pgd(&kvm->arch.mmu); + bitmap_free(kvm->arch.pmu_filter); + free_cpumask_var(kvm->arch.supported_cpus); + +diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c +index b11cf2c618a6..8d65ee4767f1 100644 +--- a/arch/arm64/kvm/hyp/pgtable.c ++++ b/arch/arm64/kvm/hyp/pgtable.c +@@ -12,49 +12,6 @@ + #include + + +-#define KVM_PTE_TYPE BIT(1) +-#define KVM_PTE_TYPE_BLOCK 0 +-#define KVM_PTE_TYPE_PAGE 1 +-#define KVM_PTE_TYPE_TABLE 1 +- +-#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) +- +-#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) +-#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) +-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 +-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 +-#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) +-#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 +-#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) +- +-#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) +-#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) +-#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) +-#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) +-#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 +-#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) +- +-#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) +- +-#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) +- +-#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) +- +-#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) +- +-#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ +- 
KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ +- KVM_PTE_LEAF_ATTR_HI_S2_XN) +- +-#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) +-#define KVM_MAX_OWNER_ID 1 +- +-/* +- * Used to indicate a pte for which a 'break-before-make' sequence is in +- * progress. +- */ +-#define KVM_INVALID_PTE_LOCKED BIT(10) +- + struct kvm_pgtable_walk_data { + struct kvm_pgtable_walker *walker; + +@@ -994,8 +951,12 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, + mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), + kvm_granule_size(ctx->level)); + +- if (childp) +- mm_ops->put_page(childp); ++ if (childp) { ++ if (mm_ops->put_page_rcu) ++ mm_ops->put_page_rcu(childp); ++ else ++ mm_ops->put_page(childp); ++ } + + return 0; + } +diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c +index a3ee3b605c9b..761fffc788f5 100644 +--- a/arch/arm64/kvm/mmu.c ++++ b/arch/arm64/kvm/mmu.c +@@ -171,6 +171,21 @@ static int kvm_host_page_count(void *addr) + return page_count(virt_to_page(addr)); + } + ++static void kvm_s2_rcu_put_page(struct rcu_head *head) ++{ ++ put_page(container_of(head, struct page, rcu_head)); ++} ++ ++static void kvm_s2_put_page_rcu(void *addr) ++{ ++ struct page *page = virt_to_page(addr); ++ ++ if (kvm_host_page_count(addr) == 1) ++ kvm_account_pgtable_pages(addr, -1); ++ ++ call_rcu(&page->rcu_head, kvm_s2_rcu_put_page); ++} ++ + static phys_addr_t kvm_host_pa(void *addr) + { + return __pa(addr); +@@ -684,6 +699,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { + .free_removed_table = stage2_free_removed_table, + .get_page = kvm_host_get_page, + .put_page = kvm_s2_put_page, ++ .put_page_rcu = kvm_s2_put_page_rcu, + .page_count = kvm_host_page_count, + .phys_to_virt = kvm_host_va, + .virt_to_phys = kvm_host_pa, +@@ -1624,6 +1640,66 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) + return pte_valid(pte) && pte_young(pte); + } + ++struct test_clear_young_arg { ++ struct kvm_gfn_range *range; ++ gfn_t lsb_gfn; ++ unsigned long *bitmap; ++}; ++ ++static int stage2_test_clear_young(const struct kvm_pgtable_visit_ctx *ctx, ++ enum kvm_pgtable_walk_flags flags) ++{ ++ struct test_clear_young_arg *arg = ctx->arg; ++ gfn_t gfn = ctx->addr / PAGE_SIZE; ++ kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF; ++ ++ VM_WARN_ON_ONCE(!page_count(virt_to_page(ctx->ptep))); ++ VM_WARN_ON_ONCE(gfn < arg->range->start || gfn >= arg->range->end); ++ ++ if (!kvm_pte_valid(new)) ++ return 0; ++ ++ if (new == ctx->old) ++ return 0; ++ ++ /* see the comments on the generic kvm_arch_has_test_clear_young() */ ++ if (__test_and_change_bit(arg->lsb_gfn - gfn, arg->bitmap)) ++ cmpxchg64(ctx->ptep, ctx->old, new); ++ ++ return 0; ++} ++ ++bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, ++ gfn_t lsb_gfn, unsigned long *bitmap) ++{ ++ u64 start = range->start * PAGE_SIZE; ++ u64 end = range->end * PAGE_SIZE; ++ struct test_clear_young_arg arg = { ++ .range = range, ++ .lsb_gfn = lsb_gfn, ++ .bitmap = bitmap, ++ }; ++ struct kvm_pgtable_walker walker = { ++ .cb = stage2_test_clear_young, ++ .arg = &arg, ++ .flags = KVM_PGTABLE_WALK_LEAF, ++ }; ++ ++ BUILD_BUG_ON(is_hyp_code()); ++ ++ if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young())) ++ return false; ++ ++ /* see the comments on kvm_pgtable_walk_flags */ ++ rcu_read_lock(); ++ ++ kvm_pgtable_walk(kvm->arch.mmu.pgt, start, end - start, &walker); ++ ++ rcu_read_unlock(); ++ ++ return true; ++} ++ + bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) + { + if (!kvm->arch.mmu.pgt) 
+@@ -1848,7 +1924,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) + + void kvm_arch_flush_shadow_all(struct kvm *kvm) + { +- kvm_free_stage2_pgd(&kvm->arch.mmu); + } + + void kvm_arch_flush_shadow_memslot(struct kvm *kvm, +diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h +index caea15dcb91d..996850029ce0 100644 +--- a/arch/powerpc/include/asm/kvm_host.h ++++ b/arch/powerpc/include/asm/kvm_host.h +@@ -886,4 +886,22 @@ static inline void kvm_arch_exit(void) {} + static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} + static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} + ++static inline int kvmppc_radix_possible(void) ++{ ++ return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled(); ++} ++ ++static inline bool kvmhv_on_pseries(void) ++{ ++ return IS_ENABLED(CONFIG_PPC_PSERIES) && !cpu_has_feature(CPU_FTR_HVMODE); ++} ++ ++/* see the comments on the generic kvm_arch_has_test_clear_young() */ ++#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young ++static inline bool kvm_arch_has_test_clear_young(void) ++{ ++ return IS_ENABLED(CONFIG_KVM) && IS_ENABLED(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && ++ kvmppc_radix_possible() && !kvmhv_on_pseries(); ++} ++ + #endif /* __POWERPC_KVM_HOST_H__ */ +diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h +index eae9619b6190..0bb772fc12b1 100644 +--- a/arch/powerpc/include/asm/kvm_ppc.h ++++ b/arch/powerpc/include/asm/kvm_ppc.h +@@ -277,6 +277,8 @@ struct kvmppc_ops { + bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range); + bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); + bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); ++ bool (*test_clear_young)(struct kvm *kvm, struct kvm_gfn_range *range, ++ gfn_t lsb_gfn, unsigned long *bitmap); + bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); + void (*free_memslot)(struct kvm_memory_slot *slot); + int (*init_vm)(struct kvm *kvm); +@@ -580,18 +582,6 @@ static inline bool kvm_hv_mode_active(void) { return false; } + + #endif + +-#ifdef CONFIG_PPC_PSERIES +-static inline bool kvmhv_on_pseries(void) +-{ +- return !cpu_has_feature(CPU_FTR_HVMODE); +-} +-#else +-static inline bool kvmhv_on_pseries(void) +-{ +- return false; +-} +-#endif +- + #ifdef CONFIG_KVM_XICS + static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) + { +diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c +index 6d525285dbe8..f4cf330e3e81 100644 +--- a/arch/powerpc/kvm/book3s.c ++++ b/arch/powerpc/kvm/book3s.c +@@ -877,6 +877,13 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) + return kvm->arch.kvm_ops->test_age_gfn(kvm, range); + } + ++bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, ++ gfn_t lsb_gfn, unsigned long *bitmap) ++{ ++ return kvm->arch.kvm_ops->test_clear_young && ++ kvm->arch.kvm_ops->test_clear_young(kvm, range, lsb_gfn, bitmap); ++} ++ + bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) + { + return kvm->arch.kvm_ops->set_spte_gfn(kvm, range); +diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h +index 58391b4b32ed..fe9cac423817 100644 +--- a/arch/powerpc/kvm/book3s.h ++++ b/arch/powerpc/kvm/book3s.h +@@ -12,6 +12,8 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, + extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range); + extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); + extern 
bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); ++extern bool kvmhv_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, ++ gfn_t lsb_gfn, unsigned long *bitmap); + extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); + + extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu); +diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c +index 9d3743ca16d5..8476646c554c 100644 +--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c ++++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c +@@ -1083,6 +1083,78 @@ bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + return ref; + } + ++bool kvmhv_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, ++ gfn_t lsb_gfn, unsigned long *bitmap) ++{ ++ bool success; ++ gfn_t gfn = range->start; ++ ++ if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young())) ++ return false; ++ ++ /* ++ * This function relies on two techniques, RCU and cmpxchg, to safely ++ * test and clear the accessed bit without taking the MMU lock. The ++ * former protects KVM page tables from being freed while the latter ++ * clears the accessed bit atomically against both the hardware and ++ * other software page table walkers. ++ */ ++ rcu_read_lock(); ++ ++ success = kvm_is_radix(kvm); ++ if (!success) ++ goto unlock; ++ ++ /* ++ * case 1: this function kvmppc_switch_mmu_to_hpt() ++ * ++ * rcu_read_lock() ++ * test kvm_is_radix() kvm->arch.radix = 0 ++ * use kvm->arch.pgtable ++ * rcu_read_unlock() ++ * synchronize_rcu() ++ * kvmppc_free_radix() ++ * ++ * ++ * case 2: this function kvmppc_switch_mmu_to_radix() ++ * ++ * kvmppc_init_vm_radix() ++ * smp_wmb() ++ * test kvm_is_radix() kvm->arch.radix = 1 ++ * smp_rmb() ++ * use kvm->arch.pgtable ++ */ ++ smp_rmb(); ++ ++ while (gfn < range->end) { ++ pte_t *ptep; ++ pte_t old, new; ++ unsigned int shift; ++ ++ ptep = find_kvm_secondary_pte_unlocked(kvm, gfn * PAGE_SIZE, &shift); ++ if (!ptep) ++ goto next; ++ ++ VM_WARN_ON_ONCE(!page_count(virt_to_page(ptep))); ++ ++ old = READ_ONCE(*ptep); ++ if (!pte_present(old) || !pte_young(old)) ++ goto next; ++ ++ new = pte_mkold(old); ++ ++ /* see the comments on the generic kvm_arch_has_test_clear_young() */ ++ if (__test_and_change_bit(lsb_gfn - gfn, bitmap)) ++ pte_xchg(ptep, old, new); ++next: ++ gfn += shift ? 
BIT(shift - PAGE_SHIFT) : 1; ++ } ++unlock: ++ rcu_read_unlock(); ++ ++ return success; ++} ++ + /* Returns the number of PAGE_SIZE pages that are dirty */ + static int kvm_radix_test_clear_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot, int pagenum) +@@ -1464,13 +1536,15 @@ int kvmppc_radix_init(void) + { + unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE; + +- kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor); ++ kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, ++ SLAB_TYPESAFE_BY_RCU, pte_ctor); + if (!kvm_pte_cache) + return -ENOMEM; + + size = sizeof(void *) << RADIX_PMD_INDEX_SIZE; + +- kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor); ++ kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, ++ SLAB_TYPESAFE_BY_RCU, pmd_ctor); + if (!kvm_pmd_cache) { + kmem_cache_destroy(kvm_pte_cache); + return -ENOMEM; +diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c +index 6ba68dd6190b..17b415661282 100644 +--- a/arch/powerpc/kvm/book3s_hv.c ++++ b/arch/powerpc/kvm/book3s_hv.c +@@ -5242,6 +5242,8 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) + spin_lock(&kvm->mmu_lock); + kvm->arch.radix = 0; + spin_unlock(&kvm->mmu_lock); ++ /* see the comments in kvmhv_test_clear_young() */ ++ synchronize_rcu(); + kvmppc_free_radix(kvm); + + lpcr = LPCR_VPM1; +@@ -5266,6 +5268,8 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm) + if (err) + return err; + kvmppc_rmap_reset(kvm); ++ /* see the comments in kvmhv_test_clear_young() */ ++ smp_wmb(); + /* Mutual exclusion with kvm_unmap_gfn_range etc. */ + spin_lock(&kvm->mmu_lock); + kvm->arch.radix = 1; +@@ -6165,6 +6169,7 @@ static struct kvmppc_ops kvm_ops_hv = { + .unmap_gfn_range = kvm_unmap_gfn_range_hv, + .age_gfn = kvm_age_gfn_hv, + .test_age_gfn = kvm_test_age_gfn_hv, ++ .test_clear_young = kvmhv_test_clear_young, + .set_spte_gfn = kvm_set_spte_gfn_hv, + .free_memslot = kvmppc_core_free_memslot_hv, + .init_vm = kvmppc_core_init_vm_hv, +@@ -6225,11 +6230,6 @@ static int kvm_init_subcore_bitmap(void) + return 0; + } + +-static int kvmppc_radix_possible(void) +-{ +- return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled(); +-} +- + static int kvmppc_book3s_init_hv(void) + { + int r; +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 6aaae18f1854..d2995c9e8f07 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1367,6 +1367,12 @@ struct kvm_arch { + * the MMU lock in read mode + the tdp_mmu_pages_lock or + * the MMU lock in write mode + * ++ * kvm_arch_test_clear_young() is a special case. It relies on two ++ * techniques, RCU and cmpxchg, to safely test and clear the accessed ++ * bit without taking the MMU lock. The former protects KVM page tables ++ * from being freed while the latter clears the accessed bit atomically ++ * against both the hardware and other software page table walkers. ++ * + * Roots will remain in the list until their tdp_mmu_root_count + * drops to zero, at which point the thread that decremented the + * count to zero should removed the root from the list and clean +@@ -2171,4 +2177,25 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); + KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ + KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) + ++extern u64 __read_mostly shadow_accessed_mask; ++ ++/* ++ * Returns true if A/D bits are supported in hardware and are enabled by KVM. ++ * When enabled, KVM uses A/D bits for all non-nested MMUs. 
Because L1 can ++ * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the ++ * scenario where KVM is using A/D bits for L1, but not L2. ++ */ ++static inline bool kvm_ad_enabled(void) ++{ ++ return shadow_accessed_mask; ++} ++ ++/* see the comments on the generic kvm_arch_has_test_clear_young() */ ++#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young ++static inline bool kvm_arch_has_test_clear_young(void) ++{ ++ return IS_ENABLED(CONFIG_KVM) && IS_ENABLED(CONFIG_X86_64) && ++ (!IS_REACHABLE(CONFIG_KVM) || (kvm_ad_enabled() && tdp_enabled)); ++} ++ + #endif /* _ASM_X86_KVM_HOST_H */ +diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h +index 6f54dc9409c9..0dc7fed1f3fd 100644 +--- a/arch/x86/kvm/mmu/spte.h ++++ b/arch/x86/kvm/mmu/spte.h +@@ -153,7 +153,6 @@ extern u64 __read_mostly shadow_mmu_writable_mask; + extern u64 __read_mostly shadow_nx_mask; + extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ + extern u64 __read_mostly shadow_user_mask; +-extern u64 __read_mostly shadow_accessed_mask; + extern u64 __read_mostly shadow_dirty_mask; + extern u64 __read_mostly shadow_mmio_value; + extern u64 __read_mostly shadow_mmio_mask; +@@ -247,17 +246,6 @@ static inline bool is_shadow_present_pte(u64 pte) + return !!(pte & SPTE_MMU_PRESENT_MASK); + } + +-/* +- * Returns true if A/D bits are supported in hardware and are enabled by KVM. +- * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can +- * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the +- * scenario where KVM is using A/D bits for L1, but not L2. +- */ +-static inline bool kvm_ad_enabled(void) +-{ +- return !!shadow_accessed_mask; +-} +- + static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) + { + return sp->role.ad_disabled; +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index d6df38d371a0..9028e09f1aab 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -1309,6 +1309,47 @@ bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) + return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); + } + ++bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, ++ gfn_t lsb_gfn, unsigned long *bitmap) ++{ ++ struct kvm_mmu_page *root; ++ ++ if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young())) ++ return false; ++ ++ if (kvm_memslots_have_rmaps(kvm)) ++ return false; ++ ++ /* see the comments on kvm_arch->tdp_mmu_roots */ ++ rcu_read_lock(); ++ ++ list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) { ++ struct tdp_iter iter; ++ ++ if (kvm_mmu_page_as_id(root) != range->slot->as_id) ++ continue; ++ ++ tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) { ++ u64 *sptep = rcu_dereference(iter.sptep); ++ u64 new_spte = iter.old_spte & ~shadow_accessed_mask; ++ ++ VM_WARN_ON_ONCE(!page_count(virt_to_page(sptep))); ++ VM_WARN_ON_ONCE(iter.gfn < range->start || iter.gfn >= range->end); ++ ++ if (new_spte == iter.old_spte) ++ continue; ++ ++ /* see the comments on the generic kvm_arch_has_test_clear_young() */ ++ if (__test_and_change_bit(lsb_gfn - iter.gfn, bitmap)) ++ cmpxchg64(sptep, iter.old_spte, new_spte); ++ } ++ } ++ ++ rcu_read_unlock(); ++ ++ return true; ++} ++ + static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_gfn_range *range) + { +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index 4f26b244f6d0..df46fc815c8b 100644 +--- a/include/linux/kvm_host.h ++++ 
b/include/linux/kvm_host.h +@@ -2281,4 +2281,33 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) + /* Max number of entries allowed for each kvm dirty ring */ + #define KVM_DIRTY_RING_MAX_ENTRIES 65536 + ++/* ++ * Architectures that implement kvm_arch_test_clear_young() should override ++ * kvm_arch_has_test_clear_young(). ++ * ++ * kvm_arch_has_test_clear_young() is allowed to return false positive. It can ++ * return true if kvm_arch_test_clear_young() is supported but disabled due to ++ * some runtime constraint. In this case, kvm_arch_test_clear_young() should ++ * return false. ++ * ++ * The last parameter to kvm_arch_test_clear_young() is a bitmap with the ++ * following specifications: ++ * 1. The offset of each bit is relative to the second to the last parameter ++ * lsb_gfn. E.g., the offset corresponding to gfn is lsb_gfn-gfn. This is to ++ * better suit batching while forward looping. ++ * 2. For each KVM PTE with the accessed bit set, the implementation should flip ++ * the corresponding bit in the bitmap. It should only clear the accessed bit ++ * if the old value is 1. This allows the caller to test or test and clear ++ * the accessed bit. ++ */ ++#ifndef kvm_arch_has_test_clear_young ++static inline bool kvm_arch_has_test_clear_young(void) ++{ ++ return false; ++} ++#endif ++ ++bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, ++ gfn_t lsb_gfn, unsigned long *bitmap); ++ + #endif +diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h +index d6c06e140277..521f71ad0467 100644 +--- a/include/linux/mmu_notifier.h ++++ b/include/linux/mmu_notifier.h +@@ -122,6 +122,11 @@ struct mmu_notifier_ops { + struct mm_struct *mm, + unsigned long address); + ++ /* see the comments on mmu_notifier_test_clear_young() */ ++ bool (*test_clear_young)(struct mmu_notifier *mn, struct mm_struct *mm, ++ unsigned long start, unsigned long end, ++ unsigned long *bitmap); ++ + /* + * change_pte is called in cases that pte mapping to page is changed: + * for example, when ksm remaps pte to point to a new shared page. +@@ -391,6 +396,9 @@ extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, + extern int __mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, + unsigned long end); ++extern int __mmu_notifier_test_clear_young(struct mm_struct *mm, ++ unsigned long start, unsigned long end, ++ bool fallback, unsigned long *bitmap); + extern int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address); + extern void __mmu_notifier_change_pte(struct mm_struct *mm, +@@ -433,6 +441,31 @@ static inline int mmu_notifier_clear_young(struct mm_struct *mm, + return 0; + } + ++/* ++ * This function always returns 0 if fallback is not allowed. If fallback ++ * happens, its return value is similar to that of mmu_notifier_clear_young(). ++ * ++ * The bitmap has the following specifications: ++ * 1. The number of bits should be at least (end-start)/PAGE_SIZE. ++ * 2. The offset of each bit is relative to the end. E.g., the offset ++ * corresponding to addr is (end-addr)/PAGE_SIZE-1. This is to better suit ++ * batching while forward looping. ++ * 3. For each KVM PTE with the accessed bit set (young), this function flips ++ * the corresponding bit in the bitmap. It only clears the accessed bit if ++ * the old value is 1. A caller can test or test and clear the accessed bit ++ * by setting the corresponding bit in the bitmap to 0 or 1, and the new ++ * value will be 1 or 0 for a young KVM PTE. 
++ */ ++static inline int mmu_notifier_test_clear_young(struct mm_struct *mm, ++ unsigned long start, unsigned long end, ++ bool fallback, unsigned long *bitmap) ++{ ++ if (mm_has_notifiers(mm)) ++ return __mmu_notifier_test_clear_young(mm, start, end, fallback, bitmap); ++ ++ return 0; ++} ++ + static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) + { +@@ -687,6 +720,13 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, + return 0; + } + ++static inline int mmu_notifier_test_clear_young(struct mm_struct *mm, ++ unsigned long start, unsigned long end, ++ bool fallback, unsigned long *bitmap) ++{ ++ return 0; ++} ++ + static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) + { +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 977be526c939..beece92ce62e 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -379,6 +379,7 @@ enum { + LRU_GEN_CORE, + LRU_GEN_MM_WALK, + LRU_GEN_NONLEAF_YOUNG, ++ LRU_GEN_SPTE_WALK, + NR_LRU_GEN_CAPS + }; + +@@ -485,7 +486,7 @@ struct lru_gen_mm_walk { + }; + + void lru_gen_init_lruvec(struct lruvec *lruvec); +-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); ++bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw); + + #ifdef CONFIG_MEMCG + +@@ -573,8 +574,9 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) + { + } + +-static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) ++static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + { ++ return false; + } + + #ifdef CONFIG_MEMCG +diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c +index f45ff1b7626a..324799848fed 100644 +--- a/mm/mmu_notifier.c ++++ b/mm/mmu_notifier.c +@@ -402,6 +402,32 @@ int __mmu_notifier_clear_young(struct mm_struct *mm, + return young; + } + ++/* see the comments on mmu_notifier_test_clear_young() */ ++int __mmu_notifier_test_clear_young(struct mm_struct *mm, ++ unsigned long start, unsigned long end, ++ bool fallback, unsigned long *bitmap) ++{ ++ int key; ++ struct mmu_notifier *mn; ++ int young = 0; ++ ++ key = srcu_read_lock(&srcu); ++ ++ hlist_for_each_entry_srcu(mn, &mm->notifier_subscriptions->list, ++ hlist, srcu_read_lock_held(&srcu)) { ++ if (mn->ops->test_clear_young && ++ mn->ops->test_clear_young(mn, mm, start, end, bitmap)) ++ continue; ++ ++ if (fallback && mn->ops->clear_young) ++ young |= mn->ops->clear_young(mn, mm, start, end); ++ } ++ ++ srcu_read_unlock(&srcu, key); ++ ++ return young; ++} ++ + int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) + { +diff --git a/mm/rmap.c b/mm/rmap.c +index 7b9205cb7d87..82e3a0be1ada 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -822,12 +822,10 @@ static bool folio_referenced_one(struct folio *folio, + return false; /* To break the loop */ + } + +- if (pvmw.pte) { +- if (lru_gen_enabled() && pte_young(*pvmw.pte)) { +- lru_gen_look_around(&pvmw); ++ if (lru_gen_enabled() && pvmw.pte) { ++ if (lru_gen_look_around(&pvmw)) + referenced++; +- } +- ++ } else if (pvmw.pte) { + if (ptep_clear_flush_young_notify(vma, address, + pvmw.pte)) + referenced++; +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 04a54656b6b7..2fc436638dfe 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -57,6 +57,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -3927,6 +3929,55 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, + return folio; + } + ++static bool test_spte_young(struct 
mm_struct *mm, unsigned long addr, unsigned long end, ++ unsigned long *bitmap, unsigned long *last) ++{ ++ if (!kvm_arch_has_test_clear_young() || !get_cap(LRU_GEN_SPTE_WALK)) ++ return false; ++ ++ if (*last > addr) ++ goto done; ++ ++ *last = end - addr > MIN_LRU_BATCH * PAGE_SIZE ? ++ addr + MIN_LRU_BATCH * PAGE_SIZE - 1 : end - 1; ++ bitmap_zero(bitmap, MIN_LRU_BATCH); ++ ++ mmu_notifier_test_clear_young(mm, addr, *last + 1, false, bitmap); ++done: ++ return test_bit((*last - addr) / PAGE_SIZE, bitmap); ++} ++ ++static void clear_spte_young(struct mm_struct *mm, unsigned long addr, ++ unsigned long *bitmap, unsigned long *last) ++{ ++ int i; ++ unsigned long start, end = *last + 1; ++ ++ if (addr + PAGE_SIZE != end) ++ return; ++ ++ i = find_last_bit(bitmap, MIN_LRU_BATCH); ++ if (i == MIN_LRU_BATCH) ++ return; ++ ++ start = end - (i + 1) * PAGE_SIZE; ++ ++ i = find_first_bit(bitmap, MIN_LRU_BATCH); ++ ++ end -= i * PAGE_SIZE; ++ ++ mmu_notifier_test_clear_young(mm, start, end, false, bitmap); ++} ++ ++static void skip_spte_young(struct mm_struct *mm, unsigned long addr, ++ unsigned long *bitmap, unsigned long *last) ++{ ++ if (*last > addr) ++ __clear_bit((*last - addr) / PAGE_SIZE, bitmap); ++ ++ clear_spte_young(mm, addr, bitmap, last); ++} ++ + static bool suitable_to_scan(int total, int young) + { + int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); +@@ -3942,6 +3993,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, + pte_t *pte; + spinlock_t *ptl; + unsigned long addr; ++ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; ++ unsigned long last = 0; + int total = 0; + int young = 0; + struct lru_gen_mm_walk *walk = args->private; +@@ -3960,6 +4013,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, + pte = pte_offset_map(pmd, start & PMD_MASK); + restart: + for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { ++ bool success; + unsigned long pfn; + struct folio *folio; + +@@ -3967,20 +4021,27 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, + walk->mm_stats[MM_LEAF_TOTAL]++; + + pfn = get_pte_pfn(pte[i], args->vma, addr); +- if (pfn == -1) ++ if (pfn == -1) { ++ skip_spte_young(args->vma->vm_mm, addr, bitmap, &last); + continue; ++ } + +- if (!pte_young(pte[i])) { ++ success = test_spte_young(args->vma->vm_mm, addr, end, bitmap, &last); ++ if (!success && !pte_young(pte[i])) { ++ skip_spte_young(args->vma->vm_mm, addr, bitmap, &last); + walk->mm_stats[MM_LEAF_OLD]++; + continue; + } + + folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); +- if (!folio) ++ if (!folio) { ++ skip_spte_young(args->vma->vm_mm, addr, bitmap, &last); + continue; ++ } + +- if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) +- VM_WARN_ON_ONCE(true); ++ clear_spte_young(args->vma->vm_mm, addr, bitmap, &last); ++ if (pte_young(pte[i])) ++ ptep_test_and_clear_young(args->vma, addr, pte + i); + + young++; + walk->mm_stats[MM_LEAF_YOUNG]++; +@@ -4589,6 +4650,24 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + * rmap/PT walk feedback + ******************************************************************************/ + ++static bool should_look_around(struct vm_area_struct *vma, unsigned long addr, ++ pte_t *pte, int *young) ++{ ++ unsigned long old = true; ++ ++ *young = mmu_notifier_test_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE, true, &old); ++ if (!old) ++ *young = true; ++ ++ if (pte_young(*pte)) { ++ 
ptep_test_and_clear_young(vma, addr, pte); ++ *young = true; ++ return true; ++ } ++ ++ return !old && get_cap(LRU_GEN_SPTE_WALK); ++} ++ + /* + * This function exploits spatial locality when shrink_folio_list() walks the + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If +@@ -4596,12 +4675,14 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + * the PTE table to the Bloom filter. This forms a feedback loop between the + * eviction and the aging. + */ +-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) ++bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + { + int i; + unsigned long start; + unsigned long end; + struct lru_gen_mm_walk *walk; ++ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; ++ unsigned long last = 0; + int young = 0; + pte_t *pte = pvmw->pte; + unsigned long addr = pvmw->address; +@@ -4615,8 +4696,11 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + lockdep_assert_held(pvmw->ptl); + VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); + ++ if (!should_look_around(pvmw->vma, addr, pte, &young)) ++ return young; ++ + if (spin_is_contended(pvmw->ptl)) +- return; ++ return young; + + /* avoid taking the LRU lock under the PTL when possible */ + walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; +@@ -4624,6 +4708,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + start = max(addr & PMD_MASK, pvmw->vma->vm_start); + end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; + ++ if (end - start == PAGE_SIZE) ++ return young; ++ + if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { + if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) + end = start + MIN_LRU_BATCH * PAGE_SIZE; +@@ -4637,28 +4724,37 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + + /* folio_update_gen() requires stable folio_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) +- return; ++ return young; + + arch_enter_lazy_mmu_mode(); + + pte -= (addr - start) / PAGE_SIZE; + + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { ++ bool success; + unsigned long pfn; + + pfn = get_pte_pfn(pte[i], pvmw->vma, addr); +- if (pfn == -1) ++ if (pfn == -1) { ++ skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); + continue; ++ } + +- if (!pte_young(pte[i])) ++ success = test_spte_young(pvmw->vma->vm_mm, addr, end, bitmap, &last); ++ if (!success && !pte_young(pte[i])) { ++ skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); + continue; ++ } + + folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); +- if (!folio) ++ if (!folio) { ++ skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); + continue; ++ } + +- if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) +- VM_WARN_ON_ONCE(true); ++ clear_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); ++ if (pte_young(pte[i])) ++ ptep_test_and_clear_young(pvmw->vma, addr, pte + i); + + young++; + +@@ -4688,6 +4784,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + /* feedback from rmap walkers to page table walkers */ + if (suitable_to_scan(i, young)) + update_bloom_filter(lruvec, max_seq, pvmw->pmd); ++ ++ return young; + } + + /****************************************************************************** +@@ -5707,6 +5805,9 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c + if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) + caps |= BIT(LRU_GEN_NONLEAF_YOUNG); + ++ if (kvm_arch_has_test_clear_young() && 
get_cap(LRU_GEN_SPTE_WALK)) ++ caps |= BIT(LRU_GEN_SPTE_WALK); ++ + return sysfs_emit(buf, "0x%04x\n", caps); + } + +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index 9c60384b5ae0..1b465df4a93d 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -875,6 +875,63 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, + return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn); + } + ++static bool kvm_test_clear_young(struct kvm *kvm, unsigned long start, ++ unsigned long end, unsigned long *bitmap) ++{ ++ int i; ++ int key; ++ bool success = true; ++ ++ trace_kvm_age_hva(start, end); ++ ++ key = srcu_read_lock(&kvm->srcu); ++ ++ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { ++ struct interval_tree_node *node; ++ struct kvm_memslots *slots = __kvm_memslots(kvm, i); ++ ++ kvm_for_each_memslot_in_hva_range(node, slots, start, end - 1) { ++ gfn_t lsb_gfn; ++ unsigned long hva_start, hva_end; ++ struct kvm_gfn_range range = { ++ .slot = container_of(node, struct kvm_memory_slot, ++ hva_node[slots->node_idx]), ++ }; ++ ++ hva_start = max(start, range.slot->userspace_addr); ++ hva_end = min(end - 1, range.slot->userspace_addr + ++ range.slot->npages * PAGE_SIZE - 1); ++ ++ range.start = hva_to_gfn_memslot(hva_start, range.slot); ++ range.end = hva_to_gfn_memslot(hva_end, range.slot) + 1; ++ ++ if (WARN_ON_ONCE(range.end <= range.start)) ++ continue; ++ ++ /* see the comments on the generic kvm_arch_has_test_clear_young() */ ++ lsb_gfn = hva_to_gfn_memslot(end - 1, range.slot); ++ ++ success = kvm_arch_test_clear_young(kvm, &range, lsb_gfn, bitmap); ++ if (!success) ++ break; ++ } ++ } ++ ++ srcu_read_unlock(&kvm->srcu, key); ++ ++ return success; ++} ++ ++static bool kvm_mmu_notifier_test_clear_young(struct mmu_notifier *mn, struct mm_struct *mm, ++ unsigned long start, unsigned long end, ++ unsigned long *bitmap) ++{ ++ if (kvm_arch_has_test_clear_young()) ++ return kvm_test_clear_young(mmu_notifier_to_kvm(mn), start, end, bitmap); ++ ++ return false; ++} ++ + static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +@@ -903,6 +960,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { + .clear_flush_young = kvm_mmu_notifier_clear_flush_young, + .clear_young = kvm_mmu_notifier_clear_young, + .test_young = kvm_mmu_notifier_test_young, ++ .test_clear_young = kvm_mmu_notifier_test_clear_young, + .change_pte = kvm_mmu_notifier_change_pte, + .release = kvm_mmu_notifier_release, + }; +-- +2.39.2 + +From 1c4ee6ec54d7431a95f829f518cb6b1f7154c6b7 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 13 Feb 2023 09:26:09 +0100 +Subject: [PATCH 12/15] objtool + +Signed-off-by: Peter Jung +--- + tools/objtool/.gitignore | 1 + + tools/objtool/Build | 2 - + tools/objtool/Documentation/objtool.txt | 8 +++ + tools/objtool/Makefile | 66 +++++++++++++++++-------- + tools/objtool/builtin-check.c | 2 +- + tools/objtool/check.c | 9 ++++ + tools/objtool/elf.c | 42 ++++++++-------- + tools/objtool/include/objtool/builtin.h | 2 - + tools/objtool/include/objtool/elf.h | 9 ++-- + tools/objtool/include/objtool/special.h | 2 +- + tools/objtool/special.c | 6 +-- + 11 files changed, 95 insertions(+), 54 deletions(-) + +diff --git a/tools/objtool/.gitignore b/tools/objtool/.gitignore +index 14236db3677f..4faa4dd72f35 100644 +--- a/tools/objtool/.gitignore ++++ b/tools/objtool/.gitignore +@@ -2,3 +2,4 @@ + arch/x86/lib/inat-tables.c + /objtool + fixdep ++libsubcmd/ +diff --git a/tools/objtool/Build 
b/tools/objtool/Build +index 33f2ee5a46d3..a3cdf8af6635 100644 +--- a/tools/objtool/Build ++++ b/tools/objtool/Build +@@ -16,8 +16,6 @@ objtool-y += libctype.o + objtool-y += str_error_r.o + objtool-y += librbtree.o + +-CFLAGS += -I$(srctree)/tools/lib +- + $(OUTPUT)libstring.o: ../lib/string.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_o_c) +diff --git a/tools/objtool/Documentation/objtool.txt b/tools/objtool/Documentation/objtool.txt +index 8a671902a187..8e53fc6735ef 100644 +--- a/tools/objtool/Documentation/objtool.txt ++++ b/tools/objtool/Documentation/objtool.txt +@@ -410,6 +410,14 @@ the objtool maintainers. + can remove this warning by putting the ANNOTATE_INTRA_FUNCTION_CALL + directive right before the call. + ++12. file.o: warning: func(): not an indirect call target ++ ++ This means that objtool is running with --ibt and a function expected ++ to be an indirect call target is not. In particular, this happens for ++ init_module() or cleanup_module() if a module relies on these special ++ names and does not use module_init() / module_exit() macros to create ++ them. ++ + + If the error doesn't seem to make sense, it could be a bug in objtool. + Feel free to ask the objtool maintainer for help. +diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile +index a3a9cc24e0e3..83b100c1e7f6 100644 +--- a/tools/objtool/Makefile ++++ b/tools/objtool/Makefile +@@ -2,19 +2,18 @@ + include ../scripts/Makefile.include + include ../scripts/Makefile.arch + +-# always use the host compiler +-AR = $(HOSTAR) +-CC = $(HOSTCC) +-LD = $(HOSTLD) +- + ifeq ($(srctree),) + srctree := $(patsubst %/,%,$(dir $(CURDIR))) + srctree := $(patsubst %/,%,$(dir $(srctree))) + endif + +-SUBCMD_SRCDIR = $(srctree)/tools/lib/subcmd/ +-LIBSUBCMD_OUTPUT = $(or $(OUTPUT),$(CURDIR)/) +-LIBSUBCMD = $(LIBSUBCMD_OUTPUT)libsubcmd.a ++LIBSUBCMD_DIR = $(srctree)/tools/lib/subcmd/ ++ifneq ($(OUTPUT),) ++ LIBSUBCMD_OUTPUT = $(abspath $(OUTPUT))/libsubcmd ++else ++ LIBSUBCMD_OUTPUT = $(CURDIR)/libsubcmd ++endif ++LIBSUBCMD = $(LIBSUBCMD_OUTPUT)/libsubcmd.a + + OBJTOOL := $(OUTPUT)objtool + OBJTOOL_IN := $(OBJTOOL)-in.o +@@ -28,16 +27,29 @@ INCLUDES := -I$(srctree)/tools/include \ + -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \ + -I$(srctree)/tools/arch/$(SRCARCH)/include \ + -I$(srctree)/tools/objtool/include \ +- -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include ++ -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include \ ++ -I$(LIBSUBCMD_OUTPUT)/include ++# Note, EXTRA_WARNINGS here was determined for CC and not HOSTCC, it ++# is passed here to match a legacy behavior. + WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed -Wno-nested-externs +-CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS) +-LDFLAGS += $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS) ++OBJTOOL_CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS) ++OBJTOOL_LDFLAGS := $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS) + + # Allow old libelf to be used: +-elfshdr := $(shell echo '$(pound)include ' | $(CC) $(CFLAGS) -x c -E - | grep elf_getshdr) +-CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED) ++elfshdr := $(shell echo '$(pound)include ' | $(HOSTCC) $(OBJTOOL_CFLAGS) -x c -E - | grep elf_getshdr) ++OBJTOOL_CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED) ++ ++# Always want host compilation. 
++HOST_OVERRIDES := CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" + + AWK = awk ++MKDIR = mkdir ++ ++ifeq ($(V),1) ++ Q = ++else ++ Q = @ ++endif + + BUILD_ORC := n + +@@ -49,21 +61,33 @@ export BUILD_ORC + export srctree OUTPUT CFLAGS SRCARCH AWK + include $(srctree)/tools/build/Makefile.include + +-$(OBJTOOL_IN): fixdep FORCE +- @$(CONFIG_SHELL) ./sync-check.sh +- @$(MAKE) $(build)=objtool ++$(OBJTOOL_IN): fixdep $(LIBSUBCMD) FORCE ++ $(Q)$(CONFIG_SHELL) ./sync-check.sh ++ $(Q)$(MAKE) $(build)=objtool $(HOST_OVERRIDES) CFLAGS="$(OBJTOOL_CFLAGS)" \ ++ LDFLAGS="$(OBJTOOL_LDFLAGS)" ++ + + $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN) +- $(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@ ++ $(QUIET_LINK)$(HOSTCC) $(OBJTOOL_IN) $(OBJTOOL_LDFLAGS) -o $@ ++ ++ ++$(LIBSUBCMD_OUTPUT): ++ $(Q)$(MKDIR) -p $@ + ++$(LIBSUBCMD): fixdep $(LIBSUBCMD_OUTPUT) FORCE ++ $(Q)$(MAKE) -C $(LIBSUBCMD_DIR) O=$(LIBSUBCMD_OUTPUT) \ ++ DESTDIR=$(LIBSUBCMD_OUTPUT) prefix= subdir= \ ++ $(HOST_OVERRIDES) EXTRA_CFLAGS="$(OBJTOOL_CFLAGS)" \ ++ $@ install_headers + +-$(LIBSUBCMD): fixdep FORCE +- $(Q)$(MAKE) -C $(SUBCMD_SRCDIR) OUTPUT=$(LIBSUBCMD_OUTPUT) ++$(LIBSUBCMD)-clean: ++ $(call QUIET_CLEAN, libsubcmd) ++ $(Q)$(RM) -r -- $(LIBSUBCMD_OUTPUT) + +-clean: ++clean: $(LIBSUBCMD)-clean + $(call QUIET_CLEAN, objtool) $(RM) $(OBJTOOL) + $(Q)find $(OUTPUT) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete +- $(Q)$(RM) $(OUTPUT)arch/x86/lib/inat-tables.c $(OUTPUT)fixdep $(LIBSUBCMD) ++ $(Q)$(RM) $(OUTPUT)arch/x86/lib/inat-tables.c $(OUTPUT)fixdep + + FORCE: + +diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c +index a4f39407bf59..7c175198d09f 100644 +--- a/tools/objtool/builtin-check.c ++++ b/tools/objtool/builtin-check.c +@@ -65,7 +65,7 @@ static int parse_hacks(const struct option *opt, const char *str, int unset) + return found ? 
0 : -1; + } + +-const struct option check_options[] = { ++static const struct option check_options[] = { + OPT_GROUP("Actions:"), + OPT_CALLBACK_OPTARG('h', "hacks", NULL, NULL, "jump_label,noinstr,skylake", "patch toolchain bugs/limitations", parse_hacks), + OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"), +diff --git a/tools/objtool/check.c b/tools/objtool/check.c +index 4b7c8b33069e..0678ba04fe22 100644 +--- a/tools/objtool/check.c ++++ b/tools/objtool/check.c +@@ -688,6 +688,7 @@ static int create_static_call_sections(struct objtool_file *file) + if (strncmp(key_name, STATIC_CALL_TRAMP_PREFIX_STR, + STATIC_CALL_TRAMP_PREFIX_LEN)) { + WARN("static_call: trampoline name malformed: %s", key_name); ++ free(key_name); + return -1; + } + tmp = key_name + STATIC_CALL_TRAMP_PREFIX_LEN - STATIC_CALL_KEY_PREFIX_LEN; +@@ -697,6 +698,7 @@ static int create_static_call_sections(struct objtool_file *file) + if (!key_sym) { + if (!opts.module) { + WARN("static_call: can't find static_call_key symbol: %s", tmp); ++ free(key_name); + return -1; + } + +@@ -854,8 +856,15 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file) + list_for_each_entry(insn, &file->endbr_list, call_node) { + + int *site = (int *)sec->data->d_buf + idx; ++ struct symbol *sym = insn->sym; + *site = 0; + ++ if (opts.module && sym && sym->type == STT_FUNC && ++ insn->offset == sym->offset && ++ (!strcmp(sym->name, "init_module") || ++ !strcmp(sym->name, "cleanup_module"))) ++ WARN("%s(): not an indirect call target", sym->name); ++ + if (elf_add_reloc_to_insn(file->elf, sec, + idx * sizeof(int), + R_X86_64_PC32, +diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c +index 64443a7f4bbf..6806ce01d933 100644 +--- a/tools/objtool/elf.c ++++ b/tools/objtool/elf.c +@@ -284,13 +284,13 @@ static int read_sections(struct elf *elf) + !elf_alloc_hash(section_name, sections_nr)) + return -1; + ++ elf->section_data = calloc(sections_nr, sizeof(*sec)); ++ if (!elf->section_data) { ++ perror("calloc"); ++ return -1; ++ } + for (i = 0; i < sections_nr; i++) { +- sec = malloc(sizeof(*sec)); +- if (!sec) { +- perror("malloc"); +- return -1; +- } +- memset(sec, 0, sizeof(*sec)); ++ sec = &elf->section_data[i]; + + INIT_LIST_HEAD(&sec->symbol_list); + INIT_LIST_HEAD(&sec->reloc_list); +@@ -422,13 +422,13 @@ static int read_symbols(struct elf *elf) + !elf_alloc_hash(symbol_name, symbols_nr)) + return -1; + ++ elf->symbol_data = calloc(symbols_nr, sizeof(*sym)); ++ if (!elf->symbol_data) { ++ perror("calloc"); ++ return -1; ++ } + for (i = 0; i < symbols_nr; i++) { +- sym = malloc(sizeof(*sym)); +- if (!sym) { +- perror("malloc"); +- return -1; +- } +- memset(sym, 0, sizeof(*sym)); ++ sym = &elf->symbol_data[i]; + + sym->idx = i; + +@@ -918,13 +918,13 @@ static int read_relocs(struct elf *elf) + sec->base->reloc = sec; + + nr_reloc = 0; ++ sec->reloc_data = calloc(sec->sh.sh_size / sec->sh.sh_entsize, sizeof(*reloc)); ++ if (!sec->reloc_data) { ++ perror("calloc"); ++ return -1; ++ } + for (i = 0; i < sec->sh.sh_size / sec->sh.sh_entsize; i++) { +- reloc = malloc(sizeof(*reloc)); +- if (!reloc) { +- perror("malloc"); +- return -1; +- } +- memset(reloc, 0, sizeof(*reloc)); ++ reloc = &sec->reloc_data[i]; + switch (sec->sh.sh_type) { + case SHT_REL: + if (read_rel_reloc(sec, i, reloc, &symndx)) +@@ -1453,16 +1453,16 @@ void elf_close(struct elf *elf) + list_for_each_entry_safe(sym, tmpsym, &sec->symbol_list, list) { + list_del(&sym->list); + hash_del(&sym->hash); +- free(sym); + } + list_for_each_entry_safe(reloc, 
tmpreloc, &sec->reloc_list, list) { + list_del(&reloc->list); + hash_del(&reloc->hash); +- free(reloc); + } + list_del(&sec->list); +- free(sec); ++ free(sec->reloc_data); + } + ++ free(elf->symbol_data); ++ free(elf->section_data); + free(elf); + } +diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h +index fa45044e3863..2a108e648b7a 100644 +--- a/tools/objtool/include/objtool/builtin.h ++++ b/tools/objtool/include/objtool/builtin.h +@@ -7,8 +7,6 @@ + + #include + +-extern const struct option check_options[]; +- + struct opts { + /* actions: */ + bool dump_orc; +diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h +index bb60fd42b46f..ad0024da262b 100644 +--- a/tools/objtool/include/objtool/elf.h ++++ b/tools/objtool/include/objtool/elf.h +@@ -39,6 +39,7 @@ struct section { + char *name; + int idx; + bool changed, text, rodata, noinstr, init, truncate; ++ struct reloc *reloc_data; + }; + + struct symbol { +@@ -49,12 +50,11 @@ struct symbol { + GElf_Sym sym; + struct section *sec; + char *name; +- unsigned int idx; +- unsigned char bind, type; ++ unsigned int idx, len; + unsigned long offset; +- unsigned int len; + unsigned long __subtree_last; + struct symbol *pfunc, *cfunc, *alias; ++ unsigned char bind, type; + u8 uaccess_safe : 1; + u8 static_call_tramp : 1; + u8 retpoline_thunk : 1; +@@ -104,6 +104,9 @@ struct elf { + struct hlist_head *section_hash; + struct hlist_head *section_name_hash; + struct hlist_head *reloc_hash; ++ ++ struct section *section_data; ++ struct symbol *symbol_data; + }; + + #define OFFSET_STRIDE_BITS 4 +diff --git a/tools/objtool/include/objtool/special.h b/tools/objtool/include/objtool/special.h +index dc4721e19002..86d4af9c5aa9 100644 +--- a/tools/objtool/include/objtool/special.h ++++ b/tools/objtool/include/objtool/special.h +@@ -19,6 +19,7 @@ struct special_alt { + bool skip_orig; + bool skip_alt; + bool jump_or_nop; ++ u8 key_addend; + + struct section *orig_sec; + unsigned long orig_off; +@@ -27,7 +28,6 @@ struct special_alt { + unsigned long new_off; + + unsigned int orig_len, new_len; /* group only */ +- u8 key_addend; + }; + + int special_get_alts(struct elf *elf, struct list_head *alts); +diff --git a/tools/objtool/special.c b/tools/objtool/special.c +index 9c8d827f69af..baa85c31526b 100644 +--- a/tools/objtool/special.c ++++ b/tools/objtool/special.c +@@ -26,7 +26,7 @@ struct special_entry { + unsigned char key; /* jump_label key */ + }; + +-struct special_entry entries[] = { ++static const struct special_entry entries[] = { + { + .sec = ".altinstructions", + .group = true, +@@ -65,7 +65,7 @@ static void reloc_to_sec_off(struct reloc *reloc, struct section **sec, + *off = reloc->sym->offset + reloc->addend; + } + +-static int get_alt_entry(struct elf *elf, struct special_entry *entry, ++static int get_alt_entry(struct elf *elf, const struct special_entry *entry, + struct section *sec, int idx, + struct special_alt *alt) + { +@@ -139,7 +139,7 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, + */ + int special_get_alts(struct elf *elf, struct list_head *alts) + { +- struct special_entry *entry; ++ const struct special_entry *entry; + struct section *sec; + unsigned int nr_entries; + struct special_alt *alt; +-- +2.39.2 + +From 66e965775ec9bee68e40b497c765bc03fc264ec8 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 17 Feb 2023 15:36:12 +0100 +Subject: [PATCH 13/15] sched + +Signed-off-by: Peter Jung +--- + arch/x86/kernel/itmt.c | 23 +- + 
arch/x86/kernel/smpboot.c | 2 +- + include/linux/sched.h | 3 + + include/linux/sched/sd_flags.h | 5 +- + kernel/sched/core.c | 77 ++-- + kernel/sched/cpufreq_schedutil.c | 43 +-- + kernel/sched/deadline.c | 42 ++- + kernel/sched/debug.c | 1 + + kernel/sched/fair.c | 581 ++++++++++++++++++++----------- + kernel/sched/features.h | 1 + + kernel/sched/pelt.c | 60 ++++ + kernel/sched/pelt.h | 42 ++- + kernel/sched/sched.h | 28 +- + 13 files changed, 591 insertions(+), 317 deletions(-) + +diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c +index 9ff480e94511..6510883c5e81 100644 +--- a/arch/x86/kernel/itmt.c ++++ b/arch/x86/kernel/itmt.c +@@ -174,32 +174,19 @@ int arch_asym_cpu_priority(int cpu) + + /** + * sched_set_itmt_core_prio() - Set CPU priority based on ITMT +- * @prio: Priority of cpu core +- * @core_cpu: The cpu number associated with the core ++ * @prio: Priority of @cpu ++ * @cpu: The CPU number + * + * The pstate driver will find out the max boost frequency + * and call this function to set a priority proportional +- * to the max boost frequency. CPU with higher boost ++ * to the max boost frequency. CPUs with higher boost + * frequency will receive higher priority. + * + * No need to rebuild sched domain after updating + * the CPU priorities. The sched domains have no + * dependency on CPU priorities. + */ +-void sched_set_itmt_core_prio(int prio, int core_cpu) ++void sched_set_itmt_core_prio(int prio, int cpu) + { +- int cpu, i = 1; +- +- for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) { +- int smt_prio; +- +- /* +- * Ensure that the siblings are moved to the end +- * of the priority chain and only used when +- * all other high priority cpus are out of capacity. +- */ +- smt_prio = prio * smp_num_siblings / (i * i); +- per_cpu(sched_core_priority, cpu) = smt_prio; +- i++; +- } ++ per_cpu(sched_core_priority, cpu) = prio; + } +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index 55cad72715d9..0213d066a9a9 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -547,7 +547,7 @@ static int x86_core_flags(void) + #ifdef CONFIG_SCHED_SMT + static int x86_smt_flags(void) + { +- return cpu_smt_flags() | x86_sched_itmt_flags(); ++ return cpu_smt_flags(); + } + #endif + #ifdef CONFIG_SCHED_CLUSTER +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 853d08f7562b..28ce1be0ba47 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -557,6 +557,9 @@ struct sched_entity { + u64 prev_sum_exec_runtime; + + u64 nr_migrations; ++ u64 prev_sleep_sum_runtime; ++ /* average duration of a task */ ++ u64 dur_avg; + + #ifdef CONFIG_FAIR_GROUP_SCHED + int depth; +diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h +index 57bde66d95f7..800238854ba5 100644 +--- a/include/linux/sched/sd_flags.h ++++ b/include/linux/sched/sd_flags.h +@@ -132,12 +132,9 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) + /* + * Place busy tasks earlier in the domain + * +- * SHARED_CHILD: Usually set on the SMT level. Technically could be set further +- * up, but currently assumed to be set from the base domain +- * upwards (see update_top_cache_domain()). + * NEEDS_GROUPS: Load balancing flag. 
+ */ +-SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) ++SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS) + + /* + * Prefer to place tasks in a sibling domain +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 2a4918a1faa9..5237639786b7 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -722,7 +722,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) + if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) + update_irq_load_avg(rq, irq_delta + steal); + #endif +- update_rq_clock_pelt(rq, delta); ++ update_rq_clock_task_mult(rq, delta); + } + + void update_rq_clock(struct rq *rq) +@@ -3675,14 +3675,39 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) + } + + /* +- * Mark the task runnable and perform wakeup-preemption. ++ * Mark the task runnable. + */ +-static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, +- struct rq_flags *rf) ++static inline void ttwu_do_wakeup(struct task_struct *p) + { +- check_preempt_curr(rq, p, wake_flags); + WRITE_ONCE(p->__state, TASK_RUNNING); + trace_sched_wakeup(p); ++} ++ ++static void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, ++ struct rq_flags *rf) ++{ ++ int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; ++ ++ lockdep_assert_rq_held(rq); ++ ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++ ++#ifdef CONFIG_SMP ++ if (wake_flags & WF_MIGRATED) ++ en_flags |= ENQUEUE_MIGRATED; ++ else ++#endif ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ activate_task(rq, p, en_flags); ++ check_preempt_curr(rq, p, wake_flags); ++ ++ ttwu_do_wakeup(p); + + #ifdef CONFIG_SMP + if (p->sched_class->task_woken) { +@@ -3712,31 +3737,6 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, + #endif + } + +-static void +-ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, +- struct rq_flags *rf) +-{ +- int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; +- +- lockdep_assert_rq_held(rq); +- +- if (p->sched_contributes_to_load) +- rq->nr_uninterruptible--; +- +-#ifdef CONFIG_SMP +- if (wake_flags & WF_MIGRATED) +- en_flags |= ENQUEUE_MIGRATED; +- else +-#endif +- if (p->in_iowait) { +- delayacct_blkio_end(p); +- atomic_dec(&task_rq(p)->nr_iowait); +- } +- +- activate_task(rq, p, en_flags); +- ttwu_do_wakeup(rq, p, wake_flags, rf); +-} +- + /* + * Consider @p being inside a wait loop: + * +@@ -3770,9 +3770,15 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) + + rq = __task_rq_lock(p, &rf); + if (task_on_rq_queued(p)) { +- /* check_preempt_curr() may use rq clock */ +- update_rq_clock(rq); +- ttwu_do_wakeup(rq, p, wake_flags, &rf); ++ if (!task_on_cpu(rq, p)) { ++ /* ++ * When on_rq && !on_cpu the task is preempted, see if ++ * it should preempt the task that is current now. 
++ */ ++ update_rq_clock(rq); ++ check_preempt_curr(rq, p, wake_flags); ++ } ++ ttwu_do_wakeup(p); + ret = 1; + } + __task_rq_unlock(rq, &rf); +@@ -4138,8 +4144,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) + goto out; + + trace_sched_waking(p); +- WRITE_ONCE(p->__state, TASK_RUNNING); +- trace_sched_wakeup(p); ++ ttwu_do_wakeup(p); + goto out; + } + +@@ -4424,6 +4429,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; + p->se.vruntime = 0; ++ p->se.dur_avg = 0; ++ p->se.prev_sleep_sum_runtime = 0; + INIT_LIST_HEAD(&p->se.group_node); + + #ifdef CONFIG_FAIR_GROUP_SCHED +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 1207c78f85c1..5c840151f3bb 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -48,7 +48,6 @@ struct sugov_cpu { + + unsigned long util; + unsigned long bw_dl; +- unsigned long max; + + /* The field below is for single-CPU policies only: */ + #ifdef CONFIG_NO_HZ_COMMON +@@ -158,7 +157,6 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) + { + struct rq *rq = cpu_rq(sg_cpu->cpu); + +- sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); + sg_cpu->bw_dl = cpu_bw_dl(rq); + sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), + FREQUENCY_UTIL, NULL); +@@ -238,6 +236,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, + * sugov_iowait_apply() - Apply the IO boost to a CPU. + * @sg_cpu: the sugov data for the cpu to boost + * @time: the update time from the caller ++ * @max_cap: the max CPU capacity + * + * A CPU running a task which woken up after an IO operation can have its + * utilization boosted to speed up the completion of those IO operations. +@@ -251,7 +250,8 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, + * This mechanism is designed to boost high frequently IO waiting tasks, while + * being more conservative on tasks which does sporadic IO operations. + */ +-static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) ++static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, ++ unsigned long max_cap) + { + unsigned long boost; + +@@ -280,7 +280,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) + * sg_cpu->util is already in capacity scale; convert iowait_boost + * into the same scale so we can compare. 
+ */ +- boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT; ++ boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; + boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); + if (sg_cpu->util < boost) + sg_cpu->util = boost; +@@ -310,7 +310,8 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) + } + + static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, +- u64 time, unsigned int flags) ++ u64 time, unsigned long max_cap, ++ unsigned int flags) + { + sugov_iowait_boost(sg_cpu, time, flags); + sg_cpu->last_update = time; +@@ -321,7 +322,7 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, + return false; + + sugov_get_util(sg_cpu); +- sugov_iowait_apply(sg_cpu, time); ++ sugov_iowait_apply(sg_cpu, time, max_cap); + + return true; + } +@@ -332,12 +333,15 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned int cached_freq = sg_policy->cached_raw_freq; ++ unsigned long max_cap; + unsigned int next_f; + +- if (!sugov_update_single_common(sg_cpu, time, flags)) ++ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); ++ ++ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) + return; + +- next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max); ++ next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); + /* + * Do not reduce the frequency if the CPU has not been idle + * recently, as the reduction is likely to be premature then. +@@ -374,6 +378,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, + { + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + unsigned long prev_util = sg_cpu->util; ++ unsigned long max_cap; + + /* + * Fall back to the "frequency" path if frequency invariance is not +@@ -385,7 +390,9 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, + return; + } + +- if (!sugov_update_single_common(sg_cpu, time, flags)) ++ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); ++ ++ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) + return; + + /* +@@ -399,7 +406,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, + sg_cpu->util = prev_util; + + cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), +- map_util_perf(sg_cpu->util), sg_cpu->max); ++ map_util_perf(sg_cpu->util), max_cap); + + sg_cpu->sg_policy->last_freq_update_time = time; + } +@@ -408,25 +415,21 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) + { + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; +- unsigned long util = 0, max = 1; ++ unsigned long util = 0, max_cap; + unsigned int j; + ++ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); ++ + for_each_cpu(j, policy->cpus) { + struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); +- unsigned long j_util, j_max; + + sugov_get_util(j_sg_cpu); +- sugov_iowait_apply(j_sg_cpu, time); +- j_util = j_sg_cpu->util; +- j_max = j_sg_cpu->max; ++ sugov_iowait_apply(j_sg_cpu, time, max_cap); + +- if (j_util * max > j_max * util) { +- util = j_util; +- max = j_max; +- } ++ util = max(j_sg_cpu->util, util); + } + +- return get_next_freq(sg_policy, util, max); ++ return get_next_freq(sg_policy, util, max_cap); + } + + static void +diff --git a/kernel/sched/deadline.c 
b/kernel/sched/deadline.c +index 0d97d54276cc..71b24371a6f7 100644 +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -2663,17 +2663,20 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) + static void prio_changed_dl(struct rq *rq, struct task_struct *p, + int oldprio) + { +- if (task_on_rq_queued(p) || task_current(rq, p)) { ++ if (!task_on_rq_queued(p)) ++ return; ++ + #ifdef CONFIG_SMP +- /* +- * This might be too much, but unfortunately +- * we don't have the old deadline value, and +- * we can't argue if the task is increasing +- * or lowering its prio, so... +- */ +- if (!rq->dl.overloaded) +- deadline_queue_pull_task(rq); ++ /* ++ * This might be too much, but unfortunately ++ * we don't have the old deadline value, and ++ * we can't argue if the task is increasing ++ * or lowering its prio, so... ++ */ ++ if (!rq->dl.overloaded) ++ deadline_queue_pull_task(rq); + ++ if (task_current(rq, p)) { + /* + * If we now have a earlier deadline task than p, + * then reschedule, provided p is still on this +@@ -2681,15 +2684,24 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, + */ + if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline)) + resched_curr(rq); +-#else ++ } else { + /* +- * Again, we don't know if p has a earlier +- * or later deadline, so let's blindly set a +- * (maybe not needed) rescheduling point. ++ * Current may not be deadline in case p was throttled but we ++ * have just replenished it (e.g. rt_mutex_setprio()). ++ * ++ * Otherwise, if p was given an earlier deadline, reschedule. + */ +- resched_curr(rq); +-#endif /* CONFIG_SMP */ ++ if (!dl_task(rq->curr) || ++ dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) ++ resched_curr(rq); + } ++#else ++ /* ++ * We don't know if p has a earlier or later deadline, so let's blindly ++ * set a (maybe not needed) rescheduling point. ++ */ ++ resched_curr(rq); ++#endif + } + + DEFINE_SCHED_CLASS(dl) = { +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 1637b65ba07a..8d64fba16cfe 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -1024,6 +1024,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + __PS("nr_involuntary_switches", p->nivcsw); + + P(se.load.weight); ++ P(se.dur_avg); + #ifdef CONFIG_SMP + P(se.avg.load_sum); + P(se.avg.runnable_sum); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 86a988c830ef..b38a1ce1be49 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1082,6 +1082,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) + * Scheduling class queueing methods: + */ + ++static inline bool is_core_idle(int cpu) ++{ ++#ifdef CONFIG_SCHED_SMT ++ int sibling; ++ ++ for_each_cpu(sibling, cpu_smt_mask(cpu)) { ++ if (cpu == sibling) ++ continue; ++ ++ if (!idle_cpu(sibling)) ++ return false; ++ } ++#endif ++ ++ return true; ++} ++ + #ifdef CONFIG_NUMA + #define NUMA_IMBALANCE_MIN 2 + +@@ -1718,23 +1735,6 @@ struct numa_stats { + int idle_cpu; + }; + +-static inline bool is_core_idle(int cpu) +-{ +-#ifdef CONFIG_SCHED_SMT +- int sibling; +- +- for_each_cpu(sibling, cpu_smt_mask(cpu)) { +- if (cpu == sibling) +- continue; +- +- if (!idle_cpu(sibling)) +- return false; +- } +-#endif +- +- return true; +-} +- + struct task_numa_env { + struct task_struct *p; + +@@ -4494,17 +4494,9 @@ static inline int util_fits_cpu(unsigned long util, + * + * For uclamp_max, we can tolerate a drop in performance level as the + * goal is to cap the task. 
So it's okay if it's getting less. +- * +- * In case of capacity inversion we should honour the inverted capacity +- * for both uclamp_min and uclamp_max all the time. + */ +- capacity_orig = cpu_in_capacity_inversion(cpu); +- if (capacity_orig) { +- capacity_orig_thermal = capacity_orig; +- } else { +- capacity_orig = capacity_orig_of(cpu); +- capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); +- } ++ capacity_orig = capacity_orig_of(cpu); ++ capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); + + /* + * We want to force a task to fit a cpu as implied by uclamp_max. +@@ -4579,8 +4571,8 @@ static inline int util_fits_cpu(unsigned long util, + * handle the case uclamp_min > uclamp_max. + */ + uclamp_min = min(uclamp_min, uclamp_max); +- if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE) +- fits = fits && (uclamp_min <= capacity_orig_thermal); ++ if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) ++ return -1; + + return fits; + } +@@ -4590,7 +4582,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu) + unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); + unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); + unsigned long util = task_util_est(p); +- return util_fits_cpu(util, uclamp_min, uclamp_max, cpu); ++ /* ++ * Return true only if the cpu fully fits the task requirements, which ++ * include the utilization but also the performance hints. ++ */ ++ return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0); + } + + static inline void update_misfit_status(struct task_struct *p, struct rq *rq) +@@ -4674,6 +4670,7 @@ static void + place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) + { + u64 vruntime = cfs_rq->min_vruntime; ++ u64 sleep_time; + + /* + * The 'current' period is already promised to the current tasks, +@@ -4703,8 +4700,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) + vruntime -= thresh; + } + +- /* ensure we never gain time by being placed backwards. */ +- se->vruntime = max_vruntime(se->vruntime, vruntime); ++ /* ++ * Pull vruntime of the entity being placed to the base level of ++ * cfs_rq, to prevent boosting it if placed backwards. If the entity ++ * slept for a long time, don't even try to compare its vruntime with ++ * the base as it may be too far off and the comparison may get ++ * inversed due to s64 overflow. ++ */ ++ sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; ++ if ((s64)sleep_time > 60LL * NSEC_PER_SEC) ++ se->vruntime = vruntime; ++ else ++ se->vruntime = max_vruntime(se->vruntime, vruntime); + } + + static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +@@ -4914,7 +4921,13 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + struct sched_entity *se; + s64 delta; + +- ideal_runtime = sched_slice(cfs_rq, curr); ++ /* ++ * When many tasks blow up the sched_period; it is possible that ++ * sched_slice() reports unusually large results (when many tasks are ++ * very light for example). Therefore impose a maximum. 
++ */ ++ ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); ++ + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) { + resched_curr(rq_of(cfs_rq)); +@@ -5479,22 +5492,105 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) + resched_curr(rq); + } + +-static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) ++#ifdef CONFIG_SMP ++static void __cfsb_csd_unthrottle(void *arg) + { +- struct cfs_rq *cfs_rq; ++ struct cfs_rq *cursor, *tmp; ++ struct rq *rq = arg; ++ struct rq_flags rf; ++ ++ rq_lock(rq, &rf); ++ ++ /* ++ * Since we hold rq lock we're safe from concurrent manipulation of ++ * the CSD list. However, this RCU critical section annotates the ++ * fact that we pair with sched_free_group_rcu(), so that we cannot ++ * race with group being freed in the window between removing it ++ * from the list and advancing to the next entry in the list. ++ */ ++ rcu_read_lock(); ++ ++ list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list, ++ throttled_csd_list) { ++ list_del_init(&cursor->throttled_csd_list); ++ ++ if (cfs_rq_throttled(cursor)) ++ unthrottle_cfs_rq(cursor); ++ } ++ ++ rcu_read_unlock(); ++ ++ rq_unlock(rq, &rf); ++} ++ ++static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) ++{ ++ struct rq *rq = rq_of(cfs_rq); ++ bool first; ++ ++ if (rq == this_rq()) { ++ unthrottle_cfs_rq(cfs_rq); ++ return; ++ } ++ ++ /* Already enqueued */ ++ if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) ++ return; ++ ++ first = list_empty(&rq->cfsb_csd_list); ++ list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list); ++ if (first) ++ smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); ++} ++#else ++static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) ++{ ++ unthrottle_cfs_rq(cfs_rq); ++} ++#endif ++ ++static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) ++{ ++ lockdep_assert_rq_held(rq_of(cfs_rq)); ++ ++ if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || ++ cfs_rq->runtime_remaining <= 0)) ++ return; ++ ++ __unthrottle_cfs_rq_async(cfs_rq); ++} ++ ++static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) ++{ ++ struct cfs_rq *local_unthrottle = NULL; ++ int this_cpu = smp_processor_id(); + u64 runtime, remaining = 1; ++ bool throttled = false; ++ struct cfs_rq *cfs_rq; ++ struct rq_flags rf; ++ struct rq *rq; + + rcu_read_lock(); + list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, + throttled_list) { +- struct rq *rq = rq_of(cfs_rq); +- struct rq_flags rf; ++ rq = rq_of(cfs_rq); ++ ++ if (!remaining) { ++ throttled = true; ++ break; ++ } + + rq_lock_irqsave(rq, &rf); + if (!cfs_rq_throttled(cfs_rq)) + goto next; + +- /* By the above check, this should never be true */ ++#ifdef CONFIG_SMP ++ /* Already queued for async unthrottle */ ++ if (!list_empty(&cfs_rq->throttled_csd_list)) ++ goto next; ++#endif ++ ++ /* By the above checks, this should never be true */ + SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + + raw_spin_lock(&cfs_b->lock); +@@ -5508,16 +5604,30 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) + cfs_rq->runtime_remaining += runtime; + + /* we check whether we're throttled above */ +- if (cfs_rq->runtime_remaining > 0) +- unthrottle_cfs_rq(cfs_rq); ++ if (cfs_rq->runtime_remaining > 0) { ++ if (cpu_of(rq) != this_cpu || ++ SCHED_WARN_ON(local_unthrottle)) ++ unthrottle_cfs_rq_async(cfs_rq); ++ else ++ local_unthrottle = cfs_rq; ++ } else { ++ throttled = true; ++ } + + next: + rq_unlock_irqrestore(rq, &rf); +- +- if 
(!remaining) +- break; + } + rcu_read_unlock(); ++ ++ if (local_unthrottle) { ++ rq = cpu_rq(this_cpu); ++ rq_lock_irqsave(rq, &rf); ++ if (cfs_rq_throttled(local_unthrottle)) ++ unthrottle_cfs_rq(local_unthrottle); ++ rq_unlock_irqrestore(rq, &rf); ++ } ++ ++ return throttled; + } + + /* +@@ -5562,10 +5672,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u + while (throttled && cfs_b->runtime > 0) { + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); + /* we can't nest cfs_b->lock while distributing bandwidth */ +- distribute_cfs_runtime(cfs_b); ++ throttled = distribute_cfs_runtime(cfs_b); + raw_spin_lock_irqsave(&cfs_b->lock, flags); +- +- throttled = !list_empty(&cfs_b->throttled_cfs_rq); + } + + /* +@@ -5842,6 +5950,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) + { + cfs_rq->runtime_enabled = 0; + INIT_LIST_HEAD(&cfs_rq->throttled_list); ++#ifdef CONFIG_SMP ++ INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); ++#endif + } + + void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +@@ -5858,12 +5969,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) + + static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) + { ++ int __maybe_unused i; ++ + /* init_cfs_bandwidth() was not called */ + if (!cfs_b->throttled_cfs_rq.next) + return; + + hrtimer_cancel(&cfs_b->period_timer); + hrtimer_cancel(&cfs_b->slack_timer); ++ ++ /* ++ * It is possible that we still have some cfs_rq's pending on a CSD ++ * list, though this race is very rare. In order for this to occur, we ++ * must have raced with the last task leaving the group while there ++ * exist throttled cfs_rq(s), and the period_timer must have queued the ++ * CSD item but the remote cpu has not yet processed it. To handle this, ++ * we can simply flush all pending CSD work inline here. We're ++ * guaranteed at this point that no additional cfs_rq of this group can ++ * join a CSD list. ++ */ ++#ifdef CONFIG_SMP ++ for_each_possible_cpu(i) { ++ struct rq *rq = cpu_rq(i); ++ unsigned long flags; ++ ++ if (list_empty(&rq->cfsb_csd_list)) ++ continue; ++ ++ local_irq_save(flags); ++ __cfsb_csd_unthrottle(rq); ++ local_irq_restore(flags); ++ } ++#endif + } + + /* +@@ -6026,6 +6163,7 @@ static inline bool cpu_overutilized(int cpu) + unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); + unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + ++ /* Return true only if the utilization doesn't fit CPU's capacity */ + return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); + } + +@@ -6159,6 +6297,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + + static void set_next_buddy(struct sched_entity *se); + ++static inline void dur_avg_update(struct task_struct *p, bool task_sleep) ++{ ++ u64 dur; ++ ++ if (!task_sleep) ++ return; ++ ++ dur = p->se.sum_exec_runtime - p->se.prev_sleep_sum_runtime; ++ p->se.prev_sleep_sum_runtime = p->se.sum_exec_runtime; ++ update_avg(&p->se.dur_avg, dur); ++} ++ + /* + * The dequeue_task method is called before nr_running is + * decreased. 
We remove the task from the rbtree and +@@ -6231,6 +6381,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + + dequeue_throttle: + util_est_update(&rq->cfs, p, task_sleep); ++ dur_avg_update(p, task_sleep); + hrtick_update(rq); + } + +@@ -6364,6 +6515,20 @@ static int wake_wide(struct task_struct *p) + return 1; + } + ++/* ++ * If a task switches in and then voluntarily relinquishes the ++ * CPU quickly, it is regarded as a short duration task. ++ * ++ * SIS_SHORT tries to wake up the short wakee on current CPU. This ++ * aims to avoid race condition among CPUs due to frequent context ++ * switch. ++ */ ++static inline int is_short_task(struct task_struct *p) ++{ ++ return sched_feat(SIS_SHORT) && p->se.dur_avg && ++ ((p->se.dur_avg * 8) < sysctl_sched_min_granularity); ++} ++ + /* + * The purpose of wake_affine() is to quickly determine on which CPU we can run + * soonest. For the purpose of speed we only consider the waking and previous +@@ -6400,6 +6565,11 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) + if (available_idle_cpu(prev_cpu)) + return prev_cpu; + ++ /* The only running task is a short duration one. */ ++ if (cpu_rq(this_cpu)->nr_running == 1 && ++ is_short_task(rcu_dereference(cpu_curr(this_cpu)))) ++ return this_cpu; ++ + return nr_cpumask_bits; + } + +@@ -6774,6 +6944,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool + /* overloaded LLC is unlikely to have idle cpu/core */ + if (nr == 1) + return -1; ++ ++ if (!has_idle_core && this == target && ++ (5 * nr < 3 * sd->span_weight) && ++ cpu_rq(target)->nr_running <= 1 && ++ is_short_task(p) && ++ is_short_task(rcu_dereference(cpu_curr(target)))) ++ return target; + } + } + +@@ -6819,6 +6996,7 @@ static int + select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) + { + unsigned long task_util, util_min, util_max, best_cap = 0; ++ int fits, best_fits = 0; + int cpu, best_cpu = -1; + struct cpumask *cpus; + +@@ -6834,12 +7012,28 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) + + if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) + continue; +- if (util_fits_cpu(task_util, util_min, util_max, cpu)) ++ ++ fits = util_fits_cpu(task_util, util_min, util_max, cpu); ++ ++ /* This CPU fits with all requirements */ ++ if (fits > 0) + return cpu; ++ /* ++ * Only the min performance hint (i.e. uclamp_min) doesn't fit. ++ * Look for the CPU with best capacity. ++ */ ++ else if (fits < 0) ++ cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu)); + +- if (cpu_cap > best_cap) { ++ /* ++ * First, select CPU which fits better (-1 being better than 0). ++ * Then, select the one with best capacity at same level. ++ */ ++ if ((fits < best_fits) || ++ ((fits == best_fits) && (cpu_cap > best_cap))) { + best_cap = cpu_cap; + best_cpu = cpu; ++ best_fits = fits; + } + } + +@@ -6852,7 +7046,11 @@ static inline bool asym_fits_cpu(unsigned long util, + int cpu) + { + if (sched_asym_cpucap_active()) +- return util_fits_cpu(util, util_min, util_max, cpu); ++ /* ++ * Return true only if the cpu fully fits the task requirements ++ * which include the utilization and the performance hints. ++ */ ++ return (util_fits_cpu(util, util_min, util_max, cpu) > 0); + + return true; + } +@@ -7219,6 +7417,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + unsigned long p_util_max = uclamp_is_used() ? 
uclamp_eff_value(p, UCLAMP_MAX) : 1024; + struct root_domain *rd = this_rq()->rd; + int cpu, best_energy_cpu, target = -1; ++ int prev_fits = -1, best_fits = -1; ++ unsigned long best_thermal_cap = 0; ++ unsigned long prev_thermal_cap = 0; + struct sched_domain *sd; + struct perf_domain *pd; + struct energy_env eenv; +@@ -7254,6 +7455,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + unsigned long prev_spare_cap = 0; + int max_spare_cap_cpu = -1; + unsigned long base_energy; ++ int fits, max_fits = -1; + + cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); + +@@ -7303,7 +7505,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + util_min = max(rq_util_min, p_util_min); + util_max = max(rq_util_max, p_util_max); + } +- if (!util_fits_cpu(util, util_min, util_max, cpu)) ++ ++ fits = util_fits_cpu(util, util_min, util_max, cpu); ++ if (!fits) + continue; + + lsub_positive(&cpu_cap, util); +@@ -7311,7 +7515,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + if (cpu == prev_cpu) { + /* Always use prev_cpu as a candidate. */ + prev_spare_cap = cpu_cap; +- } else if (cpu_cap > max_spare_cap) { ++ prev_fits = fits; ++ } else if ((fits > max_fits) || ++ ((fits == max_fits) && (cpu_cap > max_spare_cap))) { + /* + * Find the CPU with the maximum spare capacity + * among the remaining CPUs in the performance +@@ -7319,6 +7525,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + */ + max_spare_cap = cpu_cap; + max_spare_cap_cpu = cpu; ++ max_fits = fits; + } + } + +@@ -7337,26 +7544,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + if (prev_delta < base_energy) + goto unlock; + prev_delta -= base_energy; ++ prev_thermal_cap = cpu_thermal_cap; + best_delta = min(best_delta, prev_delta); + } + + /* Evaluate the energy impact of using max_spare_cap_cpu. */ + if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) { ++ /* Current best energy cpu fits better */ ++ if (max_fits < best_fits) ++ continue; ++ ++ /* ++ * Both don't fit performance hint (i.e. uclamp_min) ++ * but best energy cpu has better capacity. ++ */ ++ if ((max_fits < 0) && ++ (cpu_thermal_cap <= best_thermal_cap)) ++ continue; ++ + cur_delta = compute_energy(&eenv, pd, cpus, p, + max_spare_cap_cpu); + /* CPU utilization has changed */ + if (cur_delta < base_energy) + goto unlock; + cur_delta -= base_energy; +- if (cur_delta < best_delta) { +- best_delta = cur_delta; +- best_energy_cpu = max_spare_cap_cpu; +- } ++ ++ /* ++ * Both fit for the task but best energy cpu has lower ++ * energy impact. 
++ */ ++ if ((max_fits > 0) && (best_fits > 0) && ++ (cur_delta >= best_delta)) ++ continue; ++ ++ best_delta = cur_delta; ++ best_energy_cpu = max_spare_cap_cpu; ++ best_fits = max_fits; ++ best_thermal_cap = cpu_thermal_cap; + } + } + rcu_read_unlock(); + +- if (best_delta < prev_delta) ++ if ((best_fits > prev_fits) || ++ ((best_fits > 0) && (best_delta < prev_delta)) || ++ ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap))) + target = best_energy_cpu; + + return target; +@@ -8856,82 +9087,16 @@ static unsigned long scale_rt_capacity(int cpu) + + static void update_cpu_capacity(struct sched_domain *sd, int cpu) + { +- unsigned long capacity_orig = arch_scale_cpu_capacity(cpu); + unsigned long capacity = scale_rt_capacity(cpu); + struct sched_group *sdg = sd->groups; +- struct rq *rq = cpu_rq(cpu); + +- rq->cpu_capacity_orig = capacity_orig; ++ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); + + if (!capacity) + capacity = 1; + +- rq->cpu_capacity = capacity; +- +- /* +- * Detect if the performance domain is in capacity inversion state. +- * +- * Capacity inversion happens when another perf domain with equal or +- * lower capacity_orig_of() ends up having higher capacity than this +- * domain after subtracting thermal pressure. +- * +- * We only take into account thermal pressure in this detection as it's +- * the only metric that actually results in *real* reduction of +- * capacity due to performance points (OPPs) being dropped/become +- * unreachable due to thermal throttling. +- * +- * We assume: +- * * That all cpus in a perf domain have the same capacity_orig +- * (same uArch). +- * * Thermal pressure will impact all cpus in this perf domain +- * equally. +- */ +- if (sched_energy_enabled()) { +- unsigned long inv_cap = capacity_orig - thermal_load_avg(rq); +- struct perf_domain *pd; +- +- rcu_read_lock(); +- +- pd = rcu_dereference(rq->rd->pd); +- rq->cpu_capacity_inverted = 0; +- +- for (; pd; pd = pd->next) { +- struct cpumask *pd_span = perf_domain_span(pd); +- unsigned long pd_cap_orig, pd_cap; +- +- /* We can't be inverted against our own pd */ +- if (cpumask_test_cpu(cpu_of(rq), pd_span)) +- continue; +- +- cpu = cpumask_any(pd_span); +- pd_cap_orig = arch_scale_cpu_capacity(cpu); +- +- if (capacity_orig < pd_cap_orig) +- continue; +- +- /* +- * handle the case of multiple perf domains have the +- * same capacity_orig but one of them is under higher +- * thermal pressure. We record it as capacity +- * inversion. +- */ +- if (capacity_orig == pd_cap_orig) { +- pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu)); +- +- if (pd_cap > inv_cap) { +- rq->cpu_capacity_inverted = inv_cap; +- break; +- } +- } else if (pd_cap_orig > inv_cap) { +- rq->cpu_capacity_inverted = inv_cap; +- break; +- } +- } +- +- rcu_read_unlock(); +- } +- +- trace_sched_cpu_capacity_tp(rq); ++ cpu_rq(cpu)->cpu_capacity = capacity; ++ trace_sched_cpu_capacity_tp(cpu_rq(cpu)); + + sdg->sgc->capacity = capacity; + sdg->sgc->min_capacity = capacity; +@@ -9135,20 +9300,15 @@ group_type group_classify(unsigned int imbalance_pct, + * @sgs: Load-balancing statistics of the candidate busiest group + * @sg: The candidate busiest group + * +- * Check the state of the SMT siblings of both @sds::local and @sg and decide +- * if @dst_cpu can pull tasks. ++ * Check the state of the SMT siblings of @sg and decide if @dst_cpu can pull ++ * tasks. + * +- * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of +- * the SMT siblings of @sg are busy. 
If only one CPU in @sg is busy, pull tasks +- * only if @dst_cpu has higher priority. ++ * This function must be called only if all the SMT siblings of @dst_cpu are ++ * idle, if any. + * +- * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more +- * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority. +- * Bigger imbalances in the number of busy CPUs will be dealt with in +- * update_sd_pick_busiest(). +- * +- * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings +- * of @dst_cpu are idle and @sg has lower priority. ++ * @dst_cpu can pull tasks if @sg has exactly one busy CPU (i.e., one more than ++ * @sds::local) and has lower group priority than @sds::local. Bigger imbalances ++ * in the number of busy CPUs will be dealt with in find_busiest_group(). + * + * Return: true if @dst_cpu can pull tasks, false otherwise. + */ +@@ -9157,51 +9317,16 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, + struct sched_group *sg) + { + #ifdef CONFIG_SCHED_SMT +- bool local_is_smt, sg_is_smt; + int sg_busy_cpus; + +- local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; +- sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY; +- + sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; + +- if (!local_is_smt) { +- /* +- * If we are here, @dst_cpu is idle and does not have SMT +- * siblings. Pull tasks if candidate group has two or more +- * busy CPUs. +- */ +- if (sg_busy_cpus >= 2) /* implies sg_is_smt */ +- return true; +- +- /* +- * @dst_cpu does not have SMT siblings. @sg may have SMT +- * siblings and only one is busy. In such case, @dst_cpu +- * can help if it has higher priority and is idle (i.e., +- * it has no running tasks). +- */ +- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); +- } +- +- /* @dst_cpu has SMT siblings. */ +- +- if (sg_is_smt) { +- int local_busy_cpus = sds->local->group_weight - +- sds->local_stat.idle_cpus; +- int busy_cpus_delta = sg_busy_cpus - local_busy_cpus; +- +- if (busy_cpus_delta == 1) +- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); +- +- return false; +- } +- + /* +- * @sg does not have SMT siblings. Ensure that @sds::local does not end +- * up with more than one busy SMT sibling and only pull tasks if there +- * are not busy CPUs (i.e., no CPU has running tasks). ++ * If the difference in the number of busy CPUs is two or more, let ++ * find_busiest_group() take care of it. We only care if @sg has ++ * exactly one busy CPU. This covers SMT and non-SMT sched groups. + */ +- if (!sds->local_stat.sum_nr_running) ++ if (sg_busy_cpus == 1) + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + + return false; +@@ -9215,7 +9340,14 @@ static inline bool + sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, + struct sched_group *group) + { +- /* Only do SMT checks if either local or candidate have SMT siblings */ ++ /* ++ * If the destination CPU has SMT siblings, env->idle != CPU_NOT_IDLE ++ * is not sufficient. We need to make sure the whole core is idle. ++ */ ++ if (sds->local->flags & SD_SHARE_CPUCAPACITY && !is_core_idle(env->dst_cpu)) ++ return false; ++ ++ /* Only do SMT checks if either local or candidate have SMT siblings. */ + if ((sds->local->flags & SD_SHARE_CPUCAPACITY) || + (group->flags & SD_SHARE_CPUCAPACITY)) + return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); +@@ -9408,10 +9540,22 @@ static bool update_sd_pick_busiest(struct lb_env *env, + * contention when accessing shared HW resources. 
+ * + * XXX for now avg_load is not computed and always 0 so we +- * select the 1st one. ++ * select the 1st one, except if @sg is composed of SMT ++ * siblings. + */ +- if (sgs->avg_load <= busiest->avg_load) ++ ++ if (sgs->avg_load < busiest->avg_load) + return false; ++ ++ if (sgs->avg_load == busiest->avg_load) { ++ /* ++ * SMT sched groups need more help than non-SMT groups. ++ * If @sg happens to also be SMT, either choice is good. ++ */ ++ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) ++ return false; ++ } ++ + break; + + case group_has_spare: +@@ -9886,7 +10030,6 @@ static void update_idle_cpu_scan(struct lb_env *env, + + static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) + { +- struct sched_domain *child = env->sd->child; + struct sched_group *sg = env->sd->groups; + struct sg_lb_stats *local = &sds->local_stat; + struct sg_lb_stats tmp_sgs; +@@ -9927,9 +10070,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd + sg = sg->next; + } while (sg != env->sd->groups); + +- /* Tag domain that child domain prefers tasks go to siblings first */ +- sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; +- ++ /* ++ * Tag domain that @env::sd prefers to spread excess tasks among ++ * sibling sched groups. ++ */ ++ sds->prefer_sibling = env->sd->flags & SD_PREFER_SIBLING; + + if (env->sd->flags & SD_NUMA) + env->fbq_type = fbq_classify_group(&sds->busiest_stat); +@@ -10159,24 +10304,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env) + */ + update_sd_lb_stats(env, &sds); + +- if (sched_energy_enabled()) { +- struct root_domain *rd = env->dst_rq->rd; +- +- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) +- goto out_balanced; +- } +- +- local = &sds.local_stat; +- busiest = &sds.busiest_stat; +- + /* There is no busy sibling group to pull tasks from */ + if (!sds.busiest) + goto out_balanced; + ++ busiest = &sds.busiest_stat; ++ + /* Misfit tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) + goto force_balance; + ++ if (sched_energy_enabled()) { ++ struct root_domain *rd = env->dst_rq->rd; ++ ++ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) ++ goto out_balanced; ++ } ++ + /* ASYM feature bypasses nice load balance check */ + if (busiest->group_type == group_asym_packing) + goto force_balance; +@@ -10189,6 +10333,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) + if (busiest->group_type == group_imbalanced) + goto force_balance; + ++ local = &sds.local_stat; + /* + * If the local group is busier than the selected busiest group + * don't try and pull any tasks. +@@ -10228,7 +10373,6 @@ static struct sched_group *find_busiest_group(struct lb_env *env) + goto out_balanced; + } + +- /* Try to move all excess tasks to child's sibling domain */ + if (sds.prefer_sibling && local->group_type == group_has_spare && + busiest->sum_nr_running > local->sum_nr_running + 1) + goto force_balance; +@@ -10330,11 +10474,20 @@ static struct rq *find_busiest_queue(struct lb_env *env, + nr_running == 1) + continue; + +- /* Make sure we only pull tasks from a CPU of lower priority */ ++ /* ++ * Make sure we only pull tasks from a CPU of lower priority ++ * when balancing between SMT siblings. ++ * ++ * If balancing between cores, let lower priority CPUs help ++ * SMT cores with more than one busy sibling. 
++ */ + if ((env->sd->flags & SD_ASYM_PACKING) && + sched_asym_prefer(i, env->dst_cpu) && +- nr_running == 1) +- continue; ++ nr_running == 1) { ++ if (env->sd->flags & SD_SHARE_CPUCAPACITY || ++ (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && is_core_idle(i))) ++ continue; ++ } + + switch (env->migration_type) { + case migrate_load: +@@ -10424,8 +10577,20 @@ asym_active_balance(struct lb_env *env) + * lower priority CPUs in order to pack all tasks in the + * highest priority CPUs. + */ +- return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && +- sched_asym_prefer(env->dst_cpu, env->src_cpu); ++ if (env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING)) { ++ /* Always obey priorities between SMT siblings. */ ++ if (env->sd->flags & SD_SHARE_CPUCAPACITY) ++ return sched_asym_prefer(env->dst_cpu, env->src_cpu); ++ ++ /* ++ * A lower priority CPU can help an SMT core with more than one ++ * busy sibling. ++ */ ++ return sched_asym_prefer(env->dst_cpu, env->src_cpu) || ++ !is_core_idle(env->src_cpu); ++ } ++ ++ return false; + } + + static inline bool +@@ -11162,8 +11327,17 @@ static void nohz_balancer_kick(struct rq *rq) + */ + for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { + if (sched_asym_prefer(i, cpu)) { +- flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; +- goto unlock; ++ /* ++ * Always do ASYM_PACKING balance in the SMT ++ * domain. In upper domains, the core must be ++ * fully idle. ++ */ ++ if (sd->flags & SD_SHARE_CPUCAPACITY || ++ (!(sd->flags & SD_SHARE_CPUCAPACITY) && ++ is_core_idle(i))) { ++ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; ++ goto unlock; ++ } + } + } + } +@@ -12498,6 +12672,11 @@ __init void init_sched_fair_class(void) + for_each_possible_cpu(i) { + zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); + zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); ++ ++#ifdef CONFIG_CFS_BANDWIDTH ++ INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i)); ++ INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list); ++#endif + } + + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index ee7f23c76bd3..efdc29c42161 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -62,6 +62,7 @@ SCHED_FEAT(TTWU_QUEUE, true) + */ + SCHED_FEAT(SIS_PROP, false) + SCHED_FEAT(SIS_UTIL, true) ++SCHED_FEAT(SIS_SHORT, true) + + /* + * Issue a WARN when we do multiple update_rq_clock() calls +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c +index 0f310768260c..036b0e2cd2b4 100644 +--- a/kernel/sched/pelt.c ++++ b/kernel/sched/pelt.c +@@ -467,3 +467,63 @@ int update_irq_load_avg(struct rq *rq, u64 running) + return ret; + } + #endif ++ ++__read_mostly unsigned int sched_pelt_lshift; ++ ++#ifdef CONFIG_SYSCTL ++static unsigned int sysctl_sched_pelt_multiplier = 1; ++ ++int sched_pelt_multiplier(struct ctl_table *table, int write, void *buffer, ++ size_t *lenp, loff_t *ppos) ++{ ++ static DEFINE_MUTEX(mutex); ++ unsigned int old; ++ int ret; ++ ++ mutex_lock(&mutex); ++ old = sysctl_sched_pelt_multiplier; ++ ret = proc_dointvec(table, write, buffer, lenp, ppos); ++ if (ret) ++ goto undo; ++ if (!write) ++ goto done; ++ ++ switch (sysctl_sched_pelt_multiplier) { ++ case 1: ++ fallthrough; ++ case 2: ++ fallthrough; ++ case 4: ++ WRITE_ONCE(sched_pelt_lshift, ++ sysctl_sched_pelt_multiplier >> 1); ++ goto done; ++ default: ++ ret = -EINVAL; ++ } ++ ++undo: ++ sysctl_sched_pelt_multiplier = old; ++done: ++ 
mutex_unlock(&mutex); ++ ++ return ret; ++} ++ ++static struct ctl_table sched_pelt_sysctls[] = { ++ { ++ .procname = "sched_pelt_multiplier", ++ .data = &sysctl_sched_pelt_multiplier, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = sched_pelt_multiplier, ++ }, ++ {} ++}; ++ ++static int __init sched_pelt_sysctl_init(void) ++{ ++ register_sysctl_init("kernel", sched_pelt_sysctls); ++ return 0; ++} ++late_initcall(sched_pelt_sysctl_init); ++#endif +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h +index 3a0e0dc28721..9b35b5072bae 100644 +--- a/kernel/sched/pelt.h ++++ b/kernel/sched/pelt.h +@@ -61,6 +61,14 @@ static inline void cfs_se_util_change(struct sched_avg *avg) + WRITE_ONCE(avg->util_est.enqueued, enqueued); + } + ++static inline u64 rq_clock_task_mult(struct rq *rq) ++{ ++ lockdep_assert_rq_held(rq); ++ assert_clock_updated(rq); ++ ++ return rq->clock_task_mult; ++} ++ + static inline u64 rq_clock_pelt(struct rq *rq) + { + lockdep_assert_rq_held(rq); +@@ -72,7 +80,7 @@ static inline u64 rq_clock_pelt(struct rq *rq) + /* The rq is idle, we can sync to clock_task */ + static inline void _update_idle_rq_clock_pelt(struct rq *rq) + { +- rq->clock_pelt = rq_clock_task(rq); ++ rq->clock_pelt = rq_clock_task_mult(rq); + + u64_u32_store(rq->clock_idle, rq_clock(rq)); + /* Paired with smp_rmb in migrate_se_pelt_lag() */ +@@ -121,6 +129,27 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) + rq->clock_pelt += delta; + } + ++extern unsigned int sched_pelt_lshift; ++ ++/* ++ * absolute time |1 |2 |3 |4 |5 |6 | ++ * @ mult = 1 --------****************--------****************- ++ * @ mult = 2 --------********----------------********--------- ++ * @ mult = 4 --------****--------------------****------------- ++ * clock task mult ++ * @ mult = 2 | | |2 |3 | | | | |5 |6 | | | ++ * @ mult = 4 | | | | |2|3| | | | | | | | | | |5|6| | | | | | | ++ * ++ */ ++static inline void update_rq_clock_task_mult(struct rq *rq, s64 delta) ++{ ++ delta <<= READ_ONCE(sched_pelt_lshift); ++ ++ rq->clock_task_mult += delta; ++ ++ update_rq_clock_pelt(rq, delta); ++} ++ + /* + * When rq becomes idle, we have to check if it has lost idle time + * because it was fully busy. A rq is fully used when the /Sum util_sum +@@ -147,7 +176,7 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq) + * rq's clock_task. 
+ */ + if (util_sum >= divider) +- rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt; ++ rq->lost_idle_time += rq_clock_task_mult(rq) - rq->clock_pelt; + + _update_idle_rq_clock_pelt(rq); + } +@@ -218,13 +247,18 @@ update_irq_load_avg(struct rq *rq, u64 running) + return 0; + } + +-static inline u64 rq_clock_pelt(struct rq *rq) ++static inline u64 rq_clock_task_mult(struct rq *rq) + { + return rq_clock_task(rq); + } + ++static inline u64 rq_clock_pelt(struct rq *rq) ++{ ++ return rq_clock_task_mult(rq); ++} ++ + static inline void +-update_rq_clock_pelt(struct rq *rq, s64 delta) { } ++update_rq_clock_task_mult(struct rq *rq, s64 delta) { } + + static inline void + update_idle_rq_clock_pelt(struct rq *rq) { } +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 771f8ddb7053..9e8bb6278604 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -645,6 +645,9 @@ struct cfs_rq { + int throttled; + int throttle_count; + struct list_head throttled_list; ++#ifdef CONFIG_SMP ++ struct list_head throttled_csd_list; ++#endif + #endif /* CONFIG_CFS_BANDWIDTH */ + #endif /* CONFIG_FAIR_GROUP_SCHED */ + }; +@@ -1015,6 +1018,7 @@ struct rq { + u64 clock; + /* Ensure that all clocks are in the same cache line */ + u64 clock_task ____cacheline_aligned; ++ u64 clock_task_mult; + u64 clock_pelt; + unsigned long lost_idle_time; + u64 clock_pelt_idle; +@@ -1041,7 +1045,6 @@ struct rq { + + unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; +- unsigned long cpu_capacity_inverted; + + struct balance_callback *balance_callback; + +@@ -1154,6 +1157,11 @@ struct rq { + + /* Scratch cpumask to be temporarily used under rq_lock */ + cpumask_var_t scratch_mask; ++ ++#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP) ++ call_single_data_t cfsb_csd; ++ struct list_head cfsb_csd_list; ++#endif + }; + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -2893,24 +2901,6 @@ static inline unsigned long capacity_orig_of(int cpu) + return cpu_rq(cpu)->cpu_capacity_orig; + } + +-/* +- * Returns inverted capacity if the CPU is in capacity inversion state. +- * 0 otherwise. +- * +- * Capacity inversion detection only considers thermal impact where actual +- * performance points (OPPs) gets dropped. +- * +- * Capacity inversion state happens when another performance domain that has +- * equal or lower capacity_orig_of() becomes effectively larger than the perf +- * domain this CPU belongs to due to thermal pressure throttling it hard. +- * +- * See comment in update_cpu_capacity(). 
+- */ +-static inline unsigned long cpu_in_capacity_inversion(int cpu) +-{ +- return cpu_rq(cpu)->cpu_capacity_inverted; +-} +- + /** + * enum cpu_util_type - CPU utilization type + * @FREQUENCY_UTIL: Utilization used to select frequency +-- +2.39.2 + +From a98da743d79741ac811bca0a2704902a27604768 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 6 Feb 2023 09:53:13 +0100 +Subject: [PATCH 14/15] zram + +Signed-off-by: Peter Jung +--- + Documentation/admin-guide/blockdev/zram.rst | 2 + + drivers/block/zram/zram_drv.c | 319 +++++++++++++++++++- + drivers/block/zram/zram_drv.h | 7 + + 3 files changed, 322 insertions(+), 6 deletions(-) + +diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst +index e4551579cb12..a1dd202efca1 100644 +--- a/Documentation/admin-guide/blockdev/zram.rst ++++ b/Documentation/admin-guide/blockdev/zram.rst +@@ -209,6 +209,7 @@ compact WO trigger memory compaction + debug_stat RO this file is used for zram debugging purposes + backing_dev RW set up backend storage for zram to write out + idle WO mark allocated slot as idle ++merge WO trigger merge identical pages + ====================== ====== =============================================== + + +@@ -267,6 +268,7 @@ line of text and contains the following stats separated by whitespace: + pages_compacted the number of pages freed during compaction + huge_pages the number of incompressible pages + huge_pages_since the number of incompressible pages since zram set up ++ pages_merged the number of identical pages merged into single one + ================ ============================================================= + + File /sys/block/zram/bd_stat +diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c +index e290d6d97047..084f8f830bde 100644 +--- a/drivers/block/zram/zram_drv.c ++++ b/drivers/block/zram/zram_drv.c +@@ -33,12 +33,15 @@ + #include + #include + #include ++#include ++#include + + #include "zram_drv.h" + + static DEFINE_IDR(zram_index_idr); + /* idr index must be protected */ + static DEFINE_MUTEX(zram_index_mutex); ++static DEFINE_MUTEX(zram_rbtree_mutex); + + static int zram_major; + static const char *default_compressor = CONFIG_ZRAM_DEF_COMP; +@@ -57,6 +60,16 @@ static void zram_free_page(struct zram *zram, size_t index); + static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset, struct bio *bio); + ++struct zram_rbtree_node { ++ struct rb_node node; ++ unsigned long key; ++ unsigned long cnt; ++}; ++ ++struct zram_hash_node { ++ unsigned long index; ++ struct hlist_node next; ++}; + + static int zram_slot_trylock(struct zram *zram, u32 index) + { +@@ -1140,7 +1153,7 @@ static ssize_t recomp_algorithm_store(struct device *dev, + while (*args) { + args = next_arg(args, ¶m, &val); + +- if (!*val) ++ if (!val || !*val) + return -EINVAL; + + if (!strcmp(param, "algo")) { +@@ -1184,6 +1197,30 @@ static ssize_t compact_store(struct device *dev, + return len; + } + ++static int zram_do_scan(struct zram *zram); ++ ++static ssize_t merge_store(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t len) ++{ ++ struct zram *zram = dev_to_zram(dev); ++ int ret; ++ ++ down_read(&zram->init_lock); ++ if (!init_done(zram)) { ++ up_read(&zram->init_lock); ++ return -EINVAL; ++ } ++ ++ ret = zram_do_scan(zram); ++ if (ret != 0) { ++ up_read(&zram->init_lock); ++ return -ENOMEM; ++ } ++ ++ up_read(&zram->init_lock); ++ return len; ++} ++ + static ssize_t io_stat_show(struct device *dev, + 
struct device_attribute *attr, char *buf) + { +@@ -1223,7 +1260,7 @@ static ssize_t mm_stat_show(struct device *dev, + max_used = atomic_long_read(&zram->stats.max_used_pages); + + ret = scnprintf(buf, PAGE_SIZE, +- "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n", ++ "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu %8llu\n", + orig_size << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.compr_data_size), + mem_used << PAGE_SHIFT, +@@ -1232,7 +1269,8 @@ static ssize_t mm_stat_show(struct device *dev, + (u64)atomic64_read(&zram->stats.same_pages), + atomic_long_read(&pool_stats.pages_compacted), + (u64)atomic64_read(&zram->stats.huge_pages), +- (u64)atomic64_read(&zram->stats.huge_pages_since)); ++ (u64)atomic64_read(&zram->stats.huge_pages_since), ++ (u64)atomic64_read(&zram->stats.pages_merged)); + up_read(&zram->init_lock); + + return ret; +@@ -1283,6 +1321,248 @@ static DEVICE_ATTR_RO(bd_stat); + #endif + static DEVICE_ATTR_RO(debug_stat); + ++static bool zram_rbtree_insert(struct rb_root *root, struct zram_rbtree_node *data) ++{ ++ struct rb_node **new = &(root->rb_node), *parent = NULL; ++ struct zram_rbtree_node *this; ++ ++ while (*new) { ++ this = rb_entry(*new, struct zram_rbtree_node, node); ++ parent = *new; ++ if (data->key < this->key) ++ new = &((*new)->rb_left); ++ else if (data->key > this->key) ++ new = &((*new)->rb_right); ++ else ++ return false; ++ } ++ ++ rb_link_node(&data->node, parent, new); ++ rb_insert_color(&data->node, root); ++ return true; ++} ++ ++static struct zram_rbtree_node *zram_rbtree_search(struct rb_root *root, ++ unsigned long key) ++{ ++ struct rb_node *node = root->rb_node; ++ struct zram_rbtree_node *data; ++ ++ while (node) { ++ data = rb_entry(node, struct zram_rbtree_node, node); ++ if (key < data->key) ++ node = node->rb_left; ++ else if (key > data->key) ++ node = node->rb_right; ++ else ++ return data; ++ } ++ ++ return NULL; ++} ++ ++static unsigned long zram_calc_hash(void *src, size_t len) ++{ ++ return xxhash(src, len, 0); ++} ++ ++static int zram_cmp_obj_and_merge(struct zram *zram, struct hlist_head *htable, ++ size_t htable_size, size_t index) ++{ ++ struct zram_rbtree_node *rb_node; ++ struct zram_hash_node *node; ++ unsigned long handle, cur_handle; ++ size_t obj_size; ++ char *src, *buf; ++ unsigned long hash; ++ int ret = 0; ++ ++ handle = zram_get_handle(zram, index); ++ if (!handle) ++ return ret; ++ ++ obj_size = zram_get_obj_size(zram, index); ++ buf = kmalloc(obj_size, GFP_KERNEL); ++ if (!buf) { ++ pr_err("Failed to allocate zs_map_object buffer\n"); ++ return -ENOMEM; ++ } ++ ++ src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); ++ memcpy(buf, src, obj_size); ++ zs_unmap_object(zram->mem_pool, handle); ++ hash = zram_calc_hash(buf, obj_size); ++ ++ mutex_lock(&zram_rbtree_mutex); ++ hlist_for_each_entry(node, &htable[hash % htable_size], next) { ++ int cmp; ++ ++ zram_slot_lock(zram, node->index); ++ ++ /* ++ * Page may change as the hash table is being formed, ++ * so the checks below are necessary. 
++ */ ++ cur_handle = zram_get_handle(zram, node->index); ++ if (handle == cur_handle || ++ obj_size != zram_get_obj_size(zram, node->index)) { ++ zram_slot_unlock(zram, node->index); ++ continue; ++ } ++ ++ src = zs_map_object(zram->mem_pool, cur_handle, ZS_MM_RO); ++ cmp = memcmp(buf, src, obj_size); ++ zs_unmap_object(zram->mem_pool, cur_handle); ++ ++ if (!cmp) { ++ rb_node = zram_rbtree_search(&zram->sph_rbtree, handle); ++ ++ /* ++ * This check is necessary in order not to zs_free an object ++ * that another slot still refers to. This situation is possible ++ * with repeated calls to zram_do_scan(). For example: ++ * ++ * [slot0] [slot1] [slot2] [slot3] [slot4] ++ * [obj0] [obj1] [obj2] [obj3] [obj4] ++ * ++ * Let's imagine that obj2 and obj3 are equal, and we called ++ * the zram_do_scan() function: ++ * ++ * [slot0] [slot1] [slot2] [slot3] [slot4] ++ * [obj0] [obj1] [obj2] [obj2] [obj4] ++ * ++ * Now, slot2 and slot3 refer to the obj2 zsmalloc object. ++ * Time passed, and now slot0 refers to obj0_n, which is equal ++ * to obj2: ++ * ++ * [slot0] [slot1] [slot2] [slot3] [slot4] ++ * [obj0_n] [obj1] [obj2] [obj2] [obj4] ++ * ++ * Now we call the zram_do_scan() function again. We get to slot2, ++ * and we see that the obj2 and obj0_n hashes are the same. We ++ * try to zs_free(obj2), but slot3 still refers to it. ++ * ++ * This is not correct! ++ */ ++ if (unlikely(rb_node)) ++ if (rb_node->cnt > 1) { ++ zram_slot_unlock(zram, node->index); ++ continue; ++ } ++ ++ zram_set_handle(zram, index, cur_handle); ++ zs_free(zram->mem_pool, handle); ++ ++ rb_node = zram_rbtree_search(&zram->sph_rbtree, cur_handle); ++ ++ if (!rb_node) { ++ rb_node = kzalloc(sizeof(struct zram_rbtree_node), ++ GFP_KERNEL); ++ if (!rb_node) { ++ pr_err("Failed to allocate rb_node\n"); ++ ret = -ENOMEM; ++ zram_slot_unlock(zram, node->index); ++ mutex_unlock(&zram_rbtree_mutex); ++ goto merged_or_err; ++ } ++ ++ rb_node->key = cur_handle; ++ /* Two slots refer to the zsmalloc object with key cur_handle */ ++ rb_node->cnt = 2; ++ zram_rbtree_insert(&zram->sph_rbtree, rb_node); ++ } else { ++ rb_node->cnt++; ++ } ++ ++ atomic64_inc(&zram->stats.pages_merged); ++ atomic64_sub(obj_size, &zram->stats.compr_data_size); ++ zram_set_flag(zram, index, ZRAM_MERGED); ++ zram_set_flag(zram, node->index, ZRAM_MERGED); ++ ++ zram_slot_unlock(zram, node->index); ++ mutex_unlock(&zram_rbtree_mutex); ++ goto merged_or_err; ++ } ++ ++ zram_slot_unlock(zram, node->index); ++ } ++ ++ mutex_unlock(&zram_rbtree_mutex); ++ ++ node = kmalloc(sizeof(struct zram_hash_node), GFP_KERNEL); ++ if (!node) { ++ ret = -ENOMEM; ++ goto merged_or_err; ++ } ++ ++ node->index = index; ++ hlist_add_head(&node->next, &htable[hash % htable_size]); ++ ++merged_or_err: ++ kfree(buf); ++ return ret; ++} ++ ++static void zram_free_htable_entries(struct hlist_head *htable, ++ size_t htable_size) ++{ ++ struct hlist_node *n; ++ struct zram_hash_node *node; ++ ++ hlist_for_each_entry_safe(node, n, htable, next) { ++ hlist_del(&node->next); ++ kfree(node); ++ } ++} ++ ++static int zram_do_scan(struct zram *zram) ++{ ++ size_t num_pages = zram->disksize >> PAGE_SHIFT; ++ size_t htable_size = num_pages; ++ size_t index; ++ struct hlist_head *htable; ++ int i, ret = 0; ++ ++ htable = vzalloc(htable_size * sizeof(struct hlist_head)); ++ if (!htable) { ++ pr_err("Failed to allocate hash table\n"); ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < htable_size; i++) ++ INIT_HLIST_HEAD(&htable[i]); ++ ++ for (index = 0; index < num_pages; index++) { ++ zram_slot_lock(zram, 
index); ++ ++ if (!zram_allocated(zram, index)) { ++ zram_slot_unlock(zram, index); ++ continue; ++ } ++ ++ if (zram_test_flag(zram, index, ZRAM_UNDER_WB) || ++ zram_test_flag(zram, index, ZRAM_WB) || ++ zram_test_flag(zram, index, ZRAM_SAME)) { ++ zram_slot_unlock(zram, index); ++ continue; ++ } ++ ++ /* Ignore pages that have been recompressed */ ++ if (zram_get_priority(zram, index) != 0) ++ continue; ++ ++ ret = zram_cmp_obj_and_merge(zram, htable, htable_size, index); ++ zram_slot_unlock(zram, index); ++ if (ret != 0) ++ goto out; ++ } ++ ++out: ++ zram_free_htable_entries(htable, htable_size); ++ vfree(htable); ++ return ret; ++} ++ + static void zram_meta_free(struct zram *zram, u64 disksize) + { + size_t num_pages = disksize >> PAGE_SHIFT; +@@ -1324,6 +1604,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) + static void zram_free_page(struct zram *zram, size_t index) + { + unsigned long handle; ++ struct zram_rbtree_node *node; + + #ifdef CONFIG_ZRAM_MEMORY_TRACKING + zram->table[index].ac_time = 0; +@@ -1361,7 +1642,28 @@ static void zram_free_page(struct zram *zram, size_t index) + if (!handle) + return; + +- zs_free(zram->mem_pool, handle); ++ if (zram_test_flag(zram, index, ZRAM_MERGED)) { ++ zram_clear_flag(zram, index, ZRAM_MERGED); ++ mutex_lock(&zram_rbtree_mutex); ++ ++ node = zram_rbtree_search(&zram->sph_rbtree, handle); ++ BUG_ON(!node); ++ ++ node->cnt--; ++ if (node->cnt == 0) { ++ rb_erase(&node->node, &zram->sph_rbtree); ++ mutex_unlock(&zram_rbtree_mutex); ++ ++ zs_free(zram->mem_pool, handle); ++ kfree(node); ++ } else { ++ mutex_unlock(&zram_rbtree_mutex); ++ } ++ ++ atomic64_dec(&zram->stats.pages_merged); ++ } else { ++ zs_free(zram->mem_pool, handle); ++ } + + atomic64_sub(zram_get_obj_size(zram, index), + &zram->stats.compr_data_size); +@@ -1824,7 +2126,7 @@ static ssize_t recompress_store(struct device *dev, + while (*args) { + args = next_arg(args, ¶m, &val); + +- if (!*val) ++ if (!val || !*val) + return -EINVAL; + + if (!strcmp(param, "type")) { +@@ -1909,7 +2211,8 @@ static ssize_t recompress_store(struct device *dev, + if (zram_test_flag(zram, index, ZRAM_WB) || + zram_test_flag(zram, index, ZRAM_UNDER_WB) || + zram_test_flag(zram, index, ZRAM_SAME) || +- zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) ++ zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE) || ++ zram_test_flag(zram, index, ZRAM_MERGED)) + goto next; + + err = zram_recompress(zram, index, page, threshold, +@@ -2295,6 +2598,7 @@ static const struct block_device_operations zram_devops = { + }; + + static DEVICE_ATTR_WO(compact); ++static DEVICE_ATTR_WO(merge); + static DEVICE_ATTR_RW(disksize); + static DEVICE_ATTR_RO(initstate); + static DEVICE_ATTR_WO(reset); +@@ -2335,6 +2639,7 @@ static struct attribute *zram_disk_attrs[] = { + #ifdef CONFIG_ZRAM_WRITEBACK + &dev_attr_bd_stat.attr, + #endif ++ &dev_attr_merge.attr, + &dev_attr_debug_stat.attr, + #ifdef CONFIG_ZRAM_MULTI_COMP + &dev_attr_recomp_algorithm.attr, +@@ -2421,6 +2726,8 @@ static int zram_add(void) + + comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); + ++ zram->sph_rbtree = RB_ROOT; ++ + zram_debugfs_register(zram); + pr_info("Added device: %s\n", zram->disk->disk_name); + return device_id; +diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h +index c5254626f051..2afdbf76a1aa 100644 +--- a/drivers/block/zram/zram_drv.h ++++ b/drivers/block/zram/zram_drv.h +@@ -56,6 +56,7 @@ enum zram_pageflags { + + ZRAM_COMP_PRIORITY_BIT1, /* First bit of comp priority index */ + 
ZRAM_COMP_PRIORITY_BIT2, /* Second bit of comp priority index */ ++ ZRAM_MERGED, /* page was merged */ + + __NR_ZRAM_PAGEFLAGS, + }; +@@ -87,6 +88,7 @@ struct zram_stats { + atomic_long_t max_used_pages; /* no. of maximum pages stored */ + atomic64_t writestall; /* no. of write slow paths */ + atomic64_t miss_free; /* no. of missed free */ ++ atomic64_t pages_merged; /* no. of pages, which merged into single one */ + #ifdef CONFIG_ZRAM_WRITEBACK + atomic64_t bd_count; /* no. of pages in backing device */ + atomic64_t bd_reads; /* no. of reads from backing device */ +@@ -140,5 +142,10 @@ struct zram { + #ifdef CONFIG_ZRAM_MEMORY_TRACKING + struct dentry *debugfs_dir; + #endif ++ /* ++ * This is same pages handle's rb tree, where the key is a handle ++ * to same pages and the value is a link counter ++ */ ++ struct rb_root sph_rbtree; + }; + #endif +-- +2.39.2 + +From d28acbb9cafa5f1fa935147e0dc23e1a211848e7 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Tue, 14 Feb 2023 22:02:09 +0100 +Subject: [PATCH 15/15] zstd import v1.5.4 + +Signed-off-by: Peter Jung +--- + include/linux/zstd.h | 2 +- + include/linux/zstd_errors.h | 23 +- + include/linux/zstd_lib.h | 569 +++++-- + lib/zstd/Makefile | 2 +- + lib/zstd/common/bits.h | 124 ++ + lib/zstd/common/bitstream.h | 51 +- + lib/zstd/common/compiler.h | 14 +- + lib/zstd/common/cpu.h | 3 +- + lib/zstd/common/debug.c | 3 +- + lib/zstd/common/debug.h | 3 +- + lib/zstd/common/entropy_common.c | 42 +- + lib/zstd/common/error_private.c | 12 +- + lib/zstd/common/error_private.h | 3 +- + lib/zstd/common/fse.h | 89 +- + lib/zstd/common/fse_decompress.c | 94 +- + lib/zstd/common/huf.h | 222 +-- + lib/zstd/common/mem.h | 2 +- + lib/zstd/common/portability_macros.h | 26 +- + lib/zstd/common/zstd_common.c | 3 +- + lib/zstd/common/zstd_deps.h | 2 +- + lib/zstd/common/zstd_internal.h | 94 +- + lib/zstd/compress/clevels.h | 3 +- + lib/zstd/compress/fse_compress.c | 59 +- + lib/zstd/compress/hist.c | 3 +- + lib/zstd/compress/hist.h | 3 +- + lib/zstd/compress/huf_compress.c | 372 ++-- + lib/zstd/compress/zstd_compress.c | 1491 ++++++++++++----- + lib/zstd/compress/zstd_compress_internal.h | 267 +-- + lib/zstd/compress/zstd_compress_literals.c | 155 +- + lib/zstd/compress/zstd_compress_literals.h | 25 +- + lib/zstd/compress/zstd_compress_sequences.c | 7 +- + lib/zstd/compress/zstd_compress_sequences.h | 3 +- + lib/zstd/compress/zstd_compress_superblock.c | 47 +- + lib/zstd/compress/zstd_compress_superblock.h | 3 +- + lib/zstd/compress/zstd_cwksp.h | 5 +- + lib/zstd/compress/zstd_double_fast.c | 129 +- + lib/zstd/compress/zstd_double_fast.h | 6 +- + lib/zstd/compress/zstd_fast.c | 582 +++++-- + lib/zstd/compress/zstd_fast.h | 6 +- + lib/zstd/compress/zstd_lazy.c | 364 ++-- + lib/zstd/compress/zstd_lazy.h | 7 +- + lib/zstd/compress/zstd_ldm.c | 11 +- + lib/zstd/compress/zstd_ldm.h | 3 +- + lib/zstd/compress/zstd_ldm_geartab.h | 3 +- + lib/zstd/compress/zstd_opt.c | 185 +- + lib/zstd/compress/zstd_opt.h | 3 +- + lib/zstd/decompress/huf_decompress.c | 731 ++++---- + lib/zstd/decompress/zstd_ddict.c | 8 +- + lib/zstd/decompress/zstd_ddict.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 215 ++- + lib/zstd/decompress/zstd_decompress_block.c | 252 ++- + lib/zstd/decompress/zstd_decompress_block.h | 3 +- + .../decompress/zstd_decompress_internal.h | 7 +- + lib/zstd/decompress_sources.h | 2 +- + lib/zstd/zstd_common_module.c | 2 +- + lib/zstd/zstd_compress_module.c | 2 +- + lib/zstd/zstd_decompress_module.c | 4 +- + 57 files changed, 4086 insertions(+), 2268 deletions(-) 
+ create mode 100644 lib/zstd/common/bits.h + +diff --git a/include/linux/zstd.h b/include/linux/zstd.h +index 113408eef6ec..f109d49f43f8 100644 +--- a/include/linux/zstd.h ++++ b/include/linux/zstd.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h +index 58b6dd45a969..6d5cf55f0bf3 100644 +--- a/include/linux/zstd_errors.h ++++ b/include/linux/zstd_errors.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -17,8 +18,17 @@ + + + /* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +-#define ZSTDERRORLIB_VISIBILITY +-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY ++#define ZSTDERRORLIB_VISIBLE ++ ++#ifndef ZSTDERRORLIB_HIDDEN ++# if (__GNUC__ >= 4) && !defined(__MINGW32__) ++# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) ++# else ++# define ZSTDERRORLIB_HIDDEN ++# endif ++#endif ++ ++#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE + + /*-********************************************* + * Error codes list +@@ -43,14 +53,17 @@ typedef enum { + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, ++ ZSTD_error_literals_headerWrong = 24, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, ++ ZSTD_error_parameter_combination_unsupported = 41, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, ++ ZSTD_error_stabilityCondition_notRespected = 50, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, +@@ -58,11 +71,15 @@ typedef enum { + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, ++ ZSTD_error_noForwardProgress_destFull = 80, ++ ZSTD_error_noForwardProgress_inputEmpty = 82, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_srcBuffer_wrong = 105, ++ ZSTD_error_sequenceProducer_failed = 106, ++ ZSTD_error_externalSequences_invalid = 107, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ + } ZSTD_ErrorCode; + +diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h +index 79d55465d5c1..dc7e9605a624 100644 +--- a/include/linux/zstd_lib.h ++++ b/include/linux/zstd_lib.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,23 +12,42 @@ + #ifndef ZSTD_H_235446 + #define ZSTD_H_235446 + +-/* ====== Dependency ======*/ ++/* ====== Dependencies ======*/ + #include /* INT_MAX */ + #include /* size_t */ + + + /* ===== ZSTDLIB_API : control library symbols visibility ===== */ +-#ifndef ZSTDLIB_VISIBLE ++#define ZSTDLIB_VISIBLE ++ ++#ifndef ZSTDLIB_HIDDEN + # if (__GNUC__ >= 4) && !defined(__MINGW32__) +-# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) + # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) + # else +-# define ZSTDLIB_VISIBLE + # define ZSTDLIB_HIDDEN + # endif + #endif ++ + #define ZSTDLIB_API ZSTDLIB_VISIBLE + ++/* Deprecation warnings : ++ * Should these warnings be a problem, it is generally possible to disable them, ++ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. ++ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. ++ */ ++#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS ++# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ ++#else ++# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) ++# elif (__GNUC__ >= 3) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) ++# else ++# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") ++# define ZSTD_DEPRECATED(message) ++# endif ++#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ ++ + + /* ***************************************************************************** + Introduction +@@ -65,7 +85,7 @@ + /*------ Version ------*/ + #define ZSTD_VERSION_MAJOR 1 + #define ZSTD_VERSION_MINOR 5 +-#define ZSTD_VERSION_RELEASE 2 ++#define ZSTD_VERSION_RELEASE 4 + #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + + /*! ZSTD_versionNumber() : +@@ -156,7 +176,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +-ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); ++ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") ++ZSTDLIB_API ++unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + + /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ + * `src` should point to the start of a ZSTD frame or skippable frame. +@@ -168,8 +190,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) + + + /*====== Helper functions ======*/ +-#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +-ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_compressBound() : ++ * maximum compressed size in worst case single-pass scenario. 
++ * When invoking `ZSTD_compress()` or any other one-pass compression function, ++ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) ++ * as it eliminates one potential failure scenario, ++ * aka not enough room in dst buffer to write the compressed frame. ++ * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE . ++ * In which case, ZSTD_compressBound() will return an error code ++ * which can be tested using ZSTD_isError(). ++ * ++ * ZSTD_COMPRESSBOUND() : ++ * same as ZSTD_compressBound(), but as a macro. ++ * It can be used to produce constants, which can be useful for static allocation, ++ * for example to size a static array on stack. ++ * Will produce constant value 0 if srcSize too large. ++ */ ++#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00LLU : 0xFF00FF00U) ++#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ ++ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_isError() : ++ * Most ZSTD_* functions returning a size_t value can be tested for error, ++ * using ZSTD_isError(). ++ * @return 1 if error, 0 otherwise ++ */ + ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ + ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ + ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +@@ -412,6 +456,9 @@ typedef enum { + * ZSTD_c_validateSequences + * ZSTD_c_useBlockSplitter + * ZSTD_c_useRowMatchFinder ++ * ZSTD_c_prefetchCDictTables ++ * ZSTD_c_enableSeqProducerFallback ++ * ZSTD_c_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. +@@ -430,7 +477,11 @@ typedef enum { + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, +- ZSTD_c_experimentalParam15=1012 ++ ZSTD_c_experimentalParam15=1012, ++ ZSTD_c_experimentalParam16=1013, ++ ZSTD_c_experimentalParam17=1014, ++ ZSTD_c_experimentalParam18=1015, ++ ZSTD_c_experimentalParam19=1016 + } ZSTD_cParameter; + + typedef struct { +@@ -493,7 +544,7 @@ typedef enum { + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". +- * This removes any reference to any dictionary too. ++ * This also removes any reference to any dictionary or external sequence producer. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. +@@ -543,13 +594,15 @@ typedef enum { + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts ++ * ZSTD_d_disableHuffmanAssembly + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? 
names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, +- ZSTD_d_experimentalParam4=1003 ++ ZSTD_d_experimentalParam4=1003, ++ ZSTD_d_experimentalParam5=1004 + + } ZSTD_dParameter; + +@@ -728,8 +781,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * This following is a legacy streaming API, available since v1.0+ . + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported. +- * Streaming in combination with advanced parameters and dictionary compression +- * can only be used through the new API. + ******************************************************************************/ + + /*! +@@ -738,6 +789,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); ++ * ++ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API ++ * to compress with a dictionary. + */ + ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); + /*! +@@ -788,13 +842,31 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer + + /*===== Streaming decompression functions =====*/ + +-/* This function is redundant with the advanced API and equivalent to: ++/*! ZSTD_initDStream() : ++ * Initialize/reset DStream state for new decompression operation. ++ * Call before new decompression operation using same DStream. + * ++ * Note : This function is redundant with the advanced API and equivalent to: + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ + ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + ++/*! ZSTD_decompressStream() : ++ * Streaming decompression function. ++ * Call repetitively to consume full input updating it as necessary. ++ * Function will update both input and output `pos` fields exposing current state via these fields: ++ * - `input.pos < input.size`, some input remaining and caller should provide remaining input ++ * on the next call. ++ * - `output.pos < output.size`, decoder finished and flushed all remaining buffers. ++ * - `output.pos == output.size`, potentially uncflushed data present in the internal buffers, ++ * call ZSTD_decompressStream() again to flush remaining data to output. ++ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * @return : 0 when a frame is completely decoded and fully flushed, ++ * or an error code, which can be tested using ZSTD_isError(), ++ * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. ++ */ + ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + + ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +@@ -913,7 +985,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); + * If @return == 0, the dictID could not be decoded. + * This could for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). +- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. ++ * - The frame was built with dictID intentionally removed. 
Whatever dictionary is necessary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. +@@ -937,8 +1009,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". +- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. +- * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). ++ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, ++ * until parameters are reset, a new dictionary is loaded, or the dictionary ++ * is explicitly invalidated by loading a NULL dictionary. + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. + * Tables are dependent on compression parameters, and for this reason, +@@ -951,7 +1024,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + + /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ +- * Reference a prepared dictionary, to be used for all next compressed frames. ++ * Reference a prepared dictionary, to be used for all future compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. +@@ -986,9 +1059,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, + const void* prefix, size_t prefixSize); + + /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ +- * Create an internal DDict from dict buffer, +- * to be used to decompress next frames. +- * The dictionary remains valid for all future frames, until explicitly invalidated. ++ * Create an internal DDict from dict buffer, to be used to decompress all future frames. ++ * The dictionary remains valid for all future frames, until explicitly invalidated, or ++ * a new dictionary is loaded. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". +@@ -1012,9 +1085,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). + * ++ * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary ++ * will be managed, and referencing a dictionary effectively "discards" any previous one. ++ * + * @result : 0, or an error code (which can be tested with ZSTD_isError()). +- * Note 1 : Currently, only one dictionary can be managed. +- * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. 
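
For illustration only (not part of the upstream patch): a rough sketch of the dictionary round trip described above, combining ZSTD_compressBound(), ZSTD_CCtx_loadDictionary() and ZSTD_DCtx_loadDictionary(). The helper name dict_round_trip and its minimal error handling are assumptions of this sketch; the ZSTD_* calls are the ones documented in this hunk.

#include <stdlib.h>
#include <zstd.h>

/* Compress `src` with a caller-provided dictionary, then decompress it again.
 * Returns 0 on success, non-zero on any zstd error or size mismatch. */
static int dict_round_trip(const void* dict, size_t dictSize,
                           const void* src, size_t srcSize)
{
    int err = 1;
    size_t const dstCapacity = ZSTD_compressBound(srcSize);  /* worst-case single-pass size */
    void* const dst = malloc(dstCapacity);
    void* const rt  = malloc(srcSize);
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
    if (dst && rt && cctx && dctx) {
        /* The dictionary is sticky: it stays loaded for every following frame. */
        size_t const lc = ZSTD_CCtx_loadDictionary(cctx, dict, dictSize);
        size_t const csize = ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
        size_t const ld = ZSTD_DCtx_loadDictionary(dctx, dict, dictSize);
        size_t const dsize = ZSTD_isError(csize) ? csize
                           : ZSTD_decompressDCtx(dctx, rt, srcSize, dst, csize);
        err = ZSTD_isError(lc) || ZSTD_isError(csize) ||
              ZSTD_isError(ld) || ZSTD_isError(dsize) || (dsize != srcSize);
    }
    ZSTD_freeCCtx(cctx);
    ZSTD_freeDCtx(dctx);
    free(dst); free(rt);
    return err;  /* 0 on success */
}
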
+ */ +@@ -1071,24 +1145,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE + #endif + +-/* Deprecation warnings : +- * Should these warnings be a problem, it is generally possible to disable them, +- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. +- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. +- */ +-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */ +-#else +-# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message))) +-# elif (__GNUC__ >= 3) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated)) +-# else +-# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API +-# endif +-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ +- + /* ************************************************************************************** + * experimental API (static linking only) + **************************************************************************************** +@@ -1123,6 +1179,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ + #define ZSTD_STRATEGY_MIN ZSTD_fast + #define ZSTD_STRATEGY_MAX ZSTD_btultra2 ++#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */ + + + #define ZSTD_OVERLAPLOG_MIN 0 +@@ -1350,29 +1407,85 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size + * or an error code (if srcSize is too small) */ + ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); + ++/*! ZSTD_decompressionMargin() : ++ * Zstd supports in-place decompression, where the input and output buffers overlap. ++ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, ++ * and the input buffer must be at the end of the output buffer. ++ * ++ * _______________________ Output Buffer ________________________ ++ * | | ++ * | ____ Input Buffer ____| ++ * | | | ++ * v v v ++ * |---------------------------------------|-----------|----------| ++ * ^ ^ ^ ++ * |___________________ Output_Size ___________________|_ Margin _| ++ * ++ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN(). ++ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or ++ * ZSTD_decompressDCtx(). ++ * NOTE: This function supports multi-frame input. ++ * ++ * @param src The compressed frame(s) ++ * @param srcSize The size of the compressed frame(s) ++ * @returns The decompression margin or an error that can be checked with ZSTD_isError(). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize); ++ ++/*! ZSTD_DECOMPRESS_MARGIN() : ++ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from ++ * the compressed frame, compute it from the original size and the blockSizeLog. ++ * See ZSTD_decompressionMargin() for details. ++ * ++ * WARNING: This macro does not support multi-frame input, the input must be a single ++ * zstd frame. If you need that support use the function, or implement it yourself. 
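
As a hedged sketch of the in-place decompression layout pictured above (again, not part of the patch): the compressed frame is copied to the tail of a single allocation sized with ZSTD_decompressionMargin(), and ZSTD_decompress() writes the regenerated data from the front. The helper name and the requirement that the frame declares its content size are assumptions of this example.

#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_decompressionMargin() is part of the static-only API */
#include <stdlib.h>
#include <string.h>
#include <zstd.h>

/* Decompress `src` in place inside one buffer of (Output_Size + Margin) bytes.
 * Returns the buffer (decompressed bytes start at offset 0) or NULL on failure. */
static char* inplace_decompress(const void* src, size_t srcSize, size_t* outSize)
{
    unsigned long long const contentSize = ZSTD_getFrameContentSize(src, srcSize);
    size_t const margin = ZSTD_decompressionMargin(src, srcSize);
    size_t bufSize, dSize;
    char* buf;

    if (ZSTD_isError(margin)
        || contentSize == ZSTD_CONTENTSIZE_UNKNOWN
        || contentSize == ZSTD_CONTENTSIZE_ERROR)
        return NULL;                                   /* sketch: give up on anything unusual */

    bufSize = (size_t)contentSize + margin;            /* Output_Size + Margin, as in the diagram */
    buf = malloc(bufSize);
    if (buf == NULL) return NULL;

    memcpy(buf + bufSize - srcSize, src, srcSize);     /* input sits at the end of the output buffer */
    dSize = ZSTD_decompress(buf, bufSize, buf + bufSize - srcSize, srcSize);
    if (ZSTD_isError(dSize)) { free(buf); return NULL; }

    *outSize = dSize;
    return buf;
}
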
++ * ++ * @param originalSize The original uncompressed size of the data. ++ * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX). ++ * Unless you explicitly set the windowLog smaller than ++ * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX. ++ */ ++#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \ ++ ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \ ++ 4 /* checksum */ + \ ++ ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \ ++ (blockSize) /* One block of margin */ \ ++ )) ++ + typedef enum { + ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ + ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ + } ZSTD_sequenceFormat_e; + ++/*! ZSTD_sequenceBound() : ++ * `srcSize` : size of the input buffer ++ * @return : upper-bound for the number of sequences that can be generated ++ * from a buffer of srcSize bytes ++ * ++ * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize); ++ + /*! ZSTD_generateSequences() : +- * Generate sequences using ZSTD_compress2, given a source buffer. ++ * Generate sequences using ZSTD_compress2(), given a source buffer. + * + * Each block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. + * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) + * simply acts as a block delimiter. + * +- * zc can be used to insert custom compression params. +- * This function invokes ZSTD_compress2 ++ * @zc can be used to insert custom compression params. ++ * This function invokes ZSTD_compress2(). + * + * The output of this function can be fed into ZSTD_compressSequences() with CCtx + * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters + * @return : number of sequences generated + */ + +-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +- size_t outSeqsSize, const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ++ZSTD_generateSequences( ZSTD_CCtx* zc, ++ ZSTD_Sequence* outSeqs, size_t outSeqsSize, ++ const void* src, size_t srcSize); + + /*! ZSTD_mergeBlockDelimiters() : + * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals +@@ -1388,7 +1501,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o + ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + + /*! ZSTD_compressSequences() : +- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. ++ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. ++ * @src contains the entire input (not just the literals). ++ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals + * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) + * The entire source is compressed into a single frame. + * +@@ -1413,11 +1528,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si + * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. 
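
The following sketch (an illustration, not from the patch) strings together ZSTD_sequenceBound(), ZSTD_generateSequences() and ZSTD_compressSequences() as described in this hunk, with explicit block delimiters; the helper name and the choice to reuse a single CCtx are assumptions.

#define ZSTD_STATIC_LINKING_ONLY
#include <stdlib.h>
#include <zstd.h>

/* Re-compress `src` by first extracting its sequences, then feeding them back
 * through ZSTD_compressSequences(). Returns the compressed size or an error code. */
static size_t sequences_round_trip(void* dst, size_t dstCapacity,
                                   const void* src, size_t srcSize)
{
    size_t result = (size_t)-1;
    size_t const maxSeqs = ZSTD_sequenceBound(srcSize);
    ZSTD_Sequence* const seqs = malloc(maxSeqs * sizeof(ZSTD_Sequence));
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    if (seqs && cctx) {
        size_t const nbSeqs = ZSTD_generateSequences(cctx, seqs, maxSeqs, src, srcSize);
        if (!ZSTD_isError(nbSeqs)) {
            /* generateSequences() emits block delimiters, so declare them explicitly. */
            ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters);
            ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters,
                                   ZSTD_sf_explicitBlockDelimiters);
            result = ZSTD_compressSequences(cctx, dst, dstCapacity,
                                            seqs, nbSeqs, src, srcSize);
        }
    }
    ZSTD_freeCCtx(cctx);
    free(seqs);
    return result;   /* can be tested with ZSTD_isError() */
}
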
+ * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, + * and cannot emit an RLE block that disagrees with the repcode history +- * @return : final compressed size or a ZSTD error. ++ * @return : final compressed size, or a ZSTD error code. + */ +-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, +- const ZSTD_Sequence* inSeqs, size_t inSeqsSize, +- const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* src, size_t srcSize); + + + /*! ZSTD_writeSkippableFrame() : +@@ -1481,8 +1597,11 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * +- * Note 2 : only single-threaded compression is supported. ++ * Note : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * ++ * Note 2 : ZSTD_estimateCCtxSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. + */ + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); +@@ -1501,7 +1620,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); + * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. +- * In this case, get total size by adding ZSTD_estimate?DictSize */ ++ * In this case, get total size by adding ZSTD_estimate?DictSize ++ * Note 2 : only single-threaded compression is supported. ++ * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * Note 3 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. ++ */ + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +@@ -1649,22 +1773,31 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + * This function never fails (wide contract) */ + ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + ++/*! ZSTD_CCtx_setCParams() : ++ * Set all parameters provided within @cparams into the working @cctx. ++ * Note : if modifying parameters during compression (MT mode only), ++ * note that changes to the .windowLog parameter will be ignored. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); ++ + /*! 
ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- const void* dict,size_t dictSize, +- ZSTD_parameters params); ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ const void* dict,size_t dictSize, ++ ZSTD_parameters params); + + /*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -1808,13 +1941,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. + * +- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same +- * between calls, except for the modifications that zstd makes to pos (the +- * caller must not modify pos). This is checked by the compressor, and +- * compression will fail if it ever changes. This means the only flush +- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end +- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) +- * MUST not be modified during compression or you will get data corruption. ++ * Tells the compressor that input data presented with ZSTD_inBuffer ++ * will ALWAYS be the same between calls. ++ * Technically, the @src pointer must never be changed, ++ * and the @pos field can only be updated by zstd. ++ * However, it's possible to increase the @size field, ++ * allowing scenarios where more data can be appended after compressions starts. ++ * These conditions are checked by the compressor, ++ * and compression will fail if they are not respected. ++ * Also, data in the ZSTD_inBuffer within the range [src, src + pos) ++ * MUST not be modified during compression or it will result in data corruption. + * + * When this flag is enabled zstd won't allocate an input window buffer, + * because the user guarantees it can reference the ZSTD_inBuffer until +@@ -1822,18 +1958,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also + * avoid the memcpy() from the input buffer to the input window buffer. + * +- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. +- * That means this flag cannot be used with ZSTD_compressStream(). +- * + * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds +- * memory. However, compression WILL fail if you violate the preconditions. ++ * memory. However, compression WILL fail if conditions are not respected. + * +- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST +- * not be modified during compression or you will get data corruption. 
This +- * is because zstd needs to reference data in the ZSTD_inBuffer to find ++ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST ++ * not be modified during compression or it will result in data corruption. ++ * This is because zstd needs to reference data in the ZSTD_inBuffer to find + * matches. Normally zstd maintains its own window buffer for this purpose, +- * but passing this flag tells zstd to use the user provided buffer. ++ * but passing this flag tells zstd to rely on user provided buffer instead. + */ + #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 + +@@ -1878,7 +2011,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Without validation, providing a sequence that does not conform to the zstd spec will cause + * undefined behavior, and may produce a corrupted block. + * +- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for ++ * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and + * return an error. + * +@@ -1928,6 +2061,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + ++/* ZSTD_c_prefetchCDictTables ++ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto. ++ * ++ * In some situations, zstd uses CDict tables in-place rather than copying them ++ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). ++ * In such situations, compression speed is seriously impacted when CDict tables are ++ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables ++ * when they are used in-place. ++ * ++ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. ++ * For sufficiently large inputs, zstd will by default memcpy() CDict tables ++ * into the working context, so there is no need to prefetch. This parameter is ++ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be ++ * useful but memcpy() is too expensive. The exact range of input sizes where this ++ * makes sense is best determined by careful experimentation. ++ * ++ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, ++ * but in the future zstd may conditionally enable this feature via an auto-detection ++ * heuristic for cold CDicts. ++ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. ++ */ ++#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 ++ ++/* ZSTD_c_enableSeqProducerFallback ++ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. ++ * ++ * Controls whether zstd will fall back to an internal sequence producer if an ++ * external sequence producer is registered and returns an error code. This fallback ++ * is block-by-block: the internal sequence producer will only be called for blocks ++ * where the external sequence producer returns an error code. Fallback parsing will ++ * follow any other cParam settings, such as compression level, the same as in a ++ * normal (fully-internal) compression operation. ++ * ++ * The user is strongly encouraged to read the full Block-Level Sequence Producer API ++ * documentation (below) before setting this parameter. 
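
A small, purely illustrative snippet (not part of the patch) showing how the two new compression knobs documented above would be enabled through the regular parameter-setting entry point; the wrapper name is an assumption.

#define ZSTD_STATIC_LINKING_ONLY   /* the experimental ZSTD_c_* names below are static-only */
#include <zstd.h>

static void enable_new_cctx_knobs(ZSTD_CCtx* cctx)
{
    /* Prefetch in-place CDict tables that are likely to be cache-cold. */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_prefetchCDictTables, ZSTD_ps_enable);
    /* Let zstd's internal parser handle any block for which an external
     * sequence producer reports an error. */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);
}
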
*/ ++#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 ++ ++/* ZSTD_c_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * This parameter can be used to set an upper bound on the blocksize ++ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper ++ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make ++ * compressBound() innacurate). Only currently meant to be used for testing. ++ * ++ */ ++#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 ++ ++/* ZSTD_c_searchForExternalRepcodes ++ * This parameter affects how zstd parses external sequences, such as sequences ++ * provided through the compressSequences() API or from an external block-level ++ * sequence producer. ++ * ++ * If set to ZSTD_ps_enable, the library will check for repeated offsets in ++ * external sequences, even if those repcodes are not explicitly indicated in ++ * the "rep" field. Note that this is the only way to exploit repcode matches ++ * while using compressSequences() or an external sequence producer, since zstd ++ * currently ignores the "rep" field of external sequences. ++ * ++ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in ++ * external sequences, regardless of whether the "rep" field has been set. This ++ * reduces sequence compression overhead by about 25% while sacrificing some ++ * compression ratio. ++ * ++ * The default value is ZSTD_ps_auto, for which the library will enable/disable ++ * based on compression level. ++ * ++ * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is ++ * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future. ++ */ ++#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 ++ + /*! ZSTD_CCtx_getParameter() : + * Get the requested compression parameter value, selected by enum ZSTD_cParameter, + * and store it into int* value. +@@ -2084,7 +2290,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * in the range [dst, dst + pos) MUST not be modified during decompression + * or you will get data corruption. + * +- * When this flags is enabled zstd won't allocate an output buffer, because ++ * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer, but it will still allocate + * an input buffer large enough to fit any compressed block. This will also + * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. +@@ -2137,6 +2343,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + */ + #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 + ++/* ZSTD_d_disableHuffmanAssembly ++ * Set to 1 to disable the Huffman assembly implementation. ++ * The default value is 0, which allows zstd to use the Huffman assembly ++ * implementation if available. ++ * ++ * This parameter can be used to disable Huffman assembly at runtime. ++ * If you want to disable it at compile time you can define the macro ++ * ZSTD_DISABLE_ASM. ++ */ ++#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 ++ + + /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). +@@ -2145,6 +2362,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * such ZSTD_f_zstd1_magicless for example. 
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ + ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") ++ZSTDLIB_STATIC_API + size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + + /*! ZSTD_decompressStream_simpleArgs() : +@@ -2181,6 +2399,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); +@@ -2198,6 +2417,7 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); +@@ -2218,6 +2438,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, +@@ -2232,6 +2453,7 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + + /*! ZSTD_initCStream_usingCDict_advanced() : +@@ -2250,6 +2472,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, +@@ -2274,6 +2497,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +@@ -2319,8 +2543,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + + /*! +@@ -2330,20 +2554,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + +-/*! 
+- * This function is deprecated, and is equivalent to: +- * +- * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); +- * +- * re-use decompression parameters from previous init; saves dictionary loading +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x +- */ +-ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); +- + + /* ******************************************************************* + * Buffer-less and synchronous inner streaming functions +@@ -2362,7 +2576,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. +- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : +@@ -2387,15 +2600,20 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ ++ ++ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") ++ZSTDLIB_STATIC_API ++size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + + ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ + /* + Buffer-less streaming decompression (synchronous mode) +@@ -2408,8 +2626,8 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. 
+- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. +- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. ++ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. ++ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, +@@ -2428,7 +2646,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), +- which can @return an error code if required value is too large for current system (in 32-bits mode). ++ which can return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. +@@ -2448,7 +2666,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + +- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). ++ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + +@@ -2480,6 +2698,8 @@ typedef struct { + unsigned headerSize; + unsigned dictID; + unsigned checksumFlag; ++ unsigned _reserved1; ++ unsigned _reserved2; + } ZSTD_frameHeader; + + /*! ZSTD_getFrameHeader() : +@@ -2502,6 +2722,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); + ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* misc */ ++ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") + ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); + typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; + ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +@@ -2524,7 +2745,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary +- + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. 
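
Sketch of the buffer-less streaming decompression loop described above (illustrative, not part of the patch); it assumes the whole regenerated frame fits in the caller's dst and skips the round-buffer strategy. ZSTD_getFrameHeader() could be used beforehand to size dst from the frame content size; the sketch leaves that to the caller.

#define ZSTD_STATIC_LINKING_ONLY   /* the buffer-less API is static-linking only */
#include <zstd.h>

/* Decode one frame with ZSTD_decompressBegin()/ZSTD_decompressContinue().
 * Returns the number of bytes written to dst, or an error code. */
static size_t bufferless_decompress(void* dst, size_t dstCapacity,
                                    const void* src, size_t srcSize)
{
    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
    const char* ip = (const char*)src;
    const char* const iend = ip + srcSize;
    char* op = (char*)dst;
    char* const oend = op + dstCapacity;
    size_t ret;

    if (dctx == NULL) return (size_t)-1;              /* sketch: generic failure */
    ret = ZSTD_decompressBegin(dctx);

    while (!ZSTD_isError(ret)) {
        size_t const toRead = ZSTD_nextSrcSizeToDecompress(dctx);
        size_t regenerated;
        if (toRead == 0) { ret = (size_t)(op - (char*)dst); break; }   /* frame fully decoded */
        if (toRead > (size_t)(iend - ip)) { ret = (size_t)-1; break; } /* truncated input */
        regenerated = ZSTD_decompressContinue(dctx, op, (size_t)(oend - op), ip, toRead);
        if (ZSTD_isError(regenerated)) { ret = regenerated; break; }
        ip += toRead;        /* decompressContinue() consumes exactly toRead bytes */
        op += regenerated;   /* may be 0 when only a header/metadata item was decoded */
    }
    ZSTD_freeDCtx(dctx);
    return ret;              /* regenerated size, or an error code (ZSTD_isError()) */
}
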
+@@ -2547,5 +2767,166 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_ + ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + + ++/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* ++ * ++ * *** OVERVIEW *** ++ * The Block-Level Sequence Producer API allows users to provide their own custom ++ * sequence producer which libzstd invokes to process each block. The produced list ++ * of sequences (literals and matches) is then post-processed by libzstd to produce ++ * valid compressed blocks. ++ * ++ * This block-level offload API is a more granular complement of the existing ++ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers ++ * an easier migration story for applications already integrated with libzstd: the ++ * user application continues to invoke the same compression functions ++ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits ++ * from the specific advantages of the external sequence producer. For example, ++ * the sequence producer could be tuned to take advantage of known characteristics ++ * of the input, to offer better speed / ratio, or could leverage hardware ++ * acceleration not available within libzstd itself. ++ * ++ * See contrib/externalSequenceProducer for an example program employing the ++ * Block-Level Sequence Producer API. ++ * ++ * *** USAGE *** ++ * The user is responsible for implementing a function of type ++ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following ++ * arguments to the user-provided function: ++ * ++ * - sequenceProducerState: a pointer to a user-managed state for the sequence ++ * producer. ++ * ++ * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. ++ * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory ++ * backing outSeqs is managed by the CCtx. ++ * ++ * - src, srcSize: an input buffer for the sequence producer to parse. ++ * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * - dict, dictSize: a history buffer, which may be empty, which the sequence ++ * producer may reference as it parses the src buffer. Currently, zstd will ++ * always pass dictSize == 0 into external sequence producers, but this will ++ * change in the future. ++ * ++ * - compressionLevel: a signed integer representing the zstd compression level ++ * set by the user for the current operation. The sequence producer may choose ++ * to use this information to change its compression strategy and speed/ratio ++ * tradeoff. Note: the compression level does not reflect zstd parameters set ++ * through the advanced API. ++ * ++ * - windowSize: a size_t representing the maximum allowed offset for external ++ * sequences. Note that sequence offsets are sometimes allowed to exceed the ++ * windowSize if a dictionary is present, see doc/zstd_compression_format.md ++ * for details. ++ * ++ * The user-provided function shall return a size_t representing the number of ++ * sequences written to outSeqs. This return value will be treated as an error ++ * code if it is greater than outSeqsCapacity. The return value must be non-zero ++ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided ++ * for convenience, but any value greater than outSeqsCapacity will be treated as ++ * an error code. 
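
To make the contract concrete, here is a hedged sketch of a (deliberately useless) external sequence producer that declares every block as pure literals, which satisfies the parse-validity rules spelled out in the remainder of this comment; all names other than the ZSTD_* symbols are invented for the example.

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

/* Trivial producer: one literals-only sequence covering the whole block
 * (offset 0, matchLength 0), i.e. a valid parse that performs no matching. */
static size_t literals_only_producer(void* state,
                                     ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
                                     const void* src, size_t srcSize,
                                     const void* dict, size_t dictSize,
                                     int compressionLevel, size_t windowSize)
{
    (void)state; (void)src; (void)dict; (void)dictSize;
    (void)compressionLevel; (void)windowSize;
    if (outSeqsCapacity < 1 || srcSize == 0)
        return ZSTD_SEQUENCE_PRODUCER_ERROR;       /* let zstd fall back (or fail) */
    outSeqs[0].offset = 0;
    outSeqs[0].litLength = (unsigned)srcSize;      /* srcSize <= ZSTD_BLOCKSIZE_MAX per the docs */
    outSeqs[0].matchLength = 0;
    outSeqs[0].rep = 0;
    return 1;                                      /* number of sequences written */
}

/* Registration is sticky until the next parameter reset of the CCtx. */
static void use_external_producer(ZSTD_CCtx* cctx)
{
    ZSTD_registerSequenceProducer(cctx, NULL /* no state */, literals_only_producer);
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);
}

With fallback enabled, zstd quietly takes over for any block where the producer bails out with ZSTD_SEQUENCE_PRODUCER_ERROR.
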
++ * ++ * If the user-provided function does not return an error code, the sequences ++ * written to outSeqs must be a valid parse of the src buffer. Data corruption may ++ * occur if the parse is not valid. A parse is defined to be valid if the ++ * following conditions hold: ++ * - The sum of matchLengths and literalLengths must equal srcSize. ++ * - All sequences in the parse, except for the final sequence, must have ++ * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have ++ * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. ++ * - All offsets must respect the windowSize parameter as specified in ++ * doc/zstd_compression_format.md. ++ * - If the final sequence has matchLength == 0, it must also have offset == 0. ++ * ++ * zstd will only validate these conditions (and fail compression if they do not ++ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence ++ * validation has a performance cost. ++ * ++ * If the user-provided function returns an error, zstd will either fall back ++ * to an internal sequence producer or fail the compression operation. The user can ++ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback ++ * cParam. Fallback compression will follow any other cParam settings, such as ++ * compression level, the same as in a normal compression operation. ++ * ++ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F ++ * function by calling ++ * ZSTD_registerSequenceProducer(cctx, ++ * sequenceProducerState, ++ * sequenceProducer) ++ * This setting will persist until the next parameter reset of the CCtx. ++ * ++ * The sequenceProducerState must be initialized by the user before calling ++ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the ++ * sequenceProducerState. ++ * ++ * *** LIMITATIONS *** ++ * This API is compatible with all zstd compression APIs which respect advanced parameters. ++ * However, there are three limitations: ++ * ++ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. ++ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level ++ * external sequence producer. ++ * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some ++ * cases (see its documentation for details). Users must explicitly set ++ * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external ++ * sequence producer is registered. ++ * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default ++ * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should ++ * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence ++ * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). ++ * ++ * Second, history buffers are not currently supported. Concretely, zstd will always pass ++ * dictSize == 0 to the external sequence producer (for now). This has two implications: ++ * - Dictionaries are not currently supported. Compression will *not* fail if the user ++ * references a dictionary, but the dictionary won't have any effect. ++ * - Stream history is not currently supported. All advanced compression APIs, including ++ * streaming APIs, work with external sequence producers, but each block is treated as ++ * an independent chunk without history from previous blocks. ++ * ++ * Third, multi-threading within a single compression is not currently supported. 
In other words, ++ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. ++ * Multi-threading across compressions is fine: simply create one CCtx per thread. ++ * ++ * Long-term, we plan to overcome all three limitations. There is no technical blocker to ++ * overcoming them. It is purely a question of engineering effort. ++ */ ++ ++#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) ++ ++typedef size_t ZSTD_sequenceProducer_F ( ++ void* sequenceProducerState, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize, ++ const void* dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize ++); ++ ++/*! ZSTD_registerSequenceProducer() : ++ * Instruct zstd to use a block-level external sequence producer function. ++ * ++ * The sequenceProducerState must be initialized by the caller, and the caller is ++ * responsible for managing its lifetime. This parameter is sticky across ++ * compressions. It will remain set until the user explicitly resets compression ++ * parameters. ++ * ++ * Sequence producer registration is considered to be an "advanced parameter", ++ * part of the "advanced API". This means it will only have an effect on compression ++ * APIs which respect advanced parameters, such as compress2() and compressStream2(). ++ * Older compression APIs such as compressCCtx(), which predate the introduction of ++ * "advanced parameters", will ignore any external sequence producer setting. ++ * ++ * The sequence producer can be "cleared" by registering a NULL function pointer. This ++ * removes all limitations described above in the "LIMITATIONS" section of the API docs. ++ * ++ * The user is strongly encouraged to read the full API documentation (above) before ++ * calling this function. */ ++ZSTDLIB_STATIC_API void ++ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* cctx, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F* sequenceProducer ++); ++ + #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ + +diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile +index 20f08c644b71..464c410b2768 100644 +--- a/lib/zstd/Makefile ++++ b/lib/zstd/Makefile +@@ -1,6 +1,6 @@ + # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + # ################################################################ +-# Copyright (c) Facebook, Inc. ++# Copyright (c) Meta Platforms, Inc. and affiliates. + # All rights reserved. + # + # This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h +new file mode 100644 +index 000000000000..bb7967def569 +--- /dev/null ++++ b/lib/zstd/common/bits.h +@@ -0,0 +1,124 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */ ++ ++#ifndef ZSTD_BITS_H ++#define ZSTD_BITS_H ++ ++#include "mem.h" ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const int DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, ++ 30, 22, 20, 15, 25, 17, 4, 8, ++ 31, 27, 13, 23, 21, 19, 16, 7, ++ 26, 12, 18, 6, 11, 5, 10, 9}; ++ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_ctz(val); ++# else ++ return ZSTD_countTrailingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) { ++ assert(val != 0); ++ { ++ static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, ++ 11, 14, 16, 18, 22, 25, 3, 30, ++ 8, 12, 20, 28, 15, 17, 24, 7, ++ 19, 27, 23, 6, 26, 5, 4, 31}; ++ val |= val >> 1; ++ val |= val >> 2; ++ val |= val >> 4; ++ val |= val >> 8; ++ val |= val >> 16; ++ return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_clz(val); ++# else ++ return ZSTD_countLeadingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) && defined(__LP64__) ++ return (unsigned)__builtin_ctzll(val); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (leastSignificantWord == 0) { ++ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); ++ } else { ++ return ZSTD_countTrailingZeros32(leastSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)(__builtin_clzll(val)); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (mostSignificantWord == 0) { ++ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); ++ } else { ++ return ZSTD_countLeadingZeros32(mostSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) ++{ ++ if (MEM_isLittleEndian()) { ++ if (MEM_64bits()) { ++ return ZSTD_countTrailingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countTrailingZeros32((U32)val) >> 3; ++ } ++ } else { /* Big Endian CPU */ ++ if (MEM_64bits()) { ++ return ZSTD_countLeadingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countLeadingZeros32((U32)val) >> 3; ++ } ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ ++{ ++ assert(val != 0); ++ return 31 - ZSTD_countLeadingZeros32(val); ++} ++ ++#endif /* ZSTD_BITS_H */ +diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h +index feef3a1b1d60..83a180c65faf 100644 +--- a/lib/zstd/common/bitstream.h ++++ b/lib/zstd/common/bitstream.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * bitstream + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -27,6 +28,7 @@ + #include "compiler.h" /* UNLIKELY() */ + #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ + #include "error_private.h" /* error codes and messages */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /*========================================= +@@ -122,33 +124,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); + /* faster, but works only if nbBits >= 1 */ + +- +- +-/*-************************************************************** +-* Internal functions +-****************************************************************/ +-MEM_STATIC unsigned BIT_highbit32 (U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* Use GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, +- 11, 14, 16, 18, 22, 25, 3, 30, +- 8, 12, 20, 28, 15, 17, 24, 7, +- 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- + /*===== Local Constants =====*/ + static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, +@@ -178,6 +153,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, + return 0; + } + ++MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) ++{ ++ assert(nbBits < BIT_MASK_SIZE); ++ return bitContainer & BIT_mask[nbBits]; ++} ++ + /*! BIT_addBits() : + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! */ +@@ -187,7 +168,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, + DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +- bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; ++ bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; + bitC->bitPos += nbBits; + } + +@@ -266,7 +247,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + } else { + bitD->ptr = bitD->start; +@@ -294,7 +275,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + default: break; + } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ bitD->bitsConsumed = lastByte ? 
8 - ZSTD_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; +@@ -325,12 +306,6 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c + #endif + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +-{ +- assert(nbBits < BIT_MASK_SIZE); +- return bitContainer & BIT_mask[nbBits]; +-} +- + /*! BIT_lookBits() : + * Provides next n bits from local register. + * local register is not modified. +@@ -377,7 +352,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned n + } + + /*! BIT_readBitsFast() : +- * unsafe version; only works only if nbBits >= 1 */ ++ * unsafe version; only works if nbBits >= 1 */ + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + { + size_t const value = BIT_lookBitsFast(bitD, nbBits); +diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h +index c42d39faf9bd..c437e0975575 100644 +--- a/lib/zstd/common/compiler.h ++++ b/lib/zstd/common/compiler.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -179,6 +180,17 @@ + * Sanitizer + *****************************************************************/ + ++/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an ++ * abundance of caution, disable our custom poisoning on mingw. */ ++#ifdef __MINGW32__ ++#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE ++#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1 ++#endif ++#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE ++#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1 ++#endif ++#endif ++ + + + #endif /* ZSTD_COMPILER_H */ +diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h +index 0db7b42407ee..d8319a2bef4c 100644 +--- a/lib/zstd/common/cpu.h ++++ b/lib/zstd/common/cpu.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c +index bb863c9ea616..e56ff6464e91 100644 +--- a/lib/zstd/common/debug.c ++++ b/lib/zstd/common/debug.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h +index 6dd88d1fbd02..da0dbfc614b8 100644 +--- a/lib/zstd/common/debug.h ++++ b/lib/zstd/common/debug.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c +index fef67056f052..6cdd82233fb5 100644 +--- a/lib/zstd/common/entropy_common.c ++++ b/lib/zstd/common/entropy_common.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * Common functions of New Generation Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,8 +20,8 @@ + #include "error_private.h" /* ERR_*, ERROR */ + #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ + #include "huf.h" ++#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */ + + + /*=== Version ===*/ +@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); } + /*-************************************************************** + * FSE NCount encoding-decoding + ****************************************************************/ +-static U32 FSE_ctz(U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_ctz(val); +-# else /* Software version */ +- U32 count = 0; +- while ((val & 1) == 0) { +- val >>= 1; +- ++count; +- } +- return count; +-# endif +- } +-} +- + FORCE_INLINE_TEMPLATE + size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * repeat. + * Avoid UB by setting the high bit to 1. + */ +- int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + while (repeats >= 12) { + charnum += 3 * 12; + if (LIKELY(ip <= iend-7)) { +@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; +- repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + } + charnum += 3 * repeats; + bitStream >>= 2 * repeats; +@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * know that threshold > 1. 
+ */ + if (remaining <= 1) break; +- nbBits = BIT_highbit32(remaining) + 1; ++ nbBits = ZSTD_highbit32(remaining) + 1; + threshold = 1 << (nbBits - 1); + } + if (charnum >= maxSV1) break; +@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, + const void* src, size_t srcSize) + { + U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +- return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); ++ return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, + if (weightTotal == 0) return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ +- { U32 const tableLog = BIT_highbit32(weightTotal) + 1; ++ { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1; + if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); + *tableLogPtr = tableLog; + /* determine last weight */ + { U32 const total = 1 << tableLog; + U32 const rest = total - weightTotal; +- U32 const verif = 1 << BIT_highbit32(rest); +- U32 const lastWeight = BIT_highbit32(rest) + 1; ++ U32 const verif = 1 << ZSTD_highbit32(rest); ++ U32 const lastWeight = ZSTD_highbit32(rest) + 1; + if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; +@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, +- int bmi2) ++ int flags) + { + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } + #endif +- (void)bmi2; ++ (void)flags; + return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } +diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c +index 6d1135f8c373..a4062d30d170 100644 +--- a/lib/zstd/common/error_private.c ++++ b/lib/zstd/common/error_private.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(version_unsupported): return "Version not supported"; + case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; + case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; +- case PREFIX(corruption_detected): return "Corrupted block detected"; ++ case PREFIX(corruption_detected): return "Data corruption detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; ++ case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; ++ case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; +@@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; + case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; + case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; ++ case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; + case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; + case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; ++ case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full"; ++ case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; ++ case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code"; ++ case PREFIX(externalSequences_invalid): return "External sequences are not valid"; + case PREFIX(maxCode): + default: return notErrorCode; + } +diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h +index ca5101e542fa..9a4699a38a88 100644 +--- a/lib/zstd/common/error_private.h ++++ b/lib/zstd/common/error_private.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h +index 4507043b2287..c4e25a219142 100644 +--- a/lib/zstd/common/fse.h ++++ b/lib/zstd/common/fse.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -50,34 +51,6 @@ + FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */ + + +-/*-**************************************** +-* FSE simple functions +-******************************************/ +-/*! FSE_compress() : +- Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. +- 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). +- @return : size of compressed data (<= dstCapacity). +- Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. +- if FSE_isError(return), compression failed (more details using FSE_getErrorName()) +-*/ +-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/*! FSE_decompress(): +- Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', +- into already allocated destination buffer 'dst', of size 'dstCapacity'. +- @return : size of regenerated data (<= maxDstSize), +- or an error code, which can be tested using FSE_isError() . +- +- ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! +- Why ? : making this distinction requires a header. +- Header management is intentionally delegated to the user layer, which can better manage special cases. +-*/ +-FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, +- const void* cSrc, size_t cSrcSize); +- +- + /*-***************************************** + * Tool functions + ******************************************/ +@@ -88,20 +61,6 @@ FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return + FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ + + +-/*-***************************************** +-* FSE advanced functions +-******************************************/ +-/*! FSE_compress2() : +- Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' +- Both parameters can be defined as '0' to mean : use default value +- @return : size of compressed data +- Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. +- if FSE_isError(return), it's an error code. +-*/ +-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +- +- + /*-***************************************** + * FSE detailed API + ******************************************/ +@@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, + /*! 
Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ + typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); + + /*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). +@@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize, int bmi2); + +-/*! Constructor and Destructor of FSE_DTable. +- Note that its size depends on 'tableLog' */ + typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); +- +-/*! FSE_buildDTable(): +- Builds 'dt', which must be already allocated, using FSE_createDTable(). +- return : 0, or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +- +-/*! FSE_decompress_usingDTable(): +- Decompress compressed source `cSrc` of size `cSrcSize` using `dt` +- into `dst` which must be already allocated. +- @return : size of regenerated data (necessarily <= `dstCapacity`), +- or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); + + /*! + Tutorial : +@@ -317,16 +258,6 @@ If there is an error, the function will return an error code, which can be teste + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); + /*< same as FSE_optimalTableLog(), which used `minus==2` */ + +-/* FSE_compress_wksp() : +- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). +- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. +- */ +-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? 
(1 << (maxTableLog - 2)) : 1024) ) +-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +- +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); +-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ +- + size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); + /*< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +@@ -344,19 +275,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi + FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ + +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); +-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ +- +-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); +-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */ +- +-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) ++#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) + #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); +-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ +- + size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); +-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */ ++/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. ++ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ + + typedef enum { + FSE_repeat_none, /*< Cannot use the previous table */ +@@ -552,7 +475,7 @@ MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePt + + /* FSE_getMaxNbBits() : + * Approximate maximum cost of a symbol, in bits. +- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) ++ * Fractional get rounded up (i.e. 
a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ + MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c +index a0d06095be83..45cf457f31ef 100644 +--- a/lib/zstd/common/fse_decompress.c ++++ b/lib/zstd/common/fse_decompress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy decoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -24,6 +25,7 @@ + #include "error_private.h" + #define ZSTD_DEPS_NEED_MALLOC + #include "zstd_deps.h" ++#include "bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -55,19 +57,6 @@ + #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) + #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + +- +-/* Function templates */ +-FSE_DTable* FSE_createDTable (unsigned tableLog) +-{ +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); +-} +- +-void FSE_freeDTable (FSE_DTable* dt) +-{ +- ZSTD_free(dt); +-} +- + static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) + { + void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ +@@ -127,10 +116,10 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + } + } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ + { + size_t position = 0; +@@ -166,7 +155,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (u=0; utableLog = 0; +- DTableH->fastMode = 0; +- +- cell->newState = 0; +- cell->symbol = symbolValue; +- cell->nbBits = 0; +- +- return 0; +-} +- +- +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) +-{ +- void* ptr = dt; +- FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; +- void* dPtr = dt + 1; +- FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSV1 = tableMask+1; +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* Build Decoding Table */ +- DTableH->tableLog = (U16)nbBits; +- DTableH->fastMode = 1; +- for (s=0; sfastMode; +- +- /* select fast mode (static) */ +- if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); +- return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); +-} +- +- +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +-{ +- return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); +-} +- + typedef struct { + short ncount[FSE_MAX_SYMBOL_VALUE + 1]; + FSE_DTable dtable[1]; /* Dynamically sized */ +@@ -342,7 +268,8 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + } + + if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); +- workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog); ++ assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); ++ workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + + CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); +@@ -382,9 +309,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, + return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); + } + +- +-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; +- +- +- + #endif /* FSE_COMMONDEFS_ONLY */ +diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h +index 5042ff870308..8e7943092ed1 100644 +--- a/lib/zstd/common/huf.h ++++ b/lib/zstd/common/huf.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * huff0 huffman codec, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -18,99 +19,22 @@ + + /* *** Dependencies *** */ + #include "zstd_deps.h" /* size_t */ +- +- +-/* *** library symbols visibility *** */ +-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, +- * HUF symbols remain "private" (internal symbols for library only). 
+- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +-# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +-# define HUF_PUBLIC_API __declspec(dllexport) +-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +-# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +-#else +-# define HUF_PUBLIC_API +-#endif +- +- +-/* ========================== */ +-/* *** simple functions *** */ +-/* ========================== */ +- +-/* HUF_compress() : +- * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. +- * 'dst' buffer must be already allocated. +- * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). +- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. +- * @return : size of compressed data (<= `dstCapacity`). +- * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) +- */ +-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/* HUF_decompress() : +- * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', +- * into already allocated buffer 'dst', of minimum size 'dstSize'. +- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. +- * Note : in contrast with FSE, HUF_decompress can regenerate +- * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, +- * because it knows size to regenerate (originalSize). +- * @return : size of regenerated data (== originalSize), +- * or an error code, which can be tested using HUF_isError() +- */ +-HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, +- const void* cSrc, size_t cSrcSize); ++#include "mem.h" /* U32 */ ++#define FSE_STATIC_LINKING_ONLY ++#include "fse.h" + + + /* *** Tool functions *** */ +-#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ +-HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ ++#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ ++size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ + + /* Error Management */ +-HUF_PUBLIC_API unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ +-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ ++unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ ++const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ + + +-/* *** Advanced function *** */ +- +-/* HUF_compress2() : +- * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. +- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . +- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog); +- +-/* HUF_compress4X_wksp() : +- * Same as HUF_compress2(), but uses externally allocated `workSpace`. 
+- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ + #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) + #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) +-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog, +- void* workSpace, size_t wkspSize); +- +-#endif /* HUF_H_298734234 */ +- +-/* ****************************************************************** +- * WARNING !! +- * The following section contains advanced and experimental definitions +- * which shall never be used in the context of a dynamic library, +- * because they are not guaranteed to remain stable in the future. +- * Only consider them in association with static linking. +- * *****************************************************************/ +-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +-#define HUF_H_HUF_STATIC_LINKING_ONLY +- +-/* *** Dependencies *** */ +-#include "mem.h" /* U32 */ +-#define FSE_STATIC_LINKING_ONLY +-#include "fse.h" +- + + /* *** Constants *** */ + #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ +@@ -151,25 +75,49 @@ typedef U32 HUF_DTable; + /* **************************************** + * Advanced decompression functions + ******************************************/ +-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-#endif + +-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< decodes RLE and uncompressed */ +-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++/* ++ * Huffman flags bitset. ++ * For all flags, 0 is the default value. ++ */ ++typedef enum { ++ /* ++ * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime. ++ * Otherwise: Ignored. ++ */ ++ HUF_flags_bmi2 = (1 << 0), ++ /* ++ * If set: Test possible table depths to find the one that produces the smallest header + encoded size. ++ * If unset: Use heuristic to find the table depth. ++ */ ++ HUF_flags_optimalDepth = (1 << 1), ++ /* ++ * If set: If the previous table can encode the input, always reuse the previous table. 
++ * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output. ++ */ ++ HUF_flags_preferRepeat = (1 << 2), ++ /* ++ * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress. ++ * If unset: Always histogram the entire input. ++ */ ++ HUF_flags_suspectUncompressible = (1 << 3), ++ /* ++ * If set: Don't use assembly implementations ++ * If unset: Allow using assembly implementations ++ */ ++ HUF_flags_disableAsm = (1 << 4), ++ /* ++ * If set: Don't use the fast decoding loop, always use the fallback decoding loop. ++ * If unset: Use the fast decoding loop when possible. ++ */ ++ HUF_flags_disableFast = (1 << 5) ++} HUF_flags_e; + + + /* **************************************** + * HUF detailed API + * ****************************************/ ++#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra + + /*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") +@@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. + */ +-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); ++unsigned HUF_minTableLog(unsigned symbolCardinality); ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue); ++unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, ++ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */ + size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + +@@ -196,6 +144,7 @@ typedef enum { + HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */ + } HUF_repeat; ++ + /* HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. 
+@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. + */ +-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) ++#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192) + #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) + size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, +@@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workspace, size_t wkspSize, +- int bmi2); ++ int flags); + + /* HUF_readCTable() : + * Loading a CTable saved with HUF_writeCTable() */ +@@ -276,32 +225,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); + #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) + #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +- + + /* ====================== */ + /* single stream variants */ + /* ====================== */ + +-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + /* HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. 
+ * If it uses hufTable it does not modify hufTable or repeat. +@@ -312,47 +241,28 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); +- +-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ +-#endif +- +-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); +-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */ +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif ++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); ++size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */ + #endif + + /* BMI2 variants. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
+ */ +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #endif +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + +-#endif /* HUF_STATIC_LINKING_ONLY */ ++#endif /* HUF_H_298734234 */ + +diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h +index 1d9cc03924ca..a7231822b6e3 100644 +--- a/lib/zstd/common/mem.h ++++ b/lib/zstd/common/mem.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h +index 0e3b2c0a527d..7ede8cf1ffe5 100644 +--- a/lib/zstd/common/portability_macros.h ++++ b/lib/zstd/common/portability_macros.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,7 +13,7 @@ + #define ZSTD_PORTABILITY_MACROS_H + + /* +- * This header file contains macro defintions to support portability. ++ * This header file contains macro definitions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. It MUST not contain any C code. + * +@@ -65,7 +66,7 @@ + #endif + + /* +- * Only enable assembly for GNUC comptabile compilers, ++ * Only enable assembly for GNUC compatible compilers, + * because other platforms may not support GAS assembly syntax. 
+ * + * Only enable assembly for Linux / MacOS, other platforms may +@@ -90,4 +91,23 @@ + */ + #define ZSTD_ENABLE_ASM_X86_64_BMI2 0 + ++/* ++ * For x86 ELF targets, add .note.gnu.property section for Intel CET in ++ * assembly sources when CET is enabled. ++ * ++ * Additionally, any function that may be called indirectly must begin ++ * with ZSTD_CET_ENDBRANCH. ++ */ ++#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \ ++ && defined(__has_include) ++# if __has_include() ++# include ++# define ZSTD_CET_ENDBRANCH _CET_ENDBR ++# endif ++#endif ++ ++#ifndef ZSTD_CET_ENDBRANCH ++# define ZSTD_CET_ENDBRANCH ++#endif ++ + #endif /* ZSTD_PORTABILITY_MACROS_H */ +diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c +index 3d7e35b309b5..5a9abca10944 100644 +--- a/lib/zstd/common/zstd_common.c ++++ b/lib/zstd/common/zstd_common.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h +index 7a5bf44839c9..925161416033 100644 +--- a/lib/zstd/common/zstd_deps.h ++++ b/lib/zstd/common/zstd_deps.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h +index 93305d9b41bb..170cd1db4819 100644 +--- a/lib/zstd/common/zstd_internal.h ++++ b/lib/zstd/common/zstd_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -28,7 +29,6 @@ + #include + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "huf.h" + #include /* XXH_reset, update, digest */ + #define ZSTD_TRACE 0 +@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e; + #define ZSTD_FRAMECHECKSUMSIZE 4 + + #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ +-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ ++#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */ ++#define MIN_LITERALS_FOR_4_STREAMS 6 + +-#define HufLog 12 + typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; + + #define LONGNBSEQ 0x7F00 +@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy + #define MINMATCH 3 + + #define Litbits 8 ++#define LitHufLog 11 + #define MaxLit ((1<= length) return; + op += 16; +@@ -240,7 +237,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + COPY16(op, ip); + } + while (op < oend); +-#endif + } + } + +@@ -289,11 +285,11 @@ typedef enum { + typedef struct { + seqDef* sequencesStart; + seqDef* sequences; /* ptr to end of sequences */ +- BYTE* litStart; +- BYTE* lit; /* ptr to end of literals */ +- BYTE* llCode; +- BYTE* mlCode; +- BYTE* ofCode; ++ BYTE* litStart; ++ BYTE* lit; /* ptr to end of literals */ ++ BYTE* llCode; ++ BYTE* mlCode; ++ BYTE* ofCode; + size_t maxNbSeq; + size_t maxNbLit; + +@@ -301,8 +297,8 @@ typedef struct { + * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment + * the existing value of the litLength or matchLength by 0x10000. 
+ */ +- ZSTD_longLengthType_e longLengthType; +- U32 longLengthPos; /* Index of the sequence to apply long length modification to */ ++ ZSTD_longLengthType_e longLengthType; ++ U32 longLengthPos; /* Index of the sequence to apply long length modification to */ + } seqStore_t; + + typedef struct { +@@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + seqLen.matchLength = seq->mlBase + MINMATCH; + if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { +- seqLen.litLength += 0xFFFF; ++ seqLen.litLength += 0x10000; + } + if (seqStore->longLengthType == ZSTD_llt_matchLength) { +- seqLen.matchLength += 0xFFFF; ++ seqLen.matchLength += 0x10000; + } + } + return seqLen; +@@ -337,12 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` + */ + typedef struct { ++ size_t nbBlocks; + size_t compressedSize; + unsigned long long decompressedBound; + } ZSTD_frameSizeInfo; /* decompress & legacy */ + + const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ + + /* custom memory allocation functions */ + void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); +@@ -350,61 +347,6 @@ void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); + void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); + + +-MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- +-/* +- * Counts the number of trailing zeros of a `size_t`. +- * Most compilers should support CTZ as a builtin. A backup +- * implementation is provided if the builtin isn't supported, but +- * it may not be terribly efficient. 
+- */ +-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) +-{ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return __builtin_ctzll((U64)val); +-# else +- static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, +- 4, 25, 14, 28, 9, 34, 20, 56, +- 5, 17, 26, 54, 15, 41, 29, 43, +- 10, 31, 38, 35, 21, 45, 49, 57, +- 63, 6, 12, 18, 24, 27, 33, 55, +- 16, 53, 40, 42, 30, 37, 44, 48, +- 62, 11, 23, 32, 52, 39, 36, 47, +- 61, 22, 51, 46, 60, 50, 59, 58 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return __builtin_ctz((U32)val); +-# else +- static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, +- 30, 22, 20, 15, 25, 17, 4, 8, +- 31, 27, 13, 23, 21, 19, 16, 7, +- 26, 12, 18, 6, 11, 5, 10, 9 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +-} +- +- + /* ZSTD_invalidateRepCodes() : + * ensures next compression will not use repcodes from previous block. + * Note : only works with regular variant; +diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h +index d9a76112ec3a..6ab8be6532ef 100644 +--- a/lib/zstd/compress/clevels.h ++++ b/lib/zstd/compress/clevels.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c +index ec5b1ca6d71a..e46ca6621b48 100644 +--- a/lib/zstd/compress/fse_compress.c ++++ b/lib/zstd/compress/fse_compress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy encoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -26,6 +27,7 @@ + #define ZSTD_DEPS_NEED_MALLOC + #define ZSTD_DEPS_NEED_MATH64 + #include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + assert(tableLog < 16); /* required for threshold strategy to work */ + + /* For explanations on how to distribute symbol values over the table : +- * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ ++ * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + #ifdef __clang_analyzer__ + ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ +@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + break; + default : + assert(normalizedCounter[s] > 1); +- { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); ++ { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; + symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); +@@ -342,21 +344,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, + * FSE Compression Code + ****************************************************************/ + +-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) +-{ +- size_t size; +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); +- return (FSE_CTable*)ZSTD_malloc(size); +-} +- +-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); } +- + /* provides the minimum logSize to safely represent a distribution */ + static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + { +- U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; +- U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; ++ U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; ++ U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + return minBits; +@@ -364,7 +356,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) + { +- U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; ++ U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + assert(srcSize > 1); /* Not supported, RLE should be used instead */ +@@ -532,40 +524,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, + return tableLog; + } + +- +-/* fake FSE_CTable, for raw (uncompressed) input */ +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) +-{ +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSymbolValue = tableMask; +- void* const ptr = ct; +- U16* const tableU16 = ( (U16*) ptr) + 2; +- void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */ +- FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* header */ +- tableU16[-2] = (U16) nbBits; +- tableU16[-1] = (U16) maxSymbolValue; +- +- /* Build table */ +- for (s=0; s= 2 ++ ++static size_t showU32(const U32* arr, size_t size) ++{ ++ size_t u; ++ for (u=0; u= sizeof(HUF_WriteCTableWksp)); ++ + /* check conditions */ + if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); +@@ -204,16 +264,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, + return ((maxSymbolValue+1)/2) + 1; + } + +-/*! HUF_writeCTable() : +- `CTable` : Huffman tree to save, using huf representation. +- @return : size of saved CTable */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, +- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +-{ +- HUF_WriteCTableWksp wksp; +- return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); +-} +- + + size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) + { +@@ -269,68 +319,64 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + + U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue) + { +- const HUF_CElt* ct = CTable + 1; ++ const HUF_CElt* const ct = CTable + 1; + assert(symbolValue <= HUF_SYMBOLVALUE_MAX); + return (U32)HUF_getNbBits(ct[symbolValue]); + } + + +-typedef struct nodeElt_s { +- U32 count; +- U16 parent; +- BYTE byte; +- BYTE nbBits; +-} nodeElt; +- + /* + * HUF_setMaxHeight(): +- * Enforces maxNbBits on the Huffman tree described in huffNode. ++ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. + * +- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts +- * the tree to so that it is a valid canonical Huffman tree. ++ * It attempts to convert all nodes with nbBits > @targetNbBits ++ * to employ @targetNbBits instead. Then it adjusts the tree ++ * so that it remains a valid canonical Huffman tree. + * + * @pre The sum of the ranks of each symbol == 2^largestBits, + * where largestBits == huffNode[lastNonNull].nbBits. 
+ * @post The sum of the ranks of each symbol == 2^largestBits, +- * where largestBits is the return value <= maxNbBits. ++ * where largestBits is the return value (expected <= targetNbBits). + * +- * @param huffNode The Huffman tree modified in place to enforce maxNbBits. ++ * @param huffNode The Huffman tree modified in place to enforce targetNbBits. ++ * It's presumed sorted, from most frequent to rarest symbol. + * @param lastNonNull The symbol with the lowest count in the Huffman tree. +- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree ++ * @param targetNbBits The allowed number of bits, which the Huffman tree + * may not respect. After this function the Huffman tree will +- * respect maxNbBits. +- * @return The maximum number of bits of the Huffman tree after adjustment, +- * necessarily no more than maxNbBits. ++ * respect targetNbBits. ++ * @return The maximum number of bits of the Huffman tree after adjustment. + */ +-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) ++static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) + { + const U32 largestBits = huffNode[lastNonNull].nbBits; +- /* early exit : no elt > maxNbBits, so the tree is already valid. */ +- if (largestBits <= maxNbBits) return largestBits; ++ /* early exit : no elt > targetNbBits, so the tree is already valid. */ ++ if (largestBits <= targetNbBits) return largestBits; ++ ++ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); + + /* there are several too large elements (at least >= 2) */ + { int totalCost = 0; +- const U32 baseCost = 1 << (largestBits - maxNbBits); ++ const U32 baseCost = 1 << (largestBits - targetNbBits); + int n = (int)lastNonNull; + +- /* Adjust any ranks > maxNbBits to maxNbBits. ++ /* Adjust any ranks > targetNbBits to targetNbBits. + * Compute totalCost, which is how far the sum of the ranks is + * we are over 2^largestBits after adjust the offending ranks. + */ +- while (huffNode[n].nbBits > maxNbBits) { ++ while (huffNode[n].nbBits > targetNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); +- huffNode[n].nbBits = (BYTE)maxNbBits; ++ huffNode[n].nbBits = (BYTE)targetNbBits; + n--; + } +- /* n stops at huffNode[n].nbBits <= maxNbBits */ +- assert(huffNode[n].nbBits <= maxNbBits); +- /* n end at index of smallest symbol using < maxNbBits */ +- while (huffNode[n].nbBits == maxNbBits) --n; ++ /* n stops at huffNode[n].nbBits <= targetNbBits */ ++ assert(huffNode[n].nbBits <= targetNbBits); ++ /* n end at index of smallest symbol using < targetNbBits */ ++ while (huffNode[n].nbBits == targetNbBits) --n; + +- /* renorm totalCost from 2^largestBits to 2^maxNbBits ++ /* renorm totalCost from 2^largestBits to 2^targetNbBits + * note : totalCost is necessarily a multiple of baseCost */ +- assert((totalCost & (baseCost - 1)) == 0); +- totalCost >>= (largestBits - maxNbBits); ++ assert(((U32)totalCost & (baseCost - 1)) == 0); ++ totalCost >>= (largestBits - targetNbBits); + assert(totalCost > 0); + + /* repay normalized cost */ +@@ -339,19 +385,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + + /* Get pos of last (smallest = lowest cum. 
count) symbol per rank */ + ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); +- { U32 currentNbBits = maxNbBits; ++ { U32 currentNbBits = targetNbBits; + int pos; + for (pos=n ; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currentNbBits) continue; +- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ +- rankLast[maxNbBits-currentNbBits] = (U32)pos; ++ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ ++ rankLast[targetNbBits-currentNbBits] = (U32)pos; + } } + + while (totalCost > 0) { + /* Try to reduce the next power of 2 above totalCost because we + * gain back half the rank. + */ +- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; ++ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; +@@ -391,7 +437,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; +- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) ++ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ +@@ -403,11 +449,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + * TODO. + */ + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ +- /* special case : no rank 1 symbol (using maxNbBits-1); +- * let's create one from largest rank 0 (using maxNbBits). ++ /* special case : no rank 1 symbol (using targetNbBits-1); ++ * let's create one from largest rank 0 (using targetNbBits). + */ + if (rankLast[1] == noSymbol) { +- while (huffNode[n].nbBits == maxNbBits) n--; ++ while (huffNode[n].nbBits == targetNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); +@@ -421,7 +467,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + } /* repay normalized cost */ + } /* there are several too large elements (at least >= 2) */ + +- return maxNbBits; ++ return targetNbBits; + } + + typedef struct { +@@ -429,7 +475,7 @@ typedef struct { + U16 curr; + } rankPos; + +-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; ++typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; + + /* Number of buckets available for HUF_sort() */ + #define RANK_POSITION_TABLE_SIZE 192 +@@ -448,8 +494,8 @@ typedef struct { + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ + #define RANK_POSITION_MAX_COUNT_LOG 32 +-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ +-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ ++#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) ++#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) + + /* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. +@@ -457,7 +503,7 @@ typedef struct { + static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? 
count +- : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; ++ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; + } + + /* Helper swap function for HUF_quickSortPartition() */ +@@ -580,7 +626,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { +- U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; ++ int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); +@@ -591,6 +637,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + assert(HUF_isSorted(huffNode, maxSymbolValue1)); + } + ++ + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). +@@ -611,6 +658,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + int lowS, lowN; + int nodeNb = STARTNODE; + int n, nodeRoot; ++ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); + /* init for parents */ + nonNullRank = (int)maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; +@@ -637,6 +685,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + for (n=0; n<=nonNullRank; n++) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + ++ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); ++ + return nonNullRank; + } + +@@ -674,28 +724,36 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i + CTable[0] = maxNbBits; + } + +-size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize) ++size_t ++HUF_buildCTable_wksp(HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, ++ void* workSpace, size_t wkspSize) + { +- HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32)); ++ HUF_buildCTable_wksp_tables* const wksp_tables = ++ (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32)); + nodeElt* const huffNode0 = wksp_tables->huffNodeTbl; + nodeElt* const huffNode = huffNode0+1; + int nonNullRank; + ++ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); ++ ++ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); ++ + /* safety checks */ + if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) +- return ERROR(workSpace_tooSmall); ++ return ERROR(workSpace_tooSmall); + if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) +- return ERROR(maxSymbolValue_tooLarge); ++ return ERROR(maxSymbolValue_tooLarge); + ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); ++ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); + + /* build tree */ + nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); + +- /* enforce maxTableLog */ ++ /* determine and enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, 
maxNbBits); + if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ + +@@ -804,7 +862,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id + #if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); +- size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; ++ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. */ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); +@@ -884,7 +942,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC) + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (nbBits > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } + } + +@@ -1045,9 +1103,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +@@ -1058,28 +1116,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- (void)bmi2; ++ (void)flags; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); + } + + #endif + +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +-{ +- return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + static size_t + HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, int bmi2) ++ const HUF_CElt* CTable, int flags) + { + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; +@@ -1093,7 +1146,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + op += 6; /* jumpTable */ + + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; +@@ -1101,7 +1154,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, 
HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; +@@ -1109,7 +1162,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; +@@ -1118,7 +1171,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + op += cSize; + } +@@ -1126,14 +1179,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + return (size_t)(op-ostart); + } + +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +-{ +- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; +@@ -1141,11 +1189,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; + static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, +- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) ++ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) + { + size_t const cSize = (nbStreams==HUF_singleStream) ? 
+- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : +- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); ++ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : ++ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; +@@ -1168,6 +1216,79 @@ typedef struct { + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) ++{ ++ unsigned cardinality = 0; ++ unsigned i; ++ ++ for (i = 0; i < maxSymbolValue + 1; i++) { ++ if (count[i] != 0) cardinality += 1; ++ } ++ ++ return cardinality; ++} ++ ++unsigned HUF_minTableLog(unsigned symbolCardinality) ++{ ++ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; ++ return minBitsSymbols; ++} ++ ++unsigned HUF_optimalTableLog( ++ unsigned maxTableLog, ++ size_t srcSize, ++ unsigned maxSymbolValue, ++ void* workSpace, size_t wkspSize, ++ HUF_CElt* table, ++ const unsigned* count, ++ int flags) ++{ ++ assert(srcSize > 1); /* Not supported, RLE should be used instead */ ++ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); ++ ++ if (!(flags & HUF_flags_optimalDepth)) { ++ /* cheap evaluation, based on FSE */ ++ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ } ++ ++ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); ++ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); ++ size_t maxBits, hSize, newSize; ++ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); ++ const unsigned minTableLog = HUF_minTableLog(symbolCardinality); ++ size_t optSize = ((size_t) ~0) - 1; ++ unsigned optLog = maxTableLog, optLogGuess; ++ ++ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); ++ ++ /* Search until size increases */ ++ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { ++ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); ++ maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); ++ if (ERR_isError(maxBits)) continue; ++ ++ if (maxBits < optLogGuess && optLogGuess > minTableLog) break; ++ ++ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); ++ ++ if (ERR_isError(hSize)) continue; ++ ++ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; ++ ++ if (newSize > optSize + 1) { ++ break; ++ } ++ ++ if (newSize < optSize) { ++ optSize = newSize; ++ optLog = optLogGuess; ++ } ++ } ++ assert(optLog <= HUF_TABLELOG_MAX); ++ return optLog; ++ } ++} ++ + /* HUF_compress_internal() : + * `workSpace_align4` must be aligned on 4-bytes boundaries, + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ +@@ -1177,14 +1298,14 @@ HUF_compress_internal (void* dst, size_t dstSize, + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, +- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, +- const int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) + { + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); + BYTE* const ostart = 
(BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + ++ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ +@@ -1198,16 +1319,17 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ +- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); +- if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { ++ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; ++ DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; +@@ -1224,6 +1346,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } ++ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); + + /* Check validity of previous table */ + if ( repeat +@@ -1232,19 +1355,20 @@ HUF_compress_internal (void* dst, size_t dstSize, + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ +- if (preferRepeat && repeat && *repeat != HUF_repeat_none) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* Build Huffman Tree */ +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; ++ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); + } + /* Zero unused symbols in CTable, so we can check it for validity */ + { +@@ -1263,7 +1387,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } } + + /* Use the new huffman table */ +@@ -1275,46 +1399,20 @@ HUF_compress_internal (void* dst, size_t dstSize, + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, table->CTable, bmi2); +-} +- +- +-size_t HUF_compress1X_wksp (void* dst, size_t 
dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_singleStream, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ nbStreams, table->CTable, flags); + } + + size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, +- int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, hufTable, +- repeat, preferRepeat, bmi2, suspectUncompressible); +-} +- +-/* HUF_compress4X_repeat(): +- * compress input using 4 streams. +- * provide workspace to generate compression tables */ +-size_t HUF_compress4X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_fourStreams, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ repeat, flags); + } + + /* HUF_compress4X_repeat(): +@@ -1325,11 +1423,11 @@ size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, +- hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); ++ hufTable, repeat, flags); + } +- +diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c +index f620cafca633..81b8cd119cd8 100644 +--- a/lib/zstd/compress/zstd_compress.c ++++ b/lib/zstd/compress/zstd_compress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,6 @@ + #include "hist.h" /* HIST_countFast_wksp */ + #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_compress_internal.h" + #include "zstd_compress_sequences.h" +@@ -27,6 +27,7 @@ + #include "zstd_opt.h" + #include "zstd_ldm.h" + #include "zstd_compress_superblock.h" ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + /* *************************************************************** + * Tuning parameters +@@ -55,14 +56,17 @@ + * Helper functions + ***************************************/ + /* ZSTD_compressBound() +- * Note that the result from this function is only compatible with the "normal" +- * full-block strategy. 
+- * When there are a lot of small blocks due to frequent flush in streaming mode +- * the overhead of headers can make the compressed data to be larger than the +- * return value of ZSTD_compressBound(). ++ * Note that the result from this function is only valid for ++ * the one-pass compression functions. ++ * When employing the streaming mode, ++ * if flushes are frequently altering the size of blocks, ++ * the overhead from block headers can make the compressed data larger ++ * than the return value of ZSTD_compressBound(). + */ + size_t ZSTD_compressBound(size_t srcSize) { +- return ZSTD_COMPRESSBOUND(srcSize); ++ size_t const r = ZSTD_COMPRESSBOUND(srcSize); ++ if (r==0) return ERROR(srcSize_wrong); ++ return r; + } + + +@@ -171,12 +175,9 @@ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); +- { +- int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); ++ { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); +- if (!cctxInWorkspace) { +- ZSTD_customFree(cctx, cctx->customMem); +- } ++ if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); + } + return 0; + } +@@ -257,9 +258,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); + } + +-/* Returns 1 if compression parameters are such that we should ++/* Returns ZSTD_ps_enable if compression parameters are such that we should + * enable long distance matching (wlog >= 27, strategy >= btopt). +- * Returns 0 otherwise. ++ * Returns ZSTD_ps_disable otherwise. + */ + static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +@@ -267,6 +268,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; + } + ++static int ZSTD_resolveExternalSequenceValidation(int mode) { ++ return mode; ++} ++ ++/* Resolves maxBlockSize to the default if no value is present. */ ++static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { ++ if (maxBlockSize == 0) { ++ return ZSTD_BLOCKSIZE_MAX; ++ } else { ++ return maxBlockSize; ++ } ++} ++ ++static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) { ++ if (value != ZSTD_ps_auto) return value; ++ if (cLevel < 10) { ++ return ZSTD_ps_disable; ++ } else { ++ return ZSTD_ps_enable; ++ } ++} ++ ++/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. ++ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. 
*/ ++static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { ++ return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; ++} ++ + static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) + { +@@ -284,6 +313,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + } + cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); ++ cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); ++ cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); ++ cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, ++ cctxParams.compressionLevel); + assert(!ZSTD_checkCParams(cParams)); + return cctxParams; + } +@@ -329,10 +362,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) + #define ZSTD_NO_CLEVEL 0 + + /* +- * Initializes the cctxParams from params and compressionLevel. ++ * Initializes `cctxParams` from `params` and `compressionLevel`. + * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. + */ +-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel) ++static void ++ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ++ const ZSTD_parameters* params, ++ int compressionLevel) + { + assert(!ZSTD_checkCParams(params->cParams)); + ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); +@@ -345,6 +381,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par + cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); + cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); + cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); ++ cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); ++ cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); ++ cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", + cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); + } +@@ -359,7 +398,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete + + /* + * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. +- * @param param Validated zstd parameters. ++ * @param params Validated zstd parameters. 
+ */ + static void ZSTD_CCtxParams_setZstdParams( + ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +@@ -455,8 +494,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + return bounds; + + case ZSTD_c_enableLongDistanceMatching: +- bounds.lowerBound = 0; +- bounds.upperBound = 1; ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_ldmHashLog: +@@ -549,6 +588,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + bounds.upperBound = 1; + return bounds; + ++ case ZSTD_c_prefetchCDictTables: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ ++ case ZSTD_c_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; +@@ -613,6 +672,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + default: + return 0; + } +@@ -625,7 +688,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { +- RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); ++ RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); + } } + + switch(param) +@@ -668,6 +731,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); +@@ -723,12 +790,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); +- CCtxParams->cParams.minMatch = value; ++ CCtxParams->cParams.minMatch = (U32)value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); +- CCtxParams->cParams.targetLength = value; ++ CCtxParams->cParams.targetLength = (U32)value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : +@@ -741,12 +808,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + /* Content size written in frame header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; +- return CCtxParams->fParams.contentSizeFlag; ++ return (size_t)CCtxParams->fParams.contentSizeFlag; + + case ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; +- return CCtxParams->fParams.checksumFlag; ++ return (size_t)CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* 
When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); +@@ -755,18 +822,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); +- return CCtxParams->forceWindow; ++ return (size_t)CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; +- BOUNDCHECK(ZSTD_c_forceAttachDict, pref); ++ BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { + const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; +- BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); ++ BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); + CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } +@@ -789,47 +856,48 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_enableDedicatedDictSearch : + CCtxParams->enableDedicatedDictSearch = (value!=0); +- return CCtxParams->enableDedicatedDictSearch; ++ return (size_t)CCtxParams->enableDedicatedDictSearch; + + case ZSTD_c_enableLongDistanceMatching : ++ BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); + CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); +- CCtxParams->ldmParams.hashLog = value; ++ CCtxParams->ldmParams.hashLog = (U32)value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); +- CCtxParams->ldmParams.minMatchLength = value; ++ CCtxParams->ldmParams.minMatchLength = (U32)value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); +- CCtxParams->ldmParams.bucketSizeLog = value; ++ CCtxParams->ldmParams.bucketSizeLog = (U32)value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); +- CCtxParams->ldmParams.hashRateLog = value; ++ CCtxParams->ldmParams.hashRateLog = (U32)value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); +- CCtxParams->targetCBlockSize = value; ++ CCtxParams->targetCBlockSize = (U32)value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; +- return CCtxParams->srcSizeHint; ++ return (size_t)CCtxParams->srcSizeHint; + + case ZSTD_c_stableInBuffer: + BOUNDCHECK(ZSTD_c_stableInBuffer, value); +@@ -866,6 +934,27 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + CCtxParams->deterministicRefPrefix = !!value; + return CCtxParams->deterministicRefPrefix; + ++ case ZSTD_c_prefetchCDictTables: ++ BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); ++ CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->prefetchCDictTables; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); ++ CCtxParams->enableMatchFinderFallback = value; ++ return 
CCtxParams->enableMatchFinderFallback; ++ ++ case ZSTD_c_maxBlockSize: ++ if (value!=0) /* 0 ==> default */ ++ BOUNDCHECK(ZSTD_c_maxBlockSize, value); ++ CCtxParams->maxBlockSize = value; ++ return CCtxParams->maxBlockSize; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value); ++ CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->searchForExternalRepcodes; ++ + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + } +@@ -980,6 +1069,18 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; ++ case ZSTD_c_prefetchCDictTables: ++ *value = (int)CCtxParams->prefetchCDictTables; ++ break; ++ case ZSTD_c_enableSeqProducerFallback: ++ *value = CCtxParams->enableMatchFinderFallback; ++ break; ++ case ZSTD_c_maxBlockSize: ++ *value = (int)CCtxParams->maxBlockSize; ++ break; ++ case ZSTD_c_searchForExternalRepcodes: ++ *value = (int)CCtxParams->searchForExternalRepcodes; ++ break; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +@@ -1006,9 +1107,24 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( + return 0; + } + ++size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) ++{ ++ DEBUGLOG(4, "ZSTD_CCtx_setCParams"); ++ assert(cctx != NULL); ++ if (cctx->streamStage != zcss_init) { ++ /* All parameters in @cparams are allowed to be updated during MT compression. ++ * This must be signaled, so that MT compression picks up the changes */ ++ cctx->cParamsChanged = 1; ++ } ++ /* only update if parameters are valid */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); ++ cctx->requestedParams.cParams = cparams; ++ return 0; ++} ++ + size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) + { +- DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); ++ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; +@@ -1151,6 +1267,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't reset parameters only when not in init stage."); + ZSTD_clearAllDicts(cctx); ++ ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx)); + return ZSTD_CCtxParams_reset(&cctx->requestedParams); + } + return 0; +@@ -1247,7 +1364,8 @@ static ZSTD_compressionParameters + ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize, +- ZSTD_cParamMode_e mode) ++ ZSTD_cParamMode_e mode, ++ ZSTD_paramSwitch_e useRowMatchFinder) + { + const U64 minSrcSize = 513; /* (1<<9) + 1 */ + const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); +@@ -1281,8 +1399,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + } + + /* resize windowLog if input is small enough, to use less memory */ +- if ( (srcSize < maxWindowResize) +- && (dictSize < maxWindowResize) ) { ++ if ( (srcSize <= maxWindowResize) ++ && (dictSize <= maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? 
ZSTD_HASHLOG_MIN : +@@ -1300,6 +1418,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + ++ /* We can't use more than 32 bits of hash in total, so that means that we require: ++ * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 ++ */ ++ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { ++ U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; ++ if (cPar.hashLog > maxShortCacheHashLog) { ++ cPar.hashLog = maxShortCacheHashLog; ++ } ++ if (cPar.chainLog > maxShortCacheHashLog) { ++ cPar.chainLog = maxShortCacheHashLog; ++ } ++ } ++ ++ ++ /* At this point, we aren't 100% sure if we are using the row match finder. ++ * Unless it is explicitly disabled, conservatively assume that it is enabled. ++ * In this case it will only be disabled for small sources, so shrinking the ++ * hash log a little bit shouldn't result in any ratio loss. ++ */ ++ if (useRowMatchFinder == ZSTD_ps_auto) ++ useRowMatchFinder = ZSTD_ps_enable; ++ ++ /* We can't hash more than 32-bits in total. So that means that we require: ++ * (hashLog - rowLog + 8) <= 32 ++ */ ++ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { ++ /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); ++ U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; ++ U32 const maxHashLog = maxRowHashLog + rowLog; ++ assert(cPar.hashLog >= rowLog); ++ if (cPar.hashLog > maxHashLog) { ++ cPar.hashLog = maxHashLog; ++ } ++ } ++ + return cPar; + } + +@@ -1310,7 +1464,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, + { + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; +- return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); ++ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); + } + + static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); +@@ -1341,7 +1495,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( + ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ +- return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); + } + + static size_t +@@ -1386,6 +1540,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; + } + ++/* Helper function for calculating memory requirements. ++ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ ++static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) { ++ U32 const divider = (minMatch==3 || useSequenceProducer) ? 
3 : 4; ++ return blockSize / divider; ++} ++ + static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_compressionParameters* cParams, + const ldmParams_t* ldmParams, +@@ -1393,12 +1554,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_paramSwitch_e useRowMatchFinder, + const size_t buffInSize, + const size_t buffOutSize, +- const U64 pledgedSrcSize) ++ const U64 pledgedSrcSize, ++ int useSequenceProducer, ++ size_t maxBlockSize) + { + size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (cParams->minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); +@@ -1417,6 +1579,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + + size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ size_t const externalSeqSpace = useSequenceProducer ++ ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) ++ : 0; ++ + size_t const neededSpace = + cctxSpace + + entropySpace + +@@ -1425,7 +1592,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ldmSeqSpace + + matchStateSize + + tokenSpace + +- bufferSpace; ++ bufferSpace + ++ externalSeqSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + return neededSpace; +@@ -1443,7 +1611,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) + * be needed. However, we still allocate two 0-sized buffers, which can + * take space under ASAN. */ + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( +- &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); ++ &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize); + } + + size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) +@@ -1493,7 +1661,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog); + size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) + ? 
((size_t)1 << cParams.windowLog) + blockSize + : 0; +@@ -1504,7 +1672,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) + + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, +- ZSTD_CONTENTSIZE_UNKNOWN); ++ ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize); + } + } + +@@ -1768,6 +1936,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + assert(params->useRowMatchFinder != ZSTD_ps_auto); + assert(params->useBlockSplitter != ZSTD_ps_auto); + assert(params->ldmParams.enableLdm != ZSTD_ps_auto); ++ assert(params->maxBlockSize != 0); + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* Adjust long distance matching parameters */ + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); +@@ -1776,9 +1945,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + } + + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(params->maxBlockSize, windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, params->useSequenceProducer); + size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) + ? ZSTD_compressBound(blockSize) + 1 + : 0; +@@ -1795,7 +1963,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + size_t const neededSpace = + ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, +- buffInSize, buffOutSize, pledgedSrcSize); ++ buffInSize, buffOutSize, pledgedSrcSize, params->useSequenceProducer, params->maxBlockSize); + int resizeWorkspace; + + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); +@@ -1838,6 +2006,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + /* init params */ + zc->blockState.matchState.cParams = params->cParams; ++ zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable; + zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; + zc->consumedSrcSize = 0; + zc->producedCSize = 0; +@@ -1907,6 +2076,14 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + zc->ldmState.loadedDictEnd = 0; + } + ++ /* reserve space for block-level external sequences */ ++ if (params->useSequenceProducer) { ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ zc->externalMatchCtx.seqBufferCapacity = maxNbExternalSeq; ++ zc->externalMatchCtx.seqBuffer = ++ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); ++ } ++ + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); + assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); + +@@ -1980,7 +2157,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + } + + params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, +- cdict->dictContentSize, ZSTD_cpm_attachDict); ++ cdict->dictContentSize, ZSTD_cpm_attachDict, ++ params.useRowMatchFinder); + params.cParams.windowLog = windowLog; + params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, +@@ -2019,6 +2197,22 @@ 
ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + return 0; + } + ++static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize, ++ ZSTD_compressionParameters const* cParams) { ++ if (ZSTD_CDictIndicesAreTagged(cParams)){ ++ /* Remove tags from the CDict table if they are present. ++ * See docs on "short cache" in zstd_compress_internal.h for context. */ ++ size_t i; ++ for (i = 0; i < tableSize; i++) { ++ U32 const taggedIndex = src[i]; ++ U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; ++ dst[i] = index; ++ } ++ } else { ++ ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); ++ } ++} ++ + static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, +@@ -2054,14 +2248,15 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + : 0; + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; + +- ZSTD_memcpy(cctx->blockState.matchState.hashTable, +- cdict->matchState.hashTable, +- hSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, ++ cdict->matchState.hashTable, ++ hSize, cdict_cParams); ++ + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { +- ZSTD_memcpy(cctx->blockState.matchState.chainTable, +- cdict->matchState.chainTable, +- chainSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, ++ cdict->matchState.chainTable, ++ chainSize, cdict_cParams); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { +@@ -2147,6 +2342,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; + params.fParams = fParams; ++ params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; + ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff); +@@ -2294,7 +2490,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par + + /* See doc/zstd_compression_format.md for detailed format description */ + +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + { + const seqDef* const sequences = seqStorePtr->sequencesStart; + BYTE* const llCodeTable = seqStorePtr->llCode; +@@ -2302,18 +2498,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + BYTE* const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; ++ int longOffsets = 0; + assert(nbSeq <= seqStorePtr->maxNbSeq); + for (u=0; u= STREAM_ACCUMULATOR_MIN)); ++ if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) ++ longOffsets = 1; + } + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; ++ return longOffsets; + } + + /* ZSTD_useTargetCBlockSize(): +@@ -2347,6 +2549,7 @@ typedef struct { + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. 
More detail in ZSTD_entropyCompressSeqStore_internal() */ ++ int longOffsets; + } ZSTD_symbolEncodingTypeStats_t; + + /* ZSTD_buildSequencesStatistics(): +@@ -2357,11 +2560,13 @@ typedef struct { + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +- const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, +- BYTE* dst, const BYTE* const dstEnd, +- ZSTD_strategy strategy, unsigned* countWorkspace, +- void* entropyWorkspace, size_t entropyWkspSize) { ++ZSTD_buildSequencesStatistics( ++ const seqStore_t* seqStorePtr, size_t nbSeq, ++ const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, ++ BYTE* dst, const BYTE* const dstEnd, ++ ZSTD_strategy strategy, unsigned* countWorkspace, ++ void* entropyWorkspace, size_t entropyWkspSize) ++{ + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; +@@ -2375,7 +2580,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + + stats.lastCountSize = 0; + /* convert length/distances into codes */ +- ZSTD_seqToCodes(seqStorePtr); ++ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ +@@ -2480,22 +2685,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + */ + #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- void* entropyWorkspace, size_t entropyWkspSize, +- const int bmi2) ++ZSTD_entropyCompressSeqStore_internal( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ const int bmi2) + { +- const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned* count = (unsigned*)entropyWorkspace; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; + const seqDef* const sequences = seqStorePtr->sequencesStart; +- const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; +@@ -2503,29 +2708,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t lastCountSize; ++ int longOffsets = 0; + + entropyWorkspace = count + (MaxSeq + 1); + entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); + +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<= HUF_WORKSPACE_SIZE); + + /* Compress literals */ + { const BYTE* 
const literals = seqStorePtr->litStart; +- size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart; +- size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; ++ size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); + /* Base suspicion of uncompressibility on ratio of literals to sequences */ + unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); + size_t const litSize = (size_t)(seqStorePtr->lit - literals); ++ + size_t const cSize = ZSTD_compressLiterals( +- &prevEntropy->huf, &nextEntropy->huf, +- cctxParams->cParams.strategy, +- ZSTD_literalsCompressionIsDisabled(cctxParams), + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, +- bmi2, suspectUncompressible); ++ &prevEntropy->huf, &nextEntropy->huf, ++ cctxParams->cParams.strategy, ++ ZSTD_literalsCompressionIsDisabled(cctxParams), ++ suspectUncompressible, bmi2); + FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; +@@ -2551,11 +2758,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } +- { +- ZSTD_symbolEncodingTypeStats_t stats; +- BYTE* seqHead = op++; ++ { BYTE* const seqHead = op++; + /* build stats for sequences */ +- stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, ++ const ZSTD_symbolEncodingTypeStats_t stats = ++ ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, +@@ -2564,6 +2770,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; ++ longOffsets = stats.longOffsets; + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +@@ -2598,14 +2805,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + } + + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- size_t srcSize, +- void* entropyWorkspace, size_t entropyWkspSize, +- int bmi2) ++ZSTD_entropyCompressSeqStore( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ size_t srcSize, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) + { + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( + seqStorePtr, prevEntropy, nextEntropy, cctxParams, +@@ -2615,15 +2823,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. 
+ */ +- if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) ++ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) { ++ DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); + return 0; /* block not compressed */ ++ } + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); + + /* Check compressibility */ + { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. ++ * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. ++ */ ++ assert(cSize < ZSTD_BLOCKSIZE_MAX); + return cSize; + } + +@@ -2718,6 +2932,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) + ssPtr->longLengthType = ZSTD_llt_none; + } + ++/* ZSTD_postProcessSequenceProducerResult() : ++ * Validates and post-processes sequences obtained through the external matchfinder API: ++ * - Checks whether nbExternalSeqs represents an error condition. ++ * - Appends a block delimiter to outSeqs if one is not already present. ++ * See zstd.h for context regarding block delimiters. ++ * Returns the number of sequences after post-processing, or an error code. */ ++static size_t ZSTD_postProcessSequenceProducerResult( ++ ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize ++) { ++ RETURN_ERROR_IF( ++ nbExternalSeqs > outSeqsCapacity, ++ sequenceProducer_failed, ++ "External sequence producer returned error code %lu", ++ (unsigned long)nbExternalSeqs ++ ); ++ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == 0 && srcSize > 0, ++ sequenceProducer_failed, ++ "Got zero sequences from external sequence producer for a non-empty src buffer!" ++ ); ++ ++ if (srcSize == 0) { ++ ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); ++ return 1; ++ } ++ ++ { ++ ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; ++ ++ /* We can return early if lastSeq is already a block delimiter. */ ++ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { ++ return nbExternalSeqs; ++ } ++ ++ /* This error condition is only possible if the external matchfinder ++ * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == outSeqsCapacity, ++ sequenceProducer_failed, ++ "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" ++ ); ++ ++ /* lastSeq is not a block delimiter, so we need to append one. */ ++ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); ++ return nbExternalSeqs + 1; ++ } ++} ++ ++/* ZSTD_fastSequenceLengthSum() : ++ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. ++ * Similar to another function in zstd_compress.c (determine_blockSize), ++ * except it doesn't check for a block delimiter to end summation. ++ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). ++ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. 
*/ ++static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { ++ size_t matchLenSum, litLenSum, i; ++ matchLenSum = 0; ++ litLenSum = 0; ++ for (i = 0; i < seqBufSize; i++) { ++ litLenSum += seqBuf[i].litLength; ++ matchLenSum += seqBuf[i].matchLength; ++ } ++ return litLenSum + matchLenSum; ++} ++ + typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; + + static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2727,7 +3007,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); +- if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { + ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); + } else { +@@ -2763,6 +3045,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ zc->appliedParams.useSequenceProducer, ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." ++ ); ++ + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, +@@ -2774,6 +3065,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { + rawSeqStore_t ldmSeqStore = kNullRawSeqStore; + ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ zc->appliedParams.useSequenceProducer, ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." 
++ ); ++ + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; + /* Updates ldmSeqStore.size */ +@@ -2788,7 +3087,68 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); +- } else { /* not long range mode */ ++ } else if (zc->appliedParams.useSequenceProducer) { ++ assert( ++ zc->externalMatchCtx.seqBufferCapacity >= ZSTD_sequenceBound(srcSize) ++ ); ++ assert(zc->externalMatchCtx.mFinder != NULL); ++ ++ { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; ++ ++ size_t const nbExternalSeqs = (zc->externalMatchCtx.mFinder)( ++ zc->externalMatchCtx.mState, ++ zc->externalMatchCtx.seqBuffer, ++ zc->externalMatchCtx.seqBufferCapacity, ++ src, srcSize, ++ NULL, 0, /* dict and dictSize, currently not supported */ ++ zc->appliedParams.compressionLevel, ++ windowSize ++ ); ++ ++ size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( ++ zc->externalMatchCtx.seqBuffer, ++ nbExternalSeqs, ++ zc->externalMatchCtx.seqBufferCapacity, ++ srcSize ++ ); ++ ++ /* Return early if there is no error, since we don't need to worry about last literals */ ++ if (!ZSTD_isError(nbPostProcessedSeqs)) { ++ ZSTD_sequencePosition seqPos = {0,0,0}; ++ size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs); ++ RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); ++ FORWARD_IF_ERROR( ++ ZSTD_copySequencesToSeqStoreExplicitBlockDelim( ++ zc, &seqPos, ++ zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs, ++ src, srcSize, ++ zc->appliedParams.searchForExternalRepcodes ++ ), ++ "Failed to copy external sequences to seqStore!" ++ ); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); ++ return ZSTDbss_compress; ++ } ++ ++ /* Propagate the error if fallback is disabled */ ++ if (!zc->appliedParams.enableMatchFinderFallback) { ++ return nbPostProcessedSeqs; ++ } ++ ++ /* Fallback to software matchfinder */ ++ { ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG( ++ 5, ++ "External sequence producer returned error code %lu. 
Falling back to internal parser.", ++ (unsigned long)nbExternalSeqs ++ ); ++ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); ++ } } ++ } else { /* not long range mode and no external matchfinder */ + ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, + dictMode); +@@ -2849,7 +3209,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) + /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode + so we provide seqStoreSeqs[i].offset - 1 */ + ZSTD_updateRep(updatedRepcodes.rep, +- seqStoreSeqs[i].offBase - 1, ++ seqStoreSeqs[i].offBase, + seqStoreSeqs[i].litLength == 0); + literalsRead += outSeqs[i].litLength; + } +@@ -2865,6 +3225,10 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) + zc->seqCollector.seqIndex += seqStoreSeqSize; + } + ++size_t ZSTD_sequenceBound(size_t srcSize) { ++ return (srcSize / ZSTD_MINMATCH_MIN) + 1; ++} ++ + size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + size_t outSeqsSize, const void* src, size_t srcSize) + { +@@ -2910,19 +3274,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { + const size_t unrollMask = unrollSize - 1; + const size_t prefixLength = length & unrollMask; + size_t i; +- size_t u; + if (length == 1) return 1; + /* Check if prefix is RLE first before using unrolled loop */ + if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { + return 0; + } + for (i = prefixLength; i != length; i += unrollSize) { ++ size_t u; + for (u = 0; u < unrollSize; u += sizeof(size_t)) { + if (MEM_readST(ip + i + u) != valueST) { + return 0; +- } +- } +- } ++ } } } + return 1; + } + +@@ -2938,7 +3300,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) + return nbSeqs < 4 && nbLits < 10; + } + +-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) ++static void ++ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) + { + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; +@@ -2946,7 +3309,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c + } + + /* Writes the block header */ +-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { ++static void ++writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) ++{ + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); +@@ -2959,13 +3324,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. 
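The hunks above wire an external sequence producer into ZSTD_buildSeqStore(): the callback fills a ZSTD_Sequence buffer sized by the new ZSTD_sequenceBound() (srcSize / ZSTD_MINMATCH_MIN + 1), ZSTD_postProcessSequenceProducerResult() validates the result and appends a block delimiter (offset == 0, matchLength == 0) when one is missing, and on failure the code either falls back to the internal matchfinder or reports sequenceProducer_failed. The sketch below mirrors the argument shape of the call made in ZSTD_buildSeqStore(); the demo_* names and the choice to emit a single literals-only delimiter are illustrative, not taken from the patch.

#include <stddef.h>
#include <string.h>

/* Same field layout as the public ZSTD_Sequence (offset, litLength,
 * matchLength, rep); redeclared here only so the sketch is self-contained. */
typedef struct { unsigned offset, litLength, matchLength, rep; } demo_Sequence;

/* Hypothetical producer with the argument shape used by the call in
 * ZSTD_buildSeqStore() above: (state, outSeqs, outSeqsCapacity, src, srcSize,
 * dict, dictSize, compressionLevel, windowSize).  It finds no matches and
 * emits one block-delimiter sequence (offset == 0, matchLength == 0) whose
 * litLength covers the whole block, which is exactly the terminator that
 * ZSTD_postProcessSequenceProducerResult() checks for. */
static size_t demo_literalsOnlyProducer(void* state,
                                        demo_Sequence* outSeqs, size_t outSeqsCapacity,
                                        const void* src, size_t srcSize,
                                        const void* dict, size_t dictSize,
                                        int compressionLevel, size_t windowSize)
{
    (void)state; (void)src; (void)dict; (void)dictSize;
    (void)compressionLevel; (void)windowSize;
    /* The caller sizes outSeqs with ZSTD_sequenceBound(srcSize), so at least
     * one slot is available there; returning a value larger than the
     * capacity is how a producer signals failure. */
    if (outSeqsCapacity < 1) return outSeqsCapacity + 1;
    memset(&outSeqs[0], 0, sizeof(outSeqs[0]));
    outSeqs[0].litLength = (unsigned)srcSize;   /* everything as literals */
    return 1;                                   /* number of sequences written */
}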
+ * Requires ENTROPY_WORKSPACE_SIZE workspace +- * @return : size of huffman description table or error code */ +-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, +- const ZSTD_hufCTables_t* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_hufCTablesMetadata_t* hufMetadata, +- const int literalsCompressionIsDisabled, +- void* workspace, size_t wkspSize) ++ * @return : size of huffman description table, or an error code ++ */ ++static size_t ++ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const int literalsCompressionIsDisabled, ++ void* workspace, size_t wkspSize, ++ int hufFlags) + { + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; +@@ -2973,9 +3341,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; +- const size_t nodeWkspSize = wkspEnd-nodeWksp; ++ const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +- unsigned huffLog = HUF_TABLELOG_DEFAULT; ++ unsigned huffLog = LitHufLog; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + +@@ -2990,73 +3358,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + + /* small ? don't even attempt compression (speed opt) */ + #ifndef COMPRESS_LITERALS_SIZE_MIN +-#define COMPRESS_LITERALS_SIZE_MIN 63 ++# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ + #endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Scan input and build symbol stats */ +- { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); ++ { size_t const largest = ++ HIST_count_wksp (countWksp, &maxSymbolValue, ++ (const BYTE*)src, srcSize, ++ workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { ++ /* only one literal symbol */ + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { ++ /* heuristic: likely not compressible */ + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Validate the previous Huffman table */ +- if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { ++ if (repeat == HUF_repeat_check ++ && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); ++ assert(huffLog <= LitHufLog); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; +- { /* Build and write the CTable */ +- size_t const newCSize = HUF_estimateCompressedSize( +- (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); +- size_t const hSize = HUF_writeCTable_wksp( +- hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), +- (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, +- nodeWksp, nodeWkspSize); +- /* Check against repeating the previous CTable */ +- if (repeat != HUF_repeat_none) { +- size_t const oldCSize = HUF_estimateCompressedSize( +- (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); +- if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { +- DEBUGLOG(5, "set_repeat - smaller"); +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_repeat; +- return 0; +- } +- } +- if (newCSize + hSize >= srcSize) { +- DEBUGLOG(5, "set_basic - no gains"); ++ } ++ { /* Build and write the CTable */ ++ size_t const newCSize = HUF_estimateCompressedSize( ++ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); ++ size_t const hSize = HUF_writeCTable_wksp( ++ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), ++ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, ++ nodeWksp, nodeWkspSize); ++ /* Check against repeating the previous CTable */ ++ if (repeat != HUF_repeat_none) { ++ size_t const oldCSize = HUF_estimateCompressedSize( ++ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); ++ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { ++ DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_basic; ++ hufMetadata->hType = set_repeat; + return 0; +- } +- DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); +- hufMetadata->hType = set_compressed; +- nextHuf->repeatMode = HUF_repeat_check; +- return 
hSize; ++ } } ++ if (newCSize + hSize >= srcSize) { ++ DEBUGLOG(5, "set_basic - no gains"); ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ hufMetadata->hType = set_basic; ++ return 0; + } ++ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); ++ hufMetadata->hType = set_compressed; ++ nextHuf->repeatMode = HUF_repeat_check; ++ return hSize; + } + } + +@@ -3066,8 +3438,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + * and updates nextEntropy to the appropriate repeatMode. + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { +- ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; ++ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) ++{ ++ ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; +@@ -3078,16 +3451,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. +- * @return : size of fse tables or error code */ +-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, +- const ZSTD_fseCTables_t* prevEntropy, +- ZSTD_fseCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize) ++ * @return : size of fse tables or error code */ ++static size_t ++ZSTD_buildBlockEntropyStats_sequences( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_fseCTables_t* prevEntropy, ++ ZSTD_fseCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize) + { + ZSTD_strategy const strategy = cctxParams->cParams.strategy; +- size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; +@@ -3114,23 +3489,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * Requires workspace size ENTROPY_WORKSPACE_SIZE +- * +- * @return : 0 on success or error code ++ * @return : 0 on success, or an error code ++ * Note : also employed in superblock + */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize) +-{ +- size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize) ++{ ++ size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); ++ int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); ++ int const hufFlags = huf_useOptDepth ? 
HUF_flags_optimalDepth : 0; ++ + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), +- workspace, wkspSize); ++ workspace, wkspSize, hufFlags); ++ + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, +@@ -3143,11 +3523,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + } + + /* Returns the size estimate for the literals section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, +- const ZSTD_hufCTables_t* huf, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, ++ const ZSTD_hufCTables_t* huf, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +@@ -3169,12 +3550,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz + } + + /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, +- const BYTE* codeTable, size_t nbSeq, unsigned maxCode, +- const FSE_CTable* fseCTable, +- const U8* additionalBits, +- short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, +- void* workspace, size_t wkspSize) ++static size_t ++ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, ++ const BYTE* codeTable, size_t nbSeq, unsigned maxCode, ++ const FSE_CTable* fseCTable, ++ const U8* additionalBits, ++ short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, ++ void* workspace, size_t wkspSize) + { + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; +@@ -3206,99 +3588,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + } + + /* Returns the size estimate for the sequences section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, +- fseTables->offcodeCTable, NULL, +- OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, +- workspace, wkspSize); ++ fseTables->offcodeCTable, NULL, ++ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, +- 
fseTables->litlengthCTable, LL_bits, +- LL_defaultNorm, LL_defaultNormLog, MaxLL, +- workspace, wkspSize); ++ fseTables->litlengthCTable, LL_bits, ++ LL_defaultNorm, LL_defaultNormLog, MaxLL, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, +- fseTables->matchlengthCTable, ML_bits, +- ML_defaultNorm, ML_defaultNormLog, MaxML, +- workspace, wkspSize); ++ fseTables->matchlengthCTable, ML_bits, ++ ML_defaultNorm, ML_defaultNormLog, MaxML, ++ workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + + /* Returns the size estimate for a given stream of literals, of, ll, ml */ +-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, +- const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_entropyCTables_t* entropy, +- const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { ++static size_t ++ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, ++ const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_entropyCTables_t* entropy, ++ const ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize, ++ int writeLitEntropy, int writeSeqEntropy) ++{ + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, +- nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, +- workspace, wkspSize, writeSeqEntropy); ++ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, ++ workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; + } + + /* Builds entropy statistics and uses them for blocksize estimation. + * +- * Returns the estimated compressed size of the seqStore, or a zstd error. ++ * @return: estimated compressed size of the seqStore, or a zstd error. 
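Stepping back to the literals path above, ZSTD_buildBlockEntropyStats_literals() settles on one of four literal-section encodings through a cascade of cheap checks before committing to a fresh Huffman description. The condensed restatement below keeps the thresholds visible in the hunk (COMPRESS_LITERALS_SIZE_MIN, the (srcSize >> 7) + 4 flat-histogram cutoff, the repeat-versus-fresh-table comparison); the demo_* names are illustrative, and the measured costs are passed in directly instead of being computed with HIST_count_wksp() and HUF_estimateCompressedSize().

#include <stddef.h>

typedef enum { demo_basic, demo_rle, demo_repeat, demo_compressed } demo_litMode;

/* Condensed restatement of the decision order in
 * ZSTD_buildBlockEntropyStats_literals() above. */
static demo_litMode demo_chooseLiteralsMode(size_t srcSize,
                                            size_t largestSymbolCount,
                                            int prevTableUsable, /* previous Huffman table still valid */
                                            size_t oldCSize,     /* estimated cost with previous table */
                                            size_t newCSize,     /* estimated cost with a fresh table */
                                            size_t headerSize)   /* cost of describing the fresh table */
{
    if (srcSize <= (prevTableUsable ? 6u : 63u))            /* COMPRESS_LITERALS_SIZE_MIN */
        return demo_basic;                                  /* too small to bother */
    if (largestSymbolCount == srcSize)
        return demo_rle;                                    /* a single literal symbol */
    if (largestSymbolCount <= (srcSize >> 7) + 4)
        return demo_basic;                                  /* flat histogram: likely no gain */
    if (prevTableUsable && oldCSize < srcSize
        && (oldCSize <= headerSize + newCSize || headerSize + 12 >= srcSize))
        return demo_repeat;                                 /* reusing the previous table wins */
    if (newCSize + headerSize >= srcSize)
        return demo_basic;                                  /* no gain once the table is paid for */
    return demo_compressed;
}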
+ */ +-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { +- ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; ++static size_t ++ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) ++{ ++ ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); +- return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), ""); ++ return ZSTD_estimateBlockSize( ++ seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), +- &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, ++ &zc->blockState.nextCBlock->entropy, ++ entropyMetadata, ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); + } + + /* Returns literals bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) ++{ + size_t literalsBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ seqDef const seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; +- } +- } ++ } } + return literalsBytes; + } + + /* Returns match bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) ++{ + size_t matchBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { + seqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; +- } +- } ++ } } + return matchBytes; + } + +@@ -3307,15 +3697,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { + */ + static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + const seqStore_t* originalSeqStore, +- size_t startIdx, size_t endIdx) { +- BYTE* const litEnd = originalSeqStore->lit; +- size_t literalsBytes; +- size_t literalsBytesPreceding = 0; +- ++ size_t startIdx, size_t endIdx) ++{ + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; +- literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move 
longLengthPos into the correct position if necessary */ +@@ -3328,13 +3715,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; +- literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); +- resultSeqStore->litStart += literalsBytesPreceding; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ +- resultSeqStore->lit = litEnd; ++ assert(resultSeqStore->lit == originalSeqStore->lit); + } else { +- resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; ++ size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; +@@ -3342,20 +3728,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + + /* +- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. +- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). ++ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. ++ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ + static U32 +-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) +-{ +- U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ +- assert(STORED_IS_REPCODE(offCode)); +- if (adjustedOffCode == ZSTD_REP_NUM) { +- /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ +- assert(rep[0] > 0); ++ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) ++{ ++ U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ ++ assert(OFFBASE_IS_REPCODE(offBase)); ++ if (adjustedRepCode == ZSTD_REP_NUM) { ++ assert(ll0); ++ /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 ++ * This is only valid if it results in a valid offset value, aka > 0. ++ * Note : it may happen that `rep[0]==1` in exceptional circumstances. ++ * In which case this function will return 0, which is an invalid offset. ++ * It's not an issue though, since this value will be ++ * compared and discarded within ZSTD_seqStore_resolveOffCodes(). 
++ */ + return rep[0] - 1; + } +- return rep[adjustedOffCode]; ++ return rep[adjustedRepCode]; + } + + /* +@@ -3371,30 +3763,32 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, +- seqStore_t* const seqStore, U32 const nbSeq) { ++static void ++ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, ++ const seqStore_t* const seqStore, U32 const nbSeq) ++{ + U32 idx = 0; + for (; idx < nbSeq; ++idx) { + seqDef* const seq = seqStore->sequencesStart + idx; + U32 const ll0 = (seq->litLength == 0); +- U32 const offCode = OFFBASE_TO_STORED(seq->offBase); +- assert(seq->offBase > 0); +- if (STORED_IS_REPCODE(offCode)) { +- U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); +- U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); ++ U32 const offBase = seq->offBase; ++ assert(offBase > 0); ++ if (OFFBASE_IS_REPCODE(offBase)) { ++ U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); ++ U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { +- seq->offBase = cRawOffset + ZSTD_REP_NUM; ++ seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ +- ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); +- ZSTD_updateRep(cRepcodes->rep, offCode, ll0); ++ ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); ++ ZSTD_updateRep(cRepcodes->rep, offBase, ll0); + } + } + +@@ -3404,10 +3798,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_ + * Returns the total size of that block (including header) or a ZSTD error code. + */ + static size_t +-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, ++ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, ++ const seqStore_t* const seqStore, + repcodes_t* const dRep, repcodes_t* const cRep, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, ++ const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) + { + const U32 rleMaxLength = 25; +@@ -3481,45 +3876,49 @@ typedef struct { + + /* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. +- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then +- * we do not recurse. ++ * If advantageous to split, then we recurse down the two sub-blocks. ++ * If not, or if an error occurred in estimation, then we do not recurse. + * +- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. ++ * Note: The recursion depth is capped by a heuristic minimum number of sequences, ++ * defined by MIN_SEQUENCES_BLOCK_SPLITTING. 
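The repcode rewrite above hinges on a single mapping: the repcode number carried by offBase, shifted by the litLength == 0 flag, selects a slot of the 3-entry repcode history, with the combination (repcode 3, ll0 == 1) standing for "previous offset minus one". Below is a standalone restatement of ZSTD_resolveRepcodeToRawOffset(); DEMO_REP_NUM is assumed to equal ZSTD_REP_NUM (3), and the demo_ name is illustrative.

#include <assert.h>

#define DEMO_REP_NUM 3   /* assumed equal to ZSTD_REP_NUM */

/* repIdx is the repcode number (1..3) carried by offBase, ll0 is 1 when the
 * sequence has no literals: ll0 shifts the selection by one slot. */
static unsigned demo_repcodeToRawOffset(const unsigned rep[DEMO_REP_NUM],
                                        unsigned repIdx, unsigned ll0)
{
    unsigned const adjusted = repIdx - 1 + ll0;   /* 0..3 */
    assert(repIdx >= 1 && repIdx <= DEMO_REP_NUM);
    if (adjusted == DEMO_REP_NUM)
        return rep[0] - 1;   /* may legitimately be 0; the caller compares and discards it */
    return rep[adjusted];
}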
+ * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * +- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize ++ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. ++ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ + static void + ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, + ZSTD_CCtx* zc, const seqStore_t* origSeqStore) + { +- seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; +- seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; +- seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; ++ seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; ++ seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; ++ seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + ++ DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); ++ assert(endIdx >= startIdx); + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { +- DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); ++ DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); + return; + } +- DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); +- DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", ++ DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { ++ DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; +@@ -3527,14 +3926,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end + } + } + +-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. ++/* Base recursive function. ++ * Populates a table with intra-block partition indices that can improve compression ratio. + * +- * Returns the number of splits made (which equals the size of the partition table - 1). 
++ * @return: number of splits made (which equals the size of the partition table - 1). + */ +-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { +- seqStoreSplits splits = {partitions, 0}; ++static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) ++{ ++ seqStoreSplits splits; ++ splits.splitLocations = partitions; ++ splits.idx = 0; + if (nbSeq <= 4) { +- DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); ++ DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } +@@ -3550,18 +3953,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ + static size_t +-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, +- const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) ++ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t blockSize, ++ U32 lastBlock, U32 nbSeq) + { + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; +- U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ +- seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; +- seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; +- size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); ++ U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ ++ seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; ++ seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; ++ size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. 
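The splitter above is a plain divide and conquer over the sequence index range, driven entirely by the entropy-based size estimate: a range is split only when the two half estimates beat the whole-range estimate, recursion stops below a minimum sequence count or once ZSTD_MAX_NB_BLOCK_SPLITS (196) partitions exist, and ZSTD_deriveBlockSplits() refuses blocks with 4 or fewer sequences outright. The schematic below hides the estimator behind a callback (the real code derives seqStore chunks and calls ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()) and does not model its error returns; DEMO_MIN_SEQ_SPLIT is a placeholder for MIN_SEQUENCES_BLOCK_SPLITTING, whose value is not shown in this hunk.

#include <stddef.h>

#define DEMO_MAX_NB_SPLITS 196u   /* ZSTD_MAX_NB_BLOCK_SPLITS, per the comment above */
#define DEMO_MIN_SEQ_SPLIT 300u   /* placeholder for MIN_SEQUENCES_BLOCK_SPLITTING */

typedef size_t (*demo_costFn)(void* ctx, size_t startIdx, size_t endIdx);

typedef struct { unsigned splits[DEMO_MAX_NB_SPLITS]; size_t idx; } demo_splitTable;

/* Schematic of ZSTD_deriveBlockSplitsHelper(): recurse on [start,mid) and
 * [mid,end) only when the two half estimates beat the whole-range estimate. */
static void demo_deriveSplits(demo_splitTable* out, size_t startIdx, size_t endIdx,
                              demo_costFn estimate, void* ctx)
{
    size_t const midIdx = (startIdx + endIdx) / 2;
    if (endIdx - startIdx < DEMO_MIN_SEQ_SPLIT || out->idx >= DEMO_MAX_NB_SPLITS)
        return;
    {   size_t const whole  = estimate(ctx, startIdx, endIdx);
        size_t const first  = estimate(ctx, startIdx, midIdx);
        size_t const second = estimate(ctx, midIdx, endIdx);
        if (first + second < whole) {
            demo_deriveSplits(out, startIdx, midIdx, estimate, ctx);
            if (out->idx < DEMO_MAX_NB_SPLITS)   /* bound kept explicit in the sketch */
                out->splits[out->idx++] = (unsigned)midIdx;
            demo_deriveSplits(out, midIdx, endIdx, estimate, ctx);
        }
    }
}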
In order to reconcile potentially invalid repcodes, we keep track of two +@@ -3583,30 +3988,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); + ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); + +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { +- size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, +- &dRep, &cRep, +- op, dstCapacity, +- ip, blockSize, +- lastBlock, 0 /* isPartition */); ++ size_t cSizeSingleBlock = ++ ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, ++ &dRep, &cRep, ++ op, dstCapacity, ++ ip, blockSize, ++ lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); +- assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX); ++ assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { +- size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + +- srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); ++ size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ +@@ -3621,7 +4027,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); +- DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); ++ DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", ++ ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; +@@ -3629,10 +4036,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; +- assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize); + } +- /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes +- * for the next block. ++ /* cRep and dRep may have diverged during the compression. ++ * If so, we use the dRep repcodes for the next block. 
+ */ + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); + return cSize; +@@ -3643,8 +4050,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) + { +- const BYTE* ip = (const BYTE*)src; +- BYTE* op = (BYTE*)dst; + U32 nbSeq; + size_t cSize; + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); +@@ -3655,7 +4060,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; +- cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); ++ cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; +@@ -3673,9 +4078,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) + { +- /* This the upper bound for the length of an rle block. +- * This isn't the actual upper bound. Finding the real threshold +- * needs further investigation. ++ /* This is an estimated upper bound for the length of an rle block. ++ * This isn't the actual upper bound. ++ * Finding the real threshold needs further investigation. + */ + const U32 rleMaxLength = 25; + size_t cSize; +@@ -3767,10 +4172,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. + */ +- { +- size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); ++ { size_t const cSize = ++ ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { +- size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); ++ size_t const maxCSize = ++ srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); +@@ -3778,7 +4184,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + } + } + } +- } ++ } /* if (bss == ZSTDbss_compress)*/ + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. +@@ -3836,7 +4242,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, + * All blocks will be terminated, all input will be consumed. + * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. + * Frame is supposed already started (header already produced) +-* @return : compressed size, or an error code ++* @return : compressed size, or an error code + */ + static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, +@@ -3860,7 +4266,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + ZSTD_matchState_t* const ms = &cctx->blockState.matchState; + U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); + +- RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. 
We need to revisit and change this logic to be more consistent */ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, + dstSize_tooSmall, + "not enough space to store compressed block"); + if (remaining < blockSize) blockSize = remaining; +@@ -3899,7 +4307,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } +- } ++ } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ + + + ip += blockSize; +@@ -4091,7 +4499,7 @@ size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) + { + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); +- return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); ++ return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); + } + + size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) +@@ -4111,31 +4519,47 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* src, size_t srcSize, +- ZSTD_dictTableLoadMethod_e dtlm) ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) + { + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; + +- /* Assert that we the ms params match the params we're being given */ ++ /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + +- if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ { /* Ensure large dictionaries can't cause index overflow */ ++ + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ +- U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; +- /* We must have cleared our windows when our source is this large. */ +- assert(ZSTD_window_isEmpty(ms->window)); +- if (loadLdmDict) +- assert(ZSTD_window_isEmpty(ls->window)); ++ U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; ++ ++ int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); ++ if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { ++ /* Some dictionary matchfinders in zstd use "short cache", ++ * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each ++ * CDict hashtable entry as a tag rather than as part of an index. ++ * When short cache is used, we need to truncate the dictionary ++ * so that its indices don't overlap with the tag. */ ++ U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; ++ maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); ++ assert(!loadLdmDict); ++ } ++ + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; + src = ip; + srcSize = maxDictSize; +- } ++ } } ++ ++ if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ /* We must have cleared our windows when our source is this large. 
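The dictionary-loading change above reserves the low ZSTD_SHORT_CACHE_TAG_BITS of each CDict hashtable entry as a tag, so the usable index space shrinks and an oversized dictionary must be truncated to its suffix before being indexed. The sketch below shows just that suffix selection; both constants are placeholders, since their values are not visible in this hunk.

#include <stddef.h>
#include <stdint.h>

#define DEMO_SHORT_CACHE_TAG_BITS 8u   /* placeholder for ZSTD_SHORT_CACHE_TAG_BITS */
#define DEMO_WINDOW_START_INDEX   2u   /* placeholder for ZSTD_WINDOW_START_INDEX */

/* Sketch of the suffix selection in ZSTD_loadDictionaryContent() above: once
 * the low tag bits are reserved, indices above this limit would collide with
 * the tag, so only the most recent maxDictSize bytes of the dictionary are
 * indexed. */
static void demo_clampDictForShortCache(const uint8_t** dictPtr, size_t* dictSize)
{
    size_t const maxDictSize =
        ((size_t)1 << (32 - DEMO_SHORT_CACHE_TAG_BITS)) - DEMO_WINDOW_START_INDEX;
    if (*dictSize > maxDictSize) {
        *dictPtr += *dictSize - maxDictSize;   /* keep the dictionary tail */
        *dictSize = maxDictSize;
    }
}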
*/ ++ assert(ZSTD_window_isEmpty(ms->window)); ++ if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); + } + + DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); +@@ -4158,10 +4582,10 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + switch(params->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, dtlm); ++ ZSTD_fillHashTable(ms, iend, dtlm, tfp); + break; + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, dtlm); ++ ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); + break; + + case ZSTD_greedy: +@@ -4327,6 +4751,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + const BYTE* dictPtr = (const BYTE*)dict; +@@ -4345,7 +4770,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( +- ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); ++ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); + } + return dictID; + } +@@ -4361,6 +4786,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); +@@ -4373,13 +4799,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict restricted modes */ + if (dictContentType == ZSTD_dct_rawContent) +- return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); ++ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent( +- ms, ls, ws, params, dict, dictSize, dtlm); ++ ms, ls, ws, params, dict, dictSize, dtlm, tfp); + } + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + assert(0); /* impossible */ +@@ -4387,13 +4813,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary( +- bs, ms, ws, params, dict, dictSize, dtlm, workspace); ++ bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); + } + + #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) + #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL) + + /*! 
ZSTD_compressBegin_internal() : ++ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both + * @return : 0, or an error code */ + static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, +@@ -4426,11 +4853,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, cdict->dictContentType, dtlm, +- cctx->entropyWorkspace) ++ ZSTD_tfp_forCCtx, cctx->entropyWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, +- dictContentType, dtlm, cctx->entropyWorkspace); ++ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; +@@ -4471,11 +4898,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, + &cctxParams, pledgedSrcSize); + } + +-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++size_t ++ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) + { + ZSTD_CCtx_params cctxParams; +- { +- ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); ++ { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel); + } + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); +@@ -4709,7 +5136,7 @@ static size_t ZSTD_initCDict_internal( + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + ¶ms, cdict->dictContent, cdict->dictContentSize, +- dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); ++ dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; +@@ -5197,30 +5624,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) + + static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) + { +- size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; +- if (hintInSize==0) hintInSize = cctx->blockSize; +- return hintInSize; ++ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ return cctx->blockSize - cctx->stableIn_notConsumed; ++ } ++ assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); ++ { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; ++ if (hintInSize==0) hintInSize = cctx->blockSize; ++ return hintInSize; ++ } + } + + /* ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants +- * non-static, because can be called from zstdmt_compress.c +- * @return : hint size for next input */ ++ * @return : hint size for next input to complete ongoing block */ + static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) + { +- const char* const istart = (const char*)input->src; +- const char* const iend = input->size != 0 ? 
istart + input->size : istart; +- const char* ip = input->pos != 0 ? istart + input->pos : istart; +- char* const ostart = (char*)output->dst; +- char* const oend = output->size != 0 ? ostart + output->size : ostart; +- char* op = output->pos != 0 ? ostart + output->pos : ostart; ++ const char* const istart = (assert(input != NULL), (const char*)input->src); ++ const char* const iend = (istart != NULL) ? istart + input->size : istart; ++ const char* ip = (istart != NULL) ? istart + input->pos : istart; ++ char* const ostart = (assert(output != NULL), (char*)output->dst); ++ char* const oend = (ostart != NULL) ? ostart + output->size : ostart; ++ char* op = (ostart != NULL) ? ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ +- DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); ++ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); ++ assert(zcs != NULL); ++ if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ assert(input->pos >= zcs->stableIn_notConsumed); ++ input->pos -= zcs->stableIn_notConsumed; ++ ip -= zcs->stableIn_notConsumed; ++ zcs->stableIn_notConsumed = 0; ++ } + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); +@@ -5229,8 +5667,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + } +- assert(output->pos <= output->size); ++ if (input->src == NULL) assert(input->size == 0); + assert(input->pos <= input->size); ++ if (output->dst == NULL) assert(output->size == 0); ++ assert(output->pos <= output->size); + assert((U32)flushMode <= (U32)ZSTD_e_end); + + while (someMoreWork) { +@@ -5262,8 +5702,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + zcs->inBuff + zcs->inBuffPos, toLoad, + ip, iend-ip); + zcs->inBuffPos += loaded; +- if (loaded != 0) +- ip += loaded; ++ if (ip) ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ +@@ -5274,6 +5713,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + /* empty */ + someMoreWork = 0; break; + } ++ } else { ++ assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ if ( (flushMode == ZSTD_e_continue) ++ && ( (size_t)(iend - ip) < zcs->blockSize) ) { ++ /* can't compress a full block : stop here */ ++ zcs->stableIn_notConsumed = (size_t)(iend - ip); ++ ip = iend; /* pretend to have consumed input */ ++ someMoreWork = 0; break; ++ } ++ if ( (flushMode == ZSTD_e_flush) ++ && (ip == iend) ) { ++ /* empty */ ++ someMoreWork = 0; break; ++ } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); +@@ -5281,9 +5734,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + void* cDst; + size_t cSize; + size_t oSize = oend-op; +- size_t const iSize = inputBuffered +- ? zcs->inBuffPos - zcs->inToCompress +- : MIN((size_t)(iend - ip), zcs->blockSize); ++ size_t const iSize = inputBuffered ? 
zcs->inBuffPos - zcs->inToCompress ++ : MIN((size_t)(iend - ip), zcs->blockSize); + if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) + cDst = op; /* compress into output buffer, to skip flush stage */ + else +@@ -5306,19 +5758,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; +- } else { +- unsigned const lastBlock = (ip + iSize == iend); +- assert(flushMode == ZSTD_e_end /* Already validated */); ++ } else { /* !inputBuffered, hence ZSTD_bm_stable */ ++ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); + cSize = lastBlock ? + ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : + ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); + /* Consume the input prior to error checking to mirror buffered mode. */ +- if (iSize > 0) +- ip += iSize; ++ if (ip) ip += iSize; + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +- if (lastBlock) +- assert(ip == iend); ++ if (lastBlock) assert(ip == iend); + } + if (cDst == op) { /* no need to flush */ + op += cSize; +@@ -5388,8 +5837,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf + /* After a compression call set the expected input/output buffer. + * This is validated at the start of the next compression call. + */ +-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input) ++static void ++ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) + { ++ DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + cctx->expectedInBuffer = *input; + } +@@ -5408,22 +5859,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, + { + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + ZSTD_inBuffer const expect = cctx->expectedInBuffer; +- if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!"); +- if (endOp != ZSTD_e_end) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!"); ++ if (expect.src != input->src || expect.pos != input->pos) ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); + } ++ (void)endOp; + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + size_t const outBufferSize = output->size - output->pos; + if (cctx->expectedOutBufferSize != outBufferSize) +- RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!"); ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + } + return 0; + } + + static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + ZSTD_EndDirective endOp, +- size_t inSize) { ++ size_t inSize) ++{ + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. 
*/ +@@ -5437,9 +5888,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + params.compressionLevel = cctx->cdict->compressionLevel; + } + DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); +- if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ +- { +- size_t const dictSize = prefixDict.dict ++ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ ++ ++ { size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? cctx->cdict->dictContentSize : 0); + ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); +@@ -5451,6 +5902,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); ++ params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); ++ params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); ++ params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); + + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); +@@ -5477,6 +5931,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + return 0; + } + ++/* @return provides a minimum amount of data remaining to be flushed from internal buffers ++ */ + size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, +@@ -5491,8 +5947,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { +- FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed"); +- ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ ++ size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ ++ size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; ++ if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ ++ && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ ++ && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ ++ if (cctx->stableIn_notConsumed) { /* not the first time */ ++ /* check stable source guarantees */ ++ RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); ++ RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); ++ } ++ /* pretend input was consumed, to give a sense forward progress */ ++ input->pos = input->size; ++ /* save stable inBuffer, for later control, and flush/end */ ++ cctx->expectedInBuffer = *input; ++ /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ ++ cctx->stableIn_notConsumed += inputSize; ++ /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */ ++ return 
ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ ++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); ++ ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + } + /* end of transparent initialization stage */ + +@@ -5510,13 +5985,20 @@ size_t ZSTD_compressStream2_simpleArgs ( + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; + /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } + + size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5539,6 +6021,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + /* Reset to the original values. */ + cctx->requestedParams.inBufferMode = originalInBufferMode; + cctx->requestedParams.outBufferMode = originalOutBufferMode; ++ + FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); + if (result != 0) { /* compression not completed, due to lack of output space */ + assert(oPos == dstCapacity); +@@ -5549,64 +6032,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + } + } + +-typedef struct { +- U32 idx; /* Index in array of ZSTD_Sequence */ +- U32 posInSequence; /* Position within sequence at idx */ +- size_t posInSrc; /* Number of bytes given by sequences provided so far */ +-} ZSTD_sequencePosition; +- + /* ZSTD_validateSequence() : + * @offCode : is presumed to follow format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ + static size_t +-ZSTD_validateSequence(U32 offCode, U32 matchLength, +- size_t posInSrc, U32 windowLog, size_t dictSize) ++ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch, ++ size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) + { +- U32 const windowSize = 1 << windowLog; ++ U32 const windowSize = 1u << windowLog; + /* posInSrc represents the amount of data the decoder would decode up to this point. + * As long as the amount of data decoded is less than or equal to window size, offsets may be + * larger than the total length of output decoded in order to reference the dict, even larger than + * window size. After output surpasses windowSize, we're limited to windowSize offsets again. + */ + size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; +- RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); +- RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); ++ size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 
3 : 4; ++ RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); ++ /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ ++ RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); + return 0; + } + + /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ +-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) ++static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) + { +- U32 offCode = STORE_OFFSET(rawOffset); ++ U32 offBase = OFFSET_TO_OFFBASE(rawOffset); + + if (!ll0 && rawOffset == rep[0]) { +- offCode = STORE_REPCODE_1; ++ offBase = REPCODE1_TO_OFFBASE; + } else if (rawOffset == rep[1]) { +- offCode = STORE_REPCODE(2 - ll0); ++ offBase = REPCODE_TO_OFFBASE(2 - ll0); + } else if (rawOffset == rep[2]) { +- offCode = STORE_REPCODE(3 - ll0); ++ offBase = REPCODE_TO_OFFBASE(3 - ll0); + } else if (ll0 && rawOffset == rep[0] - 1) { +- offCode = STORE_REPCODE_3; ++ offBase = REPCODE3_TO_OFFBASE; + } +- return offCode; ++ return offBase; + } + +-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of +- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. +- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ++ ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; ++ U32 const startIdx = idx; + BYTE const* ip = (BYTE const*)(src); + const BYTE* const iend = ip + blockSize; + repcodes_t updatedRepcodes; + U32 dictSize; + ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize); ++ + if (cctx->cdict) { + dictSize = (U32)cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5615,25 +6095,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + dictSize = 0; + } + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { ++ for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { + U32 const litLength = inSeqs[idx].litLength; +- U32 const ll0 = (litLength == 0); + U32 const matchLength = inSeqs[idx].matchLength; +- U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ U32 offBase; + +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); ++ if (externalRepSearch == ZSTD_ps_disable) { ++ offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); ++ } else { ++ U32 const ll0 = (litLength == 0); ++ offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } ++ ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, 
matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer), + "Sequence validation failed"); + } +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ assert(externalRepSearch != ZSTD_ps_auto); ++ assert(idx >= startIdx); ++ if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { ++ U32* const rep = updatedRepcodes.rep; ++ U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ ++ ++ if (lastSeqIdx >= startIdx + 2) { ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (lastSeqIdx == startIdx + 1) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else { ++ assert(lastSeqIdx == startIdx); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } ++ } ++ + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); + + if (inSeqs[idx].litLength) { +@@ -5642,26 +6152,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ip += inSeqs[idx].litLength; + seqPos->posInSrc += inSeqs[idx].litLength; + } +- RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); ++ RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); + seqPos->idx = idx+1; + return 0; + } + +-/* Returns the number of bytes to move the current read position back by. Only non-zero +- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something +- * went wrong. +- * +- * This function will attempt to scan through blockSize bytes represented by the sequences +- * in inSeqs, storing any (partial) sequences. +- * +- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to +- * avoid splitting a match, or to avoid splitting a match such that it would produce a match +- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. 
+- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; + U32 startPosInSequence = seqPos->posInSequence; +@@ -5673,6 +6172,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + U32 bytesAdjustment = 0; + U32 finalMatchSplit = 0; + ++ /* TODO(embg) support fast parsing mode in noBlockDelim mode */ ++ (void)externalRepSearch; ++ + if (cctx->cdict) { + dictSize = cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5680,7 +6182,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } else { + dictSize = 0; + } +- DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); + DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); + while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { +@@ -5688,7 +6190,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + U32 litLength = currSeq.litLength; + U32 matchLength = currSeq.matchLength; + U32 const rawOffset = currSeq.offset; +- U32 offCode; ++ U32 offBase; + + /* Modify the sequence depending on where endPosInSequence lies */ + if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { +@@ -5702,7 +6204,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + /* Move to the next sequence */ + endPosInSequence -= currSeq.litLength + currSeq.matchLength; + startPosInSequence = 0; +- idx++; + } else { + /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence + does not reach the end of the match. So, we have to split the sequence */ +@@ -5742,21 +6243,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } + /* Check if this offset can be represented with a repcode */ + { U32 const ll0 = (litLength == 0); +- offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer), + "Sequence validation failed"); + } +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. 
Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; ++ if (!finalMatchSplit) ++ idx++; /* Next Sequence */ + } + DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); +@@ -5779,7 +6282,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + + typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize); ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); + static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + { + ZSTD_sequenceCopier sequenceCopier = NULL; +@@ -5793,6 +6296,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + return sequenceCopier; + } + ++/* Discover the size of next block by searching for the delimiter. ++ * Note that a block delimiter **must** exist in this mode, ++ * otherwise it's an input error. ++ * The block size retrieved will be later compared to ensure it remains within bounds */ ++static size_t ++blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ int end = 0; ++ size_t blockSize = 0; ++ size_t spos = seqPos.idx; ++ DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); ++ assert(spos <= inSeqsSize); ++ while (spos < inSeqsSize) { ++ end = (inSeqs[spos].offset == 0); ++ blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; ++ if (end) { ++ if (inSeqs[spos].matchLength != 0) ++ RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); ++ break; ++ } ++ spos++; ++ } ++ if (!end) ++ RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); ++ return blockSize; ++} ++ ++/* More a "target" block size */ ++static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining) ++{ ++ int const lastBlock = (remaining <= blockSize); ++ return lastBlock ? remaining : blockSize; ++} ++ ++static size_t determine_blockSize(ZSTD_sequenceFormat_e mode, ++ size_t blockSize, size_t remaining, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); ++ if (mode == ZSTD_sf_noBlockDelimiters) ++ return blockSize_noDelimiter(blockSize, remaining); ++ { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); ++ FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); ++ if (explicitBlockSize > blockSize) ++ RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); ++ if (explicitBlockSize > remaining) ++ RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); ++ return explicitBlockSize; ++ } ++} ++ + /* Compress, block-by-block, all of the sequences given. 
+ * + * Returns the cumulative size of all compressed blocks (including their headers), +@@ -5805,9 +6359,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + const void* src, size_t srcSize) + { + size_t cSize = 0; +- U32 lastBlock; +- size_t blockSize; +- size_t compressedSeqsSize; + size_t remaining = srcSize; + ZSTD_sequencePosition seqPos = {0, 0, 0}; + +@@ -5827,22 +6378,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + } + + while (remaining) { ++ size_t compressedSeqsSize; + size_t cBlockSize; + size_t additionalByteAdjustment; +- lastBlock = remaining <= cctx->blockSize; +- blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize; ++ size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, ++ cctx->blockSize, remaining, ++ inSeqs, inSeqsSize, seqPos); ++ U32 const lastBlock = (blockSize == remaining); ++ FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); ++ assert(blockSize <= remaining); + ZSTD_resetSeqStore(&cctx->seqStore); +- DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize); ++ DEBUGLOG(5, "Working on new block. Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize); + +- additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); ++ additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes); + FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); + blockSize -= additionalByteAdjustment; + + /* If blocks are too small, emit as a nocompress block */ +- if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); ++ DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); + cSize += cBlockSize; + ip += blockSize; + op += cBlockSize; +@@ -5851,6 +6409,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + continue; + } + ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, +@@ -5859,11 +6418,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); +- DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + + if (!cctx->isFirstBlock && + ZSTD_maybeRLE(&cctx->seqStore) && +- ZSTD_isRLE((BYTE const*)src, srcSize)) { ++ ZSTD_isRLE(ip, blockSize)) { + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." 
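For orientation, here is a user-space sketch of how the explicit-block-delimiter path above is driven from the caller's side. It targets upstream libzstd's experimental API (ZSTD_STATIC_LINKING_ONLY; the ZSTD_c_blockDelimiters and ZSTD_c_validateSequences parameter names are taken from upstream zstd.h) — the in-kernel copy does not export these entry points, so treat this purely as illustration. A block delimiter is a sequence with offset == 0 and matchLength == 0, whose litLength carries the block's trailing literals:

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    const char src[] = "abcabcabc";   /* 9 bytes: 3 literals, then a 6-byte match at distance 3 */
    ZSTD_Sequence seqs[2];
    char dst[128];
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();

    memset(seqs, 0, sizeof(seqs));
    seqs[0].litLength = 3;  seqs[0].offset = 3;  seqs[0].matchLength = 6;
    seqs[1].litLength = 0;  seqs[1].offset = 0;  seqs[1].matchLength = 0;  /* delimiter, no trailing literals */

    ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters, ZSTD_sf_explicitBlockDelimiters);
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_validateSequences, 1);

    {   size_t const cSize = ZSTD_compressSequences(cctx, dst, sizeof(dst),
                                                    seqs, 2, src, sizeof(src) - 1);
        if (ZSTD_isError(cSize))
            printf("error: %s\n", ZSTD_getErrorName(cSize));
        else
            printf("compressed %u -> %zu bytes\n", (unsigned)(sizeof(src) - 1), cSize);
    }
    ZSTD_freeCCtx(cctx);
    return 0;
}

blockSize_explicitDelimiter() above would sum 3 + 6 + 0 = 9 bytes for this block, which must match the source exactly or the externalSequences_invalid error paths fire.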
+ * This is only an issue for zstd <= v1.4.3 +@@ -5874,12 +6433,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + if (compressedSeqsSize == 0) { + /* ZSTD_noCompressBlock writes the block header as well */ + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); + } else if (compressedSeqsSize == 1) { + cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); + } else { + U32 cBlockHeader; + /* Error checking and repcodes update */ +@@ -5891,11 +6450,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; +- DEBUGLOG(4, "cSize running total: %zu", cSize); + + if (lastBlock) { + break; +@@ -5906,12 +6464,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + } ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + } + ++ DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; + } + +-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, ++size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) + { +@@ -5921,7 +6482,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + size_t frameHeaderSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ +- DEBUGLOG(3, "ZSTD_compressSequences()"); ++ DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity); + assert(cctx != NULL); + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); + /* Begin writing output, starting with frame header */ +@@ -5949,26 +6510,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + cSize += 4; + } + +- DEBUGLOG(3, "Final compressed size: %zu", cSize); ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; + } + + /*====== Finalize ======*/ + ++static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) ++{ ++ const ZSTD_inBuffer nullInput = { NULL, 0, 0 }; ++ const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ return stableInput ? zcs->expectedInBuffer : nullInput; ++} ++ + /*! 
ZSTD_flushStream() : + * @return : amount of data remaining to flush */ + size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); ++ input.size = input.pos; /* do not ingest more input during flush */ + return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); + } + + + size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); +- FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed"); ++ FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); + if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ + /* single thread mode : attempt to calculate remaining to flush more precisely */ + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; +@@ -6090,7 +6659,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, + cp.targetLength = (unsigned)(-clampedCompressionLevel); + } + /* refine parameters based on srcSize & dictSize */ +- return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); + } + } + +@@ -6125,3 +6694,21 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); + } ++ ++void ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* zc, void* mState, ++ ZSTD_sequenceProducer_F* mFinder ++) { ++ if (mFinder != NULL) { ++ ZSTD_externalMatchCtx emctx; ++ emctx.mState = mState; ++ emctx.mFinder = mFinder; ++ emctx.seqBuffer = NULL; ++ emctx.seqBufferCapacity = 0; ++ zc->externalMatchCtx = emctx; ++ zc->requestedParams.useSequenceProducer = 1; ++ } else { ++ ZSTD_memset(&zc->externalMatchCtx, 0, sizeof(zc->externalMatchCtx)); ++ zc->requestedParams.useSequenceProducer = 0; ++ } ++} +diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h +index 71697a11ae30..0198c8f5cac0 100644 +--- a/lib/zstd/compress/zstd_compress_internal.h ++++ b/lib/zstd/compress/zstd_compress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,6 +21,7 @@ + ***************************************/ + #include "../common/zstd_internal.h" + #include "zstd_cwksp.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ + + + /*-************************************* +@@ -111,12 +113,13 @@ typedef struct { + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
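ZSTD_registerSequenceProducer() above is the registration hook for upstream zstd's block-level external sequence producer. A sketch of attaching a producer follows; the callback parameter list, ZSTD_SEQUENCE_PRODUCER_ERROR, and ZSTD_c_enableSeqProducerFallback are the names upstream zstd.h documents for this experimental feature and should be treated as assumptions here (the kernel build does not expose them):

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>
#include <stddef.h>

/* Trivial producer: emits no sequences and reports failure, so compression
 * falls back to the internal matchfinders when fallback is enabled.
 * Parameter list assumed from upstream's ZSTD_sequenceProducer_F typedef. */
static size_t noopProducer(void* state,
                           ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
                           const void* src, size_t srcSize,
                           const void* dict, size_t dictSize,
                           int compressionLevel, size_t windowSize)
{
    (void)state; (void)outSeqs; (void)outSeqsCapacity;
    (void)src; (void)srcSize; (void)dict; (void)dictSize;
    (void)compressionLevel; (void)windowSize;
    return ZSTD_SEQUENCE_PRODUCER_ERROR;   /* upstream's "no sequences produced" sentinel */
}

void attachProducer(ZSTD_CCtx* cctx)
{
    /* NULL producer state; passing a NULL function pointer instead clears the
     * registration, matching the else branch of the implementation above. */
    ZSTD_registerSequenceProducer(cctx, NULL, noopProducer);
    /* assumed upstream parameter name for the fallback switch */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);
}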
+ * @return : 0 on success or error code */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize); ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize); + + /* ******************************* + * Compression internals structs * +@@ -142,6 +145,12 @@ typedef struct { + size_t capacity; /* The capacity starting from `seq` pointer */ + } rawSeqStore_t; + ++typedef struct { ++ U32 idx; /* Index in array of ZSTD_Sequence */ ++ U32 posInSequence; /* Position within sequence at idx */ ++ size_t posInSrc; /* Number of bytes given by sequences provided so far */ ++} ZSTD_sequencePosition; ++ + UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; + + typedef struct { +@@ -228,6 +237,11 @@ struct ZSTD_matchState_t { + const ZSTD_matchState_t* dictMatchState; + ZSTD_compressionParameters cParams; + const rawSeqStore_t* ldmSeqStore; ++ ++ /* Controls prefetching in some dictMatchState matchfinders. ++ * This behavior is controlled from the cctx ms. ++ * This parameter has no effect in the cdict ms. */ ++ int prefetchCDictTables; + }; + + typedef struct { +@@ -324,6 +338,24 @@ struct ZSTD_CCtx_params_s { + + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ + ZSTD_customMem customMem; ++ ++ /* Controls prefetching in some dictMatchState matchfinders */ ++ ZSTD_paramSwitch_e prefetchCDictTables; ++ ++ /* Controls whether zstd will fall back to an internal matchfinder ++ * if the external matchfinder returns an error code. */ ++ int enableMatchFinderFallback; ++ ++ /* Indicates whether an external matchfinder has been referenced. ++ * Users can't set this externally. ++ * It is set internally in ZSTD_registerSequenceProducer(). */ ++ int useSequenceProducer; ++ ++ /* Adjust the max block size*/ ++ size_t maxBlockSize; ++ ++ /* Controls repcode search in external sequence parsing */ ++ ZSTD_paramSwitch_e searchForExternalRepcodes; + }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ + + #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) +@@ -355,6 +387,14 @@ typedef struct { + ZSTD_entropyCTablesMetadata_t entropyMetadata; + } ZSTD_blockSplitCtx; + ++/* Context for block-level external matchfinder API */ ++typedef struct { ++ void* mState; ++ ZSTD_sequenceProducer_F* mFinder; ++ ZSTD_Sequence* seqBuffer; ++ size_t seqBufferCapacity; ++} ZSTD_externalMatchCtx; ++ + struct ZSTD_CCtx_s { + ZSTD_compressionStage_e stage; + int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. 
*/ +@@ -404,6 +444,7 @@ struct ZSTD_CCtx_s { + + /* Stable in/out buffer verification */ + ZSTD_inBuffer expectedInBuffer; ++ size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */ + size_t expectedOutBufferSize; + + /* Dictionary */ +@@ -417,9 +458,13 @@ struct ZSTD_CCtx_s { + + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; ++ ++ /* Workspace for external matchfinder */ ++ ZSTD_externalMatchCtx externalMatchCtx; + }; + + typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; ++typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e; + + typedef enum { + ZSTD_noDict = 0, +@@ -441,7 +486,7 @@ typedef enum { + * In this mode we take both the source size and the dictionary size + * into account when selecting and adjusting the parameters. + */ +- ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. ++ ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. + * We don't know what these parameters are for. We default to the legacy + * behavior of taking both the source size and the dict size into account + * when selecting and adjusting parameters. +@@ -500,9 +545,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) + /* ZSTD_noCompressBlock() : + * Writes uncompressed block to dst buffer from given src. + * Returns the size of the block */ +-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) + { + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); ++ DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); + RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, + dstSize_tooSmall, "dst buf too small for uncompressed block"); + MEM_writeLE24(dst, cBlockHeader24); +@@ -510,7 +557,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi + return ZSTD_blockHeaderSize + srcSize; + } + +-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) + { + BYTE* const op = (BYTE*)dst; + U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); +@@ -529,7 +577,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) + { + U32 const minlog = (strat>=ZSTD_btultra) ? 
(U32)(strat) - 1 : 6; + ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); + return (srcSize >> minlog) + 2; + } + +@@ -565,29 +613,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con + while (ip < iend) *op++ = *ip++; + } + +-#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +-#define STORE_REPCODE_1 STORE_REPCODE(1) +-#define STORE_REPCODE_2 STORE_REPCODE(2) +-#define STORE_REPCODE_3 STORE_REPCODE(3) +-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) +-#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) +-#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) +-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) +-#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ +-#define STORED_TO_OFFBASE(o) ((o)+1) +-#define OFFBASE_TO_STORED(o) ((o)-1) ++ ++#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) ++#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) ++#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) ++#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ ++#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) ++#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) ++#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) ++#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) ++#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ + + /*! ZSTD_storeSeq() : +- * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. +- * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t. ++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). + * @matchLength : must be >= MINMATCH +- * Allowed to overread literals up to litLimit. ++ * Allowed to over-read literals up to litLimit. + */ + HINT_INLINE UNUSED_ATTR void + ZSTD_storeSeq(seqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, +- U32 offBase_minus1, ++ U32 offBase, + size_t matchLength) + { + BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; +@@ -596,8 +642,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); +- DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", +- pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); ++ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", ++ pos, (U32)litLength, (U32)matchLength, (U32)offBase); + } + #endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); +@@ -607,9 +653,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + assert(literals + litLength <= litLimit); + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. +- * First copy 16 bytes, because literals are likely short. +- */ +- assert(WILDCOPY_OVERLENGTH >= 16); ++ * First copy 16 bytes, because literals are likely short. 
++ */ ++ ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) { + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); +@@ -628,7 +674,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + seqStorePtr->sequences[0].litLength = (U16)litLength; + + /* match offset */ +- seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); ++ seqStorePtr->sequences[0].offBase = offBase; + + /* match Length */ + assert(matchLength >= MINMATCH); +@@ -646,17 +692,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + + /* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) +- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() ++ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() + */ + MEM_STATIC void +-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ ++ if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; +- rep[0] = STORED_OFFSET(offBase_minus1); ++ rep[0] = OFFBASE_TO_OFFSET(offBase); + } else { /* repcode */ +- U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; ++ U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + rep[2] = (repCode >= 2) ? rep[1] : rep[2]; +@@ -673,11 +719,11 @@ typedef struct repcodes_s { + } repcodes_t; + + MEM_STATIC repcodes_t +-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { + repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); +- ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); ++ ZSTD_updateRep(newReps.rep, offBase, ll0); + return newReps; + } + +@@ -685,59 +731,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 + /*-************************************* + * Match length counter + ***************************************/ +-static unsigned ZSTD_NbCommonBytes (size_t val) +-{ +- if (MEM_isLittleEndian()) { +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_ctzll((U64)val) >> 3); +-# else +- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, +- 0, 3, 1, 3, 1, 4, 2, 7, +- 0, 2, 3, 6, 1, 5, 3, 5, +- 1, 3, 4, 4, 2, 5, 6, 7, +- 7, 0, 1, 2, 3, 3, 4, 6, +- 2, 6, 5, 5, 3, 4, 5, 6, +- 7, 1, 2, 4, 6, 4, 4, 5, +- 7, 2, 6, 5, 7, 6, 7, 7 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_ctz((U32)val) >> 3); +-# else +- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, +- 3, 2, 2, 1, 3, 2, 0, 1, +- 3, 3, 1, 2, 2, 2, 2, 0, +- 3, 1, 2, 0, 1, 0, 1, 1 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +- } else { /* Big Endian CPU */ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_clzll(val) >> 3); +-# else +- unsigned r; +- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ +- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } +- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } +- r += (!val); +- return r; +-# endif +- } else 
{ /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_clz((U32)val) >> 3); +-# else +- unsigned r; +- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } +- r += (!val); +- return r; +-# endif +- } } +-} +- +- + MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) + { + const BYTE* const pStart = pIn; +@@ -783,32 +776,36 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, + * Hashes + ***************************************/ + static const U32 prime3bytes = 506832829U; +-static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } ++static U32 ZSTD_hash3(U32 u, U32 h) { assert(h <= 32); return ((u << (32-24)) * prime3bytes) >> (32-h) ; } + MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ + + static const U32 prime4bytes = 2654435761U; +-static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } +-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } ++static U32 ZSTD_hash4(U32 u, U32 h) { assert(h <= 32); return (u * prime4bytes) >> (32-h) ; } ++static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h); } + + static const U64 prime5bytes = 889523592379ULL; +-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } ++static size_t ZSTD_hash5(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } + static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } + + static const U64 prime6bytes = 227718039650203ULL; +-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } ++static size_t ZSTD_hash6(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } + static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } + + static const U64 prime7bytes = 58295818150454627ULL; +-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } ++static size_t ZSTD_hash7(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } + static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } + + static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } ++static size_t ZSTD_hash8(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u) * prime8bytes) >> (64-h)) ; } + static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } + + MEM_STATIC FORCE_INLINE_ATTR + size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ + switch(mls) + { + default: +@@ -1167,10 +1164,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + assert(blockEndIdx >= loadedDictEnd); + +- if (blockEndIdx > loadedDictEnd + maxDist) { ++ if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) { + /* On reaching window size, dictionaries are invalidated. 
+ * For simplification, if window size is reached anywhere within next block, + * the dictionary is invalidated for the full block. ++ * ++ * We also have to invalidate the dictionary if ZSTD_window_update() has detected ++ * non-contiguous segments, which means that loadedDictEnd != window->dictLimit. ++ * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use ++ * dictMatchState, so setting it to NULL is not a problem. + */ + DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); + *loadedDictEndPtr = 0; +@@ -1302,6 +1304,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) + + #endif + ++/* Short Cache */ ++ ++/* Normally, zstd matchfinders follow this flow: ++ * 1. Compute hash at ip ++ * 2. Load index from hashTable[hash] ++ * 3. Check if *ip == *(base + index) ++ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. ++ * ++ * Short cache is an optimization which allows us to avoid step 3 most of the time ++ * when the data doesn't actually match. With short cache, the flow becomes: ++ * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. ++ * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. ++ * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. ++ * ++ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to ++ * dictMatchState matchfinders. ++ */ ++#define ZSTD_SHORT_CACHE_TAG_BITS 8 ++#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) ++ ++/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. ++ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */ ++MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { ++ size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); ++ assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); ++ hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; ++} ++ ++/* Helper function for short cache matchfinders. ++ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */ ++MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { ++ U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; ++ U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; ++ return tag1 == tag2; ++} + + + /* =============================================================== +@@ -1396,4 +1434,31 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); + */ + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); + ++/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of ++ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. ++ * Note that the block delimiter must include the last literals of the block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++/* Returns the number of bytes to move the current read position back by. ++ * Only non-zero if we ended up splitting a sequence. ++ * Otherwise, it may return a ZSTD error if something went wrong. 
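The short-cache layout described above can be pictured with a standalone snippet: the low 8 bits of each hash-table entry hold a small tag of the position, the remaining bits hold the match index, and a candidate is only dereferenced when the tags agree. Names below are illustrative; the real helpers are ZSTD_writeTaggedIndex() and ZSTD_comparePackedTags():

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TAG_BITS 8                       /* mirrors ZSTD_SHORT_CACHE_TAG_BITS */
#define TAG_MASK ((1u << TAG_BITS) - 1)

int main(void)
{
    uint32_t const index = 123456;       /* match index; must fit in 32 - TAG_BITS bits */
    uint32_t const tag   = 0xA7;         /* independent 8-bit hash of the same position */
    uint32_t const entry = (index << TAG_BITS) | tag;    /* what the hash table stores */

    assert((entry >> TAG_BITS) == index);            /* index recovered */
    assert((entry & TAG_MASK) == tag);               /* tag recovered */
    /* On lookup, *(base + index) is only loaded when the stored tag equals the
     * current position's tag, skipping most cache-missing probes. */
    printf("entry=0x%08x index=%u tag=0x%02x\n",
           (unsigned)entry, (unsigned)(entry >> TAG_BITS), (unsigned)(entry & TAG_MASK));
    return 0;
}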
++ * ++ * This function will attempt to scan through blockSize bytes ++ * represented by the sequences in @inSeqs, ++ * storing any (partial) sequences. ++ * ++ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to ++ * avoid splitting a match, or to avoid splitting a match such that it would produce a match ++ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ + #endif /* ZSTD_COMPRESS_H */ +diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c +index 52b0a8059aba..3e9ea46a670a 100644 +--- a/lib/zstd/compress/zstd_compress_literals.c ++++ b/lib/zstd/compress/zstd_compress_literals.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,11 +14,36 @@ + ***************************************/ + #include "zstd_compress_literals.h" + ++ ++/* ************************************************************** ++* Debug Traces ++****************************************************************/ ++#if DEBUGLEVEL >= 2 ++ ++static size_t showHexa(const void* src, size_t srcSize) ++{ ++ const BYTE* const ip = (const BYTE*)src; ++ size_t u; ++ for (u=0; u31) + (srcSize>4095); + ++ DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity); ++ + RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); + + switch(flSize) +@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, + } + + ZSTD_memcpy(ostart + flSize, src, srcSize); +- DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); ++ DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); + return srcSize + flSize; + } + ++static int allBytesIdentical(const void* src, size_t srcSize) ++{ ++ assert(srcSize >= 1); ++ assert(src != NULL); ++ { const BYTE b = ((const BYTE*)src)[0]; ++ size_t p; ++ for (p=1; p31) + (srcSize>4095); + +- (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ ++ assert(dstCapacity >= 4); (void)dstCapacity; ++ assert(allBytesIdentical(src, srcSize)); + + switch(flSize) + { +@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* + } + + ostart[flSize] = *(const BYTE*)src; +- DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1); ++ DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1); + return flSize+1; + } + +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible) ++/* ZSTD_minLiteralsToCompress() : ++ * returns minimal amount of literals ++ * for literal compression to even be attempted. 
++ * Minimum is made tighter as compression strategy increases. ++ */ ++static size_t ++ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) ++{ ++ assert((int)strategy >= 0); ++ assert((int)strategy <= 9); ++ /* btultra2 : min 8 bytes; ++ * then 2x larger for each successive compression strategy ++ * max threshold 64 bytes */ ++ { int const shift = MIN(9-(int)strategy, 3); ++ size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift; ++ DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); ++ return mintc; ++ } ++} ++ ++size_t ZSTD_compressLiterals ( ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ void* entropyWorkspace, size_t entropyWorkspaceSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, ++ int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2) + { +- size_t const minGain = ZSTD_minGain(srcSize, strategy); + size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); + BYTE* const ostart = (BYTE*)dst; + U32 singleStream = srcSize < 256; + symbolEncodingType_e hType = set_compressed; + size_t cLitSize; + +- DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", +- disableLiteralCompression, (U32)srcSize); ++ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)", ++ disableLiteralCompression, (U32)srcSize, dstCapacity); ++ ++ DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize)); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + if (disableLiteralCompression) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + +- /* small ? don't even attempt compression (speed opt) */ +-# define COMPRESS_LITERALS_SIZE_MIN 63 +- { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; +- if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ /* if too small, don't even attempt compression (speed opt) */ ++ if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode)) ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + + RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); + { HUF_repeat repeat = prevHuf->repeatMode; +- int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; ++ int const flags = 0 ++ | (bmi2 ? HUF_flags_bmi2 : 0) ++ | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) ++ | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) ++ | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0); ++ ++ typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int); ++ huf_compress_f huf_compress; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; +- cLitSize = singleStream ? 
+- HUF_compress1X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : +- HUF_compress4X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); ++ huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat; ++ cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, ++ src, srcSize, ++ HUF_SYMBOLVALUE_MAX, LitHufLog, ++ entropyWorkspace, entropyWorkspaceSize, ++ (HUF_CElt*)nextHuf->CTable, ++ &repeat, flags); ++ DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ +- DEBUGLOG(5, "Reusing previous huffman table"); ++ DEBUGLOG(5, "reusing statistics from previous huffman block"); + hType = set_repeat; + } + } + +- if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ { size_t const minGain = ZSTD_minGain(srcSize, strategy); ++ if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); ++ } } + if (cLitSize==1) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); +- } ++ /* A return value of 1 signals that the alphabet consists of a single symbol. ++ * However, in some rare circumstances, it could be the compressed size (a single byte). ++ * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. ++ * (it's also necessary to not generate statistics). ++ * Therefore, in such a case, actively check that all bytes are identical. 
*/ ++ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); ++ } } + + if (hType == set_compressed) { + /* using a newly constructed table */ +@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); ++ if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); +diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h +index 9775fb97cb70..a2a85d6b69e5 100644 +--- a/lib/zstd/compress/zstd_compress_literals.h ++++ b/lib/zstd/compress/zstd_compress_literals.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,16 +17,24 @@ + + size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + ++/* ZSTD_compressRleLiteralsBlock() : ++ * Conditions : ++ * - All bytes in @src are identical ++ * - dstCapacity >= 4 */ + size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, ++/* ZSTD_compressLiterals(): ++ * @entropyWorkspace: must be aligned on 4-bytes boundaries ++ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE ++ * @suspectUncompressible: sampling checks, to potentially skip huffman coding ++ */ ++size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible); ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2); + + #endif /* ZSTD_COMPRESS_LITERALS_H */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c +index 21ddc1b37acf..5c028c78d889 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.c ++++ b/lib/zstd/compress/zstd_compress_sequences.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq) + { + /* Heuristic: This should cover most blocks <= 16K and + * start to fade out after 16K to about 32K depending on +- * comprssibility. ++ * compressibility. + */ + return nbSeq >= 2048; + } +@@ -166,7 +167,7 @@ ZSTD_selectEncodingType( + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { +- /* Prefer set_basic over set_rle when there are 2 or less symbols, ++ /* Prefer set_basic over set_rle when there are 2 or fewer symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h +index 7991364c2f71..7fe6f4ff5cf2 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.h ++++ b/lib/zstd/compress/zstd_compress_sequences.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c +index 17d836cc84e8..dbacbaf72733 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.c ++++ b/lib/zstd/compress/zstd_compress_superblock.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -36,13 +37,14 @@ + * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block + * and the following sub-blocks' literals sections will be Treeless_Literals_Block. + * @return : compressed size of literals section of a sub-block +- * Or 0 if it unable to compress. ++ * Or 0 if unable to compress. + * Or error code */ +-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- const BYTE* literals, size_t litSize, +- void* dst, size_t dstSize, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const BYTE* literals, size_t litSize, ++ void* dst, size_t dstSize, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + size_t const header = writeEntropy ? 200 : 0; + size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); +@@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; + size_t cLitSize = 0; + +- (void)bmi2; /* TODO bmi2... */ +- + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; +@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + +- /* TODO bmi2 */ +- { const size_t cSize = singleStream ? 
HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) +- : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); ++ { int const flags = bmi2 ? HUF_flags_bmi2 : 0; ++ const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable, flags) ++ : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable, flags); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { +@@ -126,7 +126,11 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + return op-ostart; + } + +-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { ++static size_t ++ZSTD_seqDecompressedSize(seqStore_t const* seqStore, ++ const seqDef* sequences, size_t nbSeq, ++ size_t litSize, int lastSequence) ++{ + const seqDef* const sstart = sequences; + const seqDef* const send = sequences + nbSeq; + const seqDef* sp = sstart; +@@ -156,13 +160,14 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* + * @return : compressed size of sequences section of a sub-block + * Or 0 if it is unable to compress + * Or error code. */ +-static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- const seqDef* sequences, size_t nbSeq, +- const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ const seqDef* sequences, size_t nbSeq, ++ const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + BYTE* const ostart = (BYTE*)dst; +@@ -539,7 +544,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + repcodes_t rep; + ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { +- ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); ++ ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } +diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h +index 224ece79546e..826bbc9e029b 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.h ++++ b/lib/zstd/compress/zstd_compress_superblock.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h +index 349fc923c355..ef5e65cfcf9a 100644 +--- a/lib/zstd/compress/zstd_cwksp.h ++++ b/lib/zstd/compress/zstd_cwksp.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -451,7 +452,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { +- ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); ++ ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd)); + } + ZSTD_cwksp_mark_tables_clean(ws); + } +diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c +index 76933dea2624..ab9440a99603 100644 +--- a/lib/zstd/compress/zstd_double_fast.c ++++ b/lib/zstd/compress/zstd_double_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,43 @@ + #include "zstd_compress_internal.h" + #include "zstd_double_fast.h" + ++static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashLarge = ms->hashTable; ++ U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ U32* const hashSmall = ms->chainTable; ++ U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++ /* Always insert every fastHashFillStep position into the hash tables. ++ * Insert the other positions into the large hash table if their entry ++ * is empty. 
++ */ ++ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ U32 i; ++ for (i = 0; i < fastHashFillStep; ++i) { ++ size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); ++ size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); ++ if (i == 0) { ++ ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); ++ } ++ if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { ++ ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); ++ } ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ if (dtlm == ZSTD_dtlm_fast) ++ break; ++ } } ++} ++ ++static void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -43,7 +79,19 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; +- } } ++ } } ++} ++ ++void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); ++ } + } + + +@@ -67,7 +115,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t mLength; + U32 offset; +@@ -100,8 +148,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; +- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + /* Outer Loop: one iteration per match found and stored */ +@@ -131,7 +179,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -175,9 +223,13 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + } while (ip1 <= ilimit); + + _cleanup: ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -217,7 +269,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + hashLong[hl1] = (U32)(ip1 - base); + } + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -243,7 +295,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) */ +@@ -275,7 +327,6 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; +@@ -286,8 +337,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); +- const U32 dictHBitsL = dictCParams->hashLog; +- const U32 dictHBitsS = dictCParams->chainLog; ++ const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); +@@ -295,6 +346,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + /* if a dictionary is attached, it must be within window range */ + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashLong, hashTableBytes) ++ PREFETCH_AREA(dictHashSmall, chainTableBytes) ++ } ++ + /* init */ + ip += (dictAndPrefixLength == 0); + +@@ -309,8 +367,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); +- size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); +- size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); ++ size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); ++ int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); + U32 const curr = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; 
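/*
 * Editor's note (not part of the patch): a minimal standalone sketch of the
 * "short cache" tagged-index scheme that the dictMatchState matchfinders above
 * rely on. Each hash-table slot packs a position index with an 8-bit tag taken
 * from the low bits of the hash; a lookup only dereferences base+index when the
 * stored tag matches the probe tag, which filters out most cold dictionary
 * reads. It mirrors ZSTD_writeTaggedIndex / ZSTD_comparePackedTags from the
 * hunk earlier in this patch, but the helper names, table size, and demo
 * values below are illustrative assumptions, not code from zstd.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TAG_BITS 8
#define TAG_MASK ((1u << TAG_BITS) - 1)

/* Pack (index, tag) into one 32-bit slot, in the style of ZSTD_writeTaggedIndex. */
static void write_tagged_index(uint32_t *table, size_t hashAndTag, uint32_t index)
{
    size_t const hash = hashAndTag >> TAG_BITS;             /* table position */
    uint32_t const tag = (uint32_t)(hashAndTag & TAG_MASK); /* 8-bit fingerprint */
    assert(index >> (32 - TAG_BITS) == 0);                  /* index must fit in 24 bits */
    table[hash] = (index << TAG_BITS) | tag;
}

/* Return 1 only when the two packed values carry the same low-bit tag. */
static int tags_match(size_t packed1, size_t packed2)
{
    return (packed1 & TAG_MASK) == (packed2 & TAG_MASK);
}

int main(void)
{
    uint32_t table[16] = {0};

    /* Hypothetical hash values: high bits select the slot, low 8 bits are the tag. */
    size_t const storeHashAndTag = (5u << TAG_BITS) | 0xAB; /* slot 5, tag 0xAB */
    write_tagged_index(table, storeHashAndTag, 1234);       /* remember position 1234 */

    /* Probe with a matching tag: only now would a matchfinder touch base+index. */
    size_t const probeSame = (5u << TAG_BITS) | 0xAB;
    if (tags_match(table[probeSame >> TAG_BITS], probeSame)) {
        uint32_t const index = table[probeSame >> TAG_BITS] >> TAG_BITS;
        printf("tag hit, stored index = %u\n", index);      /* prints 1234 */
    }

    /* Probe with a different tag: rejected without reading the match bytes at all. */
    size_t const probeOther = (5u << TAG_BITS) | 0x17;
    if (!tags_match(table[probeOther >> TAG_BITS], probeOther))
        printf("tag miss, skip the (likely cold) memory compare\n");

    return 0;
}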
+@@ -328,7 +390,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -340,9 +402,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL) { + /* check dictMatchState long match */ +- U32 const dictMatchIndexL = dictHashLong[dictHL]; ++ U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + +@@ -358,9 +420,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } +- } else { ++ } else if (dictTagsMatchS) { + /* check dictMatchState short match */ +- U32 const dictMatchIndexS = dictHashSmall[dictHS]; ++ U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + +@@ -375,10 +437,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + continue; + + _search_next_long: +- + { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +- size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); ++ size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; ++ U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = curr + 1; + +@@ -391,9 +454,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL3) { + /* check dict long +1 match */ +- U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; ++ U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { +@@ -419,7 +482,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + offset_2 = offset_1; + offset_1 = offset; + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -448,7 +511,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -461,8 +524,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + } /* while (ip < ilimit) */ + + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -585,7 +648,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; +@@ -596,7 +659,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +@@ -621,7 +684,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + } + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; +@@ -653,7 +716,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h +index 6822bde65a1d..0204f12e4cf7 100644 +--- a/lib/zstd/compress/zstd_double_fast.h ++++ b/lib/zstd/compress/zstd_double_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,8 @@ + #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ + + void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_doubleFast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c +index a752e6beab52..3399b39c5dbc 100644 +--- a/lib/zstd/compress/zstd_fast.c ++++ b/lib/zstd/compress/zstd_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,42 @@ + #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ + #include "zstd_fast.h" + ++static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashTable = ms->hashTable; ++ U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_full); ++ ++ /* Always insert every fastHashFillStep position into the hash table. ++ * Insert the other positions if their hash entry is empty. ++ */ ++ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } ++ ++ if (dtlm == ZSTD_dtlm_fast) continue; ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ { U32 p; ++ for (p = 1; p < fastHashFillStep; ++p) { ++ size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); ++ if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); ++ } } } } ++} ++ ++static void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) + { +@@ -25,6 +60,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + ++ /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_fast); ++ + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. 
+ */ +@@ -42,6 +81,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + } } } } + } + ++void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillHashTableForCCtx(ms, end, dtlm); ++ } ++} ++ + + /* + * If you squint hard enough (and ignore repcodes), the search operation at any +@@ -117,7 +168,7 @@ ZSTD_compressBlock_fast_noDict_generic( + + U32 rep_offset1 = rep[0]; + U32 rep_offset2 = rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ +@@ -141,8 +192,8 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const curr = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); + U32 const maxRep = curr - windowLow; +- if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; +- if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; ++ if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; ++ if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; + } + + /* start each op */ +@@ -180,8 +231,14 @@ ZSTD_compressBlock_fast_noDict_generic( + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; +- offcode = STORE_REPCODE_1; ++ offcode = REPCODE1_TO_OFFBASE; + mLength += 4; ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 is before the ++ * repcode (ip2). */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _match; + } + +@@ -195,6 +252,12 @@ ZSTD_compressBlock_fast_noDict_generic( + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 == ip0 + 1, so ++ * we know we will resume searching after ip1 */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _offset; + } + +@@ -224,6 +287,21 @@ ZSTD_compressBlock_fast_noDict_generic( + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* first write next hash table entry; we've already calculated it */ ++ if (step <= 4) { ++ /* We need to avoid writing an index into the hash table >= the ++ * position at which we will pick up our searching after we've ++ * taken this match. ++ * ++ * The minimum possible match has length 4, so the earliest ip0 ++ * can be after we take this match will be the current ip0 + 4. ++ * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely ++ * write this position. ++ */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ + goto _offset; + } + +@@ -254,9 +332,24 @@ ZSTD_compressBlock_fast_noDict_generic( + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + ++ /* When the repcodes are outside of the prefix, we set them to zero before the loop. ++ * When the offsets are still zero, we need to restore them after the block to have a correct ++ * repcode history. If only one offset was invalid, it is easy. The tricky case is when both ++ * offsets were invalid. We need to figure out which offset to refill with. ++ * - If both offsets are zero they are in the same order. ++ * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. 
++ * - If only one is zero, we need to decide which offset to restore. ++ * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. ++ * - It is impossible for rep_offset2 to be non-zero. ++ * ++ * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then ++ * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. ++ */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; +- rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; ++ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; ++ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -267,7 +360,7 @@ ZSTD_compressBlock_fast_noDict_generic( + match0 = base + idx; + rep_offset2 = rep_offset1; + rep_offset1 = (U32)(ip0-match0); +- offcode = STORE_OFFSET(rep_offset1); ++ offcode = OFFSET_TO_OFFBASE(rep_offset1); + mLength = 4; + + /* Count the backwards match length. */ +@@ -287,11 +380,6 @@ ZSTD_compressBlock_fast_noDict_generic( + ip0 += mLength; + anchor = ip0; + +- /* write next hash table entry */ +- if (ip1 < ip0) { +- hashTable[hash1] = (U32)(ip1 - base); +- } +- + /* Fill table and check for immediate repcode. */ + if (ip0 <= ilimit) { + /* Fill Table */ +@@ -306,7 +394,7 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; +- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + anchor = ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) */ + } } } +@@ -380,14 +468,14 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; ++ const BYTE* ip0 = istart; ++ const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ + const BYTE* anchor = istart; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; +@@ -397,13 +485,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); +- const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); +- const U32 dictHLog = dictCParams->hashLog; ++ const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); ++ const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + + /* if a dictionary is still attached, it necessarily means that + * it is within window size. So we just check it. 
*/ + const U32 maxDistance = 1U << cParams->windowLog; +- const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); ++ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + assert(endIndex - prefixStartIndex <= maxDistance); + (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ + +@@ -413,106 +501,155 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + * when translating a dict index into a local index */ + assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashTable, hashTableBytes) ++ } ++ + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); +- ip += (dictAndPrefixLength == 0); ++ ip0 += (dictAndPrefixLength == 0); + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + +- /* Main Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ ++ /* Outer search loop */ ++ assert(stepSize >= 1); ++ while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ + size_t mLength; +- size_t const h = ZSTD_hashPtr(ip, hlog, mls); +- U32 const curr = (U32)(ip-base); +- U32 const matchIndex = hashTable[h]; +- const BYTE* match = base + matchIndex; +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* repMatch = (repIndex < prefixStartIndex) ? +- dictBase + (repIndex - dictIndexDelta) : +- base + repIndex; +- hashTable[h] = curr; /* update hash table */ +- +- if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); +- } else if ( (matchIndex <= prefixStartIndex) ) { +- size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); +- U32 const dictMatchIndex = dictHashTable[dictHash]; +- const BYTE* dictMatch = dictBase + dictMatchIndex; +- if (dictMatchIndex <= dictStartIndex || +- MEM_read32(dictMatch) != MEM_read32(ip)) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a dict match */ +- U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); +- mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; +- while (((ip>anchor) & (dictMatch>dictStart)) +- && (ip[-1] == dictMatch[-1])) { +- ip--; dictMatch--; mLength++; ++ size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ ++ size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); ++ U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); ++ ++ U32 matchIndex = hashTable[hash0]; ++ U32 curr = (U32)(ip0 - base); ++ size_t step = stepSize; ++ const size_t kStepIncr = 1 << kSearchStrength; ++ const BYTE* nextStep = ip0 + kStepIncr; ++ ++ /* Inner search loop */ ++ while (1) { ++ const BYTE* match = base + matchIndex; ++ const U32 repIndex = curr + 1 - offset_1; ++ const BYTE* repMatch = (repIndex < prefixStartIndex) ? 
++ dictBase + (repIndex - dictIndexDelta) : ++ base + repIndex; ++ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); ++ hashTable[hash0] = curr; /* update hash table */ ++ ++ if (((U32) ((prefixStartIndex - 1) - repIndex) >= ++ 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ ++ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { ++ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; ++ mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; ++ ip0++; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); ++ break; ++ } ++ ++ if (dictTagsMatch) { ++ /* Found a possible dict match */ ++ const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* dictMatch = dictBase + dictMatchIndex; ++ if (dictMatchIndex > dictStartIndex && ++ MEM_read32(dictMatch) == MEM_read32(ip0)) { ++ /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ ++ if (matchIndex <= prefixStartIndex) { ++ U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); ++ mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; ++ while (((ip0 > anchor) & (dictMatch > dictStart)) ++ && (ip0[-1] == dictMatch[-1])) { ++ ip0--; ++ dictMatch--; ++ mLength++; ++ } /* catch up */ ++ offset_2 = offset_1; ++ offset_1 = offset; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; ++ } ++ } ++ } ++ ++ if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) { ++ /* found a regular match */ ++ U32 const offset = (U32) (ip0 - match); ++ mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; ++ while (((ip0 > anchor) & (match > prefixStart)) ++ && (ip0[-1] == match[-1])) { ++ ip0--; ++ match--; ++ mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; + } +- } else if (MEM_read32(match) != MEM_read32(ip)) { +- /* it's not a match, and we're not going to check the dictionary */ +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a regular match */ +- U32 const offset = (U32)(ip-match); +- mLength = ZSTD_count(ip+4, match+4, iend) + 4; +- while (((ip>anchor) & (match>prefixStart)) +- && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; +- offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- } ++ ++ /* Prepare for next iteration */ ++ dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); ++ matchIndex = hashTable[hash1]; ++ ++ if (ip1 >= nextStep) { ++ step++; ++ nextStep += kStepIncr; ++ } ++ ip0 = ip1; ++ ip1 = ip1 + step; ++ if (ip1 > ilimit) goto _cleanup; ++ ++ curr = (U32)(ip0 - base); ++ hash0 = hash1; ++ } /* end inner search loop */ + + /* match found */ +- ip += mLength; +- anchor = ip; ++ assert(mLength); ++ ip0 += mLength; ++ anchor = ip0; + +- if (ip <= ilimit) { ++ if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+curr+2 > istart); 
/* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); ++ while (ip0 <= ilimit) { ++ U32 const current2 = (U32)(ip0-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { ++ && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; ++ ip0 += repLength2; ++ anchor = ip0; + continue; + } + break; + } + } ++ ++ /* Prepare for next iteration */ ++ assert(ip0 == anchor); ++ ip1 = ip0 + stepSize; + } + ++_cleanup: + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -553,11 +690,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ +- U32 const stepSize = cParams->targetLength + !(cParams->targetLength); ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); +@@ -570,6 +706,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; ++ ++ const BYTE* ip0 = istart; ++ const BYTE* ip1; ++ const BYTE* ip2; ++ const BYTE* ip3; ++ U32 current0; ++ ++ ++ size_t hash0; /* hash for ip0 */ ++ size_t hash1; /* hash for ip1 */ ++ U32 idx; /* match idx for ip0 */ ++ const BYTE* idxBase; /* base pointer for idx */ ++ ++ U32 offcode; ++ const BYTE* match0; ++ size_t mLength; ++ const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */ ++ ++ size_t step; ++ const BYTE* nextStep; ++ const size_t kStepIncr = (1 << (kSearchStrength - 1)); + + (void)hasStep; /* not currently specialized on whether it's accelerated */ + +@@ -579,75 +737,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + if (prefixStartIndex == dictStartIndex) + return 
ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); + +- /* Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because (ip+1) */ +- const size_t h = ZSTD_hashPtr(ip, hlog, mls); +- const U32 matchIndex = hashTable[h]; +- const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; +- const BYTE* match = matchBase + matchIndex; +- const U32 curr = (U32)(ip-base); +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; +- const BYTE* const repMatch = repBase + repIndex; +- hashTable[h] = curr; /* update hash table */ +- DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); +- +- if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ +- & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); +- ip += rLength; +- anchor = ip; +- } else { +- if ( (matchIndex < dictStartIndex) || +- (MEM_read32(match) != MEM_read32(ip)) ) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; ++ { U32 const curr = (U32)(ip0 - base); ++ U32 const maxRep = curr - dictStartIndex; ++ if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; ++ } ++ ++ /* start each op */ ++_start: /* Requires: ip0 */ ++ ++ step = stepSize; ++ nextStep = ip0 + kStepIncr; ++ ++ /* calculate positions, ip0 - anchor == 0, so we skip step calc */ ++ ip1 = ip0 + 1; ++ ip2 = ip0 + step; ++ ip3 = ip2 + 1; ++ ++ if (ip3 >= ilimit) { ++ goto _cleanup; ++ } ++ ++ hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ ++ idx = hashTable[hash0]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ do { ++ { /* load repcode match for ip[2] */ ++ U32 const current2 = (U32)(ip2 - base); ++ U32 const repIndex = current2 - offset_1; ++ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; ++ U32 rval; ++ if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ ++ & (offset_1 > 0) ) { ++ rval = MEM_read32(repBase + repIndex); ++ } else { ++ rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ + } +- { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; +- const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart; +- U32 const offset = curr - matchIndex; +- size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; +- while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = offset; /* update offset history */ +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- ip += mLength; +- anchor = ip; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ /* check repcode at ip[2] */ ++ if (MEM_read32(ip2) == rval) { ++ ip0 = ip2; ++ match0 = repBase + repIndex; ++ matchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; ++ assert((match0 != prefixStart) & (match0 != dictStart)); ++ mLength = ip0[-1] == match0[-1]; ++ ip0 -= mLength; ++ match0 -= mLength; ++ offcode = REPCODE1_TO_OFFBASE; ++ mLength += 4; ++ goto _match; + } } + +- if (ip <= ilimit) { +- /* Fill Table */ +- hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); +- /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); +- U32 const repIndex2 = current2 - offset_2; +- const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { +- const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; +- { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; +- continue; +- } +- break; +- } } } ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip3; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip0 + step; ++ ip3 = ip1 + step; ++ ++ /* calculate step */ ++ if (ip2 >= nextStep) { ++ step++; ++ PREFETCH_L1(ip1 + 64); ++ PREFETCH_L1(ip1 + 128); ++ nextStep += kStepIncr; ++ } ++ } while (ip3 < ilimit); ++ ++_cleanup: ++ /* Note that there are probably still a couple positions we could search. ++ * However, it seems to be a meaningful performance hit to try to search ++ * them. So let's not. */ ++ ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ +- rep[0] = offset_1; +- rep[1] = offset_2; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); ++ ++_offset: /* Requires: ip0, idx, idxBase */ ++ ++ /* Compute the offset code. 
*/ ++ { U32 const offset = current0 - idx; ++ const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart; ++ matchEnd = idx < prefixStartIndex ? dictEnd : iend; ++ match0 = idxBase + idx; ++ offset_2 = offset_1; ++ offset_1 = offset; ++ offcode = OFFSET_TO_OFFBASE(offset); ++ mLength = 4; ++ ++ /* Count the backwards match length. */ ++ while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { ++ ip0--; ++ match0--; ++ mLength++; ++ } } ++ ++_match: /* Requires: ip0, match0, offcode, matchEnd */ ++ ++ /* Count the forward length. */ ++ assert(matchEnd != 0); ++ mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); ++ ++ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); ++ ++ ip0 += mLength; ++ anchor = ip0; ++ ++ /* write next hash table entry */ ++ if (ip1 < ip0) { ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ ++ /* Fill table and check for immediate repcode. */ ++ if (ip0 <= ilimit) { ++ /* Fill Table */ ++ assert(base+current0+2 > istart); /* check base overflow */ ++ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); ++ ++ while (ip0 <= ilimit) { ++ U32 const repIndex2 = (U32)(ip0-base) - offset_2; ++ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; ++ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */ ++ && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { ++ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); ++ ip0 += repLength2; ++ anchor = ip0; ++ continue; ++ } ++ break; ++ } } ++ ++ goto _start; + } + + ZSTD_GEN_FAST_FN(extDict, 4, 0) +@@ -660,6 +945,7 @@ size_t ZSTD_compressBlock_fast_extDict( + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; ++ assert(ms->dictMatchState == NULL); + switch(mls) + { + default: /* includes case 3 */ +diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h +index fddc2f532d21..e64d9e1b2d39 100644 +--- a/lib/zstd/compress/zstd_fast.h ++++ b/lib/zstd/compress/zstd_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,8 @@ + #include "zstd_compress_internal.h" + + void ZSTD_fillHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_fast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c +index 0298a01a7504..83727cd46f91 100644 +--- a/lib/zstd/compress/zstd_lazy.c ++++ b/lib/zstd/compress/zstd_lazy.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -10,6 +11,7 @@ + + #include "zstd_compress_internal.h" + #include "zstd_lazy.h" ++#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ + + + /*-************************************* +@@ -197,8 +199,8 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", +- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); ++ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ +@@ -218,7 +220,7 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + } +@@ -230,7 +232,7 @@ ZSTD_DUBT_findBetterDictMatch ( + static size_t + ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +- size_t* offsetPtr, ++ size_t* offBasePtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +@@ -327,8 +329,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; +- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) ++ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode == ZSTD_dictMatchState) { + 
nbCompares = 0; /* in addition to avoiding checking any +@@ -361,16 +363,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, +- offsetPtr, bestLength, nbCompares, ++ offBasePtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", +- curr, (U32)bestLength, (U32)*offsetPtr, mIndex); ++ curr, (U32)bestLength, (U32)*offBasePtr, mIndex); + } + return bestLength; + } +@@ -381,14 +383,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + FORCE_INLINE_TEMPLATE size_t + ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +- size_t* offsetPtr, ++ size_t* offBasePtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) + { + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); +- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); ++ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); + } + + /* ********************************* +@@ -561,7 +563,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; +@@ -598,7 +600,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -691,7 +693,8 @@ size_t ZSTD_HcFindBestMatch( + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -703,7 +706,7 @@ size_t ZSTD_HcFindBestMatch( + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -739,7 +742,7 @@ size_t ZSTD_HcFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + 
dmsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -757,7 +760,6 @@ size_t ZSTD_HcFindBestMatch( + ***********************************/ + /* Constants for row-based hash */ + #define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ + #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) + #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +@@ -769,29 +771,8 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { +- assert(val != 0); +-# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) +- if (sizeof(size_t) == 4) { +- U32 mostSignificantWord = (U32)(val >> 32); +- U32 leastSignificantWord = (U32)val; +- if (leastSignificantWord == 0) { +- return 32 + (U32)__builtin_ctz(mostSignificantWord); +- } else { +- return (U32)__builtin_ctz(leastSignificantWord); +- } +- } else { +- return (U32)__builtin_ctzll(val); +- } +-# else +- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count +- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer +- */ +- val = ~val & (val - 1ULL); /* Lowest set bit mask */ +- val = val - ((val >> 1) & 0x5555555555555555); +- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); +- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); +-# endif ++MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { ++ return ZSTD_countTrailingZeros64(val); + } + + /* ZSTD_rotateRight_*(): +@@ -971,7 +952,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); ++} ++ ++/* Returns the mask width of bits group of which will be set to 1. Given not all ++ * architectures have easy movemask instruction, this helps to iterate over ++ * groups of bits easier and faster. ++ */ ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ (void)rowEntries; ++#if defined(ZSTD_ARCH_ARM_NEON) ++ /* NEON path only works for little endian */ ++ if (!MEM_isLittleEndian()) { ++ return 1; ++ } ++ if (rowEntries == 16) { ++ return 4; ++ } ++ if (rowEntries == 32) { ++ return 2; ++ } ++ if (rowEntries == 64) { ++ return 1; ++ } ++#endif ++ return 1; + } + + #if defined(ZSTD_ARCH_X86_SSE2) +@@ -994,71 +1003,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U + } + #endif + +-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches +- * the hash at the nth position in a row of the tagTable. +- * Each row is a circular buffer beginning at the value of "head". 
So we must rotate the "matches" bitfield +- * to match up with the actual layout of the entries within the hashTable */ ++#if defined(ZSTD_ARCH_ARM_NEON) + FORCE_INLINE_TEMPLATE ZSTD_VecMask +-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) ++ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ if (rowEntries == 16) { ++ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. ++ * After that groups of 4 bits represent the equalMask. We lower ++ * all bits except the highest in these groups by doing AND with ++ * 0x88 = 0b10001000. ++ */ ++ const uint8x16_t chunk = vld1q_u8(src); ++ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); ++ const uint8x8_t res = vshrn_n_u16(equalMask, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; ++ } else if (rowEntries == 32) { ++ /* Same idea as with rowEntries == 16 but doing AND with ++ * 0x55 = 0b01010101. ++ */ ++ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); ++ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); ++ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); ++ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); ++ const uint8x8_t res = vsli_n_u8(t0, t1, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; ++ } else { /* rowEntries == 64 */ ++ const uint8x16x4_t chunk = vld4q_u8(src); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); ++ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); ++ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); ++ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); ++ ++ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); ++ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); ++ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); ++ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); ++ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped); ++ } ++} ++#endif ++ ++/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by ++ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" ++ * matches the hash at the nth position in a row of the tagTable. ++ * Each row is a circular buffer beginning at the value of "headGrouped". 
So we ++ * must rotate the "matches" bitfield to match up with the actual layout of the ++ * entries within the hashTable */ ++FORCE_INLINE_TEMPLATE ZSTD_VecMask ++ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) + { + const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); + + #if defined(ZSTD_ARCH_X86_SSE2) + +- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); ++ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); + + #else /* SW or NEON-LE */ + + # if defined(ZSTD_ARCH_ARM_NEON) + /* This NEON path only works for little endian - otherwise use SWAR below */ + if (MEM_isLittleEndian()) { +- if (rowEntries == 16) { +- const uint8x16_t chunk = vld1q_u8(src); +- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); +- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); +- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); +- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); +- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); +- const U16 hi = (U16)vgetq_lane_u8(t3, 8); +- const U16 lo = (U16)vgetq_lane_u8(t3, 0); +- return ZSTD_rotateRight_U16((hi << 8) | lo, head); +- } else if (rowEntries == 32) { +- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); +- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); +- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); +- const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag)); +- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); +- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); +- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); +- const uint8x8_t t0 = vreinterpret_u8_s8(pack0); +- const uint8x8_t t1 = vreinterpret_u8_s8(pack1); +- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); +- const uint8x8x2_t t3 = vuzp_u8(t2, t0); +- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); +- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); +- return ZSTD_rotateRight_U32(matches, head); +- } else { /* rowEntries == 64 */ +- const uint8x16x4_t chunk = vld4q_u8(src); +- const uint8x16_t dup = vdupq_n_u8(tag); +- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); +- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); +- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); +- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); +- +- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); +- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); +- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); +- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); +- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); +- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); +- return ZSTD_rotateRight_U64(matches, head); +- } ++ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); + } + # endif /* ZSTD_ARCH_ARM_NEON */ + /* SWAR */ +- { const size_t chunkSize = sizeof(size_t); ++ { const int chunkSize = sizeof(size_t); + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); + const size_t xFF = ~((size_t)0); + const size_t x01 = xFF / 0xFF; +@@ -1091,11 +1111,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + } + matches = ~matches; + 
if (rowEntries == 16) { +- return ZSTD_rotateRight_U16((U16)matches, head); ++ return ZSTD_rotateRight_U16((U16)matches, headGrouped); + } else if (rowEntries == 32) { +- return ZSTD_rotateRight_U32((U32)matches, head); ++ return ZSTD_rotateRight_U32((U32)matches, headGrouped); + } else { +- return ZSTD_rotateRight_U64((U64)matches, head); ++ return ZSTD_rotateRight_U64((U64)matches, headGrouped); + } + } + #endif +@@ -1143,6 +1163,7 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ ++ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; + +@@ -1185,15 +1206,15 @@ size_t ZSTD_RowFindBestMatch( + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); +- U32 const head = *tagRow & rowMask; ++ U32 const headGrouped = (*tagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); + + /* Cycle through the matches and prefetch */ + for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = row[matchPos]; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) +@@ -1224,7 +1245,8 @@ size_t ZSTD_RowFindBestMatch( + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -1236,7 +1258,7 @@ size_t ZSTD_RowFindBestMatch( + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -1254,14 +1276,14 @@ size_t ZSTD_RowFindBestMatch( + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + +- { U32 const head = *dmsTagRow & rowMask; ++ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); + + for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; + if (matchIndex < dmsLowestIndex) + break; +@@ -1285,7 
+1307,7 @@ size_t ZSTD_RowFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } +@@ -1491,7 +1513,8 @@ ZSTD_compressBlock_lazy_generic( + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + +- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; ++ U32 offset_1 = rep[0], offset_2 = rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const int isDMS = dictMode == ZSTD_dictMatchState; + const int isDDS = dictMode == ZSTD_dedicatedDictSearch; +@@ -1512,8 +1535,8 @@ ZSTD_compressBlock_lazy_generic( + U32 const curr = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); + U32 const maxRep = curr - windowLow; +- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + if (isDxS) { + /* dictMatchState repCode checks don't currently handle repCode == 0 +@@ -1537,7 +1560,7 @@ ZSTD_compressBlock_lazy_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); + +@@ -1562,10 +1585,10 @@ ZSTD_compressBlock_lazy_generic( + } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); ++ { size_t offbaseFound = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = offbaseFound; + } + + if (matchLength < 4) { +@@ -1579,12 +1602,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 1"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1596,17 +1619,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1615,12 +1638,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 2"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1632,17 +1655,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ +@@ -1653,24 +1676,24 @@ ZSTD_compressBlock_lazy_generic( + * notably if `value` is unsigned, resulting in a large positive `-value`. + */ + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { ++ if (OFFBASE_IS_OFFSET(offBase)) { + if (dictMode == ZSTD_noDict) { +- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) +- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ ++ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) ++ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (isDxS) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } + +@@ -1686,8 +1709,8 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? 
dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; +@@ -1701,16 +1724,20 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + +- /* Save reps for next block */ +- rep[0] = offset_1 ? offset_1 : savedOffset; +- rep[1] = offset_2 ? offset_2 : savedOffset; ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ ++ /* save reps for next block */ ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -1903,7 +1930,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + U32 curr = (U32)(ip-base); + +@@ -1922,10 +1949,10 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + } } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = ofbCandidate; + } + + if (matchLength < 4) { +@@ -1939,7 +1966,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1951,18 +1978,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 1 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1971,7 +1998,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1983,36 +2010,36 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 2 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ if (OFFBASE_IS_OFFSET(offBase)) { ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } + +@@ -2029,8 +2056,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -2096,7 +2123,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row( + size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); + } +diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h +index e5bdf4df8dde..9505bed93c03 100644 +--- a/lib/zstd/compress/zstd_lazy.h ++++ b/lib/zstd/compress/zstd_lazy.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -22,6 +23,8 @@ + */ + #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 + ++#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ ++ + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); + void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); + +@@ -113,7 +116,7 @@ size_t ZSTD_compressBlock_lazy2_extDict_row( + size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- ++ + + + #endif /* ZSTD_LAZY_H */ +diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c +index dd86fc83e7dd..b7da76b0db7c 100644 +--- a/lib/zstd/compress/zstd_ldm.c ++++ b/lib/zstd/compress/zstd_ldm.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -242,11 +243,11 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, + switch(ms->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_greedy: +@@ -549,7 +550,7 @@ size_t ZSTD_ldm_generateSequences( + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the +- * the offset against maxDist directly. ++ * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may +@@ -711,7 +712,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, +- STORE_OFFSET(sequence.offset), ++ OFFSET_TO_OFFBASE(sequence.offset), + sequence.matchLength); + ip += sequence.matchLength; + } +diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h +index fbc6a5e88fd7..c540731abde7 100644 +--- a/lib/zstd/compress/zstd_ldm.h ++++ b/lib/zstd/compress/zstd_ldm.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h +index 647f865be290..cfccfc46f6f7 100644 +--- a/lib/zstd/compress/zstd_ldm_geartab.h ++++ b/lib/zstd/compress/zstd_ldm_geartab.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c +index fd82acfda62f..a6bf7f856437 100644 +--- a/lib/zstd/compress/zstd_opt.c ++++ b/lib/zstd/compress/zstd_opt.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,7 @@ + #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ + #define ZSTD_MAX_PRICE (1<<30) + +-#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ ++#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ + + + /*-************************************* +@@ -26,27 +27,35 @@ + #if 0 /* approximation at bit level (for tests) */ + # define BITCOST_ACCURACY 0 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) ++# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat)) + #elif 0 /* fractional bit accuracy (for tests) */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) ++# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat)) + #else /* opt==approx, ultra==accurate */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) ++# define WEIGHT(stat,opt) ((opt) ? 
ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) + #endif + ++/* ZSTD_bitWeight() : ++ * provide estimated "cost" of a stat in full bits only */ + MEM_STATIC U32 ZSTD_bitWeight(U32 stat) + { + return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); + } + ++/* ZSTD_fracWeight() : ++ * provide fractional-bit "cost" of a stat, ++ * using linear interpolation approximation */ + MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + { + U32 const stat = rawStat + 1; + U32 const hb = ZSTD_highbit32(stat); + U32 const BWeight = hb * BITCOST_MULTIPLIER; ++ /* Fweight was meant for "Fractional weight" ++ * but it's effectively a value between 1 and 2 ++ * using fixed point arithmetic */ + U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + BITCOST_ACCURACY < 31); +@@ -57,7 +66,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + /* debugging function, + * @return price in bytes as fractional value + * for debug messages only */ +-MEM_STATIC double ZSTD_fCost(U32 price) ++MEM_STATIC double ZSTD_fCost(int price) + { + return (double)price / (BITCOST_MULTIPLIER*8); + } +@@ -88,20 +97,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts) + return total; + } + +-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift) ++typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e; ++ ++static U32 ++ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1) + { + U32 s, sum=0; +- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift); ++ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", ++ (unsigned)lastEltIndex+1, (unsigned)shift ); + assert(shift < 30); + for (s=0; s> shift); +- sum += table[s]; ++ unsigned const base = base1 ? 
1 : (table[s]>0); ++ unsigned const newStat = base + (table[s] >> shift); ++ sum += newStat; ++ table[s] = newStat; + } + return sum; + } + + /* ZSTD_scaleStats() : +- * reduce all elements in table is sum too large ++ * reduce all elt frequencies in table if sum too large + * return the resulting sum of elements */ + static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + { +@@ -110,7 +125,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <= 1) return prevsum; +- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); ++ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); + } + + /* ZSTD_rescaleFreqs() : +@@ -129,18 +144,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); + optPtr->priceType = zop_dynamic; + +- if (optPtr->litLengthSum == 0) { /* first block : init */ +- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ +- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); ++ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ ++ ++ /* heuristic: use pre-defined stats for too small inputs */ ++ if (srcSize <= ZSTD_PREDEF_THRESHOLD) { ++ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); + optPtr->priceType = zop_predef; + } + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { +- /* huffman table presumed generated by dictionary */ ++ ++ /* huffman stats covering the full value set : table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; + + if (compressedLiterals) { ++ /* generate literals statistics from huffman table */ + unsigned lit; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; +@@ -188,13 +207,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + +- } else { /* not a dictionary */ ++ } else { /* first block, no dictionary */ + + assert(optPtr->litFreq != NULL); + if (compressedLiterals) { ++ /* base initial cost of literals on direct frequency within src */ + unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ +- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); ++ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); + } + + { unsigned const baseLLfreqs[MaxLL+1] = { +@@ -224,10 +244,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); + } + +- + } + +- } else { /* new block : re-use previous statistics, scaled down */ ++ } else { /* new block : scale down accumulated statistics */ + + if (compressedLiterals) + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); +@@ -255,11 +274,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ + + /* dynamic statistics */ +- { U32 price = litLength * optPtr->litSumBasePrice; ++ { U32 price = optPtr->litSumBasePrice * litLength; ++ U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; + U32 u; ++ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); + for (u=0; u < litLength; u++) { +- 
assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ +- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; ++ price -= litPrice; + } + return price; + } +@@ -272,10 +294,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return WEIGHT(litLength, optLevel); +- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX +- * because it isn't representable in the zstd format. So instead just +- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block +- * would be all literals. ++ ++ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX ++ * because it isn't representable in the zstd format. ++ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. ++ * In such a case, the block would be all literals. + */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); +@@ -289,24 +312,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + } + + /* ZSTD_getMatchPrice() : +- * Provides the cost of the match part (offset + matchLength) of a sequence ++ * Provides the cost of the match part (offset + matchLength) of a sequence. + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. +- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 ++ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ + FORCE_INLINE_TEMPLATE U32 +-ZSTD_getMatchPrice(U32 const offcode, ++ZSTD_getMatchPrice(U32 const offBase, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) + { + U32 price; +- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); ++ U32 const offCode = ZSTD_highbit32(offBase); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + +- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ +- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); ++ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ ++ return WEIGHT(mlBase, optLevel) ++ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); +@@ -325,10 +349,10 @@ ZSTD_getMatchPrice(U32 const offcode, + } + + /* ZSTD_updateStats() : +- * assumption : literals + litLengtn <= iend */ ++ * assumption : literals + litLength <= iend */ + static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, +- U32 offsetCode, U32 matchLength) ++ U32 offBase, U32 matchLength) + { + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { +@@ -344,8 +368,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, + optPtr->litLengthSum++; + } + +- /* offset code : expected to follow storeSeq() numeric representation */ +- { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); ++ /* offset code : follows 
storeSeq() numeric representation */ ++ { U32 const offCode = ZSTD_highbit32(offBase); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; +@@ -552,16 +576,17 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { + ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict); + } + +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_insertBtAndGetAllMatches ( +- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ +- ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, +- const U32 rep[ZSTD_REP_NUM], +- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ +- const U32 lengthToBeat, +- U32 const mls /* template */) ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_insertBtAndGetAllMatches ( ++ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ ++ ZSTD_matchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip, const BYTE* const iLimit, ++ const ZSTD_dictMode_e dictMode, ++ const U32 rep[ZSTD_REP_NUM], ++ const U32 ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ ++ const U32 lengthToBeat, ++ const U32 mls /* template */) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); +@@ -644,7 +669,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; +- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ ++ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) +@@ -673,7 +698,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + bestLength = mlen; + assert(curr > matchIndex3); + assert(mnum==0); /* no prior solution */ +- matches[0].off = STORE_OFFSET(curr - matchIndex3); ++ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | +@@ -706,13 +731,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + } + + if (matchLength > bestLength) { +- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -754,12 +779,12 @@ U32 ZSTD_insertBtAndGetAllMatches ( + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; +- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; 
+ bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -960,7 +985,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) + { + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; +- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ ++ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; + + /* Ensure that current block position is not outside of the match */ +@@ -971,11 +996,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + } + + if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { +- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); +- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", +- candidateOffCode, candidateMatchLength, currPosInBlock); ++ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); ++ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", ++ candidateOffBase, candidateMatchLength, currPosInBlock); + matches[*nbMatches].len = candidateMatchLength; +- matches[*nbMatches].off = candidateOffCode; ++ matches[*nbMatches].off = candidateOffBase; + (*nbMatches)++; + } + } +@@ -1098,14 +1123,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + /* large match -> immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; +- U32 const maxOffcode = matches[nbMatches-1].off; +- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", +- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); ++ U32 const maxOffBase = matches[nbMatches-1].off; ++ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", ++ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { + lastSequence.litlen = litlen; + lastSequence.mlen = maxML; +- lastSequence.off = maxOffcode; ++ lastSequence.off = maxOffBase; + DEBUGLOG(6, "large match (%u>%u), immediate encoding", + maxML, sufficient_len); + cur = 0; +@@ -1122,15 +1147,15 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { +- U32 const offcode = matches[matchNb].off; ++ U32 const offBase = matches[matchNb].off; + U32 const end = matches[matchNb].len; + for ( ; pos <= end ; pos++ ) { +- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); ++ U32 const matchPrice = ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); + U32 const sequencePrice = literalsPrice + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", +- pos, ZSTD_fCost(sequencePrice)); ++ pos, ZSTD_fCost((int)sequencePrice)); + opt[pos].mlen = pos; +- opt[pos].off = offcode; ++ opt[pos].off = offBase; + opt[pos].litlen = litlen; + opt[pos].price = (int)sequencePrice; + } } +@@ -1230,7 +1255,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const startML = 
(matchNb>0) ? matches[matchNb-1].len+1 : minMatch; + U32 mlen; + +- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", ++ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", + matchNb, matches[matchNb].off, lastML, litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ +@@ -1296,7 +1321,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; +- U32 const offCode = opt[storePos].off; ++ U32 const offBase = opt[storePos].off; + U32 const advance = llen + mlen; + DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", + anchor - istart, (unsigned)llen, (unsigned)mlen); +@@ -1308,8 +1333,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + } + + assert(anchor + llen <= iend); +- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); +- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); ++ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); ++ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); + anchor += advance; + ip = anchor; + } } +@@ -1349,7 +1374,7 @@ size_t ZSTD_compressBlock_btopt( + /* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate starting values. + * only works on first block, with no dictionary and no ldm. +- * this function cannot error, hence its contract must be respected. ++ * this function cannot error out, its narrow contract must be respected. + */ + static void + ZSTD_initStats_ultra(ZSTD_matchState_t* ms, +@@ -1368,7 +1393,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ + +- /* invalidate first scan from history */ ++ /* invalidate first scan from history, only keep entropy stats */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; +@@ -1392,20 +1417,20 @@ size_t ZSTD_compressBlock_btultra2( + U32 const curr = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + +- /* 2-pass strategy: ++ /* 2-passes strategy: + * this strategy makes a first pass over first block to collect statistics +- * and seed next round's statistics with it. +- * After 1st pass, function forgets everything, and starts a new block. ++ * in order to seed next round's statistics with it. ++ * After 1st pass, function forgets history, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. + * The compression ratio gain is generally small (~0.5% on first block), +- * the cost is 2x cpu time on first block. */ ++ ** the cost is 2x cpu time on first block. 
*/ + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ +- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ +- && (srcSize > ZSTD_PREDEF_THRESHOLD) ++ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ ++ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } +diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h +index 22b862858ba7..faa73ff4b03d 100644 +--- a/lib/zstd/compress/zstd_opt.h ++++ b/lib/zstd/compress/zstd_opt.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c +index 60958afebc41..28a036f7543b 100644 +--- a/lib/zstd/decompress/huf_decompress.c ++++ b/lib/zstd/decompress/huf_decompress.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,10 +20,10 @@ + #include "../common/compiler.h" + #include "../common/bitstream.h" /* BIT_* */ + #include "../common/fse.h" /* to compress headers */ +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/error_private.h" + #include "../common/zstd_internal.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ + + /* ************************************************************** + * Constants +@@ -43,27 +44,25 @@ + #error "Cannot force the use of the X1 and X2 decoders at the same time!" + #endif + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE ++/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is ++ * supported at runtime, so we can add the BMI2 target attribute. ++ * When it is disabled, we will still get BMI2 if it is enabled statically. 
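
The reworked decoders in the hunks that follow also replace the old int bmi2 argument with a single flags word, tested further down as HUF_flags_bmi2, HUF_flags_disableAsm and HUF_flags_disableFast. A hypothetical caller-side sketch of how such a flags word might be assembled; the helper name and the CPU-detection parameter are invented for illustration and are not part of this patch.

    /* Sketch only: cpu_has_bmi2 stands in for whatever runtime CPU detection
     * the caller already performs.
     */
    static int huf_decoder_flags(int cpu_has_bmi2, int allow_asm)
    {
        int flags = 0;
        if (cpu_has_bmi2)
            flags |= HUF_flags_bmi2;       /* permit the BMI2-targeted code paths */
        if (!allow_asm)
            flags |= HUF_flags_disableAsm; /* force the portable C fast loop */
        return flags;
    }
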
++ */ ++#if DYNAMIC_BMI2 ++# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE + #else +-# define HUF_ASM_X86_64_BMI2_ATTRS ++# define HUF_FAST_BMI2_ATTRS + #endif + + #define HUF_EXTERN_C + #define HUF_ASM_DECL HUF_EXTERN_C + +-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) ++#if DYNAMIC_BMI2 + # define HUF_NEED_BMI2_FUNCTION 1 + #else + # define HUF_NEED_BMI2_FUNCTION 0 + #endif + +-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +-# define HUF_NEED_DEFAULT_FUNCTION 1 +-#else +-# define HUF_NEED_DEFAULT_FUNCTION 0 +-#endif +- + /* ************************************************************** + * Error Management + ****************************************************************/ +@@ -80,6 +79,11 @@ + /* ************************************************************** + * BMI2 Variant Wrappers + ****************************************************************/ ++typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, ++ const void *cSrc, ++ size_t cSrcSize, ++ const HUF_DTable *DTable); ++ + #if DYNAMIC_BMI2 + + #define HUF_DGEN(fn) \ +@@ -101,9 +105,9 @@ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- if (bmi2) { \ ++ if (flags & HUF_flags_bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ +@@ -113,9 +117,9 @@ + + #define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- (void)bmi2; \ ++ (void)flags; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +@@ -134,15 +138,28 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) + return dtd; + } + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 +- +-static size_t HUF_initDStream(BYTE const* ip) { ++static size_t HUF_initFastDStream(BYTE const* ip) { + BYTE const lastByte = ip[7]; +- size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); ++ assert(sizeof(size_t) == 8); + return value << bitsConsumed; + } ++ ++ ++/* ++ * The input/output arguments to the Huffman fast decoding loop: ++ * ++ * ip [in/out] - The input pointers, must be updated to reflect what is consumed. ++ * op [in/out] - The output pointers, must be updated to reflect what is written. ++ * bits [in/out] - The bitstream containers, must be updated to reflect the current state. ++ * dt [in] - The decoding table. ++ * ilimit [in] - The input limit, stop when any input pointer is below ilimit. ++ * oend [in] - The end of the output stream. op[3] must not cross oend. ++ * iend [in] - The end of each input stream. ip[i] may cross iend[i], ++ * as long as it is above ilimit, but that indicates corruption. ++ */ + typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; +@@ -151,15 +168,17 @@ typedef struct { + BYTE const* ilimit; + BYTE* oend; + BYTE const* iend[4]; +-} HUF_DecompressAsmArgs; ++} HUF_DecompressFastArgs; ++ ++typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); + + /* +- * Initializes args for the asm decoding loop. +- * @returns 0 on success +- * 1 if the fallback implementation should be used. ++ * Initializes args for the fast decoding loop. 
++ * @returns 1 on success ++ * 0 if the fallback implementation should be used. + * Or an error code on failure. + */ +-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) ++static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) + { + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; +@@ -168,9 +187,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + + BYTE* const oend = (BYTE*)dst + dstSize; + +- /* The following condition is false on x32 platform, +- * but HUF_asm is not compatible with this ABI */ +- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; ++ /* The fast decoding loop assumes 64-bit little-endian. ++ * This condition is false on x32. ++ */ ++ if (!MEM_isLittleEndian() || MEM_32bits()) ++ return 0; + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) +@@ -181,7 +202,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) +- return 1; ++ return 0; + + /* Read the jump table. */ + { +@@ -195,13 +216,13 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + +- /* HUF_initDStream() requires this, and this small of an input ++ /* HUF_initFastDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. + * length1 must be >= 16 so that ip[0] >= ilimit before the loop + * starts. + */ + if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) +- return 1; ++ return 0; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. */ +@@ -218,7 +239,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) +- return 1; ++ return 0; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. +@@ -227,10 +248,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ +- args->bits[0] = HUF_initDStream(args->ip[0]); +- args->bits[1] = HUF_initDStream(args->ip[1]); +- args->bits[2] = HUF_initDStream(args->ip[2]); +- args->bits[3] = HUF_initDStream(args->ip[3]); ++ args->bits[0] = HUF_initFastDStream(args->ip[0]); ++ args->bits[1] = HUF_initFastDStream(args->ip[1]); ++ args->bits[2] = HUF_initFastDStream(args->ip[2]); ++ args->bits[3] = HUF_initFastDStream(args->ip[3]); + + /* If ip[] >= ilimit, it is guaranteed to be safe to + * reload bits[]. 
It may be beyond its section, but is +@@ -241,10 +262,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + args->oend = oend; + args->dt = dt; + +- return 0; ++ return 1; + } + +-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) ++static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) + { + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) +@@ -258,15 +279,15 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ +- bit->bitContainer = MEM_readLE64(args->ip[stream]); +- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); ++ assert(sizeof(size_t) == 8); ++ bit->bitContainer = MEM_readLEST(args->ip[stream]); ++ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); + bit->start = (const char*)args->iend[0]; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; + } +-#endif + + + #ifndef HUF_FORCE_DECOMPRESS_X2 +@@ -283,10 +304,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi + static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { +- D4 = (symbol << 8) + nbBits; ++ D4 = (U64)((symbol << 8) + nbBits); + } else { +- D4 = symbol + (nbBits << 8); ++ D4 = (U64)(symbol + (nbBits << 8)); + } ++ assert(D4 < (1U << 16)); + D4 *= 0x0001000100010001ULL; + return D4; + } +@@ -329,13 +351,7 @@ typedef struct { + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; + } HUF_ReadDTableX1_Workspace; + +- +-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog = 0; + U32 nbSymbols = 0; +@@ -350,7 +366,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ + +- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); + if (HUF_isError(iSize)) return iSize; + + +@@ -377,9 +393,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * rankStart[0] is not filled because there are no entries in the table for + * weight 0. 
+ */ +- { +- int n; +- int nextRankStart = 0; ++ { int n; ++ U32 nextRankStart = 0; + int const unroll = 4; + int const nLimit = (int)nbSymbols - unroll + 1; + for (n=0; n<(int)tableLog+1; n++) { +@@ -406,10 +421,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. + */ +- { +- U32 w; +- int symbol=wksp->rankVal[0]; +- int rankStart=0; ++ { U32 w; ++ int symbol = wksp->rankVal[0]; ++ int rankStart = 0; + for (w=1; wrankVal[w]; + int const length = (1 << w) >> 1; +@@ -519,7 +533,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons + while (p < pEnd) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + +- return pEnd-pStart; ++ return (size_t)(pEnd-pStart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -545,6 +559,10 @@ HUF_decompress1X1_usingDTable_internal_body( + return dstSize; + } + ++/* HUF_decompress4X1_usingDTable_internal_body(): ++ * Conditions : ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X1_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -588,6 +606,7 @@ HUF_decompress4X1_usingDTable_internal_body( + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -650,38 +669,142 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ U16 const* const dtable = (U16 const*)args->dt; ++ BYTE* const oend = args->oend; ++ BYTE const* const ilimit = args->ilimit; ++ ++ /* Copy the arguments to local variables */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy(&ip, &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ int symbol; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); ++ assert(ip[stream] >= ilimit); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each iteration produces 5 output symbols per stream */ ++ size_t const oiters = (size_t)(oend - op[3]) / 5; ++ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes ++ * per stream. 
++ */ ++ size_t const iiters = (size_t)(ip[0] - ilimit) / 7; ++ /* We can safely run iters iterations before running bounds checks */ ++ size_t const iters = MIN(oiters, iiters); ++ size_t const symbols = iters * 5; ++ ++ /* We can simply check that op[3] < olimit, instead of checking all ++ * of our bounds, since we can't hit the other bounds until we've run ++ * iters iterations, which only happens when op[3] == olimit. ++ */ ++ olimit = op[3] + symbols; ++ ++ /* Exit fast decoding loop once we get close to the end. */ ++ if (op[3] + 20 > olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. ++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++ do { ++ /* Decode 5 symbols in each of the 4 streams */ ++ for (symbol = 0; symbol < 5; ++symbol) { ++ for (stream = 0; stream < 4; ++stream) { ++ int const index = (int)(bits[stream] >> 53); ++ int const entry = (int)dtable[index]; ++ bits[stream] <<= (entry & 63); ++ op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF); ++ } ++ } ++ /* Reload the bitstreams */ ++ for (stream = 0; stream < 4; ++stream) { ++ int const ctz = ZSTD_countTrailingZeros64(bits[stream]); ++ int const nbBits = ctz & 7; ++ int const nbBytes = ctz >> 3; ++ op[stream] += 5; ++ ip[stream] -= nbBytes; ++ bits[stream] = MEM_read64(ip[stream]) | 1; ++ bits[stream] <<= nbBits; ++ } ++ } while (op[3] < olimit); ++ } ++ ++_out: + +-static HUF_ASM_X86_64_BMI2_ATTRS ++ /* Save the final values of each of the state variables back to args. */ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy(&args->ip, &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++/* ++ * @returns @p dstSize on success (>= 6) ++ * 0 if the fallback implementation should be used ++ * An error if an error occurred ++ */ ++static HUF_FAST_BMI2_ATTRS + size_t +-HUF_decompress4X1_usingDTable_internal_bmi2_asm( ++HUF_decompress4X1_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) + { + void const* dt = DTable + 1; + const BYTE* const iend = (const BYTE*)cSrc + 6; + BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; +- { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); +- FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ HUF_DecompressFastArgs args; ++ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); ++ if (ret == 0) ++ return 0; + } + + assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); ++ loopFn(&args); + + /* Our loop guarantees that ip[] >= ilimit and that we haven't + * overwritten any op[]. +@@ -694,8 +817,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( + (void)iend; + + /* finish bit streams one by one. 
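
A quick worked example of the olimit bound computed above, with illustrative figures rather than numbers from a real trace: if stream 3 still has 1,000 bytes of output space (oiters = 1000/5 = 200) but ip[0] sits only 140 bytes above ilimit (iiters = 140/7 = 20), then iters = 20, symbols = 100 and olimit = op[3] + 100. Since op[3] + 20 <= olimit, the fast path is taken and the inner do/while decodes 5 symbols per stream for 20 iterations with no per-symbol bounds checks before the bound is recomputed.
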
*/ +- { +- size_t const segmentSize = (dstSize+3) / 4; ++ { size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { +@@ -712,97 +834,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( + } + + /* decoded size */ ++ assert(dstSize != 0); + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ +- +-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, +- const void *cSrc, +- size_t cSrcSize, +- const HUF_DTable *DTable); + + HUF_DGEN(HUF_decompress1X1_usingDTable_internal) + + static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + #endif +-} +- +- +-size_t HUF_decompress1X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} + +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- const BYTE* ip = (const BYTE*) cSrc; +- +- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); +- if (HUF_isError(hSize)) return hSize; +- if (hSize >= cSrcSize) return ERROR(srcSize_wrong); +- ip += hSize; cSrcSize -= hSize; +- +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ if (!(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + +-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t 
HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +@@ -985,7 +1069,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 + + static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, +- const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, ++ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) + { + U32* const rankVal = rankValOrigin[0]; +@@ -1040,14 +1124,7 @@ typedef struct { + + size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, +- const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog, maxW, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); +@@ -1069,7 +1146,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); + if (HUF_isError(iSize)) return iSize; + + /* check result */ +@@ -1240,6 +1317,11 @@ HUF_decompress1X2_usingDTable_internal_body( + /* decoded size */ + return dstSize; + } ++ ++/* HUF_decompress4X2_usingDTable_internal_body(): ++ * Conditions: ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -1280,8 +1362,9 @@ HUF_decompress4X2_usingDTable_internal_body( + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + +- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ +- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ ++ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -1366,36 +1449,177 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; + +-static HUF_ASM_X86_64_BMI2_ATTRS size_t +-HUF_decompress4X2_usingDTable_internal_bmi2_asm( ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ BYTE* oend[4]; ++ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; ++ BYTE const* const ilimit = args->ilimit; ++ ++ /* Copy the arguments to local registers. */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy(&ip, &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ oend[0] = op[1]; ++ oend[1] = op[2]; ++ oend[2] = op[3]; ++ oend[3] = args->oend; ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ int symbol; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= oend[stream]); ++ assert(ip[stream] >= ilimit); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each loop does 5 table lookups for each of the 4 streams. ++ * Each table lookup consumes up to 11 bits of input, and produces ++ * up to 2 bytes of output. ++ */ ++ /* We can consume up to 7 bytes of input per iteration per stream. ++ * We also know that each input pointer is >= ip[0]. So we can run ++ * iters loops before running out of input. 
++ */ ++ size_t iters = (size_t)(ip[0] - ilimit) / 7; ++ /* Each iteration can produce up to 10 bytes of output per stream. ++ * Each output stream my advance at different rates. So take the ++ * minimum number of safe iterations among all the output streams. ++ */ ++ for (stream = 0; stream < 4; ++stream) { ++ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; ++ iters = MIN(iters, oiters); ++ } ++ ++ /* Each iteration produces at least 5 output symbols. So until ++ * op[3] crosses olimit, we know we haven't executed iters ++ * iterations yet. This saves us maintaining an iters counter, ++ * at the expense of computing the remaining # of iterations ++ * more frequently. ++ */ ++ olimit = op[3] + (iters * 5); ++ ++ /* Exit the fast decoding loop if we are too close to the end. */ ++ if (op[3] + 10 > olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. ++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++ do { ++ /* Do 5 table lookups for each of the first 3 streams */ ++ for (symbol = 0; symbol < 5; ++symbol) { ++ for (stream = 0; stream < 3; ++stream) { ++ int const index = (int)(bits[stream] >> 53); ++ HUF_DEltX2 const entry = dtable[index]; ++ MEM_write16(op[stream], entry.sequence); ++ bits[stream] <<= (entry.nbBits); ++ op[stream] += (entry.length); ++ } ++ } ++ /* Do 1 table lookup from the final stream */ ++ { ++ int const index = (int)(bits[3] >> 53); ++ HUF_DEltX2 const entry = dtable[index]; ++ MEM_write16(op[3], entry.sequence); ++ bits[3] <<= (entry.nbBits); ++ op[3] += (entry.length); ++ } ++ /* Do 4 table lookups from the final stream & reload bitstreams */ ++ for (stream = 0; stream < 4; ++stream) { ++ /* Do a table lookup from the final stream. ++ * This is interleaved with the reloading to reduce register ++ * pressure. This shouldn't be necessary, but compilers can ++ * struggle with codegen with high register pressure. ++ */ ++ { ++ int const index = (int)(bits[3] >> 53); ++ HUF_DEltX2 const entry = dtable[index]; ++ MEM_write16(op[3], entry.sequence); ++ bits[3] <<= (entry.nbBits); ++ op[3] += (entry.length); ++ } ++ /* Reload the bistreams. The final bitstream must be reloaded ++ * after the 5th symbol was decoded. ++ */ ++ { ++ int const ctz = ZSTD_countTrailingZeros64(bits[stream]); ++ int const nbBits = ctz & 7; ++ int const nbBytes = ctz >> 3; ++ ip[stream] -= nbBytes; ++ bits[stream] = MEM_read64(ip[stream]) | 1; ++ bits[stream] <<= nbBits; ++ } ++ } ++ } while (op[3] < olimit); ++ } ++ ++_out: ++ ++ /* Save the final values of each of the state variables back to args. 
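
Both fast C loops above rely on the same marker-bit trick for their reload step: the 64-bit container is consumed MSB-first with a left shift, and because the loader ORs a 1 into bit 0, ZSTD_countTrailingZeros64() recovers exactly how many bits have been eaten since the last 8-byte load. A simplified standalone paraphrase of that refill, assuming a 64-bit little-endian host just as the real loops do; the names are invented for illustration.

    #include <string.h> /* memcpy */

    typedef struct { const unsigned char* ip; unsigned long long bits; } MiniStream;

    /* Consuming n bits is simply 'bits <<= n'. The refill assumes fewer than
     * 64 bits were consumed since the last load (at most 55 per iteration in
     * the loops above), so the marker bit is still present.
     */
    static void miniRefill(MiniStream* s)
    {
        int const consumed = __builtin_ctzll(s->bits); /* bits eaten since the last load */
        s->ip -= consumed >> 3;                        /* streams are read backwards, whole bytes only */
        memcpy(&s->bits, s->ip, sizeof(s->bits));      /* reload 8 little-endian bytes */
        s->bits |= 1;                                  /* re-plant the marker bit */
        s->bits <<= (consumed & 7);                    /* re-consume the leftover sub-byte bits */
    }
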
*/ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy(&args->ip, &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++ ++static HUF_FAST_BMI2_ATTRS size_t ++HUF_decompress4X2_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) { ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) { + void const* dt = DTable + 1; + const BYTE* const iend = (const BYTE*)cSrc + 6; + BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; ++ HUF_DecompressFastArgs args; + { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (ret == 0) ++ return 0; + } + + assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); ++ loopFn(&args); + + /* note : op4 already verified within main loop */ + assert(args.ip[0] >= iend); +@@ -1426,91 +1650,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm( + /* decoded size */ + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + + static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + #endif ++ ++ if (!(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) + +-size_t HUF_decompress1X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- + size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* 
workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); + } + +-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +@@ -1518,44 +1723,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + /* Universal decompression selectors */ + /* ***********************************/ + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- + + #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) + typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +@@ -1610,36 +1777,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) + #endif + } + +- +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, +- size_t dstSize, const void* cSrc, +- size_t cSrcSize, void* workSpace, +- size_t wkspSize) +-{ +- /* validation checks */ +- if (dstSize == 0) return ERROR(dstSize_tooSmall); +- if (cSrcSize == 0) return ERROR(corruption_detected); +- +- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)algoNb; +- assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)algoNb; +- assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#else +- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): +- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#endif +- } +-} +- + size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1652,71 +1792,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #else + return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): ++ cSrcSize, workSpace, wkspSize, flags): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #endif + } + } + + +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + #endif + +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1726,15 +1866,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #else +- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : +- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : ++ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #endif + } + } +- +diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c +index dbbc7919de53..4f801e0dd564 100644 +--- a/lib/zstd/decompress/zstd_ddict.c ++++ b/lib/zstd/decompress/zstd_ddict.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -19,7 +20,6 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_decompress_internal.h" + #include "zstd_ddict.h" +@@ -131,7 +131,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, + ZSTD_memcpy(internalBuffer, dict, dictSize); + } + ddict->dictSize = dictSize; +- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); +@@ -237,5 +237,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) + unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) + { + if (ddict==NULL) return 0; +- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); ++ return ddict->dictID; + } +diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h +index 8c1a79d666f8..de459a0dacd1 100644 +--- a/lib/zstd/decompress/zstd_ddict.h ++++ b/lib/zstd/decompress/zstd_ddict.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c +index b9b935a9f5c0..d7eebb17a2c5 100644 +--- a/lib/zstd/decompress/zstd_decompress.c ++++ b/lib/zstd/decompress/zstd_decompress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -56,13 +57,13 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */ + #include "../common/zstd_internal.h" /* blockProperties_t */ + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + +@@ -72,11 +73,11 @@ + *************************************/ + + #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. +- * Currently, that means a 0.75 load factor. +- * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded +- * the load factor of the ddict hash set. +- */ ++#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. ++ * Currently, that means a 0.75 load factor. ++ * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded ++ * the load factor of the ddict hash set. 
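
As a worked example of the integer load-factor check described in the comment above: with COUNT_MULT = 4 and SIZE_MULT = 3 the threshold is 3/4, so for the 64-slot base table the quotient count * 4 / (64 * 3) first becomes non-zero at count = 48, which is exactly a 0.75 load factor and the point at which the hash set grows by DDICT_HASHSET_RESIZE_FACTOR.
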
++ */ + + #define DDICT_HASHSET_TABLE_BASE_SIZE 64 + #define DDICT_HASHSET_RESIZE_FACTOR 2 +@@ -237,6 +238,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) + dctx->outBufferMode = ZSTD_bm_buffered; + dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; ++ dctx->disableHufAsm = 0; + } + + static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +@@ -421,16 +423,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ ++** or an error code, which can be tested using ZSTD_isError() */ + size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) + { + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + +- ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */ +- if (srcSize < minInputSize) return minInputSize; +- RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); ++ DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); ++ ++ if (srcSize > 0) { ++ /* note : technically could be considered an assert(), since it's an invalid entry */ ++ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); ++ } ++ if (srcSize < minInputSize) { ++ if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { ++ /* when receiving less than @minInputSize bytes, ++ * control these bytes at least correspond to a supported magic number ++ * in order to error out early if they don't. ++ **/ ++ size_t const toCopy = MIN(4, srcSize); ++ unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); ++ assert(src != NULL); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { ++ /* not a zstd frame : let's check if it's a skippable frame */ ++ MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { ++ RETURN_ERROR(prefix_unknown, ++ "first bytes don't correspond to any supported magic number"); ++ } } } ++ return minInputSize; ++ } + ++ ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { +@@ -730,10 +756,11 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + ip += 4; + } + ++ frameSizeInfo.nbBlocks = nbBlocks; + frameSizeInfo.compressedSize = (size_t)(ip - ipstart); + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? 
zfh.frameContentSize +- : nbBlocks * zfh.blockSizeMax; ++ : (unsigned long long)nbBlocks * zfh.blockSizeMax; + return frameSizeInfo; + } + } +@@ -773,6 +800,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + return bound; + } + ++size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) ++{ ++ size_t margin = 0; ++ unsigned maxBlockSize = 0; ++ ++ /* Iterate over each frame */ ++ while (srcSize > 0) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); ++ size_t const compressedSize = frameSizeInfo.compressedSize; ++ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; ++ ZSTD_frameHeader zfh; ++ ++ FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); ++ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) ++ return ERROR(corruption_detected); ++ ++ if (zfh.frameType == ZSTD_frame) { ++ /* Add the frame header to our margin */ ++ margin += zfh.headerSize; ++ /* Add the checksum to our margin */ ++ margin += zfh.checksumFlag ? 4 : 0; ++ /* Add 3 bytes per block */ ++ margin += 3 * frameSizeInfo.nbBlocks; ++ ++ /* Compute the max block size */ ++ maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); ++ } else { ++ assert(zfh.frameType == ZSTD_skippableFrame); ++ /* Add the entire skippable frame size to our margin. */ ++ margin += compressedSize; ++ } ++ ++ assert(srcSize >= compressedSize); ++ src = (const BYTE*)src + compressedSize; ++ srcSize -= compressedSize; ++ } ++ ++ /* Add the max block size back to the margin. */ ++ margin += maxBlockSize; ++ ++ return margin; ++} + + /*-************************************************************* + * Frame decoding +@@ -798,7 +867,7 @@ static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity, + if (srcSize == 0) return 0; + RETURN_ERROR(dstBuffer_null, ""); + } +- ZSTD_memcpy(dst, src, srcSize); ++ ZSTD_memmove(dst, src, srcSize); + return srcSize; + } + +@@ -858,6 +927,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + + /* Loop on each block */ + while (1) { ++ BYTE* oBlockEnd = oend; + size_t decodedSize; + blockProperties_t blockProperties; + size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSrcSize, &blockProperties); +@@ -867,16 +937,34 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + remainingSrcSize -= ZSTD_blockHeaderSize; + RETURN_ERROR_IF(cBlockSize > remainingSrcSize, srcSize_wrong, ""); + ++ if (ip >= op && ip < oBlockEnd) { ++ /* We are decompressing in-place. Limit the output pointer so that we ++ * don't overwrite the block that we are currently reading. This will ++ * fail decompression if the input & output pointers aren't spaced ++ * far enough apart. ++ * ++ * This is important to set, even when the pointers are far enough ++ * apart, because ZSTD_decompressBlock_internal() can decide to store ++ * literals in the output buffer, after the block it is decompressing. ++ * Since we don't want anything to overwrite our input, we have to tell ++ * ZSTD_decompressBlock_internal to never write past ip. ++ * ++ * See ZSTD_allocateLiteralsBuffer() for reference. 
++ */ ++ oBlockEnd = op + (ip - op); ++ } ++ + switch(blockProperties.blockType) + { + case bt_compressed: +- decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oend-op), ip, cBlockSize, /* frame */ 1, not_streaming); ++ decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming); + break; + case bt_raw : ++ /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */ + decodedSize = ZSTD_copyRawBlock(op, (size_t)(oend-op), ip, cBlockSize); + break; + case bt_rle : +- decodedSize = ZSTD_setRleBlock(op, (size_t)(oend-op), *ip, blockProperties.origSize); ++ decodedSize = ZSTD_setRleBlock(op, (size_t)(oBlockEnd-op), *ip, blockProperties.origSize); + break; + case bt_reserved : + default: +@@ -911,6 +999,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + } + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); + /* Allow caller to get size read */ ++ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr); + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return (size_t)(op-ostart); +@@ -1042,8 +1131,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr + size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } + + /* +- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, +- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can ++ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we ++ * allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. + * + * For blocks that can be streamed, this allows us to reduce the latency until we produce +@@ -1243,7 +1332,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } + } + +@@ -1284,11 +1373,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, (size_t)(dictEnd - dictPtr), +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + dictPtr += hSize; +@@ -1384,7 +1473,7 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; +- dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; +@@ -1446,7 +1535,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * This could for one of the following reasons : + * - The frame does not require a dictionary (most common case). 
+ * - The frame was built with dictID intentionally removed. +- * Needed dictionary is a hidden information. ++ * Needed dictionary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. +@@ -1455,7 +1544,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * ZSTD_getFrameHeader(), which will provide a more precise error code. */ + unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) + { +- ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; ++ ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); + if (ZSTD_isError(hError)) return 0; + return zfp.dictID; +@@ -1562,7 +1651,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di + size_t ZSTD_initDStream(ZSTD_DStream* zds) + { + DEBUGLOG(4, "ZSTD_initDStream"); +- return ZSTD_initDStream_usingDDict(zds, NULL); ++ FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); ++ FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); ++ return ZSTD_startingInputLength(zds->format); + } + + /* ZSTD_initDStream_usingDDict() : +@@ -1570,20 +1661,12 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) + * this function cannot fail */ + size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + { ++ DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); + } + +-/* ZSTD_resetDStream() : +- * return : expected size, aka ZSTD_startingInputLength(). 
+- * this function cannot fail */ +-size_t ZSTD_resetDStream(ZSTD_DStream* dctx) +-{ +- FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); +- return ZSTD_startingInputLength(dctx->format); +-} +- + + size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) + { +@@ -1651,6 +1734,11 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; ++ case ZSTD_d_disableHuffmanAssembly: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ + default:; + } + bounds.error = ERROR(parameter_unsupported); +@@ -1691,6 +1779,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ *value = (int)dctx->disableHufAsm; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1724,6 +1815,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); ++ dctx->disableHufAsm = value != 0; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1899,7 +1994,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } +- DEBUGLOG(5, "header size : %u", (U32)hSize); + if (ZSTD_isError(hSize)) { + return hSize; /* error */ + } +@@ -1913,6 +2007,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->lhSize += remainingInput; + } + input->pos = input->size; ++ /* check first few bytes */ ++ FORWARD_IF_ERROR( ++ ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), ++ "First few bytes detected incorrect" ); ++ /* return hint input size */ + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); +@@ -1930,8 +2029,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; + DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") ++ assert(istart != NULL); + ip = istart + cSize; +- op += decompressedSize; ++ op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; +@@ -2015,6 +2115,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); ++ assert(ip != NULL); + ip += neededInSize; + /* Function modifies the stage so we must break */ + break; +@@ -2029,7 +2130,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. 
*/ +- assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); ++ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { +@@ -2038,8 +2139,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); + } +- ip += loadedSize; +- zds->inPos += loadedSize; ++ if (loadedSize != 0) { ++ /* ip may be NULL */ ++ ip += loadedSize; ++ zds->inPos += loadedSize; ++ } + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ +@@ -2049,14 +2153,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + break; + } + case zdss_flush: +- { size_t const toFlushSize = zds->outEnd - zds->outStart; ++ { ++ size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); +- op += flushedSize; ++ ++ op = op ? op + flushedSize : op; ++ + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) +- && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { ++ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); +@@ -2070,7 +2177,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } } + + /* result */ +@@ -2083,8 +2190,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { +- RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); +- RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); ++ RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); ++ RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); + assert(0); + } + } else { +@@ -2121,11 +2228,17 @@ size_t ZSTD_decompressStream_simpleArgs ( + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; +- /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; ++ { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } +diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c +index c1913b8e7c89..ffbe53ba0346 100644 +--- 
a/lib/zstd/decompress/zstd_decompress_block.c ++++ b/lib/zstd/decompress/zstd_decompress_block.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,12 +21,12 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/zstd_internal.h" + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + /*_******************************************************* + * Macros +@@ -89,7 +90,7 @@ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; + } + else { +- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ ++ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } +@@ -134,13 +135,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + ZSTD_FALLTHROUGH; + + case set_compressed: +- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); ++ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; + size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ int const flags = 0 ++ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) ++ | (dctx->disableHufAsm ? 
HUF_flags_disableAsm : 0); + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -165,6 +169,10 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ if (!singleStream) ++ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, ++ "Not enough literals (%zu) for the 4-streams mode (min %u)", ++ litSize, MIN_LITERALS_FOR_4_STREAMS); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); +@@ -176,13 +184,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + + if (litEncType==set_repeat) { + if (singleStream) { +- hufSuccess = HUF_decompress1X_usingDTable_bmi2( ++ hufSuccess = HUF_decompress1X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } else { +- hufSuccess = HUF_decompress4X_usingDTable_bmi2( ++ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); ++ hufSuccess = HUF_decompress4X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } + } else { + if (singleStream) { +@@ -190,18 +199,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace)); ++ sizeof(dctx->workspace), flags); + #else +- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( ++ hufSuccess = HUF_decompress1X1_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + #endif + } else { +- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( ++ hufSuccess = HUF_decompress4X_hufOnly_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + } + } + if (dctx->litBufferLocation == ZSTD_split) +@@ -237,6 +246,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); + litSize = MEM_readLE24(istart) >> 4; + break; + } +@@ -279,12 +289,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 1: + lhSize = 2; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); + litSize = MEM_readLE24(istart) >> 4; +- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +@@ -506,14 +517,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; ++ assert(n>=0); ++ pos += (size_t)n; + } + } + /* Now we spread 
those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. + */ + { + size_t position = 0; +@@ -540,7 +552,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i=0; i highThreshold) position = (position + step) & tableMask; /* lowprob area */ ++ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } +@@ -551,7 +563,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (u=0; ustateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); ++#else + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; ++#endif + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; +@@ -1186,9 +1221,13 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; ++ ++ assert(llBits <= MaxLLBits); ++ assert(mlBits <= MaxMLBits); ++ assert(ofBits <= MaxOff); + /* + * As gcc has better branch and block analyzers, sometimes it is only +- * valuable to mark likelyness for clang, it gives around 3-4% of ++ * valuable to mark likeliness for clang, it gives around 3-4% of + * performance. + */ + +@@ -1201,13 +1240,16 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); +- assert(ofBits <= MaxOff); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { +- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); ++ /* Always read extra bits, this keeps the logic simple, ++ * avoids branches, and avoids accidentally reading 0 bits. 
++ */ ++ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); +- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); +- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ ++ offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); +@@ -1552,7 +1594,7 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_body"); ++ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); + (void)frame; + + /* Regen sequences */ +@@ -1945,34 +1987,79 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + ++/* ++ * @returns The total size of the history referencable by zstd, including ++ * both the prefix and the extDict. At @p op any offset larger than this ++ * is invalid. ++ */ ++static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) ++{ ++ return (size_t)(op - virtualStart); ++} + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +-/* ZSTD_getLongOffsetsShare() : ++typedef struct { ++ unsigned longOffsetShare; ++ unsigned maxNbAdditionalBits; ++} ZSTD_OffsetInfo; ++ ++/* ZSTD_getOffsetInfo() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) +- * compared to maximum possible of (1< 22) total += 1; ++ ZSTD_OffsetInfo info = {0, 0}; ++ /* If nbSeq == 0, then the offTable is uninitialized, but we have ++ * no sequences, so both values should be 0. ++ */ ++ if (nbSeq != 0) { ++ const void* ptr = offTable; ++ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; ++ const ZSTD_seqSymbol* table = offTable + 1; ++ U32 const max = 1 << tableLog; ++ U32 u; ++ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); ++ ++ assert(max <= (1 << OffFSELog)); /* max not too large */ ++ for (u=0; u 22) info.longOffsetShare += 1; ++ } ++ ++ assert(tableLog <= OffFSELog); ++ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + } + +- assert(tableLog <= OffFSELog); +- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ ++ return info; ++} + +- return total; ++/* ++ * @returns The maximum offset we can decode in one read of our bitstream, without ++ * reloading more bits in the middle of the offset bits read. Any offsets larger ++ * than this must use the long offset decoder. ++ */ ++static size_t ZSTD_maxShortOffset(void) ++{ ++ if (MEM_64bits()) { ++ /* We can decode any offset without reloading bits. ++ * This might change if the max window size grows. ++ */ ++ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); ++ return (size_t)-1; ++ } else { ++ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. ++ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. ++ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. 
++ */ ++ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; ++ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; ++ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); ++ return maxOffset; ++ } + } +-#endif + + size_t + ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, +@@ -1980,20 +2067,21 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, const int frame, const streaming_operation streaming) + { /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; +- /* isLongOffset must be true if there are long offsets. +- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. +- * We don't expect that to be the case in 64-bit mode. +- * In block mode, window size is not known, so we have to be conservative. +- * (note: but it could be evaluated from current-lowLimit) +- */ +- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); + DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); + +- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); ++ /* Note : the wording of the specification ++ * allows compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX. ++ * This generally does not happen, as it makes little sense, ++ * since an uncompressed block would feature same size and have no decompression cost. ++ * Also, note that decoder from reference libzstd before < v1.5.4 ++ * would consider this edge case as an error. ++ * As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX ++ * for broader compatibility with the deployed ecosystem of zstd decoders */ ++ RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); +- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); ++ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; +@@ -2001,6 +2089,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + + /* Build Decoding Tables */ + { ++ /* Compute the maximum block size, which must also work when !frame and fParams are unset. ++ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. ++ */ ++ size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX)); ++ size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const*)dctx->virtualStart); ++ /* isLongOffset must be true if there are long offsets. ++ * Offsets are long if they are larger than ZSTD_maxShortOffset(). ++ * We don't expect that to be the case in 64-bit mode. ++ * ++ * We check here to see if our history is large enough to allow long offsets. ++ * If it isn't, then we can't possible have (valid) long offsets. If the offset ++ * is invalid, then it is okay to read it incorrectly. ++ * ++ * If isLongOffsets is true, then we will later check our decoding table to see ++ * if it is even possible to generate long offsets. ++ */ ++ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. 
+@@ -2008,6 +2113,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; ++#else ++ /* Set to 1 to avoid computing offset info if we don't need to. ++ * Otherwise this value is ignored. ++ */ ++ int usePrefetchDecoder = 1; + #endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); +@@ -2017,26 +2127,38 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + + RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if ( !usePrefetchDecoder +- && (!frame || (dctx->fParams.windowSize > (1<<24))) +- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ +- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); +- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ +- usePrefetchDecoder = (shareLongOffsets >= minShare); ++ /* If we could potentially have long offsets, or we might want to use the prefetch decoder, ++ * compute information about the share of long offsets, and the maximum nbAdditionalBits. ++ * NOTE: could probably use a larger nbSeq limit ++ */ ++ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { ++ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); ++ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { ++ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small ++ * enough, then we know it is impossible to have too long an offset in this block, so we can ++ * use the regular offset decoder. ++ */ ++ isLongOffset = ZSTD_lo_isRegularOffset; ++ } ++ if (!usePrefetchDecoder) { ++ U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ ++ usePrefetchDecoder = (info.longOffsetShare >= minShare); ++ } + } +-#endif + + dctx->ddictIsCold = 0; + + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if (usePrefetchDecoder) ++ if (usePrefetchDecoder) { ++#else ++ (void)usePrefetchDecoder; ++ { + #endif + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); + #endif ++ } + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ +diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h +index 3d2d57a5d25a..e372f048d186 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.h ++++ b/lib/zstd/decompress/zstd_decompress_block.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h +index 98102edb6a83..32f79fb2873d 100644 +--- a/lib/zstd/decompress/zstd_decompress_internal.h ++++ b/lib/zstd/decompress/zstd_decompress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. 
and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = { + + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) ++#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 + + typedef struct { + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ +- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ ++ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ + U32 rep[ZSTD_REP_NUM]; + U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; + } ZSTD_entropyDTables_t; +@@ -164,6 +166,7 @@ struct ZSTD_DCtx_s + ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */ ++ int disableHufAsm; + + /* streaming */ + ZSTD_dStreamStage streamStage; +diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h +index a06ca187aab5..8a47eb2a4514 100644 +--- a/lib/zstd/decompress_sources.h ++++ b/lib/zstd/decompress_sources.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c +index 22686e367e6f..2fead39eb743 100644 +--- a/lib/zstd/zstd_common_module.c ++++ b/lib/zstd/zstd_common_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c +index 04e1b5c01d9b..8ecf43226af2 100644 +--- a/lib/zstd/zstd_compress_module.c ++++ b/lib/zstd/zstd_compress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c +index f4ed952ed485..7d31518e9d5a 100644 +--- a/lib/zstd/zstd_decompress_module.c ++++ b/lib/zstd/zstd_decompress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -77,7 +77,7 @@ EXPORT_SYMBOL(zstd_init_dstream); + + size_t zstd_reset_dstream(zstd_dstream *dstream) + { +- return ZSTD_resetDStream(dstream); ++ return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only); + } + EXPORT_SYMBOL(zstd_reset_dstream); + +-- +2.39.2 diff --git a/patches/0002-cfs-nice.patch b/patches/0002-cfs-nice.patch new file mode 100644 index 0000000..fdb957a --- /dev/null +++ b/patches/0002-cfs-nice.patch @@ -0,0 +1,1029 @@ +From 78440b24f24a021daf660c0bd212c936e50e5f0a Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 17 Feb 2023 15:38:09 +0100 +Subject: [PATCH] Add latency priority for CFS class + +This patchset restarts the work about adding a latency priority to describe +the latency tolerance of cfs tasks. + +Patch [1] is a new one that has been added with v6. It fixes an +unfairness for low prio tasks because of wakeup_gran() being bigger +than the maximum vruntime credit that a waking task can keep after +sleeping. + +The patches [2-4] have been done by Parth: +https://lore.kernel.org/lkml/20200228090755.22829-1-parth@linux.ibm.com/ + +I have just rebased and moved the set of latency priority outside the +priority update. I have removed the reviewed tag because the patches +are 2 years old. + +This aims to be a generic interface and the following patches is one use +of it to improve the scheduling latency of cfs tasks. + +Patch [5] uses latency nice priority to define a latency offset +and then decide if a cfs task can or should preempt the current +running task. The patch gives some tests results with cyclictests and +hackbench to highlight the benefit of latency priority for short +interactive task or long intensive tasks. + +Patch [6] adds the support of latency nice priority to task group by +adding a cpu.latency.nice field. The range is [-20:19] as for setting task +latency priority. + +Patch [7] makes sched_core taking into account the latency offset. + +Patch [8] adds a rb tree to cover some corner cases where the latency +sensitive task (priority < 0) is preempted by high priority task (RT/DL) +or fails to preempt them. This patch ensures that tasks will have at least +a slice of sched_min_granularity in priority at wakeup. + +Patch [9] removes useless check after adding a latency rb tree. + +I have also backported the patchset on a dragonboard RB3 with an android +mainline kernel based on v5.18 for a quick test. I have used the +TouchLatency app which is part of AOSP and described to be a very good +test to highlight jitter and jank frame sources of a system [1]. +In addition to the app, I have added some short running tasks waking-up +regularly (to use the 8 cpus for 4 ms every 37777us) to stress the system +without overloading it (and disabling EAS). The 1st results shows that the +patchset helps to reduce the missed deadline frames from 5% to less than +0.1% when the cpu.latency.nice of task group are set. I haven't rerun the +test with latest version. + +I have also tested the patchset with the modified version of the alsa +latency test that has been shared by Tim. The test quickly xruns with +default latency nice priority 0 but is able to run without underuns with +a latency -20 and hackbench running simultaneously. + +While preparing the version 8, I have evaluated the benefit of using an +augmented rbtree instead of adding a rbtree for latency sensitive entities, +which was a relevant suggestion done by PeterZ. 
Although the augmented +rbtree enables to sort additional information in the tree with a limited +overhead, it has more impact on legacy use cases (latency_nice >= 0) +because the augmented callbacks are always called to maintain this +additional information even when there is no sensitive tasks. In such +cases, the dedicated rbtree remains empty and the overhead is reduced to +loading a cached null node pointer. Nevertheless, we might want to +reconsider the augmented rbtree once the use of negative latency_nice will +be more widlely deployed. At now, the different tests that I have done, +have not shown improvements with augmented rbtree. + +Below are some hackbench results: + 2 rbtrees augmented rbtree augmented rbtree + sorted by vruntime sorted by wakeup_vruntime +sched pipe +avg 26311,000 25976,667 25839,556 +stdev 0,15 % 0,28 % 0,24 % +vs tip 0,50 % -0,78 % -1,31 % +hackbench 1 group +avg 1,315 1,344 1,359 +stdev 0,88 % 1,55 % 1,82 % +vs tip -0,47 % -2,68 % -3,87 % +hackbench 4 groups +avg 1,339 1,365 1,367 +stdev 2,39 % 2,26 % 3,58 % +vs tip -0,08 % -2,01 % -2,22 % +hackbench 8 groups +avg 1,233 1,286 1,301 +stdev 0,74 % 1,09 % 1,52 % +vs tip 0,29 % -4,05 % -5,27 % +hackbench 16 groups +avg 1,268 1,313 1,319 +stdev 0,85 % 1,60 % 0,68 % +vs tip -0,02 % -3,56 % -4,01 % + +[1] https://source.android.com/docs/core/debug/eval_perf#touchlatency + +Change since v9: +- Rebase +- add tags + +Change since v8: +- Rename get_sched_latency by get_sleep_latency +- move latency nice defines in sched/prio.h and fix latency_prio init value +- Fix typo and comments + +Change since v7: +- Replaced se->on_latency by using RB_CLEAR_NODE() and RB_EMPTY_NODE() +- Clarify the limit behavior fo the cgroup cpu.latenyc_nice + +Change since v6: +- Fix compilation error for !CONFIG_SCHED_DEBUG + +Change since v5: +- Add patch 1 to fix unfairness for low prio task. This has been + discovered while studying Youssef's tests results with latency nice + which were hitting the same problem. +- Fixed latency_offset computation to take into account + GENTLE_FAIR_SLEEPERS. This has diseappeared with v2and has been raised + by Youssef's tests. +- Reworked and optimized how latency_offset in used to check for + preempting current task at wakeup and tick. This cover more cases too. +- Add patch 9 to remove check_preempt_from_others() which is not needed + anymore with the rb tree. + +Change since v4: +- Removed permission checks to set latency priority. This enables user + without elevated privilege like audio application to set their latency + priority as requested by Tim. +- Removed cpu.latency and replaced it by cpu.latency.nice so we keep a + generic interface not tied to latency_offset which can be used to + implement other latency features. +- Added an entry in Documentation/admin-guide/cgroup-v2.rst to describe + cpu.latency.nice. +- Fix some typos. + +Change since v3: +- Fix 2 compilation warnings raised by kernel test robot + +Change since v2: +- Set a latency_offset field instead of saving a weight and computing it + on the fly. +- Make latency_offset available for task group: cpu.latency +- Fix some corner cases to make latency sensitive tasks schedule first and + add a rb tree for latency sensitive task. 
+ +Change since v1: +- fix typo +- move some codes in the right patch to make bisect happy +- simplify and fixed how the weight is computed +- added support of sched core patch 7 + +Parth Shah (3): + sched: Introduce latency-nice as a per-task attribute + sched/core: Propagate parent task's latency requirements to the child + task + sched: Allow sched_{get,set}attr to change latency_nice of the task + +Vincent Guittot (6): + sched/fair: fix unfairness at wakeup + sched/fair: Take into account latency priority at wakeup + sched/fair: Add sched group latency support + sched/core: Support latency priority with sched core + sched/fair: Add latency list + sched/fair: remove check_preempt_from_others + +Signed-off-by: Peter Jung +--- + Documentation/admin-guide/cgroup-v2.rst | 10 ++ + include/linux/sched.h | 4 + + include/linux/sched/prio.h | 27 +++ + include/uapi/linux/sched.h | 4 +- + include/uapi/linux/sched/types.h | 19 +++ + init/init_task.c | 1 + + kernel/sched/core.c | 106 ++++++++++++ + kernel/sched/debug.c | 1 + + kernel/sched/fair.c | 209 ++++++++++++++++++++---- + kernel/sched/sched.h | 45 ++++- + tools/include/uapi/linux/sched.h | 4 +- + 11 files changed, 394 insertions(+), 36 deletions(-) + +diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst +index 74cec76be9f2..2e511d4a4c6a 100644 +--- a/Documentation/admin-guide/cgroup-v2.rst ++++ b/Documentation/admin-guide/cgroup-v2.rst +@@ -1118,6 +1118,16 @@ All time durations are in microseconds. + values similar to the sched_setattr(2). This maximum utilization + value is used to clamp the task specific maximum utilization clamp. + ++ cpu.latency.nice ++ A read-write single value file which exists on non-root ++ cgroups. The default is "0". ++ ++ The nice value is in the range [-20, 19]. ++ ++ This interface file allows reading and setting latency using the ++ same values used by sched_setattr(2). The latency_nice of a group is ++ used to limit the impact of the latency_nice of a task outside the ++ group. + + + Memory +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 28ce1be0ba47..df219c7cd6aa 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -548,6 +548,7 @@ struct sched_entity { + /* For load-balancing: */ + struct load_weight load; + struct rb_node run_node; ++ struct rb_node latency_node; + struct list_head group_node; + unsigned int on_rq; + +@@ -571,6 +572,8 @@ struct sched_entity { + /* cached value of my_q->h_nr_running */ + unsigned long runnable_weight; + #endif ++ /* preemption offset in ns */ ++ long latency_offset; + + #ifdef CONFIG_SMP + /* +@@ -787,6 +790,7 @@ struct task_struct { + int static_prio; + int normal_prio; + unsigned int rt_priority; ++ int latency_prio; + + struct sched_entity se; + struct sched_rt_entity rt; +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index ab83d85e1183..be79503d86af 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -42,4 +42,31 @@ static inline long rlimit_to_nice(long prio) + return (MAX_NICE - prio + 1); + } + ++/* ++ * Latency nice is meant to provide scheduler hints about the relative ++ * latency requirements of a task with respect to other tasks. ++ * Thus a task with latency_nice == 19 can be hinted as the task with no ++ * latency requirements, in contrast to the task with latency_nice == -20 ++ * which should be given priority in terms of lower latency. 
++ */ ++#define MAX_LATENCY_NICE 19 ++#define MIN_LATENCY_NICE -20 ++ ++#define LATENCY_NICE_WIDTH \ ++ (MAX_LATENCY_NICE - MIN_LATENCY_NICE + 1) ++ ++/* ++ * Default tasks should be treated as a task with latency_nice = 0. ++ */ ++#define DEFAULT_LATENCY_NICE 0 ++#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2) ++ ++/* ++ * Convert user-nice values [ -20 ... 0 ... 19 ] ++ * to static latency [ 0..39 ], ++ * and back. ++ */ ++#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO) ++#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO) ++ + #endif /* _LINUX_SCHED_PRIO_H */ +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 3bac0a8ceab2..b2e932c25be6 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -132,6 +132,7 @@ struct clone_args { + #define SCHED_FLAG_KEEP_PARAMS 0x10 + #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 + #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 ++#define SCHED_FLAG_LATENCY_NICE 0x80 + + #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ + SCHED_FLAG_KEEP_PARAMS) +@@ -143,6 +144,7 @@ struct clone_args { + SCHED_FLAG_RECLAIM | \ + SCHED_FLAG_DL_OVERRUN | \ + SCHED_FLAG_KEEP_ALL | \ +- SCHED_FLAG_UTIL_CLAMP) ++ SCHED_FLAG_UTIL_CLAMP | \ ++ SCHED_FLAG_LATENCY_NICE) + + #endif /* _UAPI_LINUX_SCHED_H */ +diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h +index f2c4589d4dbf..db1e8199e8c8 100644 +--- a/include/uapi/linux/sched/types.h ++++ b/include/uapi/linux/sched/types.h +@@ -10,6 +10,7 @@ struct sched_param { + + #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ + #define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */ ++#define SCHED_ATTR_SIZE_VER2 60 /* add: latency_nice */ + + /* + * Extended scheduling parameters data structure. +@@ -98,6 +99,22 @@ struct sched_param { + * scheduled on a CPU with no more capacity than the specified value. + * + * A task utilization boundary can be reset by setting the attribute to -1. ++ * ++ * Latency Tolerance Attributes ++ * =========================== ++ * ++ * A subset of sched_attr attributes allows to specify the relative latency ++ * requirements of a task with respect to the other tasks running/queued in the ++ * system. ++ * ++ * @ sched_latency_nice task's latency_nice value ++ * ++ * The latency_nice of a task can have any value in a range of ++ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE]. ++ * ++ * A task with latency_nice with the value of LATENCY_NICE_MIN can be ++ * taken for a task requiring a lower latency as opposed to the task with ++ * higher latency_nice. 
+ */ + struct sched_attr { + __u32 size; +@@ -120,6 +137,8 @@ struct sched_attr { + __u32 sched_util_min; + __u32 sched_util_max; + ++ /* latency requirement hints */ ++ __s32 sched_latency_nice; + }; + + #endif /* _UAPI_LINUX_SCHED_TYPES_H */ +diff --git a/init/init_task.c b/init/init_task.c +index ff6c4b9bfe6b..071deff8dbd1 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -78,6 +78,7 @@ struct task_struct init_task + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++ .latency_prio = DEFAULT_LATENCY_PRIO, + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .user_cpus_ptr = NULL, +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 5237639786b7..5d6a283a4da9 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1283,6 +1283,16 @@ static void set_load_weight(struct task_struct *p, bool update_load) + } + } + ++static void set_latency_offset(struct task_struct *p) ++{ ++ long weight = sched_latency_to_weight[p->latency_prio]; ++ s64 offset; ++ ++ offset = weight * get_sleep_latency(false); ++ offset = div_s64(offset, NICE_LATENCY_WEIGHT_MAX); ++ p->se.latency_offset = (long)offset; ++} ++ + #ifdef CONFIG_UCLAMP_TASK + /* + * Serializes updates of utilization clamp values +@@ -4432,6 +4442,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.dur_avg = 0; + p->se.prev_sleep_sum_runtime = 0; + INIT_LIST_HEAD(&p->se.group_node); ++ RB_CLEAR_NODE(&p->se.latency_node); + + #ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; +@@ -4684,6 +4695,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + p->prio = p->normal_prio = p->static_prio; + set_load_weight(p, false); + ++ p->latency_prio = NICE_TO_LATENCY(0); ++ set_latency_offset(p); ++ + /* + * We don't need the reset flag anymore after the fork. 
It has + * fulfilled its duty: +@@ -7444,6 +7458,16 @@ static void __setscheduler_params(struct task_struct *p, + p->rt_priority = attr->sched_priority; + p->normal_prio = normal_prio(p); + set_load_weight(p, true); ++ ++} ++ ++static void __setscheduler_latency(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { ++ p->latency_prio = NICE_TO_LATENCY(attr->sched_latency_nice); ++ set_latency_offset(p); ++ } + } + + /* +@@ -7586,6 +7610,13 @@ static int __sched_setscheduler(struct task_struct *p, + return retval; + } + ++ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { ++ if (attr->sched_latency_nice > MAX_LATENCY_NICE) ++ return -EINVAL; ++ if (attr->sched_latency_nice < MIN_LATENCY_NICE) ++ return -EINVAL; ++ } ++ + if (pi) + cpuset_read_lock(); + +@@ -7620,6 +7651,9 @@ static int __sched_setscheduler(struct task_struct *p, + goto change; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) + goto change; ++ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE && ++ attr->sched_latency_nice != LATENCY_TO_NICE(p->latency_prio)) ++ goto change; + + p->sched_reset_on_fork = reset_on_fork; + retval = 0; +@@ -7708,6 +7742,7 @@ static int __sched_setscheduler(struct task_struct *p, + __setscheduler_params(p, attr); + __setscheduler_prio(p, newprio); + } ++ __setscheduler_latency(p, attr); + __setscheduler_uclamp(p, attr); + + if (queued) { +@@ -7918,6 +7953,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a + size < SCHED_ATTR_SIZE_VER1) + return -EINVAL; + ++ if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) && ++ size < SCHED_ATTR_SIZE_VER2) ++ return -EINVAL; + /* + * XXX: Do we want to be lenient like existing syscalls; or do we want + * to be strict and return an error on out-of-bounds values? 
+@@ -8155,6 +8193,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; + ++ kattr.sched_latency_nice = LATENCY_TO_NICE(p->latency_prio); ++ + #ifdef CONFIG_UCLAMP_TASK + /* + * This could race with another potential updater, but this is fine +@@ -11027,6 +11067,47 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, + { + return sched_group_set_idle(css_tg(css), idle); + } ++ ++static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css, ++ struct cftype *cft) ++{ ++ int prio, delta, last_delta = INT_MAX; ++ s64 weight; ++ ++ weight = css_tg(css)->latency_offset * NICE_LATENCY_WEIGHT_MAX; ++ weight = div_s64(weight, get_sleep_latency(false)); ++ ++ /* Find the closest nice value to the current weight */ ++ for (prio = 0; prio < ARRAY_SIZE(sched_latency_to_weight); prio++) { ++ delta = abs(sched_latency_to_weight[prio] - weight); ++ if (delta >= last_delta) ++ break; ++ last_delta = delta; ++ } ++ ++ return LATENCY_TO_NICE(prio-1); ++} ++ ++static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css, ++ struct cftype *cft, s64 nice) ++{ ++ s64 latency_offset; ++ long weight; ++ int idx; ++ ++ if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE) ++ return -ERANGE; ++ ++ idx = NICE_TO_LATENCY(nice); ++ idx = array_index_nospec(idx, LATENCY_NICE_WIDTH); ++ weight = sched_latency_to_weight[idx]; ++ ++ latency_offset = weight * get_sleep_latency(false); ++ latency_offset = div_s64(latency_offset, NICE_LATENCY_WEIGHT_MAX); ++ ++ return sched_group_set_latency(css_tg(css), latency_offset); ++} ++ + #endif + + static struct cftype cpu_legacy_files[] = { +@@ -11041,6 +11122,11 @@ static struct cftype cpu_legacy_files[] = { + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, ++ { ++ .name = "latency.nice", ++ .read_s64 = cpu_latency_nice_read_s64, ++ .write_s64 = cpu_latency_nice_write_s64, ++ }, + #endif + #ifdef CONFIG_CFS_BANDWIDTH + { +@@ -11258,6 +11344,12 @@ static struct cftype cpu_files[] = { + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, ++ { ++ .name = "latency.nice", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .read_s64 = cpu_latency_nice_read_s64, ++ .write_s64 = cpu_latency_nice_write_s64, ++ }, + #endif + #ifdef CONFIG_CFS_BANDWIDTH + { +@@ -11368,6 +11460,20 @@ const u32 sched_prio_to_wmult[40] = { + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, + }; + ++/* ++ * latency weight for wakeup preemption ++ */ ++const int sched_latency_to_weight[40] = { ++ /* -20 */ -1024, -973, -922, -870, -819, ++ /* -15 */ -768, -717, -666, -614, -563, ++ /* -10 */ -512, -461, -410, -358, -307, ++ /* -5 */ -256, -205, -154, -102, -51, ++ /* 0 */ 0, 51, 102, 154, 205, ++ /* 5 */ 256, 307, 358, 410, 461, ++ /* 10 */ 512, 563, 614, 666, 717, ++ /* 15 */ 768, 819, 870, 922, 973, ++}; ++ + void call_trace_sched_update_nr_running(struct rq *rq, int count) + { + trace_sched_update_nr_running_tp(rq, count); +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 8d64fba16cfe..177934290ec4 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -1044,6 +1044,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + #endif + P(policy); + P(prio); ++ P(latency_prio); + if (task_has_dl_policy(p)) { + P(dl.runtime); + P(dl.deadline); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b38a1ce1be49..5ef893ce5734 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -698,7 
+698,76 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) + + return __node_2_se(last); + } ++#endif ++ ++/************************************************************** ++ * Scheduling class tree data structure manipulation methods: ++ * for latency ++ */ ++ ++static inline bool latency_before(struct sched_entity *a, ++ struct sched_entity *b) ++{ ++ return (s64)(a->vruntime + a->latency_offset - b->vruntime - b->latency_offset) < 0; ++} ++ ++#define __latency_node_2_se(node) \ ++ rb_entry((node), struct sched_entity, latency_node) ++ ++static inline bool __latency_less(struct rb_node *a, const struct rb_node *b) ++{ ++ return latency_before(__latency_node_2_se(a), __latency_node_2_se(b)); ++} ++ ++/* ++ * Enqueue an entity into the latency rb-tree: ++ */ ++static void __enqueue_latency(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) ++{ ++ ++ /* Only latency sensitive entity can be added to the list */ ++ if (se->latency_offset >= 0) ++ return; ++ ++ if (!RB_EMPTY_NODE(&se->latency_node)) ++ return; ++ ++ /* ++ * An execution time less than sysctl_sched_min_granularity means that ++ * the entity has been preempted by a higher sched class or an entity ++ * with higher latency constraint. ++ * Put it back in the list so it gets a chance to run 1st during the ++ * next slice. ++ */ ++ if (!(flags & ENQUEUE_WAKEUP)) { ++ u64 delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; ++ ++ if (delta_exec >= sysctl_sched_min_granularity) ++ return; ++ } ++ ++ rb_add_cached(&se->latency_node, &cfs_rq->latency_timeline, __latency_less); ++} ++ ++static void __dequeue_latency(struct cfs_rq *cfs_rq, struct sched_entity *se) ++{ ++ if (!RB_EMPTY_NODE(&se->latency_node)) { ++ rb_erase_cached(&se->latency_node, &cfs_rq->latency_timeline); ++ RB_CLEAR_NODE(&se->latency_node); ++ } ++} ++ ++static struct sched_entity *__pick_first_latency(struct cfs_rq *cfs_rq) ++{ ++ struct rb_node *left = rb_first_cached(&cfs_rq->latency_timeline); ++ ++ if (!left) ++ return NULL; ++ ++ return __latency_node_2_se(left); ++} + ++#ifdef CONFIG_SCHED_DEBUG + /************************************************************** + * Scheduling class statistics methods: + */ +@@ -4672,33 +4741,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) + u64 vruntime = cfs_rq->min_vruntime; + u64 sleep_time; + +- /* +- * The 'current' period is already promised to the current tasks, +- * however the extra weight of the new task will slow them down a +- * little, place the new task so that it fits in the slot that +- * stays open at the end. +- */ +- if (initial && sched_feat(START_DEBIT)) +- vruntime += sched_vslice(cfs_rq, se); +- +- /* sleeps up to a single latency don't count. */ +- if (!initial) { +- unsigned long thresh; +- +- if (se_is_idle(se)) +- thresh = sysctl_sched_min_granularity; +- else +- thresh = sysctl_sched_latency; +- ++ if (!initial) ++ /* sleeps up to a single latency don't count. */ ++ vruntime -= get_sleep_latency(se_is_idle(se)); ++ else if (sched_feat(START_DEBIT)) + /* +- * Halve their sleep time's effect, to allow +- * for a gentler effect of sleepers: ++ * The 'current' period is already promised to the current tasks, ++ * however the extra weight of the new task will slow them down a ++ * little, place the new task so that it fits in the slot that ++ * stays open at the end. 
+ */ +- if (sched_feat(GENTLE_FAIR_SLEEPERS)) +- thresh >>= 1; +- +- vruntime -= thresh; +- } ++ vruntime += sched_vslice(cfs_rq, se); + + /* + * Pull vruntime of the entity being placed to the base level of +@@ -4792,8 +4845,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + check_schedstat_required(); + update_stats_enqueue_fair(cfs_rq, se, flags); + check_spread(cfs_rq, se); +- if (!curr) ++ if (!curr) { + __enqueue_entity(cfs_rq, se); ++ __enqueue_latency(cfs_rq, se, flags); ++ } + se->on_rq = 1; + + if (cfs_rq->nr_running == 1) { +@@ -4879,8 +4934,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + + clear_buddies(cfs_rq, se); + +- if (se != cfs_rq->curr) ++ if (se != cfs_rq->curr) { + __dequeue_entity(cfs_rq, se); ++ __dequeue_latency(cfs_rq, se); ++ } + se->on_rq = 0; + account_entity_dequeue(cfs_rq, se); + +@@ -4911,6 +4968,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + update_idle_cfs_rq_clock_pelt(cfs_rq); + } + ++static long wakeup_latency_gran(struct sched_entity *curr, struct sched_entity *se); ++ + /* + * Preempt the current task with a newly woken task if needed: + */ +@@ -4919,7 +4978,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + { + unsigned long ideal_runtime, delta_exec; + struct sched_entity *se; +- s64 delta; ++ s64 delta, offset; + + /* + * When many tasks blow up the sched_period; it is possible that +@@ -4950,10 +5009,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + se = __pick_first_entity(cfs_rq); + delta = curr->vruntime - se->vruntime; + +- if (delta < 0) ++ offset = wakeup_latency_gran(curr, se); ++ if (delta < offset) + return; + +- if (delta > ideal_runtime) ++ if ((delta > ideal_runtime) || ++ (delta > get_latency_max())) + resched_curr(rq_of(cfs_rq)); + } + +@@ -4971,6 +5032,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + */ + update_stats_wait_end_fair(cfs_rq, se); + __dequeue_entity(cfs_rq, se); ++ __dequeue_latency(cfs_rq, se); + update_load_avg(cfs_rq, se, UPDATE_TG); + } + +@@ -5009,7 +5071,7 @@ static struct sched_entity * + pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) + { + struct sched_entity *left = __pick_first_entity(cfs_rq); +- struct sched_entity *se; ++ struct sched_entity *latency, *se; + + /* + * If curr is set we have to see if its left of the leftmost entity +@@ -5051,6 +5113,12 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) + se = cfs_rq->last; + } + ++ /* Check for latency sensitive entity waiting for running */ ++ latency = __pick_first_latency(cfs_rq); ++ if (latency && (latency != se) && ++ wakeup_preempt_entity(latency, se) < 1) ++ se = latency; ++ + return se; + } + +@@ -5074,6 +5142,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) + update_stats_wait_start_fair(cfs_rq, prev); + /* Put 'current' back into the tree. */ + __enqueue_entity(cfs_rq, prev); ++ __enqueue_latency(cfs_rq, prev, 0); + /* in !on_rq case, update occurred at dequeue */ + update_load_avg(cfs_rq, prev, 0); + } +@@ -7735,6 +7804,23 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + } + #endif /* CONFIG_SMP */ + ++static long wakeup_latency_gran(struct sched_entity *curr, struct sched_entity *se) ++{ ++ long latency_offset = se->latency_offset; ++ ++ /* ++ * A negative latency offset means that the sched_entity has latency ++ * requirement that needs to be evaluated versus other entity. 
++ * Otherwise, use the latency weight to evaluate how much scheduling ++ * delay is acceptable by se. ++ */ ++ if ((latency_offset < 0) || (curr->latency_offset < 0)) ++ latency_offset -= curr->latency_offset; ++ latency_offset = min_t(long, latency_offset, get_latency_max()); ++ ++ return latency_offset; ++} ++ + static unsigned long wakeup_gran(struct sched_entity *se) + { + unsigned long gran = sysctl_sched_wakeup_granularity; +@@ -7773,11 +7859,24 @@ static int + wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) + { + s64 gran, vdiff = curr->vruntime - se->vruntime; ++ s64 offset = wakeup_latency_gran(curr, se); + +- if (vdiff <= 0) ++ if (vdiff < offset) + return -1; + +- gran = wakeup_gran(se); ++ gran = offset + wakeup_gran(se); ++ ++ /* ++ * At wake up, the vruntime of a task is capped to not be older than ++ * a sched_latency period compared to min_vruntime. This prevents long ++ * sleeping task to get unlimited credit at wakeup. Such waking up task ++ * has to preempt current in order to not lose its share of CPU ++ * bandwidth but wakeup_gran() can become higher than scheduling period ++ * for low priority task. Make sure that long sleeping task will get a ++ * chance to preempt current. ++ */ ++ gran = min_t(s64, gran, get_latency_max()); ++ + if (vdiff > gran) + return 1; + +@@ -11995,6 +12094,9 @@ bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) + delta = (s64)(sea->vruntime - seb->vruntime) + + (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); + ++ /* Take into account latency prio */ ++ delta -= wakeup_latency_gran(sea, seb); ++ + return delta > 0; + } + #else +@@ -12265,6 +12367,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + void init_cfs_rq(struct cfs_rq *cfs_rq) + { + cfs_rq->tasks_timeline = RB_ROOT_CACHED; ++ cfs_rq->latency_timeline = RB_ROOT_CACHED; + u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); + #ifdef CONFIG_SMP + raw_spin_lock_init(&cfs_rq->removed.lock); +@@ -12320,6 +12423,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) + goto err; + + tg->shares = NICE_0_LOAD; ++ tg->latency_offset = 0; + + init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + +@@ -12418,6 +12522,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + } + + se->my_q = cfs_rq; ++ ++ se->latency_offset = tg->latency_offset; ++ + /* guarantee group entities always have weight */ + update_load_set(&se->load, NICE_0_LOAD); + se->parent = parent; +@@ -12548,6 +12655,42 @@ int sched_group_set_idle(struct task_group *tg, long idle) + return 0; + } + ++int sched_group_set_latency(struct task_group *tg, s64 latency) ++{ ++ int i; ++ ++ if (tg == &root_task_group) ++ return -EINVAL; ++ ++ if (abs(latency) > sysctl_sched_latency) ++ return -EINVAL; ++ ++ mutex_lock(&shares_mutex); ++ ++ if (tg->latency_offset == latency) { ++ mutex_unlock(&shares_mutex); ++ return 0; ++ } ++ ++ tg->latency_offset = latency; ++ ++ for_each_possible_cpu(i) { ++ struct sched_entity *se = tg->se[i]; ++ struct rq *rq = cpu_rq(i); ++ struct rq_flags rf; ++ ++ rq_lock_irqsave(rq, &rf); ++ ++ __dequeue_latency(se->cfs_rq, se); ++ WRITE_ONCE(se->latency_offset, latency); ++ ++ rq_unlock_irqrestore(rq, &rf); ++ } ++ ++ mutex_unlock(&shares_mutex); ++ return 0; ++} ++ + #else /* CONFIG_FAIR_GROUP_SCHED */ + + void free_fair_sched_group(struct task_group *tg) { } +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 9e8bb6278604..c47198dbf740 100644 +--- 
a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -125,6 +125,11 @@ extern int sched_rr_timeslice; + */ + #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) + ++/* Maximum nice latency weight used to scale the latency_offset */ ++ ++#define NICE_LATENCY_SHIFT (SCHED_FIXEDPOINT_SHIFT) ++#define NICE_LATENCY_WEIGHT_MAX (1L << NICE_LATENCY_SHIFT) ++ + /* + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of +@@ -378,6 +383,8 @@ struct task_group { + + /* A positive value indicates that this is a SCHED_IDLE group. */ + int idle; ++ /* latency constraint of the group. */ ++ int latency_offset; + + #ifdef CONFIG_SMP + /* +@@ -488,6 +495,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); + + extern int sched_group_set_idle(struct task_group *tg, long idle); + ++extern int sched_group_set_latency(struct task_group *tg, s64 latency); ++ + #ifdef CONFIG_SMP + extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +@@ -566,6 +575,7 @@ struct cfs_rq { + #endif + + struct rb_root_cached tasks_timeline; ++ struct rb_root_cached latency_timeline; + + /* + * 'curr' points to currently running entity on this cfs_rq. +@@ -2123,6 +2133,7 @@ static_assert(WF_TTWU == SD_BALANCE_WAKE); + + extern const int sched_prio_to_weight[40]; + extern const u32 sched_prio_to_wmult[40]; ++extern const int sched_latency_to_weight[40]; + + /* + * {de,en}queue flags: +@@ -2461,9 +2472,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); + extern const_debug unsigned int sysctl_sched_nr_migrate; + extern const_debug unsigned int sysctl_sched_migration_cost; + +-#ifdef CONFIG_SCHED_DEBUG + extern unsigned int sysctl_sched_latency; + extern unsigned int sysctl_sched_min_granularity; ++#ifdef CONFIG_SCHED_DEBUG + extern unsigned int sysctl_sched_idle_min_granularity; + extern unsigned int sysctl_sched_wakeup_granularity; + extern int sysctl_resched_latency_warn_ms; +@@ -2478,6 +2489,38 @@ extern unsigned int sysctl_numa_balancing_scan_size; + extern unsigned int sysctl_numa_balancing_hot_threshold; + #endif + ++static inline unsigned long get_sleep_latency(bool idle) ++{ ++ unsigned long thresh; ++ ++ if (idle) ++ thresh = sysctl_sched_min_granularity; ++ else ++ thresh = sysctl_sched_latency; ++ ++ /* ++ * Halve their sleep time's effect, to allow ++ * for a gentler effect of sleepers: ++ */ ++ if (sched_feat(GENTLE_FAIR_SLEEPERS)) ++ thresh >>= 1; ++ ++ return thresh; ++} ++ ++static inline unsigned long get_latency_max(void) ++{ ++ unsigned long thresh = get_sleep_latency(false); ++ ++ /* ++ * If the waking task failed to preempt current it could to wait up to ++ * sysctl_sched_min_granularity before preempting it during next tick. 
++ */ ++ thresh -= sysctl_sched_min_granularity; ++ ++ return thresh; ++} ++ + #ifdef CONFIG_SCHED_HRTICK + + /* +diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h +index 3bac0a8ceab2..b2e932c25be6 100644 +--- a/tools/include/uapi/linux/sched.h ++++ b/tools/include/uapi/linux/sched.h +@@ -132,6 +132,7 @@ struct clone_args { + #define SCHED_FLAG_KEEP_PARAMS 0x10 + #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 + #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 ++#define SCHED_FLAG_LATENCY_NICE 0x80 + + #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ + SCHED_FLAG_KEEP_PARAMS) +@@ -143,6 +144,7 @@ struct clone_args { + SCHED_FLAG_RECLAIM | \ + SCHED_FLAG_DL_OVERRUN | \ + SCHED_FLAG_KEEP_ALL | \ +- SCHED_FLAG_UTIL_CLAMP) ++ SCHED_FLAG_UTIL_CLAMP | \ ++ SCHED_FLAG_LATENCY_NICE) + + #endif /* _UAPI_LINUX_SCHED_H */ +-- +2.39.2 diff --git a/patches/0003-bore.patch b/patches/0003-bore.patch new file mode 100644 index 0000000..9e5bc88 --- /dev/null +++ b/patches/0003-bore.patch @@ -0,0 +1,388 @@ +From f169eabeb1ba8f339ab9bebec8d503c70c5f5879 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 17 Feb 2023 15:39:23 +0100 +Subject: [PATCH] bore-cachy + +Signed-off-by: Peter Jung +--- + include/linux/sched.h | 5 ++ + init/Kconfig | 20 ++++++ + kernel/sched/core.c | 29 +++++++++ + kernel/sched/debug.c | 3 + + kernel/sched/fair.c | 132 +++++++++++++++++++++++++++++++++++++++- + kernel/sched/features.h | 4 ++ + 6 files changed, 190 insertions(+), 3 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index df219c7cd6aa..a3538eacb095 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -556,6 +556,11 @@ struct sched_entity { + u64 sum_exec_runtime; + u64 vruntime; + u64 prev_sum_exec_runtime; ++#ifdef CONFIG_SCHED_BORE ++ u64 prev_burst_time; ++ u64 burst_time; ++ u8 burst_score; ++#endif // CONFIG_SCHED_BORE + + u64 nr_migrations; + u64 prev_sleep_sum_runtime; +diff --git a/init/Kconfig b/init/Kconfig +index 85a602dba878..bc69f062ca76 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1318,6 +1318,26 @@ config CHECKPOINT_RESTORE + + If unsure, say N here. + ++config SCHED_BORE ++ bool "Burst-Oriented Response Enhancer" ++ default y ++ help ++ In Desktop and Mobile computing, one might prefer interactive ++ tasks to keep responsive no matter what they run in the background. ++ ++ Enabling this kernel feature modifies the scheduler to discriminate ++ tasks by their burst time (runtime since it last went sleeping or ++ yielding state) and prioritize those that run less bursty. ++ Such tasks usually include window compositor, widgets backend, ++ terminal emulator, video playback, games and so on. ++ With a little impact to scheduling fairness, it may improve ++ responsiveness especially under heavy background workload. ++ ++ You can turn it off by setting the sysctl kernel.sched_bore = 0. ++ Enabling this feature implies NO_GENTLE_FAIR_SLEEPERS by default. ++ ++ If unsure say Y here. 
++ + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select CGROUPS +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 919edb034108..fd52870a002f 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4420,6 +4420,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) + return try_to_wake_up(p, state, 0); + } + ++#ifdef CONFIG_SCHED_BORE ++static inline void sched_fork_update_prev_burst(struct task_struct *p) ++{ ++ struct task_struct *sib; ++ u32 cnt = 0; ++ u64 sum = 0, avg = 0; ++ list_for_each_entry(sib, &p->sibling, sibling) { ++ cnt++; ++ sum += sib->se.prev_burst_time >> 8; ++ } ++ if (cnt) avg = div_u64(sum, cnt) << 8; ++ if (p->se.prev_burst_time < avg) p->se.prev_burst_time = avg; ++} ++#endif // CONFIG_SCHED_BORE ++ + /* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. +@@ -4438,6 +4453,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.vruntime = 0; + p->se.dur_avg = 0; + p->se.prev_sleep_sum_runtime = 0; ++#ifdef CONFIG_SCHED_BORE ++ p->se.burst_time = 0; ++#endif // CONFIG_SCHED_BORE + INIT_LIST_HEAD(&p->se.group_node); + RB_CLEAR_NODE(&p->se.latency_node); + +@@ -4664,6 +4682,10 @@ late_initcall(sched_core_sysctl_init); + int sched_fork(unsigned long clone_flags, struct task_struct *p) + { + __sched_fork(clone_flags, p); ++#ifdef CONFIG_SCHED_BORE ++ sched_fork_update_prev_burst(p); ++ p->se.burst_time = 0; ++#endif // CONFIG_SCHED_BORE + /* + * We mark the process as NEW here. This guarantees that + * nobody will actually run it, and a signal or other external +@@ -9154,6 +9176,9 @@ void __init init_idle(struct task_struct *idle, int cpu) + + idle->__state = TASK_RUNNING; + idle->se.exec_start = sched_clock(); ++#ifdef CONFIG_SCHED_BORE ++ idle->se.prev_burst_time = 0; ++#endif //CONFIG_SCHED_BORE + /* + * PF_KTHREAD should already be set at this point; regardless, make it + * look like a proper per-CPU kthread. 
+@@ -9821,6 +9846,10 @@ void __init sched_init(void) + BUG_ON(&dl_sched_class != &stop_sched_class + 1); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 1.7.10 by Masahito Suzuki"); ++#endif // CONFIG_SCHED_BORE ++ + wait_bit_init(); + + #ifdef CONFIG_FAIR_GROUP_SCHED +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 177934290ec4..2f40a238cdad 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -547,6 +547,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); + ++#ifdef CONFIG_SCHED_BORE ++ SEQ_printf(m, " %2d", p->se.burst_score); ++#endif + #ifdef CONFIG_NUMA_BALANCING + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); + #endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 5ef893ce5734..590adb9a3e37 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -19,6 +19,9 @@ + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra ++ * ++ * Burst-Oriented Response Enhancer (BORE) CPU Scheduler ++ * Copyright (C) 2021 Masahito Suzuki + */ + #include + #include +@@ -140,6 +143,16 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; + + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + ++#ifdef CONFIG_SCHED_BORE ++unsigned int __read_mostly sched_bore = 1; ++unsigned int __read_mostly sched_burst_penalty_scale = 1280; ++unsigned int __read_mostly sched_burst_granularity = 12; ++unsigned int __read_mostly sched_burst_smoothness = 2; ++static int three = 3; ++static int sixty_four = 64; ++static int maxval_12_bits = 4095; ++#endif // CONFIG_SCHED_BORE ++ + int sched_thermal_decay_shift; + static int __init setup_sched_thermal_decay_shift(char *str) + { +@@ -203,6 +216,44 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; + + #ifdef CONFIG_SYSCTL + static struct ctl_table sched_fair_sysctls[] = { ++#ifdef CONFIG_SCHED_BORE ++ { ++ .procname = "sched_bore", ++ .data = &sched_bore, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &three, ++ }, ++ { ++ .procname = "sched_burst_penalty_scale", ++ .data = &sched_burst_penalty_scale, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &maxval_12_bits, ++ }, ++ { ++ .procname = "sched_burst_granularity", ++ .data = &sched_burst_granularity, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &sixty_four, ++ }, ++ { ++ .procname = "sched_burst_smoothness", ++ .data = &sched_burst_smoothness, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &three, ++ }, ++#endif // CONFIG_SCHED_BORE + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -978,6 +1029,39 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) + } + #endif /* CONFIG_SMP */ + ++#ifdef CONFIG_SCHED_BORE ++static inline void update_burst_score(struct sched_entity *se) { ++ u64 burst_time; ++ s32 bits; ++ u32 intgr, fdigs, dec10; ++ ++ burst_time = max(se->burst_time, se->prev_burst_time); ++ bits = 
fls64(burst_time); ++ intgr = max((u32)bits, sched_burst_granularity) - sched_burst_granularity; ++ fdigs = max(bits - 1, (s32)sched_burst_granularity); ++ dec10 = (intgr << 10) | (burst_time << (64 - fdigs) >> 54); ++ se->burst_score = min((u32)39, dec10 * sched_burst_penalty_scale >> 20); ++} ++ ++static u64 burst_scale(u64 delta, struct sched_entity *se) { ++ return mul_u64_u32_shr(delta, sched_prio_to_wmult[se->burst_score], 22); ++} ++ ++static u64 calc_delta_fair_bscale(u64 delta, struct sched_entity *se) { ++ return burst_scale(calc_delta_fair(delta, se), se); ++} ++ ++static inline u64 binary_smooth(u64 old, u64 new, unsigned int smoothness) { ++ return (new + old * ((1 << smoothness) - 1)) >> smoothness; ++} ++ ++static inline void reset_burst(struct sched_entity *se) { ++ se->prev_burst_time = binary_smooth( ++ se->prev_burst_time, se->burst_time, sched_burst_smoothness); ++ se->burst_time = 0; ++} ++#endif // CONFIG_SCHED_BORE ++ + /* + * Update the current task's runtime statistics. + */ +@@ -1007,6 +1091,13 @@ static void update_curr(struct cfs_rq *cfs_rq) + curr->sum_exec_runtime += delta_exec; + schedstat_add(cfs_rq->exec_clock, delta_exec); + ++#ifdef CONFIG_SCHED_BORE ++ curr->burst_time += delta_exec; ++ update_burst_score(curr); ++ if (sched_bore & 1) ++ curr->vruntime += calc_delta_fair_bscale(delta_exec, curr); ++ else ++#endif // CONFIG_SCHED_BORE + curr->vruntime += calc_delta_fair(delta_exec, curr); + update_min_vruntime(cfs_rq); + +@@ -5057,6 +5148,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + se->prev_sum_exec_runtime = se->sum_exec_runtime; + } + ++#ifdef CONFIG_SCHED_BORE ++static int ++wakeup_preempt_entity_bscale(struct sched_entity *curr, ++ struct sched_entity *se, bool do_scale); ++#endif // CONFIG_SCHED_BORE + static int + wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); + +@@ -5101,7 +5197,13 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) + se = second; + } + +- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { ++#ifdef CONFIG_SCHED_BORE ++ if (cfs_rq->next && wakeup_preempt_entity_bscale( ++ cfs_rq->next, left, sched_bore & 2) < 1) ++#else // CONFIG_SCHED_BORE ++ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) ++#endif // CONFIG_SCHED_BORE ++ { + /* + * Someone really wants this to run. If it's not unfair, run it. + */ +@@ -6394,6 +6496,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + util_est_dequeue(&rq->cfs, p); + + for_each_sched_entity(se) { ++#ifdef CONFIG_SCHED_BORE ++ if (task_sleep) reset_burst(se); ++#endif // CONFIG_SCHED_BORE + cfs_rq = cfs_rq_of(se); + dequeue_entity(cfs_rq, se, flags); + +@@ -7856,7 +7961,12 @@ static unsigned long wakeup_gran(struct sched_entity *se) + * + */ + static int ++#ifdef CONFIG_SCHED_BORE ++wakeup_preempt_entity_bscale(struct sched_entity *curr, ++ struct sched_entity *se, bool do_scale) ++#else // CONFIG_SCHED_BORE + wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) ++#endif // CONFIG_SCHED_BORE + { + s64 gran, vdiff = curr->vruntime - se->vruntime; + s64 offset = wakeup_latency_gran(curr, se); +@@ -7876,12 +7986,20 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) + * chance to preempt current. 
+ */ + gran = min_t(s64, gran, get_latency_max()); +- ++#ifdef CONFIG_SCHED_BORE ++ if (do_scale) gran = burst_scale(gran, se); ++#endif // CONFIG_SCHED_BORE + if (vdiff > gran) + return 1; + + return 0; + } ++#ifdef CONFIG_SCHED_BORE ++static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) ++{ ++ return wakeup_preempt_entity_bscale(curr, se, false); ++} ++#endif // CONFIG_SCHED_BORE + + static void set_last_buddy(struct sched_entity *se) + { +@@ -7981,7 +8099,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + return; + + update_curr(cfs_rq_of(se)); +- if (wakeup_preempt_entity(se, pse) == 1) { ++#ifdef CONFIG_SCHED_BORE ++ if (wakeup_preempt_entity_bscale(se, pse, sched_bore & 2) == 1) ++#else // CONFIG_SCHED_BORE ++ if (wakeup_preempt_entity(se, pse) == 1) ++#endif // CONFIG_SCHED_BORE ++ { + /* + * Bias pick_next to pick the sched entity that is + * triggering this preemption. +@@ -8217,6 +8340,9 @@ static void yield_task_fair(struct rq *rq) + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct sched_entity *se = &curr->se; ++#ifdef CONFIG_SCHED_BORE ++ reset_burst(se); ++#endif // CONFIG_SCHED_BORE + + /* + * Are we the only task in the tree? +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index efdc29c42161..0f28637ce1aa 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -4,7 +4,11 @@ + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ ++#ifdef CONFIG_SCHED_BORE ++SCHED_FEAT(GENTLE_FAIR_SLEEPERS, false) ++#else // CONFIG_SCHED_BORE + SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) ++#endif // CONFIG_SCHED_BORE + + /* + * Place new tasks ahead so that they do not starve already running +-- +2.39.2 diff --git a/patches/0004-hdr.patch b/patches/0004-hdr.patch new file mode 100644 index 0000000..7e467ac --- /dev/null +++ b/patches/0004-hdr.patch @@ -0,0 +1,912 @@ +From 9cab14aa7f6828572f808d1bea60def5f883522c Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sun, 22 Jan 2023 23:10:03 +0100 +Subject: [PATCH 08/16] hdr + +Signed-off-by: Peter Jung +--- + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 110 ++++++++--- + .../amd/display/amdgpu_dm/amdgpu_dm_debugfs.c | 57 ++++++ + .../gpu/drm/amd/display/dc/core/dc_resource.c | 100 ++++------ + drivers/gpu/drm/amd/display/dc/dc_stream.h | 2 +- + drivers/gpu/drm/amd/display/dc/dc_types.h | 14 -- + drivers/gpu/drm/display/drm_hdmi_helper.c | 8 +- + drivers/gpu/drm/drm_atomic.c | 2 + + drivers/gpu/drm/drm_connector.c | 181 ++++++++++-------- + .../gpu/drm/i915/display/intel_connector.c | 4 +- + drivers/gpu/drm/vc4/vc4_hdmi.c | 2 +- + include/drm/display/drm_dp.h | 2 +- + include/drm/drm_connector.h | 57 +++--- + 12 files changed, 327 insertions(+), 212 deletions(-) + +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +index 93dee3d1a483..b5eb33a97590 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +@@ -5172,21 +5172,46 @@ get_aspect_ratio(const struct drm_display_mode *mode_in) + } + + static enum dc_color_space +-get_output_color_space(const struct dc_crtc_timing *dc_crtc_timing) ++get_output_color_space(const struct dc_crtc_timing *dc_crtc_timing, ++ const struct drm_connector_state *connector_state) + { + enum dc_color_space color_space = COLOR_SPACE_SRGB; + +- switch (dc_crtc_timing->pixel_encoding) { +- case 
PIXEL_ENCODING_YCBCR422: +- case PIXEL_ENCODING_YCBCR444: +- case PIXEL_ENCODING_YCBCR420: +- { ++ switch (connector_state->colorspace) { ++ case DRM_MODE_COLORIMETRY_BT601_YCC: ++ if (dc_crtc_timing->flags.Y_ONLY) ++ color_space = COLOR_SPACE_YCBCR601_LIMITED; ++ else ++ color_space = COLOR_SPACE_YCBCR601; ++ break; ++ case DRM_MODE_COLORIMETRY_BT709_YCC: ++ if (dc_crtc_timing->flags.Y_ONLY) ++ color_space = COLOR_SPACE_YCBCR709_LIMITED; ++ else ++ color_space = COLOR_SPACE_YCBCR709; ++ break; ++ case DRM_MODE_COLORIMETRY_OPRGB: ++ color_space = COLOR_SPACE_ADOBERGB; ++ break; ++ case DRM_MODE_COLORIMETRY_BT2020_RGB: ++ if (dc_crtc_timing->pixel_encoding == PIXEL_ENCODING_RGB) ++ color_space = COLOR_SPACE_2020_RGB_FULLRANGE; ++ else ++ color_space = COLOR_SPACE_2020_YCBCR; ++ break; ++ case DRM_MODE_COLORIMETRY_BT2020_YCC: ++ color_space = COLOR_SPACE_2020_YCBCR; ++ break; ++ case DRM_MODE_COLORIMETRY_DEFAULT: // ITU601 ++ default: ++ if (dc_crtc_timing->pixel_encoding == PIXEL_ENCODING_RGB) { ++ color_space = COLOR_SPACE_SRGB; + /* + * 27030khz is the separation point between HDTV and SDTV + * according to HDMI spec, we use YCbCr709 and YCbCr601 + * respectively + */ +- if (dc_crtc_timing->pix_clk_100hz > 270300) { ++ } else if (dc_crtc_timing->pix_clk_100hz > 270300) { + if (dc_crtc_timing->flags.Y_ONLY) + color_space = + COLOR_SPACE_YCBCR709_LIMITED; +@@ -5199,21 +5224,30 @@ get_output_color_space(const struct dc_crtc_timing *dc_crtc_timing) + else + color_space = COLOR_SPACE_YCBCR601; + } +- +- } +- break; +- case PIXEL_ENCODING_RGB: +- color_space = COLOR_SPACE_SRGB; +- break; +- +- default: +- WARN_ON(1); + break; + } + + return color_space; + } + ++static enum display_content_type ++get_output_content_type(const struct drm_connector_state *connector_state) ++{ ++ switch (connector_state->content_type) { ++ default: ++ case DRM_MODE_CONTENT_TYPE_NO_DATA: ++ return DISPLAY_CONTENT_TYPE_NO_DATA; ++ case DRM_MODE_CONTENT_TYPE_GRAPHICS: ++ return DISPLAY_CONTENT_TYPE_GRAPHICS; ++ case DRM_MODE_CONTENT_TYPE_PHOTO: ++ return DISPLAY_CONTENT_TYPE_PHOTO; ++ case DRM_MODE_CONTENT_TYPE_CINEMA: ++ return DISPLAY_CONTENT_TYPE_CINEMA; ++ case DRM_MODE_CONTENT_TYPE_GAME: ++ return DISPLAY_CONTENT_TYPE_GAME; ++ } ++} ++ + static bool adjust_colour_depth_from_display_info( + struct dc_crtc_timing *timing_out, + const struct drm_display_info *info) +@@ -5307,6 +5341,7 @@ static void fill_stream_properties_from_drm_display_mode( + if (stream->signal == SIGNAL_TYPE_HDMI_TYPE_A) { + drm_hdmi_avi_infoframe_from_display_mode(&avi_frame, (struct drm_connector *)connector, mode_in); + timing_out->vic = avi_frame.video_code; ++ drm_hdmi_avi_infoframe_colorimetry(&avi_frame, connector_state); + drm_hdmi_vendor_infoframe_from_display_mode(&hv_frame, (struct drm_connector *)connector, mode_in); + timing_out->hdmi_vic = hv_frame.vic; + } +@@ -5346,7 +5381,8 @@ static void fill_stream_properties_from_drm_display_mode( + } + } + +- stream->output_color_space = get_output_color_space(timing_out); ++ stream->output_color_space = get_output_color_space(timing_out, connector_state); ++ stream->content_type = get_output_content_type(connector_state); + } + + static void fill_audio_info(struct audio_info *audio_info, +@@ -5786,15 +5822,14 @@ create_stream_for_sink(struct amdgpu_dm_connector *aconnector, + { + struct drm_display_mode *preferred_mode = NULL; + struct drm_connector *drm_connector; +- const struct drm_connector_state *con_state = +- dm_state ? 
&dm_state->base : NULL; ++ const struct drm_connector_state *con_state = &dm_state->base; + struct dc_stream_state *stream = NULL; + struct drm_display_mode mode; + struct drm_display_mode saved_mode; + struct drm_display_mode *freesync_mode = NULL; + bool native_mode_found = false; + bool recalculate_timing = false; +- bool scale = dm_state ? (dm_state->scaling != RMX_OFF) : false; ++ bool scale = dm_state->scaling != RMX_OFF; + int mode_refresh; + int preferred_refresh = 0; + enum color_transfer_func tf = TRANSFER_FUNC_UNKNOWN; +@@ -5875,7 +5910,7 @@ create_stream_for_sink(struct amdgpu_dm_connector *aconnector, + + if (recalculate_timing) + drm_mode_set_crtcinfo(&saved_mode, 0); +- else if (!dm_state) ++ else + drm_mode_set_crtcinfo(&mode, 0); + + /* +@@ -6404,7 +6439,9 @@ enum drm_mode_status amdgpu_dm_connector_mode_valid(struct drm_connector *connec + goto fail; + } + +- stream = create_validate_stream_for_sink(aconnector, mode, NULL, NULL); ++ stream = create_validate_stream_for_sink(aconnector, mode, ++ to_dm_connector_state(connector->state), ++ NULL); + if (stream) { + dc_stream_release(stream); + result = MODE_OK; +@@ -6498,6 +6535,14 @@ amdgpu_dm_connector_atomic_check(struct drm_connector *conn, + if (!crtc) + return 0; + ++ if (new_con_state->colorspace != old_con_state->colorspace) { ++ new_crtc_state = drm_atomic_get_crtc_state(state, crtc); ++ if (IS_ERR(new_crtc_state)) ++ return PTR_ERR(new_crtc_state); ++ ++ new_crtc_state->mode_changed = true; ++ } ++ + if (!drm_connector_atomic_hdr_metadata_equal(old_con_state, new_con_state)) { + struct dc_info_packet hdr_infopacket; + +@@ -6520,7 +6565,7 @@ amdgpu_dm_connector_atomic_check(struct drm_connector *conn, + * set is permissible, however. So only force a + * modeset if we're entering or exiting HDR. + */ +- new_crtc_state->mode_changed = ++ new_crtc_state->mode_changed = new_crtc_state->mode_changed || + !old_con_state->hdr_output_metadata || + !new_con_state->hdr_output_metadata; + } +@@ -7041,6 +7086,12 @@ static int amdgpu_dm_connector_get_modes(struct drm_connector *connector) + return amdgpu_dm_connector->num_modes; + } + ++static const u32 supported_colorspaces = ++ BIT(DRM_MODE_COLORIMETRY_BT709_YCC) | ++ BIT(DRM_MODE_COLORIMETRY_OPRGB) | ++ BIT(DRM_MODE_COLORIMETRY_BT2020_RGB) | ++ BIT(DRM_MODE_COLORIMETRY_BT2020_YCC); ++ + void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm, + struct amdgpu_dm_connector *aconnector, + int connector_type, +@@ -7109,7 +7160,7 @@ void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm, + drm_connector_attach_max_bpc_property(&aconnector->base, 8, 16); + + /* This defaults to the max in the range, but we want 8bpc for non-edp. */ +- aconnector->base.state->max_bpc = (connector_type == DRM_MODE_CONNECTOR_eDP) ? 
16 : 8; ++ aconnector->base.state->max_bpc = 16; + aconnector->base.state->max_requested_bpc = aconnector->base.state->max_bpc; + + if (connector_type == DRM_MODE_CONNECTOR_eDP && +@@ -7118,6 +7169,17 @@ void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm, + adev->mode_info.abm_level_property, 0); + } + ++ drm_connector_attach_content_type_property(&aconnector->base); ++ ++ if (connector_type == DRM_MODE_CONNECTOR_HDMIA) { ++ if (!drm_mode_create_hdmi_colorspace_property(&aconnector->base, supported_colorspaces)) ++ drm_connector_attach_colorspace_property(&aconnector->base); ++ } else if (connector_type == DRM_MODE_CONNECTOR_DisplayPort || ++ connector_type == DRM_MODE_CONNECTOR_eDP) { ++ if (!drm_mode_create_dp_colorspace_property(&aconnector->base, supported_colorspaces)) ++ drm_connector_attach_colorspace_property(&aconnector->base); ++ } ++ + if (connector_type == DRM_MODE_CONNECTOR_HDMIA || + connector_type == DRM_MODE_CONNECTOR_DisplayPort || + connector_type == DRM_MODE_CONNECTOR_eDP) { +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c +index 461037a3dd75..d95d1c9f4805 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c +@@ -935,6 +935,61 @@ static int amdgpu_current_bpc_show(struct seq_file *m, void *data) + } + DEFINE_SHOW_ATTRIBUTE(amdgpu_current_bpc); + ++/* ++ * Returns the current bpc for the crtc. ++ * Example usage: cat /sys/kernel/debug/dri/0/crtc-0/amdgpu_current_colorspace ++ */ ++static int amdgpu_current_colorspace_show(struct seq_file *m, void *data) ++{ ++ struct drm_crtc *crtc = m->private; ++ struct drm_device *dev = crtc->dev; ++ struct dm_crtc_state *dm_crtc_state = NULL; ++ int res = -ENODEV; ++ ++ mutex_lock(&dev->mode_config.mutex); ++ drm_modeset_lock(&crtc->mutex, NULL); ++ if (crtc->state == NULL) ++ goto unlock; ++ ++ dm_crtc_state = to_dm_crtc_state(crtc->state); ++ if (dm_crtc_state->stream == NULL) ++ goto unlock; ++ ++ switch (dm_crtc_state->stream->output_color_space) { ++ case COLOR_SPACE_SRGB: ++ seq_printf(m, "RGB"); ++ break; ++ case COLOR_SPACE_YCBCR601: ++ case COLOR_SPACE_YCBCR601_LIMITED: ++ seq_printf(m, "BT601_YCC"); ++ break; ++ case COLOR_SPACE_YCBCR709: ++ case COLOR_SPACE_YCBCR709_LIMITED: ++ seq_printf(m, "BT709_YCC"); ++ break; ++ case COLOR_SPACE_ADOBERGB: ++ seq_printf(m, "opRGB"); ++ break; ++ case COLOR_SPACE_2020_RGB_FULLRANGE: ++ seq_printf(m, "BT2020_RGB"); ++ break; ++ case COLOR_SPACE_2020_YCBCR: ++ seq_printf(m, "BT2020_YCC"); ++ break; ++ default: ++ goto unlock; ++ } ++ res = 0; ++ ++unlock: ++ drm_modeset_unlock(&crtc->mutex); ++ mutex_unlock(&dev->mode_config.mutex); ++ ++ return res; ++} ++DEFINE_SHOW_ATTRIBUTE(amdgpu_current_colorspace); ++ ++ + /* + * Example usage: + * Disable dsc passthrough, i.e.,: have dsc decoding at converver, not external RX +@@ -3326,6 +3381,8 @@ void crtc_debugfs_init(struct drm_crtc *crtc) + #endif + debugfs_create_file("amdgpu_current_bpc", 0644, crtc->debugfs_entry, + crtc, &amdgpu_current_bpc_fops); ++ debugfs_create_file("amdgpu_current_colorspace", 0644, crtc->debugfs_entry, ++ crtc, &amdgpu_current_colorspace_fops); + } + + /* +diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +index da164685547d..e00fadf9d0ff 100644 +--- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c ++++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +@@ 
-2943,14 +2943,9 @@ static void set_avi_info_frame( + uint32_t pixel_encoding = 0; + enum scanning_type scan_type = SCANNING_TYPE_NODATA; + enum dc_aspect_ratio aspect = ASPECT_RATIO_NO_DATA; +- bool itc = false; +- uint8_t itc_value = 0; +- uint8_t cn0_cn1 = 0; +- unsigned int cn0_cn1_value = 0; + uint8_t *check_sum = NULL; + uint8_t byte_index = 0; + union hdmi_info_packet hdmi_info; +- union display_content_support support = {0}; + unsigned int vic = pipe_ctx->stream->timing.vic; + unsigned int rid = pipe_ctx->stream->timing.rid; + unsigned int fr_ind = pipe_ctx->stream->timing.fr_index; +@@ -3010,23 +3005,32 @@ static void set_avi_info_frame( + hdmi_info.bits.S0_S1 = scan_type; + + /* C0, C1 : Colorimetry */ +- if (color_space == COLOR_SPACE_YCBCR709 || +- color_space == COLOR_SPACE_YCBCR709_LIMITED) ++ switch (color_space) { ++ case COLOR_SPACE_YCBCR709: ++ case COLOR_SPACE_YCBCR709_LIMITED: + hdmi_info.bits.C0_C1 = COLORIMETRY_ITU709; +- else if (color_space == COLOR_SPACE_YCBCR601 || +- color_space == COLOR_SPACE_YCBCR601_LIMITED) ++ break; ++ case COLOR_SPACE_YCBCR601: ++ case COLOR_SPACE_YCBCR601_LIMITED: + hdmi_info.bits.C0_C1 = COLORIMETRY_ITU601; +- else { +- hdmi_info.bits.C0_C1 = COLORIMETRY_NO_DATA; +- } +- if (color_space == COLOR_SPACE_2020_RGB_FULLRANGE || +- color_space == COLOR_SPACE_2020_RGB_LIMITEDRANGE || +- color_space == COLOR_SPACE_2020_YCBCR) { ++ break; ++ case COLOR_SPACE_2020_RGB_FULLRANGE: ++ case COLOR_SPACE_2020_RGB_LIMITEDRANGE: + hdmi_info.bits.EC0_EC2 = COLORIMETRYEX_BT2020RGBYCBCR; + hdmi_info.bits.C0_C1 = COLORIMETRY_EXTENDED; +- } else if (color_space == COLOR_SPACE_ADOBERGB) { ++ break; ++ case COLOR_SPACE_2020_YCBCR: ++ hdmi_info.bits.EC0_EC2 = COLORIMETRYEX_BT2020YCC; ++ hdmi_info.bits.C0_C1 = COLORIMETRY_EXTENDED; ++ break; ++ case COLOR_SPACE_ADOBERGB: + hdmi_info.bits.EC0_EC2 = COLORIMETRYEX_ADOBERGB; + hdmi_info.bits.C0_C1 = COLORIMETRY_EXTENDED; ++ break; ++ case COLOR_SPACE_SRGB: ++ default: ++ hdmi_info.bits.C0_C1 = COLORIMETRY_NO_DATA; ++ break; + } + + if (pixel_encoding && color_space == COLOR_SPACE_2020_YCBCR && +@@ -3054,49 +3058,27 @@ static void set_avi_info_frame( + /* Active Format Aspect ratio - same as Picture Aspect Ratio. 
*/ + hdmi_info.bits.R0_R3 = ACTIVE_FORMAT_ASPECT_RATIO_SAME_AS_PICTURE; + +- /* TODO: un-hardcode cn0_cn1 and itc */ +- +- cn0_cn1 = 0; +- cn0_cn1_value = 0; +- +- itc = true; +- itc_value = 1; +- +- support = stream->content_support; +- +- if (itc) { +- if (!support.bits.valid_content_type) { +- cn0_cn1_value = 0; +- } else { +- if (cn0_cn1 == DISPLAY_CONTENT_TYPE_GRAPHICS) { +- if (support.bits.graphics_content == 1) { +- cn0_cn1_value = 0; +- } +- } else if (cn0_cn1 == DISPLAY_CONTENT_TYPE_PHOTO) { +- if (support.bits.photo_content == 1) { +- cn0_cn1_value = 1; +- } else { +- cn0_cn1_value = 0; +- itc_value = 0; +- } +- } else if (cn0_cn1 == DISPLAY_CONTENT_TYPE_CINEMA) { +- if (support.bits.cinema_content == 1) { +- cn0_cn1_value = 2; +- } else { +- cn0_cn1_value = 0; +- itc_value = 0; +- } +- } else if (cn0_cn1 == DISPLAY_CONTENT_TYPE_GAME) { +- if (support.bits.game_content == 1) { +- cn0_cn1_value = 3; +- } else { +- cn0_cn1_value = 0; +- itc_value = 0; +- } +- } +- } +- hdmi_info.bits.CN0_CN1 = cn0_cn1_value; +- hdmi_info.bits.ITC = itc_value; ++ switch (stream->content_type) { ++ case DISPLAY_CONTENT_TYPE_NO_DATA: ++ hdmi_info.bits.CN0_CN1 = 0; ++ hdmi_info.bits.ITC = 0; ++ break; ++ case DISPLAY_CONTENT_TYPE_GRAPHICS: ++ hdmi_info.bits.CN0_CN1 = 0; ++ hdmi_info.bits.ITC = 1; ++ break; ++ case DISPLAY_CONTENT_TYPE_PHOTO: ++ hdmi_info.bits.CN0_CN1 = 1; ++ hdmi_info.bits.ITC = 1; ++ break; ++ case DISPLAY_CONTENT_TYPE_CINEMA: ++ hdmi_info.bits.CN0_CN1 = 2; ++ hdmi_info.bits.ITC = 1; ++ break; ++ case DISPLAY_CONTENT_TYPE_GAME: ++ hdmi_info.bits.CN0_CN1 = 3; ++ hdmi_info.bits.ITC = 1; ++ break; + } + + if (stream->qs_bit == 1) { +diff --git a/drivers/gpu/drm/amd/display/dc/dc_stream.h b/drivers/gpu/drm/amd/display/dc/dc_stream.h +index dfd3df1d2f7e..f78d49e33a6e 100644 +--- a/drivers/gpu/drm/amd/display/dc/dc_stream.h ++++ b/drivers/gpu/drm/amd/display/dc/dc_stream.h +@@ -182,7 +182,6 @@ struct dc_stream_state { + */ + struct link_encoder *link_enc; + struct dc_panel_patch sink_patches; +- union display_content_support content_support; + struct dc_crtc_timing timing; + struct dc_crtc_timing_adjust adjust; + struct dc_info_packet vrr_infopacket; +@@ -205,6 +204,7 @@ struct dc_stream_state { + struct dc_csc_transform csc_color_matrix; + + enum dc_color_space output_color_space; ++ enum display_content_type content_type; + enum dc_dither_option dither_option; + + enum view_3d_format view_format; +diff --git a/drivers/gpu/drm/amd/display/dc/dc_types.h b/drivers/gpu/drm/amd/display/dc/dc_types.h +index dc78e2404b48..fdf58a2e3a75 100644 +--- a/drivers/gpu/drm/amd/display/dc/dc_types.h ++++ b/drivers/gpu/drm/amd/display/dc/dc_types.h +@@ -174,18 +174,6 @@ struct dc_edid { + + #define AUDIO_INFO_DISPLAY_NAME_SIZE_IN_CHARS 20 + +-union display_content_support { +- unsigned int raw; +- struct { +- unsigned int valid_content_type :1; +- unsigned int game_content :1; +- unsigned int cinema_content :1; +- unsigned int photo_content :1; +- unsigned int graphics_content :1; +- unsigned int reserved :27; +- } bits; +-}; +- + struct dc_panel_patch { + unsigned int dppowerup_delay; + unsigned int extra_t12_ms; +@@ -218,8 +206,6 @@ struct dc_edid_caps { + uint32_t audio_latency; + uint32_t video_latency; + +- union display_content_support content_support; +- + uint8_t qs_bit; + uint8_t qy_bit; + +diff --git a/drivers/gpu/drm/display/drm_hdmi_helper.c b/drivers/gpu/drm/display/drm_hdmi_helper.c +index 0264abe55278..c1e6851b2606 100644 +--- a/drivers/gpu/drm/display/drm_hdmi_helper.c ++++ 
b/drivers/gpu/drm/display/drm_hdmi_helper.c +@@ -44,10 +44,8 @@ int drm_hdmi_infoframe_set_hdr_metadata(struct hdmi_drm_infoframe *frame, + + /* Sink EOTF is Bit map while infoframe is absolute values */ + if (!is_eotf_supported(hdr_metadata->hdmi_metadata_type1.eotf, +- connector->hdr_sink_metadata.hdmi_type1.eotf)) { +- DRM_DEBUG_KMS("EOTF Not Supported\n"); +- return -EINVAL; +- } ++ connector->hdr_sink_metadata.hdmi_type1.eotf)) ++ DRM_DEBUG_KMS("Unknown EOTF %d\n", hdr_metadata->hdmi_metadata_type1.eotf); + + err = hdmi_drm_infoframe_init(frame); + if (err < 0) +@@ -105,7 +103,7 @@ EXPORT_SYMBOL(drm_hdmi_infoframe_set_hdr_metadata); + #define HDMI_COLORIMETRY_DCI_P3_RGB_THEATER (C(3) | EC(7) | ACE(1)) + + static const u32 hdmi_colorimetry_val[] = { +- [DRM_MODE_COLORIMETRY_NO_DATA] = HDMI_COLORIMETRY_NO_DATA, ++ [DRM_MODE_COLORIMETRY_DEFAULT] = HDMI_COLORIMETRY_NO_DATA, + [DRM_MODE_COLORIMETRY_SMPTE_170M_YCC] = HDMI_COLORIMETRY_SMPTE_170M_YCC, + [DRM_MODE_COLORIMETRY_BT709_YCC] = HDMI_COLORIMETRY_BT709_YCC, + [DRM_MODE_COLORIMETRY_XVYCC_601] = HDMI_COLORIMETRY_XVYCC_601, +diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c +index f197f59f6d99..d6d04c4ccfc0 100644 +--- a/drivers/gpu/drm/drm_atomic.c ++++ b/drivers/gpu/drm/drm_atomic.c +@@ -1070,6 +1070,8 @@ static void drm_atomic_connector_print_state(struct drm_printer *p, + drm_printf(p, "connector[%u]: %s\n", connector->base.id, connector->name); + drm_printf(p, "\tcrtc=%s\n", state->crtc ? state->crtc->name : "(null)"); + drm_printf(p, "\tself_refresh_aware=%d\n", state->self_refresh_aware); ++ drm_printf(p, "\tmax_requested_bpc=%d\n", state->max_requested_bpc); ++ drm_printf(p, "\tcolorspace=%s\n", drm_get_colorspace_name(state->colorspace)); + + if (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK) + if (state->writeback_job && state->writeback_job->fb) +diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c +index 547356e00341..e99d397cd228 100644 +--- a/drivers/gpu/drm/drm_connector.c ++++ b/drivers/gpu/drm/drm_connector.c +@@ -1016,64 +1016,72 @@ static const struct drm_prop_enum_list drm_dp_subconnector_enum_list[] = { + DRM_ENUM_NAME_FN(drm_get_dp_subconnector_name, + drm_dp_subconnector_enum_list) + +-static const struct drm_prop_enum_list hdmi_colorspaces[] = { +- /* For Default case, driver will set the colorspace */ +- { DRM_MODE_COLORIMETRY_DEFAULT, "Default" }, +- /* Standard Definition Colorimetry based on CEA 861 */ +- { DRM_MODE_COLORIMETRY_SMPTE_170M_YCC, "SMPTE_170M_YCC" }, +- { DRM_MODE_COLORIMETRY_BT709_YCC, "BT709_YCC" }, +- /* Standard Definition Colorimetry based on IEC 61966-2-4 */ +- { DRM_MODE_COLORIMETRY_XVYCC_601, "XVYCC_601" }, +- /* High Definition Colorimetry based on IEC 61966-2-4 */ +- { DRM_MODE_COLORIMETRY_XVYCC_709, "XVYCC_709" }, +- /* Colorimetry based on IEC 61966-2-1/Amendment 1 */ +- { DRM_MODE_COLORIMETRY_SYCC_601, "SYCC_601" }, +- /* Colorimetry based on IEC 61966-2-5 [33] */ +- { DRM_MODE_COLORIMETRY_OPYCC_601, "opYCC_601" }, +- /* Colorimetry based on IEC 61966-2-5 */ +- { DRM_MODE_COLORIMETRY_OPRGB, "opRGB" }, +- /* Colorimetry based on ITU-R BT.2020 */ +- { DRM_MODE_COLORIMETRY_BT2020_CYCC, "BT2020_CYCC" }, +- /* Colorimetry based on ITU-R BT.2020 */ +- { DRM_MODE_COLORIMETRY_BT2020_RGB, "BT2020_RGB" }, +- /* Colorimetry based on ITU-R BT.2020 */ +- { DRM_MODE_COLORIMETRY_BT2020_YCC, "BT2020_YCC" }, +- /* Added as part of Additional Colorimetry Extension in 861.G */ +- { DRM_MODE_COLORIMETRY_DCI_P3_RGB_D65, "DCI-P3_RGB_D65" }, +- { 
DRM_MODE_COLORIMETRY_DCI_P3_RGB_THEATER, "DCI-P3_RGB_Theater" }, ++static const char * const colorspace_names[] = { ++ [DRM_MODE_COLORIMETRY_DEFAULT] = "Default", ++ [DRM_MODE_COLORIMETRY_SMPTE_170M_YCC] = "SMPTE_170M_YCC", ++ [DRM_MODE_COLORIMETRY_BT709_YCC] = "BT709_YCC", ++ [DRM_MODE_COLORIMETRY_XVYCC_601] = "XVYCC_601", ++ [DRM_MODE_COLORIMETRY_XVYCC_709] = "XVYCC_709", ++ [DRM_MODE_COLORIMETRY_SYCC_601] = "SYCC_601", ++ [DRM_MODE_COLORIMETRY_OPYCC_601] = "opYCC_601", ++ [DRM_MODE_COLORIMETRY_OPRGB] = "opRGB", ++ [DRM_MODE_COLORIMETRY_BT2020_CYCC] = "BT2020_CYCC", ++ [DRM_MODE_COLORIMETRY_BT2020_RGB] = "BT2020_RGB", ++ [DRM_MODE_COLORIMETRY_BT2020_YCC] = "BT2020_YCC", ++ [DRM_MODE_COLORIMETRY_DCI_P3_RGB_D65] = "P3_RGB_D65", ++ [DRM_MODE_COLORIMETRY_DCI_P3_RGB_THEATER] = "P3_RGB_Theater", ++ [DRM_MODE_COLORIMETRY_RGB_WIDE_FIXED] = "RGB_WIDE_FIXED", ++ [DRM_MODE_COLORIMETRY_RGB_WIDE_FLOAT] = "RGB_WIDE_FLOAT", ++ [DRM_MODE_COLORIMETRY_BT601_YCC] = "BT601_YCC", + }; + ++/** ++ * drm_get_color_encoding_name - return a string for color encoding ++ * @encoding: color encoding to compute name of ++ * ++ * In contrast to the other drm_get_*_name functions this one here returns a ++ * const pointer and hence is threadsafe. ++ */ ++const char *drm_get_colorspace_name(enum drm_colorspace colorspace) ++{ ++ if (WARN_ON(colorspace >= ARRAY_SIZE(colorspace_names))) ++ return "unknown"; ++ ++ return colorspace_names[colorspace]; ++} ++ ++static const u32 hdmi_colorspaces = ++ BIT(DRM_MODE_COLORIMETRY_SMPTE_170M_YCC) | ++ BIT(DRM_MODE_COLORIMETRY_BT709_YCC) | ++ BIT(DRM_MODE_COLORIMETRY_XVYCC_601) | ++ BIT(DRM_MODE_COLORIMETRY_XVYCC_709) | ++ BIT(DRM_MODE_COLORIMETRY_SYCC_601) | ++ BIT(DRM_MODE_COLORIMETRY_OPYCC_601) | ++ BIT(DRM_MODE_COLORIMETRY_OPRGB) | ++ BIT(DRM_MODE_COLORIMETRY_BT2020_CYCC) | ++ BIT(DRM_MODE_COLORIMETRY_BT2020_RGB) | ++ BIT(DRM_MODE_COLORIMETRY_BT2020_YCC) | ++ BIT(DRM_MODE_COLORIMETRY_DCI_P3_RGB_D65) | ++ BIT(DRM_MODE_COLORIMETRY_DCI_P3_RGB_THEATER); ++ + /* + * As per DP 1.4a spec, 2.2.5.7.5 VSC SDP Payload for Pixel Encoding/Colorimetry + * Format Table 2-120 + */ +-static const struct drm_prop_enum_list dp_colorspaces[] = { +- /* For Default case, driver will set the colorspace */ +- { DRM_MODE_COLORIMETRY_DEFAULT, "Default" }, +- { DRM_MODE_COLORIMETRY_RGB_WIDE_FIXED, "RGB_Wide_Gamut_Fixed_Point" }, +- /* Colorimetry based on scRGB (IEC 61966-2-2) */ +- { DRM_MODE_COLORIMETRY_RGB_WIDE_FLOAT, "RGB_Wide_Gamut_Floating_Point" }, +- /* Colorimetry based on IEC 61966-2-5 */ +- { DRM_MODE_COLORIMETRY_OPRGB, "opRGB" }, +- /* Colorimetry based on SMPTE RP 431-2 */ +- { DRM_MODE_COLORIMETRY_DCI_P3_RGB_D65, "DCI-P3_RGB_D65" }, +- /* Colorimetry based on ITU-R BT.2020 */ +- { DRM_MODE_COLORIMETRY_BT2020_RGB, "BT2020_RGB" }, +- { DRM_MODE_COLORIMETRY_BT601_YCC, "BT601_YCC" }, +- { DRM_MODE_COLORIMETRY_BT709_YCC, "BT709_YCC" }, +- /* Standard Definition Colorimetry based on IEC 61966-2-4 */ +- { DRM_MODE_COLORIMETRY_XVYCC_601, "XVYCC_601" }, +- /* High Definition Colorimetry based on IEC 61966-2-4 */ +- { DRM_MODE_COLORIMETRY_XVYCC_709, "XVYCC_709" }, +- /* Colorimetry based on IEC 61966-2-1/Amendment 1 */ +- { DRM_MODE_COLORIMETRY_SYCC_601, "SYCC_601" }, +- /* Colorimetry based on IEC 61966-2-5 [33] */ +- { DRM_MODE_COLORIMETRY_OPYCC_601, "opYCC_601" }, +- /* Colorimetry based on ITU-R BT.2020 */ +- { DRM_MODE_COLORIMETRY_BT2020_CYCC, "BT2020_CYCC" }, +- /* Colorimetry based on ITU-R BT.2020 */ +- { DRM_MODE_COLORIMETRY_BT2020_YCC, "BT2020_YCC" }, +-}; ++static const u32 dp_colorspaces = 
++ BIT(DRM_MODE_COLORIMETRY_RGB_WIDE_FIXED) | ++ BIT(DRM_MODE_COLORIMETRY_RGB_WIDE_FLOAT) | ++ BIT(DRM_MODE_COLORIMETRY_OPRGB) | ++ BIT(DRM_MODE_COLORIMETRY_DCI_P3_RGB_D65) | ++ BIT(DRM_MODE_COLORIMETRY_BT2020_RGB) | ++ BIT(DRM_MODE_COLORIMETRY_BT601_YCC) | ++ BIT(DRM_MODE_COLORIMETRY_BT709_YCC) | ++ BIT(DRM_MODE_COLORIMETRY_XVYCC_601) | ++ BIT(DRM_MODE_COLORIMETRY_XVYCC_709) | ++ BIT(DRM_MODE_COLORIMETRY_SYCC_601) | ++ BIT(DRM_MODE_COLORIMETRY_OPYCC_601) | ++ BIT(DRM_MODE_COLORIMETRY_BT2020_CYCC) | ++ BIT(DRM_MODE_COLORIMETRY_BT2020_YCC); + + /** + * DOC: standard connector properties +@@ -1975,33 +1983,58 @@ EXPORT_SYMBOL(drm_mode_create_aspect_ratio_property); + * drm_mode_create_dp_colorspace_property() is used for DP connector. + */ + +-/** +- * drm_mode_create_hdmi_colorspace_property - create hdmi colorspace property +- * @connector: connector to create the Colorspace property on. +- * +- * Called by a driver the first time it's needed, must be attached to desired +- * HDMI connectors. +- * +- * Returns: +- * Zero on success, negative errno on failure. +- */ +-int drm_mode_create_hdmi_colorspace_property(struct drm_connector *connector) ++static int drm_mode_create_colorspace_property(struct drm_connector *connector, ++ u32 supported_colorspaces) + { + struct drm_device *dev = connector->dev; ++ u32 colorspaces = supported_colorspaces | BIT(DRM_MODE_COLORIMETRY_DEFAULT); ++ struct drm_prop_enum_list enum_list[DRM_MODE_COLORIMETRY_MAX]; ++ int i, len; + + if (connector->colorspace_property) + return 0; + ++ if (WARN_ON(supported_colorspaces == 0 || ++ (supported_colorspaces & -BIT(DRM_MODE_COLORIMETRY_MAX)) != 0)) ++ return -EINVAL; ++ ++ len = 0; ++ for (i = 0; i < DRM_MODE_COLORIMETRY_MAX; i++) { ++ if ((colorspaces & BIT(i)) == 0) ++ continue; ++ ++ enum_list[len].type = i; ++ enum_list[len].name = colorspace_names[i]; ++ len++; ++ } ++ + connector->colorspace_property = + drm_property_create_enum(dev, DRM_MODE_PROP_ENUM, "Colorspace", +- hdmi_colorspaces, +- ARRAY_SIZE(hdmi_colorspaces)); ++ enum_list, ++ len); + + if (!connector->colorspace_property) + return -ENOMEM; + + return 0; + } ++/** ++ * drm_mode_create_hdmi_colorspace_property - create hdmi colorspace property ++ * @connector: connector to create the Colorspace property on. ++ * ++ * Called by a driver the first time it's needed, must be attached to desired ++ * HDMI connectors. ++ * ++ * Returns: ++ * Zero on success, negative errno on failure. ++ */ ++int drm_mode_create_hdmi_colorspace_property(struct drm_connector *connector, ++ u32 supported_colorspaces) ++{ ++ u32 colorspaces = supported_colorspaces & hdmi_colorspaces; ++ ++ return drm_mode_create_colorspace_property(connector, colorspaces); ++} + EXPORT_SYMBOL(drm_mode_create_hdmi_colorspace_property); + + /** +@@ -2014,22 +2047,12 @@ EXPORT_SYMBOL(drm_mode_create_hdmi_colorspace_property); + * Returns: + * Zero on success, negative errno on failure. 
+ */ +-int drm_mode_create_dp_colorspace_property(struct drm_connector *connector) ++int drm_mode_create_dp_colorspace_property(struct drm_connector *connector, ++ u32 supported_colorspaces) + { +- struct drm_device *dev = connector->dev; ++ u32 colorspaces = supported_colorspaces & dp_colorspaces; + +- if (connector->colorspace_property) +- return 0; +- +- connector->colorspace_property = +- drm_property_create_enum(dev, DRM_MODE_PROP_ENUM, "Colorspace", +- dp_colorspaces, +- ARRAY_SIZE(dp_colorspaces)); +- +- if (!connector->colorspace_property) +- return -ENOMEM; +- +- return 0; ++ return drm_mode_create_colorspace_property(connector, colorspaces); + } + EXPORT_SYMBOL(drm_mode_create_dp_colorspace_property); + +diff --git a/drivers/gpu/drm/i915/display/intel_connector.c b/drivers/gpu/drm/i915/display/intel_connector.c +index 6205ddd3ded0..843a669afd59 100644 +--- a/drivers/gpu/drm/i915/display/intel_connector.c ++++ b/drivers/gpu/drm/i915/display/intel_connector.c +@@ -283,14 +283,14 @@ intel_attach_aspect_ratio_property(struct drm_connector *connector) + void + intel_attach_hdmi_colorspace_property(struct drm_connector *connector) + { +- if (!drm_mode_create_hdmi_colorspace_property(connector)) ++ if (!drm_mode_create_hdmi_colorspace_property(connector, 0xffffffff)) + drm_connector_attach_colorspace_property(connector); + } + + void + intel_attach_dp_colorspace_property(struct drm_connector *connector) + { +- if (!drm_mode_create_dp_colorspace_property(connector)) ++ if (!drm_mode_create_dp_colorspace_property(connector, 0xffffffff)) + drm_connector_attach_colorspace_property(connector); + } + +diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c +index 55744216392b..280d11648712 100644 +--- a/drivers/gpu/drm/vc4/vc4_hdmi.c ++++ b/drivers/gpu/drm/vc4/vc4_hdmi.c +@@ -618,7 +618,7 @@ static int vc4_hdmi_connector_init(struct drm_device *dev, + if (ret) + return ret; + +- ret = drm_mode_create_hdmi_colorspace_property(connector); ++ ret = drm_mode_create_hdmi_colorspace_property(connector, 0xffffffff); + if (ret) + return ret; + +diff --git a/include/drm/display/drm_dp.h b/include/drm/display/drm_dp.h +index e934aab357be..4fc5120392e3 100644 +--- a/include/drm/display/drm_dp.h ++++ b/include/drm/display/drm_dp.h +@@ -1617,7 +1617,7 @@ enum dp_pixelformat { + * + * This enum is used to indicate DP VSC SDP Colorimetry formats. + * It is based on DP 1.4 spec [Table 2-117: VSC SDP Payload for DB16 through +- * DB18] and a name of enum member follows DRM_MODE_COLORIMETRY definition. ++ * DB18] and a name of enum member follows &enum drm_colorimetry definition. + * + * @DP_COLORIMETRY_DEFAULT: sRGB (IEC 61966-2-1) or + * ITU-R BT.601 colorimetry format +diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h +index 565cf9d3c550..68f26a3ebb99 100644 +--- a/include/drm/drm_connector.h ++++ b/include/drm/drm_connector.h +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + + #include + +@@ -371,29 +372,30 @@ enum drm_privacy_screen_status { + * a colorspace property which will be created and exposed to + * userspace. 
+ */
+-
+-/* For Default case, driver will set the colorspace */
+-#define DRM_MODE_COLORIMETRY_DEFAULT 0
+-/* CEA 861 Normal Colorimetry options */
+-#define DRM_MODE_COLORIMETRY_NO_DATA 0
+-#define DRM_MODE_COLORIMETRY_SMPTE_170M_YCC 1
+-#define DRM_MODE_COLORIMETRY_BT709_YCC 2
+-/* CEA 861 Extended Colorimetry Options */
+-#define DRM_MODE_COLORIMETRY_XVYCC_601 3
+-#define DRM_MODE_COLORIMETRY_XVYCC_709 4
+-#define DRM_MODE_COLORIMETRY_SYCC_601 5
+-#define DRM_MODE_COLORIMETRY_OPYCC_601 6
+-#define DRM_MODE_COLORIMETRY_OPRGB 7
+-#define DRM_MODE_COLORIMETRY_BT2020_CYCC 8
+-#define DRM_MODE_COLORIMETRY_BT2020_RGB 9
+-#define DRM_MODE_COLORIMETRY_BT2020_YCC 10
+-/* Additional Colorimetry extension added as part of CTA 861.G */
+-#define DRM_MODE_COLORIMETRY_DCI_P3_RGB_D65 11
+-#define DRM_MODE_COLORIMETRY_DCI_P3_RGB_THEATER 12
+-/* Additional Colorimetry Options added for DP 1.4a VSC Colorimetry Format */
+-#define DRM_MODE_COLORIMETRY_RGB_WIDE_FIXED 13
+-#define DRM_MODE_COLORIMETRY_RGB_WIDE_FLOAT 14
+-#define DRM_MODE_COLORIMETRY_BT601_YCC 15
++enum drm_colorspace {
++ /* For Default case, driver will set the colorspace */
++ DRM_MODE_COLORIMETRY_DEFAULT,
++ /* CEA 861 Normal Colorimetry options */
++ DRM_MODE_COLORIMETRY_SMPTE_170M_YCC,
++ DRM_MODE_COLORIMETRY_BT709_YCC,
++ /* CEA 861 Extended Colorimetry Options */
++ DRM_MODE_COLORIMETRY_XVYCC_601,
++ DRM_MODE_COLORIMETRY_XVYCC_709,
++ DRM_MODE_COLORIMETRY_SYCC_601,
++ DRM_MODE_COLORIMETRY_OPYCC_601,
++ DRM_MODE_COLORIMETRY_OPRGB,
++ DRM_MODE_COLORIMETRY_BT2020_CYCC,
++ DRM_MODE_COLORIMETRY_BT2020_RGB,
++ DRM_MODE_COLORIMETRY_BT2020_YCC,
++ /* Additional Colorimetry extension added as part of CTA 861.G */
++ DRM_MODE_COLORIMETRY_DCI_P3_RGB_D65,
++ DRM_MODE_COLORIMETRY_DCI_P3_RGB_THEATER,
++ /* Additional Colorimetry Options added for DP 1.4a VSC Colorimetry Format */
++ DRM_MODE_COLORIMETRY_RGB_WIDE_FIXED,
++ DRM_MODE_COLORIMETRY_RGB_WIDE_FLOAT,
++ DRM_MODE_COLORIMETRY_BT601_YCC,
++ DRM_MODE_COLORIMETRY_MAX
++};
+
+ /**
+ * enum drm_bus_flags - bus_flags info for &drm_display_info
+@@ -828,7 +830,7 @@ struct drm_connector_state {
+ * colorspace change on Sink. This is most commonly used to switch
+ * to wider color gamuts like BT2020.
+ */
+- u32 colorspace;
++ enum drm_colorspace colorspace;
+
+ /**
+ * @writeback_job: Writeback job for writeback connectors
+@@ -1835,8 +1837,10 @@ int drm_connector_attach_hdr_output_metadata_property(struct drm_connector *conn
+ bool drm_connector_atomic_hdr_metadata_equal(struct drm_connector_state *old_state,
+ struct drm_connector_state *new_state);
+ int drm_mode_create_aspect_ratio_property(struct drm_device *dev);
+-int drm_mode_create_hdmi_colorspace_property(struct drm_connector *connector);
+-int drm_mode_create_dp_colorspace_property(struct drm_connector *connector);
++int drm_mode_create_hdmi_colorspace_property(struct drm_connector *connector,
++ u32 supported_colorspaces);
++int drm_mode_create_dp_colorspace_property(struct drm_connector *connector,
++ u32 supported_colorspaces);
+ int drm_mode_create_content_type_property(struct drm_device *dev);
+ int drm_mode_create_suggested_offset_properties(struct drm_device *dev);
+
+@@ -1919,6 +1923,7 @@ void drm_connector_list_iter_end(struct drm_connector_list_iter *iter);
+
+ bool drm_connector_has_possible_encoder(struct drm_connector *connector,
+ struct drm_encoder *encoder);
++const char *drm_get_colorspace_name(enum drm_colorspace colorspace);
+
+ /**
+ * drm_for_each_connector_iter - connector_list iterator macro
+--
+2.39.2
diff --git a/scripts/build.sh b/scripts/build.sh
new file mode 100644
index 0000000..8a48730
--- /dev/null
+++ b/scripts/build.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+echo "Pika Kernel - Building"
+
+make -j`nproc` bindeb-pkg LOCALVERSION=-pikaos
\ No newline at end of file
diff --git a/scripts/config.sh b/scripts/config.sh
new file mode 100644
index 0000000..0973a12
--- /dev/null
+++ b/scripts/config.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+echo "Pika Kernel - Applying configuration"
+
+cp ../config .config
+
+scripts/config -k -e CONFIG_GENERIC_CPU
+scripts/config -e CACHY
+scripts/config -e SCHED_BORE
+
+scripts/config -e HZ_300 --set-val HZ 1000
+scripts/config -d HZ_PERIODIC -d NO_HZ_FULL -e NO_HZ_IDLE -e NO_HZ -e NO_HZ_COMMON
+scripts/config -e PREEMPT_BUILD -d PREEMPT_NONE -d PREEMPT_VOLUNTARY -e PREEMPT -e PREEMPT_COUNT -e PREEMPTION -e PREEMPT_DYNAMIC
+
+scripts/config -d CC_OPTIMIZE_FOR_PERFORMANCE \
+ -e CC_OPTIMIZE_FOR_PERFORMANCE_O3
+
+scripts/config -m TCP_CONG_CUBIC \
+ -d DEFAULT_CUBIC \
+ -e TCP_CONG_BBR2 \
+ -e DEFAULT_BBR2 \
+ --set-str DEFAULT_TCP_CONG bbr2
+
+scripts/config -e LRU_GEN -e LRU_GEN_ENABLED -d LRU_GEN_STATS
+
+scripts/config -d TRANSPARENT_HUGEPAGE_ALWAYS -e TRANSPARENT_HUGEPAGE_MADVISE
+
+scripts/config -e DAMON \
+ -e DAMON_VADDR \
+ -e DAMON_DBGFS \
+ -e DAMON_SYSFS \
+ -e DAMON_PADDR \
+ -e DAMON_RECLAIM \
+ -e DAMON_LRU_SORT
+
+scripts/config -d ZRAM_DEF_COMP_LZORLE \
+ -e ZRAM_DEF_COMP_ZSTD \
+ --set-str ZRAM_DEF_COMP zstd \
+ -d ZSWAP_COMPRESSOR_DEFAULT_LZ4 \
+ -e ZSWAP_COMPRESSOR_DEFAULT_ZSTD \
+ --set-str ZSWAP_COMPRESSOR_DEFAULT zstd
+
+scripts/config --set-val MODULE_COMPRESS_ZSTD_LEVEL 19 -e MODULE_COMPRESS_ZSTD_ULTRA --set-val MODULE_COMPRESS_ZSTD_LEVEL_ULTRA 22 --set-val ZSTD_COMP_VAL 22
+
+scripts/config -e USER_NS
+
+scripts/config -d DEBUG_INFO \
+ -d DEBUG_INFO_DWARF4 \
+ -d DEBUG_INFO_DWARF5 \
+ -d SLUB_DEBUG \
+ -d PM_DEBUG \
+ -d PM_ADVANCED_DEBUG \
+ -d PM_SLEEP_DEBUG \
+ -d ACPI_DEBUG \
+ -d SCHED_DEBUG \
+ -d LATENCYTOP \
+ -d DEBUG_PREEMPT
+
+make prepare
\ No newline at end of file
diff --git a/scripts/output.sh b/scripts/output.sh
new file mode 100644
index 0000000..008c076
--- /dev/null
+++ b/scripts/output.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+echo "Pika Kernel - Copying Output"
+
+cd ..
+mkdir ./output
+rm ./linux-libc*.deb
+cp ./*.deb ./output/
\ No newline at end of file
diff --git a/scripts/patch.sh b/scripts/patch.sh
new file mode 100644
index 0000000..c62732f
--- /dev/null
+++ b/scripts/patch.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+echo "Pika Kernel - Applying patches"
+
+# Cachy patches are here: https://github.com/CachyOS/kernel-patches/
+# orig patch from cachy - 0001-cachyos-base-all.patch
+patch -Np1 < "../patches/0001-cachy-all.patch"
+# orig patch from cachy - 0001-Add-latency-priority-for-CFS-class.patch
+patch -Np1 < "../patches/0002-cfs-nice.patch"
+# orig patch from cachy - 0001-bore-cachy.patch
+patch -Np1 < "../patches/0003-bore.patch"
+# HDR patch - from cachy (but they deleted it)
+patch -Np1 < "../patches/0004-hdr.patch"
\ No newline at end of file
diff --git a/scripts/release.sh b/scripts/release.sh
new file mode 100644
index 0000000..c3ca429
--- /dev/null
+++ b/scripts/release.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+echo "Pika Kernel - Releasing Kernel"
\ No newline at end of file
diff --git a/scripts/source.sh b/scripts/source.sh
new file mode 100644
index 0000000..7006289
--- /dev/null
+++ b/scripts/source.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+echo "Pika Kernel - Getting source"
+
+wget -nv https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.2.tar.gz
+tar -zxf ./linux-6.2.tar.gz
+
+cd linux-6.2
\ No newline at end of file