diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..4ca88b6 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,28 @@ + +name: PikaOS Kernel Build Only + +on: + workflow_dispatch + +jobs: + build: + runs-on: self-hosted + container: + image: ubuntu:latest + volumes: + - /proc:/proc + options: --privileged -it + + steps: + - uses: actions/checkout@v3 + + - name: Install needed packages + run: apt update && apt install bc bison build-essential ccache cpio fakeroot flex git kmod libelf-dev libncurses5-dev libssl-dev lz4 qtbase5-dev rsync schedtool wget zstd tar -y + + - name: Build Kernel + run: ./main.sh + + - uses: actions/upload-artifact@v3 + with: + name: PikaOS Kernel + path: builds/ \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..e8a9f3a --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,25 @@ +name: PikaOS Kernel Build And Release + +on: + workflow_dispatch + +jobs: + build: + runs-on: self-hosted + container: + image: ubuntu:latest + volumes: + - /proc:/proc + options: --privileged -it + + steps: + - uses: actions/checkout@v3 + + - name: Install needed packages + run: apt update && apt install bc bison build-essential ccache cpio fakeroot flex git kmod libelf-dev libncurses5-dev libssl-dev lz4 qtbase5-dev rsync schedtool wget zstd tar -y + + - name: Build Kernel + run: ./main.sh + + - name: Release Kernel + run: ./scripts/release.sh \ No newline at end of file diff --git a/main.sh b/main.sh new file mode 100644 index 0000000..0c27e16 --- /dev/null +++ b/main.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +bash ./scripts/source.sh +bash ./scripts/patch.sh +bash ./scripts/config.sh +bash ./scripts/build.sh +bash ./scripts/output.sh + + diff --git a/patches/0001-cachy-all.patch b/patches/0001-cachy-all.patch new file mode 100644 index 0000000..59a9269 --- /dev/null +++ b/patches/0001-cachy-all.patch @@ -0,0 +1,45207 @@ +From 6e4bce513a02a0be7b1f30c06751eb146cf20b1b Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sun, 15 Jan 2023 16:50:23 +0100 +Subject: [PATCH 01/15] bbr2 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 3 +- + include/net/inet_connection_sock.h | 3 +- + include/net/tcp.h | 41 +- + include/uapi/linux/inet_diag.h | 33 + + net/ipv4/Kconfig | 22 + + net/ipv4/Makefile | 1 + + net/ipv4/tcp.c | 1 + + net/ipv4/tcp_bbr.c | 38 +- + net/ipv4/tcp_bbr2.c | 2674 ++++++++++++++++++++++++++++ + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 27 +- + net/ipv4/tcp_output.c | 26 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 1 + + 14 files changed, 2867 insertions(+), 34 deletions(-) + create mode 100644 net/ipv4/tcp_bbr2.c + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index ca7f05a130d2..09dbcd67ee8e 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -255,7 +255,8 @@ struct tcp_sock { + u8 compressed_ack; + u8 dup_ack_counter:2, + tlp_retrans:1, /* TLP is a retransmission */ +- unused:5; ++ fast_ack_mode:2, /* which fast ack mode ? 
*/ ++ unused:3; + u32 chrono_start; /* Start time in jiffies of a TCP chrono */ + u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ + u8 chrono_type:2, /* current chronograph type */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index c2b15f7e5516..d85858efa571 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -135,7 +135,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; ++/* XXX inflated by temporary internal debugging info */ ++ u64 icsk_ca_priv[216 / sizeof(u64)]; + #define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) + }; + +diff --git a/include/net/tcp.h b/include/net/tcp.h +index db9f828e9d1e..e1f05c2b4707 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -375,6 +375,7 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, + #define TCP_ECN_QUEUE_CWR 2 + #define TCP_ECN_DEMAND_CWR 4 + #define TCP_ECN_SEEN 8 ++#define TCP_ECN_ECT_PERMANENT 16 + + enum tcp_tw_status { + TCP_TW_SUCCESS = 0, +@@ -823,6 +824,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) + { + return tcp_ns_to_ts(skb->skb_mstamp_ns); +@@ -898,9 +904,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1026,7 +1037,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED 0x1 + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN 0x2 +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */ ++#define TCP_CONG_WANTS_CE_EVENTS 0x4 ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1046,8 +1061,11 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ + s32 delivered_ce; /* number of packets delivered w/ CE marks*/ + long interval_us; /* time for tp->delivered to incr "delivered" */ +@@ -1061,6 +1079,7 @@ struct rate_sample { + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? 
*/ + }; + + struct tcp_congestion_ops { +@@ -1084,8 +1103,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) +@@ -1148,6 +1170,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1167,6 +1197,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 50655de04c9b..0e24f11627d5 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -231,9 +231,42 @@ struct tcp_bbr_info { + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ + }; + ++/* Phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr2_phase { ++ BBR2_PHASE_INVALID = 0, ++ BBR2_PHASE_STARTUP = 1, ++ BBR2_PHASE_DRAIN = 2, ++ BBR2_PHASE_PROBE_RTT = 3, ++ BBR2_PHASE_PROBE_BW_UP = 4, ++ BBR2_PHASE_PROBE_BW_DOWN = 5, ++ BBR2_PHASE_PROBE_BW_CRUISE = 6, ++ BBR2_PHASE_PROBE_BW_REFILL = 7 ++}; ++ ++struct tcp_bbr2_info { ++ /* u64 bw: bandwidth (app throughput) estimate in Byte per sec: */ ++ __u32 bbr_bw_lsb; /* lower 32 bits of bw */ ++ __u32 bbr_bw_msb; /* upper 32 bits of bw */ ++ __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ ++ __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ ++ __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* MUST be at this offset in struct */ ++ __u32 bbr_inflight_lo; /* lower/short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher/long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ + union tcp_cc_info { + struct tcpvegas_info vegas; + struct tcp_dctcp_info dctcp; + struct tcp_bbr_info bbr; ++ struct tcp_bbr2_info bbr2; + }; + #endif /* _UAPI_INET_DIAG_H_ */ +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 2dfb12230f08..b6bec331a82e 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -678,6 +678,24 @@ config TCP_CONG_BBR + AQM schemes that do not provide a delay signal. It requires the fq + ("Fair Queue") pacing packet scheduler. + ++config TCP_CONG_BBR2 ++ tristate "BBR2 TCP" ++ default n ++ help ++ ++ BBR2 TCP congestion control is a model-based congestion control ++ algorithm that aims to maximize network utilization, keep queues and ++ retransmit rates low, and to be able to coexist with Reno/CUBIC in ++ common scenarios. It builds an explicit model of the network path. It ++ tolerates a targeted degree of random packet loss and delay that are ++ unrelated to congestion. It can operate over LAN, WAN, cellular, wifi, ++ or cable modem links, and can use DCTCP-L4S-style ECN signals. It can ++ coexist with flows that use loss-based congestion control, and can ++ operate with shallow buffers, deep buffers, bufferbloat, policers, or ++ AQM schemes that do not provide a delay signal. It requires pacing, ++ using either TCP internal pacing or the fq ("Fair Queue") pacing packet ++ scheduler. 
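Beyond picking a build-time default in the Kconfig choice below, the new TCP_CONG_BBR2 algorithm can also be selected at runtime, either system-wide (sysctl -w net.ipv4.tcp_congestion_control=bbr2) or per socket with the standard TCP_CONGESTION socket option. A minimal user-space sketch of the per-socket path, assuming a kernel built from this tree with CONFIG_TCP_CONG_BBR2 enabled and, for unprivileged callers, bbr2 listed in net.ipv4.tcp_allowed_congestion_control:

/* Illustrative sketch only, not part of the patch: opt one socket into bbr2. */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	const char cc[] = "bbr2";
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* Fails with ENOENT if the algorithm is neither built in nor loadable. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, cc, strlen(cc)) < 0)
		perror("setsockopt(TCP_CONGESTION)");
	close(fd);
	return 0;
}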
++ + choice + prompt "Default TCP congestion control" + default DEFAULT_CUBIC +@@ -715,6 +733,9 @@ choice + config DEFAULT_BBR + bool "BBR" if TCP_CONG_BBR=y + ++ config DEFAULT_BBR2 ++ bool "BBR2" if TCP_CONG_BBR2=y ++ + config DEFAULT_RENO + bool "Reno" + endchoice +@@ -739,6 +760,7 @@ config DEFAULT_TCP_CONG + default "dctcp" if DEFAULT_DCTCP + default "cdg" if DEFAULT_CDG + default "bbr" if DEFAULT_BBR ++ default "bbr2" if DEFAULT_BBR2 + default "cubic" + + config TCP_MD5SIG +diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile +index af7d2cf490fb..e7a86a50838a 100644 +--- a/net/ipv4/Makefile ++++ b/net/ipv4/Makefile +@@ -46,6 +46,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o + obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o + obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o + obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o ++obj-$(CONFIG_TCP_CONG_BBR2) += tcp_bbr2.o + obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o + obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o + obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 33f559f491c8..e9e8040d6491 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3191,6 +3191,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index d2c470524e58..af08fb3cb139 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -294,26 +294,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + sk->sk_pacing_rate = rate; + } + +-/* override sysctl_tcp_min_tso_segs */ + static u32 bbr_min_tso_segs(struct sock *sk) + { + return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; + } + ++/* Return the number of segments BBR would like in a TSO/GSO skb, given ++ * a particular max gso size as a constraint. ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ u32 segs; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, div_u64(bytes, mss_now), bbr_min_tso_segs(sk)); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) ++{ ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); ++} ++ ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. 
+- */ +- bytes = min_t(unsigned long, +- sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -1149,7 +1163,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c +new file mode 100644 +index 000000000000..85f8052144d1 +--- /dev/null ++++ b/net/ipv4/tcp_bbr2.c +@@ -0,0 +1,2674 @@ ++/* BBR (Bottleneck Bandwidth and RTT) congestion control, v2 ++ * ++ * BBRv2 is a model-based congestion control algorithm that aims for low ++ * queues, low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model ++ * of the network path, it uses measurements of bandwidth and RTT, as well as ++ * (if they occur) packet loss and/or DCTCP/L4S-style ECN signals. Note that ++ * although it can use ECN or loss signals explicitly, it does not require ++ * either; it can bound its in-flight data based on its estimate of the BDP. ++ * ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. ++ * ++ * Here is a state transition diagram for BBR: ++ * ++ * | ++ * V ++ * +---> STARTUP ----+ ++ * | | | ++ * | V | ++ * | DRAIN ----+ ++ * | | | ++ * | V | ++ * +---> PROBE_BW ----+ ++ * | ^ | | ++ * | | | | ++ * | +----+ | ++ * | | ++ * +---- PROBE_RTT <--+ ++ * ++ * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. ++ * When it estimates the pipe is full, it enters DRAIN to drain the queue. ++ * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. ++ * A long-lived BBR flow spends the vast majority of its time remaining ++ * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth ++ * in a fair manner, with a small, bounded queue. *If* a flow has been ++ * continuously sending for the entire min_rtt window, and hasn't seen an RTT ++ * sample that matches or decreases its min_rtt estimate for 10 seconds, then ++ * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe ++ * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if ++ * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; ++ * otherwise we enter STARTUP to try to fill the pipe. ++ * ++ * BBR is described in detail in: ++ * "BBR: Congestion-Based Congestion Control", ++ * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, ++ * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. 
++ * ++ * There is a public e-mail list for discussing BBR development and testing: ++ * https://groups.google.com/forum/#!forum/bbr-dev ++ * ++ * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, ++ * otherwise TCP stack falls back to an internal pacing using one high ++ * resolution timer per TCP socket and may use more resources. ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++#include "tcp_dctcp.h" ++ ++/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth ++ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. ++ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. ++ * Since the minimum window is >=4 packets, the lower bound isn't ++ * an issue. The upper bound isn't an issue with existing technologies. ++ */ ++#define BW_SCALE 24 ++#define BW_UNIT (1 << BW_SCALE) ++ ++#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */ ++#define BBR_UNIT (1 << BBR_SCALE) ++ ++#define FLAG_DEBUG_VERBOSE 0x1 /* Verbose debugging messages */ ++#define FLAG_DEBUG_LOOPBACK 0x2 /* Do NOT skip loopback addr */ ++ ++#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++ ++/* BBR has the following modes for deciding how fast to send: */ ++enum bbr_mode { ++ BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ ++ BBR_DRAIN, /* drain any queue created during startup */ ++ BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ ++ BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ ++}; ++ ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ ++/* BBR congestion control block */ ++struct bbr { ++ u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ ++ u32 min_rtt_stamp; /* timestamp of min_rtt_us */ ++ u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ ++ u32 probe_rtt_min_us; /* min RTT in bbr_probe_rtt_win_ms window */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ ++ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u64 cycle_mstamp; /* time of this cycle phase start */ ++ u32 mode:3, /* current bbr_mode in state machine */ ++ prev_ca_state:3, /* CA state on previous ACK */ ++ packet_conservation:1, /* use packet conservation? */ ++ round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ ++ unused2:11, ++ idle_restart:1, /* restarting after idle? */ ++ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ ++ cycle_idx:3, /* current index in pacing_gain cycle array */ ++ has_seen_rtt:1; /* have we seen an RTT sample yet? */ ++ u32 pacing_gain:11, /* current gain for setting pacing rate */ ++ cwnd_gain:11, /* current gain for setting cwnd */ ++ full_bw_reached:1, /* reached full bw in Startup? 
*/ ++ full_bw_cnt:2, /* number of rounds without large bw gains */ ++ init_cwnd:7; /* initial cwnd */ ++ u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ ++ u32 full_bw; /* recent bw, to estimate if pipe is full */ ++ ++ /* For tracking ACK aggregation: */ ++ u64 ack_epoch_mstamp; /* start of ACK sampling epoch */ ++ u16 extra_acked[2]; /* max excess data ACKed in epoch */ ++ u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ ++ extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ ++ extra_acked_win_idx:1, /* current index in extra_acked array */ ++ /* BBR v2 state: */ ++ unused1:2, ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1; /* ECN in this cycle? */ ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* upper bound of sending bandwidth range*/ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ /* Params configurable using setsockopt. Refer to correspoding ++ * module param for detailed description of params. 
++ */ ++ struct bbr_params { ++ u32 high_gain:11, /* max allowed value: 2047 */ ++ drain_gain:10, /* max allowed value: 1023 */ ++ cwnd_gain:11; /* max allowed value: 2047 */ ++ u32 cwnd_min_target:4, /* max allowed value: 15 */ ++ min_rtt_win_sec:5, /* max allowed value: 31 */ ++ probe_rtt_mode_ms:9, /* max allowed value: 511 */ ++ full_bw_cnt:3, /* max allowed value: 7 */ ++ cwnd_tso_budget:1, /* allowed values: {0, 1} */ ++ unused3:6, ++ drain_to_target:1, /* boolean */ ++ precise_ece_ack:1, /* boolean */ ++ extra_acked_in_startup:1, /* allowed values: {0, 1} */ ++ fast_path:1; /* boolean */ ++ u32 full_bw_thresh:10, /* max allowed value: 1023 */ ++ startup_cwnd_gain:11, /* max allowed value: 2047 */ ++ bw_probe_pif_gain:9, /* max allowed value: 511 */ ++ usage_based_cwnd:1, /* boolean */ ++ unused2:1; ++ u16 probe_rtt_win_ms:14, /* max allowed value: 16383 */ ++ refill_add_inc:2; /* max allowed value: 3 */ ++ u16 extra_acked_gain:11, /* max allowed value: 2047 */ ++ extra_acked_win_rtts:5; /* max allowed value: 31*/ ++ u16 pacing_gain[CYCLE_LEN]; /* max allowed value: 1023 */ ++ /* Mostly BBR v2 parameters below here: */ ++ u32 ecn_alpha_gain:8, /* max allowed value: 255 */ ++ ecn_factor:8, /* max allowed value: 255 */ ++ ecn_thresh:8, /* max allowed value: 255 */ ++ beta:8; /* max allowed value: 255 */ ++ u32 ecn_max_rtt_us:19, /* max allowed value: 524287 */ ++ bw_probe_reno_gain:9, /* max allowed value: 511 */ ++ full_loss_cnt:4; /* max allowed value: 15 */ ++ u32 probe_rtt_cwnd_gain:8, /* max allowed value: 255 */ ++ inflight_headroom:8, /* max allowed value: 255 */ ++ loss_thresh:8, /* max allowed value: 255 */ ++ bw_probe_max_rounds:8; /* max allowed value: 255 */ ++ u32 bw_probe_rand_rounds:4, /* max allowed value: 15 */ ++ bw_probe_base_us:26, /* usecs: 0..2^26-1 (67 secs) */ ++ full_ecn_cnt:2; /* max allowed value: 3 */ ++ u32 bw_probe_rand_us:26, /* usecs: 0..2^26-1 (67 secs) */ ++ undo:1, /* boolean */ ++ tso_rtt_shift:4, /* max allowed value: 15 */ ++ unused5:1; ++ u32 ecn_reprobe_gain:9, /* max allowed value: 511 */ ++ unused1:14, ++ ecn_alpha_init:9; /* max allowed value: 256 */ ++ } params; ++ ++ struct { ++ u32 snd_isn; /* Initial sequence number */ ++ u32 rs_bw; /* last valid rate sample bw */ ++ u32 target_cwnd; /* target cwnd, based on BDP */ ++ u8 undo:1, /* Undo even happened but not yet logged */ ++ unused:7; ++ char event; /* single-letter event debug codes */ ++ u16 unused2; ++ } debug; ++}; ++ ++struct bbr_context { ++ u32 sample_bw; ++ u32 target_cwnd; ++ u32 log:1; ++}; ++ ++/* Window length of min_rtt filter (in sec). Max allowed value is 31 (0x1F) */ ++static u32 bbr_min_rtt_win_sec = 10; ++/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode. ++ * Max allowed value is 511 (0x1FF). ++ */ ++static u32 bbr_probe_rtt_mode_ms = 200; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static u32 bbr_probe_rtt_win_ms = 5000; ++/* Skip TSO below the following bandwidth (bits/sec): */ ++static int bbr_min_tso_rate = 1200000; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. By default we cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. 
++ */ ++static u32 bbr_tso_rtt_shift = 9; /* halve allowance per 2^9 usecs, 512us */ ++ ++/* Select cwnd TSO budget approach: ++ * 0: padding ++ * 1: flooring ++ */ ++static uint bbr_cwnd_tso_budget = 1; ++ ++/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. ++ * In order to help drive the network toward lower queues and low latency while ++ * maintaining high utilization, the average pacing rate aims to be slightly ++ * lower than the estimated bandwidth. This is an important aspect of the ++ * design. ++ */ ++static const int bbr_pacing_margin_percent = 1; ++ ++/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++ * that will allow a smoothly increasing pacing rate that will double each RTT ++ * and send the same number of packets per RTT that an un-paced, slow-starting ++ * Reno or CUBIC flow would. Max allowed value is 2047 (0x7FF). ++ */ ++static int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; ++/* The gain for deriving startup cwnd. Max allowed value is 2047 (0x7FF). */ ++static int bbr_startup_cwnd_gain = BBR_UNIT * 2885 / 1000 + 1; ++/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++ * the queue created in BBR_STARTUP in a single round. Max allowed value ++ * is 1023 (0x3FF). ++ */ ++static int bbr_drain_gain = BBR_UNIT * 1000 / 2885; ++/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs. ++ * Max allowed value is 2047 (0x7FF). ++ */ ++static int bbr_cwnd_gain = BBR_UNIT * 2; ++/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw. ++ * Max allowed value for each element is 1023 (0x3FF). ++ */ ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ ++}; ++static int bbr_pacing_gain[] = { ++ BBR_UNIT * 5 / 4, /* probe for more available bw */ ++ BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ ++ BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ ++ BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++}; ++ ++/* Try to keep at least this many packets in flight, if things go smoothly. For ++ * smooth functioning, a sliding window protocol ACKing every other packet ++ * needs at least 4 packets in flight. Max allowed value is 15 (0xF). ++ */ ++static u32 bbr_cwnd_min_target = 4; ++ ++/* Cwnd to BDP proportion in PROBE_RTT mode scaled by BBR_UNIT. Default: 50%. ++ * Use 0 to disable. Max allowed value is 255. ++ */ ++static u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* If bw has increased significantly (1.25x), there may be more bw available. ++ * Max allowed value is 1023 (0x3FF). ++ */ ++static u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; ++/* But after 3 rounds w/o significant bw growth, estimate pipe is full. ++ * Max allowed value is 7 (0x7). ++ */ ++static u32 bbr_full_bw_cnt = 3; ++ ++static u32 bbr_flags; /* Debugging related stuff */ ++ ++/* Whether to debug using printk. ++ */ ++static bool bbr_debug_with_printk; ++ ++/* Whether to debug using ftrace event tcp:tcp_bbr_event. ++ * Ignored when bbr_debug_with_printk is set. ++ */ ++static bool bbr_debug_ftrace; ++ ++/* Experiment: each cycle, try to hold sub-unity gain until inflight <= BDP. 
*/ ++static bool bbr_drain_to_target = true; /* default: enabled */ ++ ++/* Experiment: Flags to control BBR with ECN behavior. ++ */ ++static bool bbr_precise_ece_ack = true; /* default: enabled */ ++ ++/* The max rwin scaling shift factor is 14 (RFC 1323), so the max sane rwin is ++ * (2^(16+14) B)/(1024 B/packet) = 1M packets. ++ */ ++static u32 bbr_cwnd_warn_val = 1U << 20; ++ ++static u16 bbr_debug_port_mask; ++ ++/* BBR module parameters. These are module parameters only in Google prod. ++ * Upstream these are intentionally not module parameters. ++ */ ++static int bbr_pacing_gain_size = CYCLE_LEN; ++ ++/* Gain factor for adding extra_acked to target cwnd: */ ++static int bbr_extra_acked_gain = 256; ++ ++/* Window length of extra_acked window. Max allowed val is 31. */ ++static u32 bbr_extra_acked_win_rtts = 5; ++ ++/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */ ++static u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; ++ ++/* Time period for clamping cwnd increment due to ack aggregation */ ++static u32 bbr_extra_acked_max_us = 100 * 1000; ++ ++/* Use extra acked in startup ? ++ * 0: disabled ++ * 1: use latest extra_acked value from 1-2 rtt in startup ++ */ ++static int bbr_extra_acked_in_startup = 1; /* default: enabled */ ++ ++/* Experiment: don't grow cwnd beyond twice of what we just probed. */ ++static bool bbr_usage_based_cwnd; /* default: disabled */ ++ ++/* For lab testing, researchers can enable BBRv2 ECN support with this flag, ++ * when they know that any ECN marks that the connections experience will be ++ * DCTCP/L4S-style ECN marks, rather than RFC3168 ECN marks. ++ * TODO(ncardwell): Production use of the BBRv2 ECN functionality depends on ++ * negotiation or configuration that is outside the scope of the BBRv2 ++ * alpha release. 
++ */ ++static bool bbr_ecn_enable = false; ++ ++module_param_named(min_tso_rate, bbr_min_tso_rate, int, 0644); ++module_param_named(tso_rtt_shift, bbr_tso_rtt_shift, int, 0644); ++module_param_named(high_gain, bbr_high_gain, int, 0644); ++module_param_named(drain_gain, bbr_drain_gain, int, 0644); ++module_param_named(startup_cwnd_gain, bbr_startup_cwnd_gain, int, 0644); ++module_param_named(cwnd_gain, bbr_cwnd_gain, int, 0644); ++module_param_array_named(pacing_gain, bbr_pacing_gain, int, ++ &bbr_pacing_gain_size, 0644); ++module_param_named(cwnd_min_target, bbr_cwnd_min_target, uint, 0644); ++module_param_named(probe_rtt_cwnd_gain, ++ bbr_probe_rtt_cwnd_gain, uint, 0664); ++module_param_named(cwnd_warn_val, bbr_cwnd_warn_val, uint, 0664); ++module_param_named(debug_port_mask, bbr_debug_port_mask, ushort, 0644); ++module_param_named(flags, bbr_flags, uint, 0644); ++module_param_named(debug_ftrace, bbr_debug_ftrace, bool, 0644); ++module_param_named(debug_with_printk, bbr_debug_with_printk, bool, 0644); ++module_param_named(min_rtt_win_sec, bbr_min_rtt_win_sec, uint, 0644); ++module_param_named(probe_rtt_mode_ms, bbr_probe_rtt_mode_ms, uint, 0644); ++module_param_named(probe_rtt_win_ms, bbr_probe_rtt_win_ms, uint, 0644); ++module_param_named(full_bw_thresh, bbr_full_bw_thresh, uint, 0644); ++module_param_named(full_bw_cnt, bbr_full_bw_cnt, uint, 0644); ++module_param_named(cwnd_tso_bduget, bbr_cwnd_tso_budget, uint, 0664); ++module_param_named(extra_acked_gain, bbr_extra_acked_gain, int, 0664); ++module_param_named(extra_acked_win_rtts, ++ bbr_extra_acked_win_rtts, uint, 0664); ++module_param_named(extra_acked_max_us, ++ bbr_extra_acked_max_us, uint, 0664); ++module_param_named(ack_epoch_acked_reset_thresh, ++ bbr_ack_epoch_acked_reset_thresh, uint, 0664); ++module_param_named(drain_to_target, bbr_drain_to_target, bool, 0664); ++module_param_named(precise_ece_ack, bbr_precise_ece_ack, bool, 0664); ++module_param_named(extra_acked_in_startup, ++ bbr_extra_acked_in_startup, int, 0664); ++module_param_named(usage_based_cwnd, bbr_usage_based_cwnd, bool, 0664); ++module_param_named(ecn_enable, bbr_ecn_enable, bool, 0664); ++ ++static void bbr2_exit_probe_rtt(struct sock *sk); ++static void bbr2_reset_congestion_signals(struct sock *sk); ++ ++static void bbr_check_probe_rtt_done(struct sock *sk); ++ ++/* Do we estimate that STARTUP filled the pipe? */ ++static bool bbr_full_bw_reached(const struct sock *sk) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return bbr->full_bw_reached; ++} ++ ++/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ ++static u32 bbr_max_bw(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); ++} ++ ++/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ ++static u32 bbr_bw(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return min(bbr_max_bw(sk), bbr->bw_lo); ++} ++ ++/* Return maximum extra acked in past k-2k round trips, ++ * where k = bbr_extra_acked_win_rtts. ++ */ ++static u16 bbr_extra_acked(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return max(bbr->extra_acked[0], bbr->extra_acked[1]); ++} ++ ++/* Return rate in bytes per second, optionally with a gain. ++ * The order here is chosen carefully to avoid overflow of u64. This should ++ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
++ */ ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) ++{ ++ unsigned int mss = tcp_sk(sk)->mss_cache; ++ ++ rate *= mss; ++ rate *= gain; ++ rate >>= BBR_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); ++} ++ ++static u64 bbr_rate_kbps(struct sock *sk, u64 rate) ++{ ++ rate = bbr_bw_bytes_per_sec(sk, rate); ++ rate *= 8; ++ do_div(rate, 1000); ++ return rate; ++} ++ ++static u32 bbr_tso_segs_goal(struct sock *sk); ++static void bbr_debug(struct sock *sk, u32 acked, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ static const char ca_states[] = { ++ [TCP_CA_Open] = 'O', ++ [TCP_CA_Disorder] = 'D', ++ [TCP_CA_CWR] = 'C', ++ [TCP_CA_Recovery] = 'R', ++ [TCP_CA_Loss] = 'L', ++ }; ++ static const char mode[] = { ++ 'G', /* Growing - BBR_STARTUP */ ++ 'D', /* Drain - BBR_DRAIN */ ++ 'W', /* Window - BBR_PROBE_BW */ ++ 'M', /* Min RTT - BBR_PROBE_RTT */ ++ }; ++ static const char ack_phase[] = { /* bbr_ack_phase strings */ ++ 'I', /* BBR_ACKS_INIT - 'Init' */ ++ 'R', /* BBR_ACKS_REFILLING - 'Refilling' */ ++ 'B', /* BBR_ACKS_PROBE_STARTING - 'Before' */ ++ 'F', /* BBR_ACKS_PROBE_FEEDBACK - 'Feedback' */ ++ 'A', /* BBR_ACKS_PROBE_STOPPING - 'After' */ ++ }; ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 una = tp->snd_una - bbr->debug.snd_isn; ++ const u32 fack = tcp_highest_sack_seq(tp); ++ const u16 dport = ntohs(inet_sk(sk)->inet_dport); ++ bool is_port_match = (bbr_debug_port_mask && ++ ((dport & bbr_debug_port_mask) == 0)); ++ char debugmsg[320]; ++ ++ if (sk->sk_state == TCP_SYN_SENT) ++ return; /* no bbr_init() yet if SYN retransmit -> CA_Loss */ ++ ++ if (!tp->snd_cwnd || tp->snd_cwnd > bbr_cwnd_warn_val) { ++ char addr[INET6_ADDRSTRLEN + 10] = { 0 }; ++ ++ if (sk->sk_family == AF_INET) ++ snprintf(addr, sizeof(addr), "%pI4:%u", ++ &inet_sk(sk)->inet_daddr, dport); ++ else if (sk->sk_family == AF_INET6) ++ snprintf(addr, sizeof(addr), "%pI6:%u", ++ &sk->sk_v6_daddr, dport); ++ ++ WARN_ONCE(1, ++ "BBR %s cwnd alert: %u " ++ "snd_una: %u ca: %d pacing_gain: %u cwnd_gain: %u " ++ "bw: %u rtt: %u min_rtt: %u " ++ "acked: %u tso_segs: %u " ++ "bw: %d %ld %d pif: %u\n", ++ addr, tp->snd_cwnd, ++ una, inet_csk(sk)->icsk_ca_state, ++ bbr->pacing_gain, bbr->cwnd_gain, ++ bbr_max_bw(sk), (tp->srtt_us >> 3), bbr->min_rtt_us, ++ acked, bbr_tso_segs_goal(sk), ++ rs->delivered, rs->interval_us, rs->is_retrans, ++ tcp_packets_in_flight(tp)); ++ } ++ ++ if (likely(!bbr_debug_with_printk && !bbr_debug_ftrace)) ++ return; ++ ++ if (!sock_flag(sk, SOCK_DBG) && !is_port_match) ++ return; ++ ++ if (!ctx->log && !tp->app_limited && !(bbr_flags & FLAG_DEBUG_VERBOSE)) ++ return; ++ ++ if (ipv4_is_loopback(inet_sk(sk)->inet_daddr) && ++ !(bbr_flags & FLAG_DEBUG_LOOPBACK)) ++ return; ++ ++ snprintf(debugmsg, sizeof(debugmsg) - 1, ++ "BBR %pI4:%-5u %5u,%03u:%-7u %c " ++ "%c %2u br %2u cr %2d rtt %5ld d %2d i %5ld mrtt %d %cbw %llu " ++ "bw %llu lb %llu ib %llu qb %llu " ++ "a %u if %2u %c %c dl %u l %u al %u # %u t %u %c %c " ++ "lr %d er %d ea %d bwl %lld il %d ih %d c %d " ++ "v %d %c %u %c %s\n", ++ &inet_sk(sk)->inet_daddr, dport, ++ una / 1000, una % 1000, fack - tp->snd_una, ++ ca_states[inet_csk(sk)->icsk_ca_state], ++ bbr->debug.undo ? 
'@' : mode[bbr->mode], ++ tp->snd_cwnd, ++ bbr_extra_acked(sk), /* br (legacy): extra_acked */ ++ rs->tx_in_flight, /* cr (legacy): tx_inflight */ ++ rs->rtt_us, ++ rs->delivered, ++ rs->interval_us, ++ bbr->min_rtt_us, ++ rs->is_app_limited ? '_' : 'l', ++ bbr_rate_kbps(sk, ctx->sample_bw), /* lbw: latest sample bw */ ++ bbr_rate_kbps(sk, bbr_max_bw(sk)), /* bw: max bw */ ++ 0ULL, /* lb: [obsolete] */ ++ 0ULL, /* ib: [obsolete] */ ++ div_u64((u64)sk->sk_pacing_rate * 8, 1000), ++ acked, ++ tcp_packets_in_flight(tp), ++ rs->is_ack_delayed ? 'd' : '.', ++ bbr->round_start ? '*' : '.', ++ tp->delivered, tp->lost, ++ tp->app_limited, ++ 0, /* #: [obsolete] */ ++ ctx->target_cwnd, ++ tp->reord_seen ? 'r' : '.', /* r: reordering seen? */ ++ ca_states[bbr->prev_ca_state], ++ (rs->lost + rs->delivered) > 0 ? ++ (1000 * rs->lost / ++ (rs->lost + rs->delivered)) : 0, /* lr: loss rate x1000 */ ++ (rs->delivered) > 0 ? ++ (1000 * rs->delivered_ce / ++ (rs->delivered)) : 0, /* er: ECN rate x1000 */ ++ 1000 * bbr->ecn_alpha >> BBR_SCALE, /* ea: ECN alpha x1000 */ ++ bbr->bw_lo == ~0U ? ++ -1 : (s64)bbr_rate_kbps(sk, bbr->bw_lo), /* bwl */ ++ bbr->inflight_lo, /* il */ ++ bbr->inflight_hi, /* ih */ ++ bbr->bw_probe_up_cnt, /* c */ ++ 2, /* v: version */ ++ bbr->debug.event, ++ bbr->cycle_idx, ++ ack_phase[bbr->ack_phase], ++ bbr->bw_probe_samples ? "Y" : "N"); ++ debugmsg[sizeof(debugmsg) - 1] = 0; ++ ++ /* printk takes a higher precedence. */ ++ if (bbr_debug_with_printk) ++ printk(KERN_DEBUG "%s", debugmsg); ++ ++ if (unlikely(bbr->debug.undo)) ++ bbr->debug.undo = 0; ++} ++ ++/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ ++static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) ++{ ++ u64 rate = bw; ++ ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); ++ rate = min_t(u64, rate, sk->sk_max_pacing_rate); ++ return rate; ++} ++ ++/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++static void bbr_init_pacing_rate_from_rtt(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ u32 rtt_us; ++ ++ if (tp->srtt_us) { /* any RTT sample yet? */ ++ rtt_us = max(tp->srtt_us >> 3, 1U); ++ bbr->has_seen_rtt = 1; ++ } else { /* no RTT sample yet */ ++ rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ ++ } ++ bw = (u64)tp->snd_cwnd * BW_UNIT; ++ do_div(bw, rtt_us); ++ sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr->params.high_gain); ++} ++ ++/* Pace using current bw estimate and a gain factor. */ ++static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); ++ ++ if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) ++ bbr_init_pacing_rate_from_rtt(sk); ++ if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) ++ sk->sk_pacing_rate = rate; ++} ++ ++static u32 bbr_min_tso_segs(struct sock *sk) ++{ ++ return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; ++} ++ ++/* Return the number of segments BBR would like in a TSO/GSO skb, given ++ * a particular max gso size as a constraint. ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). 
*/ ++ bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr->params.tso_rtt_shift) { ++ r = bbr->min_rtt_us >> bbr->params.tso_rtt_shift; ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, div_u64(bytes, mss_now), bbr_min_tso_segs(sk)); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) ++{ ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); ++} ++ ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ ++static u32 bbr_tso_segs_goal(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); ++} ++ ++/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ ++static void bbr_save_cwnd(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) ++ bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ ++ else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ ++ bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); ++} ++ ++static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (event == CA_EVENT_TX_START && tp->app_limited) { ++ bbr->idle_restart = 1; ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ /* Avoid pointless buffer overflows: pace at est. bw if we don't ++ * need more speed (we're restarting from idle and app-limited). ++ */ ++ if (bbr->mode == BBR_PROBE_BW) ++ bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); ++ else if (bbr->mode == BBR_PROBE_RTT) ++ bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_ecn_enable && ++ bbr->params.precise_ece_ack) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ if (tp->fast_ack_mode == 2 && event == CA_EVENT_ECN_IS_CE) ++ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); ++ } ++} ++ ++/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: ++ * ++ * bdp = ceil(bw * min_rtt * gain) ++ * ++ * The key factor, gain, controls the amount of queue. While a small gain ++ * builds a smaller queue, it becomes more vulnerable to noise in RTT ++ * measurements (e.g., delayed ACKs or other ACK compression effects). This ++ * noise may cause BBR to under-estimate the rate. ++ */ ++static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; ++ u64 w; ++ ++ /* If we've never had a valid RTT sample, cap cwnd at the initial ++ * default. This should only happen when the connection is not using TCP ++ * timestamps and has retransmitted all of the SYN/SYNACK/data packets ++ * ACKed so far. In this case, an RTO can cut cwnd to 1, in which ++ * case we need to slow-start up toward something safe: initial cwnd. ++ */ ++ if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? 
*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ ++ ++ w = (u64)bw * bbr->min_rtt_us; ++ ++ /* Apply a gain to the given value, remove the BW_SCALE shift, and ++ * round the value up to avoid a negative feedback loop. ++ */ ++ bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; ++ ++ return bdp; ++} ++ ++/* To achieve full performance in high-speed paths, we budget enough cwnd to ++ * fit full-sized skbs in-flight on both end hosts to fully utilize the path: ++ * - one skb in sending host Qdisc, ++ * - one skb in sending host TSO/GSO engine ++ * - one skb being received by receiver host LRO/GRO/delayed-ACK engine ++ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because ++ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * which allows 2 outstanding 2-packet sequences, to try to keep pipe ++ * full even with ACK-every-other-packet delayed ACKs. ++ */ ++static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; ++ ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); ++ ++ /* Allow enough full-sized skbs in flight to utilize end systems. */ ++ if (bbr->params.cwnd_tso_budget == 1) { ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); ++ } else { ++ cwnd += tso_segs_goal; ++ cwnd = (cwnd + 1) & ~1U; ++ } ++ /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ cwnd += 2; ++ ++ return cwnd; ++} ++ ++/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */ ++static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) ++{ ++ u32 inflight; ++ ++ inflight = bbr_bdp(sk, bw, gain); ++ inflight = bbr_quantization_budget(sk, inflight); ++ ++ return inflight; ++} ++ ++/* With pacing at lower layers, there's often less data "in the network" than ++ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), ++ * we often have several skbs queued in the pacing layer with a pre-scheduled ++ * earliest departure time (EDT). BBR adapts its pacing rate based on the ++ * inflight level that it estimates has already been "baked in" by previous ++ * departure time decisions. We calculate a rough estimate of the number of our ++ * packets that might be in the network at the earliest departure time for the ++ * next skb scheduled: ++ * in_network_at_edt = inflight_at_edt - (EDT - now) * bw ++ * If we're increasing inflight, then we want to know if the transmit of the ++ * EDT skb will push inflight above the target, so inflight_at_edt includes ++ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight, ++ * then estimate if inflight will sink too low just before the EDT transmit. 
++ */ ++static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 now_ns, edt_ns, interval_us; ++ u32 interval_delivered, inflight_at_edt; ++ ++ now_ns = tp->tcp_clock_cache; ++ edt_ns = max(tp->tcp_wstamp_ns, now_ns); ++ interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC); ++ interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE; ++ inflight_at_edt = inflight_now; ++ if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */ ++ inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */ ++ if (interval_delivered >= inflight_at_edt) ++ return 0; ++ return inflight_at_edt - interval_delivered; ++} ++ ++/* Find the cwnd increment based on estimate of ack aggregation */ ++static u32 bbr_ack_aggregation_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 max_aggr_cwnd, aggr_cwnd = 0; ++ ++ if (bbr->params.extra_acked_gain && ++ (bbr_full_bw_reached(sk) || bbr->params.extra_acked_in_startup)) { ++ max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) ++ / BW_UNIT; ++ aggr_cwnd = (bbr->params.extra_acked_gain * bbr_extra_acked(sk)) ++ >> BBR_SCALE; ++ aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); ++ } ++ ++ return aggr_cwnd; ++} ++ ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->params.probe_rtt_cwnd_gain == 0) ++ return bbr->params.cwnd_min_target; ++ return max_t(u32, bbr->params.cwnd_min_target, ++ bbr_bdp(sk, bbr_bw(sk), bbr->params.probe_rtt_cwnd_gain)); ++} ++ ++/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss ++ * has drawn us down below target), or snap down to target if we're above it. ++ */ ++static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 target_cwnd = 0, prev_cwnd = tp->snd_cwnd, max_probe; ++ ++ if (!acked) ++ goto done; /* no packet fully ACKed; just apply caps */ ++ ++ target_cwnd = bbr_bdp(sk, bw, gain); ++ ++ /* Increment the cwnd to account for excess ACKed data that seems ++ * due to aggregation (of data and/or ACKs) visible in the ACK stream. ++ */ ++ target_cwnd += bbr_ack_aggregation_cwnd(sk); ++ target_cwnd = bbr_quantization_budget(sk, target_cwnd); ++ ++ /* If we're below target cwnd, slow start cwnd toward target cwnd. */ ++ bbr->debug.target_cwnd = target_cwnd; ++ ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } ++ ++ /* When growing cwnd, don't grow beyond twice what we just probed. 
*/ ++ if (bbr->params.usage_based_cwnd) { ++ max_probe = max(2 * tp->max_packets_out, tp->snd_cwnd); ++ cwnd = min(cwnd, max_probe); ++ } ++ ++ cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); ++done: ++ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ ++ if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ ++ tp->snd_cwnd = min_t(u32, tp->snd_cwnd, bbr_probe_rtt_cwnd(sk)); ++ ++ ctx->target_cwnd = target_cwnd; ++ ctx->log = (tp->snd_cwnd != prev_cwnd); ++} ++ ++/* See if we have reached next round trip */ ++static void bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->round_start = 0; ++ ++ /* See if we've reached the next RTT */ ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->round_start = 1; ++ } ++} ++ ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw = 0; ++ ++ /* Divide delivered by the interval to find a (lower bound) bottleneck ++ * bandwidth sample. Delivered is in packets and interval_us in uS and ++ * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. ++ */ ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); ++ } ++ ++ ctx->sample_bw = bw; ++ bbr->debug.rs_bw = bw; ++} ++ ++/* Estimates the windowed max degree of ack aggregation. ++ * This is used to provision extra in-flight data to keep sending during ++ * inter-ACK silences. ++ * ++ * Degree of ack aggregation is estimated as extra data acked beyond expected. ++ * ++ * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval" ++ * cwnd += max_extra_acked ++ * ++ * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). ++ * Max filter is an approximate sliding window of 5-10 (packet timed) round ++ * trips for non-startup phase, and 1-2 round trips for startup. ++ */ ++static void bbr_update_ack_aggregation(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ u32 epoch_us, expected_acked, extra_acked; ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr->params.extra_acked_win_rtts; ++ ++ if (!bbr->params.extra_acked_gain || rs->acked_sacked <= 0 || ++ rs->delivered < 0 || rs->interval_us <= 0) ++ return; ++ ++ if (bbr->round_start) { ++ bbr->extra_acked_win_rtts = min(0x1F, ++ bbr->extra_acked_win_rtts + 1); ++ if (bbr->params.extra_acked_in_startup && ++ !bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? ++ 0 : 1; ++ bbr->extra_acked[bbr->extra_acked_win_idx] = 0; ++ } ++ } ++ ++ /* Compute how many packets we expected to be delivered over epoch. 
*/ ++ epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp, ++ bbr->ack_epoch_mstamp); ++ expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT; ++ ++ /* Reset the aggregation epoch if ACK rate is below expected rate or ++ * significantly large no. of ack received since epoch (potentially ++ * quite old epoch). ++ */ ++ if (bbr->ack_epoch_acked <= expected_acked || ++ (bbr->ack_epoch_acked + rs->acked_sacked >= ++ bbr_ack_epoch_acked_reset_thresh)) { ++ bbr->ack_epoch_acked = 0; ++ bbr->ack_epoch_mstamp = tp->delivered_mstamp; ++ expected_acked = 0; ++ } ++ ++ /* Compute excess data delivered, beyond what was expected. */ ++ bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, ++ bbr->ack_epoch_acked + rs->acked_sacked); ++ extra_acked = bbr->ack_epoch_acked - expected_acked; ++ extra_acked = min(extra_acked, tp->snd_cwnd); ++ if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) ++ bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. ++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh; ++ ++ if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) ++ return; ++ ++ bw_thresh = (u64)bbr->full_bw * bbr->params.full_bw_thresh >> BBR_SCALE; ++ if (bbr_max_bw(sk) >= bw_thresh) { ++ bbr->full_bw = bbr_max_bw(sk); ++ bbr->full_bw_cnt = 0; ++ return; ++ } ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_reached = bbr->full_bw_cnt >= bbr->params.full_bw_cnt; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static bool bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr2_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) ++ return true; /* exiting DRAIN now */ ++ return false; ++} ++ ++static void bbr_check_probe_rtt_done(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (!(bbr->probe_rtt_done_stamp && ++ after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) ++ return; ++ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ ++ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); ++ bbr2_exit_probe_rtt(sk); ++} ++ ++/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and ++ * periodically drain the bottleneck queue, to converge to measure the true ++ * min_rtt (unloaded propagation delay). This allows the flows to keep queues ++ * small (reducing queuing delay and packet loss) and achieve fairness among ++ * BBR flows. 
++ * ++ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, ++ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. ++ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed ++ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and ++ * re-enter the previous mode. BBR uses 200ms to approximately bound the ++ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). ++ * ++ * Note that flows need only pay 2% if they are busy sending over the last 10 ++ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have ++ * natural silences or low-rate periods within 10 seconds where the rate is low ++ * enough for long enough to drain its queue in the bottleneck. We pick up ++ * these min RTT measurements opportunistically with our min_rtt filter. :-) ++ */ ++static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; ++ ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr->params.probe_rtt_win_ms); ++ probe_rtt_expired = after(tcp_jiffies32, expire); ++ if (rs->rtt_us >= 0 && ++ (rs->rtt_us <= bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr->params.min_rtt_win_sec * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; ++ } ++ ++ if (bbr->params.probe_rtt_mode_ms > 0 && probe_rtt_expired && ++ !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { ++ bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ ++ bbr_save_cwnd(sk); /* note cwnd so we can restore it */ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ } ++ ++ if (bbr->mode == BBR_PROBE_RTT) { ++ /* Ignore low rate samples during this mode. */ ++ tp->app_limited = ++ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; ++ /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ ++ if (!bbr->probe_rtt_done_stamp && ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { ++ bbr->probe_rtt_done_stamp = tcp_jiffies32 + ++ msecs_to_jiffies(bbr->params.probe_rtt_mode_ms); ++ bbr->probe_rtt_round_done = 0; ++ bbr->next_rtt_delivered = tp->delivered; ++ } else if (bbr->probe_rtt_done_stamp) { ++ if (bbr->round_start) ++ bbr->probe_rtt_round_done = 1; ++ if (bbr->probe_rtt_round_done) ++ bbr_check_probe_rtt_done(sk); ++ } ++ } ++ /* Restart after idle ends only once we process a new S/ACK for data */ ++ if (rs->delivered > 0) ++ bbr->idle_restart = 0; ++} ++ ++static void bbr_update_gains(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ bbr->pacing_gain = bbr->params.high_gain; ++ bbr->cwnd_gain = bbr->params.startup_cwnd_gain; ++ break; ++ case BBR_DRAIN: ++ bbr->pacing_gain = bbr->params.drain_gain; /* slow, to drain */ ++ bbr->cwnd_gain = bbr->params.startup_cwnd_gain; /* keep cwnd */ ++ break; ++ case BBR_PROBE_BW: ++ bbr->pacing_gain = bbr->params.pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr->params.cwnd_gain; ++ break; ++ case BBR_PROBE_RTT: ++ bbr->pacing_gain = BBR_UNIT; ++ bbr->cwnd_gain = BBR_UNIT; ++ break; ++ default: ++ WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode); ++ break; ++ } ++} ++ ++static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ int i; ++ ++ WARN_ON_ONCE(tp->snd_cwnd >= bbr_cwnd_warn_val); ++ ++ bbr->initialized = 1; ++ bbr->params.high_gain = min(0x7FF, bbr_high_gain); ++ bbr->params.drain_gain = min(0x3FF, bbr_drain_gain); ++ bbr->params.startup_cwnd_gain = min(0x7FF, bbr_startup_cwnd_gain); ++ bbr->params.cwnd_gain = min(0x7FF, bbr_cwnd_gain); ++ bbr->params.cwnd_tso_budget = min(0x1U, bbr_cwnd_tso_budget); ++ bbr->params.cwnd_min_target = min(0xFU, bbr_cwnd_min_target); ++ bbr->params.min_rtt_win_sec = min(0x1FU, bbr_min_rtt_win_sec); ++ bbr->params.probe_rtt_mode_ms = min(0x1FFU, bbr_probe_rtt_mode_ms); ++ bbr->params.full_bw_cnt = min(0x7U, bbr_full_bw_cnt); ++ bbr->params.full_bw_thresh = min(0x3FFU, bbr_full_bw_thresh); ++ bbr->params.extra_acked_gain = min(0x7FF, bbr_extra_acked_gain); ++ bbr->params.extra_acked_win_rtts = min(0x1FU, bbr_extra_acked_win_rtts); ++ bbr->params.drain_to_target = bbr_drain_to_target ? 1 : 0; ++ bbr->params.precise_ece_ack = bbr_precise_ece_ack ? 1 : 0; ++ bbr->params.extra_acked_in_startup = bbr_extra_acked_in_startup ? 1 : 0; ++ bbr->params.probe_rtt_cwnd_gain = min(0xFFU, bbr_probe_rtt_cwnd_gain); ++ bbr->params.probe_rtt_win_ms = ++ min(0x3FFFU, ++ min_t(u32, bbr_probe_rtt_win_ms, ++ bbr->params.min_rtt_win_sec * MSEC_PER_SEC)); ++ for (i = 0; i < CYCLE_LEN; i++) ++ bbr->params.pacing_gain[i] = min(0x3FF, bbr_pacing_gain[i]); ++ bbr->params.usage_based_cwnd = bbr_usage_based_cwnd ? 
1 : 0; ++ bbr->params.tso_rtt_shift = min(0xFU, bbr_tso_rtt_shift); ++ ++ bbr->debug.snd_isn = tp->snd_una; ++ bbr->debug.target_cwnd = 0; ++ bbr->debug.undo = 0; ++ ++ bbr->init_cwnd = min(0x7FU, tp->snd_cwnd); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = 0; ++ bbr->prev_ca_state = TCP_CA_Open; ++ bbr->packet_conservation = 0; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; ++ bbr->full_bw_cnt = 0; ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ bbr->mode = BBR_STARTUP; ++ bbr->debug.rs_bw = 0; ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++} ++ ++static u32 bbr_sndbuf_expand(struct sock *sk) ++{ ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; ++} ++ ++/* __________________________________________________________________________ ++ * ++ * Functions new to BBR v2 ("bbr") congestion control are below here. ++ * __________________________________________________________________________ ++ */ ++ ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr2_take_bw_hi_sample(struct sock *sk, u32 bw) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); ++} ++ ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr2_advance_bw_hi_filter(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} ++ ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr2_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); ++ ++ return min(bdp, tcp_sk(sk)->snd_cwnd); ++} ++ ++static bool bbr2_is_probing_bandwidth(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr2_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr2_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->full_bw_reached = 1; ++ bbr->inflight_hi = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. 
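++ * For example, with the module parameter defaults set later in this file
++ * (ecn_thresh = BBR_UNIT / 2, i.e. a 50% CE mark rate in BBR_SCALE fixed
++ * point, and full_ecn_cnt = 2), an ECN-eligible flow that sees at least
++ * half of its delivered packets CE-marked for two consecutive round
++ * trips exits STARTUP and pins inflight_hi to the current estimated BDP.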
*/ ++static void bbr2_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr->params.full_ecn_cnt || !bbr->params.ecn_thresh) ++ return; ++ ++ if (ce_ratio >= bbr->params.ecn_thresh) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr->params.full_ecn_cnt) { ++ bbr->debug.event = 'E'; /* ECN caused STARTUP exit */ ++ bbr2_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++static void bbr2_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ ++ if (bbr->params.ecn_factor == 0) ++ return; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_ecn_enable && ++ (bbr->min_rtt_us <= bbr->params.ecn_max_rtt_us || ++ !bbr->params.ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ gain = bbr->params.ecn_alpha_gain; ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr2_check_ecn_too_high_in_startup(sk, ce_ratio); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr2_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tp->snd_cwnd / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++ bbr->debug.event = 'G'; /* Grow inflight_hi slope */ ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr2_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tp->snd_cwnd < bbr->inflight_hi) { ++ bbr->bw_probe_up_acks = 0; /* don't accmulate unused credits */ ++ return; /* not fully using inflight_hi, so don't grow it */ ++ } ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->debug.event = 'I'; /* Increment inflight_hi */ ++ } ++ ++ if (bbr->round_start) ++ bbr2_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? 
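++ * With the defaults below (loss_thresh = BBR_UNIT * 2 / 100 = 5, ~2%,
++ * and ecn_thresh = BBR_UNIT / 2, 50%), an skb sent with
++ * tx_in_flight = 100 has a loss budget of (100 * 5) >> BBR_SCALE = 1
++ * packet, so two or more losses out of those 100 count as "too high",
++ * as do 50 or more CE marks out of 100 delivered packets.
++ *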
++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which ++ * uses it to notice when loss/ECN rates suggest inflight is too high. ++ */ ++static bool bbr2_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr->params.loss_thresh >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) ++ return true; ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && bbr->params.ecn_thresh) { ++ ecn_thresh = (u64)rs->delivered * bbr->params.ecn_thresh >> ++ BBR_SCALE; ++ if (rs->delivered_ce >= ecn_thresh) ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr2_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh = bbr->params.loss_thresh; ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (WARN_ONCE(inflight_prev < 0, ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg)) ++ return ~0U; ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ON_ONCE(lost_prev < 0)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. ++ */ ++static u32 bbr2_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr->params.inflight_headroom; ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr->params.cwnd_min_target); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). 
++ */ ++static void bbr2_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr2_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr->params.cwnd_min_target); ++ tp->snd_cwnd = min(cap, tp->snd_cwnd); ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. ++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr2_adapt_lower_bounds(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut, ecn_inflight_lo, beta; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr2_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && bbr->ecn_eligible && bbr->params.ecn_factor) { ++ /* Reduce inflight to (1 - alpha*ecn_factor). */ ++ ecn_cut = (BBR_UNIT - ++ ((bbr->ecn_alpha * bbr->params.ecn_factor) >> ++ BBR_SCALE)); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tp->snd_cwnd; ++ ecn_inflight_lo = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++ } else { ++ ecn_inflight_lo = ~0U; ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ /* Reduce bw and inflight to (1 - beta). */ ++ if (bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tp->snd_cwnd; ++ beta = bbr->params.beta; ++ bbr->bw_lo = ++ max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ bbr->inflight_lo = ++ max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss or ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. 
++ */ ++static void bbr2_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr2_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. ++ */ ++static void bbr2_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr2_take_bw_hi_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ /* Update rate and volume of delivered data from latest round trip: */ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (before(rs->prior_delivered, bbr->loss_round_delivered)) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr->loss_round_delivered = tp->delivered; /* mark round trip */ ++ bbr->loss_round_start = 1; ++ bbr2_adapt_lower_bounds(sk); ++ ++ /* Update windowed "latest" (single-round-trip) filters. */ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr2_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 inflight, rounds, reno_gain, reno_rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = bbr->params.bw_probe_max_rounds; ++ ++ reno_gain = bbr->params.bw_probe_reno_gain; ++ if (reno_gain) { ++ inflight = bbr2_target_inflight(sk); ++ reno_rounds = ((u64)inflight * reno_gain) >> BBR_SCALE; ++ rounds = min(rounds, reno_rounds); ++ } ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. 
++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr2_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr->params.bw_probe_rand_rounds); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr->params.bw_probe_base_us + ++ get_random_u32_below(bbr->params.bw_probe_rand_us); ++} ++ ++static void bbr2_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. ++ */ ++static void bbr2_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr2_reset_lower_bounds(sk); ++ if (bbr->inflight_hi != ~0U) ++ bbr->inflight_hi += bbr->params.refill_add_inc; ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr2_start_bw_probe_up(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr2_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. 
++ */ ++static void bbr2_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr2_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr2_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr2_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr2_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr->params.beta; ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ bbr->debug.event = 'L'; /* Loss/ECN too high */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr2_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr2_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. ++ */ ++static bool bbr2_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr2_advance_bw_hi_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. 
++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr->debug.event = 'R'; /* reprobe */ ++ bbr2_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ ++ if (bbr2_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr2_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ if (bbr->inflight_hi == ~0U) /* no excess queue signals yet? */ ++ return false; ++ ++ /* To be resilient to random loss, we must raise inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ bbr->debug.event = 'U'; /* raise up inflight_hi */ ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr2_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr2_check_time_to_probe_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case a burst of cross-traffic has ceased and freed up bw, ++ * or in case we are sharing with multiplicatively probing traffic). ++ */ ++ if (bbr->params.ecn_reprobe_gain && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ bbr->debug.event = 'A'; /* *A*ll clear to probe *A*gain */ ++ /* Calculate n so that when bbr2_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr->params.ecn_reprobe_gain) >> BBR_SCALE)); ++ bbr2_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr2_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr2_is_reno_coexistence_probe_time(sk)) { ++ bbr2_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr2_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_under_bdp, is_long_enough; ++ ++ /* Always need to pull inflight down to leave headroom in queue. */ ++ if (inflight > bbr2_inflight_with_headroom(sk)) ++ return false; ++ ++ is_under_bdp = inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++ if (bbr->params.drain_to_target) ++ return is_under_bdp; ++ ++ is_long_enough = bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us); ++ return is_under_bdp || is_long_enough; ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr2_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_risky = false, is_queuing = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. 
*/ ++ if (bbr2_adapt_upper_bounds(sk, rs)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr2_check_time_to_probe_bw(sk)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr2_start_bw_probe_up(sk); ++ } ++ break; ++ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (checked here) ++ * (2) We have probed for at least 1*min_rtt_us, and the ++ * estimated queue is high enough (inflight > 1.25 * estimated_bdp). ++ * (checked here) ++ * (3) Loss filter says loss rate is "too high". ++ * (checked in bbr_is_inflight_too_high()) ++ * (4) ECN filter says ECN mark rate is "too high". ++ * (checked in bbr_is_inflight_too_high()) ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_risky = true; ++ bbr->debug.event = 'D'; /* D for danger */ ++ } else if (bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us) && ++ inflight >= ++ bbr_inflight(sk, bw, ++ bbr->params.bw_probe_pif_gain)) { ++ is_queuing = true; ++ bbr->debug.event = 'Q'; /* building Queue */ ++ } ++ if (is_risky || is_queuing) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr2_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. 
++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr2_check_time_to_probe_bw(sk)) ++ return; /* already decided state transition */ ++ if (bbr2_check_time_to_cruise(sk, inflight, bw)) ++ bbr2_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr2_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr2_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr2_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr2_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr2_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr->params.full_loss_cnt && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr->params.full_loss_cnt && ++ bbr2_is_inflight_too_high(sk, rs)) { ++ bbr->debug.event = 'P'; /* Packet loss caused STARTUP exit */ ++ bbr2_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* If we are done draining, advance into steady state operation in PROBE_BW. */ ++static void bbr2_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_check_drain(sk, rs, ctx)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr2_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr2_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr2_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr2_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs); ++ bbr2_check_drain(sk, rs, ctx); ++ bbr2_update_cycle_phase(sk, rs); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. 
++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. ++ */ ++static bool bbr2_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr->params.fast_path && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr2_check_drain(sk, rs, ctx); ++ bbr2_update_cycle_phase(sk, rs); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) ++ return true; ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++static void bbr2_main(struct sock *sk, const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw; ++ ++ bbr->debug.event = '.'; /* init to default NOP (no event yet) */ ++ ++ bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ bbr2_update_ecn_alpha(sk); ++ } ++ ++ bbr->ecn_in_round |= rs->is_ece; ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ ++ if (bbr2_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr2_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tp->snd_cwnd, &ctx); ++ bbr2_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++ ++ bbr_debug(sk, rs->acked_sacked, rs, &ctx); ++} ++ ++/* Module parameters that are settable by TCP_CONGESTION_PARAMS are declared ++ * down here, so that the algorithm functions that use the parameters must use ++ * the per-socket parameters; if they accidentally use the global version ++ * then there will be a compile error. ++ * TODO(ncardwell): move all per-socket parameters down to this section. ++ */ ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. Max allwed value is 255. ++ */ ++static u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE. ++ * Max allowed value is 255. ++ */ ++static u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; /* 1/16 = 6.25% */ ++ ++/* The initial value for the ecn_alpha state variable. Default and max ++ * BBR_UNIT (256), representing 1.0. This allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. 
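++ * On each round trip with newly delivered data, bbr2_update_ecn_alpha()
++ * moves ecn_alpha toward the observed CE ratio with gain ecn_alpha_gain
++ * (1/16 by default): ecn_alpha = (15/16) * ecn_alpha + (1/16) * ce_ratio,
++ * all in BBR_SCALE fixed point; when not probing for bandwidth,
++ * inflight_lo is then reduced to (1 - ecn_factor * ecn_alpha) of its value.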
++ */ ++static u32 bbr_ecn_alpha_init = BBR_UNIT; /* 1.0, to respond quickly */ ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. Max allwed value is 255. ++ */ ++static u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. Max allowed is 255. ++ */ ++static u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ * Max allowed for the parameter is 524287 (0x7ffff) us, ~524 ms. ++ */ ++static u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then use a multiplicative increase to quickly reprobe bw by ++ * starting inflight probing at the given multiple of inflight_hi. ++ * Default for this experimental knob is 0 (disabled). ++ * Planned value for experiments: BBR_UNIT * 1 / 2 = 128, representing 0.5. ++ */ ++static u32 bbr_ecn_reprobe_gain; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. Max allowed value is 15 (0xF). ++ */ ++static u32 bbr_full_loss_cnt = 8; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. Max allowed value is 3. ++ */ ++static u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* Multiplier to get target inflight (as multiple of BDP) for PROBE_UP phase. ++ * Default is 1.25x, as in BBR v1. Max allowed is 511. ++ */ ++static u32 bbr_bw_probe_pif_gain = BBR_UNIT * 5 / 4; ++ ++/* Multiplier to get Reno-style probe epoch duration as: k * BDP round trips. ++ * If zero, disables this BBR v2 Reno-style BDP-scaled coexistence mechanism. ++ * Max allowed is 511. ++ */ ++static u32 bbr_bw_probe_reno_gain = BBR_UNIT; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ * Max value is 15. ++ */ ++static u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Undo the model changes made in loss recovery if recovery was spurious? */ ++static bool bbr_undo = true; ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? 
*/ ++static bool bbr_fast_path = true; /* default: enabled */ ++ ++/* Use fast ack mode ? */ ++static int bbr_fast_ack_mode = 1; /* default: rwnd check off */ ++ ++/* How much to additively increase inflight_hi when entering REFILL? */ ++static u32 bbr_refill_add_inc; /* default: disabled */ ++ ++module_param_named(beta, bbr_beta, uint, 0644); ++module_param_named(ecn_alpha_gain, bbr_ecn_alpha_gain, uint, 0644); ++module_param_named(ecn_alpha_init, bbr_ecn_alpha_init, uint, 0644); ++module_param_named(ecn_factor, bbr_ecn_factor, uint, 0644); ++module_param_named(ecn_thresh, bbr_ecn_thresh, uint, 0644); ++module_param_named(ecn_max_rtt_us, bbr_ecn_max_rtt_us, uint, 0644); ++module_param_named(ecn_reprobe_gain, bbr_ecn_reprobe_gain, uint, 0644); ++module_param_named(loss_thresh, bbr_loss_thresh, uint, 0664); ++module_param_named(full_loss_cnt, bbr_full_loss_cnt, uint, 0664); ++module_param_named(full_ecn_cnt, bbr_full_ecn_cnt, uint, 0664); ++module_param_named(inflight_headroom, bbr_inflight_headroom, uint, 0664); ++module_param_named(bw_probe_pif_gain, bbr_bw_probe_pif_gain, uint, 0664); ++module_param_named(bw_probe_reno_gain, bbr_bw_probe_reno_gain, uint, 0664); ++module_param_named(bw_probe_max_rounds, bbr_bw_probe_max_rounds, uint, 0664); ++module_param_named(bw_probe_rand_rounds, bbr_bw_probe_rand_rounds, uint, 0664); ++module_param_named(bw_probe_base_us, bbr_bw_probe_base_us, uint, 0664); ++module_param_named(bw_probe_rand_us, bbr_bw_probe_rand_us, uint, 0664); ++module_param_named(undo, bbr_undo, bool, 0664); ++module_param_named(fast_path, bbr_fast_path, bool, 0664); ++module_param_named(fast_ack_mode, bbr_fast_ack_mode, uint, 0664); ++module_param_named(refill_add_inc, bbr_refill_add_inc, uint, 0664); ++ ++static void bbr2_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_init(sk); /* run shared init code for v1 and v2 */ ++ ++ /* BBR v2 parameters: */ ++ bbr->params.beta = min_t(u32, 0xFFU, bbr_beta); ++ bbr->params.ecn_alpha_gain = min_t(u32, 0xFFU, bbr_ecn_alpha_gain); ++ bbr->params.ecn_alpha_init = min_t(u32, BBR_UNIT, bbr_ecn_alpha_init); ++ bbr->params.ecn_factor = min_t(u32, 0xFFU, bbr_ecn_factor); ++ bbr->params.ecn_thresh = min_t(u32, 0xFFU, bbr_ecn_thresh); ++ bbr->params.ecn_max_rtt_us = min_t(u32, 0x7ffffU, bbr_ecn_max_rtt_us); ++ bbr->params.ecn_reprobe_gain = min_t(u32, 0x1FF, bbr_ecn_reprobe_gain); ++ bbr->params.loss_thresh = min_t(u32, 0xFFU, bbr_loss_thresh); ++ bbr->params.full_loss_cnt = min_t(u32, 0xFU, bbr_full_loss_cnt); ++ bbr->params.full_ecn_cnt = min_t(u32, 0x3U, bbr_full_ecn_cnt); ++ bbr->params.inflight_headroom = ++ min_t(u32, 0xFFU, bbr_inflight_headroom); ++ bbr->params.bw_probe_pif_gain = ++ min_t(u32, 0x1FFU, bbr_bw_probe_pif_gain); ++ bbr->params.bw_probe_reno_gain = ++ min_t(u32, 0x1FFU, bbr_bw_probe_reno_gain); ++ bbr->params.bw_probe_max_rounds = ++ min_t(u32, 0xFFU, bbr_bw_probe_max_rounds); ++ bbr->params.bw_probe_rand_rounds = ++ min_t(u32, 0xFU, bbr_bw_probe_rand_rounds); ++ bbr->params.bw_probe_base_us = ++ min_t(u32, (1 << 26) - 1, bbr_bw_probe_base_us); ++ bbr->params.bw_probe_rand_us = ++ min_t(u32, (1 << 26) - 1, bbr_bw_probe_rand_us); ++ bbr->params.undo = bbr_undo; ++ bbr->params.fast_path = bbr_fast_path ? 
1 : 0; ++ bbr->params.refill_add_inc = min_t(u32, 0x3U, bbr_refill_add_inc); ++ ++ /* BBR v2 state: */ ++ bbr->initialized = 1; ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 0; ++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr2_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr->params.ecn_alpha_init; ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ ++ tp->fast_ack_mode = min_t(u32, 0x2U, bbr_fast_ack_mode); ++ ++ if ((tp->ecn_flags & TCP_ECN_OK) && bbr_ecn_enable) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++static void bbr2_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs; ++ ++ /* Capture "current" data over the full round trip of loss, ++ * to have a better chance to see the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ memset(&rs, 0, sizeof(rs)); ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr2_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr2_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr2_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. */ ++static u32 bbr2_undo_cwnd(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->debug.undo = 1; ++ bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ bbr->full_bw_cnt = 0; ++ bbr->loss_in_round = 0; ++ ++ if (!bbr->params.undo) ++ return tp->snd_cwnd; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ ++static u32 bbr2_ssthresh(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. 
*/ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; ++ return tcp_sk(sk)->snd_ssthresh; ++} ++ ++static enum tcp_bbr2_phase bbr2_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR2_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR2_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR2_PHASE_PROBE_RTT; ++ default: ++ return BBR2_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR2_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR2_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR2_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR2_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR2_PHASE_INVALID; ++ } ++} ++ ++static size_t bbr2_get_info(struct sock *sk, u32 ext, int *attr, ++ union tcp_cc_info *info) ++{ ++ if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || ++ ext & (1 << (INET_DIAG_VEGASINFO - 1))) { ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? ++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ ++ memset(&info->bbr2, 0, sizeof(info->bbr2)); ++ info->bbr2.bbr_bw_lsb = (u32)bw; ++ info->bbr2.bbr_bw_msb = (u32)(bw >> 32); ++ info->bbr2.bbr_min_rtt = bbr->min_rtt_us; ++ info->bbr2.bbr_pacing_gain = bbr->pacing_gain; ++ info->bbr2.bbr_cwnd_gain = bbr->cwnd_gain; ++ info->bbr2.bbr_bw_hi_lsb = (u32)bw_hi; ++ info->bbr2.bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ info->bbr2.bbr_bw_lo_lsb = (u32)bw_lo; ++ info->bbr2.bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ info->bbr2.bbr_mode = bbr->mode; ++ info->bbr2.bbr_phase = (__u8)bbr2_get_phase(bbr); ++ info->bbr2.bbr_version = (__u8)2; ++ info->bbr2.bbr_inflight_lo = bbr->inflight_lo; ++ info->bbr2.bbr_inflight_hi = bbr->inflight_hi; ++ info->bbr2.bbr_extra_acked = bbr_extra_acked(sk); ++ *attr = INET_DIAG_BBRINFO; ++ return sizeof(info->bbr2); ++ } ++ return 0; ++} ++ ++static void bbr2_set_state(struct sock *sk, u8 new_state) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (new_state == TCP_CA_Loss) { ++ struct rate_sample rs = { .losses = 1 }; ++ struct bbr_context ctx = { 0 }; ++ ++ bbr->prev_ca_state = TCP_CA_Loss; ++ bbr->full_bw = 0; ++ if (!bbr2_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tp->snd_cwnd, bbr->prior_cwnd); ++ } ++ bbr_debug(sk, 0, &rs, &ctx); ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++ } ++} ++ ++static struct tcp_congestion_ops tcp_bbr2_cong_ops __read_mostly = { ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, ++ .name = "bbr2", ++ .owner = THIS_MODULE, ++ .init = bbr2_init, ++ .cong_control = bbr2_main, ++ .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr2_skb_marked_lost, ++ .undo_cwnd = bbr2_undo_cwnd, ++ .cwnd_event = bbr_cwnd_event, ++ .ssthresh = bbr2_ssthresh, ++ .tso_segs = bbr_tso_segs, ++ .get_info = bbr2_get_info, ++ .set_state = bbr2_set_state, ++}; ++ ++static int __init bbr_register(void) ++{ ++ BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); ++ return tcp_register_congestion_control(&tcp_bbr2_cong_ops); ++} ++ ++static 
void __exit bbr_unregister(void) ++{ ++ tcp_unregister_congestion_control(&tcp_bbr2_cong_ops); ++} ++ ++module_init(bbr_register); ++module_exit(bbr_unregister); ++ ++MODULE_AUTHOR("Van Jacobson "); ++MODULE_AUTHOR("Neal Cardwell "); ++MODULE_AUTHOR("Yuchung Cheng "); ++MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++ ++MODULE_LICENSE("Dual BSD/GPL"); ++MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index d3cae40749e8..0f268f2ff2e9 100644 +--- a/net/ipv4/tcp_cong.c ++++ b/net/ipv4/tcp_cong.c +@@ -189,6 +189,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index cc072d2cfcd8..754e0212c951 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -349,7 +349,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -360,7 +360,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1079,7 +1079,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1460,6 +1465,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3813,6 +3829,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. 
+@@ -3911,6 +3928,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -5521,13 +5539,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... */ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 71d01cf3c13e..0da3da9e56db 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -375,7 +375,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1530,7 +1531,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int nsize, old_factor; ++ int nsize, old_factor, inflight_prev; + long limit; + int nlen; + u8 flags; +@@ -1607,6 +1608,15 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ /* Set buff tx.in_flight as if buff were sent by itself. */ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (WARN_ONCE(inflight_prev < 0, ++ "inconsistent: tx.in_flight: %u old_factor: %d", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor)) ++ inflight_prev = 0; ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -1990,13 +2000,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs)); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2632,6 +2641,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..a8b4c9504570 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; + rs->last_end_seq = scb->end_seq; ++ rs->tx_in_flight = scb->tx.in_flight; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. 
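The tcp_rate.c hunks above switch the rate-sampling interval math to 32-bit timestamp deltas (the tcp_stamp32_us_delta call sites replacing tcp_stamp_us_delta). The self-contained sketch below models one plausible wrap-safe, zero-saturating 32-bit delta of the kind such a helper needs; it is an illustrative userspace model under a two's-complement assumption, not the kernel helper itself.

#include <stdint.h>
#include <stdio.h>

/* Sketch: subtract in unsigned 32-bit space (so clock wraparound cancels
 * out), then reinterpret as signed and clamp negative results to zero.
 * Assumes two's-complement representation, as on all Linux targets. */
static uint32_t stamp32_us_delta(uint32_t t1, uint32_t t0)
{
    int32_t d = (int32_t)(t1 - t0);

    return d > 0 ? (uint32_t)d : 0;
}

int main(void)
{
    uint32_t t0 = 0xFFFFFFF0u;  /* just before the 32-bit microsecond clock wraps */
    uint32_t t1 = 0x00000010u;  /* just after the wrap */

    printf("delta across wrap: %u us\n", stamp32_us_delta(t1, t0)); /* 32 */
    printf("reordered stamps:  %u us\n", stamp32_us_delta(t0, t1)); /* 0  */
    return 0;
}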
+ */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index cb79127f45c3..70e4de876a7f 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -605,6 +605,7 @@ void tcp_write_timer_handler(struct sock *sk) + return; + } + ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.39.2 + +From 4b786f8ae226132e5faf03acd49e1ea6ae5e8d9a Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 13 Feb 2023 09:23:53 +0100 +Subject: [PATCH 02/15] bfq + +Signed-off-by: Peter Jung +--- + block/bfq-cgroup.c | 101 ++++--- + block/bfq-iosched.c | 629 ++++++++++++++++++++++++++++-------------- + block/bfq-iosched.h | 144 +++++++--- + block/bfq-wf2q.c | 2 +- + block/blk-cgroup.c | 122 ++++---- + block/blk-cgroup.h | 10 +- + block/blk-iocost.c | 58 ++-- + block/blk-iolatency.c | 39 ++- + block/blk-rq-qos.h | 2 +- + block/blk-throttle.c | 16 +- + block/blk.h | 6 - + 11 files changed, 743 insertions(+), 386 deletions(-) + +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c +index 0fbde0fc0628..59929dfd559b 100644 +--- a/block/bfq-cgroup.c ++++ b/block/bfq-cgroup.c +@@ -706,12 +706,52 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bfq_activate_bfqq(bfqd, bfqq); + } + +- if (!bfqd->in_service_queue && !bfqd->rq_in_driver) ++ if (!bfqd->in_service_queue && !bfqd->tot_rq_in_driver) + bfq_schedule_dispatch(bfqd); + /* release extra ref taken above, bfqq may happen to be freed now */ + bfq_put_queue(bfqq); + } + ++static void bfq_sync_bfqq_move(struct bfq_data *bfqd, ++ struct bfq_queue *sync_bfqq, ++ struct bfq_io_cq *bic, ++ struct bfq_group *bfqg, ++ unsigned int act_idx) ++{ ++ struct bfq_queue *bfqq; ++ ++ if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { ++ /* We are the only user of this bfqq, just move it */ ++ if (sync_bfqq->entity.sched_data != &bfqg->sched_data) ++ bfq_bfqq_move(bfqd, sync_bfqq, bfqg); ++ return; ++ } ++ ++ /* ++ * The queue was merged to a different queue. Check ++ * that the merge chain still belongs to the same ++ * cgroup. ++ */ ++ for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) ++ if (bfqq->entity.sched_data != &bfqg->sched_data) ++ break; ++ if (bfqq) { ++ /* ++ * Some queue changed cgroup so the merge is not valid ++ * anymore. We cannot easily just cancel the merge (by ++ * clearing new_bfqq) as there may be other processes ++ * using this queue and holding refs to all queues ++ * below sync_bfqq->new_bfqq. Similarly if the merge ++ * already happened, we need to detach from bfqq now ++ * so that we cannot merge bio to a request from the ++ * old cgroup. ++ */ ++ bfq_put_cooperator(sync_bfqq); ++ bic_set_bfqq(bic, NULL, true, act_idx); ++ bfq_release_process_ref(bfqd, sync_bfqq); ++ } ++} ++ + /** + * __bfq_bic_change_cgroup - move @bic to @bfqg. + * @bfqd: the queue descriptor. 
+@@ -726,53 +766,20 @@ static void __bfq_bic_change_cgroup(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct bfq_group *bfqg) + { +- struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false); +- struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true); +- struct bfq_entity *entity; ++ unsigned int act_idx; + +- if (async_bfqq) { +- entity = &async_bfqq->entity; ++ for (act_idx = 0; act_idx < bfqd->num_actuators; act_idx++) { ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false, act_idx); ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true, act_idx); + +- if (entity->sched_data != &bfqg->sched_data) { +- bic_set_bfqq(bic, NULL, false); ++ if (async_bfqq && ++ async_bfqq->entity.sched_data != &bfqg->sched_data) { ++ bic_set_bfqq(bic, NULL, false, act_idx); + bfq_release_process_ref(bfqd, async_bfqq); + } +- } + +- if (sync_bfqq) { +- if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { +- /* We are the only user of this bfqq, just move it */ +- if (sync_bfqq->entity.sched_data != &bfqg->sched_data) +- bfq_bfqq_move(bfqd, sync_bfqq, bfqg); +- } else { +- struct bfq_queue *bfqq; +- +- /* +- * The queue was merged to a different queue. Check +- * that the merge chain still belongs to the same +- * cgroup. +- */ +- for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) +- if (bfqq->entity.sched_data != +- &bfqg->sched_data) +- break; +- if (bfqq) { +- /* +- * Some queue changed cgroup so the merge is +- * not valid anymore. We cannot easily just +- * cancel the merge (by clearing new_bfqq) as +- * there may be other processes using this +- * queue and holding refs to all queues below +- * sync_bfqq->new_bfqq. Similarly if the merge +- * already happened, we need to detach from +- * bfqq now so that we cannot merge bio to a +- * request from the old cgroup. 
+- */ +- bfq_put_cooperator(sync_bfqq); +- bic_set_bfqq(bic, NULL, true); +- bfq_release_process_ref(bfqd, sync_bfqq); +- } +- } ++ if (sync_bfqq) ++ bfq_sync_bfqq_move(bfqd, sync_bfqq, bic, bfqg, act_idx); + } + } + +@@ -1106,9 +1113,11 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, + struct bfq_group *bfqg; + u64 v; + +- ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, buf, &ctx); ++ blkg_conf_init(&ctx, buf); ++ ++ ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, &ctx); + if (ret) +- return ret; ++ goto out; + + if (sscanf(ctx.body, "%llu", &v) == 1) { + /* require "default" on dfl */ +@@ -1130,7 +1139,7 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, + ret = 0; + } + out: +- blkg_conf_finish(&ctx); ++ blkg_conf_exit(&ctx); + return ret ?: nbytes; + } + +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c +index 380e9bda2e57..c330ff5fde4c 100644 +--- a/block/bfq-iosched.c ++++ b/block/bfq-iosched.c +@@ -377,20 +377,23 @@ static const unsigned long bfq_late_stable_merging = 600; + #define RQ_BIC(rq) ((struct bfq_io_cq *)((rq)->elv.priv[0])) + #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) + +-struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) ++struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, ++ unsigned int actuator_idx) + { +- return bic->bfqq[is_sync]; ++ if (is_sync) ++ return bic->bfqq[1][actuator_idx]; ++ ++ return bic->bfqq[0][actuator_idx]; + } + + static void bfq_put_stable_ref(struct bfq_queue *bfqq); + +-void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) ++void bic_set_bfqq(struct bfq_io_cq *bic, ++ struct bfq_queue *bfqq, ++ bool is_sync, ++ unsigned int actuator_idx) + { +- struct bfq_queue *old_bfqq = bic->bfqq[is_sync]; +- +- /* Clear bic pointer if bfqq is detached from this bic */ +- if (old_bfqq && old_bfqq->bic == bic) +- old_bfqq->bic = NULL; ++ struct bfq_queue *old_bfqq = bic->bfqq[is_sync][actuator_idx]; + + /* + * If bfqq != NULL, then a non-stable queue merge between +@@ -405,9 +408,18 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) + * we cancel the stable merge if + * bic->stable_merge_bfqq == bfqq. + */ +- bic->bfqq[is_sync] = bfqq; ++ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[actuator_idx]; ++ ++ /* Clear bic pointer if bfqq is detached from this bic */ ++ if (old_bfqq && old_bfqq->bic == bic) ++ old_bfqq->bic = NULL; + +- if (bfqq && bic->stable_merge_bfqq == bfqq) { ++ if (is_sync) ++ bic->bfqq[1][actuator_idx] = bfqq; ++ else ++ bic->bfqq[0][actuator_idx] = bfqq; ++ ++ if (bfqq && bfqq_data->stable_merge_bfqq == bfqq) { + /* + * Actually, these same instructions are executed also + * in bfq_setup_cooperator, in case of abort or actual +@@ -416,9 +428,9 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) + * did so, we would nest even more complexity in this + * function. + */ +- bfq_put_stable_ref(bic->stable_merge_bfqq); ++ bfq_put_stable_ref(bfqq_data->stable_merge_bfqq); + +- bic->stable_merge_bfqq = NULL; ++ bfqq_data->stable_merge_bfqq = NULL; + } + } + +@@ -678,9 +690,9 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) + { + struct bfq_data *bfqd = data->q->elevator->elevator_data; + struct bfq_io_cq *bic = bfq_bic_lookup(data->q); +- struct bfq_queue *bfqq = bic ? 
bic_to_bfqq(bic, op_is_sync(opf)) : NULL; + int depth; + unsigned limit = data->q->nr_requests; ++ unsigned int act_idx; + + /* Sync reads have full depth available */ + if (op_is_sync(opf) && !op_is_write(opf)) { +@@ -690,14 +702,21 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) + limit = (limit * depth) >> bfqd->full_depth_shift; + } + +- /* +- * Does queue (or any parent entity) exceed number of requests that +- * should be available to it? Heavily limit depth so that it cannot +- * consume more available requests and thus starve other entities. +- */ +- if (bfqq && bfqq_request_over_limit(bfqq, limit)) +- depth = 1; ++ for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { ++ struct bfq_queue *bfqq = ++ bic_to_bfqq(bic, op_is_sync(opf), act_idx); + ++ /* ++ * Does queue (or any parent entity) exceed number of ++ * requests that should be available to it? Heavily ++ * limit depth so that it cannot consume more ++ * available requests and thus starve other entities. ++ */ ++ if (bfqq && bfqq_request_over_limit(bfqq, limit)) { ++ depth = 1; ++ break; ++ } ++ } + bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", + __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth); + if (depth) +@@ -1074,9 +1093,6 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) + { + u64 dur; + +- if (bfqd->bfq_wr_max_time > 0) +- return bfqd->bfq_wr_max_time; +- + dur = bfqd->rate_dur_prod; + do_div(dur, bfqd->peak_rate); + +@@ -1118,36 +1134,39 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, + { + unsigned int old_wr_coeff = 1; + bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); ++ unsigned int a_idx = bfqq->actuator_idx; ++ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; + +- if (bic->saved_has_short_ttime) ++ if (bfqq_data->saved_has_short_ttime) + bfq_mark_bfqq_has_short_ttime(bfqq); + else + bfq_clear_bfqq_has_short_ttime(bfqq); + +- if (bic->saved_IO_bound) ++ if (bfqq_data->saved_IO_bound) + bfq_mark_bfqq_IO_bound(bfqq); + else + bfq_clear_bfqq_IO_bound(bfqq); + +- bfqq->last_serv_time_ns = bic->saved_last_serv_time_ns; +- bfqq->inject_limit = bic->saved_inject_limit; +- bfqq->decrease_time_jif = bic->saved_decrease_time_jif; ++ bfqq->last_serv_time_ns = bfqq_data->saved_last_serv_time_ns; ++ bfqq->inject_limit = bfqq_data->saved_inject_limit; ++ bfqq->decrease_time_jif = bfqq_data->saved_decrease_time_jif; + +- bfqq->entity.new_weight = bic->saved_weight; +- bfqq->ttime = bic->saved_ttime; +- bfqq->io_start_time = bic->saved_io_start_time; +- bfqq->tot_idle_time = bic->saved_tot_idle_time; ++ bfqq->entity.new_weight = bfqq_data->saved_weight; ++ bfqq->ttime = bfqq_data->saved_ttime; ++ bfqq->io_start_time = bfqq_data->saved_io_start_time; ++ bfqq->tot_idle_time = bfqq_data->saved_tot_idle_time; + /* + * Restore weight coefficient only if low_latency is on + */ + if (bfqd->low_latency) { + old_wr_coeff = bfqq->wr_coeff; +- bfqq->wr_coeff = bic->saved_wr_coeff; ++ bfqq->wr_coeff = bfqq_data->saved_wr_coeff; + } +- bfqq->service_from_wr = bic->saved_service_from_wr; +- bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; +- bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; +- bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; ++ bfqq->service_from_wr = bfqq_data->saved_service_from_wr; ++ bfqq->wr_start_at_switch_to_srt = ++ bfqq_data->saved_wr_start_at_switch_to_srt; ++ bfqq->last_wr_start_finish = bfqq_data->saved_last_wr_start_finish; ++ bfqq->wr_cur_max_time = 
bfqq_data->saved_wr_cur_max_time; + + if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || + time_is_before_jiffies(bfqq->last_wr_start_finish + +@@ -1766,6 +1785,33 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, + return bfqq_weight > in_serv_weight; + } + ++/* ++ * Get the index of the actuator that will serve bio. ++ */ ++static unsigned int bfq_actuator_index(struct bfq_data *bfqd, struct bio *bio) ++{ ++ unsigned int i; ++ sector_t end; ++ ++ /* no search needed if one or zero ranges present */ ++ if (bfqd->num_actuators == 1) ++ return 0; ++ ++ /* bio_end_sector(bio) gives the sector after the last one */ ++ end = bio_end_sector(bio) - 1; ++ ++ for (i = 0; i < bfqd->num_actuators; i++) { ++ if (end >= bfqd->sector[i] && ++ end < bfqd->sector[i] + bfqd->nr_sectors[i]) ++ return i; ++ } ++ ++ WARN_ONCE(true, ++ "bfq_actuator_index: bio sector out of ranges: end=%llu\n", ++ end); ++ return 0; ++} ++ + static bool bfq_better_to_idle(struct bfq_queue *bfqq); + + static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, +@@ -1785,7 +1831,9 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, + arrived_in_time = ktime_get_ns() <= + bfqq->ttime.last_end_request + + bfqd->bfq_slice_idle * 3; +- ++ unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio); ++ bool bfqq_non_merged_or_stably_merged = ++ bfqq->bic || RQ_BIC(rq)->bfqq_data[act_idx].stably_merged; + + /* + * bfqq deserves to be weight-raised if: +@@ -1819,9 +1867,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, + */ + wr_or_deserves_wr = bfqd->low_latency && + (bfqq->wr_coeff > 1 || +- (bfq_bfqq_sync(bfqq) && +- (bfqq->bic || RQ_BIC(rq)->stably_merged) && +- (*interactive || soft_rt))); ++ (bfq_bfqq_sync(bfqq) && bfqq_non_merged_or_stably_merged && ++ (*interactive || soft_rt))); + + /* + * Using the last flag, update budget and check whether bfqq +@@ -2098,7 +2145,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, + * We reset waker detection logic also if too much time has passed + * since the first detection. If wakeups are rare, pointless idling + * doesn't hurt throughput that much. The condition below makes sure +- * we do not uselessly idle blocking waker in more than 1/64 cases. ++ * we do not uselessly idle blocking waker in more than 1/64 cases. + */ + if (bfqd->last_completed_rq_bfqq != + bfqq->tentative_waker_bfqq || +@@ -2209,9 +2256,9 @@ static void bfq_add_request(struct request *rq) + * elapsed. + */ + if (bfqq == bfqd->in_service_queue && +- (bfqd->rq_in_driver == 0 || ++ (bfqd->tot_rq_in_driver == 0 || + (bfqq->last_serv_time_ns > 0 && +- bfqd->rqs_injected && bfqd->rq_in_driver > 0)) && ++ bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) && + time_is_before_eq_jiffies(bfqq->decrease_time_jif + + msecs_to_jiffies(10))) { + bfqd->last_empty_occupied_ns = ktime_get_ns(); +@@ -2235,7 +2282,7 @@ static void bfq_add_request(struct request *rq) + * will be set in case injection is performed + * on bfqq before rq is completed). 
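bfq_actuator_index() in the hunk above resolves which actuator serves a bio by testing the bio's last sector against per-actuator [sector, sector + nr_sectors) ranges, with a fast path for single-actuator drives and a fall-back to actuator 0 for out-of-range I/O. The sketch below replays that lookup in plain userspace C; the actuator_map struct, the MAX_ACTUATORS bound and the sample geometry are simplifications made up for the example.

#include <stdio.h>

typedef unsigned long long sector_t;

#define MAX_ACTUATORS 8  /* assumed static bound, mirroring the patch's array size */

struct actuator_map {
    unsigned int num;
    sector_t start[MAX_ACTUATORS];
    sector_t len[MAX_ACTUATORS];
};

/* Pick the actuator whose LBA range contains the last sector of the request. */
static unsigned int actuator_index(const struct actuator_map *m, sector_t last_sector)
{
    unsigned int i;

    if (m->num <= 1)
        return 0;  /* single-actuator drives skip the search */

    for (i = 0; i < m->num; i++)
        if (last_sector >= m->start[i] &&
            last_sector < m->start[i] + m->len[i])
            return i;

    return 0;  /* out-of-range I/O falls back to actuator 0 */
}

int main(void)
{
    struct actuator_map m = {
        .num   = 2,
        .start = { 0, 1000000 },
        .len   = { 1000000, 1000000 },
    };

    printf("sector  999999 -> actuator %u\n", actuator_index(&m, 999999));
    printf("sector 1500000 -> actuator %u\n", actuator_index(&m, 1500000));
    return 0;
}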
+ */ +- if (bfqd->rq_in_driver == 0) ++ if (bfqd->tot_rq_in_driver == 0) + bfqd->rqs_injected = false; + } + } +@@ -2418,7 +2465,8 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, + */ + bfq_bic_update_cgroup(bic, bio); + +- bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); ++ bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf), ++ bfq_actuator_index(bfqd, bio)); + } else { + bfqd->bio_bfqq = NULL; + } +@@ -2584,24 +2632,29 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) + void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg) + { +- int i, j; ++ int i, j, k; + +- for (i = 0; i < 2; i++) +- for (j = 0; j < IOPRIO_NR_LEVELS; j++) +- if (bfqg->async_bfqq[i][j]) +- bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); +- if (bfqg->async_idle_bfqq) +- bfq_bfqq_end_wr(bfqg->async_idle_bfqq); ++ for (k = 0; k < bfqd->num_actuators; k++) { ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_NR_LEVELS; j++) ++ if (bfqg->async_bfqq[i][j][k]) ++ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j][k]); ++ if (bfqg->async_idle_bfqq[k]) ++ bfq_bfqq_end_wr(bfqg->async_idle_bfqq[k]); ++ } + } + + static void bfq_end_wr(struct bfq_data *bfqd) + { + struct bfq_queue *bfqq; ++ int i; + + spin_lock_irq(&bfqd->lock); + +- list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) +- bfq_bfqq_end_wr(bfqq); ++ for (i = 0; i < bfqd->num_actuators; i++) { ++ list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list) ++ bfq_bfqq_end_wr(bfqq); ++ } + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + bfq_end_wr_async(bfqd); +@@ -2794,6 +2847,35 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, + static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, + struct bfq_queue *bfqq); + ++static struct bfq_queue * ++bfq_setup_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_queue *stable_merge_bfqq, ++ struct bfq_iocq_bfqq_data *bfqq_data) ++{ ++ int proc_ref = min(bfqq_process_refs(bfqq), ++ bfqq_process_refs(stable_merge_bfqq)); ++ struct bfq_queue *new_bfqq; ++ ++ if (idling_boosts_thr_without_issues(bfqd, bfqq) || ++ proc_ref == 0) ++ return NULL; ++ ++ /* next function will take at least one ref */ ++ new_bfqq = bfq_setup_merge(bfqq, stable_merge_bfqq); ++ ++ if (new_bfqq) { ++ bfqq_data->stably_merged = true; ++ if (new_bfqq->bic) { ++ unsigned int new_a_idx = new_bfqq->actuator_idx; ++ struct bfq_iocq_bfqq_data *new_bfqq_data = ++ &new_bfqq->bic->bfqq_data[new_a_idx]; ++ ++ new_bfqq_data->stably_merged = true; ++ } ++ } ++ return new_bfqq; ++} ++ + /* + * Attempt to schedule a merge of bfqq with the currently in-service + * queue or with a close queue among the scheduled queues. 
Return +@@ -2819,6 +2901,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + void *io_struct, bool request, struct bfq_io_cq *bic) + { + struct bfq_queue *in_service_bfqq, *new_bfqq; ++ unsigned int a_idx = bfqq->actuator_idx; ++ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; + + /* if a merge has already been setup, then proceed with that first */ + if (bfqq->new_bfqq) +@@ -2840,37 +2924,23 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + * stable merging) also if bic is associated with a + * sync queue, but this bfqq is async + */ +- if (bfq_bfqq_sync(bfqq) && bic->stable_merge_bfqq && ++ if (bfq_bfqq_sync(bfqq) && bfqq_data->stable_merge_bfqq && + !bfq_bfqq_just_created(bfqq) && + time_is_before_jiffies(bfqq->split_time + + msecs_to_jiffies(bfq_late_stable_merging)) && + time_is_before_jiffies(bfqq->creation_time + + msecs_to_jiffies(bfq_late_stable_merging))) { + struct bfq_queue *stable_merge_bfqq = +- bic->stable_merge_bfqq; +- int proc_ref = min(bfqq_process_refs(bfqq), +- bfqq_process_refs(stable_merge_bfqq)); ++ bfqq_data->stable_merge_bfqq; + + /* deschedule stable merge, because done or aborted here */ + bfq_put_stable_ref(stable_merge_bfqq); + +- bic->stable_merge_bfqq = NULL; +- +- if (!idling_boosts_thr_without_issues(bfqd, bfqq) && +- proc_ref > 0) { +- /* next function will take at least one ref */ +- struct bfq_queue *new_bfqq = +- bfq_setup_merge(bfqq, stable_merge_bfqq); +- +- if (new_bfqq) { +- bic->stably_merged = true; +- if (new_bfqq->bic) +- new_bfqq->bic->stably_merged = +- true; +- } +- return new_bfqq; +- } else +- return NULL; ++ bfqq_data->stable_merge_bfqq = NULL; ++ ++ return bfq_setup_stable_merge(bfqd, bfqq, ++ stable_merge_bfqq, ++ bfqq_data); + } + } + +@@ -2965,6 +3035,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + static void bfq_bfqq_save_state(struct bfq_queue *bfqq) + { + struct bfq_io_cq *bic = bfqq->bic; ++ unsigned int a_idx = bfqq->actuator_idx; ++ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; + + /* + * If !bfqq->bic, the queue is already shared or its requests +@@ -2974,18 +3046,21 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) + if (!bic) + return; + +- bic->saved_last_serv_time_ns = bfqq->last_serv_time_ns; +- bic->saved_inject_limit = bfqq->inject_limit; +- bic->saved_decrease_time_jif = bfqq->decrease_time_jif; +- +- bic->saved_weight = bfqq->entity.orig_weight; +- bic->saved_ttime = bfqq->ttime; +- bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); +- bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); +- bic->saved_io_start_time = bfqq->io_start_time; +- bic->saved_tot_idle_time = bfqq->tot_idle_time; +- bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); +- bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); ++ bfqq_data->saved_last_serv_time_ns = bfqq->last_serv_time_ns; ++ bfqq_data->saved_inject_limit = bfqq->inject_limit; ++ bfqq_data->saved_decrease_time_jif = bfqq->decrease_time_jif; ++ ++ bfqq_data->saved_weight = bfqq->entity.orig_weight; ++ bfqq_data->saved_ttime = bfqq->ttime; ++ bfqq_data->saved_has_short_ttime = ++ bfq_bfqq_has_short_ttime(bfqq); ++ bfqq_data->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); ++ bfqq_data->saved_io_start_time = bfqq->io_start_time; ++ bfqq_data->saved_tot_idle_time = bfqq->tot_idle_time; ++ bfqq_data->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); ++ bfqq_data->was_in_burst_list = ++ !hlist_unhashed(&bfqq->burst_list_node); ++ + if 
(unlikely(bfq_bfqq_just_created(bfqq) && + !bfq_bfqq_in_large_burst(bfqq) && + bfqq->bfqd->low_latency)) { +@@ -2998,17 +3073,21 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) + * to bfqq, so that to avoid that bfqq unjustly fails + * to enjoy weight raising if split soon. + */ +- bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; +- bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now(); +- bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); +- bic->saved_last_wr_start_finish = jiffies; ++ bfqq_data->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; ++ bfqq_data->saved_wr_start_at_switch_to_srt = ++ bfq_smallest_from_now(); ++ bfqq_data->saved_wr_cur_max_time = ++ bfq_wr_duration(bfqq->bfqd); ++ bfqq_data->saved_last_wr_start_finish = jiffies; + } else { +- bic->saved_wr_coeff = bfqq->wr_coeff; +- bic->saved_wr_start_at_switch_to_srt = ++ bfqq_data->saved_wr_coeff = bfqq->wr_coeff; ++ bfqq_data->saved_wr_start_at_switch_to_srt = + bfqq->wr_start_at_switch_to_srt; +- bic->saved_service_from_wr = bfqq->service_from_wr; +- bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; +- bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; ++ bfqq_data->saved_service_from_wr = ++ bfqq->service_from_wr; ++ bfqq_data->saved_last_wr_start_finish = ++ bfqq->last_wr_start_finish; ++ bfqq_data->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + } + } + +@@ -3114,7 +3193,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + /* + * Merge queues (that is, let bic redirect its requests to new_bfqq) + */ +- bic_set_bfqq(bic, new_bfqq, true); ++ bic_set_bfqq(bic, new_bfqq, true, bfqq->actuator_idx); + bfq_mark_bfqq_coop(new_bfqq); + /* + * new_bfqq now belongs to at least two bics (it is a shared queue): +@@ -3532,13 +3611,13 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) + * - start a new observation interval with this dispatch + */ + if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && +- bfqd->rq_in_driver == 0) ++ bfqd->tot_rq_in_driver == 0) + goto update_rate_and_reset; + + /* Update sampling information */ + bfqd->peak_rate_samples++; + +- if ((bfqd->rq_in_driver > 0 || ++ if ((bfqd->tot_rq_in_driver > 0 || + now_ns - bfqd->last_completion < BFQ_MIN_TT) + && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) + bfqd->sequential_samples++; +@@ -3803,10 +3882,8 @@ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, + return false; + + return (bfqq->wr_coeff > 1 && +- (bfqd->wr_busy_queues < +- tot_busy_queues || +- bfqd->rq_in_driver >= +- bfqq->dispatched + 4)) || ++ (bfqd->wr_busy_queues < tot_busy_queues || ++ bfqd->tot_rq_in_driver >= bfqq->dispatched + 4)) || + bfq_asymmetric_scenario(bfqd, bfqq) || + tot_busy_queues == 1; + } +@@ -4072,8 +4149,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + * function to evaluate the I/O speed of a process. + */ + static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, +- bool compensate, enum bfqq_expiration reason, +- unsigned long *delta_ms) ++ bool compensate, unsigned long *delta_ms) + { + ktime_t delta_ktime; + u32 delta_usecs; +@@ -4269,7 +4345,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, + /* + * Check whether the process is slow (see bfq_bfqq_is_slow). 
+ */ +- slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); ++ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, &delta); + + /* + * As above explained, charge slow (typically seeky) and +@@ -4577,6 +4653,8 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) + { + struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue; + unsigned int limit = in_serv_bfqq->inject_limit; ++ int i; ++ + /* + * If + * - bfqq is not weight-raised and therefore does not carry +@@ -4608,7 +4686,7 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) + ) + limit = 1; + +- if (bfqd->rq_in_driver >= limit) ++ if (bfqd->tot_rq_in_driver >= limit) + return NULL; + + /* +@@ -4623,11 +4701,12 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) + * (and re-added only if it gets new requests, but then it + * is assigned again enough budget for its new backlog). + */ +- list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) +- if (!RB_EMPTY_ROOT(&bfqq->sort_list) && +- (in_serv_always_inject || bfqq->wr_coeff > 1) && +- bfq_serv_to_charge(bfqq->next_rq, bfqq) <= +- bfq_bfqq_budget_left(bfqq)) { ++ for (i = 0; i < bfqd->num_actuators; i++) { ++ list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list) ++ if (!RB_EMPTY_ROOT(&bfqq->sort_list) && ++ (in_serv_always_inject || bfqq->wr_coeff > 1) && ++ bfq_serv_to_charge(bfqq->next_rq, bfqq) <= ++ bfq_bfqq_budget_left(bfqq)) { + /* + * Allow for only one large in-flight request + * on non-rotational devices, for the +@@ -4647,27 +4726,80 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) + */ + if (blk_queue_nonrot(bfqd->queue) && + blk_rq_sectors(bfqq->next_rq) >= +- BFQQ_SECT_THR_NONROT) +- limit = min_t(unsigned int, 1, limit); +- else +- limit = in_serv_bfqq->inject_limit; +- +- if (bfqd->rq_in_driver < limit) { ++ BFQQ_SECT_THR_NONROT && ++ bfqd->tot_rq_in_driver >= 1) ++ continue; ++ else { + bfqd->rqs_injected = true; + return bfqq; + } + } ++ } ++ ++ return NULL; ++} ++ ++static struct bfq_queue * ++bfq_find_active_bfqq_for_actuator(struct bfq_data *bfqd, int idx) ++{ ++ struct bfq_queue *bfqq; ++ ++ if (bfqd->in_service_queue && ++ bfqd->in_service_queue->actuator_idx == idx) ++ return bfqd->in_service_queue; ++ ++ list_for_each_entry(bfqq, &bfqd->active_list[idx], bfqq_list) { ++ if (!RB_EMPTY_ROOT(&bfqq->sort_list) && ++ bfq_serv_to_charge(bfqq->next_rq, bfqq) <= ++ bfq_bfqq_budget_left(bfqq)) { ++ return bfqq; ++ } ++ } + + return NULL; + } + ++/* ++ * Perform a linear scan of each actuator, until an actuator is found ++ * for which the following three conditions hold: the load of the ++ * actuator is below the threshold (see comments on ++ * actuator_load_threshold for details) and lower than that of the ++ * next actuator (comments on this extra condition below), and there ++ * is a queue that contains I/O for that actuator. On success, return ++ * that queue. ++ * ++ * Performing a plain linear scan entails a prioritization among ++ * actuators. The extra condition above breaks this prioritization and ++ * tends to distribute injection uniformly across actuators. 
++ */ ++static struct bfq_queue * ++bfq_find_bfqq_for_underused_actuator(struct bfq_data *bfqd) ++{ ++ int i; ++ ++ for (i = 0 ; i < bfqd->num_actuators; i++) { ++ if (bfqd->rq_in_driver[i] < bfqd->actuator_load_threshold && ++ (i == bfqd->num_actuators - 1 || ++ bfqd->rq_in_driver[i] < bfqd->rq_in_driver[i+1])) { ++ struct bfq_queue *bfqq = ++ bfq_find_active_bfqq_for_actuator(bfqd, i); ++ ++ if (bfqq) ++ return bfqq; ++ } ++ } ++ ++ return NULL; ++} ++ ++ + /* + * Select a queue for service. If we have a current queue in service, + * check whether to continue servicing it, or retrieve and set a new one. + */ + static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + { +- struct bfq_queue *bfqq; ++ struct bfq_queue *bfqq, *inject_bfqq; + struct request *next_rq; + enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT; + +@@ -4689,6 +4821,15 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + goto expire; + + check_queue: ++ /* ++ * If some actuator is underutilized, but the in-service ++ * queue does not contain I/O for that actuator, then try to ++ * inject I/O for that actuator. ++ */ ++ inject_bfqq = bfq_find_bfqq_for_underused_actuator(bfqd); ++ if (inject_bfqq && inject_bfqq != bfqq) ++ return inject_bfqq; ++ + /* + * This loop is rarely executed more than once. Even when it + * happens, it is much more convenient to re-execute this loop +@@ -4748,11 +4889,8 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + */ + if (bfq_bfqq_wait_request(bfqq) || + (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { +- struct bfq_queue *async_bfqq = +- bfqq->bic && bfqq->bic->bfqq[0] && +- bfq_bfqq_busy(bfqq->bic->bfqq[0]) && +- bfqq->bic->bfqq[0]->next_rq ? +- bfqq->bic->bfqq[0] : NULL; ++ unsigned int act_idx = bfqq->actuator_idx; ++ struct bfq_queue *async_bfqq = NULL; + struct bfq_queue *blocked_bfqq = + !hlist_empty(&bfqq->woken_list) ? + container_of(bfqq->woken_list.first, +@@ -4760,6 +4898,10 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + woken_list_node) + : NULL; + ++ if (bfqq->bic && bfqq->bic->bfqq[0][act_idx] && ++ bfq_bfqq_busy(bfqq->bic->bfqq[0][act_idx]) && ++ bfqq->bic->bfqq[0][act_idx]->next_rq) ++ async_bfqq = bfqq->bic->bfqq[0][act_idx]; + /* + * The next four mutually-exclusive ifs decide + * whether to try injection, and choose the queue to +@@ -4844,7 +4986,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic && + bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= + bfq_bfqq_budget_left(async_bfqq)) +- bfqq = bfqq->bic->bfqq[0]; ++ bfqq = async_bfqq; + else if (bfqq->waker_bfqq && + bfq_bfqq_busy(bfqq->waker_bfqq) && + bfqq->waker_bfqq->next_rq && +@@ -4975,7 +5117,7 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, + bfq_dispatch_remove(bfqd->queue, rq); + + if (bfqq != bfqd->in_service_queue) +- goto return_rq; ++ return rq; + + /* + * If weight raising has to terminate for bfqq, then next +@@ -4995,12 +5137,9 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, + * belongs to CLASS_IDLE and other queues are waiting for + * service. 
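The comment block above spells out how per-actuator injection picks its target: scan the actuators in order and stop at the first one whose in-driver load is both below the threshold and not higher than the next actuator's, so injection spreads across actuators instead of always favouring index 0. The self-contained sketch below exercises just that selection rule; the two-actuator setup and the threshold of 4 (the default the patch installs later as actuator_load_threshold) are assumptions for the demo.

#include <stdio.h>

#define NUM_ACTUATORS  2
#define LOAD_THRESHOLD 4  /* assumed default, matching the value set at init time */

/* Return the first actuator that is underutilized per the rule above,
 * or -1 when every actuator already has enough queued I/O. */
static int find_underused_actuator(const int rq_in_driver[NUM_ACTUATORS])
{
    int i;

    for (i = 0; i < NUM_ACTUATORS; i++) {
        if (rq_in_driver[i] < LOAD_THRESHOLD &&
            (i == NUM_ACTUATORS - 1 ||
             rq_in_driver[i] < rq_in_driver[i + 1]))
            return i;
    }
    return -1;
}

int main(void)
{
    int even_load[NUM_ACTUATORS] = { 2, 2 };  /* equal load: only the last one qualifies */
    int skewed[NUM_ACTUATORS]    = { 1, 3 };  /* actuator 0 is the less loaded one */
    int full[NUM_ACTUATORS]      = { 4, 4 };  /* both at the threshold: nothing qualifies */

    printf("even -> %d\n", find_underused_actuator(even_load)); /* 1  */
    printf("skew -> %d\n", find_underused_actuator(skewed));    /* 0  */
    printf("full -> %d\n", find_underused_actuator(full));      /* -1 */
    return 0;
}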
+ */ +- if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq))) +- goto return_rq; +- +- bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); ++ if (bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)) ++ bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); + +-return_rq: + return rq; + } + +@@ -5043,11 +5182,11 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + + /* + * We exploit the bfq_finish_requeue_request hook to +- * decrement rq_in_driver, but ++ * decrement tot_rq_in_driver, but + * bfq_finish_requeue_request will not be invoked on + * this request. So, to avoid unbalance, just start +- * this request, without incrementing rq_in_driver. As +- * a negative consequence, rq_in_driver is deceptively ++ * this request, without incrementing tot_rq_in_driver. As ++ * a negative consequence, tot_rq_in_driver is deceptively + * lower than it should be while this request is in + * service. This may cause bfq_schedule_dispatch to be + * invoked uselessly. +@@ -5056,7 +5195,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + * bfq_finish_requeue_request hook, if defined, is + * probably invoked also on this request. So, by + * exploiting this hook, we could 1) increment +- * rq_in_driver here, and 2) decrement it in ++ * tot_rq_in_driver here, and 2) decrement it in + * bfq_finish_requeue_request. Such a solution would + * let the value of the counter be always accurate, + * but it would entail using an extra interface +@@ -5085,7 +5224,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + * Of course, serving one request at a time may cause loss of + * throughput. + */ +- if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) ++ if (bfqd->strict_guarantees && bfqd->tot_rq_in_driver > 0) + goto exit; + + bfqq = bfq_select_queue(bfqd); +@@ -5096,7 +5235,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + + if (rq) { + inc_in_driver_start_rq: +- bfqd->rq_in_driver++; ++ bfqd->rq_in_driver[bfqq->actuator_idx]++; ++ bfqd->tot_rq_in_driver++; + start_rq: + rq->rq_flags |= RQF_STARTED; + } +@@ -5283,8 +5423,6 @@ void bfq_put_cooperator(struct bfq_queue *bfqq) + */ + __bfqq = bfqq->new_bfqq; + while (__bfqq) { +- if (__bfqq == bfqq) +- break; + next = __bfqq->new_bfqq; + bfq_put_queue(__bfqq); + __bfqq = next; +@@ -5305,48 +5443,55 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) + bfq_release_process_ref(bfqd, bfqq); + } + +-static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) ++static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, ++ unsigned int actuator_idx) + { +- struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); ++ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, actuator_idx); + struct bfq_data *bfqd; + + if (bfqq) + bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ + + if (bfqq && bfqd) { +- unsigned long flags; +- +- spin_lock_irqsave(&bfqd->lock, flags); +- bic_set_bfqq(bic, NULL, is_sync); ++ bic_set_bfqq(bic, NULL, is_sync, actuator_idx); + bfq_exit_bfqq(bfqd, bfqq); +- spin_unlock_irqrestore(&bfqd->lock, flags); + } + } + + static void bfq_exit_icq(struct io_cq *icq) + { + struct bfq_io_cq *bic = icq_to_bic(icq); ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ unsigned long flags; ++ unsigned int act_idx; ++ /* ++ * If bfqd and thus bfqd->num_actuators is not available any ++ * longer, then cycle over all possible per-actuator bfqqs in ++ * next loop. 
We rely on bic being zeroed on creation, and ++ * therefore on its unused per-actuator fields being NULL. ++ */ ++ unsigned int num_actuators = BFQ_MAX_ACTUATORS; ++ struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data; + +- if (bic->stable_merge_bfqq) { +- struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd; ++ /* ++ * bfqd is NULL if scheduler already exited, and in that case ++ * this is the last time these queues are accessed. ++ */ ++ if (bfqd) { ++ spin_lock_irqsave(&bfqd->lock, flags); ++ num_actuators = bfqd->num_actuators; ++ } + +- /* +- * bfqd is NULL if scheduler already exited, and in +- * that case this is the last time bfqq is accessed. +- */ +- if (bfqd) { +- unsigned long flags; ++ for (act_idx = 0; act_idx < num_actuators; act_idx++) { ++ if (bfqq_data[act_idx].stable_merge_bfqq) ++ bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq); + +- spin_lock_irqsave(&bfqd->lock, flags); +- bfq_put_stable_ref(bic->stable_merge_bfqq); +- spin_unlock_irqrestore(&bfqd->lock, flags); +- } else { +- bfq_put_stable_ref(bic->stable_merge_bfqq); +- } ++ bfq_exit_icq_bfqq(bic, true, act_idx); ++ bfq_exit_icq_bfqq(bic, false, act_idx); + } + +- bfq_exit_icq_bfqq(bic, true); +- bfq_exit_icq_bfqq(bic, false); ++ if (bfqd) ++ spin_unlock_irqrestore(&bfqd->lock, flags); + } + + /* +@@ -5423,25 +5568,27 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) + + bic->ioprio = ioprio; + +- bfqq = bic_to_bfqq(bic, false); ++ bfqq = bic_to_bfqq(bic, false, bfq_actuator_index(bfqd, bio)); + if (bfqq) { + struct bfq_queue *old_bfqq = bfqq; + + bfqq = bfq_get_queue(bfqd, bio, false, bic, true); +- bic_set_bfqq(bic, bfqq, false); ++ bic_set_bfqq(bic, bfqq, false, bfq_actuator_index(bfqd, bio)); + bfq_release_process_ref(bfqd, old_bfqq); + } + +- bfqq = bic_to_bfqq(bic, true); ++ bfqq = bic_to_bfqq(bic, true, bfq_actuator_index(bfqd, bio)); + if (bfqq) + bfq_set_next_ioprio_data(bfqq, bic); + } + + static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, +- struct bfq_io_cq *bic, pid_t pid, int is_sync) ++ struct bfq_io_cq *bic, pid_t pid, int is_sync, ++ unsigned int act_idx) + { + u64 now_ns = ktime_get_ns(); + ++ bfqq->actuator_idx = act_idx; + RB_CLEAR_NODE(&bfqq->entity.rb_node); + INIT_LIST_HEAD(&bfqq->fifo); + INIT_HLIST_NODE(&bfqq->burst_list_node); +@@ -5501,22 +5648,24 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + + /* first request is almost certainly seeky */ + bfqq->seek_history = 1; ++ ++ bfqq->decrease_time_jif = jiffies; + } + + static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, + struct bfq_group *bfqg, +- int ioprio_class, int ioprio) ++ int ioprio_class, int ioprio, int act_idx) + { + switch (ioprio_class) { + case IOPRIO_CLASS_RT: +- return &bfqg->async_bfqq[0][ioprio]; ++ return &bfqg->async_bfqq[0][ioprio][act_idx]; + case IOPRIO_CLASS_NONE: + ioprio = IOPRIO_BE_NORM; + fallthrough; + case IOPRIO_CLASS_BE: +- return &bfqg->async_bfqq[1][ioprio]; ++ return &bfqg->async_bfqq[1][ioprio][act_idx]; + case IOPRIO_CLASS_IDLE: +- return &bfqg->async_idle_bfqq; ++ return &bfqg->async_idle_bfqq[act_idx]; + default: + return NULL; + } +@@ -5527,6 +5676,7 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_io_cq *bic, + struct bfq_queue *last_bfqq_created) + { ++ unsigned int a_idx = last_bfqq_created->actuator_idx; + struct bfq_queue *new_bfqq = + bfq_setup_merge(bfqq, last_bfqq_created); + +@@ -5534,8 +5684,8 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, 
struct bfq_queue *bfqq, + return bfqq; + + if (new_bfqq->bic) +- new_bfqq->bic->stably_merged = true; +- bic->stably_merged = true; ++ new_bfqq->bic->bfqq_data[a_idx].stably_merged = true; ++ bic->bfqq_data[a_idx].stably_merged = true; + + /* + * Reusing merge functions. This implies that +@@ -5610,9 +5760,13 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, + * it has been set already, but too long ago, then move it + * forward to bfqq. Finally, move also if bfqq belongs to a + * different group than last_bfqq_created, or if bfqq has a +- * different ioprio or ioprio_class. If none of these +- * conditions holds true, then try an early stable merge or +- * schedule a delayed stable merge. ++ * different ioprio, ioprio_class or actuator_idx. If none of ++ * these conditions holds true, then try an early stable merge ++ * or schedule a delayed stable merge. As for the condition on ++ * actuator_idx, the reason is that, if queues associated with ++ * different actuators are merged, then control is lost on ++ * each actuator. Therefore some actuator may be ++ * underutilized, and throughput may decrease. + * + * A delayed merge is scheduled (instead of performing an + * early merge), in case bfqq might soon prove to be more +@@ -5630,7 +5784,8 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, + bfqq->creation_time) || + bfqq->entity.parent != last_bfqq_created->entity.parent || + bfqq->ioprio != last_bfqq_created->ioprio || +- bfqq->ioprio_class != last_bfqq_created->ioprio_class) ++ bfqq->ioprio_class != last_bfqq_created->ioprio_class || ++ bfqq->actuator_idx != last_bfqq_created->actuator_idx) + *source_bfqq = bfqq; + else if (time_after_eq(last_bfqq_created->creation_time + + bfqd->bfq_burst_interval, +@@ -5660,7 +5815,8 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, + /* + * Record the bfqq to merge to. + */ +- bic->stable_merge_bfqq = last_bfqq_created; ++ bic->bfqq_data[last_bfqq_created->actuator_idx].stable_merge_bfqq = ++ last_bfqq_created; + } + } + +@@ -5682,7 +5838,8 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + bfqg = bfq_bio_bfqg(bfqd, bio); + if (!is_sync) { + async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, +- ioprio); ++ ioprio, ++ bfq_actuator_index(bfqd, bio)); + bfqq = *async_bfqq; + if (bfqq) + goto out; +@@ -5694,7 +5851,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + + if (bfqq) { + bfq_init_bfqq(bfqd, bfqq, bic, current->pid, +- is_sync); ++ is_sync, bfq_actuator_index(bfqd, bio)); + bfq_init_entity(&bfqq->entity, bfqg); + bfq_log_bfqq(bfqd, bfqq, "allocated"); + } else { +@@ -6009,7 +6166,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) + * then complete the merge and redirect it to + * new_bfqq. + */ +- if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) ++ if (bic_to_bfqq(RQ_BIC(rq), true, ++ bfq_actuator_index(bfqd, rq->bio)) == bfqq) + bfq_merge_bfqqs(bfqd, RQ_BIC(rq), + bfqq, new_bfqq); + +@@ -6147,7 +6305,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) + struct bfq_queue *bfqq = bfqd->in_service_queue; + + bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, +- bfqd->rq_in_driver); ++ bfqd->tot_rq_in_driver); + + if (bfqd->hw_tag == 1) + return; +@@ -6158,7 +6316,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) + * sum is not exact, as it's not taking into account deactivated + * requests. 
+ */ +- if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) ++ if (bfqd->tot_rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) + return; + + /* +@@ -6169,7 +6327,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) + if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && + bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < + BFQ_HW_QUEUE_THRESHOLD && +- bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) ++ bfqd->tot_rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) + return; + + if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) +@@ -6190,7 +6348,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) + + bfq_update_hw_tag(bfqd); + +- bfqd->rq_in_driver--; ++ bfqd->rq_in_driver[bfqq->actuator_idx]--; ++ bfqd->tot_rq_in_driver--; + bfqq->dispatched--; + + if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { +@@ -6310,7 +6469,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) + BFQQE_NO_MORE_REQUESTS); + } + +- if (!bfqd->rq_in_driver) ++ if (!bfqd->tot_rq_in_driver) + bfq_schedule_dispatch(bfqd); + } + +@@ -6441,13 +6600,13 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, + * conditions to do it, or we can lower the last base value + * computed. + * +- * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O ++ * NOTE: (bfqd->tot_rq_in_driver == 1) means that there is no I/O + * request in flight, because this function is in the code + * path that handles the completion of a request of bfqq, and, + * in particular, this function is executed before +- * bfqd->rq_in_driver is decremented in such a code path. ++ * bfqd->tot_rq_in_driver is decremented in such a code path. + */ +- if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) || ++ if ((bfqq->last_serv_time_ns == 0 && bfqd->tot_rq_in_driver == 1) || + tot_time_ns < bfqq->last_serv_time_ns) { + if (bfqq->last_serv_time_ns == 0) { + /* +@@ -6457,7 +6616,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, + bfqq->inject_limit = max_t(unsigned int, 1, old_limit); + } + bfqq->last_serv_time_ns = tot_time_ns; +- } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1) ++ } else if (!bfqd->rqs_injected && bfqd->tot_rq_in_driver == 1) + /* + * No I/O injected and no request still in service in + * the drive: these are the exact conditions for +@@ -6564,7 +6723,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) + return bfqq; + } + +- bic_set_bfqq(bic, NULL, true); ++ bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx); + + bfq_put_cooperator(bfqq); + +@@ -6578,7 +6737,9 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, + bool split, bool is_sync, + bool *new_queue) + { +- struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); ++ unsigned int act_idx = bfq_actuator_index(bfqd, bio); ++ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx); ++ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[act_idx]; + + if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) + return bfqq; +@@ -6590,14 +6751,14 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, + bfq_put_queue(bfqq); + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split); + +- bic_set_bfqq(bic, bfqq, is_sync); ++ bic_set_bfqq(bic, bfqq, is_sync, act_idx); + if (split && is_sync) { +- if ((bic->was_in_burst_list && bfqd->large_burst) || +- bic->saved_in_large_burst) ++ if ((bfqq_data->was_in_burst_list && bfqd->large_burst) || ++ bfqq_data->saved_in_large_burst) + bfq_mark_bfqq_in_large_burst(bfqq); + else { + 
bfq_clear_bfqq_in_large_burst(bfqq); +- if (bic->was_in_burst_list) ++ if (bfqq_data->was_in_burst_list) + /* + * If bfqq was in the current + * burst list before being +@@ -6686,19 +6847,20 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) + struct bfq_queue *bfqq; + bool new_queue = false; + bool bfqq_already_existing = false, split = false; ++ unsigned int a_idx = bfq_actuator_index(bfqd, bio); + + if (unlikely(!rq->elv.icq)) + return NULL; + + /* +- * Assuming that elv.priv[1] is set only if everything is set ++ * Assuming that RQ_BFQQ(rq) is set only if everything is set + * for this rq. This holds true, because this function is + * invoked only for insertion or merging, and, after such + * events, a request cannot be manipulated any longer before + * being removed from bfq. + */ +- if (rq->elv.priv[1]) +- return rq->elv.priv[1]; ++ if (RQ_BFQQ(rq)) ++ return RQ_BFQQ(rq); + + bic = icq_to_bic(rq->elv.icq); + +@@ -6712,12 +6874,13 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) + if (likely(!new_queue)) { + /* If the queue was seeky for too long, break it apart. */ + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) && +- !bic->stably_merged) { ++ !bic->bfqq_data[a_idx].stably_merged) { + struct bfq_queue *old_bfqq = bfqq; + + /* Update bic before losing reference to bfqq */ + if (bfq_bfqq_in_large_burst(bfqq)) +- bic->saved_in_large_burst = true; ++ bic->bfqq_data[a_idx].saved_in_large_burst = ++ true; + + bfqq = bfq_split_bfqq(bic, bfqq); + split = true; +@@ -6900,13 +7063,15 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, + */ + void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) + { +- int i, j; ++ int i, j, k; + +- for (i = 0; i < 2; i++) +- for (j = 0; j < IOPRIO_NR_LEVELS; j++) +- __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); ++ for (k = 0; k < bfqd->num_actuators; k++) { ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_NR_LEVELS; j++) ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j][k]); + +- __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq[k]); ++ } + } + + /* +@@ -7018,6 +7183,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + { + struct bfq_data *bfqd; + struct elevator_queue *eq; ++ unsigned int i; ++ struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges; + + eq = elevator_alloc(q, e); + if (!eq) +@@ -7038,8 +7205,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. + * Grab a permanent reference to it, so that the normal code flow + * will not attempt to free it. ++ * Set zero as actuator index: we will pretend that ++ * all I/O requests are for the same actuator. + */ +- bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0, 0); + bfqd->oom_bfqq.ref++; + bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; + bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; +@@ -7058,6 +7227,39 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + + bfqd->queue = q; + ++ bfqd->num_actuators = 1; ++ /* ++ * If the disk supports multiple actuators, copy independent ++ * access ranges from the request queue structure. ++ */ ++ spin_lock_irq(&q->queue_lock); ++ if (ia_ranges) { ++ /* ++ * Check if the disk ia_ranges size exceeds the current bfq ++ * actuator limit. 
++ */ ++ if (ia_ranges->nr_ia_ranges > BFQ_MAX_ACTUATORS) { ++ pr_crit("nr_ia_ranges higher than act limit: iars=%d, max=%d.\n", ++ ia_ranges->nr_ia_ranges, BFQ_MAX_ACTUATORS); ++ pr_crit("Falling back to single actuator mode.\n"); ++ } else { ++ bfqd->num_actuators = ia_ranges->nr_ia_ranges; ++ ++ for (i = 0; i < bfqd->num_actuators; i++) { ++ bfqd->sector[i] = ia_ranges->ia_range[i].sector; ++ bfqd->nr_sectors[i] = ++ ia_ranges->ia_range[i].nr_sectors; ++ } ++ } ++ } ++ ++ /* Otherwise use single-actuator dev info */ ++ if (bfqd->num_actuators == 1) { ++ bfqd->sector[0] = 0; ++ bfqd->nr_sectors[0] = get_capacity(q->disk); ++ } ++ spin_unlock_irq(&q->queue_lock); ++ + INIT_LIST_HEAD(&bfqd->dispatch); + + hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, +@@ -7069,7 +7271,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + bfqd->num_groups_with_pending_reqs = 0; + #endif + +- INIT_LIST_HEAD(&bfqd->active_list); ++ INIT_LIST_HEAD(&bfqd->active_list[0]); ++ INIT_LIST_HEAD(&bfqd->active_list[1]); + INIT_LIST_HEAD(&bfqd->idle_list); + INIT_HLIST_HEAD(&bfqd->burst_list); + +@@ -7095,7 +7298,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + */ + bfqd->bfq_wr_coeff = 30; + bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); +- bfqd->bfq_wr_max_time = 0; + bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); + bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); + bfqd->bfq_wr_max_softrt_rate = 7000; /* +@@ -7114,6 +7316,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; + bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; + ++ /* see comments on the definition of next field inside bfq_data */ ++ bfqd->actuator_load_threshold = 4; ++ + spin_lock_init(&bfqd->lock); + + /* +@@ -7412,6 +7617,7 @@ MODULE_ALIAS("bfq-iosched"); + static int __init bfq_init(void) + { + int ret; ++ char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.2"; + + #ifdef CONFIG_BFQ_GROUP_IOSCHED + ret = blkcg_policy_register(&blkcg_policy_bfq); +@@ -7443,6 +7649,11 @@ static int __init bfq_init(void) + if (ret) + goto slab_kill; + ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ strcat(msg, " (with cgroups support)"); ++#endif ++ pr_info("%s", msg); ++ + return 0; + + slab_kill: +diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h +index 466e4865ace6..75cc6a324267 100644 +--- a/block/bfq-iosched.h ++++ b/block/bfq-iosched.h +@@ -33,6 +33,14 @@ + */ + #define BFQ_SOFTRT_WEIGHT_FACTOR 100 + ++/* ++ * Maximum number of actuators supported. This constant is used simply ++ * to define the size of the static array that will contain ++ * per-actuator data. The current value is hopefully a good upper ++ * bound to the possible number of actuators of any actual drive. ++ */ ++#define BFQ_MAX_ACTUATORS 8 ++ + struct bfq_entity; + + /** +@@ -227,12 +235,14 @@ struct bfq_ttime { + * struct bfq_queue - leaf schedulable entity. + * + * A bfq_queue is a leaf request queue; it can be associated with an +- * io_context or more, if it is async or shared between cooperating +- * processes. @cgroup holds a reference to the cgroup, to be sure that it +- * does not disappear while a bfqq still references it (mostly to avoid +- * races between request issuing and task migration followed by cgroup +- * destruction). +- * All the fields are protected by the queue lock of the containing bfqd. ++ * io_context or more, if it is async or shared between cooperating ++ * processes. 
Besides, it contains I/O requests for only one actuator ++ * (an io_context is associated with a different bfq_queue for each ++ * actuator it generates I/O for). @cgroup holds a reference to the ++ * cgroup, to be sure that it does not disappear while a bfqq still ++ * references it (mostly to avoid races between request issuing and ++ * task migration followed by cgroup destruction). All the fields are ++ * protected by the queue lock of the containing bfqd. + */ + struct bfq_queue { + /* reference counter */ +@@ -397,24 +407,18 @@ struct bfq_queue { + * the woken queues when this queue exits. + */ + struct hlist_head woken_list; ++ ++ /* index of the actuator this queue is associated with */ ++ unsigned int actuator_idx; + }; + + /** +- * struct bfq_io_cq - per (request_queue, io_context) structure. +- */ +-struct bfq_io_cq { +- /* associated io_cq structure */ +- struct io_cq icq; /* must be the first member */ +- /* array of two process queues, the sync and the async */ +- struct bfq_queue *bfqq[2]; +- /* per (request_queue, blkcg) ioprio */ +- int ioprio; +-#ifdef CONFIG_BFQ_GROUP_IOSCHED +- uint64_t blkcg_serial_nr; /* the current blkcg serial */ +-#endif ++* struct bfq_data - bfqq data unique and persistent for associated bfq_io_cq ++*/ ++struct bfq_iocq_bfqq_data { + /* + * Snapshot of the has_short_time flag before merging; taken +- * to remember its value while the queue is merged, so as to ++ * to remember its values while the queue is merged, so as to + * be able to restore it in case of split. + */ + bool saved_has_short_ttime; +@@ -428,7 +432,7 @@ struct bfq_io_cq { + u64 saved_tot_idle_time; + + /* +- * Same purpose as the previous fields for the value of the ++ * Same purpose as the previous fields for the values of the + * field keeping the queue's belonging to a large burst + */ + bool saved_in_large_burst; +@@ -466,6 +470,38 @@ struct bfq_io_cq { + struct bfq_queue *stable_merge_bfqq; + + bool stably_merged; /* non splittable if true */ ++}; ++ ++/** ++ * struct bfq_io_cq - per (request_queue, io_context) structure. ++ */ ++struct bfq_io_cq { ++ /* associated io_cq structure */ ++ struct io_cq icq; /* must be the first member */ ++ /* ++ * Matrix of associated process queues: first row for async ++ * queues, second row sync queues. Each row contains one ++ * column for each actuator. An I/O request generated by the ++ * process is inserted into the queue pointed by bfqq[i][j] if ++ * the request is to be served by the j-th actuator of the ++ * drive, where i==0 or i==1, depending on whether the request ++ * is async or sync. So there is a distinct queue for each ++ * actuator. ++ */ ++ struct bfq_queue *bfqq[2][BFQ_MAX_ACTUATORS]; ++ /* per (request_queue, blkcg) ioprio */ ++ int ioprio; ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ uint64_t blkcg_serial_nr; /* the current blkcg serial */ ++#endif ++ ++ /* ++ * Persistent data for associated synchronous process queues ++ * (one queue per actuator, see field bfqq above). In ++ * particular, each of these queues may undergo a merge. 
++ */ ++ struct bfq_iocq_bfqq_data bfqq_data[BFQ_MAX_ACTUATORS]; ++ + unsigned int requests; /* Number of requests this process has in flight */ + }; + +@@ -554,7 +590,12 @@ struct bfq_data { + /* number of queued requests */ + int queued; + /* number of requests dispatched and waiting for completion */ +- int rq_in_driver; ++ int tot_rq_in_driver; ++ /* ++ * number of requests dispatched and waiting for completion ++ * for each actuator ++ */ ++ int rq_in_driver[BFQ_MAX_ACTUATORS]; + + /* true if the device is non rotational and performs queueing */ + bool nonrot_with_queueing; +@@ -648,8 +689,13 @@ struct bfq_data { + /* maximum budget allotted to a bfq_queue before rescheduling */ + int bfq_max_budget; + +- /* list of all the bfq_queues active on the device */ +- struct list_head active_list; ++ /* ++ * List of all the bfq_queues active for a specific actuator ++ * on the device. Keeping active queues separate on a ++ * per-actuator basis helps implementing per-actuator ++ * injection more efficiently. ++ */ ++ struct list_head active_list[BFQ_MAX_ACTUATORS]; + /* list of all the bfq_queues idle on the device */ + struct list_head idle_list; + +@@ -723,8 +769,6 @@ struct bfq_data { + * is multiplied. + */ + unsigned int bfq_wr_coeff; +- /* maximum duration of a weight-raising period (jiffies) */ +- unsigned int bfq_wr_max_time; + + /* Maximum weight-raising duration for soft real-time processes */ + unsigned int bfq_wr_rt_max_time; +@@ -772,6 +816,42 @@ struct bfq_data { + */ + unsigned int word_depths[2][2]; + unsigned int full_depth_shift; ++ ++ /* ++ * Number of independent actuators. This is equal to 1 in ++ * case of single-actuator drives. ++ */ ++ unsigned int num_actuators; ++ /* ++ * Disk independent access ranges for each actuator ++ * in this device. ++ */ ++ sector_t sector[BFQ_MAX_ACTUATORS]; ++ sector_t nr_sectors[BFQ_MAX_ACTUATORS]; ++ struct blk_independent_access_range ia_ranges[BFQ_MAX_ACTUATORS]; ++ ++ /* ++ * If the number of I/O requests queued in the device for a ++ * given actuator is below next threshold, then the actuator ++ * is deemed as underutilized. If this condition is found to ++ * hold for some actuator upon a dispatch, but (i) the ++ * in-service queue does not contain I/O for that actuator, ++ * while (ii) some other queue does contain I/O for that ++ * actuator, then the head I/O request of the latter queue is ++ * returned (injected), instead of the head request of the ++ * currently in-service queue. ++ * ++ * We set the threshold, empirically, to the minimum possible ++ * value for which an actuator is fully utilized, or close to ++ * be fully utilized. By doing so, injected I/O 'steals' as ++ * few drive-queue slots as possibile to the in-service ++ * queue. This reduces as much as possible the probability ++ * that the service of I/O from the in-service bfq_queue gets ++ * delayed because of slot exhaustion, i.e., because all the ++ * slots of the drive queue are filled with I/O injected from ++ * other queues (NCQ provides for 32 slots). 
++ */ ++ unsigned int actuator_load_threshold; + }; + + enum bfqq_state_flags { +@@ -937,8 +1017,8 @@ struct bfq_group { + + struct bfq_data *bfqd; + +- struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; +- struct bfq_queue *async_idle_bfqq; ++ struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; ++ struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; + + struct bfq_entity *my_entity; + +@@ -955,8 +1035,8 @@ struct bfq_group { + struct bfq_entity entity; + struct bfq_sched_data sched_data; + +- struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; +- struct bfq_queue *async_idle_bfqq; ++ struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; ++ struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; + + struct rb_root rq_pos_tree; + }; +@@ -969,8 +1049,10 @@ struct bfq_group { + + extern const int bfq_timeout; + +-struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync); +-void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync); ++struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, ++ unsigned int actuator_idx); ++void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync, ++ unsigned int actuator_idx); + struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); + void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); + void bfq_weights_tree_add(struct bfq_queue *bfqq); +diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c +index ea4c3d757fdd..7941b6f07391 100644 +--- a/block/bfq-wf2q.c ++++ b/block/bfq-wf2q.c +@@ -493,7 +493,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, + bfq_update_active_tree(node); + + if (bfqq) +- list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list[bfqq->actuator_idx]); + + bfq_inc_active_entities(entity); + } +diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c +index 9ac1efb053e0..4272599a3f08 100644 +--- a/block/blk-cgroup.c ++++ b/block/blk-cgroup.c +@@ -33,7 +33,6 @@ + #include "blk-cgroup.h" + #include "blk-ioprio.h" + #include "blk-throttle.h" +-#include "blk-rq-qos.h" + + /* + * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. +@@ -626,69 +625,93 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) + EXPORT_SYMBOL_GPL(__blkg_prfill_u64); + + /** +- * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update +- * @inputp: input string pointer ++ * blkg_conf_init - initialize a blkg_conf_ctx ++ * @ctx: blkg_conf_ctx to initialize ++ * @input: input string ++ * ++ * Initialize @ctx which can be used to parse blkg config input string @input. ++ * Once initialized, @ctx can be used with blkg_conf_open_bdev() and ++ * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit(). ++ */ ++void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input) ++{ ++ *ctx = (struct blkg_conf_ctx){ .input = input }; ++} ++EXPORT_SYMBOL_GPL(blkg_conf_init); ++ ++/** ++ * blkg_conf_open_bdev - parse and open bdev for per-blkg config update ++ * @ctx: blkg_conf_ctx initialized with blkg_conf_init() + * +- * Parse the device node prefix part, MAJ:MIN, of per-blkg config update +- * from @input and get and return the matching bdev. *@inputp is +- * updated to point past the device node prefix. Returns an ERR_PTR() +- * value on error. ++ * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from ++ * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is ++ * set to point past the device node prefix. 
+ * +- * Use this function iff blkg_conf_prep() can't be used for some reason. ++ * This function may be called multiple times on @ctx and the extra calls become ++ * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function ++ * explicitly if bdev access is needed without resolving the blkcg / policy part ++ * of @ctx->input. Returns -errno on error. + */ +-struct block_device *blkcg_conf_open_bdev(char **inputp) ++int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) + { +- char *input = *inputp; ++ char *input = ctx->input; + unsigned int major, minor; + struct block_device *bdev; + int key_len; + ++ if (ctx->bdev) ++ return 0; ++ + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) +- return ERR_PTR(-EINVAL); ++ return -EINVAL; + + input += key_len; + if (!isspace(*input)) +- return ERR_PTR(-EINVAL); ++ return -EINVAL; + input = skip_spaces(input); + + bdev = blkdev_get_no_open(MKDEV(major, minor)); + if (!bdev) +- return ERR_PTR(-ENODEV); ++ return -ENODEV; + if (bdev_is_partition(bdev)) { + blkdev_put_no_open(bdev); +- return ERR_PTR(-ENODEV); ++ return -ENODEV; + } + +- *inputp = input; +- return bdev; ++ ctx->body = input; ++ ctx->bdev = bdev; ++ return 0; + } + + /** + * blkg_conf_prep - parse and prepare for per-blkg config update + * @blkcg: target block cgroup + * @pol: target policy +- * @input: input string +- * @ctx: blkg_conf_ctx to be filled ++ * @ctx: blkg_conf_ctx initialized with blkg_conf_init() ++ * ++ * Parse per-blkg config update from @ctx->input and initialize @ctx ++ * accordingly. On success, @ctx->body points to the part of @ctx->input ++ * following MAJ:MIN, @ctx->bdev points to the target block device and ++ * @ctx->blkg to the blkg being configured. + * +- * Parse per-blkg config update from @input and initialize @ctx with the +- * result. @ctx->blkg points to the blkg to be updated and @ctx->body the +- * part of @input following MAJ:MIN. This function returns with RCU read +- * lock and queue lock held and must be paired with blkg_conf_finish(). ++ * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this ++ * function returns with queue lock held and must be followed by ++ * blkg_conf_exit(). + */ + int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, +- char *input, struct blkg_conf_ctx *ctx) +- __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock) ++ struct blkg_conf_ctx *ctx) ++ __acquires(&bdev->bd_queue->queue_lock) + { +- struct block_device *bdev; + struct gendisk *disk; + struct request_queue *q; + struct blkcg_gq *blkg; + int ret; + +- bdev = blkcg_conf_open_bdev(&input); +- if (IS_ERR(bdev)) +- return PTR_ERR(bdev); +- disk = bdev->bd_disk; ++ ret = blkg_conf_open_bdev(ctx); ++ if (ret) ++ return ret; ++ ++ disk = ctx->bdev->bd_disk; + q = disk->queue; + + /* +@@ -699,7 +722,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + if (ret) + goto fail; + +- rcu_read_lock(); + spin_lock_irq(&q->queue_lock); + + if (!blkcg_policy_enabled(q, pol)) { +@@ -728,7 +750,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + + /* Drop locks to do new blkg allocation with GFP_KERNEL. 
*/ + spin_unlock_irq(&q->queue_lock); +- rcu_read_unlock(); + + new_blkg = blkg_alloc(pos, disk, GFP_KERNEL); + if (unlikely(!new_blkg)) { +@@ -742,7 +763,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + goto fail_exit_queue; + } + +- rcu_read_lock(); + spin_lock_irq(&q->queue_lock); + + if (!blkcg_policy_enabled(q, pol)) { +@@ -769,20 +789,16 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + } + success: + blk_queue_exit(q); +- ctx->bdev = bdev; + ctx->blkg = blkg; +- ctx->body = input; + return 0; + + fail_preloaded: + radix_tree_preload_end(); + fail_unlock: + spin_unlock_irq(&q->queue_lock); +- rcu_read_unlock(); + fail_exit_queue: + blk_queue_exit(q); + fail: +- blkdev_put_no_open(bdev); + /* + * If queue was bypassing, we should retry. Do so after a + * short msleep(). It isn't strictly necessary but queue +@@ -798,20 +814,27 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + EXPORT_SYMBOL_GPL(blkg_conf_prep); + + /** +- * blkg_conf_finish - finish up per-blkg config update +- * @ctx: blkg_conf_ctx initialized by blkg_conf_prep() ++ * blkg_conf_exit - clean up per-blkg config update ++ * @ctx: blkg_conf_ctx initialized with blkg_conf_init() + * +- * Finish up after per-blkg config update. This function must be paired +- * with blkg_conf_prep(). ++ * Clean up after per-blkg config update. This function must be called on all ++ * blkg_conf_ctx's initialized with blkg_conf_init(). + */ +-void blkg_conf_finish(struct blkg_conf_ctx *ctx) +- __releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu) ++void blkg_conf_exit(struct blkg_conf_ctx *ctx) ++ __releases(&ctx->bdev->bd_queue->queue_lock) + { +- spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); +- rcu_read_unlock(); +- blkdev_put_no_open(ctx->bdev); ++ if (ctx->blkg) { ++ spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); ++ ctx->blkg = NULL; ++ } ++ ++ if (ctx->bdev) { ++ blkdev_put_no_open(ctx->bdev); ++ ctx->body = NULL; ++ ctx->bdev = NULL; ++ } + } +-EXPORT_SYMBOL_GPL(blkg_conf_finish); ++EXPORT_SYMBOL_GPL(blkg_conf_exit); + + static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) + { +@@ -1300,14 +1323,8 @@ int blkcg_init_disk(struct gendisk *disk) + if (ret) + goto err_ioprio_exit; + +- ret = blk_iolatency_init(disk); +- if (ret) +- goto err_throtl_exit; +- + return 0; + +-err_throtl_exit: +- blk_throtl_exit(disk); + err_ioprio_exit: + blk_ioprio_exit(disk); + err_destroy_all: +@@ -1323,7 +1340,6 @@ int blkcg_init_disk(struct gendisk *disk) + void blkcg_exit_disk(struct gendisk *disk) + { + blkg_destroy_all(disk); +- rq_qos_exit(disk->queue); + blk_throtl_exit(disk); + } + +diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h +index 1e94e404eaa8..fe09e8b4c2a8 100644 +--- a/block/blk-cgroup.h ++++ b/block/blk-cgroup.h +@@ -208,15 +208,17 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, + u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); + + struct blkg_conf_ctx { ++ char *input; ++ char *body; + struct block_device *bdev; + struct blkcg_gq *blkg; +- char *body; + }; + +-struct block_device *blkcg_conf_open_bdev(char **inputp); ++void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); ++int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); + int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, +- char *input, struct blkg_conf_ctx *ctx); +-void blkg_conf_finish(struct blkg_conf_ctx *ctx); ++ struct blkg_conf_ctx *ctx); ++void 
blkg_conf_exit(struct blkg_conf_ctx *ctx); + + /** + * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg +diff --git a/block/blk-iocost.c b/block/blk-iocost.c +index 6955605629e4..22a3639a7a05 100644 +--- a/block/blk-iocost.c ++++ b/block/blk-iocost.c +@@ -3091,9 +3091,11 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, + return nbytes; + } + +- ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx); ++ blkg_conf_init(&ctx, buf); ++ ++ ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx); + if (ret) +- return ret; ++ goto err; + + iocg = blkg_to_iocg(ctx.blkg); + +@@ -3112,12 +3114,14 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, + weight_updated(iocg, &now); + spin_unlock(&iocg->ioc->lock); + +- blkg_conf_finish(&ctx); ++ blkg_conf_exit(&ctx); + return nbytes; + + einval: +- blkg_conf_finish(&ctx); +- return -EINVAL; ++ ret = -EINVAL; ++err: ++ blkg_conf_exit(&ctx); ++ return ret; + } + + static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, +@@ -3172,19 +3176,22 @@ static const match_table_t qos_tokens = { + static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, + size_t nbytes, loff_t off) + { +- struct block_device *bdev; ++ struct blkg_conf_ctx ctx; + struct gendisk *disk; + struct ioc *ioc; + u32 qos[NR_QOS_PARAMS]; + bool enable, user; +- char *p; ++ char *body, *p; + int ret; + +- bdev = blkcg_conf_open_bdev(&input); +- if (IS_ERR(bdev)) +- return PTR_ERR(bdev); ++ blkg_conf_init(&ctx, input); + +- disk = bdev->bd_disk; ++ ret = blkg_conf_open_bdev(&ctx); ++ if (ret) ++ goto err; ++ ++ body = ctx.body; ++ disk = ctx.bdev->bd_disk; + ioc = q_to_ioc(disk->queue); + if (!ioc) { + ret = blk_iocost_init(disk); +@@ -3201,7 +3208,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, + enable = ioc->enabled; + user = ioc->user_qos_params; + +- while ((p = strsep(&input, " \t\n"))) { ++ while ((p = strsep(&body, " \t\n"))) { + substring_t args[MAX_OPT_ARGS]; + char buf[32]; + int tok; +@@ -3290,7 +3297,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, + blk_mq_unquiesce_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue); + +- blkdev_put_no_open(bdev); ++ blkg_conf_exit(&ctx); + return nbytes; + einval: + spin_unlock_irq(&ioc->lock); +@@ -3300,7 +3307,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, + + ret = -EINVAL; + err: +- blkdev_put_no_open(bdev); ++ blkg_conf_exit(&ctx); + return ret; + } + +@@ -3351,22 +3358,25 @@ static const match_table_t i_lcoef_tokens = { + static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, + size_t nbytes, loff_t off) + { +- struct block_device *bdev; ++ struct blkg_conf_ctx ctx; + struct request_queue *q; + struct ioc *ioc; + u64 u[NR_I_LCOEFS]; + bool user; +- char *p; ++ char *body, *p; + int ret; + +- bdev = blkcg_conf_open_bdev(&input); +- if (IS_ERR(bdev)) +- return PTR_ERR(bdev); ++ blkg_conf_init(&ctx, input); ++ ++ ret = blkg_conf_open_bdev(&ctx); ++ if (ret) ++ goto err; + +- q = bdev_get_queue(bdev); ++ body = ctx.body; ++ q = bdev_get_queue(ctx.bdev); + ioc = q_to_ioc(q); + if (!ioc) { +- ret = blk_iocost_init(bdev->bd_disk); ++ ret = blk_iocost_init(ctx.bdev->bd_disk); + if (ret) + goto err; + ioc = q_to_ioc(q); +@@ -3379,7 +3389,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, + memcpy(u, ioc->params.i_lcoefs, sizeof(u)); + user = ioc->user_cost_model; + +- while ((p = strsep(&input, " 
\t\n"))) { ++ while ((p = strsep(&body, " \t\n"))) { + substring_t args[MAX_OPT_ARGS]; + char buf[32]; + int tok; +@@ -3426,7 +3436,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + +- blkdev_put_no_open(bdev); ++ blkg_conf_exit(&ctx); + return nbytes; + + einval: +@@ -3437,7 +3447,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, + + ret = -EINVAL; + err: +- blkdev_put_no_open(bdev); ++ blkg_conf_exit(&ctx); + return ret; + } + +diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c +index ecdc10741836..3484393dbc4a 100644 +--- a/block/blk-iolatency.c ++++ b/block/blk-iolatency.c +@@ -755,7 +755,7 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) + } + } + +-int blk_iolatency_init(struct gendisk *disk) ++static int blk_iolatency_init(struct gendisk *disk) + { + struct request_queue *q = disk->queue; + struct blk_iolatency *blkiolat; +@@ -830,6 +830,29 @@ static void iolatency_clear_scaling(struct blkcg_gq *blkg) + } + } + ++static int blk_iolatency_try_init(struct blkg_conf_ctx *ctx) ++{ ++ static DEFINE_MUTEX(init_mutex); ++ int ret; ++ ++ ret = blkg_conf_open_bdev(ctx); ++ if (ret) ++ return ret; ++ ++ /* ++ * blk_iolatency_init() may fail after rq_qos_add() succeeds which can ++ * confuse iolat_rq_qos() test. Make the test and init atomic. ++ */ ++ mutex_lock(&init_mutex); ++ ++ if (!iolat_rq_qos(ctx->bdev->bd_queue)) ++ ret = blk_iolatency_init(ctx->bdev->bd_disk); ++ ++ mutex_unlock(&init_mutex); ++ ++ return ret; ++} ++ + static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) + { +@@ -842,9 +865,15 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, + u64 oldval; + int ret; + +- ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx); ++ blkg_conf_init(&ctx, buf); ++ ++ ret = blk_iolatency_try_init(&ctx); + if (ret) +- return ret; ++ goto out; ++ ++ ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, &ctx); ++ if (ret) ++ goto out; + + iolat = blkg_to_lat(ctx.blkg); + p = ctx.body; +@@ -880,7 +909,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, + iolatency_clear_scaling(blkg); + ret = 0; + out: +- blkg_conf_finish(&ctx); ++ blkg_conf_exit(&ctx); + return ret ?: nbytes; + } + +@@ -974,7 +1003,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) + { + struct iolatency_grp *iolat = pd_to_lat(pd); + struct blkcg_gq *blkg = lat_to_blkg(iolat); +- struct rq_qos *rqos = blkcg_rq_qos(blkg->q); ++ struct rq_qos *rqos = iolat_rq_qos(blkg->q); + struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); + u64 now = ktime_to_ns(ktime_get()); + int cpu; +diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h +index 1ef1f7d4bc3c..27f004fae66b 100644 +--- a/block/blk-rq-qos.h ++++ b/block/blk-rq-qos.h +@@ -74,7 +74,7 @@ static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) + return rq_qos_id(q, RQ_QOS_WBT); + } + +-static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) ++static inline struct rq_qos *iolat_rq_qos(struct request_queue *q) + { + return rq_qos_id(q, RQ_QOS_LATENCY); + } +diff --git a/block/blk-throttle.c b/block/blk-throttle.c +index 6fb5a2f9e1ee..75841d1d9bf4 100644 +--- a/block/blk-throttle.c ++++ b/block/blk-throttle.c +@@ -1369,9 +1369,11 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, + int ret; + u64 v; + +- ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); ++ 
blkg_conf_init(&ctx, buf); ++ ++ ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); + if (ret) +- return ret; ++ goto out_finish; + + ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) != 1) +@@ -1390,7 +1392,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, + tg_conf_updated(tg, false); + ret = 0; + out_finish: +- blkg_conf_finish(&ctx); ++ blkg_conf_exit(&ctx); + return ret ?: nbytes; + } + +@@ -1562,9 +1564,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, + int ret; + int index = of_cft(of)->private; + +- ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); ++ blkg_conf_init(&ctx, buf); ++ ++ ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); + if (ret) +- return ret; ++ goto out_finish; + + tg = blkg_to_tg(ctx.blkg); + tg_update_carryover(tg); +@@ -1663,7 +1667,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, + tg->td->limit_valid[LIMIT_LOW]); + ret = 0; + out_finish: +- blkg_conf_finish(&ctx); ++ blkg_conf_exit(&ctx); + return ret ?: nbytes; + } + +diff --git a/block/blk.h b/block/blk.h +index 4c3b3325219a..78f1706cddca 100644 +--- a/block/blk.h ++++ b/block/blk.h +@@ -392,12 +392,6 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, + return bio; + } + +-#ifdef CONFIG_BLK_CGROUP_IOLATENCY +-int blk_iolatency_init(struct gendisk *disk); +-#else +-static inline int blk_iolatency_init(struct gendisk *disk) { return 0; }; +-#endif +- + #ifdef CONFIG_BLK_DEV_ZONED + void disk_free_zone_bitmaps(struct gendisk *disk); + void disk_clear_zone_settings(struct gendisk *disk); +-- +2.39.2 + +From f5846f885c52570685c30c97eae68dbebe7639b3 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 13 Feb 2023 11:26:20 +0100 +Subject: [PATCH 03/15] bitmap + +Signed-off-by: Peter Jung +--- + include/linux/bitmap.h | 46 ++++++------- + include/linux/cpumask.h | 144 +++++++++++++++++++-------------------- + include/linux/find.h | 40 +++++------ + include/linux/nodemask.h | 86 +++++++++++------------ + 4 files changed, 158 insertions(+), 158 deletions(-) + +diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h +index 7d6d73b78147..40e53a2ecc0d 100644 +--- a/include/linux/bitmap.h ++++ b/include/linux/bitmap.h +@@ -189,7 +189,7 @@ unsigned long bitmap_find_next_zero_area_off(unsigned long *map, + * the bit offset of all zero areas this function finds is multiples of that + * power of 2. A @align_mask of 0 means no alignment is required. 
+ */ +-static inline unsigned long ++static __always_inline unsigned long + bitmap_find_next_zero_area(unsigned long *map, + unsigned long size, + unsigned long start, +@@ -237,7 +237,7 @@ extern int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp, + #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) + #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) + +-static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) ++static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits) + { + unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + +@@ -247,7 +247,7 @@ static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) + memset(dst, 0, len); + } + +-static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) ++static __always_inline void bitmap_fill(unsigned long *dst, unsigned int nbits) + { + unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + +@@ -257,7 +257,7 @@ static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) + memset(dst, 0xff, len); + } + +-static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, ++static __always_inline void bitmap_copy(unsigned long *dst, const unsigned long *src, + unsigned int nbits) + { + unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); +@@ -271,7 +271,7 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, + /* + * Copy bitmap and clear tail bits in last word. + */ +-static inline void bitmap_copy_clear_tail(unsigned long *dst, ++static __always_inline void bitmap_copy_clear_tail(unsigned long *dst, + const unsigned long *src, unsigned int nbits) + { + bitmap_copy(dst, src, nbits); +@@ -317,7 +317,7 @@ void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits); + bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits)) + #endif + +-static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, ++static __always_inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -325,7 +325,7 @@ static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, + return __bitmap_and(dst, src1, src2, nbits); + } + +-static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, ++static __always_inline void bitmap_or(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -334,7 +334,7 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, + __bitmap_or(dst, src1, src2, nbits); + } + +-static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, ++static __always_inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -343,7 +343,7 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, + __bitmap_xor(dst, src1, src2, nbits); + } + +-static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, ++static __always_inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -351,7 +351,7 @@ static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, + return __bitmap_andnot(dst, 
src1, src2, nbits); + } + +-static inline void bitmap_complement(unsigned long *dst, const unsigned long *src, ++static __always_inline void bitmap_complement(unsigned long *dst, const unsigned long *src, + unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -367,7 +367,7 @@ static inline void bitmap_complement(unsigned long *dst, const unsigned long *sr + #endif + #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1) + +-static inline bool bitmap_equal(const unsigned long *src1, ++static __always_inline bool bitmap_equal(const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -387,7 +387,7 @@ static inline bool bitmap_equal(const unsigned long *src1, + * + * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise + */ +-static inline bool bitmap_or_equal(const unsigned long *src1, ++static __always_inline bool bitmap_or_equal(const unsigned long *src1, + const unsigned long *src2, + const unsigned long *src3, + unsigned int nbits) +@@ -398,7 +398,7 @@ static inline bool bitmap_or_equal(const unsigned long *src1, + return !(((*src1 | *src2) ^ *src3) & BITMAP_LAST_WORD_MASK(nbits)); + } + +-static inline bool bitmap_intersects(const unsigned long *src1, ++static __always_inline bool bitmap_intersects(const unsigned long *src1, + const unsigned long *src2, + unsigned int nbits) + { +@@ -408,7 +408,7 @@ static inline bool bitmap_intersects(const unsigned long *src1, + return __bitmap_intersects(src1, src2, nbits); + } + +-static inline bool bitmap_subset(const unsigned long *src1, ++static __always_inline bool bitmap_subset(const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -417,7 +417,7 @@ static inline bool bitmap_subset(const unsigned long *src1, + return __bitmap_subset(src1, src2, nbits); + } + +-static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) ++static __always_inline bool bitmap_empty(const unsigned long *src, unsigned int nbits) + { + if (small_const_nbits(nbits)) + return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); +@@ -425,7 +425,7 @@ static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) + return find_first_bit(src, nbits) == nbits; + } + +-static inline bool bitmap_full(const unsigned long *src, unsigned int nbits) ++static __always_inline bool bitmap_full(const unsigned long *src, unsigned int nbits) + { + if (small_const_nbits(nbits)) + return ! 
(~(*src) & BITMAP_LAST_WORD_MASK(nbits)); +@@ -482,7 +482,7 @@ static __always_inline void bitmap_clear(unsigned long *map, unsigned int start, + __bitmap_clear(map, start, nbits); + } + +-static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, ++static __always_inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -491,7 +491,7 @@ static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *s + __bitmap_shift_right(dst, src, shift, nbits); + } + +-static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, ++static __always_inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits) + { + if (small_const_nbits(nbits)) +@@ -500,7 +500,7 @@ static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *sr + __bitmap_shift_left(dst, src, shift, nbits); + } + +-static inline void bitmap_replace(unsigned long *dst, ++static __always_inline void bitmap_replace(unsigned long *dst, + const unsigned long *old, + const unsigned long *new, + const unsigned long *mask, +@@ -512,7 +512,7 @@ static inline void bitmap_replace(unsigned long *dst, + __bitmap_replace(dst, old, new, mask, nbits); + } + +-static inline void bitmap_next_set_region(unsigned long *bitmap, ++static __always_inline void bitmap_next_set_region(unsigned long *bitmap, + unsigned int *rs, unsigned int *re, + unsigned int end) + { +@@ -563,7 +563,7 @@ static inline void bitmap_next_set_region(unsigned long *bitmap, + * That is ``(u32 *)(&val)[0]`` gets the upper 32 bits, + * but we expect the lower 32-bits of u64. + */ +-static inline void bitmap_from_u64(unsigned long *dst, u64 mask) ++static __always_inline void bitmap_from_u64(unsigned long *dst, u64 mask) + { + bitmap_from_arr64(dst, &mask, 64); + } +@@ -576,7 +576,7 @@ static inline void bitmap_from_u64(unsigned long *dst, u64 mask) + * Returns the 8-bit value located at the @start bit offset within the @src + * memory region. + */ +-static inline unsigned long bitmap_get_value8(const unsigned long *map, ++static __always_inline unsigned long bitmap_get_value8(const unsigned long *map, + unsigned long start) + { + const size_t index = BIT_WORD(start); +@@ -591,7 +591,7 @@ static inline unsigned long bitmap_get_value8(const unsigned long *map, + * @value: the 8-bit value; values wider than 8 bits may clobber bitmap + * @start: bit offset of the 8-bit value; must be a multiple of 8 + */ +-static inline void bitmap_set_value8(unsigned long *map, unsigned long value, ++static __always_inline void bitmap_set_value8(unsigned long *map, unsigned long value, + unsigned long start) + { + const size_t index = BIT_WORD(start); +diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h +index c2aa0aa26b45..9543b22d6dc2 100644 +--- a/include/linux/cpumask.h ++++ b/include/linux/cpumask.h +@@ -41,7 +41,7 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; + extern unsigned int nr_cpu_ids; + #endif + +-static inline void set_nr_cpu_ids(unsigned int nr) ++static __always_inline void set_nr_cpu_ids(unsigned int nr) + { + #if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) + WARN_ON(nr != nr_cpu_ids); +@@ -124,7 +124,7 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu) + * + * Returns >= nr_cpu_ids if no cpus set. 
+ */ +-static inline unsigned int cpumask_first(const struct cpumask *srcp) ++static __always_inline unsigned int cpumask_first(const struct cpumask *srcp) + { + return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); + } +@@ -135,7 +135,7 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) + * + * Returns >= nr_cpu_ids if all cpus are set. + */ +-static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) ++static __always_inline unsigned int cpumask_first_zero(const struct cpumask *srcp) + { + return find_first_zero_bit(cpumask_bits(srcp), nr_cpumask_bits); + } +@@ -147,7 +147,7 @@ static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) + * + * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). + */ +-static inline ++static __always_inline + unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) + { + return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); +@@ -159,7 +159,7 @@ unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask + * + * Returns >= nr_cpumask_bits if no CPUs set. + */ +-static inline unsigned int cpumask_last(const struct cpumask *srcp) ++static __always_inline unsigned int cpumask_last(const struct cpumask *srcp) + { + return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits); + } +@@ -171,7 +171,7 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp) + * + * Returns >= nr_cpu_ids if no further cpus set. + */ +-static inline ++static __always_inline + unsigned int cpumask_next(int n, const struct cpumask *srcp) + { + /* -1 is a legal arg here. */ +@@ -187,7 +187,7 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp) + * + * Returns >= nr_cpu_ids if no further cpus unset. + */ +-static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) ++static __always_inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) + { + /* -1 is a legal arg here. */ + if (n != -1) +@@ -197,18 +197,18 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) + + #if NR_CPUS == 1 + /* Uniprocessor: there is only one valid CPU */ +-static inline unsigned int cpumask_local_spread(unsigned int i, int node) ++static __always_inline unsigned int cpumask_local_spread(unsigned int i, int node) + { + return 0; + } + +-static inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, ++static __always_inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, + const struct cpumask *src2p) + { + return cpumask_first_and(src1p, src2p); + } + +-static inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) ++static __always_inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) + { + return cpumask_first(srcp); + } +@@ -227,7 +227,7 @@ unsigned int cpumask_any_distribute(const struct cpumask *srcp); + * + * Returns >= nr_cpu_ids if no further cpus set in both. 
+ */ +-static inline ++static __always_inline + unsigned int cpumask_next_and(int n, const struct cpumask *src1p, + const struct cpumask *src2p) + { +@@ -259,7 +259,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, + for_each_clear_bit(cpu, cpumask_bits(mask), nr_cpumask_bits) + + #if NR_CPUS == 1 +-static inline ++static __always_inline + unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) + { + cpumask_check(start); +@@ -335,7 +335,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta + * Often used to find any cpu but smp_processor_id() in a mask. + * Returns >= nr_cpu_ids if no cpus set. + */ +-static inline ++static __always_inline + unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) + { + unsigned int i; +@@ -354,7 +354,7 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) + * + * Returns >= nr_cpu_ids if such cpu doesn't exist. + */ +-static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) ++static __always_inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) + { + return find_nth_bit(cpumask_bits(srcp), nr_cpumask_bits, cpumask_check(cpu)); + } +@@ -367,7 +367,7 @@ static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *s + * + * Returns >= nr_cpu_ids if such cpu doesn't exist. + */ +-static inline ++static __always_inline + unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, + const struct cpumask *srcp2) + { +@@ -383,7 +383,7 @@ unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, + * + * Returns >= nr_cpu_ids if such cpu doesn't exist. + */ +-static inline ++static __always_inline + unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1, + const struct cpumask *srcp2) + { +@@ -476,7 +476,7 @@ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask * + * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask + * @dstp: the cpumask pointer + */ +-static inline void cpumask_setall(struct cpumask *dstp) ++static __always_inline void cpumask_setall(struct cpumask *dstp) + { + bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); + } +@@ -485,7 +485,7 @@ static inline void cpumask_setall(struct cpumask *dstp) + * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask + * @dstp: the cpumask pointer + */ +-static inline void cpumask_clear(struct cpumask *dstp) ++static __always_inline void cpumask_clear(struct cpumask *dstp) + { + bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits); + } +@@ -498,7 +498,7 @@ static inline void cpumask_clear(struct cpumask *dstp) + * + * If *@dstp is empty, returns false, else returns true + */ +-static inline bool cpumask_and(struct cpumask *dstp, ++static __always_inline bool cpumask_and(struct cpumask *dstp, + const struct cpumask *src1p, + const struct cpumask *src2p) + { +@@ -512,7 +512,7 @@ static inline bool cpumask_and(struct cpumask *dstp, + * @src1p: the first input + * @src2p: the second input + */ +-static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, ++static __always_inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, + const struct cpumask *src2p) + { + bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), +@@ -525,7 +525,7 @@ static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, + * @src1p: the first input + * @src2p: the second input + */ +-static inline void 
cpumask_xor(struct cpumask *dstp, ++static __always_inline void cpumask_xor(struct cpumask *dstp, + const struct cpumask *src1p, + const struct cpumask *src2p) + { +@@ -541,7 +541,7 @@ static inline void cpumask_xor(struct cpumask *dstp, + * + * If *@dstp is empty, returns false, else returns true + */ +-static inline bool cpumask_andnot(struct cpumask *dstp, ++static __always_inline bool cpumask_andnot(struct cpumask *dstp, + const struct cpumask *src1p, + const struct cpumask *src2p) + { +@@ -554,7 +554,7 @@ static inline bool cpumask_andnot(struct cpumask *dstp, + * @dstp: the cpumask result + * @srcp: the input to invert + */ +-static inline void cpumask_complement(struct cpumask *dstp, ++static __always_inline void cpumask_complement(struct cpumask *dstp, + const struct cpumask *srcp) + { + bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp), +@@ -566,7 +566,7 @@ static inline void cpumask_complement(struct cpumask *dstp, + * @src1p: the first input + * @src2p: the second input + */ +-static inline bool cpumask_equal(const struct cpumask *src1p, ++static __always_inline bool cpumask_equal(const struct cpumask *src1p, + const struct cpumask *src2p) + { + return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), +@@ -579,7 +579,7 @@ static inline bool cpumask_equal(const struct cpumask *src1p, + * @src2p: the second input + * @src3p: the third input + */ +-static inline bool cpumask_or_equal(const struct cpumask *src1p, ++static __always_inline bool cpumask_or_equal(const struct cpumask *src1p, + const struct cpumask *src2p, + const struct cpumask *src3p) + { +@@ -592,7 +592,7 @@ static inline bool cpumask_or_equal(const struct cpumask *src1p, + * @src1p: the first input + * @src2p: the second input + */ +-static inline bool cpumask_intersects(const struct cpumask *src1p, ++static __always_inline bool cpumask_intersects(const struct cpumask *src1p, + const struct cpumask *src2p) + { + return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p), +@@ -606,7 +606,7 @@ static inline bool cpumask_intersects(const struct cpumask *src1p, + * + * Returns true if *@src1p is a subset of *@src2p, else returns false + */ +-static inline bool cpumask_subset(const struct cpumask *src1p, ++static __always_inline bool cpumask_subset(const struct cpumask *src1p, + const struct cpumask *src2p) + { + return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), +@@ -617,7 +617,7 @@ static inline bool cpumask_subset(const struct cpumask *src1p, + * cpumask_empty - *srcp == 0 + * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear. + */ +-static inline bool cpumask_empty(const struct cpumask *srcp) ++static __always_inline bool cpumask_empty(const struct cpumask *srcp) + { + return bitmap_empty(cpumask_bits(srcp), nr_cpumask_bits); + } +@@ -626,7 +626,7 @@ static inline bool cpumask_empty(const struct cpumask *srcp) + * cpumask_full - *srcp == 0xFFFFFFFF... + * @srcp: the cpumask to that all cpus < nr_cpu_ids are set. + */ +-static inline bool cpumask_full(const struct cpumask *srcp) ++static __always_inline bool cpumask_full(const struct cpumask *srcp) + { + return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits); + } +@@ -635,7 +635,7 @@ static inline bool cpumask_full(const struct cpumask *srcp) + * cpumask_weight - Count of bits in *srcp + * @srcp: the cpumask to count bits (< nr_cpu_ids) in. 
+ */ +-static inline unsigned int cpumask_weight(const struct cpumask *srcp) ++static __always_inline unsigned int cpumask_weight(const struct cpumask *srcp) + { + return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits); + } +@@ -645,7 +645,7 @@ static inline unsigned int cpumask_weight(const struct cpumask *srcp) + * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. + * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. + */ +-static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, ++static __always_inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, + const struct cpumask *srcp2) + { + return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); +@@ -657,7 +657,7 @@ static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, + * @srcp: the input to shift + * @n: the number of bits to shift by + */ +-static inline void cpumask_shift_right(struct cpumask *dstp, ++static __always_inline void cpumask_shift_right(struct cpumask *dstp, + const struct cpumask *srcp, int n) + { + bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, +@@ -670,7 +670,7 @@ static inline void cpumask_shift_right(struct cpumask *dstp, + * @srcp: the input to shift + * @n: the number of bits to shift by + */ +-static inline void cpumask_shift_left(struct cpumask *dstp, ++static __always_inline void cpumask_shift_left(struct cpumask *dstp, + const struct cpumask *srcp, int n) + { + bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, +@@ -682,7 +682,7 @@ static inline void cpumask_shift_left(struct cpumask *dstp, + * @dstp: the result + * @srcp: the input cpumask + */ +-static inline void cpumask_copy(struct cpumask *dstp, ++static __always_inline void cpumask_copy(struct cpumask *dstp, + const struct cpumask *srcp) + { + bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits); +@@ -719,7 +719,7 @@ static inline void cpumask_copy(struct cpumask *dstp, + * + * Returns -errno, or 0 for success. + */ +-static inline int cpumask_parse_user(const char __user *buf, int len, ++static __always_inline int cpumask_parse_user(const char __user *buf, int len, + struct cpumask *dstp) + { + return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); +@@ -733,7 +733,7 @@ static inline int cpumask_parse_user(const char __user *buf, int len, + * + * Returns -errno, or 0 for success. + */ +-static inline int cpumask_parselist_user(const char __user *buf, int len, ++static __always_inline int cpumask_parselist_user(const char __user *buf, int len, + struct cpumask *dstp) + { + return bitmap_parselist_user(buf, len, cpumask_bits(dstp), +@@ -747,7 +747,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, + * + * Returns -errno, or 0 for success. + */ +-static inline int cpumask_parse(const char *buf, struct cpumask *dstp) ++static __always_inline int cpumask_parse(const char *buf, struct cpumask *dstp) + { + return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits); + } +@@ -759,7 +759,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) + * + * Returns -errno, or 0 for success. 
+ */ +-static inline int cpulist_parse(const char *buf, struct cpumask *dstp) ++static __always_inline int cpulist_parse(const char *buf, struct cpumask *dstp) + { + return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); + } +@@ -767,7 +767,7 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) + /** + * cpumask_size - size to allocate for a 'struct cpumask' in bytes + */ +-static inline unsigned int cpumask_size(void) ++static __always_inline unsigned int cpumask_size(void) + { + return BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long); + } +@@ -820,7 +820,7 @@ typedef struct cpumask *cpumask_var_t; + + bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); + +-static inline ++static __always_inline + bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) + { + return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node); +@@ -836,13 +836,13 @@ bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) + * + * See alloc_cpumask_var_node. + */ +-static inline ++static __always_inline + bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) + { + return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE); + } + +-static inline ++static __always_inline + bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) + { + return alloc_cpumask_var(mask, flags | __GFP_ZERO); +@@ -852,7 +852,7 @@ void alloc_bootmem_cpumask_var(cpumask_var_t *mask); + void free_cpumask_var(cpumask_var_t mask); + void free_bootmem_cpumask_var(cpumask_var_t mask); + +-static inline bool cpumask_available(cpumask_var_t mask) ++static __always_inline bool cpumask_available(cpumask_var_t mask) + { + return mask != NULL; + } +@@ -863,43 +863,43 @@ typedef struct cpumask cpumask_var_t[1]; + #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) + #define __cpumask_var_read_mostly + +-static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) ++static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) + { + return true; + } + +-static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, ++static __always_inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, + int node) + { + return true; + } + +-static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) ++static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) + { + cpumask_clear(*mask); + return true; + } + +-static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, ++static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, + int node) + { + cpumask_clear(*mask); + return true; + } + +-static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) ++static __always_inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) + { + } + +-static inline void free_cpumask_var(cpumask_var_t mask) ++static __always_inline void free_cpumask_var(cpumask_var_t mask) + { + } + +-static inline void free_bootmem_cpumask_var(cpumask_var_t mask) ++static __always_inline void free_bootmem_cpumask_var(cpumask_var_t mask) + { + } + +-static inline bool cpumask_available(cpumask_var_t mask) ++static __always_inline bool cpumask_available(cpumask_var_t mask) + { + return true; + } +@@ -929,12 +929,12 @@ void init_cpu_present(const struct cpumask *src); + void init_cpu_possible(const struct cpumask *src); + void init_cpu_online(const struct cpumask *src); + +-static inline void reset_cpu_possible_mask(void) ++static __always_inline void reset_cpu_possible_mask(void) + 
{ + bitmap_zero(cpumask_bits(&__cpu_possible_mask), NR_CPUS); + } + +-static inline void ++static __always_inline void + set_cpu_possible(unsigned int cpu, bool possible) + { + if (possible) +@@ -943,7 +943,7 @@ set_cpu_possible(unsigned int cpu, bool possible) + cpumask_clear_cpu(cpu, &__cpu_possible_mask); + } + +-static inline void ++static __always_inline void + set_cpu_present(unsigned int cpu, bool present) + { + if (present) +@@ -954,7 +954,7 @@ set_cpu_present(unsigned int cpu, bool present) + + void set_cpu_online(unsigned int cpu, bool online); + +-static inline void ++static __always_inline void + set_cpu_active(unsigned int cpu, bool active) + { + if (active) +@@ -963,7 +963,7 @@ set_cpu_active(unsigned int cpu, bool active) + cpumask_clear_cpu(cpu, &__cpu_active_mask); + } + +-static inline void ++static __always_inline void + set_cpu_dying(unsigned int cpu, bool dying) + { + if (dying) +@@ -986,7 +986,7 @@ set_cpu_dying(unsigned int cpu, bool dying) + ((struct cpumask *)(1 ? (bitmap) \ + : (void *)sizeof(__check_is_bitmap(bitmap)))) + +-static inline int __check_is_bitmap(const unsigned long *bitmap) ++static __always_inline int __check_is_bitmap(const unsigned long *bitmap) + { + return 1; + } +@@ -1001,7 +1001,7 @@ static inline int __check_is_bitmap(const unsigned long *bitmap) + extern const unsigned long + cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; + +-static inline const struct cpumask *get_cpu_mask(unsigned int cpu) ++static __always_inline const struct cpumask *get_cpu_mask(unsigned int cpu) + { + const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; + p -= cpu / BITS_PER_LONG; +@@ -1017,7 +1017,7 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu) + * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held + * region. 
+ */ +-static inline unsigned int num_online_cpus(void) ++static __always_inline unsigned int num_online_cpus(void) + { + return atomic_read(&__num_online_cpus); + } +@@ -1025,27 +1025,27 @@ static inline unsigned int num_online_cpus(void) + #define num_present_cpus() cpumask_weight(cpu_present_mask) + #define num_active_cpus() cpumask_weight(cpu_active_mask) + +-static inline bool cpu_online(unsigned int cpu) ++static __always_inline bool cpu_online(unsigned int cpu) + { + return cpumask_test_cpu(cpu, cpu_online_mask); + } + +-static inline bool cpu_possible(unsigned int cpu) ++static __always_inline bool cpu_possible(unsigned int cpu) + { + return cpumask_test_cpu(cpu, cpu_possible_mask); + } + +-static inline bool cpu_present(unsigned int cpu) ++static __always_inline bool cpu_present(unsigned int cpu) + { + return cpumask_test_cpu(cpu, cpu_present_mask); + } + +-static inline bool cpu_active(unsigned int cpu) ++static __always_inline bool cpu_active(unsigned int cpu) + { + return cpumask_test_cpu(cpu, cpu_active_mask); + } + +-static inline bool cpu_dying(unsigned int cpu) ++static __always_inline bool cpu_dying(unsigned int cpu) + { + return cpumask_test_cpu(cpu, cpu_dying_mask); + } +@@ -1057,27 +1057,27 @@ static inline bool cpu_dying(unsigned int cpu) + #define num_present_cpus() 1U + #define num_active_cpus() 1U + +-static inline bool cpu_online(unsigned int cpu) ++static __always_inline bool cpu_online(unsigned int cpu) + { + return cpu == 0; + } + +-static inline bool cpu_possible(unsigned int cpu) ++static __always_inline bool cpu_possible(unsigned int cpu) + { + return cpu == 0; + } + +-static inline bool cpu_present(unsigned int cpu) ++static __always_inline bool cpu_present(unsigned int cpu) + { + return cpu == 0; + } + +-static inline bool cpu_active(unsigned int cpu) ++static __always_inline bool cpu_active(unsigned int cpu) + { + return cpu == 0; + } + +-static inline bool cpu_dying(unsigned int cpu) ++static __always_inline bool cpu_dying(unsigned int cpu) + { + return false; + } +@@ -1111,7 +1111,7 @@ static inline bool cpu_dying(unsigned int cpu) + * Returns the length of the (null-terminated) @buf string, zero if + * nothing is copied. + */ +-static inline ssize_t ++static __always_inline ssize_t + cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) + { + return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask), +@@ -1134,7 +1134,7 @@ cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) + * Returns the length of how many bytes have been copied, excluding + * terminating '\0'. + */ +-static inline ssize_t ++static __always_inline ssize_t + cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, + loff_t off, size_t count) + { +@@ -1149,7 +1149,7 @@ cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, + * Everything is same with the above cpumap_print_bitmask_to_buf() + * except the print format. + */ +-static inline ssize_t ++static __always_inline ssize_t + cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, + loff_t off, size_t count) + { +diff --git a/include/linux/find.h b/include/linux/find.h +index ccaf61a0f5fd..db2f2851601d 100644 +--- a/include/linux/find.h ++++ b/include/linux/find.h +@@ -45,7 +45,7 @@ unsigned long _find_next_bit_le(const unsigned long *addr, unsigned + * Returns the bit number for the next set bit + * If no bits are set, returns @size. 
+ */ +-static inline ++static __always_inline + unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) + { +@@ -74,7 +74,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_next_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size, + unsigned long offset) +@@ -105,7 +105,7 @@ unsigned long find_next_and_bit(const unsigned long *addr1, + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_next_andnot_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size, + unsigned long offset) +@@ -134,7 +134,7 @@ unsigned long find_next_andnot_bit(const unsigned long *addr1, + * Returns the bit number of the next zero bit + * If no bits are zero, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) + { +@@ -161,7 +161,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, + * Returns the bit number of the first set bit. + * If no bits are set, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_first_bit(const unsigned long *addr, unsigned long size) + { + if (small_const_nbits(size)) { +@@ -187,7 +187,7 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) + * Returns the bit number of the N'th set bit. + * If no such, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) + { + if (n >= size) +@@ -212,7 +212,7 @@ unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsign + * Returns the bit number of the N'th set bit. + * If no such, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long size, unsigned long n) + { +@@ -239,7 +239,7 @@ unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long * + * Returns the bit number of the N'th set bit. + * If no such, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long size, unsigned long n) + { +@@ -265,7 +265,7 @@ unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned lon + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size) +@@ -289,7 +289,7 @@ unsigned long find_first_and_bit(const unsigned long *addr1, + * Returns the bit number of the first cleared bit. + * If no bits are zero, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) + { + if (small_const_nbits(size)) { +@@ -310,7 +310,7 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) + * + * Returns the bit number of the last set bit, or size. 
+ */ +-static inline ++static __always_inline + unsigned long find_last_bit(const unsigned long *addr, unsigned long size) + { + if (small_const_nbits(size)) { +@@ -333,7 +333,7 @@ unsigned long find_last_bit(const unsigned long *addr, unsigned long size) + * Returns the bit number for the next set bit, or first set bit up to @offset + * If no bits are set, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_next_and_bit_wrap(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size, unsigned long offset) +@@ -356,7 +356,7 @@ unsigned long find_next_and_bit_wrap(const unsigned long *addr1, + * Returns the bit number for the next set bit, or first set bit up to @offset + * If no bits are set, returns @size. + */ +-static inline ++static __always_inline + unsigned long find_next_bit_wrap(const unsigned long *addr, + unsigned long size, unsigned long offset) + { +@@ -373,7 +373,7 @@ unsigned long find_next_bit_wrap(const unsigned long *addr, + * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing + * before using it alone. + */ +-static inline ++static __always_inline + unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, + unsigned long start, unsigned long n) + { +@@ -414,19 +414,19 @@ extern unsigned long find_next_clump8(unsigned long *clump, + + #if defined(__LITTLE_ENDIAN) + +-static inline unsigned long find_next_zero_bit_le(const void *addr, ++static __always_inline unsigned long find_next_zero_bit_le(const void *addr, + unsigned long size, unsigned long offset) + { + return find_next_zero_bit(addr, size, offset); + } + +-static inline unsigned long find_next_bit_le(const void *addr, ++static __always_inline unsigned long find_next_bit_le(const void *addr, + unsigned long size, unsigned long offset) + { + return find_next_bit(addr, size, offset); + } + +-static inline unsigned long find_first_zero_bit_le(const void *addr, ++static __always_inline unsigned long find_first_zero_bit_le(const void *addr, + unsigned long size) + { + return find_first_zero_bit(addr, size); +@@ -435,7 +435,7 @@ static inline unsigned long find_first_zero_bit_le(const void *addr, + #elif defined(__BIG_ENDIAN) + + #ifndef find_next_zero_bit_le +-static inline ++static __always_inline + unsigned long find_next_zero_bit_le(const void *addr, unsigned + long size, unsigned long offset) + { +@@ -454,7 +454,7 @@ unsigned long find_next_zero_bit_le(const void *addr, unsigned + #endif + + #ifndef find_first_zero_bit_le +-static inline ++static __always_inline + unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) + { + if (small_const_nbits(size)) { +@@ -468,7 +468,7 @@ unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) + #endif + + #ifndef find_next_bit_le +-static inline ++static __always_inline + unsigned long find_next_bit_le(const void *addr, unsigned + long size, unsigned long offset) + { +diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h +index bb0ee80526b2..8c04254c5284 100644 +--- a/include/linux/nodemask.h ++++ b/include/linux/nodemask.h +@@ -107,11 +107,11 @@ extern nodemask_t _unused_nodemask_arg_; + */ + #define nodemask_pr_args(maskp) __nodemask_pr_numnodes(maskp), \ + __nodemask_pr_bits(maskp) +-static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) ++static __always_inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) + { + return m ? 
MAX_NUMNODES : 0; + } +-static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) ++static __always_inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) + { + return m ? m->bits : NULL; + } +@@ -132,19 +132,19 @@ static __always_inline void __node_set(int node, volatile nodemask_t *dstp) + } + + #define node_clear(node, dst) __node_clear((node), &(dst)) +-static inline void __node_clear(int node, volatile nodemask_t *dstp) ++static __always_inline void __node_clear(int node, volatile nodemask_t *dstp) + { + clear_bit(node, dstp->bits); + } + + #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) +-static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) ++static __always_inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) + { + bitmap_fill(dstp->bits, nbits); + } + + #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) +-static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) ++static __always_inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) + { + bitmap_zero(dstp->bits, nbits); + } +@@ -154,14 +154,14 @@ static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) + + #define node_test_and_set(node, nodemask) \ + __node_test_and_set((node), &(nodemask)) +-static inline bool __node_test_and_set(int node, nodemask_t *addr) ++static __always_inline bool __node_test_and_set(int node, nodemask_t *addr) + { + return test_and_set_bit(node, addr->bits); + } + + #define nodes_and(dst, src1, src2) \ + __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) +-static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, ++static __always_inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, + const nodemask_t *src2p, unsigned int nbits) + { + bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); +@@ -169,7 +169,7 @@ static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, + + #define nodes_or(dst, src1, src2) \ + __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) +-static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, ++static __always_inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, + const nodemask_t *src2p, unsigned int nbits) + { + bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); +@@ -177,7 +177,7 @@ static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, + + #define nodes_xor(dst, src1, src2) \ + __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) +-static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, ++static __always_inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, + const nodemask_t *src2p, unsigned int nbits) + { + bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); +@@ -185,7 +185,7 @@ static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, + + #define nodes_andnot(dst, src1, src2) \ + __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) +-static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, ++static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, + const nodemask_t *src2p, unsigned int nbits) + { + bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); +@@ -193,7 +193,7 @@ static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, + + #define nodes_complement(dst, src) \ + __nodes_complement(&(dst), &(src), MAX_NUMNODES) +-static inline void __nodes_complement(nodemask_t *dstp, ++static __always_inline void 
__nodes_complement(nodemask_t *dstp, + const nodemask_t *srcp, unsigned int nbits) + { + bitmap_complement(dstp->bits, srcp->bits, nbits); +@@ -201,7 +201,7 @@ static inline void __nodes_complement(nodemask_t *dstp, + + #define nodes_equal(src1, src2) \ + __nodes_equal(&(src1), &(src2), MAX_NUMNODES) +-static inline bool __nodes_equal(const nodemask_t *src1p, ++static __always_inline bool __nodes_equal(const nodemask_t *src1p, + const nodemask_t *src2p, unsigned int nbits) + { + return bitmap_equal(src1p->bits, src2p->bits, nbits); +@@ -209,7 +209,7 @@ static inline bool __nodes_equal(const nodemask_t *src1p, + + #define nodes_intersects(src1, src2) \ + __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) +-static inline bool __nodes_intersects(const nodemask_t *src1p, ++static __always_inline bool __nodes_intersects(const nodemask_t *src1p, + const nodemask_t *src2p, unsigned int nbits) + { + return bitmap_intersects(src1p->bits, src2p->bits, nbits); +@@ -217,33 +217,33 @@ static inline bool __nodes_intersects(const nodemask_t *src1p, + + #define nodes_subset(src1, src2) \ + __nodes_subset(&(src1), &(src2), MAX_NUMNODES) +-static inline bool __nodes_subset(const nodemask_t *src1p, ++static __always_inline bool __nodes_subset(const nodemask_t *src1p, + const nodemask_t *src2p, unsigned int nbits) + { + return bitmap_subset(src1p->bits, src2p->bits, nbits); + } + + #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) +-static inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) ++static __always_inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) + { + return bitmap_empty(srcp->bits, nbits); + } + + #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) +-static inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) ++static __always_inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) + { + return bitmap_full(srcp->bits, nbits); + } + + #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) +-static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) ++static __always_inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) + { + return bitmap_weight(srcp->bits, nbits); + } + + #define nodes_shift_right(dst, src, n) \ + __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES) +-static inline void __nodes_shift_right(nodemask_t *dstp, ++static __always_inline void __nodes_shift_right(nodemask_t *dstp, + const nodemask_t *srcp, int n, int nbits) + { + bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); +@@ -251,7 +251,7 @@ static inline void __nodes_shift_right(nodemask_t *dstp, + + #define nodes_shift_left(dst, src, n) \ + __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES) +-static inline void __nodes_shift_left(nodemask_t *dstp, ++static __always_inline void __nodes_shift_left(nodemask_t *dstp, + const nodemask_t *srcp, int n, int nbits) + { + bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); +@@ -261,13 +261,13 @@ static inline void __nodes_shift_left(nodemask_t *dstp, + > MAX_NUMNODES, then the silly min_ts could be dropped. 
*/ + + #define first_node(src) __first_node(&(src)) +-static inline unsigned int __first_node(const nodemask_t *srcp) ++static __always_inline unsigned int __first_node(const nodemask_t *srcp) + { + return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); + } + + #define next_node(n, src) __next_node((n), &(src)) +-static inline unsigned int __next_node(int n, const nodemask_t *srcp) ++static __always_inline unsigned int __next_node(int n, const nodemask_t *srcp) + { + return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); + } +@@ -277,7 +277,7 @@ static inline unsigned int __next_node(int n, const nodemask_t *srcp) + * the first node in src if needed. Returns MAX_NUMNODES if src is empty. + */ + #define next_node_in(n, src) __next_node_in((n), &(src)) +-static inline unsigned int __next_node_in(int node, const nodemask_t *srcp) ++static __always_inline unsigned int __next_node_in(int node, const nodemask_t *srcp) + { + unsigned int ret = __next_node(node, srcp); + +@@ -286,7 +286,7 @@ static inline unsigned int __next_node_in(int node, const nodemask_t *srcp) + return ret; + } + +-static inline void init_nodemask_of_node(nodemask_t *mask, int node) ++static __always_inline void init_nodemask_of_node(nodemask_t *mask, int node) + { + nodes_clear(*mask); + node_set(node, *mask); +@@ -304,7 +304,7 @@ static inline void init_nodemask_of_node(nodemask_t *mask, int node) + }) + + #define first_unset_node(mask) __first_unset_node(&(mask)) +-static inline unsigned int __first_unset_node(const nodemask_t *maskp) ++static __always_inline unsigned int __first_unset_node(const nodemask_t *maskp) + { + return min_t(unsigned int, MAX_NUMNODES, + find_first_zero_bit(maskp->bits, MAX_NUMNODES)); +@@ -338,21 +338,21 @@ static inline unsigned int __first_unset_node(const nodemask_t *maskp) + + #define nodemask_parse_user(ubuf, ulen, dst) \ + __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES) +-static inline int __nodemask_parse_user(const char __user *buf, int len, ++static __always_inline int __nodemask_parse_user(const char __user *buf, int len, + nodemask_t *dstp, int nbits) + { + return bitmap_parse_user(buf, len, dstp->bits, nbits); + } + + #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES) +-static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) ++static __always_inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) + { + return bitmap_parselist(buf, dstp->bits, nbits); + } + + #define node_remap(oldbit, old, new) \ + __node_remap((oldbit), &(old), &(new), MAX_NUMNODES) +-static inline int __node_remap(int oldbit, ++static __always_inline int __node_remap(int oldbit, + const nodemask_t *oldp, const nodemask_t *newp, int nbits) + { + return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits); +@@ -360,7 +360,7 @@ static inline int __node_remap(int oldbit, + + #define nodes_remap(dst, src, old, new) \ + __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES) +-static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, ++static __always_inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, + const nodemask_t *oldp, const nodemask_t *newp, int nbits) + { + bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits); +@@ -368,7 +368,7 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, + + #define nodes_onto(dst, orig, relmap) \ + __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES) +-static 
inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, ++static __always_inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, + const nodemask_t *relmapp, int nbits) + { + bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits); +@@ -376,7 +376,7 @@ static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, + + #define nodes_fold(dst, orig, sz) \ + __nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES) +-static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, ++static __always_inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, + int sz, int nbits) + { + bitmap_fold(dstp->bits, origp->bits, sz, nbits); +@@ -418,22 +418,22 @@ enum node_states { + extern nodemask_t node_states[NR_NODE_STATES]; + + #if MAX_NUMNODES > 1 +-static inline int node_state(int node, enum node_states state) ++static __always_inline int node_state(int node, enum node_states state) + { + return node_isset(node, node_states[state]); + } + +-static inline void node_set_state(int node, enum node_states state) ++static __always_inline void node_set_state(int node, enum node_states state) + { + __node_set(node, &node_states[state]); + } + +-static inline void node_clear_state(int node, enum node_states state) ++static __always_inline void node_clear_state(int node, enum node_states state) + { + __node_clear(node, &node_states[state]); + } + +-static inline int num_node_state(enum node_states state) ++static __always_inline int num_node_state(enum node_states state) + { + return nodes_weight(node_states[state]); + } +@@ -443,11 +443,11 @@ static inline int num_node_state(enum node_states state) + + #define first_online_node first_node(node_states[N_ONLINE]) + #define first_memory_node first_node(node_states[N_MEMORY]) +-static inline unsigned int next_online_node(int nid) ++static __always_inline unsigned int next_online_node(int nid) + { + return next_node(nid, node_states[N_ONLINE]); + } +-static inline unsigned int next_memory_node(int nid) ++static __always_inline unsigned int next_memory_node(int nid) + { + return next_node(nid, node_states[N_MEMORY]); + } +@@ -455,13 +455,13 @@ static inline unsigned int next_memory_node(int nid) + extern unsigned int nr_node_ids; + extern unsigned int nr_online_nodes; + +-static inline void node_set_online(int nid) ++static __always_inline void node_set_online(int nid) + { + node_set_state(nid, N_ONLINE); + nr_online_nodes = num_node_state(N_ONLINE); + } + +-static inline void node_set_offline(int nid) ++static __always_inline void node_set_offline(int nid) + { + node_clear_state(nid, N_ONLINE); + nr_online_nodes = num_node_state(N_ONLINE); +@@ -469,20 +469,20 @@ static inline void node_set_offline(int nid) + + #else + +-static inline int node_state(int node, enum node_states state) ++static __always_inline int node_state(int node, enum node_states state) + { + return node == 0; + } + +-static inline void node_set_state(int node, enum node_states state) ++static __always_inline void node_set_state(int node, enum node_states state) + { + } + +-static inline void node_clear_state(int node, enum node_states state) ++static __always_inline void node_clear_state(int node, enum node_states state) + { + } + +-static inline int num_node_state(enum node_states state) ++static __always_inline int num_node_state(enum node_states state) + { + return 1; + } +@@ -502,7 +502,7 @@ static inline int num_node_state(enum node_states state) + + #endif + +-static inline int node_random(const nodemask_t *maskp) ++static 
__always_inline int node_random(const nodemask_t *maskp) + { + #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) + int w, bit; +-- +2.39.2 + +From 0e3205aac37cde833a7cc71dd35595de9f88a5b8 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Tue, 21 Feb 2023 10:26:39 +0100 +Subject: [PATCH 04/15] cachy + +Signed-off-by: Peter Jung +--- + .gitignore | 1 + + .../admin-guide/kernel-parameters.txt | 11 +- + Documentation/dontdiff | 1 + + Makefile | 8 +- + arch/arc/configs/axs101_defconfig | 1 + + arch/arc/configs/axs103_defconfig | 1 + + arch/arc/configs/axs103_smp_defconfig | 1 + + arch/arc/configs/haps_hs_defconfig | 1 + + arch/arc/configs/haps_hs_smp_defconfig | 1 + + arch/arc/configs/hsdk_defconfig | 1 + + arch/arc/configs/nsim_700_defconfig | 1 + + arch/arc/configs/nsimosci_defconfig | 1 + + arch/arc/configs/nsimosci_hs_defconfig | 1 + + arch/arc/configs/nsimosci_hs_smp_defconfig | 1 + + arch/arc/configs/tb10x_defconfig | 1 + + arch/arc/configs/vdk_hs38_defconfig | 1 + + arch/arc/configs/vdk_hs38_smp_defconfig | 1 + + arch/x86/Kconfig.cpu | 416 ++++++++++- + arch/x86/Makefile | 45 +- + arch/x86/Makefile.postlink | 41 ++ + arch/x86/boot/compressed/.gitignore | 1 - + arch/x86/boot/compressed/Makefile | 10 +- + arch/x86/include/asm/vermagic.h | 72 ++ + drivers/Makefile | 15 +- + drivers/i2c/busses/Kconfig | 9 + + drivers/i2c/busses/Makefile | 1 + + drivers/i2c/busses/i2c-nct6775.c | 647 ++++++++++++++++++ + drivers/i2c/busses/i2c-piix4.c | 4 +- + drivers/md/dm-crypt.c | 5 + + drivers/pci/quirks.c | 101 +++ + include/linux/pagemap.h | 2 +- + include/linux/user_namespace.h | 4 + + include/net/netns/ipv4.h | 1 + + include/trace/events/tcp.h | 7 + + init/Kconfig | 39 ++ + kernel/Kconfig.hz | 24 + + kernel/fork.c | 14 + + kernel/module/Kconfig | 25 + + kernel/rcu/Kconfig | 4 +- + kernel/rcu/rcutorture.c | 2 +- + kernel/rcu/tree.c | 6 +- + kernel/rcu/tree_nocb.h | 4 +- + kernel/rcu/tree_plugin.h | 4 +- + kernel/sched/fair.c | 20 +- + kernel/sysctl.c | 12 + + kernel/user_namespace.c | 7 + + lib/string.c | 62 +- + mm/Kconfig | 2 +- + mm/compaction.c | 4 + + mm/page-writeback.c | 8 + + mm/swap.c | 5 + + mm/vmpressure.c | 4 + + mm/vmscan.c | 8 + + net/ipv4/sysctl_net_ipv4.c | 7 + + net/ipv4/tcp_input.c | 36 + + net/ipv4/tcp_ipv4.c | 2 + + scripts/Makefile.lib | 13 +- + scripts/Makefile.modinst | 7 +- + 58 files changed, 1660 insertions(+), 74 deletions(-) + create mode 100644 arch/x86/Makefile.postlink + create mode 100644 drivers/i2c/busses/i2c-nct6775.c + +diff --git a/.gitignore b/.gitignore +index 20dce5c3b9e0..466c23de56ce 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -63,6 +63,7 @@ modules.order + /vmlinux + /vmlinux.32 + /vmlinux.map ++/vmlinux.relocs + /vmlinux.symvers + /vmlinux-gdb.py + /vmlinuz +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 6cfa6e3996cf..9595abf34974 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4178,6 +4178,15 @@ + nomsi [MSI] If the PCI_MSI kernel config parameter is + enabled, this kernel boot option can be used to + disable the use of MSI interrupts system-wide. ++ pcie_acs_override = ++ [PCIE] Override missing PCIe ACS support for: ++ downstream ++ All downstream ports - full ACS capabilities ++ multfunction ++ All multifunction devices - multifunction ACS subset ++ id:nnnn:nnnn ++ Specfic device - full ACS capabilities ++ Specified as vid:did (vendor/device ID) in hex + noioapicquirk [APIC] Disable all boot interrupt quirks. 
+ Safety option to keep boot IRQs enabled. This + should never be necessary. +@@ -4751,7 +4760,7 @@ + overwritten. + + rcutree.kthread_prio= [KNL,BOOT] +- Set the SCHED_FIFO priority of the RCU per-CPU ++ Set the SCHED_RR priority of the RCU per-CPU + kthreads (rcuc/N). This value is also used for + the priority of the RCU boost threads (rcub/N) + and for the RCU grace-period kthreads (rcu_bh, +diff --git a/Documentation/dontdiff b/Documentation/dontdiff +index 352ff53a2306..7c210744d84c 100644 +--- a/Documentation/dontdiff ++++ b/Documentation/dontdiff +@@ -255,6 +255,7 @@ vmlinux.aout + vmlinux.bin.all + vmlinux.lds + vmlinux.map ++vmlinux.relocs + vmlinux.symvers + vmlinuz + voffset.h +diff --git a/Makefile b/Makefile +index 3f6628780eb2..335e93ed017f 100644 +--- a/Makefile ++++ b/Makefile +@@ -834,6 +834,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) + ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE + KBUILD_CFLAGS += -O2 + KBUILD_RUSTFLAGS += -Copt-level=2 ++else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 ++KBUILD_CFLAGS += -O3 ++KBUILD_RUSTFLAGS += -Copt-level=3 + else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE + KBUILD_CFLAGS += -Os + KBUILD_RUSTFLAGS += -Copt-level=s +@@ -1075,11 +1078,6 @@ KBUILD_CFLAGS += -fno-strict-overflow + # Make sure -fstack-check isn't enabled (like gentoo apparently did) + KBUILD_CFLAGS += -fno-stack-check + +-# conserve stack if available +-ifdef CONFIG_CC_IS_GCC +-KBUILD_CFLAGS += -fconserve-stack +-endif +- + # Prohibit date/time macros, which would make the build non-deterministic + KBUILD_CFLAGS += -Werror=date-time + +diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig +index 81764160451f..2c15d3bf747a 100644 +--- a/arch/arc/configs/axs101_defconfig ++++ b/arch/arc/configs/axs101_defconfig +@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y + # CONFIG_VM_EVENT_COUNTERS is not set +diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig +index d5181275490e..7d868e148d9a 100644 +--- a/arch/arc/configs/axs103_defconfig ++++ b/arch/arc/configs/axs103_defconfig +@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y + # CONFIG_VM_EVENT_COUNTERS is not set +diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig +index 2f336d99a8cf..777a9f21eb6b 100644 +--- a/arch/arc/configs/axs103_smp_defconfig ++++ b/arch/arc/configs/axs103_smp_defconfig +@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y + # CONFIG_VM_EVENT_COUNTERS is not set +diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig +index 899b2fd5c71d..bda15a876849 100644 +--- a/arch/arc/configs/haps_hs_defconfig ++++ b/arch/arc/configs/haps_hs_defconfig +@@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_EXPERT=y + CONFIG_PERF_EVENTS=y + # CONFIG_COMPAT_BRK is not set +diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig +index 0d32aac8069f..dbd74fea69aa 
100644 +--- a/arch/arc/configs/haps_hs_smp_defconfig ++++ b/arch/arc/configs/haps_hs_smp_defconfig +@@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y + # CONFIG_VM_EVENT_COUNTERS is not set +diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig +index d18378d2c2a6..2396ca417182 100644 +--- a/arch/arc/configs/hsdk_defconfig ++++ b/arch/arc/configs/hsdk_defconfig +@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y + CONFIG_BLK_DEV_RAM=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y + # CONFIG_VM_EVENT_COUNTERS is not set +diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig +index 3e9829775992..5044609540cc 100644 +--- a/arch/arc/configs/nsim_700_defconfig ++++ b/arch/arc/configs/nsim_700_defconfig +@@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_KALLSYMS_ALL=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y +diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig +index 502c87f351c8..748c809d1c4c 100644 +--- a/arch/arc/configs/nsimosci_defconfig ++++ b/arch/arc/configs/nsimosci_defconfig +@@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_KALLSYMS_ALL=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y +diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig +index f721cc3997d0..205c32b0074c 100644 +--- a/arch/arc/configs/nsimosci_hs_defconfig ++++ b/arch/arc/configs/nsimosci_hs_defconfig +@@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_KALLSYMS_ALL=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y +diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig +index 1419fc946a08..2477b7c80977 100644 +--- a/arch/arc/configs/nsimosci_hs_smp_defconfig ++++ b/arch/arc/configs/nsimosci_hs_smp_defconfig +@@ -8,6 +8,7 @@ CONFIG_IKCONFIG_PROC=y + # CONFIG_UTS_NS is not set + # CONFIG_PID_NS is not set + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_PERF_EVENTS=y + # CONFIG_COMPAT_BRK is not set + CONFIG_KPROBES=y +diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig +index 6f0d2be9d926..cf02ad0fc210 100644 +--- a/arch/arc/configs/tb10x_defconfig ++++ b/arch/arc/configs/tb10x_defconfig +@@ -14,6 +14,7 @@ CONFIG_INITRAMFS_SOURCE="../tb10x-rootfs.cpio" + CONFIG_INITRAMFS_ROOT_UID=2100 + CONFIG_INITRAMFS_ROOT_GID=501 + # CONFIG_RD_GZIP is not set ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_KALLSYMS_ALL=y + # CONFIG_AIO is not set + CONFIG_EMBEDDED=y +diff --git a/arch/arc/configs/vdk_hs38_defconfig b/arch/arc/configs/vdk_hs38_defconfig +index d3ef189c75f8..922b1b24f518 100644 +--- a/arch/arc/configs/vdk_hs38_defconfig ++++ b/arch/arc/configs/vdk_hs38_defconfig +@@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y + # CONFIG_VM_EVENT_COUNTERS is not set 
+diff --git a/arch/arc/configs/vdk_hs38_smp_defconfig b/arch/arc/configs/vdk_hs38_smp_defconfig +index 944b347025fd..ed64319f7eb2 100644 +--- a/arch/arc/configs/vdk_hs38_smp_defconfig ++++ b/arch/arc/configs/vdk_hs38_smp_defconfig +@@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_BLK_DEV_INITRD=y ++CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y + CONFIG_EMBEDDED=y + CONFIG_PERF_EVENTS=y + # CONFIG_VM_EVENT_COUNTERS is not set +diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu +index 542377cd419d..08d887d1220d 100644 +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -157,7 +157,7 @@ config MPENTIUM4 + + + config MK6 +- bool "K6/K6-II/K6-III" ++ bool "AMD K6/K6-II/K6-III" + depends on X86_32 + help + Select this for an AMD K6-family processor. Enables use of +@@ -165,7 +165,7 @@ config MK6 + flags to GCC. + + config MK7 +- bool "Athlon/Duron/K7" ++ bool "AMD Athlon/Duron/K7" + depends on X86_32 + help + Select this for an AMD Athlon K7-family processor. Enables use of +@@ -173,12 +173,106 @@ config MK7 + flags to GCC. + + config MK8 +- bool "Opteron/Athlon64/Hammer/K8" ++ bool "AMD Opteron/Athlon64/Hammer/K8" + help + Select this for an AMD Opteron or Athlon64 Hammer-family processor. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + ++config MK8SSE3 ++ bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" ++ help ++ Select this for improved AMD Opteron or Athlon64 Hammer-family processors. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MK10 ++ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" ++ help ++ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, ++ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MBARCELONA ++ bool "AMD Barcelona" ++ help ++ Select this for AMD Family 10h Barcelona processors. ++ ++ Enables -march=barcelona ++ ++config MBOBCAT ++ bool "AMD Bobcat" ++ help ++ Select this for AMD Family 14h Bobcat processors. ++ ++ Enables -march=btver1 ++ ++config MJAGUAR ++ bool "AMD Jaguar" ++ help ++ Select this for AMD Family 16h Jaguar processors. ++ ++ Enables -march=btver2 ++ ++config MBULLDOZER ++ bool "AMD Bulldozer" ++ help ++ Select this for AMD Family 15h Bulldozer processors. ++ ++ Enables -march=bdver1 ++ ++config MPILEDRIVER ++ bool "AMD Piledriver" ++ help ++ Select this for AMD Family 15h Piledriver processors. ++ ++ Enables -march=bdver2 ++ ++config MSTEAMROLLER ++ bool "AMD Steamroller" ++ help ++ Select this for AMD Family 15h Steamroller processors. ++ ++ Enables -march=bdver3 ++ ++config MEXCAVATOR ++ bool "AMD Excavator" ++ help ++ Select this for AMD Family 15h Excavator processors. ++ ++ Enables -march=bdver4 ++ ++config MZEN ++ bool "AMD Zen" ++ help ++ Select this for AMD Family 17h Zen processors. ++ ++ Enables -march=znver1 ++ ++config MZEN2 ++ bool "AMD Zen 2" ++ help ++ Select this for AMD Family 17h Zen 2 processors. ++ ++ Enables -march=znver2 ++ ++config MZEN3 ++ bool "AMD Zen 3" ++ depends on (CC_IS_GCC && GCC_VERSION >= 100300) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ help ++ Select this for AMD Family 19h Zen 3 processors. ++ ++ Enables -march=znver3 ++ ++config MZEN4 ++ bool "AMD Zen 4" ++ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) ++ help ++ Select this for AMD Family 19h Zen 4 processors. 
++ ++ Enables -march=znver4 ++ + config MCRUSOE + bool "Crusoe" + depends on X86_32 +@@ -270,7 +364,7 @@ config MPSC + in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. + + config MCORE2 +- bool "Core 2/newer Xeon" ++ bool "Intel Core 2" + help + + Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and +@@ -278,6 +372,8 @@ config MCORE2 + family in /proc/cpuinfo. Newer ones have 6 and older ones 15 + (not a typo) + ++ Enables -march=core2 ++ + config MATOM + bool "Intel Atom" + help +@@ -287,6 +383,202 @@ config MATOM + accordingly optimized code. Use a recent GCC with specific Atom + support in order to fully benefit from selecting this option. + ++config MNEHALEM ++ bool "Intel Nehalem" ++ select X86_P6_NOP ++ help ++ ++ Select this for 1st Gen Core processors in the Nehalem family. ++ ++ Enables -march=nehalem ++ ++config MWESTMERE ++ bool "Intel Westmere" ++ select X86_P6_NOP ++ help ++ ++ Select this for the Intel Westmere formerly Nehalem-C family. ++ ++ Enables -march=westmere ++ ++config MSILVERMONT ++ bool "Intel Silvermont" ++ select X86_P6_NOP ++ help ++ ++ Select this for the Intel Silvermont platform. ++ ++ Enables -march=silvermont ++ ++config MGOLDMONT ++ bool "Intel Goldmont" ++ select X86_P6_NOP ++ help ++ ++ Select this for the Intel Goldmont platform including Apollo Lake and Denverton. ++ ++ Enables -march=goldmont ++ ++config MGOLDMONTPLUS ++ bool "Intel Goldmont Plus" ++ select X86_P6_NOP ++ help ++ ++ Select this for the Intel Goldmont Plus platform including Gemini Lake. ++ ++ Enables -march=goldmont-plus ++ ++config MSANDYBRIDGE ++ bool "Intel Sandy Bridge" ++ select X86_P6_NOP ++ help ++ ++ Select this for 2nd Gen Core processors in the Sandy Bridge family. ++ ++ Enables -march=sandybridge ++ ++config MIVYBRIDGE ++ bool "Intel Ivy Bridge" ++ select X86_P6_NOP ++ help ++ ++ Select this for 3rd Gen Core processors in the Ivy Bridge family. ++ ++ Enables -march=ivybridge ++ ++config MHASWELL ++ bool "Intel Haswell" ++ select X86_P6_NOP ++ help ++ ++ Select this for 4th Gen Core processors in the Haswell family. ++ ++ Enables -march=haswell ++ ++config MBROADWELL ++ bool "Intel Broadwell" ++ select X86_P6_NOP ++ help ++ ++ Select this for 5th Gen Core processors in the Broadwell family. ++ ++ Enables -march=broadwell ++ ++config MSKYLAKE ++ bool "Intel Skylake" ++ select X86_P6_NOP ++ help ++ ++ Select this for 6th Gen Core processors in the Skylake family. ++ ++ Enables -march=skylake ++ ++config MSKYLAKEX ++ bool "Intel Skylake X" ++ select X86_P6_NOP ++ help ++ ++ Select this for 6th Gen Core processors in the Skylake X family. ++ ++ Enables -march=skylake-avx512 ++ ++config MCANNONLAKE ++ bool "Intel Cannon Lake" ++ select X86_P6_NOP ++ help ++ ++ Select this for 8th Gen Core processors ++ ++ Enables -march=cannonlake ++ ++config MICELAKE ++ bool "Intel Ice Lake" ++ select X86_P6_NOP ++ help ++ ++ Select this for 10th Gen Core processors in the Ice Lake family. ++ ++ Enables -march=icelake-client ++ ++config MCASCADELAKE ++ bool "Intel Cascade Lake" ++ select X86_P6_NOP ++ help ++ ++ Select this for Xeon processors in the Cascade Lake family. ++ ++ Enables -march=cascadelake ++ ++config MCOOPERLAKE ++ bool "Intel Cooper Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) ++ select X86_P6_NOP ++ help ++ ++ Select this for Xeon processors in the Cooper Lake family. 
++ ++ Enables -march=cooperlake ++ ++config MTIGERLAKE ++ bool "Intel Tiger Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) ++ select X86_P6_NOP ++ help ++ ++ Select this for third-generation 10 nm process processors in the Tiger Lake family. ++ ++ Enables -march=tigerlake ++ ++config MSAPPHIRERAPIDS ++ bool "Intel Sapphire Rapids" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ select X86_P6_NOP ++ help ++ ++ Select this for third-generation 10 nm process processors in the Sapphire Rapids family. ++ ++ Enables -march=sapphirerapids ++ ++config MROCKETLAKE ++ bool "Intel Rocket Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ select X86_P6_NOP ++ help ++ ++ Select this for eleventh-generation processors in the Rocket Lake family. ++ ++ Enables -march=rocketlake ++ ++config MALDERLAKE ++ bool "Intel Alder Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ select X86_P6_NOP ++ help ++ ++ Select this for twelfth-generation processors in the Alder Lake family. ++ ++ Enables -march=alderlake ++ ++config MRAPTORLAKE ++ bool "Intel Raptor Lake" ++ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) ++ select X86_P6_NOP ++ help ++ ++ Select this for thirteenth-generation processors in the Raptor Lake family. ++ ++ Enables -march=raptorlake ++ ++config MMETEORLAKE ++ bool "Intel Meteor Lake" ++ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) ++ select X86_P6_NOP ++ help ++ ++ Select this for fourteenth-generation processors in the Meteor Lake family. ++ ++ Enables -march=meteorlake ++ + config GENERIC_CPU + bool "Generic-x86-64" + depends on X86_64 +@@ -294,6 +586,50 @@ config GENERIC_CPU + Generic x86-64 CPU. + Run equally well on all x86-64 CPUs. + ++config GENERIC_CPU2 ++ bool "Generic-x86-64-v2" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ depends on X86_64 ++ help ++ Generic x86-64 CPU. ++ Run equally well on all x86-64 CPUs with min support of x86-64-v2. ++ ++config GENERIC_CPU3 ++ bool "Generic-x86-64-v3" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ depends on X86_64 ++ help ++ Generic x86-64-v3 CPU with v3 instructions. ++ Run equally well on all x86-64 CPUs with min support of x86-64-v3. ++ ++config GENERIC_CPU4 ++ bool "Generic-x86-64-v4" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ depends on X86_64 ++ help ++ Generic x86-64 CPU with v4 instructions. ++ Run equally well on all x86-64 CPUs with min support of x86-64-v4. ++ ++config MNATIVE_INTEL ++ bool "Intel-Native optimizations autodetected by the compiler" ++ help ++ ++ Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects ++ the optimum settings to use based on your processor. Do NOT use this ++ for AMD CPUs. Intel Only! ++ ++ Enables -march=native ++ ++config MNATIVE_AMD ++ bool "AMD-Native optimizations autodetected by the compiler" ++ help ++ ++ Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects ++ the optimum settings to use based on your processor. Do NOT use this ++ for Intel CPUs. AMD Only! 
++ ++ Enables -march=native ++ + endchoice + + config X86_GENERIC +@@ -318,9 +654,17 @@ config X86_INTERNODE_CACHE_SHIFT + config X86_L1_CACHE_SHIFT + int + default "7" if MPENTIUM4 || MPSC +- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU ++ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || MK8SSE3 || MK10 \ ++ || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \ ++ || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \ ++ || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \ ++ || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ ++ || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE \ ++ || MNATIVE_INTEL || MNATIVE_AMD || X86_GENERIC || GENERIC_CPU || GENERIC_CPU2 || GENERIC_CPU3 \ ++ || GENERIC_CPU4 + default "4" if MELAN || M486SX || M486 || MGEODEGX1 +- default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX ++ default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII \ ++ || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX + + config X86_F00F_BUG + def_bool y +@@ -332,15 +676,27 @@ config X86_INVD_BUG + + config X86_ALIGNMENT_16 + def_bool y +- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1 ++ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC \ ++ || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1 + + config X86_INTEL_USERCOPY + def_bool y +- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 ++ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC \ ++ || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ ++ || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \ ++ || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \ ++ || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL + + config X86_USE_PPRO_CHECKSUM + def_bool y +- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM ++ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \ ++ || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX \ ++ || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \ ++ || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \ ++ || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE \ ++ || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE \ ++ || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \ ++ || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD + + # + # P6_NOPs are a relatively minor optimization that require a family >= +@@ -356,32 +712,62 @@ config X86_USE_PPRO_CHECKSUM + config X86_P6_NOP + def_bool y 
+ depends on X86_64 +- depends on (MCORE2 || MPENTIUM4 || MPSC) ++ depends on (MCORE2 || MPENTIUM4 || MPSC || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ ++ || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE \ ++ || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE \ ++ || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL) + + config X86_TSC + def_bool y +- depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 ++ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \ ++ || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 \ ++ || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \ ++ || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \ ++ || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL \ ++ || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ ++ || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL \ ++ || MNATIVE_AMD) || X86_64 + + config X86_CMPXCHG64 + def_bool y +- depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 ++ depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ ++ || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 || MK8SSE3 || MK10 \ ++ || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN \ ++ || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS \ ++ || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE \ ++ || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \ ++ || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD + + # this should be set for all -march=.. options where the compiler + # generates cmov. 
+ config X86_CMOV + def_bool y +- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) ++ depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ ++ || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX || MK8SSE3 || MK10 \ ++ || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR \ ++ || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ ++ || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \ ++ || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \ ++ || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD) + + config X86_MINIMUM_CPU_FAMILY + int + default "64" if X86_64 +- default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8) ++ default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ ++ || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8 || MK8SSE3 \ ++ || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \ ++ || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \ ++ || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \ ++ || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ ++ || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MRAPTORLAKE \ ++ || MNATIVE_INTEL || MNATIVE_AMD) + default "5" if X86_32 && X86_CMPXCHG64 + default "4" + + config X86_DEBUGCTLMSR + def_bool y +- depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486SX || M486) && !UML ++ depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 \ ++ || M486SX || M486) && !UML + + config IA32_FEAT_CTL + def_bool y +diff --git a/arch/x86/Makefile b/arch/x86/Makefile +index 73ed982d4100..cb4c6620b34a 100644 +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -67,7 +67,7 @@ export BITS + # + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 + # +-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx ++KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -O3 -fno-tree-vectorize + KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 + + ifeq ($(CONFIG_X86_KERNEL_IBT),y) +@@ -151,8 +151,47 @@ else + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) + cflags-$(CONFIG_MK8) += -march=k8 + cflags-$(CONFIG_MPSC) += -march=nocona +- cflags-$(CONFIG_MCORE2) += -march=core2 +- cflags-$(CONFIG_MATOM) += -march=atom ++ cflags-$(CONFIG_MK8SSE3) += -march=k8-sse3 ++ cflags-$(CONFIG_MK10) += -march=amdfam10 ++ cflags-$(CONFIG_MBARCELONA) += -march=barcelona ++ cflags-$(CONFIG_MBOBCAT) += -march=btver1 ++ cflags-$(CONFIG_MJAGUAR) += -march=btver2 ++ cflags-$(CONFIG_MBULLDOZER) += -march=bdver1 ++ cflags-$(CONFIG_MPILEDRIVER) += -march=bdver2 -mno-tbm ++ cflags-$(CONFIG_MSTEAMROLLER) += -march=bdver3 -mno-tbm ++ cflags-$(CONFIG_MEXCAVATOR) += -march=bdver4 -mno-tbm ++ cflags-$(CONFIG_MZEN) += -march=znver1 ++ cflags-$(CONFIG_MZEN2) += -march=znver2 ++ cflags-$(CONFIG_MZEN3) += -march=znver3 ++ 
cflags-$(CONFIG_MZEN4) += -march=znver4 ++ cflags-$(CONFIG_MNATIVE_INTEL) += -march=native ++ cflags-$(CONFIG_MNATIVE_AMD) += -march=native ++ cflags-$(CONFIG_MATOM) += -march=bonnell ++ cflags-$(CONFIG_MCORE2) += -march=core2 ++ cflags-$(CONFIG_MNEHALEM) += -march=nehalem ++ cflags-$(CONFIG_MWESTMERE) += -march=westmere ++ cflags-$(CONFIG_MSILVERMONT) += -march=silvermont ++ cflags-$(CONFIG_MGOLDMONT) += -march=goldmont ++ cflags-$(CONFIG_MGOLDMONTPLUS) += -march=goldmont-plus ++ cflags-$(CONFIG_MSANDYBRIDGE) += -march=sandybridge ++ cflags-$(CONFIG_MIVYBRIDGE) += -march=ivybridge ++ cflags-$(CONFIG_MHASWELL) += -march=haswell ++ cflags-$(CONFIG_MBROADWELL) += -march=broadwell ++ cflags-$(CONFIG_MSKYLAKE) += -march=skylake ++ cflags-$(CONFIG_MSKYLAKEX) += -march=skylake-avx512 ++ cflags-$(CONFIG_MCANNONLAKE) += -march=cannonlake ++ cflags-$(CONFIG_MICELAKE) += -march=icelake-client ++ cflags-$(CONFIG_MCASCADELAKE) += -march=cascadelake ++ cflags-$(CONFIG_MCOOPERLAKE) += -march=cooperlake ++ cflags-$(CONFIG_MTIGERLAKE) += -march=tigerlake ++ cflags-$(CONFIG_MSAPPHIRERAPIDS) += -march=sapphirerapids ++ cflags-$(CONFIG_MROCKETLAKE) += -march=rocketlake ++ cflags-$(CONFIG_MALDERLAKE) += -march=alderlake ++ cflags-$(CONFIG_MRAPTORLAKE) += -march=raptorlake ++ cflags-$(CONFIG_MMETEORLAKE) += -march=meteorlake ++ cflags-$(CONFIG_GENERIC_CPU2) += -march=x86-64-v2 ++ cflags-$(CONFIG_GENERIC_CPU3) += -march=x86-64-v3 ++ cflags-$(CONFIG_GENERIC_CPU4) += -march=x86-64-v4 + cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic + KBUILD_CFLAGS += $(cflags-y) + +diff --git a/arch/x86/Makefile.postlink b/arch/x86/Makefile.postlink +new file mode 100644 +index 000000000000..b38ffa4defb3 +--- /dev/null ++++ b/arch/x86/Makefile.postlink +@@ -0,0 +1,41 @@ ++# SPDX-License-Identifier: GPL-2.0 ++# =========================================================================== ++# Post-link x86 pass ++# =========================================================================== ++# ++# 1. Separate relocations from vmlinux into vmlinux.relocs. ++# 2. Strip relocations from vmlinux. 
++ ++PHONY := __archpost ++__archpost: ++ ++-include include/config/auto.conf ++include scripts/Kbuild.include ++ ++CMD_RELOCS = arch/x86/tools/relocs ++quiet_cmd_relocs = RELOCS $@.relocs ++ cmd_relocs = $(CMD_RELOCS) $@ > $@.relocs;$(CMD_RELOCS) --abs-relocs $@ ++ ++quiet_cmd_strip_relocs = RSTRIP $@ ++ cmd_strip_relocs = $(OBJCOPY) --remove-section='.rel.*' --remove-section='.rel__*' --remove-section='.rela.*' --remove-section='.rela__*' $@ ++ ++# `@true` prevents complaint when there is nothing to be done ++ ++vmlinux: FORCE ++ @true ++ifeq ($(CONFIG_X86_NEED_RELOCS),y) ++ $(call cmd,relocs) ++ $(call cmd,strip_relocs) ++endif ++ ++%.ko: FORCE ++ @true ++ ++clean: ++ @rm -f vmlinux.relocs ++ ++PHONY += FORCE clean ++ ++FORCE: ++ ++.PHONY: $(PHONY) +diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore +index 25805199a506..b2968175fc27 100644 +--- a/arch/x86/boot/compressed/.gitignore ++++ b/arch/x86/boot/compressed/.gitignore +@@ -1,7 +1,6 @@ + # SPDX-License-Identifier: GPL-2.0-only + relocs + vmlinux.bin.all +-vmlinux.relocs + vmlinux.lds + mkpiggy + piggy.S +diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile +index 1acff356d97a..d995595394bb 100644 +--- a/arch/x86/boot/compressed/Makefile ++++ b/arch/x86/boot/compressed/Makefile +@@ -121,14 +121,12 @@ $(obj)/vmlinux.bin: vmlinux FORCE + + targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs + +-CMD_RELOCS = arch/x86/tools/relocs +-quiet_cmd_relocs = RELOCS $@ +- cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< +-$(obj)/vmlinux.relocs: vmlinux FORCE +- $(call if_changed,relocs) ++# vmlinux.relocs is created by the vmlinux postlink step. ++vmlinux.relocs: vmlinux ++ @true + + vmlinux.bin.all-y := $(obj)/vmlinux.bin +-vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs ++vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += vmlinux.relocs + + $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE + $(call if_changed,gzip) +diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h +index 75884d2cdec3..18021e8c0c28 100644 +--- a/arch/x86/include/asm/vermagic.h ++++ b/arch/x86/include/asm/vermagic.h +@@ -17,6 +17,52 @@ + #define MODULE_PROC_FAMILY "586MMX " + #elif defined CONFIG_MCORE2 + #define MODULE_PROC_FAMILY "CORE2 " ++#elif defined CONFIG_MNATIVE_INTEL ++#define MODULE_PROC_FAMILY "NATIVE_INTEL " ++#elif defined CONFIG_MNATIVE_AMD ++#define MODULE_PROC_FAMILY "NATIVE_AMD " ++#elif defined CONFIG_MNEHALEM ++#define MODULE_PROC_FAMILY "NEHALEM " ++#elif defined CONFIG_MWESTMERE ++#define MODULE_PROC_FAMILY "WESTMERE " ++#elif defined CONFIG_MSILVERMONT ++#define MODULE_PROC_FAMILY "SILVERMONT " ++#elif defined CONFIG_MGOLDMONT ++#define MODULE_PROC_FAMILY "GOLDMONT " ++#elif defined CONFIG_MGOLDMONTPLUS ++#define MODULE_PROC_FAMILY "GOLDMONTPLUS " ++#elif defined CONFIG_MSANDYBRIDGE ++#define MODULE_PROC_FAMILY "SANDYBRIDGE " ++#elif defined CONFIG_MIVYBRIDGE ++#define MODULE_PROC_FAMILY "IVYBRIDGE " ++#elif defined CONFIG_MHASWELL ++#define MODULE_PROC_FAMILY "HASWELL " ++#elif defined CONFIG_MBROADWELL ++#define MODULE_PROC_FAMILY "BROADWELL " ++#elif defined CONFIG_MSKYLAKE ++#define MODULE_PROC_FAMILY "SKYLAKE " ++#elif defined CONFIG_MSKYLAKEX ++#define MODULE_PROC_FAMILY "SKYLAKEX " ++#elif defined CONFIG_MCANNONLAKE ++#define MODULE_PROC_FAMILY "CANNONLAKE " ++#elif defined CONFIG_MICELAKE ++#define MODULE_PROC_FAMILY "ICELAKE " ++#elif defined CONFIG_MCASCADELAKE ++#define MODULE_PROC_FAMILY 
"CASCADELAKE " ++#elif defined CONFIG_MCOOPERLAKE ++#define MODULE_PROC_FAMILY "COOPERLAKE " ++#elif defined CONFIG_MTIGERLAKE ++#define MODULE_PROC_FAMILY "TIGERLAKE " ++#elif defined CONFIG_MSAPPHIRERAPIDS ++#define MODULE_PROC_FAMILY "SAPPHIRERAPIDS " ++#elif defined CONFIG_ROCKETLAKE ++#define MODULE_PROC_FAMILY "ROCKETLAKE " ++#elif defined CONFIG_MALDERLAKE ++#define MODULE_PROC_FAMILY "ALDERLAKE " ++#elif defined CONFIG_MRAPTORLAKE ++#define MODULE_PROC_FAMILY "RAPTORLAKE " ++#elif defined CONFIG_MMETEORLAKE ++#define MODULE_PROC_FAMILY "METEORLAKE " + #elif defined CONFIG_MATOM + #define MODULE_PROC_FAMILY "ATOM " + #elif defined CONFIG_M686 +@@ -35,6 +81,32 @@ + #define MODULE_PROC_FAMILY "K7 " + #elif defined CONFIG_MK8 + #define MODULE_PROC_FAMILY "K8 " ++#elif defined CONFIG_MK8SSE3 ++#define MODULE_PROC_FAMILY "K8SSE3 " ++#elif defined CONFIG_MK10 ++#define MODULE_PROC_FAMILY "K10 " ++#elif defined CONFIG_MBARCELONA ++#define MODULE_PROC_FAMILY "BARCELONA " ++#elif defined CONFIG_MBOBCAT ++#define MODULE_PROC_FAMILY "BOBCAT " ++#elif defined CONFIG_MBULLDOZER ++#define MODULE_PROC_FAMILY "BULLDOZER " ++#elif defined CONFIG_MPILEDRIVER ++#define MODULE_PROC_FAMILY "PILEDRIVER " ++#elif defined CONFIG_MSTEAMROLLER ++#define MODULE_PROC_FAMILY "STEAMROLLER " ++#elif defined CONFIG_MJAGUAR ++#define MODULE_PROC_FAMILY "JAGUAR " ++#elif defined CONFIG_MEXCAVATOR ++#define MODULE_PROC_FAMILY "EXCAVATOR " ++#elif defined CONFIG_MZEN ++#define MODULE_PROC_FAMILY "ZEN " ++#elif defined CONFIG_MZEN2 ++#define MODULE_PROC_FAMILY "ZEN2 " ++#elif defined CONFIG_MZEN3 ++#define MODULE_PROC_FAMILY "ZEN3 " ++#elif defined CONFIG_MZEN4 ++#define MODULE_PROC_FAMILY "ZEN4 " + #elif defined CONFIG_MELAN + #define MODULE_PROC_FAMILY "ELAN " + #elif defined CONFIG_MCRUSOE +diff --git a/drivers/Makefile b/drivers/Makefile +index bdf1c66141c9..1e1a0832fb48 100644 +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -59,15 +59,8 @@ obj-y += char/ + # iommu/ comes before gpu as gpu are using iommu controllers + obj-y += iommu/ + +-# gpu/ comes after char for AGP vs DRM startup and after iommu +-obj-y += gpu/ +- + obj-$(CONFIG_CONNECTOR) += connector/ + +-# i810fb and intelfb depend on char/agp/ +-obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +-obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ +- + obj-$(CONFIG_PARPORT) += parport/ + obj-y += base/ block/ misc/ mfd/ nfc/ + obj-$(CONFIG_LIBNVDIMM) += nvdimm/ +@@ -79,6 +72,14 @@ obj-y += macintosh/ + obj-y += scsi/ + obj-y += nvme/ + obj-$(CONFIG_ATA) += ata/ ++ ++# gpu/ comes after char for AGP vs DRM startup and after iommu ++obj-y += gpu/ ++ ++# i810fb and intelfb depend on char/agp/ ++obj-$(CONFIG_FB_I810) += video/fbdev/i810/ ++obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ ++ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ +diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig +index a7bfddf08fa7..c9a5fa597950 100644 +--- a/drivers/i2c/busses/Kconfig ++++ b/drivers/i2c/busses/Kconfig +@@ -229,6 +229,15 @@ config I2C_CHT_WC + combined with a FUSB302 Type-C port-controller as such it is advised + to also select CONFIG_TYPEC_FUSB302=m. + ++config I2C_NCT6775 ++ tristate "Nuvoton NCT6775 and compatible SMBus controller" ++ help ++ If you say yes to this option, support will be included for the ++ Nuvoton NCT6775 and compatible SMBus controllers. ++ ++ This driver can also be built as a module. If so, the module ++ will be called i2c-nct6775. 
++ + config I2C_NFORCE2 + tristate "Nvidia nForce2, nForce3 and nForce4" + depends on PCI +diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile +index e73cdb1d2b5a..052ccd05c13c 100644 +--- a/drivers/i2c/busses/Makefile ++++ b/drivers/i2c/busses/Makefile +@@ -20,6 +20,7 @@ obj-$(CONFIG_I2C_CHT_WC) += i2c-cht-wc.o + obj-$(CONFIG_I2C_I801) += i2c-i801.o + obj-$(CONFIG_I2C_ISCH) += i2c-isch.o + obj-$(CONFIG_I2C_ISMT) += i2c-ismt.o ++obj-$(CONFIG_I2C_NCT6775) += i2c-nct6775.o + obj-$(CONFIG_I2C_NFORCE2) += i2c-nforce2.o + obj-$(CONFIG_I2C_NFORCE2_S4985) += i2c-nforce2-s4985.o + obj-$(CONFIG_I2C_NVIDIA_GPU) += i2c-nvidia-gpu.o +diff --git a/drivers/i2c/busses/i2c-nct6775.c b/drivers/i2c/busses/i2c-nct6775.c +new file mode 100644 +index 000000000000..0462f0952043 +--- /dev/null ++++ b/drivers/i2c/busses/i2c-nct6775.c +@@ -0,0 +1,647 @@ ++/* ++ * i2c-nct6775 - Driver for the SMBus master functionality of ++ * Nuvoton NCT677x Super-I/O chips ++ * ++ * Copyright (C) 2019 Adam Honse ++ * ++ * Derived from nct6775 hwmon driver ++ * Copyright (C) 2012 Guenter Roeck ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define DRVNAME "i2c-nct6775" ++ ++/* Nuvoton SMBus address offsets */ ++#define SMBHSTDAT (0 + nuvoton_nct6793d_smba) ++#define SMBBLKSZ (1 + nuvoton_nct6793d_smba) ++#define SMBHSTCMD (2 + nuvoton_nct6793d_smba) ++#define SMBHSTIDX (3 + nuvoton_nct6793d_smba) //Index field is the Command field on other controllers ++#define SMBHSTCTL (4 + nuvoton_nct6793d_smba) ++#define SMBHSTADD (5 + nuvoton_nct6793d_smba) ++#define SMBHSTERR (9 + nuvoton_nct6793d_smba) ++#define SMBHSTSTS (0xE + nuvoton_nct6793d_smba) ++ ++/* Command register */ ++#define NCT6793D_READ_BYTE 0 ++#define NCT6793D_READ_WORD 1 ++#define NCT6793D_READ_BLOCK 2 ++#define NCT6793D_BLOCK_WRITE_READ_PROC_CALL 3 ++#define NCT6793D_PROC_CALL 4 ++#define NCT6793D_WRITE_BYTE 8 ++#define NCT6793D_WRITE_WORD 9 ++#define NCT6793D_WRITE_BLOCK 10 ++ ++/* Control register */ ++#define NCT6793D_MANUAL_START 128 ++#define NCT6793D_SOFT_RESET 64 ++ ++/* Error register */ ++#define NCT6793D_NO_ACK 32 ++ ++/* Status register */ ++#define NCT6793D_FIFO_EMPTY 1 ++#define NCT6793D_FIFO_FULL 2 ++#define NCT6793D_MANUAL_ACTIVE 4 ++ ++#define NCT6775_LD_SMBUS 0x0B ++ ++/* Other settings */ ++#define MAX_RETRIES 400 ++ ++enum kinds { nct6106, nct6775, nct6776, nct6779, nct6791, nct6792, nct6793, ++ nct6795, nct6796, nct6798 }; ++ ++struct nct6775_sio_data { ++ int sioreg; ++ enum kinds kind; ++}; ++ ++/* used to set data->name = nct6775_device_names[data->sio_kind] */ ++static const char * const nct6775_device_names[] = { ++ "nct6106", ++ "nct6775", ++ "nct6776", ++ "nct6779", ++ "nct6791", ++ "nct6792", ++ "nct6793", ++ "nct6795", ++ "nct6796", ++ "nct6798", ++}; ++ ++static const char * const nct6775_sio_names[] __initconst = { ++ "NCT6106D", ++ "NCT6775F", ++ "NCT6776D/F", ++ "NCT6779D", ++ "NCT6791D", ++ "NCT6792D", ++ "NCT6793D", ++ "NCT6795D", ++ "NCT6796D", ++ "NCT6798D", ++}; ++ ++#define SIO_REG_LDSEL 0x07 /* Logical device select */ ++#define SIO_REG_DEVID 0x20 /* Device ID (2 bytes) */ ++#define SIO_REG_SMBA 0x62 /* SMBus base address register */ ++ ++#define SIO_NCT6106_ID 0xc450 ++#define SIO_NCT6775_ID 0xb470 ++#define SIO_NCT6776_ID 0xc330 ++#define SIO_NCT6779_ID 0xc560 ++#define SIO_NCT6791_ID 0xc800 ++#define SIO_NCT6792_ID 0xc910 ++#define SIO_NCT6793_ID 0xd120 ++#define SIO_NCT6795_ID 0xd350 ++#define SIO_NCT6796_ID 0xd420 ++#define SIO_NCT6798_ID 0xd428 ++#define SIO_ID_MASK 0xFFF0 ++ ++static inline void ++superio_outb(int ioreg, int reg, int val) ++{ ++ outb(reg, ioreg); ++ outb(val, ioreg + 1); ++} ++ ++static inline int ++superio_inb(int ioreg, int reg) ++{ ++ outb(reg, ioreg); ++ return inb(ioreg + 1); ++} ++ ++static inline void ++superio_select(int ioreg, int ld) ++{ ++ outb(SIO_REG_LDSEL, ioreg); ++ outb(ld, ioreg + 1); ++} ++ ++static inline int ++superio_enter(int ioreg) ++{ ++ /* ++ * Try to reserve and for exclusive access. 
++ */ ++ if (!request_muxed_region(ioreg, 2, DRVNAME)) ++ return -EBUSY; ++ ++ outb(0x87, ioreg); ++ outb(0x87, ioreg); ++ ++ return 0; ++} ++ ++static inline void ++superio_exit(int ioreg) ++{ ++ outb(0xaa, ioreg); ++ outb(0x02, ioreg); ++ outb(0x02, ioreg + 1); ++ release_region(ioreg, 2); ++} ++ ++/* ++ * ISA constants ++ */ ++ ++#define IOREGION_ALIGNMENT (~7) ++#define IOREGION_LENGTH 2 ++#define ADDR_REG_OFFSET 0 ++#define DATA_REG_OFFSET 1 ++ ++#define NCT6775_REG_BANK 0x4E ++#define NCT6775_REG_CONFIG 0x40 ++ ++static struct i2c_adapter *nct6775_adapter; ++ ++struct i2c_nct6775_adapdata { ++ unsigned short smba; ++}; ++ ++/* Return negative errno on error. */ ++static s32 nct6775_access(struct i2c_adapter * adap, u16 addr, ++ unsigned short flags, char read_write, ++ u8 command, int size, union i2c_smbus_data * data) ++{ ++ struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap); ++ unsigned short nuvoton_nct6793d_smba = adapdata->smba; ++ int i, len, cnt; ++ union i2c_smbus_data tmp_data; ++ int timeout = 0; ++ ++ tmp_data.word = 0; ++ cnt = 0; ++ len = 0; ++ ++ outb_p(NCT6793D_SOFT_RESET, SMBHSTCTL); ++ ++ switch (size) { ++ case I2C_SMBUS_QUICK: ++ outb_p((addr << 1) | read_write, ++ SMBHSTADD); ++ break; ++ case I2C_SMBUS_BYTE_DATA: ++ tmp_data.byte = data->byte; ++ case I2C_SMBUS_BYTE: ++ outb_p((addr << 1) | read_write, ++ SMBHSTADD); ++ outb_p(command, SMBHSTIDX); ++ if (read_write == I2C_SMBUS_WRITE) { ++ outb_p(tmp_data.byte, SMBHSTDAT); ++ outb_p(NCT6793D_WRITE_BYTE, SMBHSTCMD); ++ } ++ else { ++ outb_p(NCT6793D_READ_BYTE, SMBHSTCMD); ++ } ++ break; ++ case I2C_SMBUS_WORD_DATA: ++ outb_p((addr << 1) | read_write, ++ SMBHSTADD); ++ outb_p(command, SMBHSTIDX); ++ if (read_write == I2C_SMBUS_WRITE) { ++ outb_p(data->word & 0xff, SMBHSTDAT); ++ outb_p((data->word & 0xff00) >> 8, SMBHSTDAT); ++ outb_p(NCT6793D_WRITE_WORD, SMBHSTCMD); ++ } ++ else { ++ outb_p(NCT6793D_READ_WORD, SMBHSTCMD); ++ } ++ break; ++ case I2C_SMBUS_BLOCK_DATA: ++ outb_p((addr << 1) | read_write, ++ SMBHSTADD); ++ outb_p(command, SMBHSTIDX); ++ if (read_write == I2C_SMBUS_WRITE) { ++ len = data->block[0]; ++ if (len == 0 || len > I2C_SMBUS_BLOCK_MAX) ++ return -EINVAL; ++ outb_p(len, SMBBLKSZ); ++ ++ cnt = 1; ++ if (len >= 4) { ++ for (i = cnt; i <= 4; i++) { ++ outb_p(data->block[i], SMBHSTDAT); ++ } ++ ++ len -= 4; ++ cnt += 4; ++ } ++ else { ++ for (i = cnt; i <= len; i++ ) { ++ outb_p(data->block[i], SMBHSTDAT); ++ } ++ ++ len = 0; ++ } ++ ++ outb_p(NCT6793D_WRITE_BLOCK, SMBHSTCMD); ++ } ++ else { ++ return -ENOTSUPP; ++ } ++ break; ++ default: ++ dev_warn(&adap->dev, "Unsupported transaction %d\n", size); ++ return -EOPNOTSUPP; ++ } ++ ++ outb_p(NCT6793D_MANUAL_START, SMBHSTCTL); ++ ++ while ((size == I2C_SMBUS_BLOCK_DATA) && (len > 0)) { ++ if (read_write == I2C_SMBUS_WRITE) { ++ timeout = 0; ++ while ((inb_p(SMBHSTSTS) & NCT6793D_FIFO_EMPTY) == 0) ++ { ++ if(timeout > MAX_RETRIES) ++ { ++ return -ETIMEDOUT; ++ } ++ usleep_range(250, 500); ++ timeout++; ++ } ++ ++ //Load more bytes into FIFO ++ if (len >= 4) { ++ for (i = cnt; i <= (cnt + 4); i++) { ++ outb_p(data->block[i], SMBHSTDAT); ++ } ++ ++ len -= 4; ++ cnt += 4; ++ } ++ else { ++ for (i = cnt; i <= (cnt + len); i++) { ++ outb_p(data->block[i], SMBHSTDAT); ++ } ++ ++ len = 0; ++ } ++ } ++ else { ++ return -ENOTSUPP; ++ } ++ ++ } ++ ++ //wait for manual mode to complete ++ timeout = 0; ++ while ((inb_p(SMBHSTSTS) & NCT6793D_MANUAL_ACTIVE) != 0) ++ { ++ if(timeout > MAX_RETRIES) ++ { ++ return -ETIMEDOUT; ++ } ++ usleep_range(250, 500); 
++ timeout++; ++ } ++ ++ if ((inb_p(SMBHSTERR) & NCT6793D_NO_ACK) != 0) { ++ return -ENXIO; ++ } ++ else if ((read_write == I2C_SMBUS_WRITE) || (size == I2C_SMBUS_QUICK)) { ++ return 0; ++ } ++ ++ switch (size) { ++ case I2C_SMBUS_QUICK: ++ case I2C_SMBUS_BYTE_DATA: ++ data->byte = inb_p(SMBHSTDAT); ++ break; ++ case I2C_SMBUS_WORD_DATA: ++ data->word = inb_p(SMBHSTDAT) + (inb_p(SMBHSTDAT) << 8); ++ break; ++ } ++ return 0; ++} ++ ++static u32 nct6775_func(struct i2c_adapter *adapter) ++{ ++ return I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE | ++ I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA | ++ I2C_FUNC_SMBUS_BLOCK_DATA; ++} ++ ++static const struct i2c_algorithm smbus_algorithm = { ++ .smbus_xfer = nct6775_access, ++ .functionality = nct6775_func, ++}; ++ ++static int nct6775_add_adapter(unsigned short smba, const char *name, struct i2c_adapter **padap) ++{ ++ struct i2c_adapter *adap; ++ struct i2c_nct6775_adapdata *adapdata; ++ int retval; ++ ++ adap = kzalloc(sizeof(*adap), GFP_KERNEL); ++ if (adap == NULL) { ++ return -ENOMEM; ++ } ++ ++ adap->owner = THIS_MODULE; ++ adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; ++ adap->algo = &smbus_algorithm; ++ ++ adapdata = kzalloc(sizeof(*adapdata), GFP_KERNEL); ++ if (adapdata == NULL) { ++ kfree(adap); ++ return -ENOMEM; ++ } ++ ++ adapdata->smba = smba; ++ ++ snprintf(adap->name, sizeof(adap->name), ++ "SMBus NCT67xx adapter%s at %04x", name, smba); ++ ++ i2c_set_adapdata(adap, adapdata); ++ ++ retval = i2c_add_adapter(adap); ++ if (retval) { ++ kfree(adapdata); ++ kfree(adap); ++ return retval; ++ } ++ ++ *padap = adap; ++ return 0; ++} ++ ++static void nct6775_remove_adapter(struct i2c_adapter *adap) ++{ ++ struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap); ++ ++ if (adapdata->smba) { ++ i2c_del_adapter(adap); ++ kfree(adapdata); ++ kfree(adap); ++ } ++} ++ ++//static SIMPLE_DEV_PM_OPS(nct6775_dev_pm_ops, nct6775_suspend, nct6775_resume); ++ ++/* ++ * when Super-I/O functions move to a separate file, the Super-I/O ++ * bus will manage the lifetime of the device and this module will only keep ++ * track of the nct6775 driver. 
But since we use platform_device_alloc(), we ++ * must keep track of the device ++ */ ++static struct platform_device *pdev[2]; ++ ++static int nct6775_probe(struct platform_device *pdev) ++{ ++ struct device *dev = &pdev->dev; ++ struct nct6775_sio_data *sio_data = dev_get_platdata(dev); ++ struct resource *res; ++ ++ res = platform_get_resource(pdev, IORESOURCE_IO, 0); ++ if (!devm_request_region(&pdev->dev, res->start, IOREGION_LENGTH, ++ DRVNAME)) ++ return -EBUSY; ++ ++ switch (sio_data->kind) { ++ case nct6791: ++ case nct6792: ++ case nct6793: ++ case nct6795: ++ case nct6796: ++ case nct6798: ++ nct6775_add_adapter(res->start, "", &nct6775_adapter); ++ break; ++ default: ++ return -ENODEV; ++ } ++ ++ return 0; ++} ++/* ++static void nct6791_enable_io_mapping(int sioaddr) ++{ ++ int val; ++ ++ val = superio_inb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE); ++ if (val & 0x10) { ++ pr_info("Enabling hardware monitor logical device mappings.\n"); ++ superio_outb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE, ++ val & ~0x10); ++ } ++}*/ ++ ++static struct platform_driver i2c_nct6775_driver = { ++ .driver = { ++ .name = DRVNAME, ++// .pm = &nct6775_dev_pm_ops, ++ }, ++ .probe = nct6775_probe, ++}; ++ ++static void __exit i2c_nct6775_exit(void) ++{ ++ int i; ++ ++ if(nct6775_adapter) ++ nct6775_remove_adapter(nct6775_adapter); ++ ++ for (i = 0; i < ARRAY_SIZE(pdev); i++) { ++ if (pdev[i]) ++ platform_device_unregister(pdev[i]); ++ } ++ platform_driver_unregister(&i2c_nct6775_driver); ++} ++ ++/* nct6775_find() looks for a '627 in the Super-I/O config space */ ++static int __init nct6775_find(int sioaddr, struct nct6775_sio_data *sio_data) ++{ ++ u16 val; ++ int err; ++ int addr; ++ ++ err = superio_enter(sioaddr); ++ if (err) ++ return err; ++ ++ val = (superio_inb(sioaddr, SIO_REG_DEVID) << 8) | ++ superio_inb(sioaddr, SIO_REG_DEVID + 1); ++ ++ switch (val & SIO_ID_MASK) { ++ case SIO_NCT6106_ID: ++ sio_data->kind = nct6106; ++ break; ++ case SIO_NCT6775_ID: ++ sio_data->kind = nct6775; ++ break; ++ case SIO_NCT6776_ID: ++ sio_data->kind = nct6776; ++ break; ++ case SIO_NCT6779_ID: ++ sio_data->kind = nct6779; ++ break; ++ case SIO_NCT6791_ID: ++ sio_data->kind = nct6791; ++ break; ++ case SIO_NCT6792_ID: ++ sio_data->kind = nct6792; ++ break; ++ case SIO_NCT6793_ID: ++ sio_data->kind = nct6793; ++ break; ++ case SIO_NCT6795_ID: ++ sio_data->kind = nct6795; ++ break; ++ case SIO_NCT6796_ID: ++ sio_data->kind = nct6796; ++ break; ++ case SIO_NCT6798_ID: ++ sio_data->kind = nct6798; ++ break; ++ default: ++ if (val != 0xffff) ++ pr_debug("unsupported chip ID: 0x%04x\n", val); ++ superio_exit(sioaddr); ++ return -ENODEV; ++ } ++ ++ /* We have a known chip, find the SMBus I/O address */ ++ superio_select(sioaddr, NCT6775_LD_SMBUS); ++ val = (superio_inb(sioaddr, SIO_REG_SMBA) << 8) ++ | superio_inb(sioaddr, SIO_REG_SMBA + 1); ++ addr = val & IOREGION_ALIGNMENT; ++ if (addr == 0) { ++ pr_err("Refusing to enable a Super-I/O device with a base I/O port 0\n"); ++ superio_exit(sioaddr); ++ return -ENODEV; ++ } ++ ++ //if (sio_data->kind == nct6791 || sio_data->kind == nct6792 || ++ // sio_data->kind == nct6793 || sio_data->kind == nct6795 || ++ // sio_data->kind == nct6796) ++ // nct6791_enable_io_mapping(sioaddr); ++ ++ superio_exit(sioaddr); ++ pr_info("Found %s or compatible chip at %#x:%#x\n", ++ nct6775_sio_names[sio_data->kind], sioaddr, addr); ++ sio_data->sioreg = sioaddr; ++ ++ return addr; ++} ++ ++static int __init i2c_nct6775_init(void) ++{ ++ int i, err; ++ bool found = false; ++ int 
address; ++ struct resource res; ++ struct nct6775_sio_data sio_data; ++ int sioaddr[2] = { 0x2e, 0x4e }; ++ ++ err = platform_driver_register(&i2c_nct6775_driver); ++ if (err) ++ return err; ++ ++ /* ++ * initialize sio_data->kind and sio_data->sioreg. ++ * ++ * when Super-I/O functions move to a separate file, the Super-I/O ++ * driver will probe 0x2e and 0x4e and auto-detect the presence of a ++ * nct6775 hardware monitor, and call probe() ++ */ ++ for (i = 0; i < ARRAY_SIZE(pdev); i++) { ++ address = nct6775_find(sioaddr[i], &sio_data); ++ if (address <= 0) ++ continue; ++ ++ found = true; ++ ++ pdev[i] = platform_device_alloc(DRVNAME, address); ++ if (!pdev[i]) { ++ err = -ENOMEM; ++ goto exit_device_unregister; ++ } ++ ++ err = platform_device_add_data(pdev[i], &sio_data, ++ sizeof(struct nct6775_sio_data)); ++ if (err) ++ goto exit_device_put; ++ ++ memset(&res, 0, sizeof(res)); ++ res.name = DRVNAME; ++ res.start = address; ++ res.end = address + IOREGION_LENGTH - 1; ++ res.flags = IORESOURCE_IO; ++ ++ err = acpi_check_resource_conflict(&res); ++ if (err) { ++ platform_device_put(pdev[i]); ++ pdev[i] = NULL; ++ continue; ++ } ++ ++ err = platform_device_add_resources(pdev[i], &res, 1); ++ if (err) ++ goto exit_device_put; ++ ++ /* platform_device_add calls probe() */ ++ err = platform_device_add(pdev[i]); ++ if (err) ++ goto exit_device_put; ++ } ++ if (!found) { ++ err = -ENODEV; ++ goto exit_unregister; ++ } ++ ++ return 0; ++ ++exit_device_put: ++ platform_device_put(pdev[i]); ++exit_device_unregister: ++ while (--i >= 0) { ++ if (pdev[i]) ++ platform_device_unregister(pdev[i]); ++ } ++exit_unregister: ++ platform_driver_unregister(&i2c_nct6775_driver); ++ return err; ++} ++ ++MODULE_AUTHOR("Adam Honse "); ++MODULE_DESCRIPTION("SMBus driver for NCT6775F and compatible chips"); ++MODULE_LICENSE("GPL"); ++ ++module_init(i2c_nct6775_init); ++module_exit(i2c_nct6775_exit); +diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c +index 809fbd014cd6..d54b35b147ee 100644 +--- a/drivers/i2c/busses/i2c-piix4.c ++++ b/drivers/i2c/busses/i2c-piix4.c +@@ -568,11 +568,11 @@ static int piix4_transaction(struct i2c_adapter *piix4_adapter) + if (srvrworks_csb5_delay) /* Extra delay for SERVERWORKS_CSB5 */ + usleep_range(2000, 2100); + else +- usleep_range(250, 500); ++ usleep_range(25, 50); + + while ((++timeout < MAX_TIMEOUT) && + ((temp = inb_p(SMBHSTSTS)) & 0x01)) +- usleep_range(250, 500); ++ usleep_range(25, 50); + + /* If the SMBus is still busy, we give up */ + if (timeout == MAX_TIMEOUT) { +diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c +index 2653516bcdef..973fe8f80051 100644 +--- a/drivers/md/dm-crypt.c ++++ b/drivers/md/dm-crypt.c +@@ -3207,6 +3207,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) + goto bad; + } + ++#ifdef CONFIG_CACHY ++ set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); ++ set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); ++#endif ++ + ret = crypt_ctr_cipher(ti, argv[0], argv[1]); + if (ret < 0) + goto bad; +diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c +index 285acc4aaccc..492e88a99c07 100644 +--- a/drivers/pci/quirks.c ++++ b/drivers/pci/quirks.c +@@ -3612,6 +3612,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) + dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; + } + ++static bool acs_on_downstream; ++static bool acs_on_multifunction; ++ ++#define NUM_ACS_IDS 16 ++struct acs_on_id { ++ unsigned short vendor; ++ unsigned short device; ++}; ++static struct acs_on_id 
acs_on_ids[NUM_ACS_IDS]; ++static u8 max_acs_id; ++ ++static __init int pcie_acs_override_setup(char *p) ++{ ++ if (!p) ++ return -EINVAL; ++ ++ while (*p) { ++ if (!strncmp(p, "downstream", 10)) ++ acs_on_downstream = true; ++ if (!strncmp(p, "multifunction", 13)) ++ acs_on_multifunction = true; ++ if (!strncmp(p, "id:", 3)) { ++ char opt[5]; ++ int ret; ++ long val; ++ ++ if (max_acs_id >= NUM_ACS_IDS - 1) { ++ pr_warn("Out of PCIe ACS override slots (%d)\n", ++ NUM_ACS_IDS); ++ goto next; ++ } ++ ++ p += 3; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].vendor = val; ++ ++ p += strcspn(p, ":"); ++ if (*p != ':') { ++ pr_warn("PCIe ACS invalid ID\n"); ++ goto next; ++ } ++ ++ p++; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].device = val; ++ max_acs_id++; ++ } ++next: ++ p += strcspn(p, ","); ++ if (*p == ',') ++ p++; ++ } ++ ++ if (acs_on_downstream || acs_on_multifunction || max_acs_id) ++ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); ++ ++ return 0; ++} ++early_param("pcie_acs_override", pcie_acs_override_setup); ++ ++static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) ++{ ++ int i; ++ ++ /* Never override ACS for legacy devices or devices with ACS caps */ ++ if (!pci_is_pcie(dev) || ++ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) ++ return -ENOTTY; ++ ++ for (i = 0; i < max_acs_id; i++) ++ if (acs_on_ids[i].vendor == dev->vendor && ++ acs_on_ids[i].device == dev->device) ++ return 1; ++ ++ switch (pci_pcie_type(dev)) { ++ case PCI_EXP_TYPE_DOWNSTREAM: ++ case PCI_EXP_TYPE_ROOT_PORT: ++ if (acs_on_downstream) ++ return 1; ++ break; ++ case PCI_EXP_TYPE_ENDPOINT: ++ case PCI_EXP_TYPE_UPSTREAM: ++ case PCI_EXP_TYPE_LEG_END: ++ case PCI_EXP_TYPE_RC_END: ++ if (acs_on_multifunction && dev->multifunction) ++ return 1; ++ } ++ ++ return -ENOTTY; ++} + /* + * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be + * prevented for those affected devices. 
+@@ -4980,6 +5080,7 @@ static const struct pci_dev_acs_enabled { + { PCI_VENDOR_ID_NXP, 0x8d9b, pci_quirk_nxp_rp_acs }, + /* Zhaoxin Root/Downstream Ports */ + { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, ++ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, + { 0 } + }; + +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index 29e1f9e76eb6..a7852e22101f 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -1178,7 +1178,7 @@ struct readahead_control { + ._index = i, \ + } + +-#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) ++#define VM_READAHEAD_PAGES (SZ_8M / PAGE_SIZE) + + void page_cache_ra_unbounded(struct readahead_control *, + unsigned long nr_to_read, unsigned long lookahead_count); +diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h +index 45f09bec02c4..87b20e2ee274 100644 +--- a/include/linux/user_namespace.h ++++ b/include/linux/user_namespace.h +@@ -148,6 +148,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, + + #ifdef CONFIG_USER_NS + ++extern int unprivileged_userns_clone; ++ + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) + { + if (ns) +@@ -181,6 +183,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns); + struct ns_common *ns_get_owner(struct ns_common *ns); + #else + ++#define unprivileged_userns_clone 0 ++ + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) + { + return &init_user_ns; +diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h +index db762e35aca9..0336791656eb 100644 +--- a/include/net/netns/ipv4.h ++++ b/include/net/netns/ipv4.h +@@ -194,6 +194,7 @@ struct netns_ipv4 { + int sysctl_udp_rmem_min; + + u8 sysctl_fib_notify_on_flag_change; ++ unsigned int sysctl_tcp_collapse_max_bytes; + + #ifdef CONFIG_NET_L3_MASTER_DEV + u8 sysctl_udp_l3mdev_accept; +diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h +index 901b440238d5..7026df84a0f6 100644 +--- a/include/trace/events/tcp.h ++++ b/include/trace/events/tcp.h +@@ -187,6 +187,13 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, + TP_ARGS(sk) + ); + ++DEFINE_EVENT(tcp_event_sk, tcp_collapse_max_bytes_exceeded, ++ ++ TP_PROTO(struct sock *sk), ++ ++ TP_ARGS(sk) ++); ++ + TRACE_EVENT(tcp_retransmit_synack, + + TP_PROTO(const struct sock *sk, const struct request_sock *req), +diff --git a/init/Kconfig b/init/Kconfig +index 44e90b28a30f..748a9491ca12 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -123,6 +123,10 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config CACHY ++ bool "Some kernel tweaks by CachyOS" ++ default y ++ + config BROKEN + bool + +@@ -348,6 +352,19 @@ config KERNEL_UNCOMPRESSED + + endchoice + ++menu "ZSTD compression options" ++ depends on KERNEL_ZSTD ++ ++config ZSTD_COMPRESSION_LEVEL ++ int "Compression level (1-22)" ++ range 1 22 ++ default "22" ++ help ++ Choose a compression level for zstd kernel compression. ++ Default is 22, which is the maximum. ++ ++endmenu ++ + config DEFAULT_INIT + string "Default init path" + default "" +@@ -1253,6 +1270,22 @@ config USER_NS + + If unsure, say N. + ++config USER_NS_UNPRIVILEGED ++ bool "Allow unprivileged users to create namespaces" ++ default y ++ depends on USER_NS ++ help ++ When disabled, unprivileged users will not be able to create ++ new namespaces. 
Allowing users to create their own namespaces ++ has been part of several recent local privilege escalation ++ exploits, so if you need user namespaces but are ++ paranoid^Wsecurity-conscious you want to disable this. ++ ++ This setting can be overridden at runtime via the ++ kernel.unprivileged_userns_clone sysctl. ++ ++ If unsure, say Y. ++ + config PID_NS + bool "PID Namespaces" + default y +@@ -1420,6 +1453,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE + with the "-O2" compiler flag for best performance and most + helpful compile-time warnings. + ++config CC_OPTIMIZE_FOR_PERFORMANCE_O3 ++ bool "Optimize more for performance (-O3)" ++ help ++ Choosing this option will pass "-O3" to your compiler to optimize ++ the kernel yet more for performance. ++ + config CC_OPTIMIZE_FOR_SIZE + bool "Optimize for size (-Os)" + help +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 38ef6d06888e..0f78364efd4f 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -40,6 +40,27 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with good smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ ++ config HZ_600 ++ bool "600 HZ" ++ help ++ 600 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with good smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with good smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -53,6 +74,9 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 ++ default 600 if HZ_600 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK +diff --git a/kernel/fork.c b/kernel/fork.c +index 9f7fe3541897..068062cdf5a3 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -98,6 +98,10 @@ + #include + #include + ++#ifdef CONFIG_USER_NS ++#include ++#endif ++ + #include + #include + #include +@@ -2030,6 +2034,10 @@ static __latent_entropy struct task_struct *copy_process( + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + ++ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) ++ if (!capable(CAP_SYS_ADMIN)) ++ return ERR_PTR(-EPERM); ++ + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. 
+@@ -3180,6 +3188,12 @@ int ksys_unshare(unsigned long unshare_flags) + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + ++ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto bad_unshare_out; ++ } ++ + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; +diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig +index 424b3bc58f3f..ecf2798c5ccf 100644 +--- a/kernel/module/Kconfig ++++ b/kernel/module/Kconfig +@@ -219,6 +219,31 @@ config MODULE_COMPRESS_ZSTD + + endchoice + ++menu "ZSTD module compression options" ++ depends on MODULE_COMPRESS_ZSTD ++ ++config MODULE_COMPRESS_ZSTD_LEVEL ++ int "Compression level (1-19)" ++ range 1 19 ++ default 9 ++ help ++ Compression level used by zstd for compressing modules. ++ ++config MODULE_COMPRESS_ZSTD_ULTRA ++ bool "Enable ZSTD ultra compression" ++ help ++ Compress modules with ZSTD using the highest possible compression. ++ ++config MODULE_COMPRESS_ZSTD_LEVEL_ULTRA ++ int "Compression level (20-22)" ++ depends on MODULE_COMPRESS_ZSTD_ULTRA ++ range 20 22 ++ default 20 ++ help ++ Ultra compression level used by zstd for compressing modules. ++ ++endmenu ++ + config MODULE_DECOMPRESS + bool "Support in-kernel module decompression" + depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD +diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig +index ab62074174c3..f1f909bdc30d 100644 +--- a/kernel/rcu/Kconfig ++++ b/kernel/rcu/Kconfig +@@ -280,9 +280,9 @@ config RCU_NOCB_CPU_CB_BOOST + depends on RCU_NOCB_CPU && RCU_BOOST + default y if PREEMPT_RT + help +- Use this option to invoke offloaded callbacks as SCHED_FIFO ++ Use this option to invoke offloaded callbacks as SCHED_RR + to avoid starvation by heavy SCHED_OTHER background load. +- Of course, running as SCHED_FIFO during callback floods will ++ Of course, running as SCHED_RR during callback floods will + cause the rcuo[ps] kthreads to monopolize the CPU for hundreds + of milliseconds or more. Therefore, when enabling this option, + it is your responsibility to ensure that latency-sensitive +diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c +index 634df26a2c27..8c54871cc0a0 100644 +--- a/kernel/rcu/rcutorture.c ++++ b/kernel/rcu/rcutorture.c +@@ -2406,7 +2406,7 @@ static int rcutorture_booster_init(unsigned int cpu) + t = per_cpu(ksoftirqd, cpu); + WARN_ON_ONCE(!t); + sp.sched_priority = 2; +- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); ++ sched_setscheduler_nocheck(t, SCHED_RR, &sp); + } + + /* Don't allow time recalculation while creating a new task. 
*/ +diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c +index cf34a961821a..80cf9824d461 100644 +--- a/kernel/rcu/tree.c ++++ b/kernel/rcu/tree.c +@@ -4443,8 +4443,8 @@ static void __init rcu_start_exp_gp_kworkers(void) + return; + } + +- sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); +- sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, ++ sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_RR, ¶m); ++ sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_RR, + ¶m); + } + +@@ -4482,7 +4482,7 @@ static int __init rcu_spawn_gp_kthread(void) + return 0; + if (kthread_prio) { + sp.sched_priority = kthread_prio; +- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); ++ sched_setscheduler_nocheck(t, SCHED_RR, &sp); + } + rnp = rcu_get_root(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); +diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h +index 9e1c8caec5ce..dd39c50ae099 100644 +--- a/kernel/rcu/tree_nocb.h ++++ b/kernel/rcu/tree_nocb.h +@@ -1465,7 +1465,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) + } + WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); + if (kthread_prio) +- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); ++ sched_setscheduler_nocheck(t, SCHED_RR, &sp); + } + mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); + +@@ -1476,7 +1476,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) + goto end; + + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio) +- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); ++ sched_setscheduler_nocheck(t, SCHED_RR, &sp); + + WRITE_ONCE(rdp->nocb_cb_kthread, t); + WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); +diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h +index 7b0fe741a088..77ad9e033358 100644 +--- a/kernel/rcu/tree_plugin.h ++++ b/kernel/rcu/tree_plugin.h +@@ -1007,7 +1007,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) + struct sched_param sp; + + sp.sched_priority = kthread_prio; +- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); ++ sched_setscheduler_nocheck(current, SCHED_RR, &sp); + #endif /* #ifdef CONFIG_RCU_BOOST */ + + WRITE_ONCE(rdp->rcuc_activity, jiffies); +@@ -1206,7 +1206,7 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) + rnp->boost_kthread_task = t; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + sp.sched_priority = kthread_prio; +- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); ++ sched_setscheduler_nocheck(t, SCHED_RR, &sp); + wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. 
*/ + + out: +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 0f8736991427..86a988c830ef 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -69,9 +69,13 @@ + * + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_CACHY ++unsigned int sysctl_sched_latency = 3000000ULL; ++static unsigned int normalized_sysctl_sched_latency = 3000000ULL; ++#else + unsigned int sysctl_sched_latency = 6000000ULL; + static unsigned int normalized_sysctl_sched_latency = 6000000ULL; +- ++#endif + /* + * The initial- and re-scaling of tunables is configurable + * +@@ -90,8 +94,13 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; + * + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_CACHY ++unsigned int sysctl_sched_min_granularity = 400000ULL; ++static unsigned int normalized_sysctl_sched_min_granularity = 400000ULL; ++#else + unsigned int sysctl_sched_min_granularity = 750000ULL; + static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; ++#endif + + /* + * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks. +@@ -121,8 +130,13 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; + * + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_CACHY ++unsigned int sysctl_sched_wakeup_granularity = 500000UL; ++static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; ++#else + unsigned int sysctl_sched_wakeup_granularity = 1000000UL; + static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; ++#endif + + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + +@@ -175,8 +189,12 @@ int __weak arch_asym_cpu_priority(int cpu) + * + * (default: 5 msec, units: microseconds) + */ ++#ifdef CONFIG_CACHY ++static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; ++#else + static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; + #endif ++#endif + + #ifdef CONFIG_NUMA_BALANCING + /* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 137d4abe3eda..98e2d9cc8491 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -96,6 +96,9 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); + #ifdef CONFIG_PERF_EVENTS + static const int six_hundred_forty_kb = 640 * 1024; + #endif ++#ifdef CONFIG_USER_NS ++#include ++#endif + + + static const int ngroups_max = NGROUPS_MAX; +@@ -1640,6 +1643,15 @@ static struct ctl_table kern_table[] = { + .mode = 0644, + .proc_handler = proc_dointvec, + }, ++#ifdef CONFIG_USER_NS ++ { ++ .procname = "unprivileged_userns_clone", ++ .data = &unprivileged_userns_clone, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 54211dbd516c..16ca0c151629 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -22,6 +22,13 @@ + #include + #include + ++/* sysctl */ ++#ifdef CONFIG_USER_NS_UNPRIVILEGED ++int unprivileged_userns_clone = 1; ++#else ++int unprivileged_userns_clone; ++#endif ++ + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); + +diff --git a/lib/string.c b/lib/string.c +index 4fb566ea610f..4746a98b153e 100644 +--- a/lib/string.c ++++ b/lib/string.c +@@ -792,24 +792,61 @@ char *strnstr(const char *s1, const char *s2, size_t len) + EXPORT_SYMBOL(strnstr); + #endif + ++#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 ++ ++#define MEMCHR_MASK_GEN(mask) (mask *= 0x0101010101010101ULL) ++ ++#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) ++ ++#define MEMCHR_MASK_GEN(mask) \ ++ do { \ ++ mask *= 0x01010101; \ ++ mask |= mask << 32; \ ++ } while (0) ++ ++#else ++ ++#define MEMCHR_MASK_GEN(mask) \ ++ do { \ ++ mask |= mask << 8; \ ++ mask |= mask << 16; \ ++ mask |= mask << 32; \ ++ } while (0) ++ ++#endif ++ + #ifndef __HAVE_ARCH_MEMCHR + /** + * memchr - Find a character in an area of memory. +- * @s: The memory area ++ * @p: The memory area + * @c: The byte to search for +- * @n: The size of the area. ++ * @length: The size of the area. 
+ * + * returns the address of the first occurrence of @c, or %NULL + * if @c is not found + */ +-void *memchr(const void *s, int c, size_t n) ++void *memchr(const void *p, int c, unsigned long length) + { +- const unsigned char *p = s; +- while (n-- != 0) { +- if ((unsigned char)c == *p++) { +- return (void *)(p - 1); ++ u64 mask, val; ++ const void *end = p + length; ++ ++ c &= 0xff; ++ if (p <= end - 8) { ++ mask = c; ++ MEMCHR_MASK_GEN(mask); ++ ++ for (; p <= end - 8; p += 8) { ++ val = *(u64 *)p ^ mask; ++ if ((val + 0xfefefefefefefeffu) & ++ (~val & 0x8080808080808080u)) ++ break; + } + } ++ ++ for (; p < end; p++) ++ if (*(unsigned char *)p == c) ++ return (void *)p; ++ + return NULL; + } + EXPORT_SYMBOL(memchr); +@@ -845,16 +882,7 @@ void *memchr_inv(const void *start, int c, size_t bytes) + return check_bytes8(start, value, bytes); + + value64 = value; +-#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 +- value64 *= 0x0101010101010101ULL; +-#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) +- value64 *= 0x01010101; +- value64 |= value64 << 32; +-#else +- value64 |= value64 << 8; +- value64 |= value64 << 16; +- value64 |= value64 << 32; +-#endif ++ MEMCHR_MASK_GEN(value64); + + prefix = (unsigned long)start % 8; + if (prefix) { +diff --git a/mm/Kconfig b/mm/Kconfig +index ff7b209dec05..bf317c39ed2d 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -602,7 +602,7 @@ config COMPACTION + config COMPACT_UNEVICTABLE_DEFAULT + int + depends on COMPACTION +- default 0 if PREEMPT_RT ++ default 0 if PREEMPT_RT || CACHY + default 1 + + # +diff --git a/mm/compaction.c b/mm/compaction.c +index 8238e83385a7..d0b16a5b30f7 100644 +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -2717,7 +2717,11 @@ static void compact_nodes(void) + * aggressively the kernel should compact memory in the + * background. It takes values in the range [0, 100]. 
+ */ ++#ifdef CONFIG_CACHY ++unsigned int __read_mostly sysctl_compaction_proactiveness; ++#else + unsigned int __read_mostly sysctl_compaction_proactiveness = 20; ++#endif + + int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index ad608ef2a243..178cfd5490b1 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -71,7 +71,11 @@ static long ratelimit_pages = 32; + /* + * Start background writeback (via writeback threads) at this percentage + */ ++#ifdef CONFIG_CACHY ++static int dirty_background_ratio = 5; ++#else + static int dirty_background_ratio = 10; ++#endif + + /* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of +@@ -99,7 +103,11 @@ static unsigned long vm_dirty_bytes; + /* + * The interval between `kupdate'-style writebacks + */ ++#ifdef CONFIG_CACHY ++unsigned int dirty_writeback_interval = 10 * 100; /* centiseconds */ ++#else + unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ ++#endif + + EXPORT_SYMBOL_GPL(dirty_writeback_interval); + +diff --git a/mm/swap.c b/mm/swap.c +index 70e2063ef43a..79ab9b1c3910 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -1134,6 +1134,10 @@ EXPORT_SYMBOL(pagevec_lookup_range_tag); + */ + void __init swap_setup(void) + { ++#ifdef CONFIG_CACHY ++ /* Only swap-in pages requested, avoid readahead */ ++ page_cluster = 0; ++#else + unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); + + /* Use a smaller cluster for small-memory machines */ +@@ -1145,4 +1149,5 @@ void __init swap_setup(void) + * Right now other parts of the system means that we + * _really_ don't want to cluster much more + */ ++#endif + } +diff --git a/mm/vmpressure.c b/mm/vmpressure.c +index b52644771cc4..11a4b0e3b583 100644 +--- a/mm/vmpressure.c ++++ b/mm/vmpressure.c +@@ -43,7 +43,11 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; + * essence, they are percents: the higher the value, the more number + * unsuccessful reclaims there were. + */ ++#ifdef CONFIG_CACHY ++static const unsigned int vmpressure_level_med = 65; ++#else + static const unsigned int vmpressure_level_med = 60; ++#endif + static const unsigned int vmpressure_level_critical = 95; + + /* +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 5b7b8d4f5297..160acbbdf111 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -190,7 +190,11 @@ struct scan_control { + /* + * From 0 .. 200. Higher means more swappy. 
+ */ ++#ifdef CONFIG_CACHY ++int vm_swappiness = 20; ++#else + int vm_swappiness = 60; ++#endif + + static void set_task_reclaim_state(struct task_struct *task, + struct reclaim_state *rs) +@@ -4559,7 +4563,11 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned + } + + /* to protect the working set of the last N jiffies */ ++#ifdef CONFIG_CACHY ++static unsigned long lru_gen_min_ttl __read_mostly = HZ; ++#else + static unsigned long lru_gen_min_ttl __read_mostly; ++#endif + + static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + { +diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c +index 0d0cc4ef2b85..544104f9f4b3 100644 +--- a/net/ipv4/sysctl_net_ipv4.c ++++ b/net/ipv4/sysctl_net_ipv4.c +@@ -1467,6 +1467,13 @@ static struct ctl_table ipv4_net_table[] = { + .extra1 = SYSCTL_ZERO, + .extra2 = &tcp_plb_max_cong_thresh, + }, ++ { ++ .procname = "tcp_collapse_max_bytes", ++ .data = &init_net.ipv4.sysctl_tcp_collapse_max_bytes, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = proc_douintvec_minmax, ++ }, + { } + }; + +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 754e0212c951..b6d7faeb737a 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -5414,6 +5414,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb) + static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) + { + struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); + + NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); + +@@ -5425,6 +5426,39 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + ++ /* For context and additional information about this patch, see the ++ * blog post at ++ * ++ * sysctl: net.ipv4.tcp_collapse_max_bytes ++ * ++ * If tcp_collapse_max_bytes is non-zero, attempt to collapse the ++ * queue to free up memory if the current amount of memory allocated ++ * is less than tcp_collapse_max_bytes. Otherwise, the packet is ++ * dropped without attempting to collapse the queue. ++ * ++ * If tcp_collapse_max_bytes is zero, this feature is disabled ++ * and the default Linux behavior is used. The default Linux ++ * behavior is to always perform the attempt to collapse the ++ * queue to free up memory. ++ * ++ * When the receive queue is small, we want to collapse the ++ * queue. There are two reasons for this: (a) the latency of ++ * performing the collapse will be small on a small queue, and ++ * (b) we want to avoid sending a congestion signal (via a ++ * packet drop) to the sender when the receive queue is small. ++ * ++ * The result is that we avoid latency spikes caused by the ++ * time it takes to perform the collapse logic when the receive ++ * queue is large and full, while preserving existing behavior ++ * and performance for all other cases. 
++ */ ++ if (net->ipv4.sysctl_tcp_collapse_max_bytes && ++ (atomic_read(&sk->sk_rmem_alloc) > net->ipv4.sysctl_tcp_collapse_max_bytes)) { ++ /* We are dropping the packet */ ++ trace_tcp_collapse_max_bytes_exceeded(sk); ++ goto do_not_collapse; ++ } ++ + tcp_collapse_ofo_queue(sk); + if (!skb_queue_empty(&sk->sk_receive_queue)) + tcp_collapse(sk, &sk->sk_receive_queue, NULL, +@@ -5443,6 +5477,8 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + ++do_not_collapse: ++ + /* If we are really being abused, tell the caller to silently + * drop receive data on the floor. It will get retransmitted + * and hopefully then we'll have sufficient space. +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c +index 8320d0ecb13a..37a09cd767a1 100644 +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -3274,6 +3274,8 @@ static int __net_init tcp_sk_init(struct net *net) + else + net->ipv4.tcp_congestion_control = &tcp_reno; + ++ net->ipv4.sysctl_tcp_collapse_max_bytes = 0; ++ + return 0; + } + +diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib +index 4a4a5f67c1a6..993e4578c0f2 100644 +--- a/scripts/Makefile.lib ++++ b/scripts/Makefile.lib +@@ -557,14 +557,21 @@ quiet_cmd_xzmisc = XZMISC $@ + # decompression is used, like initramfs decompression, zstd22 should likely not + # be used because it would require zstd to allocate a 128 MB buffer. + ++ifdef CONFIG_ZSTD_COMPRESSION_LEVEL ++zstd_comp_val := $(CONFIG_ZSTD_COMPRESSION_LEVEL) ++ifeq ($(shell test $(zstd_comp_val) -gt 19; echo $$?),0) ++zstd_comp_val += --ultra ++endif ++endif ++ + quiet_cmd_zstd = ZSTD $@ +- cmd_zstd = cat $(real-prereqs) | $(ZSTD) -19 > $@ ++ cmd_zstd = cat $(real-prereqs) | $(ZSTD) -T0 -19 > $@ + + quiet_cmd_zstd22 = ZSTD22 $@ +- cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -22 --ultra > $@ ++ cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -T0 -22 --ultra > $@ + + quiet_cmd_zstd22_with_size = ZSTD22 $@ +- cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -22 --ultra; $(size_append); } > $@ ++ cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -T0 -$(zstd_comp_val); $(size_append); } > $@ + + # ASM offsets + # --------------------------------------------------------------------------- +diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst +index 4815a8e32227..6a3c36713045 100644 +--- a/scripts/Makefile.modinst ++++ b/scripts/Makefile.modinst +@@ -100,8 +100,13 @@ quiet_cmd_gzip = GZIP $@ + cmd_gzip = $(KGZIP) -n -f $< + quiet_cmd_xz = XZ $@ + cmd_xz = $(XZ) --lzma2=dict=2MiB -f $< ++ifdef CONFIG_MODULE_COMPRESS_ZSTD_ULTRA + quiet_cmd_zstd = ZSTD $@ +- cmd_zstd = $(ZSTD) -T0 --rm -f -q $< ++ cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL_ULTRA) --ultra --zstd=wlog=21 -T0 --rm -f -q $< ++else ++quiet_cmd_zstd = ZSTD $@ ++ cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL) --zstd=wlog=21 -T0 --rm -f -q $< ++endif + + $(dst)/%.ko.gz: $(dst)/%.ko FORCE + $(call cmd,gzip) +-- +2.39.2 + +From e80cb8174e11427fa2c9a98d05cf11552767b940 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sun, 15 Jan 2023 16:51:11 +0100 +Subject: [PATCH 05/15] clr + +Signed-off-by: Peter Jung +--- + arch/x86/kernel/tsc.c | 3 ++ + arch/x86/mm/fault.c | 4 +- + drivers/cpufreq/intel_pstate.c | 7 ++++ + drivers/idle/intel_idle.c | 50 ++++++++++++------------ + drivers/input/serio/i8042.c | 10 ++--- + drivers/net/dummy.c | 2 +- + drivers/pci/pci.c | 2 +- + drivers/powercap/intel_rapl_common.c | 2 +- + 
drivers/thermal/intel/intel_powerclamp.c | 10 +++++ + fs/xattr.c | 15 +++---- + include/linux/jbd2.h | 2 +- + include/linux/wait.h | 2 + + include/uapi/linux/if_bonding.h | 2 +- + init/do_mounts.c | 16 +++++++- + kernel/locking/rwsem.c | 4 +- + kernel/sched/wait.c | 24 ++++++++++++ + kernel/watchdog.c | 2 +- + lib/raid6/algos.c | 4 +- + mm/ksm.c | 11 ++++-- + net/ipv4/inet_connection_sock.c | 2 +- + net/ipv4/tcp.c | 4 +- + 21 files changed, 123 insertions(+), 55 deletions(-) + +diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c +index a78e73da4a74..bab8a98080cf 100644 +--- a/arch/x86/kernel/tsc.c ++++ b/arch/x86/kernel/tsc.c +@@ -1569,6 +1569,9 @@ unsigned long calibrate_delay_is_known(void) + if (!constant_tsc || !mask) + return 0; + ++ if (cpu != 0) ++ return cpu_data(0).loops_per_jiffy; ++ + sibling = cpumask_any_but(mask, cpu); + if (sibling < nr_cpu_ids) + return cpu_data(sibling).loops_per_jiffy; +diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c +index 7b0d4ab894c8..1a14f52added 100644 +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -799,9 +799,9 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, + if (!printk_ratelimit()) + return; + +- printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", ++ printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx cpu %i", + loglvl, tsk->comm, task_pid_nr(tsk), address, +- (void *)regs->ip, (void *)regs->sp, error_code); ++ (void *)regs->ip, (void *)regs->sp, error_code, raw_smp_processor_id()); + + print_vma_addr(KERN_CONT " in ", regs->ip); + +diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c +index fd73d6d2b808..0c0071ab3966 100644 +--- a/drivers/cpufreq/intel_pstate.c ++++ b/drivers/cpufreq/intel_pstate.c +@@ -366,6 +366,13 @@ static void intel_pstate_set_itmt_prio(int cpu) + * update them at any time after it has been called. + */ + sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu); ++ /* ++ * On some systems with overclocking enabled, CPPC.highest_perf is hardcoded to 0xff. ++ * In this case we can't use CPPC.highest_perf to enable ITMT. ++ * In this case we can look at MSR_HWP_CAPABILITIES bits [8:0] to decide. 
++ */ ++ if (cppc_perf.highest_perf == 0xff) ++ cppc_perf.highest_perf = HWP_HIGHEST_PERF(READ_ONCE(all_cpu_data[cpu]->hwp_cap_cached)); + + if (max_highest_perf <= min_highest_perf) { + if (cppc_perf.highest_perf > max_highest_perf) +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index cfeb24d40d37..8d1945afa973 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -578,7 +578,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -586,7 +586,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 33, +- .target_residency = 100, ++ .target_residency = 900, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -594,7 +594,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -602,7 +602,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 1500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -610,7 +610,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -618,7 +618,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 5000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -626,7 +626,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -646,7 +646,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -654,7 +654,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 40, +- .target_residency = 100, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -662,7 +662,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -670,7 +670,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 
0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -678,7 +678,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 4000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -686,7 +686,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 7000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -694,7 +694,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -715,7 +715,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -723,7 +723,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 70, +- .target_residency = 100, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -731,7 +731,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, + .exit_latency = 85, +- .target_residency = 200, ++ .target_residency = 600, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -739,7 +739,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x33", + .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, + .exit_latency = 124, +- .target_residency = 800, ++ .target_residency = 3000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -747,7 +747,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, + .exit_latency = 200, +- .target_residency = 800, ++ .target_residency = 3200, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -755,7 +755,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, + .exit_latency = 480, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -763,7 +763,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, + .exit_latency = 890, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -784,7 +784,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, +- 
.target_residency = 20, ++ .target_residency = 300, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -813,7 +813,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 4, +- .target_residency = 4, ++ .target_residency = 40, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -821,7 +821,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 170, +- .target_residency = 600, ++ .target_residency = 900, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -987,7 +987,7 @@ static struct cpuidle_state spr_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 2, +- .target_residency = 4, ++ .target_residency = 40, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c +index 6dac7c1853a5..fab04cd8a7a0 100644 +--- a/drivers/input/serio/i8042.c ++++ b/drivers/input/serio/i8042.c +@@ -621,7 +621,7 @@ static int i8042_enable_kbd_port(void) + if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { + i8042_ctr &= ~I8042_CTR_KBDINT; + i8042_ctr |= I8042_CTR_KBDDIS; +- pr_err("Failed to enable KBD port\n"); ++ pr_info("Failed to enable KBD port\n"); + return -EIO; + } + +@@ -640,7 +640,7 @@ static int i8042_enable_aux_port(void) + if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { + i8042_ctr &= ~I8042_CTR_AUXINT; + i8042_ctr |= I8042_CTR_AUXDIS; +- pr_err("Failed to enable AUX port\n"); ++ pr_info("Failed to enable AUX port\n"); + return -EIO; + } + +@@ -732,7 +732,7 @@ static int i8042_check_mux(void) + i8042_ctr &= ~I8042_CTR_AUXINT; + + if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { +- pr_err("Failed to disable AUX port, can't use MUX\n"); ++ pr_info("Failed to disable AUX port, can't use MUX\n"); + return -EIO; + } + +@@ -955,7 +955,7 @@ static int i8042_controller_selftest(void) + do { + + if (i8042_command(¶m, I8042_CMD_CTL_TEST)) { +- pr_err("i8042 controller selftest timeout\n"); ++ pr_info("i8042 controller selftest timeout\n"); + return -ENODEV; + } + +@@ -977,7 +977,7 @@ static int i8042_controller_selftest(void) + pr_info("giving up on controller selftest, continuing anyway...\n"); + return 0; + #else +- pr_err("i8042 controller selftest failed\n"); ++ pr_info("i8042 controller selftest failed\n"); + return -EIO; + #endif + } +diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c +index c4b1b0aa438a..06b00f7a8eab 100644 +--- a/drivers/net/dummy.c ++++ b/drivers/net/dummy.c +@@ -43,7 +43,7 @@ + + #define DRV_NAME "dummy" + +-static int numdummies = 1; ++static int numdummies = 0; + + /* fake multicast ability */ + static void set_multicast_list(struct net_device *dev) +diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c +index 5641786bd020..0ef504e909db 100644 +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -62,7 +62,7 @@ struct pci_pme_device { + struct pci_dev *dev; + }; + +-#define PME_TIMEOUT 1000 /* How long between PME checks */ ++#define PME_TIMEOUT 4000 /* How long between PME checks */ + + static void pci_dev_d3_sleep(struct pci_dev *dev) + { +diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c +index 26d00b1853b4..3e239d6548b5 100644 +--- a/drivers/powercap/intel_rapl_common.c ++++ 
b/drivers/powercap/intel_rapl_common.c +@@ -1518,7 +1518,7 @@ static int __init rapl_init(void) + + id = x86_match_cpu(rapl_ids); + if (!id) { +- pr_err("driver does not support CPU family %d model %d\n", ++ pr_info("driver does not support CPU family %d model %d\n", + boot_cpu_data.x86, boot_cpu_data.x86_model); + + return -ENODEV; +diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c +index b80e25ec1261..187b4ee6e9f5 100644 +--- a/drivers/thermal/intel/intel_powerclamp.c ++++ b/drivers/thermal/intel/intel_powerclamp.c +@@ -627,6 +627,11 @@ static const struct thermal_cooling_device_ops powerclamp_cooling_ops = { + .set_cur_state = powerclamp_set_cur_state, + }; + ++static const struct x86_cpu_id amd_cpu[] = { ++ { X86_VENDOR_AMD }, ++ {}, ++}; ++ + static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = { + X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL), + {} +@@ -636,6 +641,11 @@ MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids); + static int __init powerclamp_probe(void) + { + ++ if (x86_match_cpu(amd_cpu)){ ++ pr_info("Intel PowerClamp does not support AMD CPUs\n"); ++ return -ENODEV; ++ } ++ + if (!x86_match_cpu(intel_powerclamp_ids)) { + pr_err("CPU does not support MWAIT\n"); + return -ENODEV; +diff --git a/fs/xattr.c b/fs/xattr.c +index adab9a70b536..4ada829a3b1b 100644 +--- a/fs/xattr.c ++++ b/fs/xattr.c +@@ -139,16 +139,17 @@ xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, + } + + /* +- * In the user.* namespace, only regular files and directories can have +- * extended attributes. For sticky directories, only the owner and +- * privileged users can write attributes. ++ * In the user.* namespace, only regular files, symbolic links, and ++ * directories can have extended attributes. For symbolic links and ++ * sticky directories, only the owner and privileged users can write ++ * attributes. + */ + if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { +- if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) ++ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode)) + return (mask & MAY_WRITE) ? -EPERM : -ENODATA; +- if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && +- (mask & MAY_WRITE) && +- !inode_owner_or_capable(mnt_userns, inode)) ++ if (((S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX)) ++ || S_ISLNK(inode->i_mode)) && (mask & MAY_WRITE) ++ && !inode_owner_or_capable(mnt_userns, inode)) + return -EPERM; + } + +diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h +index 2170e0cc279d..e8fa79f5bb34 100644 +--- a/include/linux/jbd2.h ++++ b/include/linux/jbd2.h +@@ -45,7 +45,7 @@ + /* + * The default maximum commit age, in seconds. 
+ */ +-#define JBD2_DEFAULT_MAX_COMMIT_AGE 5 ++#define JBD2_DEFAULT_MAX_COMMIT_AGE 30 + + #ifdef CONFIG_JBD2_DEBUG + /* +diff --git a/include/linux/wait.h b/include/linux/wait.h +index a0307b516b09..edc21128f387 100644 +--- a/include/linux/wait.h ++++ b/include/linux/wait.h +@@ -165,6 +165,7 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) + + extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); ++extern void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + +@@ -1192,6 +1193,7 @@ do { \ + */ + void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); + bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); ++void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); + long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); + void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); + long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); +diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h +index d174914a837d..bf8e2af101a3 100644 +--- a/include/uapi/linux/if_bonding.h ++++ b/include/uapi/linux/if_bonding.h +@@ -82,7 +82,7 @@ + #define BOND_STATE_ACTIVE 0 /* link is active */ + #define BOND_STATE_BACKUP 1 /* link is backup */ + +-#define BOND_DEFAULT_MAX_BONDS 1 /* Default maximum number of devices to support */ ++#define BOND_DEFAULT_MAX_BONDS 0 /* Default maximum number of devices to support */ + + #define BOND_DEFAULT_TX_QUEUES 16 /* Default number of tx queues per device */ + +diff --git a/init/do_mounts.c b/init/do_mounts.c +index 811e94daf0a8..06fef7f97c02 100644 +--- a/init/do_mounts.c ++++ b/init/do_mounts.c +@@ -283,8 +283,18 @@ dev_t name_to_dev_t(const char *name) + if (strcmp(name, "/dev/ram") == 0) + return Root_RAM0; + #ifdef CONFIG_BLOCK +- if (strncmp(name, "PARTUUID=", 9) == 0) +- return devt_from_partuuid(name + 9); ++ if (strncmp(name, "PARTUUID=", 9) == 0) { ++ dev_t res; ++ int needtowait = 40<<1; ++ res = devt_from_partuuid(name + 9); ++ while (!res && needtowait) { ++ /* waiting 0.5 sec */ ++ msleep(500); ++ res = devt_from_partuuid(name + 9); ++ needtowait--; ++ } ++ return res; ++ } + if (strncmp(name, "PARTLABEL=", 10) == 0) + return devt_from_partlabel(name + 10); + if (strncmp(name, "/dev/", 5) == 0) +@@ -612,7 +622,9 @@ void __init prepare_namespace(void) + * For example, it is not atypical to wait 5 seconds here + * for the touchpad of a laptop to initialize. 
+ */ ++ async_synchronize_full(); + wait_for_device_probe(); ++ async_synchronize_full(); + + md_run_setup(); + +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index 44873594de03..fe62d59f2bdc 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -755,6 +755,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; ++ int i = 0; + + lockdep_assert_preemption_disabled(); + +@@ -791,7 +792,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) + break; + } + +- cpu_relax(); ++ if (i++ > 1000) ++ cpu_relax(); + } + + return state; +diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c +index 133b74730738..1647fb8662eb 100644 +--- a/kernel/sched/wait.c ++++ b/kernel/sched/wait.c +@@ -47,6 +47,17 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_ + } + EXPORT_SYMBOL_GPL(add_wait_queue_priority); + ++void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) ++{ ++ unsigned long flags; ++ ++ wq_entry->flags |= WQ_FLAG_EXCLUSIVE; ++ spin_lock_irqsave(&wq_head->lock, flags); ++ __add_wait_queue(wq_head, wq_entry); ++ spin_unlock_irqrestore(&wq_head->lock, flags); ++} ++EXPORT_SYMBOL(add_wait_queue_exclusive_lifo); ++ + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) + { + unsigned long flags; +@@ -293,6 +304,19 @@ prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_ent + } + EXPORT_SYMBOL(prepare_to_wait_exclusive); + ++void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) ++{ ++ unsigned long flags; ++ ++ wq_entry->flags |= WQ_FLAG_EXCLUSIVE; ++ spin_lock_irqsave(&wq_head->lock, flags); ++ if (list_empty(&wq_entry->entry)) ++ __add_wait_queue(wq_head, wq_entry); ++ set_current_state(state); ++ spin_unlock_irqrestore(&wq_head->lock, flags); ++} ++EXPORT_SYMBOL(prepare_to_wait_exclusive_lifo); ++ + void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) + { + wq_entry->flags = flags; +diff --git a/kernel/watchdog.c b/kernel/watchdog.c +index 8e61f21e7e33..be1439d38f26 100644 +--- a/kernel/watchdog.c ++++ b/kernel/watchdog.c +@@ -41,7 +41,7 @@ unsigned long __read_mostly watchdog_enabled; + int __read_mostly watchdog_user_enabled = 1; + int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; + int __read_mostly soft_watchdog_user_enabled = 1; +-int __read_mostly watchdog_thresh = 10; ++int __read_mostly watchdog_thresh = 40; + static int __read_mostly nmi_watchdog_available; + + struct cpumask watchdog_cpumask __read_mostly; +diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c +index a22a05c9af8a..a70bcbbd1673 100644 +--- a/lib/raid6/algos.c ++++ b/lib/raid6/algos.c +@@ -126,8 +126,10 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void) + + for (best = NULL, algo = raid6_recov_algos; *algo; algo++) + if (!best || (*algo)->priority > best->priority) +- if (!(*algo)->valid || (*algo)->valid()) ++ if (!(*algo)->valid || (*algo)->valid()) { + best = *algo; ++ break; ++ } + + if (best) { + raid6_2data_recov = best->data2; +diff --git a/mm/ksm.c b/mm/ksm.c +index addf490da146..a92c9594a2d3 100644 +--- a/mm/ksm.c ++++ b/mm/ksm.c +@@ -2454,9 +2454,14 @@ static int ksm_scan_thread(void *nothing) + + if (ksmd_should_run()) { + sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); +- wait_event_interruptible_timeout(ksm_iter_wait, +- sleep_ms != 
READ_ONCE(ksm_thread_sleep_millisecs), +- msecs_to_jiffies(sleep_ms)); ++ if (sleep_ms >= 1000) ++ wait_event_interruptible_timeout(ksm_iter_wait, ++ sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), ++ msecs_to_jiffies(round_jiffies_relative(sleep_ms))); ++ else ++ wait_event_interruptible_timeout(ksm_iter_wait, ++ sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), ++ msecs_to_jiffies(sleep_ms)); + } else { + wait_event_freezable(ksm_thread_wait, + ksmd_should_run() || kthread_should_stop()); +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c +index f2c43f67187d..9885bfb429a2 100644 +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -606,7 +606,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) + * having to remove and re-insert us on the wait queue. + */ + for (;;) { +- prepare_to_wait_exclusive(sk_sleep(sk), &wait, ++ prepare_to_wait_exclusive_lifo(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + release_sock(sk); + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index e9e8040d6491..f9b56123b3b8 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -4815,8 +4815,8 @@ void __init tcp_init(void) + tcp_init_mem(); + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); +- max_wshare = min(4UL*1024*1024, limit); +- max_rshare = min(6UL*1024*1024, limit); ++ max_wshare = min(16UL*1024*1024, limit); ++ max_rshare = min(16UL*1024*1024, limit); + + init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; + init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; +-- +2.39.2 + +From 952f0ec42e0dddee76cb525f4cca1fe60e910b95 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Tue, 21 Feb 2023 10:27:37 +0100 +Subject: [PATCH 06/15] fixes + +Signed-off-by: Peter Jung +--- + Documentation/ABI/stable/sysfs-block | 10 + + .../testing/sysfs-class-led-trigger-blkdev | 78 ++ + Documentation/admin-guide/mm/ksm.rst | 7 + + Documentation/leds/index.rst | 1 + + Documentation/leds/ledtrig-blkdev.rst | 158 +++ + arch/x86/boot/compressed/Makefile | 2 +- + arch/x86/kernel/acpi/boot.c | 19 +- + arch/x86/mm/tlb.c | 2 +- + drivers/acpi/acpica/Makefile | 2 +- + drivers/bluetooth/btusb.c | 9 + + drivers/char/tpm/tpm-chip.c | 62 +- + drivers/char/tpm/tpm.h | 73 + + drivers/hwmon/nct6775-core.c | 2 +- + drivers/leds/trigger/Kconfig | 9 + + drivers/leds/trigger/Makefile | 1 + + drivers/leds/trigger/ledtrig-blkdev.c | 1220 +++++++++++++++++ + drivers/md/dm.c | 2 + + fs/eventpoll.c | 2 +- + fs/proc/base.c | 1 + + include/linux/mm_types.h | 7 +- + include/linux/pageblock-flags.h | 2 +- + kernel/kthread.c | 5 + + kernel/padata.c | 4 +- + lib/string.c | 10 +- + lib/zstd/decompress/huf_decompress.c | 2 +- + mm/compaction.c | 75 +- + mm/internal.h | 6 +- + mm/ksm.c | 185 ++- + mm/page_alloc.c | 22 +- + mm/z3fold.c | 2 - + mm/zsmalloc.c | 3 - + scripts/Kconfig.include | 2 +- + scripts/Makefile.compiler | 8 +- + scripts/Makefile.vmlinux_o | 2 +- + scripts/as-version.sh | 2 +- + security/Kconfig.hardening | 3 + + sound/pci/hda/cs35l41_hda.c | 2 +- + .../selftests/vm/ksm_functional_tests.c | 96 +- + 38 files changed, 1992 insertions(+), 106 deletions(-) + create mode 100644 Documentation/ABI/testing/sysfs-class-led-trigger-blkdev + create mode 100644 Documentation/leds/ledtrig-blkdev.rst + create mode 100644 drivers/leds/trigger/ledtrig-blkdev.c + +diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block +index 
cd14ecb3c9a5..853cb2601242 100644 +--- a/Documentation/ABI/stable/sysfs-block ++++ b/Documentation/ABI/stable/sysfs-block +@@ -101,6 +101,16 @@ Description: + devices that support receiving integrity metadata. + + ++What: /sys/block//linked_leds ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Directory that contains symbolic links to all LEDs that ++ are associated with (linked to) this block device by the ++ blkdev LED trigger. Only present when at least one LED ++ is linked. (See Documentation/leds/ledtrig-blkdev.rst.) ++ ++ + What: /sys/block///alignment_offset + Date: April 2009 + Contact: Martin K. Petersen +diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev +new file mode 100644 +index 000000000000..45275eb0bad3 +--- /dev/null ++++ b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev +@@ -0,0 +1,78 @@ ++What: /sys/class/leds//blink_time ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Time (in milliseconds) that the LED will be on during a single ++ "blink". ++ ++What: /sys/class/leds//check_interval ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Interval (in milliseconds) between checks of the block devices ++ linked to this LED. The LED will be blinked if the correct type ++ of activity (see blink_on_{read,write,discard,flush} attributes) ++ has occurred on any of the linked devices since the previous ++ check. ++ ++What: /sys/class/leds//blink_on_read ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Boolean that determines whether the LED will blink in response ++ to read activity on any of its linked block devices. ++ ++What: /sys/class/leds//blink_on_write ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Boolean that determines whether the LED will blink in response ++ to write activity on any of its linked block devices. ++ ++What: /sys/class/leds//blink_on_discard ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Boolean that determines whether the LED will blink in response ++ to discard activity on any of its linked block devices. ++ ++What: /sys/class/leds//blink_on_flush ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Boolean that determines whether the LED will blink in response ++ to cache flush activity on any of its linked block devices. ++ ++What: /sys/class/leds//link_dev_by_path ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Associate a block device with this LED by writing the path to ++ the device special file (e.g. /dev/sda) to this attribute. ++ Symbolic links are followed. ++ ++What: /sys/class/leds//unlink_dev_by_path ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Remove the association between this LED and a block device by ++ writing the path to the device special file (e.g. /dev/sda) to ++ this attribute. Symbolic links are followed. ++ ++What: /sys/class/leds//unlink_dev_by_name ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Remove the association between this LED and a block device by ++ writing the kernel name of the device (e.g. sda) to this ++ attribute. ++ ++What: /sys/class/leds//linked_devices ++Date: October 2022 ++Contact: Ian Pilcher ++Description: ++ Directory containing links to all block devices that are ++ associated with this LED. (Note that the names of the ++ symbolic links in this directory are *kernel* names, which ++ may not match the device special file paths written to ++ link_device and unlink_device.) 
+diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst +index fb6ba2002a4b..f160f9487a90 100644 +--- a/Documentation/admin-guide/mm/ksm.rst ++++ b/Documentation/admin-guide/mm/ksm.rst +@@ -173,6 +173,13 @@ stable_node_chains + the number of KSM pages that hit the ``max_page_sharing`` limit + stable_node_dups + number of duplicated KSM pages ++zero_pages_sharing ++ how many empty pages are sharing kernel zero page(s) instead of ++ with each other as it would happen normally. Only effective when ++ enabling ``use_zero_pages`` knob. ++ ++When enabling ``use_zero_pages``, the sum of ``pages_sharing`` + ++``zero_pages_sharing`` represents how much really saved by KSM. + + A high ratio of ``pages_sharing`` to ``pages_shared`` indicates good + sharing, but a high ratio of ``pages_unshared`` to ``pages_sharing`` +diff --git a/Documentation/leds/index.rst b/Documentation/leds/index.rst +index e5d63b940045..e3c24e468cbc 100644 +--- a/Documentation/leds/index.rst ++++ b/Documentation/leds/index.rst +@@ -10,6 +10,7 @@ LEDs + leds-class + leds-class-flash + leds-class-multicolor ++ ledtrig-blkdev + ledtrig-oneshot + ledtrig-transient + ledtrig-usbport +diff --git a/Documentation/leds/ledtrig-blkdev.rst b/Documentation/leds/ledtrig-blkdev.rst +new file mode 100644 +index 000000000000..9ff5b99de451 +--- /dev/null ++++ b/Documentation/leds/ledtrig-blkdev.rst +@@ -0,0 +1,158 @@ ++.. SPDX-License-Identifier: GPL-2.0 ++ ++================================= ++Block Device (blkdev) LED Trigger ++================================= ++ ++Available when ``CONFIG_LEDS_TRIGGER_BLKDEV=y`` or ++``CONFIG_LEDS_TRIGGER_BLKDEV=m``. ++ ++See also: ++ ++* ``Documentation/ABI/testing/sysfs-class-led-trigger-blkdev`` ++* ``Documentation/ABI/stable/sysfs-block`` (``/sys/block//linked_leds``) ++ ++Overview ++======== ++ ++.. note:: ++ The examples below use ```` to refer to the name of a ++ system-specific LED. If no suitable LED is available on a test ++ system (in a virtual machine, for example), it is possible to ++ use a userspace LED. (See ``Documentation/leds/uleds.rst``.) ++ ++Verify that the ``blkdev`` LED trigger is available:: ++ ++ # grep blkdev /sys/class/leds//trigger ++ ... rfkill-none blkdev ++ ++(If the previous command produces no output, you may need to load the trigger ++module - ``modprobe ledtrig_blkdev``. If the module is not available, check ++the value of ``CONFIG_LEDS_TRIGGER_BLKDEV`` in your kernel configuration.) ++ ++Associate the LED with the ``blkdev`` LED trigger:: ++ ++ # echo blkdev > /sys/class/leds//trigger ++ ++ # cat /sys/class/leds//trigger ++ ... rfkill-none [blkdev] ++ ++Note that several new device attributes are available in the ++``/sys/class/leds/`` directory. ++ ++* ``link_dev_by_path``, ``unlink_dev_by_path``, and ``unlink_dev_by_name`` are ++ used to manage the set of block devices associated with this LED. The LED ++ will blink when activity occurs on any of its linked devices. ++ ++* ``blink_on_read``, ``blink_on_write``, ``blink_on_discard``, and ++ ``blink_on_flush`` are boolean values that determine whether the LED will ++ blink when a particular type of activity is detected on one of its linked ++ block devices. ++ ++* ``blink_time`` is the duration (in milliseconds) of each blink of this LED. ++ (The minimum value is 10 milliseconds.) ++ ++* ``check_interval`` is the frequency (in milliseconds) with which block devices ++ linked to this LED will be checked for activity and the LED blinked (if the ++ correct type of activity has occurred). 
++ ++* The ``linked_devices`` directory will contain a symbolic link to every device ++ that is associated with this LED. ++ ++Link a block device to the LED:: ++ ++ # echo /dev/sda > /sys/class/leds//link_dev_by_path ++ ++ # ls /sys/class/leds//linked_devices ++ sda ++ ++(The value written to ``link_dev_by_path`` must be the path of the device ++special file, such as ``/dev/sda``, that represents the block device - or the ++path of a symbolic link to such a device special file.) ++ ++Activity on the device will now cause the LED to blink. The duration of each ++blink (in milliseconds) can be adjusted by setting ++``/sys/class/leds//blink_time``. (But see **check_interval and ++blink_time** below.) ++ ++Associate a second device with the LED:: ++ ++ # echo /dev/sdb > /sys/class/leds//link_dev_by_path ++ ++ # ls /sys/class/leds//linked_devices ++ sda sdb ++ ++When a block device is linked to one or more LEDs, the LEDs are linked from ++the device's ``linked_leds`` directory:: ++ ++ # ls /sys/class/block/sd{a,b}/linked_leds ++ /sys/class/block/sda/linked_leds: ++ ++ ++ /sys/class/block/sdb/linked_leds: ++ ++ ++(The ``linked_leds`` directory only exists when the block device is linked to ++at least one LED.) ++ ++``check_interval`` and ``blink_time`` ++===================================== ++ ++* By default, linked block devices are checked for activity every 100 ++ milliseconds. This frequency can be changed for an LED via the ++ ``/sys/class/leds//check_interval`` attribute. (The minimum value is 25 ++ milliseconds.) ++ ++* All block devices associated with an LED are checked for activity every ++ ``check_interval`` milliseconds, and a blink is triggered if the correct type ++ of activity (as determined by the LED's ``blink_on_*`` attributes) is ++ detected. The duration of an LED's blink is determined by its ``blink_time`` ++ attribute. Thus (when the correct type of activity is detected), the LED will ++ be on for ``blink_time`` milliseconds and off for ++ ``check_interval - blink_time`` milliseconds. ++ ++* The LED subsystem ignores new blink requests for an LED that is already in ++ in the process of blinking, so setting a ``blink_time`` greater than or equal ++ to ``check_interval`` will cause some blinks to be missed. ++ ++* Because of processing times, scheduling latencies, etc., avoiding missed ++ blinks actually requires a difference of at least a few milliseconds between ++ the ``blink_time`` and ``check_interval``. The required difference is likely ++ to vary from system to system. As a reference, a Thecus N5550 NAS requires a ++ difference of 7 milliseconds (e.g. ``check_interval == 100``, ++ ``blink_time == 93``). ++ ++* The default values (``check_interval == 100``, ``blink_time == 75``) cause the ++ LED associated with a continuously active device to blink rapidly. For a more ++ "always on" effect, increase the ``blink_time`` (but not too much; see the ++ previous bullet). 
++ ++Other Notes ++=========== ++ ++* Many (possibly all) types of block devices work with this trigger, including: ++ ++ * SCSI (including SATA and USB) hard disk drives and SSDs ++ * SCSI (including SATA and USB) optical drives ++ * NVMe SSDs ++ * SD cards ++ * loopback block devices (``/dev/loop*``) ++ * device mapper devices, such as LVM logical volumes ++ * MD RAID devices ++ * zRAM compressed RAM-disks ++ * partitions on block devices that support them ++ ++* The names of the symbolic links in ``/sys/class/leds//linked_devices`` ++ are **kernel** names, which may not match the paths used for ++ ``link_dev_by_path`` and ``unlink_dev_by_path``. This is most likely when a ++ symbolic link is used to refer to the device (as is common with logical ++ volumes), but it can be true for any device, because nothing prevents the ++ creation of device special files with arbitrary names (e.g. ++ ``sudo mknod /foo b 8 0``). ++ ++ Kernel names can be used to unlink block devices from LEDs by writing them to ++ the LED's ``unlink_dev_by_name`` attribute. ++ ++* The ``blkdev`` LED trigger supports many-to-many device/LED associations. ++ A device can be associated with multiple LEDs, and an LED can be associated ++ with multiple devices. +diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile +index d995595394bb..19d1fb601796 100644 +--- a/arch/x86/boot/compressed/Makefile ++++ b/arch/x86/boot/compressed/Makefile +@@ -50,7 +50,7 @@ KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) + KBUILD_CFLAGS += -fno-asynchronous-unwind-tables + KBUILD_CFLAGS += -D__DISABLE_EXPORTS + # Disable relocation relaxation in case the link is not PIE. +-KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) ++KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no) + KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h + + # sev.c indirectly inludes inat-table.h which is generated during +diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c +index 907cc98b1938..518bda50068c 100644 +--- a/arch/x86/kernel/acpi/boot.c ++++ b/arch/x86/kernel/acpi/boot.c +@@ -188,6 +188,17 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) + return cpu; + } + ++static bool __init acpi_is_processor_usable(u32 lapic_flags) ++{ ++ if (lapic_flags & ACPI_MADT_ENABLED) ++ return true; ++ ++ if (acpi_support_online_capable && (lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) ++ return true; ++ ++ return false; ++} ++ + static int __init + acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) + { +@@ -212,6 +223,10 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) + if (apic_id == 0xffffffff) + return 0; + ++ /* don't register processors that cannot be onlined */ ++ if (!acpi_is_processor_usable(processor->lapic_flags)) ++ return 0; ++ + /* + * We need to register disabled CPU as well to permit + * counting disabled CPUs. 
This allows us to size +@@ -250,9 +265,7 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) + return 0; + + /* don't register processors that can not be onlined */ +- if (acpi_support_online_capable && +- !(processor->lapic_flags & ACPI_MADT_ENABLED) && +- !(processor->lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) ++ if (!acpi_is_processor_usable(processor->lapic_flags)) + return 0; + + /* +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index c1e31e9a85d7..92d73ccede70 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -1205,7 +1205,7 @@ void __flush_tlb_all(void) + */ + VM_WARN_ON_ONCE(preemptible()); + +- if (boot_cpu_has(X86_FEATURE_PGE)) { ++ if (cpu_feature_enabled(X86_FEATURE_PGE)) { + __flush_tlb_global(); + } else { + /* +diff --git a/drivers/acpi/acpica/Makefile b/drivers/acpi/acpica/Makefile +index 9e0d95d76fff..30f3fc13c29d 100644 +--- a/drivers/acpi/acpica/Makefile ++++ b/drivers/acpi/acpica/Makefile +@@ -3,7 +3,7 @@ + # Makefile for ACPICA Core interpreter + # + +-ccflags-y := -Os -D_LINUX -DBUILDING_ACPICA ++ccflags-y := -D_LINUX -DBUILDING_ACPICA + ccflags-$(CONFIG_ACPI_DEBUG) += -DACPI_DEBUG_OUTPUT + + # use acpi.o to put all files here into acpi.o modparam namespace +diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c +index 2ad4efdd9e40..afd2f08ffe30 100644 +--- a/drivers/bluetooth/btusb.c ++++ b/drivers/bluetooth/btusb.c +@@ -64,6 +64,7 @@ static struct usb_driver btusb_driver; + #define BTUSB_INTEL_BROKEN_SHUTDOWN_LED BIT(24) + #define BTUSB_INTEL_BROKEN_INITIAL_NCMD BIT(25) + #define BTUSB_INTEL_NO_WBS_SUPPORT BIT(26) ++#define BTUSB_ACTIONS_SEMI BIT(27) + + static const struct usb_device_id btusb_table[] = { + /* Generic Bluetooth USB device */ +@@ -677,6 +678,9 @@ static const struct usb_device_id blacklist_table[] = { + { USB_DEVICE(0x0cb5, 0xc547), .driver_info = BTUSB_REALTEK | + BTUSB_WIDEBAND_SPEECH }, + ++ /* Actions Semiconductor ATS2851 based devices */ ++ { USB_DEVICE(0x10d7, 0xb012), .driver_info = BTUSB_ACTIONS_SEMI }, ++ + /* Silicon Wave based devices */ + { USB_DEVICE(0x0c10, 0x0000), .driver_info = BTUSB_SWAVE }, + +@@ -4098,6 +4102,11 @@ static int btusb_probe(struct usb_interface *intf, + set_bit(BTUSB_USE_ALT3_FOR_WBS, &data->flags); + } + ++ if (id->driver_info & BTUSB_ACTIONS_SEMI) { ++ /* Support is advertised, but not implemented */ ++ set_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks); ++ } ++ + if (!reset) + set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks); + +diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c +index 741d8f3e8fb3..348dd5705fbb 100644 +--- a/drivers/char/tpm/tpm-chip.c ++++ b/drivers/char/tpm/tpm-chip.c +@@ -512,6 +512,65 @@ static int tpm_add_legacy_sysfs(struct tpm_chip *chip) + return 0; + } + ++static bool tpm_is_rng_defective(struct tpm_chip *chip) ++{ ++ int ret; ++ u64 version; ++ u32 val1, val2; ++ ++ /* No known-broken TPM1 chips. 
*/ ++ if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) ++ return false; ++ ++ ret = tpm_request_locality(chip); ++ if (ret) ++ return false; ++ ++ /* Some AMD fTPM versions may cause stutter */ ++ ret = tpm2_get_tpm_pt(chip, TPM2_PT_MANUFACTURER, &val1, NULL); ++ if (ret) ++ goto release; ++ if (val1 != 0x414D4400U /* AMD */) { ++ ret = -ENODEV; ++ goto release; ++ } ++ ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_1, &val1, NULL); ++ if (ret) ++ goto release; ++ ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_2, &val2, NULL); ++ if (ret) ++ goto release; ++ ++release: ++ tpm_relinquish_locality(chip); ++ ++ if (ret) ++ return false; ++ ++ version = ((u64)val1 << 32) | val2; ++ /* ++ * Fixes for stutter as described in ++ * https://www.amd.com/en/support/kb/faq/pa-410 ++ * are available in two series of fTPM firmware: ++ * 6.x.y.z series: 6.0.18.6 + ++ * 3.x.y.z series: 3.57.x.5 + ++ */ ++ if ((version >> 48) == 6) { ++ if (version >= 0x0006000000180006ULL) ++ return false; ++ } else if ((version >> 48) == 3) { ++ if (version >= 0x0003005700000005ULL) ++ return false; ++ } else { ++ return false; ++ } ++ dev_warn(&chip->dev, ++ "AMD fTPM version 0x%llx causes system stutter; hwrng disabled\n", ++ version); ++ ++ return true; ++} ++ + static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) + { + struct tpm_chip *chip = container_of(rng, struct tpm_chip, hwrng); +@@ -521,7 +580,8 @@ static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) + + static int tpm_add_hwrng(struct tpm_chip *chip) + { +- if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM) || tpm_is_firmware_upgrade(chip)) ++ if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM) || tpm_is_firmware_upgrade(chip) || ++ tpm_is_rng_defective(chip)) + return 0; + + snprintf(chip->hwrng_name, sizeof(chip->hwrng_name), +diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h +index 24ee4e1cc452..830014a26609 100644 +--- a/drivers/char/tpm/tpm.h ++++ b/drivers/char/tpm/tpm.h +@@ -150,6 +150,79 @@ enum tpm_sub_capabilities { + TPM_CAP_PROP_TIS_DURATION = 0x120, + }; + ++enum tpm2_pt_props { ++ TPM2_PT_NONE = 0x00000000, ++ TPM2_PT_GROUP = 0x00000100, ++ TPM2_PT_FIXED = TPM2_PT_GROUP * 1, ++ TPM2_PT_FAMILY_INDICATOR = TPM2_PT_FIXED + 0, ++ TPM2_PT_LEVEL = TPM2_PT_FIXED + 1, ++ TPM2_PT_REVISION = TPM2_PT_FIXED + 2, ++ TPM2_PT_DAY_OF_YEAR = TPM2_PT_FIXED + 3, ++ TPM2_PT_YEAR = TPM2_PT_FIXED + 4, ++ TPM2_PT_MANUFACTURER = TPM2_PT_FIXED + 5, ++ TPM2_PT_VENDOR_STRING_1 = TPM2_PT_FIXED + 6, ++ TPM2_PT_VENDOR_STRING_2 = TPM2_PT_FIXED + 7, ++ TPM2_PT_VENDOR_STRING_3 = TPM2_PT_FIXED + 8, ++ TPM2_PT_VENDOR_STRING_4 = TPM2_PT_FIXED + 9, ++ TPM2_PT_VENDOR_TPM_TYPE = TPM2_PT_FIXED + 10, ++ TPM2_PT_FIRMWARE_VERSION_1 = TPM2_PT_FIXED + 11, ++ TPM2_PT_FIRMWARE_VERSION_2 = TPM2_PT_FIXED + 12, ++ TPM2_PT_INPUT_BUFFER = TPM2_PT_FIXED + 13, ++ TPM2_PT_HR_TRANSIENT_MIN = TPM2_PT_FIXED + 14, ++ TPM2_PT_HR_PERSISTENT_MIN = TPM2_PT_FIXED + 15, ++ TPM2_PT_HR_LOADED_MIN = TPM2_PT_FIXED + 16, ++ TPM2_PT_ACTIVE_SESSIONS_MAX = TPM2_PT_FIXED + 17, ++ TPM2_PT_PCR_COUNT = TPM2_PT_FIXED + 18, ++ TPM2_PT_PCR_SELECT_MIN = TPM2_PT_FIXED + 19, ++ TPM2_PT_CONTEXT_GAP_MAX = TPM2_PT_FIXED + 20, ++ TPM2_PT_NV_COUNTERS_MAX = TPM2_PT_FIXED + 22, ++ TPM2_PT_NV_INDEX_MAX = TPM2_PT_FIXED + 23, ++ TPM2_PT_MEMORY = TPM2_PT_FIXED + 24, ++ TPM2_PT_CLOCK_UPDATE = TPM2_PT_FIXED + 25, ++ TPM2_PT_CONTEXT_HASH = TPM2_PT_FIXED + 26, ++ TPM2_PT_CONTEXT_SYM = TPM2_PT_FIXED + 27, ++ TPM2_PT_CONTEXT_SYM_SIZE = TPM2_PT_FIXED + 28, ++ TPM2_PT_ORDERLY_COUNT = TPM2_PT_FIXED + 29, 
++ TPM2_PT_MAX_COMMAND_SIZE = TPM2_PT_FIXED + 30, ++ TPM2_PT_MAX_RESPONSE_SIZE = TPM2_PT_FIXED + 31, ++ TPM2_PT_MAX_DIGEST = TPM2_PT_FIXED + 32, ++ TPM2_PT_MAX_OBJECT_CONTEXT = TPM2_PT_FIXED + 33, ++ TPM2_PT_MAX_SESSION_CONTEXT = TPM2_PT_FIXED + 34, ++ TPM2_PT_PS_FAMILY_INDICATOR = TPM2_PT_FIXED + 35, ++ TPM2_PT_PS_LEVEL = TPM2_PT_FIXED + 36, ++ TPM2_PT_PS_REVISION = TPM2_PT_FIXED + 37, ++ TPM2_PT_PS_DAY_OF_YEAR = TPM2_PT_FIXED + 38, ++ TPM2_PT_PS_YEAR = TPM2_PT_FIXED + 39, ++ TPM2_PT_SPLIT_MAX = TPM2_PT_FIXED + 40, ++ TPM2_PT_TOTAL_COMMANDS = TPM2_PT_FIXED + 41, ++ TPM2_PT_LIBRARY_COMMANDS = TPM2_PT_FIXED + 42, ++ TPM2_PT_VENDOR_COMMANDS = TPM2_PT_FIXED + 43, ++ TPM2_PT_NV_BUFFER_MAX = TPM2_PT_FIXED + 44, ++ TPM2_PT_MODES = TPM2_PT_FIXED + 45, ++ TPM2_PT_MAX_CAP_BUFFER = TPM2_PT_FIXED + 46, ++ TPM2_PT_VAR = TPM2_PT_GROUP * 2, ++ TPM2_PT_PERMANENT = TPM2_PT_VAR + 0, ++ TPM2_PT_STARTUP_CLEAR = TPM2_PT_VAR + 1, ++ TPM2_PT_HR_NV_INDEX = TPM2_PT_VAR + 2, ++ TPM2_PT_HR_LOADED = TPM2_PT_VAR + 3, ++ TPM2_PT_HR_LOADED_AVAIL = TPM2_PT_VAR + 4, ++ TPM2_PT_HR_ACTIVE = TPM2_PT_VAR + 5, ++ TPM2_PT_HR_ACTIVE_AVAIL = TPM2_PT_VAR + 6, ++ TPM2_PT_HR_TRANSIENT_AVAIL = TPM2_PT_VAR + 7, ++ TPM2_PT_HR_PERSISTENT = TPM2_PT_VAR + 8, ++ TPM2_PT_HR_PERSISTENT_AVAIL = TPM2_PT_VAR + 9, ++ TPM2_PT_NV_COUNTERS = TPM2_PT_VAR + 10, ++ TPM2_PT_NV_COUNTERS_AVAIL = TPM2_PT_VAR + 11, ++ TPM2_PT_ALGORITHM_SET = TPM2_PT_VAR + 12, ++ TPM2_PT_LOADED_CURVES = TPM2_PT_VAR + 13, ++ TPM2_PT_LOCKOUT_COUNTER = TPM2_PT_VAR + 14, ++ TPM2_PT_MAX_AUTH_FAIL = TPM2_PT_VAR + 15, ++ TPM2_PT_LOCKOUT_INTERVAL = TPM2_PT_VAR + 16, ++ TPM2_PT_LOCKOUT_RECOVERY = TPM2_PT_VAR + 17, ++ TPM2_PT_NV_WRITE_RECOVERY = TPM2_PT_VAR + 18, ++ TPM2_PT_AUDIT_COUNTER_0 = TPM2_PT_VAR + 19, ++ TPM2_PT_AUDIT_COUNTER_1 = TPM2_PT_VAR + 20, ++}; + + /* 128 bytes is an arbitrary cap. This could be as large as TPM_BUFSIZE - 18 + * bytes, but 128 is still a relatively large number of random bytes and +diff --git a/drivers/hwmon/nct6775-core.c b/drivers/hwmon/nct6775-core.c +index da9ec6983e13..c54233f0369b 100644 +--- a/drivers/hwmon/nct6775-core.c ++++ b/drivers/hwmon/nct6775-core.c +@@ -1150,7 +1150,7 @@ static int nct6775_write_fan_div(struct nct6775_data *data, int nr) + if (err) + return err; + reg &= 0x70 >> oddshift; +- reg |= data->fan_div[nr] & (0x7 << oddshift); ++ reg |= (data->fan_div[nr] & 0x7) << oddshift; + return nct6775_write_value(data, fandiv_reg, reg); + } + +diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig +index dc6816d36d06..bda249068182 100644 +--- a/drivers/leds/trigger/Kconfig ++++ b/drivers/leds/trigger/Kconfig +@@ -154,4 +154,13 @@ config LEDS_TRIGGER_TTY + + When build as a module this driver will be called ledtrig-tty. + ++config LEDS_TRIGGER_BLKDEV ++ tristate "LED Trigger for block devices" ++ depends on BLOCK ++ help ++ The blkdev LED trigger allows LEDs to be controlled by block device ++ activity (reads and writes). ++ ++ See Documentation/leds/ledtrig-blkdev.rst. 
++ + endif # LEDS_TRIGGERS +diff --git a/drivers/leds/trigger/Makefile b/drivers/leds/trigger/Makefile +index 25c4db97cdd4..d53bab5d93f1 100644 +--- a/drivers/leds/trigger/Makefile ++++ b/drivers/leds/trigger/Makefile +@@ -16,3 +16,4 @@ obj-$(CONFIG_LEDS_TRIGGER_NETDEV) += ledtrig-netdev.o + obj-$(CONFIG_LEDS_TRIGGER_PATTERN) += ledtrig-pattern.o + obj-$(CONFIG_LEDS_TRIGGER_AUDIO) += ledtrig-audio.o + obj-$(CONFIG_LEDS_TRIGGER_TTY) += ledtrig-tty.o ++obj-$(CONFIG_LEDS_TRIGGER_BLKDEV) += ledtrig-blkdev.o +diff --git a/drivers/leds/trigger/ledtrig-blkdev.c b/drivers/leds/trigger/ledtrig-blkdev.c +new file mode 100644 +index 000000000000..8614e308fadc +--- /dev/null ++++ b/drivers/leds/trigger/ledtrig-blkdev.c +@@ -0,0 +1,1220 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++ ++/* ++ * Block device LED trigger ++ * ++ * Copyright 2021-2022 Ian Pilcher ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++/** ++ * DOC: Overview ++ * ++ * The ``blkdev`` LED trigger works by periodically checking the activity ++ * counters of block devices that have been linked to one or more LEDs and ++ * blinking those LED(s) if the correct type of activity has occurred. The ++ * periodic check is scheduled with the Linux kernel's deferred work facility. ++ * ++ * Trigger-specific data about block devices and LEDs is stored in two data ++ * structures --- &struct blkdev_trig_bdev (a "BTB") and &struct blkdev_trig_led ++ * (a "BTL"). Each structure contains a &struct xarray that holds links to any ++ * linked devices of the other type. I.e. &blkdev_trig_bdev.linked_btls ++ * contains links to all BTLs whose LEDs have been linked to the BTB's block ++ * device, and &blkdev_trig_led.linked_btbs contains links to all BTBs whose ++ * block devices have been linked to the BTL's LED. Thus, a block device can ++ * be linked to more than one LED, and an LED can be linked to more than one ++ * block device. ++ */ ++ ++/* Default, minimum & maximum blink duration (milliseconds) */ ++#define BLKDEV_TRIG_BLINK_DEF 75 ++#define BLKDEV_TRIG_BLINK_MIN 10 ++#define BLKDEV_TRIG_BLINK_MAX 86400000 /* 24 hours */ ++ ++/* Default, minimum & maximum activity check interval (milliseconds) */ ++#define BLKDEV_TRIG_CHECK_DEF 100 ++#define BLKDEV_TRIG_CHECK_MIN 25 ++#define BLKDEV_TRIG_CHECK_MAX 86400000 /* 24 hours */ ++ ++/* ++ * If blkdev_trig_check() can't lock the mutex, how long to wait before trying ++ * again (milliseconds) ++ */ ++#define BLKDEV_TRIG_CHECK_RETRY 5 ++ ++/* Mode argument for calls to blkdev_get_by_path() and blkdev_put() */ ++#define BLKDEV_TRIG_FMODE 0 ++ ++/** ++ * struct blkdev_trig_bdev - Trigger-specific data about a block device. ++ * @last_checked: Time (in jiffies) at which the trigger last checked this ++ * block device for activity. ++ * @last_activity: Time (in jiffies) at which the trigger last detected ++ * activity of each type. ++ * @ios: Activity counter values for each type, corresponding to ++ * the timestamps in &last_activity. ++ * @index: &xarray index, so the BTB can be included in one or more ++ * &blkdev_trig_led.linked_btbs. ++ * @bdev: The block device. ++ * @linked_btls: The BTLs that represent the LEDs linked to the BTB's ++ * block device. ++ * ++ * Every block device linked to at least one LED gets a "BTB." A BTB is created ++ * when a block device that is not currently linked to any LEDs is linked to an ++ * LED. 
++ * ++ * A BTB is freed when one of the following occurs: ++ * ++ * * The number of LEDs linked to the block device becomes zero, because it has ++ * been unlinked from its last LED using the trigger's &sysfs interface. ++ * ++ * * The number of LEDs linked to the block device becomes zero, because the ++ * last LED to which it was linked has been disassociated from the trigger ++ * (which happens automatically if the LED device is removed from the system). ++ * ++ * * The BTB's block device is removed from the system. To accomodate this ++ * scenario, BTB's are created as device resources, so that the release ++ * function will be called by the driver core when the device is removed. ++ */ ++struct blkdev_trig_bdev { ++ unsigned long last_checked; ++ unsigned long last_activity[NR_STAT_GROUPS]; ++ unsigned long ios[NR_STAT_GROUPS]; ++ unsigned long index; ++ struct block_device *bdev; ++ struct xarray linked_btls; ++}; ++ ++/** ++ * struct blkdev_trig_led - Trigger-specific data about an LED. ++ * @last_checked: Time (in jiffies) at which the trigger last checked the ++ * the block devices linked to this LED for activity. ++ * @index: &xarray index, so the BTL can be included in one or more ++ * &blkdev_trig_bdev.linked_btls. ++ * @mode: Bitmask for types of block device activity that will ++ * cause this LED to blink --- reads, writes, discards, ++ * etc. ++ * @led: The LED device. ++ * @blink_msec: Duration of a blink (milliseconds). ++ * @check_jiffies: Frequency with which block devices linked to this LED ++ * should be checked for activity (jiffies). ++ * @linked_btbs: The BTBs that represent the block devices linked to the ++ * BTL's LED. ++ * @all_btls_node: The BTL's node in the module's list of all BTLs. ++ * ++ * Every LED associated with the block device trigger gets a "BTL." A BTL is ++ * created when the trigger is "activated" on an LED (usually by writing ++ * ``blkdev`` to the LED's &sysfs &trigger attribute). A BTL is freed wnen its ++ * LED is disassociated from the trigger, either through the trigger's &sysfs ++ * interface or because the LED device is removed from the system. 
++ */ ++struct blkdev_trig_led { ++ unsigned long last_checked; ++ unsigned long index; ++ unsigned long mode; /* must be ulong for atomic bit ops */ ++ struct led_classdev *led; ++ unsigned int blink_msec; ++ unsigned int check_jiffies; ++ struct xarray linked_btbs; ++ struct hlist_node all_btls_node; ++}; ++ ++/* Protects everything except atomic LED attributes */ ++static DEFINE_MUTEX(blkdev_trig_mutex); ++ ++/* BTB device resource release function */ ++static void blkdev_trig_btb_release(struct device *dev, void *res); ++ ++/* Index for next BTB or BTL */ ++static unsigned long blkdev_trig_next_index; ++ ++/* All LEDs associated with the trigger */ ++static HLIST_HEAD(blkdev_trig_all_btls); ++ ++/* Delayed work to periodically check for activity & blink LEDs */ ++static void blkdev_trig_check(struct work_struct *work); ++static DECLARE_DELAYED_WORK(blkdev_trig_work, blkdev_trig_check); ++ ++/* When is the delayed work scheduled to run next (jiffies) */ ++static unsigned long blkdev_trig_next_check; ++ ++/* Total number of BTB-to-BTL links */ ++static unsigned int blkdev_trig_link_count; ++ ++/* Empty sysfs attribute list for next 2 declarations */ ++static struct attribute *blkdev_trig_attrs_empty[] = { NULL }; ++ ++/* linked_leds sysfs directory for block devs linked to 1 or more LEDs */ ++static const struct attribute_group blkdev_trig_linked_leds = { ++ .name = "linked_leds", ++ .attrs = blkdev_trig_attrs_empty, ++}; ++ ++/* linked_devices sysfs directory for each LED associated with the trigger */ ++static const struct attribute_group blkdev_trig_linked_devs = { ++ .name = "linked_devices", ++ .attrs = blkdev_trig_attrs_empty, ++}; ++ ++ ++/* ++ * ++ * Delayed work to check for activity & blink LEDs ++ * ++ */ ++ ++/** ++ * blkdev_trig_blink() - Blink an LED, if the correct type of activity has ++ * occurred on the block device. ++ * @btl: The BTL that represents the LED ++ * @btb: The BTB that represents the block device ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ * Return: &true if the LED is blinked, &false if not. ++ */ ++static bool blkdev_trig_blink(const struct blkdev_trig_led *btl, ++ const struct blkdev_trig_bdev *btb) ++{ ++ unsigned long mode, mask, delay_on, delay_off; ++ enum stat_group i; ++ ++ mode = READ_ONCE(btl->mode); ++ ++ for (i = STAT_READ, mask = 1; i <= STAT_FLUSH; ++i, mask <<= 1) { ++ ++ if (!(mode & mask)) ++ continue; ++ ++ if (time_before_eq(btb->last_activity[i], btl->last_checked)) ++ continue; ++ ++ delay_on = READ_ONCE(btl->blink_msec); ++ delay_off = 1; /* 0 leaves LED turned on */ ++ ++ led_blink_set_oneshot(btl->led, &delay_on, &delay_off, 0); ++ return true; ++ } ++ ++ return false; ++} ++ ++/** ++ * blkdev_trig_update_btb() - Update a BTB's activity counters and timestamps. ++ * @btb: The BTB ++ * @now: Timestamp (in jiffies) ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_update_btb(struct blkdev_trig_bdev *btb, ++ unsigned long now) ++{ ++ unsigned long new_ios; ++ enum stat_group i; ++ ++ for (i = STAT_READ; i <= STAT_FLUSH; ++i) { ++ ++ new_ios = part_stat_read(btb->bdev, ios[i]); ++ ++ if (new_ios != btb->ios[i]) { ++ btb->ios[i] = new_ios; ++ btb->last_activity[i] = now; ++ } ++ } ++ ++ btb->last_checked = now; ++} ++ ++/** ++ * blkdev_trig_check() - Check linked devices for activity and blink LEDs. ++ * @work: Delayed work (&blkdev_trig_work) ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. 
++ */ ++static void blkdev_trig_check(struct work_struct *work) ++{ ++ struct blkdev_trig_led *btl; ++ struct blkdev_trig_bdev *btb; ++ unsigned long index, delay, now, led_check, led_delay; ++ bool blinked; ++ ++ if (!mutex_trylock(&blkdev_trig_mutex)) { ++ delay = msecs_to_jiffies(BLKDEV_TRIG_CHECK_RETRY); ++ goto exit_reschedule; ++ } ++ ++ now = jiffies; ++ delay = ULONG_MAX; ++ ++ hlist_for_each_entry (btl, &blkdev_trig_all_btls, all_btls_node) { ++ ++ led_check = btl->last_checked + btl->check_jiffies; ++ ++ if (time_before_eq(led_check, now)) { ++ ++ blinked = false; ++ ++ xa_for_each (&btl->linked_btbs, index, btb) { ++ ++ if (btb->last_checked != now) ++ blkdev_trig_update_btb(btb, now); ++ if (!blinked) ++ blinked = blkdev_trig_blink(btl, btb); ++ } ++ ++ btl->last_checked = now; ++ led_delay = btl->check_jiffies; ++ ++ } else { ++ led_delay = led_check - now; ++ } ++ ++ if (led_delay < delay) ++ delay = led_delay; ++ } ++ ++ mutex_unlock(&blkdev_trig_mutex); ++ ++exit_reschedule: ++ WARN_ON_ONCE(delay == ULONG_MAX); ++ WARN_ON_ONCE(!schedule_delayed_work(&blkdev_trig_work, delay)); ++} ++ ++/** ++ * blkdev_trig_sched_led() - Set the schedule of the delayed work when a new ++ * LED is added to the schedule. ++ * @btl: The BTL that represents the LED ++ * ++ * Called when the number of block devices to which an LED is linked becomes ++ * non-zero. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_sched_led(const struct blkdev_trig_led *btl) ++{ ++ unsigned long delay = READ_ONCE(btl->check_jiffies); ++ unsigned long check_by = jiffies + delay; ++ ++ /* ++ * If no other LED-to-block device links exist, simply schedule the ++ * delayed work according to this LED's check_interval attribute ++ * (check_jiffies). ++ */ ++ if (blkdev_trig_link_count == 0) { ++ WARN_ON(!schedule_delayed_work(&blkdev_trig_work, delay)); ++ blkdev_trig_next_check = check_by; ++ return; ++ } ++ ++ /* ++ * If the next check is already scheduled to occur soon enough to ++ * accomodate this LED's check_interval, the schedule doesn't need ++ * to be changed. ++ */ ++ if (time_after_eq(check_by, blkdev_trig_next_check)) ++ return; ++ ++ /* ++ * Modify the schedule, so that the delayed work runs soon enough for ++ * this LED. ++ */ ++ WARN_ON(!mod_delayed_work(system_wq, &blkdev_trig_work, delay)); ++ blkdev_trig_next_check = check_by; ++} ++ ++ ++/* ++ * ++ * Linking and unlinking LEDs and block devices ++ * ++ */ ++ ++/** ++ * blkdev_trig_link() - Link a block device to an LED. ++ * @btl: The BTL that represents the LED ++ * @btb: The BTB that represents the block device ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ * Return: &0 on success, negative &errno on error. 
++ */ ++static int blkdev_trig_link(struct blkdev_trig_led *btl, ++ struct blkdev_trig_bdev *btb) ++{ ++ bool led_first_link; ++ int err; ++ ++ led_first_link = xa_empty(&btl->linked_btbs); ++ ++ err = xa_insert(&btb->linked_btls, btl->index, btl, GFP_KERNEL); ++ if (err) ++ return err; ++ ++ err = xa_insert(&btl->linked_btbs, btb->index, btb, GFP_KERNEL); ++ if (err) ++ goto error_erase_btl; ++ ++ /* Create /sys/class/block//linked_leds/ symlink */ ++ err = sysfs_add_link_to_group(bdev_kobj(btb->bdev), ++ blkdev_trig_linked_leds.name, ++ &btl->led->dev->kobj, btl->led->name); ++ if (err) ++ goto error_erase_btb; ++ ++ /* Create /sys/class/leds//linked_devices/ symlink */ ++ err = sysfs_add_link_to_group(&btl->led->dev->kobj, ++ blkdev_trig_linked_devs.name, ++ bdev_kobj(btb->bdev), ++ dev_name(&btb->bdev->bd_device)); ++ if (err) ++ goto error_remove_symlink; ++ ++ /* ++ * If this is the first block device linked to this LED, the delayed ++ * work schedule may need to be changed. ++ */ ++ if (led_first_link) ++ blkdev_trig_sched_led(btl); ++ ++ ++blkdev_trig_link_count; ++ ++ return 0; ++ ++error_remove_symlink: ++ sysfs_remove_link_from_group(bdev_kobj(btb->bdev), ++ blkdev_trig_linked_leds.name, ++ btl->led->name); ++error_erase_btb: ++ xa_erase(&btl->linked_btbs, btb->index); ++error_erase_btl: ++ xa_erase(&btb->linked_btls, btl->index); ++ return err; ++} ++ ++/** ++ * blkdev_trig_put_btb() - Remove and free a BTB, if it is no longer needed. ++ * @btb: The BTB ++ * ++ * Does nothing if the BTB (block device) is still linked to at least one LED. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_put_btb(struct blkdev_trig_bdev *btb) ++{ ++ struct block_device *bdev = btb->bdev; ++ int err; ++ ++ if (xa_empty(&btb->linked_btls)) { ++ ++ sysfs_remove_group(bdev_kobj(bdev), &blkdev_trig_linked_leds); ++ err = devres_destroy(&bdev->bd_device, blkdev_trig_btb_release, ++ NULL, NULL); ++ WARN_ON(err); ++ } ++} ++ ++/** ++ * _blkdev_trig_unlink_always() - Perform the unconditionally required steps of ++ * unlinking a block device from an LED. ++ * @btl: The BTL that represents the LED ++ * @btb: The BTB that represents the block device ++ * ++ * When a block device is unlinked from an LED, certain steps must be performed ++ * only if the block device is **not** being released. This function performs ++ * those steps that are **always** required, whether or not the block device is ++ * being released. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void _blkdev_trig_unlink_always(struct blkdev_trig_led *btl, ++ struct blkdev_trig_bdev *btb) ++{ ++ --blkdev_trig_link_count; ++ ++ if (blkdev_trig_link_count == 0) ++ WARN_ON(!cancel_delayed_work_sync(&blkdev_trig_work)); ++ ++ xa_erase(&btb->linked_btls, btl->index); ++ xa_erase(&btl->linked_btbs, btb->index); ++ ++ /* Remove /sys/class/leds//linked_devices/ symlink */ ++ sysfs_remove_link_from_group(&btl->led->dev->kobj, ++ blkdev_trig_linked_devs.name, ++ dev_name(&btb->bdev->bd_device)); ++} ++ ++/** ++ * blkdev_trig_unlink_norelease() - Unlink an LED from a block device that is ++ * **not** being released. ++ * @btl: The BTL that represents the LED. ++ * @btb: The BTB that represents the block device. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. 
++ */ ++static void blkdev_trig_unlink_norelease(struct blkdev_trig_led *btl, ++ struct blkdev_trig_bdev *btb) ++{ ++ _blkdev_trig_unlink_always(btl, btb); ++ ++ /* Remove /sys/class/block//linked_leds/ symlink */ ++ sysfs_remove_link_from_group(bdev_kobj(btb->bdev), ++ blkdev_trig_linked_leds.name, ++ btl->led->name); ++ ++ blkdev_trig_put_btb(btb); ++} ++ ++/** ++ * blkdev_trig_unlink_release() - Unlink an LED from a block device that is ++ * being released. ++ * @btl: The BTL that represents the LED ++ * @btb: The BTB that represents the block device ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_unlink_release(struct blkdev_trig_led *btl, ++ struct blkdev_trig_bdev *btb) ++{ ++ _blkdev_trig_unlink_always(btl, btb); ++ ++ /* ++ * If the BTB is being released, the driver core has already removed the ++ * device's attribute groups, and the BTB will be freed automatically, ++ * so there's nothing else to do. ++ */ ++} ++ ++ ++/* ++ * ++ * BTB creation ++ * ++ */ ++ ++/** ++ * blkdev_trig_btb_release() - BTB device resource release function. ++ * @dev: The block device ++ * @res: The BTB ++ * ++ * Called by the driver core when a block device with a BTB is removed. ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_btb_release(struct device *dev, void *res) ++{ ++ struct blkdev_trig_bdev *btb = res; ++ struct blkdev_trig_led *btl; ++ unsigned long index; ++ ++ mutex_lock(&blkdev_trig_mutex); ++ ++ xa_for_each (&btb->linked_btls, index, btl) ++ blkdev_trig_unlink_release(btl, btb); ++ ++ mutex_unlock(&blkdev_trig_mutex); ++} ++ ++/** ++ * blkdev_trig_get_bdev() - Get a block device by path. ++ * @path: The value written to an LED's &link_dev_by_path or ++ * &unlink_dev_by_path attribute, which should be the path to a ++ * special file that represents a block device ++ * @len: The number of characters in &path (not including its ++ * terminating null) ++ * ++ * The caller must call blkdev_put() when finished with the device. ++ * ++ * Context: Process context. ++ * Return: The block device, or an error pointer. ++ */ ++static struct block_device *blkdev_trig_get_bdev(const char *path, size_t len) ++{ ++ struct block_device *bdev; ++ char *buf; ++ ++ buf = kmemdup(path, len + 1, GFP_KERNEL); /* +1 to include null */ ++ if (buf == NULL) ++ return ERR_PTR(-ENOMEM); ++ ++ bdev = blkdev_get_by_path(strim(buf), BLKDEV_TRIG_FMODE, THIS_MODULE); ++ kfree(buf); ++ return bdev; ++} ++ ++/** ++ * blkdev_trig_get_btb() - Find or create the BTB for a block device. ++ * @path: The value written to an LED's &link_dev_by_path attribute, ++ * which should be the path to a special file that represents a ++ * block device ++ * @len: The number of characters in &path ++ * ++ * If a new BTB is created, because the block device was not previously linked ++ * to any LEDs, the block device's &linked_leds &sysfs directory is created. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ * Return: Pointer to the BTB, error pointer on error. 
++ */ ++static struct blkdev_trig_bdev *blkdev_trig_get_btb(const char *path, ++ size_t len) ++{ ++ struct block_device *bdev; ++ struct blkdev_trig_bdev *btb; ++ int err; ++ ++ bdev = blkdev_trig_get_bdev(path, len); ++ if (IS_ERR(bdev)) ++ return ERR_CAST(bdev); ++ ++ btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release, ++ NULL, NULL); ++ if (btb != NULL) { ++ err = 0; ++ goto exit_put_bdev; ++ } ++ ++ if (blkdev_trig_next_index == ULONG_MAX) { ++ err = -EOVERFLOW; ++ goto exit_put_bdev; ++ } ++ ++ btb = devres_alloc(blkdev_trig_btb_release, sizeof(*btb), GFP_KERNEL); ++ if (btb == NULL) { ++ err = -ENOMEM; ++ goto exit_put_bdev; ++ } ++ ++ err = sysfs_create_group(bdev_kobj(bdev), &blkdev_trig_linked_leds); ++ if (err) ++ goto exit_free_btb; ++ ++ btb->index = blkdev_trig_next_index++; ++ btb->bdev = bdev; ++ xa_init(&btb->linked_btls); ++ ++ /* Populate BTB activity counters */ ++ blkdev_trig_update_btb(btb, jiffies); ++ ++ devres_add(&bdev->bd_device, btb); ++ ++exit_free_btb: ++ if (err) ++ devres_free(btb); ++exit_put_bdev: ++ blkdev_put(bdev, BLKDEV_TRIG_FMODE); ++ return err ? ERR_PTR(err) : btb; ++} ++ ++ ++/* ++ * ++ * Activating and deactivating the trigger on an LED ++ * ++ */ ++ ++/** ++ * blkdev_trig_activate() - Called by the LEDs subsystem when an LED is ++ * associated with the trigger. ++ * @led: The LED ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ * Return: &0 on success, negative &errno on error. ++ */ ++static int blkdev_trig_activate(struct led_classdev *led) ++{ ++ struct blkdev_trig_led *btl; ++ int err; ++ ++ btl = kzalloc(sizeof(*btl), GFP_KERNEL); ++ if (btl == NULL) ++ return -ENOMEM; ++ ++ err = mutex_lock_interruptible(&blkdev_trig_mutex); ++ if (err) ++ goto exit_free; ++ ++ if (blkdev_trig_next_index == ULONG_MAX) { ++ err = -EOVERFLOW; ++ goto exit_unlock; ++ } ++ ++ btl->index = blkdev_trig_next_index++; ++ btl->last_checked = jiffies; ++ btl->mode = -1; /* set all bits */ ++ btl->led = led; ++ btl->blink_msec = BLKDEV_TRIG_BLINK_DEF; ++ btl->check_jiffies = msecs_to_jiffies(BLKDEV_TRIG_CHECK_DEF); ++ xa_init(&btl->linked_btbs); ++ ++ hlist_add_head(&btl->all_btls_node, &blkdev_trig_all_btls); ++ led_set_trigger_data(led, btl); ++ ++exit_unlock: ++ mutex_unlock(&blkdev_trig_mutex); ++exit_free: ++ if (err) ++ kfree(btl); ++ return err; ++} ++ ++/** ++ * blkdev_trig_deactivate() - Called by the the LEDs subsystem when an LED is ++ * disassociated from the trigger. ++ * @led: The LED ++ * ++ * The LEDs subsystem also calls this function when an LED associated with the ++ * trigger is removed or when the trigger is unregistered (if the module is ++ * unloaded). ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_deactivate(struct led_classdev *led) ++{ ++ struct blkdev_trig_led *btl = led_get_trigger_data(led); ++ struct blkdev_trig_bdev *btb; ++ unsigned long index; ++ ++ mutex_lock(&blkdev_trig_mutex); ++ ++ xa_for_each (&btl->linked_btbs, index, btb) ++ blkdev_trig_unlink_norelease(btl, btb); ++ ++ hlist_del(&btl->all_btls_node); ++ kfree(btl); ++ ++ mutex_unlock(&blkdev_trig_mutex); ++} ++ ++ ++/* ++ * ++ * Link-related attribute store functions ++ * ++ */ ++ ++/** ++ * link_dev_by_path_store() - &link_dev_by_path device attribute store function. 
++ * @dev: The LED device ++ * @attr: The &link_dev_by_path attribute (&dev_attr_link_dev_by_path) ++ * @buf: The value written to the attribute, which should be the path to ++ * a special file that represents a block device to be linked to ++ * the LED (e.g. ``/dev/sda``) ++ * @count: The number of characters in &buf ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t link_dev_by_path_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ struct blkdev_trig_bdev *btb; ++ int err; ++ ++ err = mutex_lock_interruptible(&blkdev_trig_mutex); ++ if (err) ++ return err; ++ ++ btb = blkdev_trig_get_btb(buf, count); ++ if (IS_ERR(btb)) { ++ err = PTR_ERR(btb); ++ goto exit_unlock; ++ } ++ ++ if (xa_load(&btb->linked_btls, btl->index) != NULL) { ++ err = -EEXIST; ++ goto exit_put_btb; ++ } ++ ++ err = blkdev_trig_link(btl, btb); ++ ++exit_put_btb: ++ if (err) ++ blkdev_trig_put_btb(btb); ++exit_unlock: ++ mutex_unlock(&blkdev_trig_mutex); ++ return err ? : count; ++} ++ ++/** ++ * unlink_dev_by_path_store() - &unlink_dev_by_path device attribute store ++ * function. ++ * @dev: The LED device ++ * @attr: The &unlink_dev_by_path attribute (&dev_attr_unlink_dev_by_path) ++ * @buf: The value written to the attribute, which should be the path to ++ * a special file that represents a block device to be unlinked ++ * from the LED (e.g. ``/dev/sda``) ++ * @count: The number of characters in &buf ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t unlink_dev_by_path_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ struct block_device *bdev; ++ struct blkdev_trig_bdev *btb; ++ int err; ++ ++ bdev = blkdev_trig_get_bdev(buf, count); ++ if (IS_ERR(bdev)) ++ return PTR_ERR(bdev); ++ ++ err = mutex_lock_interruptible(&blkdev_trig_mutex); ++ if (err) ++ goto exit_put_bdev; ++ ++ btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release, ++ NULL, NULL); ++ if (btb == NULL) { ++ err = -EUNATCH; /* bdev isn't linked to any LED */ ++ goto exit_unlock; ++ } ++ ++ if (xa_load(&btb->linked_btls, btl->index) == NULL) { ++ err = -EUNATCH; /* bdev isn't linked to this LED */ ++ goto exit_unlock; ++ } ++ ++ blkdev_trig_unlink_norelease(btl, btb); ++ ++exit_unlock: ++ mutex_unlock(&blkdev_trig_mutex); ++exit_put_bdev: ++ blkdev_put(bdev, BLKDEV_TRIG_FMODE); ++ return err ? : count; ++} ++ ++/** ++ * unlink_dev_by_name_store() - &unlink_dev_by_name device attribute store ++ * function. ++ * @dev: The LED device ++ * @attr: The &unlink_dev_by_name attribute (&dev_attr_unlink_dev_by_name) ++ * @buf: The value written to the attribute, which should be the kernel ++ * name of a block device to be unlinked from the LED (e.g. ++ * ``sda``) ++ * @count: The number of characters in &buf ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ * Return: &count on success, negative &errno on error. 
++ */ ++static ssize_t unlink_dev_by_name_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ struct blkdev_trig_bdev *btb; ++ unsigned long index; ++ int err; ++ ++ err = mutex_lock_interruptible(&blkdev_trig_mutex); ++ if (err) ++ return err; ++ ++ err = -EUNATCH; ++ ++ xa_for_each (&btl->linked_btbs, index, btb) { ++ ++ if (sysfs_streq(dev_name(&btb->bdev->bd_device), buf)) { ++ blkdev_trig_unlink_norelease(btl, btb); ++ err = 0; ++ break; ++ } ++ } ++ ++ mutex_unlock(&blkdev_trig_mutex); ++ return err ? : count; ++} ++ ++ ++/* ++ * ++ * Atomic attribute show & store functions ++ * ++ */ ++ ++/** ++ * blink_time_show() - &blink_time device attribute show function. ++ * @dev: The LED device ++ * @attr: The &blink_time attribute (&dev_attr_blink_time) ++ * @buf: Output buffer ++ * ++ * Writes the value of &blkdev_trig_led.blink_msec to &buf. ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. ++ */ ++static ssize_t blink_time_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ const struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ ++ return sprintf(buf, "%u\n", READ_ONCE(btl->blink_msec)); ++} ++ ++/** ++ * blink_time_store() - &blink_time device attribute store function. ++ * @dev: The LED device ++ * @attr: The &blink_time attribute (&dev_attr_blink_time) ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * ++ * Sets &blkdev_trig_led.blink_msec to the value in &buf. ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t blink_time_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ unsigned int value; ++ int err; ++ ++ err = kstrtouint(buf, 0, &value); ++ if (err) ++ return err; ++ ++ if (value < BLKDEV_TRIG_BLINK_MIN || value > BLKDEV_TRIG_BLINK_MAX) ++ return -ERANGE; ++ ++ WRITE_ONCE(btl->blink_msec, value); ++ return count; ++} ++ ++/** ++ * check_interval_show() - &check_interval device attribute show function. ++ * @dev: The LED device ++ * @attr: The &check_interval attribute (&dev_attr_check_interval) ++ * @buf: Output buffer ++ * ++ * Writes the value of &blkdev_trig_led.check_jiffies (converted to ++ * milliseconds) to &buf. ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. ++ */ ++static ssize_t check_interval_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ ++ return sprintf(buf, "%u\n", ++ jiffies_to_msecs(READ_ONCE(btl->check_jiffies))); ++} ++ ++/** ++ * check_interval_store() - &check_interval device attribute store function ++ * @dev: The LED device ++ * @attr: The &check_interval attribute (&dev_attr_check_interval) ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * ++ * Sets &blkdev_trig_led.check_jiffies to the value in &buf (after converting ++ * from milliseconds). ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. 
++ */
++static ssize_t check_interval_store(struct device *dev,
++				    struct device_attribute *attr,
++				    const char *buf, size_t count)
++{
++	struct blkdev_trig_led *led = led_trigger_get_drvdata(dev);
++	unsigned int value;
++	int err;
++
++	err = kstrtouint(buf, 0, &value);
++	if (err)
++		return err;
++
++	if (value < BLKDEV_TRIG_CHECK_MIN || value > BLKDEV_TRIG_CHECK_MAX)
++		return -ERANGE;
++
++	WRITE_ONCE(led->check_jiffies, msecs_to_jiffies(value));
++
++	return count;
++}
++
++/**
++ * blkdev_trig_mode_show() - Helper for boolean attribute show functions.
++ * @led: The LED
++ * @buf: Output buffer
++ * @bit: Which bit to show
++ *
++ * Context: Process context.
++ * Return: The number of characters written to &buf.
++ */
++static int blkdev_trig_mode_show(const struct blkdev_trig_led *led, char *buf,
++				 enum stat_group bit)
++{
++	return sprintf(buf, READ_ONCE(led->mode) & (1 << bit) ? "Y\n" : "N\n");
++}
++
++/**
++ * blkdev_trig_mode_store() - Helper for boolean attribute store functions.
++ * @led: The LED
++ * @buf: The new value (as written to the &sysfs attribute)
++ * @count: The number of characters in &buf
++ * @bit: Which bit to set
++ *
++ * Context: Process context.
++ * Return: &count on success, negative &errno on error.
++ */
++static int blkdev_trig_mode_store(struct blkdev_trig_led *led,
++				  const char *buf, size_t count,
++				  enum stat_group bit)
++{
++	bool set;
++	int err;
++
++	err = kstrtobool(buf, &set);
++	if (err)
++		return err;
++
++	if (set)
++		set_bit(bit, &led->mode);
++	else
++		clear_bit(bit, &led->mode);
++
++	return count;
++}
++
++/**
++ * blink_on_read_show() - &blink_on_read device attribute show function.
++ * @dev: The LED device
++ * @attr: The &blink_on_read attribute (&dev_attr_blink_on_read)
++ * @buf: Output buffer
++ *
++ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_READ bit in
++ * &blkdev_trig_led.mode is set or cleared.
++ *
++ * Context: Process context.
++ * Return: The number of characters written to &buf.
++ */
++static ssize_t blink_on_read_show(struct device *dev,
++				  struct device_attribute *attr, char *buf)
++{
++	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
++				     buf, STAT_READ);
++}
++
++/**
++ * blink_on_read_store() - &blink_on_read device attribute store function.
++ * @dev: The LED device
++ * @attr: The &blink_on_read attribute (&dev_attr_blink_on_read)
++ * @buf: The new value (as written to the &sysfs attribute)
++ * @count: The number of characters in &buf
++ *
++ * Sets the &STAT_READ bit in &blkdev_trig_led.mode to the value in &buf
++ * (interpreted as a boolean).
++ *
++ * Context: Process context.
++ * Return: &count on success, negative &errno on error.
++ */
++static ssize_t blink_on_read_store(struct device *dev,
++				   struct device_attribute *attr,
++				   const char *buf, size_t count)
++{
++	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
++				      buf, count, STAT_READ);
++}
++
++/**
++ * blink_on_write_show() - &blink_on_write device attribute show function.
++ * @dev: The LED device
++ * @attr: The &blink_on_write attribute (&dev_attr_blink_on_write)
++ * @buf: Output buffer
++ *
++ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_WRITE bit
++ * in &blkdev_trig_led.mode is set or cleared.
++ *
++ * Context: Process context.
++ * Return: The number of characters written to &buf.
++ */
++static ssize_t blink_on_write_show(struct device *dev,
++				   struct device_attribute *attr, char *buf)
++{
++	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
++				     buf, STAT_WRITE);
++}
++
++/**
++ * blink_on_write_store() - &blink_on_write device attribute store function.
++ * @dev: The LED device
++ * @attr: The &blink_on_write attribute (&dev_attr_blink_on_write)
++ * @buf: The new value (as written to the &sysfs attribute)
++ * @count: The number of characters in &buf
++ *
++ * Sets the &STAT_WRITE bit in &blkdev_trig_led.mode to the value in &buf
++ * (interpreted as a boolean).
++ *
++ * Context: Process context.
++ * Return: &count on success, negative &errno on error.
++ */
++static ssize_t blink_on_write_store(struct device *dev,
++				    struct device_attribute *attr,
++				    const char *buf, size_t count)
++{
++	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
++				      buf, count, STAT_WRITE);
++}
++
++/**
++ * blink_on_flush_show() - &blink_on_flush device attribute show function.
++ * @dev: The LED device
++ * @attr: The &blink_on_flush attribute (&dev_attr_blink_on_flush)
++ * @buf: Output buffer
++ *
++ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_FLUSH bit in
++ * &blkdev_trig_led.mode is set or cleared.
++ *
++ * Context: Process context.
++ * Return: The number of characters written to &buf.
++ */
++static ssize_t blink_on_flush_show(struct device *dev,
++				   struct device_attribute *attr, char *buf)
++{
++	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
++				     buf, STAT_FLUSH);
++}
++
++/**
++ * blink_on_flush_store() - &blink_on_flush device attribute store function.
++ * @dev: The LED device
++ * @attr: The &blink_on_flush attribute (&dev_attr_blink_on_flush)
++ * @buf: The new value (as written to the &sysfs attribute)
++ * @count: The number of characters in &buf
++ *
++ * Sets the &STAT_FLUSH bit in &blkdev_trig_led.mode to the value in &buf
++ * (interpreted as a boolean).
++ *
++ * Context: Process context.
++ * Return: &count on success, negative &errno on error.
++ */
++static ssize_t blink_on_flush_store(struct device *dev,
++				    struct device_attribute *attr,
++				    const char *buf, size_t count)
++{
++	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
++				      buf, count, STAT_FLUSH);
++}
++
++/**
++ * blink_on_discard_show() - &blink_on_discard device attribute show function.
++ * @dev: The LED device
++ * @attr: The &blink_on_discard attribute (&dev_attr_blink_on_discard)
++ * @buf: Output buffer
++ *
++ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_DISCARD bit in
++ * &blkdev_trig_led.mode is set or cleared.
++ *
++ * Context: Process context.
++ * Return: The number of characters written to &buf.
++ */
++static ssize_t blink_on_discard_show(struct device *dev,
++				     struct device_attribute *attr, char *buf)
++{
++	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
++				     buf, STAT_DISCARD);
++}
++
++/**
++ * blink_on_discard_store() - &blink_on_discard device attribute store function.
++ * @dev: The LED device
++ * @attr: The &blink_on_discard attribute (&dev_attr_blink_on_discard)
++ * @buf: The new value (as written to the &sysfs attribute)
++ * @count: The number of characters in &buf
++ *
++ * Sets the &STAT_DISCARD bit in &blkdev_trig_led.mode to the value in &buf
++ * (interpreted as a boolean).
++ *
++ * Context: Process context.
++ * Return: &count on success, negative &errno on error.
++ */ ++static ssize_t blink_on_discard_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), ++ buf, count, STAT_DISCARD); ++} ++ ++/* Device attributes */ ++static DEVICE_ATTR_WO(link_dev_by_path); ++static DEVICE_ATTR_WO(unlink_dev_by_path); ++static DEVICE_ATTR_WO(unlink_dev_by_name); ++static DEVICE_ATTR_RW(blink_time); ++static DEVICE_ATTR_RW(check_interval); ++static DEVICE_ATTR_RW(blink_on_read); ++static DEVICE_ATTR_RW(blink_on_write); ++static DEVICE_ATTR_RW(blink_on_flush); ++static DEVICE_ATTR_RW(blink_on_discard); ++ ++/* Device attributes in LED directory (/sys/class/leds//...) */ ++static struct attribute *blkdev_trig_attrs[] = { ++ &dev_attr_link_dev_by_path.attr, ++ &dev_attr_unlink_dev_by_path.attr, ++ &dev_attr_unlink_dev_by_name.attr, ++ &dev_attr_blink_time.attr, ++ &dev_attr_check_interval.attr, ++ &dev_attr_blink_on_read.attr, ++ &dev_attr_blink_on_write.attr, ++ &dev_attr_blink_on_flush.attr, ++ &dev_attr_blink_on_discard.attr, ++ NULL ++}; ++ ++/* Unnamed attribute group == no subdirectory */ ++static const struct attribute_group blkdev_trig_attr_group = { ++ .attrs = blkdev_trig_attrs, ++}; ++ ++/* Attribute groups for the trigger */ ++static const struct attribute_group *blkdev_trig_attr_groups[] = { ++ &blkdev_trig_attr_group, /* /sys/class/leds//... */ ++ &blkdev_trig_linked_devs, /* /sys/class/leds//linked_devices/ */ ++ NULL ++}; ++ ++/* Trigger registration data */ ++static struct led_trigger blkdev_trig_trigger = { ++ .name = "blkdev", ++ .activate = blkdev_trig_activate, ++ .deactivate = blkdev_trig_deactivate, ++ .groups = blkdev_trig_attr_groups, ++}; ++ ++/** ++ * blkdev_trig_init() - Block device LED trigger initialization. ++ * ++ * Registers the ``blkdev`` LED trigger. ++ * ++ * Return: &0 on success, negative &errno on failure. ++ */ ++static int __init blkdev_trig_init(void) ++{ ++ return led_trigger_register(&blkdev_trig_trigger); ++} ++module_init(blkdev_trig_init); ++ ++/** ++ * blkdev_trig_exit() - Block device LED trigger module exit. ++ * ++ * Unregisters the ``blkdev`` LED trigger. 
++ */ ++static void __exit blkdev_trig_exit(void) ++{ ++ led_trigger_unregister(&blkdev_trig_trigger); ++} ++module_exit(blkdev_trig_exit); ++ ++MODULE_DESCRIPTION("Block device LED trigger"); ++MODULE_AUTHOR("Ian Pilcher "); ++MODULE_LICENSE("GPL v2"); +diff --git a/drivers/md/dm.c b/drivers/md/dm.c +index b424a6ee27ba..df3fe80824bc 100644 +--- a/drivers/md/dm.c ++++ b/drivers/md/dm.c +@@ -1008,6 +1008,7 @@ static void dm_wq_requeue_work(struct work_struct *work) + io->next = NULL; + __dm_io_complete(io, false); + io = next; ++ cond_resched(); + } + } + +@@ -2569,6 +2570,7 @@ static void dm_wq_work(struct work_struct *work) + break; + + submit_bio_noacct(bio); ++ cond_resched(); + } + } + +diff --git a/fs/eventpoll.c b/fs/eventpoll.c +index 64659b110973..8b5ca9f8f4bb 100644 +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -1760,7 +1760,7 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, + { + int ret = default_wake_function(wq_entry, mode, sync, key); + +- list_del_init(&wq_entry->entry); ++ list_del_init_careful(&wq_entry->entry); + return ret; + } + +diff --git a/fs/proc/base.c b/fs/proc/base.c +index 9e479d7d202b..ac9ebe972be0 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -3207,6 +3207,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, + mm = get_task_mm(task); + if (mm) { + seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); ++ seq_printf(m, "zero_pages_sharing %lu\n", mm->ksm_zero_pages_sharing); + mmput(mm); + } + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 9757067c3053..d853e1c8a581 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -776,7 +776,7 @@ struct mm_struct { + #ifdef CONFIG_KSM + /* + * Represent how many pages of this process are involved in KSM +- * merging. ++ * merging (not including ksm_zero_pages_sharing). + */ + unsigned long ksm_merging_pages; + /* +@@ -784,6 +784,11 @@ struct mm_struct { + * including merged and not merged. + */ + unsigned long ksm_rmap_items; ++ /* ++ * Represent how many empty pages are merged with kernel zero ++ * pages when enabling KSM use_zero_pages. ++ */ ++ unsigned long ksm_zero_pages_sharing; + #endif + #ifdef CONFIG_LRU_GEN + struct { +diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h +index 5f1ae07d724b..97cda629c9e9 100644 +--- a/include/linux/pageblock-flags.h ++++ b/include/linux/pageblock-flags.h +@@ -48,7 +48,7 @@ extern unsigned int pageblock_order; + #else /* CONFIG_HUGETLB_PAGE */ + + /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ +-#define pageblock_order (MAX_ORDER-1) ++#define pageblock_order PAGE_ALLOC_COSTLY_ORDER + + #endif /* CONFIG_HUGETLB_PAGE */ + +diff --git a/kernel/kthread.c b/kernel/kthread.c +index f97fd01a2932..7e6751b29101 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -1382,6 +1382,10 @@ EXPORT_SYMBOL_GPL(kthread_flush_worker); + * Flush and destroy @worker. The simple flush is enough because the kthread + * worker API is used only in trivial scenarios. There are no multi-step state + * machines needed. ++ * ++ * Note that this function is not responsible for handling delayed work, so ++ * caller should be responsible for queuing or canceling all delayed work items ++ * before invoke this function. 
+ */ + void kthread_destroy_worker(struct kthread_worker *worker) + { +@@ -1393,6 +1397,7 @@ void kthread_destroy_worker(struct kthread_worker *worker) + + kthread_flush_worker(worker); + kthread_stop(task); ++ WARN_ON(!list_empty(&worker->delayed_work_list)); + WARN_ON(!list_empty(&worker->work_list)); + kfree(worker); + } +diff --git a/kernel/padata.c b/kernel/padata.c +index e007b8a4b738..7c80301ab084 100644 +--- a/kernel/padata.c ++++ b/kernel/padata.c +@@ -45,7 +45,7 @@ struct padata_mt_job_state { + }; + + static void padata_free_pd(struct parallel_data *pd); +-static void __init padata_mt_helper(struct work_struct *work); ++static void padata_mt_helper(struct work_struct *work); + + static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) + { +@@ -438,7 +438,7 @@ static int padata_setup_cpumasks(struct padata_instance *pinst) + return err; + } + +-static void __init padata_mt_helper(struct work_struct *w) ++static void padata_mt_helper(struct work_struct *w) + { + struct padata_work *pw = container_of(w, struct padata_work, pw_work); + struct padata_mt_job_state *ps = pw->pw_data; +diff --git a/lib/string.c b/lib/string.c +index 4746a98b153e..6b7cf32b4e54 100644 +--- a/lib/string.c ++++ b/lib/string.c +@@ -480,13 +480,11 @@ EXPORT_SYMBOL(strcspn); + */ + char *strpbrk(const char *cs, const char *ct) + { +- const char *sc1, *sc2; ++ const char *sc; + +- for (sc1 = cs; *sc1 != '\0'; ++sc1) { +- for (sc2 = ct; *sc2 != '\0'; ++sc2) { +- if (*sc1 == *sc2) +- return (char *)sc1; +- } ++ for (sc = cs; *sc != '\0'; ++sc) { ++ if (strchr(ct, *sc)) ++ return (char *)sc; + } + return NULL; + } +diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c +index 89b269a641c7..60958afebc41 100644 +--- a/lib/zstd/decompress/huf_decompress.c ++++ b/lib/zstd/decompress/huf_decompress.c +@@ -985,7 +985,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 + + static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, +- const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight, ++ const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) + { + U32* const rankVal = rankValOrigin[0]; +diff --git a/mm/compaction.c b/mm/compaction.c +index d0b16a5b30f7..3613d7f174dc 100644 +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -122,7 +122,6 @@ bool PageMovable(struct page *page) + + return false; + } +-EXPORT_SYMBOL(PageMovable); + + void __SetPageMovable(struct page *page, const struct movable_operations *mops) + { +@@ -1102,12 +1101,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, + + /* + * Avoid isolating too much unless this block is being +- * rescanned (e.g. dirty/writeback pages, parallel allocation) ++ * fully scanned (e.g. dirty/writeback pages, parallel allocation) + * or a lock is contended. For contention, isolate quickly to + * potentially remove one source of contention. + */ + if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX && +- !cc->rescan && !cc->contended) { ++ !cc->finish_pageblock && !cc->contended) { + ++low_pfn; + break; + } +@@ -1172,14 +1171,14 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, + } + + /* +- * Updated the cached scanner pfn once the pageblock has been scanned ++ * Update the cached scanner pfn once the pageblock has been scanned. 
+ * Pages will either be migrated in which case there is no point + * scanning in the near future or migration failed in which case the + * failure reason may persist. The block is marked for skipping if + * there were no pages isolated in the block or if the block is + * rescanned twice in a row. + */ +- if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) { ++ if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) { + if (valid_page && !skip_updated) + set_pageblock_skip(valid_page); + update_cached_migrate(cc, low_pfn); +@@ -1762,6 +1761,13 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) + if (cc->ignore_skip_hint) + return pfn; + ++ /* ++ * If the pageblock should be finished then do not select a different ++ * pageblock. ++ */ ++ if (cc->finish_pageblock) ++ return pfn; ++ + /* + * If the migrate_pfn is not at the start of a zone or the start + * of a pageblock then assume this is a continuation of a previous +@@ -1839,7 +1845,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) + pfn = cc->zone->zone_start_pfn; + cc->fast_search_fail = 0; + found_block = true; +- set_pageblock_skip(freepage); + break; + } + } +@@ -2375,19 +2380,20 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) + unsigned long iteration_start_pfn = cc->migrate_pfn; + + /* +- * Avoid multiple rescans which can happen if a page cannot be +- * isolated (dirty/writeback in async mode) or if the migrated +- * pages are being allocated before the pageblock is cleared. +- * The first rescan will capture the entire pageblock for +- * migration. If it fails, it'll be marked skip and scanning +- * will proceed as normal. ++ * Avoid multiple rescans of the same pageblock which can ++ * happen if a page cannot be isolated (dirty/writeback in ++ * async mode) or if the migrated pages are being allocated ++ * before the pageblock is cleared. The first rescan will ++ * capture the entire pageblock for migration. If it fails, ++ * it'll be marked skip and scanning will proceed as normal. + */ +- cc->rescan = false; ++ cc->finish_pageblock = false; + if (pageblock_start_pfn(last_migrated_pfn) == + pageblock_start_pfn(iteration_start_pfn)) { +- cc->rescan = true; ++ cc->finish_pageblock = true; + } + ++rescan: + switch (isolate_migratepages(cc)) { + case ISOLATE_ABORT: + ret = COMPACT_CONTENDED; +@@ -2430,18 +2436,37 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) + goto out; + } + /* +- * We failed to migrate at least one page in the current +- * order-aligned block, so skip the rest of it. ++ * If an ASYNC or SYNC_LIGHT fails to migrate a page ++ * within the current order-aligned block, scan the ++ * remainder of the pageblock. This will mark the ++ * pageblock "skip" to avoid rescanning in the near ++ * future. This will isolate more pages than necessary ++ * for the request but avoid loops due to ++ * fast_find_migrateblock revisiting blocks that were ++ * recently partially scanned. + */ +- if (cc->direct_compaction && +- (cc->mode == MIGRATE_ASYNC)) { +- cc->migrate_pfn = block_end_pfn( +- cc->migrate_pfn - 1, cc->order); +- /* Draining pcplists is useless in this case */ +- last_migrated_pfn = 0; ++ if (cc->direct_compaction && !cc->finish_pageblock && ++ (cc->mode < MIGRATE_SYNC)) { ++ cc->finish_pageblock = true; ++ ++ /* ++ * Draining pcplists does not help THP if ++ * any page failed to migrate. Even after ++ * drain, the pageblock will not be free. 
++ */ ++ if (cc->order == COMPACTION_HPAGE_ORDER) ++ last_migrated_pfn = 0; ++ ++ goto rescan; + } + } + ++ /* Stop if a page has been captured */ ++ if (capc && capc->page) { ++ ret = COMPACT_SUCCESS; ++ break; ++ } ++ + check_drain: + /* + * Has the migration scanner moved away from the previous +@@ -2460,12 +2485,6 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) + last_migrated_pfn = 0; + } + } +- +- /* Stop if a page has been captured */ +- if (capc && capc->page) { +- ret = COMPACT_SUCCESS; +- break; +- } + } + + out: +diff --git a/mm/internal.h b/mm/internal.h +index bcf75a8b032d..21466d0ab22f 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -422,7 +422,11 @@ struct compact_control { + bool proactive_compaction; /* kcompactd proactive compaction */ + bool whole_zone; /* Whole zone should/has been scanned */ + bool contended; /* Signal lock contention */ +- bool rescan; /* Rescanning the same pageblock */ ++ bool finish_pageblock; /* Scan the remainder of a pageblock. Used ++ * when there are potentially transient ++ * isolation or migration failures to ++ * ensure forward progress. ++ */ + bool alloc_contig; /* alloc_contig_range allocation */ + }; + +diff --git a/mm/ksm.c b/mm/ksm.c +index a92c9594a2d3..c267b92b837b 100644 +--- a/mm/ksm.c ++++ b/mm/ksm.c +@@ -214,6 +214,7 @@ struct ksm_rmap_item { + #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ + #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ + #define STABLE_FLAG 0x200 /* is listed from the stable tree */ ++#define ZERO_PAGE_FLAG 0x400 /* is zero page placed by KSM */ + + /* The stable and unstable tree heads */ + static struct rb_root one_stable_tree[1] = { RB_ROOT }; +@@ -275,6 +276,9 @@ static unsigned int zero_checksum __read_mostly; + /* Whether to merge empty (zeroed) pages with actual zero pages */ + static bool ksm_use_zero_pages __read_mostly; + ++/* The number of zero pages placed by KSM use_zero_pages */ ++static unsigned long ksm_zero_pages_sharing; ++ + #ifdef CONFIG_NUMA + /* Zeroed when merging across nodes is not allowed */ + static unsigned int ksm_merge_across_nodes = 1; +@@ -420,6 +424,11 @@ static inline bool ksm_test_exit(struct mm_struct *mm) + return atomic_read(&mm->mm_users) == 0; + } + ++enum break_ksm_pmd_entry_return_flag { ++ HAVE_KSM_PAGE = 1, ++ HAVE_ZERO_PAGE ++}; ++ + static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, + struct mm_walk *walk) + { +@@ -427,6 +436,7 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex + spinlock_t *ptl; + pte_t *pte; + int ret; ++ bool is_zero_page = false; + + if (pmd_leaf(*pmd) || !pmd_present(*pmd)) + return 0; +@@ -434,6 +444,8 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (pte_present(*pte)) { + page = vm_normal_page(walk->vma, addr, *pte); ++ if (!page) ++ is_zero_page = is_zero_pfn(pte_pfn(*pte)); + } else if (!pte_none(*pte)) { + swp_entry_t entry = pte_to_swp_entry(*pte); + +@@ -444,7 +456,14 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex + if (is_migration_entry(entry)) + page = pfn_swap_entry_to_page(entry); + } +- ret = page && PageKsm(page); ++ ++ if (page && PageKsm(page)) ++ ret = HAVE_KSM_PAGE; ++ else if (is_zero_page) ++ ret = HAVE_ZERO_PAGE; ++ else ++ ret = 0; ++ + pte_unmap_unlock(pte, ptl); + return ret; + } +@@ -466,19 +485,22 @@ static const struct mm_walk_ops break_ksm_ops = { + * of 
the process that owns 'vma'. We also do not want to enforce + * protection keys here anyway. + */ +-static int break_ksm(struct vm_area_struct *vma, unsigned long addr) ++static int break_ksm(struct vm_area_struct *vma, unsigned long addr, ++ bool unshare_zero_page) + { + vm_fault_t ret = 0; + + do { +- int ksm_page; ++ int walk_result; + + cond_resched(); +- ksm_page = walk_page_range_vma(vma, addr, addr + 1, ++ walk_result = walk_page_range_vma(vma, addr, addr + 1, + &break_ksm_ops, NULL); +- if (WARN_ON_ONCE(ksm_page < 0)) +- return ksm_page; +- if (!ksm_page) ++ if (WARN_ON_ONCE(walk_result < 0)) ++ return walk_result; ++ if (!walk_result) ++ return 0; ++ if (walk_result == HAVE_ZERO_PAGE && !unshare_zero_page) + return 0; + ret = handle_mm_fault(vma, addr, + FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, +@@ -539,7 +561,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item) + mmap_read_lock(mm); + vma = find_mergeable_vma(mm, addr); + if (vma) +- break_ksm(vma, addr); ++ break_ksm(vma, addr, false); + mmap_read_unlock(mm); + } + +@@ -764,6 +786,33 @@ static struct page *get_ksm_page(struct ksm_stable_node *stable_node, + return NULL; + } + ++/* ++ * Cleaning the rmap_item's ZERO_PAGE_FLAG ++ * This function will be called when unshare or writing on zero pages. ++ */ ++static inline void clean_rmap_item_zero_flag(struct ksm_rmap_item *rmap_item) ++{ ++ if (rmap_item->address & ZERO_PAGE_FLAG) { ++ ksm_zero_pages_sharing--; ++ rmap_item->mm->ksm_zero_pages_sharing--; ++ rmap_item->address &= PAGE_MASK; ++ } ++} ++ ++/* Only called when rmap_item is going to be freed */ ++static inline void unshare_zero_pages(struct ksm_rmap_item *rmap_item) ++{ ++ struct vm_area_struct *vma; ++ ++ if (rmap_item->address & ZERO_PAGE_FLAG) { ++ vma = vma_lookup(rmap_item->mm, rmap_item->address); ++ if (vma && !ksm_test_exit(rmap_item->mm)) ++ break_ksm(vma, rmap_item->address, true); ++ } ++ /* Put at last. */ ++ clean_rmap_item_zero_flag(rmap_item); ++} ++ + /* + * Removing rmap_item from stable or unstable tree. + * This function will clean the information from the stable/unstable tree. +@@ -824,6 +873,7 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) + struct ksm_rmap_item *rmap_item = *rmap_list; + *rmap_list = rmap_item->rmap_list; + remove_rmap_item_from_tree(rmap_item); ++ unshare_zero_pages(rmap_item); + free_rmap_item(rmap_item); + } + } +@@ -853,7 +903,7 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, + if (signal_pending(current)) + err = -ERESTARTSYS; + else +- err = break_ksm(vma, addr); ++ err = break_ksm(vma, addr, false); + } + return err; + } +@@ -2044,6 +2094,42 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, + rmap_item->mm->ksm_merging_pages++; + } + ++static int try_to_merge_with_kernel_zero_page(struct ksm_rmap_item *rmap_item, ++ struct page *page) ++{ ++ struct mm_struct *mm = rmap_item->mm; ++ int err = 0; ++ ++ /* ++ * It should not take ZERO_PAGE_FLAG because on one hand, ++ * get_next_rmap_item don't return zero pages' rmap_item. 
++ * On the other hand, even if zero page was writen as ++ * anonymous page, rmap_item has been cleaned after ++ * stable_tree_search ++ */ ++ if (!WARN_ON_ONCE(rmap_item->address & ZERO_PAGE_FLAG)) { ++ struct vm_area_struct *vma; ++ ++ mmap_read_lock(mm); ++ vma = find_mergeable_vma(mm, rmap_item->address); ++ if (vma) { ++ err = try_to_merge_one_page(vma, page, ++ ZERO_PAGE(rmap_item->address)); ++ if (!err) { ++ rmap_item->address |= ZERO_PAGE_FLAG; ++ ksm_zero_pages_sharing++; ++ rmap_item->mm->ksm_zero_pages_sharing++; ++ } ++ } else { ++ /* If the vma is out of date, we do not need to continue. */ ++ err = 0; ++ } ++ mmap_read_unlock(mm); ++ } ++ ++ return err; ++} ++ + /* + * cmp_and_merge_page - first see if page can be merged into the stable tree; + * if not, compare checksum to previous and if it's the same, see if page can +@@ -2055,7 +2141,6 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, + */ + static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) + { +- struct mm_struct *mm = rmap_item->mm; + struct ksm_rmap_item *tree_rmap_item; + struct page *tree_page = NULL; + struct ksm_stable_node *stable_node; +@@ -2092,6 +2177,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite + } + + remove_rmap_item_from_tree(rmap_item); ++ clean_rmap_item_zero_flag(rmap_item); + + if (kpage) { + if (PTR_ERR(kpage) == -EBUSY) +@@ -2128,29 +2214,16 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite + * Same checksum as an empty page. We attempt to merge it with the + * appropriate zero page if the user enabled this via sysfs. + */ +- if (ksm_use_zero_pages && (checksum == zero_checksum)) { +- struct vm_area_struct *vma; +- +- mmap_read_lock(mm); +- vma = find_mergeable_vma(mm, rmap_item->address); +- if (vma) { +- err = try_to_merge_one_page(vma, page, +- ZERO_PAGE(rmap_item->address)); +- } else { ++ if (ksm_use_zero_pages) { ++ if (checksum == zero_checksum) + /* +- * If the vma is out of date, we do not need to +- * continue. ++ * In case of failure, the page was not really empty, so we ++ * need to continue. Otherwise we're done. + */ +- err = 0; +- } +- mmap_read_unlock(mm); +- /* +- * In case of failure, the page was not really empty, so we +- * need to continue. Otherwise we're done. +- */ +- if (!err) +- return; ++ if (!try_to_merge_with_kernel_zero_page(rmap_item, page)) ++ return; + } ++ + tree_rmap_item = + unstable_tree_search_insert(rmap_item, page, &tree_page); + if (tree_rmap_item) { +@@ -2214,23 +2287,39 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite + } + } + +-static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, +- struct ksm_rmap_item **rmap_list, +- unsigned long addr) ++static struct ksm_rmap_item *try_to_get_old_rmap_item(unsigned long addr, ++ struct ksm_rmap_item **rmap_list) + { +- struct ksm_rmap_item *rmap_item; +- + while (*rmap_list) { +- rmap_item = *rmap_list; ++ struct ksm_rmap_item *rmap_item = *rmap_list; ++ + if ((rmap_item->address & PAGE_MASK) == addr) + return rmap_item; + if (rmap_item->address > addr) + break; + *rmap_list = rmap_item->rmap_list; ++ /* ++ * If we end up here, the VMA is MADV_UNMERGEABLE or its page ++ * is ineligible or discarded, e.g. MADV_DONTNEED. 
++ */ + remove_rmap_item_from_tree(rmap_item); ++ unshare_zero_pages(rmap_item); + free_rmap_item(rmap_item); + } + ++ return NULL; ++} ++ ++static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, ++ struct ksm_rmap_item **rmap_list, ++ unsigned long addr) ++{ ++ struct ksm_rmap_item *rmap_item; ++ ++ rmap_item = try_to_get_old_rmap_item(addr, rmap_list); ++ if (rmap_item) ++ return rmap_item; ++ + rmap_item = alloc_rmap_item(); + if (rmap_item) { + /* It has already been zeroed */ +@@ -2337,6 +2426,22 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) + } + if (is_zone_device_page(*page)) + goto next_page; ++ if (is_zero_pfn(page_to_pfn(*page))) { ++ /* ++ * To monitor ksm zero pages which becomes non-anonymous, ++ * we have to save each rmap_item of zero pages by ++ * try_to_get_old_rmap_item() walking on ++ * ksm_scan.rmap_list, otherwise their rmap_items will be ++ * freed by the next turn of get_next_rmap_item(). The ++ * function get_next_rmap_item() will free all "skipped" ++ * rmap_items because it thinks its areas as UNMERGEABLE. ++ */ ++ rmap_item = try_to_get_old_rmap_item(ksm_scan.address, ++ ksm_scan.rmap_list); ++ if (rmap_item && (rmap_item->address & ZERO_PAGE_FLAG)) ++ ksm_scan.rmap_list = &rmap_item->rmap_list; ++ goto next_page; ++ } + if (PageAnon(*page)) { + flush_anon_page(vma, *page, ksm_scan.address); + flush_dcache_page(*page); +@@ -3138,6 +3243,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj, + } + KSM_ATTR_RO(pages_volatile); + ++static ssize_t zero_pages_sharing_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sysfs_emit(buf, "%ld\n", ksm_zero_pages_sharing); ++} ++KSM_ATTR_RO(zero_pages_sharing); ++ + static ssize_t stable_node_dups_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) + { +@@ -3193,6 +3305,7 @@ static struct attribute *ksm_attrs[] = { + &pages_sharing_attr.attr, + &pages_unshared_attr.attr, + &pages_volatile_attr.attr, ++ &zero_pages_sharing_attr.attr, + &full_scans_attr.attr, + #ifdef CONFIG_NUMA + &merge_across_nodes_attr.attr, +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 3bb3484563ed..3aec9a6a9cb7 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -3119,6 +3119,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + { + unsigned long flags; + int i, allocated = 0; ++ struct list_head *prev_tail = list->prev; ++ struct page *pos, *n; + + spin_lock_irqsave(&zone->lock, flags); + for (i = 0; i < count; ++i) { +@@ -3127,9 +3129,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + if (unlikely(page == NULL)) + break; + +- if (unlikely(check_pcp_refill(page, order))) +- continue; +- + /* + * Split buddy pages returned by expand() are received here in + * physical page order. The page is added to the tail of +@@ -3141,7 +3140,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + * pages are ordered properly. + */ + list_add_tail(&page->pcp_list, list); +- allocated++; + if (is_migrate_cma(get_pcppage_migratetype(page))) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, + -(1 << order)); +@@ -3155,6 +3153,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + */ + __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); + spin_unlock_irqrestore(&zone->lock, flags); ++ ++ /* ++ * Pages are appended to the pcp list without checking to reduce the ++ * time holding the zone lock. 
Checking the appended pages happens right ++ * after the critical section while still holding the pcp lock. ++ */ ++ pos = list_first_entry(prev_tail, struct page, pcp_list); ++ list_for_each_entry_safe_from(pos, n, list, pcp_list) { ++ if (unlikely(check_pcp_refill(pos, order))) { ++ list_del(&pos->pcp_list); ++ continue; ++ } ++ ++ allocated++; ++ } ++ + return allocated; + } + +diff --git a/mm/z3fold.c b/mm/z3fold.c +index a4de0c317ac7..0cef845d397b 100644 +--- a/mm/z3fold.c ++++ b/mm/z3fold.c +@@ -1450,7 +1450,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) + struct z3fold_header *zhdr; + struct z3fold_pool *pool; + +- VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(PageIsolated(page), page); + + if (test_bit(PAGE_HEADLESS, &page->private)) +@@ -1490,7 +1489,6 @@ static int z3fold_page_migrate(struct page *newpage, struct page *page, + struct z3fold_header *zhdr, *new_zhdr; + struct z3fold_pool *pool; + +- VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); +diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c +index 702bc3fd687a..9d27d9b00bce 100644 +--- a/mm/zsmalloc.c ++++ b/mm/zsmalloc.c +@@ -2056,7 +2056,6 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) + * Page is locked so zspage couldn't be destroyed. For detail, look at + * lock_zspage in free_zspage. + */ +- VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(PageIsolated(page), page); + + zspage = get_zspage(page); +@@ -2088,7 +2087,6 @@ static int zs_page_migrate(struct page *newpage, struct page *page, + if (mode == MIGRATE_SYNC_NO_COPY) + return -EINVAL; + +- VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + /* The page is locked, so this pointer must remain valid */ +@@ -2153,7 +2151,6 @@ static void zs_page_putback(struct page *page) + { + struct zspage *zspage; + +- VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + zspage = get_zspage(page); +diff --git a/scripts/Kconfig.include b/scripts/Kconfig.include +index 274125307ebd..5a84b6443875 100644 +--- a/scripts/Kconfig.include ++++ b/scripts/Kconfig.include +@@ -33,7 +33,7 @@ ld-option = $(success,$(LD) -v $(1)) + + # $(as-instr,) + # Return y if the assembler supports , n otherwise +-as-instr = $(success,printf "%b\n" "$(1)" | $(CC) $(CLANG_FLAGS) -c -x assembler -o /dev/null -) ++as-instr = $(success,printf "%b\n" "$(1)" | $(CC) $(CLANG_FLAGS) -c -x assembler-with-cpp -o /dev/null -) + + # check if $(CC) and $(LD) exist + $(error-if,$(failure,command -v $(CC)),C compiler '$(CC)' not found) +diff --git a/scripts/Makefile.compiler b/scripts/Makefile.compiler +index 3d8adfd34af1..7aa1fbc4aafe 100644 +--- a/scripts/Makefile.compiler ++++ b/scripts/Makefile.compiler +@@ -29,16 +29,16 @@ try-run = $(shell set -e; \ + fi) + + # as-option +-# Usage: cflags-y += $(call as-option,-Wa$(comma)-isa=foo,) ++# Usage: aflags-y += $(call as-option,-Wa$(comma)-isa=foo,) + + as-option = $(call try-run,\ +- $(CC) $(KBUILD_CFLAGS) $(1) -c -x assembler /dev/null -o "$$TMP",$(1),$(2)) ++ $(CC) -Werror $(KBUILD_AFLAGS) $(1) -c -x assembler-with-cpp /dev/null -o "$$TMP",$(1),$(2)) + + # as-instr +-# Usage: cflags-y += $(call as-instr,instr,option1,option2) ++# Usage: aflags-y += $(call as-instr,instr,option1,option2) + + as-instr = $(call try-run,\ +- printf "%b\n" "$(1)" | $(CC) $(KBUILD_AFLAGS) -c -x 
assembler -o "$$TMP" -,$(2),$(3)) ++ printf "%b\n" "$(1)" | $(CC) -Werror $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3)) + + # __cc-option + # Usage: MY_CFLAGS += $(call __cc-option,$(CC),$(MY_CFLAGS),-march=winchip-c6,-march=i586) +diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o +index 0edfdb40364b..ae52d3b3f063 100644 +--- a/scripts/Makefile.vmlinux_o ++++ b/scripts/Makefile.vmlinux_o +@@ -19,7 +19,7 @@ quiet_cmd_gen_initcalls_lds = GEN $@ + + .tmp_initcalls.lds: $(srctree)/scripts/generate_initcall_order.pl \ + vmlinux.a $(KBUILD_VMLINUX_LIBS) FORCE +- $(call if_changed,gen_initcalls_lds) ++ +$(call if_changed,gen_initcalls_lds) + + targets := .tmp_initcalls.lds + +diff --git a/scripts/as-version.sh b/scripts/as-version.sh +index 1a21495e9ff0..af717476152d 100755 +--- a/scripts/as-version.sh ++++ b/scripts/as-version.sh +@@ -45,7 +45,7 @@ orig_args="$@" + # Get the first line of the --version output. + IFS=' + ' +-set -- $(LC_ALL=C "$@" -Wa,--version -c -x assembler /dev/null -o /dev/null 2>/dev/null) ++set -- $(LC_ALL=C "$@" -Wa,--version -c -x assembler-with-cpp /dev/null -o /dev/null 2>/dev/null) + + # Split the line on spaces. + IFS=' ' +diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening +index 53baa95cb644..0f295961e773 100644 +--- a/security/Kconfig.hardening ++++ b/security/Kconfig.hardening +@@ -281,6 +281,9 @@ endmenu + + config CC_HAS_RANDSTRUCT + def_bool $(cc-option,-frandomize-layout-seed-file=/dev/null) ++ # Randstruct was first added in Clang 15, but it isn't safe to use until ++ # Clang 16 due to https://github.com/llvm/llvm-project/issues/60349 ++ depends on !CC_IS_CLANG || CLANG_VERSION >= 160000 + + choice + prompt "Randomize layout of sensitive kernel structures" +diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c +index f7815ee24f83..e94b0a6b96df 100644 +--- a/sound/pci/hda/cs35l41_hda.c ++++ b/sound/pci/hda/cs35l41_hda.c +@@ -1240,7 +1240,7 @@ static int cs35l41_no_acpi_dsd(struct cs35l41_hda *cs35l41, struct device *physd + + if (strncmp(hid, "CLSA0100", 8) == 0) { + hw_cfg->bst_type = CS35L41_EXT_BOOST_NO_VSPK_SWITCH; +- } else if (strncmp(hid, "CLSA0101", 8) == 0) { ++ } else if (strncmp(hid, "CLSA0101", 8) == 0 || strncmp(hid, "CSC3551", 7) == 0) { + hw_cfg->bst_type = CS35L41_EXT_BOOST; + hw_cfg->gpio1.func = CS35l41_VSPK_SWITCH; + hw_cfg->gpio1.valid = true; +diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/vm/ksm_functional_tests.c +index b11b7e5115dc..3033cd6ed3b4 100644 +--- a/tools/testing/selftests/vm/ksm_functional_tests.c ++++ b/tools/testing/selftests/vm/ksm_functional_tests.c +@@ -24,9 +24,12 @@ + + #define KiB 1024u + #define MiB (1024 * KiB) ++#define PageSize (4 * KiB) + + static int ksm_fd; + static int ksm_full_scans_fd; ++static int ksm_zero_pages_fd; ++static int ksm_use_zero_pages_fd; + static int pagemap_fd; + static size_t pagesize; + +@@ -57,6 +60,21 @@ static bool range_maps_duplicates(char *addr, unsigned long size) + return false; + } + ++static long ksm_get_zero_pages(void) ++{ ++ char buf[20]; ++ ssize_t read_size; ++ unsigned long ksm_zero_pages; ++ ++ read_size = pread(ksm_zero_pages_fd, buf, sizeof(buf) - 1, 0); ++ if (read_size < 0) ++ return -errno; ++ buf[read_size] = 0; ++ ksm_zero_pages = strtol(buf, NULL, 10); ++ ++ return ksm_zero_pages; ++} ++ + static long ksm_get_full_scans(void) + { + char buf[10]; +@@ -70,15 +88,12 @@ static long ksm_get_full_scans(void) + return strtol(buf, NULL, 10); + } + +-static int 
ksm_merge(void) ++static int wait_two_full_scans(void) + { + long start_scans, end_scans; + +- /* Wait for two full scans such that any possible merging happened. */ + start_scans = ksm_get_full_scans(); + if (start_scans < 0) +- return start_scans; +- if (write(ksm_fd, "1", 1) != 1) + return -errno; + do { + end_scans = ksm_get_full_scans(); +@@ -89,6 +104,34 @@ static int ksm_merge(void) + return 0; + } + ++static inline int ksm_merge(void) ++{ ++ /* Wait for two full scans such that any possible merging happened. */ ++ if (write(ksm_fd, "1", 1) != 1) ++ return -errno; ++ ++ return wait_two_full_scans(); ++} ++ ++static int unmerge_zero_page(char *start, unsigned long size) ++{ ++ int ret; ++ ++ ret = madvise(start, size, MADV_UNMERGEABLE); ++ if (ret) { ++ ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); ++ return ret; ++ } ++ ++ /* ++ * Wait for two full scans such that any possible unmerging of zero ++ * pages happened. Why? Because the unmerge action of zero pages is not ++ * done in the context of madvise(), but in the context of ++ * unshare_zero_pages() of the ksmd thread. ++ */ ++ return wait_two_full_scans(); ++} ++ + static char *mmap_and_merge_range(char val, unsigned long size) + { + char *map; +@@ -146,6 +189,48 @@ static void test_unmerge(void) + munmap(map, size); + } + ++static void test_unmerge_zero_pages(void) ++{ ++ const unsigned int size = 2 * MiB; ++ char *map; ++ unsigned long pages_expected; ++ ++ ksft_print_msg("[RUN] %s\n", __func__); ++ ++ /* Confirm the interfaces*/ ++ if (ksm_zero_pages_fd < 0) { ++ ksft_test_result_skip("open(\"/sys/kernel/mm/ksm/zero_pages_sharing\") failed\n"); ++ return; ++ } ++ if (ksm_use_zero_pages_fd < 0) { ++ ksft_test_result_skip("open \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); ++ return; ++ } ++ if (write(ksm_use_zero_pages_fd, "1", 1) != 1) { ++ ksft_test_result_skip("write \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); ++ return; ++ } ++ ++ /* Mmap zero pages*/ ++ map = mmap_and_merge_range(0x00, size); ++ if (map == MAP_FAILED) ++ return; ++ ++ if (unmerge_zero_page(map + size / 2, size / 2)) ++ goto unmap; ++ ++ /* Check if zero_pages_sharing can be update correctly when unmerge */ ++ pages_expected = (size / 2) / PageSize; ++ ksft_test_result(pages_expected == ksm_get_zero_pages(), ++ "zero page count react to unmerge\n"); ++ ++ /* Check if ksm zero pages are really unmerged */ ++ ksft_test_result(!range_maps_duplicates(map + size / 2, size / 2), ++ "KSM zero pages were unmerged\n"); ++unmap: ++ munmap(map, size); ++} ++ + static void test_unmerge_discarded(void) + { + const unsigned int size = 2 * MiB; +@@ -264,8 +349,11 @@ int main(int argc, char **argv) + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); ++ ksm_zero_pages_fd = open("/sys/kernel/mm/ksm/zero_pages_sharing", O_RDONLY); ++ ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR); + + test_unmerge(); ++ test_unmerge_zero_pages(); + test_unmerge_discarded(); + #ifdef __NR_userfaultfd + test_unmerge_uffd_wp(); +-- +2.39.2 + +From d1c5ae2d043a5ae09cbe88ad8f21e4753ced9418 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 13 Feb 2023 11:27:09 +0100 +Subject: [PATCH 07/15] fs-patches + +Signed-off-by: Peter Jung +--- + Documentation/admin-guide/xfs.rst | 2 +- + block/blk-merge.c | 3 +- + fs/btrfs/Makefile | 6 +- + fs/btrfs/backref.c | 33 +- + fs/btrfs/bio.c | 557 +++++++++++++++++++++--- + fs/btrfs/bio.h | 67 +-- + fs/btrfs/block-group.c | 273 
++++++++++-- + fs/btrfs/block-group.h | 24 +- + fs/btrfs/btrfs_inode.h | 22 +- + fs/btrfs/compression.c | 276 ++---------- + fs/btrfs/compression.h | 3 - + fs/btrfs/ctree.c | 62 ++- + fs/btrfs/ctree.h | 15 + + fs/btrfs/defrag.c | 4 +- + fs/btrfs/delayed-ref.c | 24 +- + fs/btrfs/delayed-ref.h | 2 +- + fs/btrfs/discard.c | 41 +- + fs/btrfs/disk-io.c | 225 +--------- + fs/btrfs/disk-io.h | 14 +- + fs/btrfs/extent-io-tree.c | 10 +- + fs/btrfs/extent-io-tree.h | 1 - + fs/btrfs/extent-tree.c | 181 +++----- + fs/btrfs/extent-tree.h | 81 ++++ + fs/btrfs/extent_io.c | 582 +++---------------------- + fs/btrfs/extent_io.h | 36 +- + fs/btrfs/file-item.c | 72 ++-- + fs/btrfs/file-item.h | 8 +- + fs/btrfs/file.c | 2 +- + fs/btrfs/free-space-tree.c | 2 +- + fs/btrfs/fs.c | 4 + + fs/btrfs/fs.h | 11 +- + fs/btrfs/inode.c | 641 ++++------------------------ + fs/btrfs/ioctl.c | 2 +- + fs/btrfs/lru_cache.c | 166 ++++++++ + fs/btrfs/lru_cache.h | 80 ++++ + fs/btrfs/lzo.c | 2 +- + fs/btrfs/messages.c | 30 -- + fs/btrfs/messages.h | 34 -- + fs/btrfs/ordered-data.c | 25 +- + fs/btrfs/ordered-data.h | 3 +- + fs/btrfs/qgroup.c | 2 +- + fs/btrfs/raid56.c | 334 ++++++--------- + fs/btrfs/raid56.h | 4 +- + fs/btrfs/relocation.c | 2 +- + fs/btrfs/scrub.c | 51 ++- + fs/btrfs/send.c | 684 ++++++++++++++++-------------- + fs/btrfs/super.c | 3 +- + fs/btrfs/sysfs.c | 41 +- + fs/btrfs/sysfs.h | 3 +- + fs/btrfs/tests/extent-map-tests.c | 2 +- + fs/btrfs/transaction.c | 34 ++ + fs/btrfs/transaction.h | 31 ++ + fs/btrfs/tree-log.c | 87 ++-- + fs/btrfs/tree-log.h | 9 +- + fs/btrfs/volumes.c | 116 ++--- + fs/btrfs/volumes.h | 18 - + fs/btrfs/zoned.c | 146 +++---- + fs/btrfs/zoned.h | 20 +- + fs/gfs2/bmap.c | 38 +- + fs/iomap/buffered-io.c | 91 ++-- + fs/iomap/direct-io.c | 10 +- + fs/xfs/libxfs/xfs_alloc.c | 32 +- + fs/xfs/libxfs/xfs_bmap.c | 32 +- + fs/xfs/libxfs/xfs_bmap.h | 5 +- + fs/xfs/libxfs/xfs_btree.c | 18 +- + fs/xfs/libxfs/xfs_refcount.c | 96 ++--- + fs/xfs/libxfs/xfs_refcount.h | 4 +- + fs/xfs/libxfs/xfs_rmap.c | 50 ++- + fs/xfs/libxfs/xfs_rmap.h | 6 +- + fs/xfs/xfs_bmap_item.c | 137 +++--- + fs/xfs/xfs_error.c | 2 +- + fs/xfs/xfs_error.h | 12 +- + fs/xfs/xfs_extfree_item.c | 99 +++-- + fs/xfs/xfs_globals.c | 3 +- + fs/xfs/xfs_iomap.c | 4 +- + fs/xfs/xfs_refcount_item.c | 110 +++-- + fs/xfs/xfs_rmap_item.c | 142 +++---- + fs/xfs/xfs_sysfs.c | 12 +- + fs/xfs/xfs_sysfs.h | 10 +- + fs/xfs/xfs_trace.h | 15 +- + include/linux/bio.h | 4 + + include/linux/iomap.h | 30 +- + include/trace/events/btrfs.h | 127 +++++- + 83 files changed, 2936 insertions(+), 3366 deletions(-) + create mode 100644 fs/btrfs/lru_cache.c + create mode 100644 fs/btrfs/lru_cache.h + +diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst +index 8de008c0c5ad..e2561416391c 100644 +--- a/Documentation/admin-guide/xfs.rst ++++ b/Documentation/admin-guide/xfs.rst +@@ -296,7 +296,7 @@ The following sysctls are available for the XFS filesystem: + XFS_ERRLEVEL_LOW: 1 + XFS_ERRLEVEL_HIGH: 5 + +- fs.xfs.panic_mask (Min: 0 Default: 0 Max: 256) ++ fs.xfs.panic_mask (Min: 0 Default: 0 Max: 511) + Causes certain error conditions to call BUG(). 
Value is a bitmask; + OR together the tags which represent errors which should cause panics: + +diff --git a/block/blk-merge.c b/block/blk-merge.c +index b7c193d67185..64bf7d9dd8e8 100644 +--- a/block/blk-merge.c ++++ b/block/blk-merge.c +@@ -276,7 +276,7 @@ static bool bvec_split_segs(const struct queue_limits *lim, + * responsible for ensuring that @bs is only destroyed after processing of the + * split bio has finished. + */ +-static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, ++struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, struct bio_set *bs, unsigned max_bytes) + { + struct bio_vec bv, bvprv, *bvprvp = NULL; +@@ -336,6 +336,7 @@ static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, + bio_clear_polled(bio); + return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs); + } ++EXPORT_SYMBOL_GPL(bio_split_rw); + + /** + * __bio_split_to_limits - split a bio to fit the queue limits +diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile +index 555c962fdad6..90d53209755b 100644 +--- a/fs/btrfs/Makefile ++++ b/fs/btrfs/Makefile +@@ -11,7 +11,8 @@ condflags := \ + $(call cc-option, -Wunused-but-set-variable) \ + $(call cc-option, -Wunused-const-variable) \ + $(call cc-option, -Wpacked-not-aligned) \ +- $(call cc-option, -Wstringop-truncation) ++ $(call cc-option, -Wstringop-truncation) \ ++ $(call cc-option, -Wmaybe-uninitialized) + subdir-ccflags-y += $(condflags) + # The following turn off the warnings enabled by -Wextra + subdir-ccflags-y += -Wno-missing-field-initializers +@@ -31,7 +32,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ + uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ + block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ +- subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o ++ subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \ ++ lru_cache.o + + btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o + btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o +diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c +index 46851511b661..90e40d5ceccd 100644 +--- a/fs/btrfs/backref.c ++++ b/fs/btrfs/backref.c +@@ -1252,8 +1252,12 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct + struct btrfs_root *root, + u64 bytenr, int level, bool *is_shared) + { ++ const struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_backref_shared_cache_entry *entry; + ++ if (!current->journal_info) ++ lockdep_assert_held(&fs_info->commit_root_sem); ++ + if (!ctx->use_path_cache) + return false; + +@@ -1288,7 +1292,7 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ct + * could be a snapshot sharing this extent buffer. 
+ */ + if (entry->is_shared && +- entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) ++ entry->gen != btrfs_get_last_root_drop_gen(fs_info)) + return false; + + *is_shared = entry->is_shared; +@@ -1318,9 +1322,13 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx + struct btrfs_root *root, + u64 bytenr, int level, bool is_shared) + { ++ const struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_backref_shared_cache_entry *entry; + u64 gen; + ++ if (!current->journal_info) ++ lockdep_assert_held(&fs_info->commit_root_sem); ++ + if (!ctx->use_path_cache) + return; + +@@ -1336,7 +1344,7 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx + ASSERT(level >= 0); + + if (is_shared) +- gen = btrfs_get_last_root_drop_gen(root->fs_info); ++ gen = btrfs_get_last_root_drop_gen(fs_info); + else + gen = btrfs_root_last_snapshot(&root->root_item); + +@@ -1864,6 +1872,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, + .have_delayed_delete_refs = false, + }; + int level; ++ bool leaf_cached; ++ bool leaf_is_shared; + + for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) { + if (ctx->prev_extents_cache[i].bytenr == bytenr) +@@ -1885,6 +1895,23 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, + walk_ctx.time_seq = elem.seq; + } + ++ ctx->use_path_cache = true; ++ ++ /* ++ * We may have previously determined that the current leaf is shared. ++ * If it is, then we have a data extent that is shared due to a shared ++ * subtree (caused by snapshotting) and we don't need to check for data ++ * backrefs. If the leaf is not shared, then we must do backref walking ++ * to determine if the data extent is shared through reflinks. ++ */ ++ leaf_cached = lookup_backref_shared_cache(ctx, root, ++ ctx->curr_leaf_bytenr, 0, ++ &leaf_is_shared); ++ if (leaf_cached && leaf_is_shared) { ++ ret = 1; ++ goto out_trans; ++ } ++ + walk_ctx.ignore_extent_item_pos = true; + walk_ctx.trans = trans; + walk_ctx.fs_info = fs_info; +@@ -1893,7 +1920,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, + /* -1 means we are in the bytenr of the data extent. */ + level = -1; + ULIST_ITER_INIT(&uiter); +- ctx->use_path_cache = true; + while (1) { + bool is_shared; + bool cached; +@@ -1964,6 +1990,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, + ctx->prev_extents_cache_slot = slot; + } + ++out_trans: + if (trans) { + btrfs_put_tree_mod_seq(fs_info, &elem); + btrfs_end_transaction(trans); +diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c +index 8affc88b0e0a..d8b90f95b157 100644 +--- a/fs/btrfs/bio.c ++++ b/fs/btrfs/bio.c +@@ -14,19 +14,31 @@ + #include "dev-replace.h" + #include "rcu-string.h" + #include "zoned.h" ++#include "file-item.h" + + static struct bio_set btrfs_bioset; ++static struct bio_set btrfs_clone_bioset; ++static struct bio_set btrfs_repair_bioset; ++static mempool_t btrfs_failed_bio_pool; ++ ++struct btrfs_failed_bio { ++ struct btrfs_bio *bbio; ++ int num_copies; ++ atomic_t repair_count; ++}; + + /* + * Initialize a btrfs_bio structure. This skips the embedded bio itself as it + * is already initialized by the block layer. 
+ */ +-static inline void btrfs_bio_init(struct btrfs_bio *bbio, +- btrfs_bio_end_io_t end_io, void *private) ++void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, ++ btrfs_bio_end_io_t end_io, void *private) + { + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); ++ bbio->inode = inode; + bbio->end_io = end_io; + bbio->private = private; ++ atomic_set(&bbio->pending_ios, 1); + } + + /* +@@ -37,32 +49,235 @@ static inline void btrfs_bio_init(struct btrfs_bio *bbio, + * a mempool. + */ + struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, ++ struct btrfs_inode *inode, + btrfs_bio_end_io_t end_io, void *private) + { + struct bio *bio; + + bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); +- btrfs_bio_init(btrfs_bio(bio), end_io, private); ++ btrfs_bio_init(btrfs_bio(bio), inode, end_io, private); + return bio; + } + +-struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, +- btrfs_bio_end_io_t end_io, void *private) ++static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, ++ struct bio *orig, u64 map_length, ++ bool use_append) + { ++ struct btrfs_bio *orig_bbio = btrfs_bio(orig); + struct bio *bio; +- struct btrfs_bio *bbio; + +- ASSERT(offset <= UINT_MAX && size <= UINT_MAX); ++ if (use_append) { ++ unsigned int nr_segs; ++ ++ bio = bio_split_rw(orig, &fs_info->limits, &nr_segs, ++ &btrfs_clone_bioset, map_length); ++ } else { ++ bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS, ++ &btrfs_clone_bioset); ++ } ++ btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio); + +- bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); +- bbio = btrfs_bio(bio); +- btrfs_bio_init(bbio, end_io, private); ++ btrfs_bio(bio)->file_offset = orig_bbio->file_offset; ++ if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED)) ++ orig_bbio->file_offset += map_length; + +- bio_trim(bio, offset >> 9, size >> 9); +- bbio->iter = bio->bi_iter; ++ atomic_inc(&orig_bbio->pending_ios); + return bio; + } + ++static void btrfs_orig_write_end_io(struct bio *bio); ++ ++static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, ++ struct btrfs_bio *orig_bbio) ++{ ++ /* ++ * For writes we tolerate nr_mirrors - 1 write failures, so we can't ++ * just blindly propagate a write failure here. Instead increment the ++ * error count in the original I/O context so that it is guaranteed to ++ * be larger than the error tolerance. 
++ */ ++ if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { ++ struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; ++ struct btrfs_io_context *orig_bioc = orig_stripe->bioc; ++ ++ atomic_add(orig_bioc->max_errors, &orig_bioc->error); ++ } else { ++ orig_bbio->bio.bi_status = bbio->bio.bi_status; ++ } ++} ++ ++static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) ++{ ++ if (bbio->bio.bi_pool == &btrfs_clone_bioset) { ++ struct btrfs_bio *orig_bbio = bbio->private; ++ ++ if (bbio->bio.bi_status) ++ btrfs_bbio_propagate_error(bbio, orig_bbio); ++ bio_put(&bbio->bio); ++ bbio = orig_bbio; ++ } ++ ++ if (atomic_dec_and_test(&bbio->pending_ios)) ++ bbio->end_io(bbio); ++} ++ ++static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) ++{ ++ if (cur_mirror == fbio->num_copies) ++ return cur_mirror + 1 - fbio->num_copies; ++ return cur_mirror + 1; ++} ++ ++static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) ++{ ++ if (cur_mirror == 1) ++ return fbio->num_copies; ++ return cur_mirror - 1; ++} ++ ++static void btrfs_repair_done(struct btrfs_failed_bio *fbio) ++{ ++ if (atomic_dec_and_test(&fbio->repair_count)) { ++ btrfs_orig_bbio_end_io(fbio->bbio); ++ mempool_free(fbio, &btrfs_failed_bio_pool); ++ } ++} ++ ++static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, ++ struct btrfs_device *dev) ++{ ++ struct btrfs_failed_bio *fbio = repair_bbio->private; ++ struct btrfs_inode *inode = repair_bbio->inode; ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); ++ int mirror = repair_bbio->mirror_num; ++ ++ if (repair_bbio->bio.bi_status || ++ !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { ++ bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); ++ repair_bbio->bio.bi_iter = repair_bbio->saved_iter; ++ ++ mirror = next_repair_mirror(fbio, mirror); ++ if (mirror == fbio->bbio->mirror_num) { ++ btrfs_debug(fs_info, "no mirror left"); ++ fbio->bbio->bio.bi_status = BLK_STS_IOERR; ++ goto done; ++ } ++ ++ btrfs_submit_bio(&repair_bbio->bio, mirror); ++ return; ++ } ++ ++ do { ++ mirror = prev_repair_mirror(fbio, mirror); ++ btrfs_repair_io_failure(fs_info, btrfs_ino(inode), ++ repair_bbio->file_offset, fs_info->sectorsize, ++ repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, ++ bv->bv_page, bv->bv_offset, mirror); ++ } while (mirror != fbio->bbio->mirror_num); ++ ++done: ++ btrfs_repair_done(fbio); ++ bio_put(&repair_bbio->bio); ++} ++ ++/* ++ * Try to kick off a repair read to the next available mirror for a bad sector. ++ * ++ * This primarily tries to recover good data to serve the actual read request, ++ * but also tries to write the good data back to the bad mirror(s) when a ++ * read succeeded to restore the redundancy. 
++ */ ++static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, ++ u32 bio_offset, ++ struct bio_vec *bv, ++ struct btrfs_failed_bio *fbio) ++{ ++ struct btrfs_inode *inode = failed_bbio->inode; ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ const u32 sectorsize = fs_info->sectorsize; ++ const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); ++ struct btrfs_bio *repair_bbio; ++ struct bio *repair_bio; ++ int num_copies; ++ int mirror; ++ ++ btrfs_debug(fs_info, "repair read error: read error at %llu", ++ failed_bbio->file_offset + bio_offset); ++ ++ num_copies = btrfs_num_copies(fs_info, logical, sectorsize); ++ if (num_copies == 1) { ++ btrfs_debug(fs_info, "no copy to repair from"); ++ failed_bbio->bio.bi_status = BLK_STS_IOERR; ++ return fbio; ++ } ++ ++ if (!fbio) { ++ fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS); ++ fbio->bbio = failed_bbio; ++ fbio->num_copies = num_copies; ++ atomic_set(&fbio->repair_count, 1); ++ } ++ ++ atomic_inc(&fbio->repair_count); ++ ++ repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, ++ &btrfs_repair_bioset); ++ repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; ++ bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); ++ ++ repair_bbio = btrfs_bio(repair_bio); ++ btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio); ++ repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; ++ ++ mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); ++ btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); ++ btrfs_submit_bio(repair_bio, mirror); ++ return fbio; ++} ++ ++static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev) ++{ ++ struct btrfs_inode *inode = bbio->inode; ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ u32 sectorsize = fs_info->sectorsize; ++ struct bvec_iter *iter = &bbio->saved_iter; ++ blk_status_t status = bbio->bio.bi_status; ++ struct btrfs_failed_bio *fbio = NULL; ++ u32 offset = 0; ++ ++ /* ++ * Hand off repair bios to the repair code as there is no upper level ++ * submitter for them. ++ */ ++ if (bbio->bio.bi_pool == &btrfs_repair_bioset) { ++ btrfs_end_repair_bio(bbio, dev); ++ return; ++ } ++ ++ /* Clear the I/O error. A failed repair will reset it. */ ++ bbio->bio.bi_status = BLK_STS_OK; ++ ++ while (iter->bi_size) { ++ struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter); ++ ++ bv.bv_len = min(bv.bv_len, sectorsize); ++ if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv)) ++ fbio = repair_one_sector(bbio, offset, &bv, fbio); ++ ++ bio_advance_iter_single(&bbio->bio, iter, sectorsize); ++ offset += sectorsize; ++ } ++ ++ if (bbio->csum != bbio->csum_inline) ++ kfree(bbio->csum); ++ ++ if (fbio) ++ btrfs_repair_done(fbio); ++ else ++ btrfs_orig_bbio_end_io(bbio); ++} ++ + static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) + { + if (!dev || !dev->bdev) +@@ -90,24 +305,31 @@ static void btrfs_end_bio_work(struct work_struct *work) + { + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + +- bbio->end_io(bbio); ++ /* Metadata reads are checked and repaired by the submitter. 
*/ ++ if (bbio->bio.bi_opf & REQ_META) ++ bbio->end_io(bbio); ++ else ++ btrfs_check_read_bio(bbio, bbio->bio.bi_private); + } + + static void btrfs_simple_end_io(struct bio *bio) + { +- struct btrfs_fs_info *fs_info = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); ++ struct btrfs_device *dev = bio->bi_private; ++ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + + btrfs_bio_counter_dec(fs_info); + + if (bio->bi_status) +- btrfs_log_dev_io_error(bio, bbio->device); ++ btrfs_log_dev_io_error(bio, dev); + + if (bio_op(bio) == REQ_OP_READ) { + INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); + queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); + } else { +- bbio->end_io(bbio); ++ if (bio_op(bio) == REQ_OP_ZONE_APPEND) ++ btrfs_record_physical_zoned(bbio); ++ btrfs_orig_bbio_end_io(bbio); + } + } + +@@ -118,7 +340,10 @@ static void btrfs_raid56_end_io(struct bio *bio) + + btrfs_bio_counter_dec(bioc->fs_info); + bbio->mirror_num = bioc->mirror_num; +- bbio->end_io(bbio); ++ if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META)) ++ btrfs_check_read_bio(bbio, NULL); ++ else ++ btrfs_orig_bbio_end_io(bbio); + + btrfs_put_bioc(bioc); + } +@@ -145,7 +370,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) + else + bio->bi_status = BLK_STS_OK; + +- bbio->end_io(bbio); ++ btrfs_orig_bbio_end_io(bbio); + btrfs_put_bioc(bioc); + } + +@@ -181,16 +406,10 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; ++ u64 zone_start = round_down(physical, dev->fs_info->zone_size); + +- if (btrfs_dev_is_sequential(dev, physical)) { +- u64 zone_start = round_down(physical, +- dev->fs_info->zone_size); +- +- bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; +- } else { +- bio->bi_opf &= ~REQ_OP_ZONE_APPEND; +- bio->bi_opf |= REQ_OP_WRITE; +- } ++ ASSERT(btrfs_dev_is_sequential(dev, physical)); ++ bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; + } + btrfs_debug_in_rcu(dev->fs_info, + "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", +@@ -224,41 +443,21 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) + btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); + } + +-void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) ++static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, ++ struct btrfs_io_stripe *smap, int mirror_num) + { +- u64 logical = bio->bi_iter.bi_sector << 9; +- u64 length = bio->bi_iter.bi_size; +- u64 map_length = length; +- struct btrfs_io_context *bioc = NULL; +- struct btrfs_io_stripe smap; +- int ret; +- +- btrfs_bio_counter_inc_blocked(fs_info); +- ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, +- &bioc, &smap, &mirror_num, 1); +- if (ret) { +- btrfs_bio_counter_dec(fs_info); +- btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); +- return; +- } +- +- if (map_length < length) { +- btrfs_crit(fs_info, +- "mapping failed logical %llu bio len %llu len %llu", +- logical, length, map_length); +- BUG(); +- } ++ /* Do not leak our private flag into the block layer. */ ++ bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED; + + if (!bioc) { +- /* Single mirror read/write fast path */ ++ /* Single mirror read/write fast path. 
*/ + btrfs_bio(bio)->mirror_num = mirror_num; +- btrfs_bio(bio)->device = smap.dev; +- bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; +- bio->bi_private = fs_info; ++ bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; ++ bio->bi_private = smap->dev; + bio->bi_end_io = btrfs_simple_end_io; +- btrfs_submit_dev_bio(smap.dev, bio); ++ btrfs_submit_dev_bio(smap->dev, bio); + } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { +- /* Parity RAID write or read recovery */ ++ /* Parity RAID write or read recovery. */ + bio->bi_private = bioc; + bio->bi_end_io = btrfs_raid56_end_io; + if (bio_op(bio) == REQ_OP_READ) +@@ -266,16 +465,233 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror + else + raid56_parity_write(bio, bioc); + } else { +- /* Write to multiple mirrors */ ++ /* Write to multiple mirrors. */ + int total_devs = bioc->num_stripes; +- int dev_nr; + + bioc->orig_bio = bio; +- for (dev_nr = 0; dev_nr < total_devs; dev_nr++) ++ for (int dev_nr = 0; dev_nr < total_devs; dev_nr++) + btrfs_submit_mirrored_bio(bioc, dev_nr); + } + } + ++static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) ++{ ++ if (bbio->bio.bi_opf & REQ_META) ++ return btree_csum_one_bio(bbio); ++ return btrfs_csum_one_bio(bbio); ++} ++ ++/* ++ * Async submit bios are used to offload expensive checksumming onto the worker ++ * threads. ++ */ ++struct async_submit_bio { ++ struct btrfs_bio *bbio; ++ struct btrfs_io_context *bioc; ++ struct btrfs_io_stripe smap; ++ int mirror_num; ++ struct btrfs_work work; ++}; ++ ++/* ++ * In order to insert checksums into the metadata in large chunks, we wait ++ * until bio submission time. All the pages in the bio are checksummed and ++ * sums are attached onto the ordered extent record. ++ * ++ * At IO completion time the csums attached on the ordered extent record are ++ * inserted into the btree. ++ */ ++static void run_one_async_start(struct btrfs_work *work) ++{ ++ struct async_submit_bio *async = ++ container_of(work, struct async_submit_bio, work); ++ blk_status_t ret; ++ ++ ret = btrfs_bio_csum(async->bbio); ++ if (ret) ++ async->bbio->bio.bi_status = ret; ++} ++ ++/* ++ * In order to insert checksums into the metadata in large chunks, we wait ++ * until bio submission time. All the pages in the bio are checksummed and ++ * sums are attached onto the ordered extent record. ++ * ++ * At IO completion time the csums attached on the ordered extent record are ++ * inserted into the tree. ++ */ ++static void run_one_async_done(struct btrfs_work *work) ++{ ++ struct async_submit_bio *async = ++ container_of(work, struct async_submit_bio, work); ++ struct bio *bio = &async->bbio->bio; ++ ++ /* If an error occurred we just want to clean up the bio and move on. */ ++ if (bio->bi_status) { ++ btrfs_orig_bbio_end_io(async->bbio); ++ return; ++ } ++ ++ /* ++ * All of the bios that pass through here are from async helpers. ++ * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. ++ * This changes nothing when cgroups aren't in use. ++ */ ++ bio->bi_opf |= REQ_CGROUP_PUNT; ++ __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); ++} ++ ++static void run_one_async_free(struct btrfs_work *work) ++{ ++ kfree(container_of(work, struct async_submit_bio, work)); ++} ++ ++static bool should_async_write(struct btrfs_bio *bbio) ++{ ++ /* ++ * If the I/O is not issued by fsync and friends, (->sync_writers != 0), ++ * then try to defer the submission to a workqueue to parallelize the ++ * checksum calculation. 
++ */ ++ if (atomic_read(&bbio->inode->sync_writers)) ++ return false; ++ ++ /* ++ * Submit metadata writes synchronously if the checksum implementation ++ * is fast, or we are on a zoned device that wants I/O to be submitted ++ * in order. ++ */ ++ if (bbio->bio.bi_opf & REQ_META) { ++ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; ++ ++ if (btrfs_is_zoned(fs_info)) ++ return false; ++ if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) ++ return false; ++ } ++ ++ return true; ++} ++ ++/* ++ * Submit bio to an async queue. ++ * ++ * Return true if the work has been succesfuly submitted, else false. ++ */ ++static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, ++ struct btrfs_io_context *bioc, ++ struct btrfs_io_stripe *smap, int mirror_num) ++{ ++ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; ++ struct async_submit_bio *async; ++ ++ async = kmalloc(sizeof(*async), GFP_NOFS); ++ if (!async) ++ return false; ++ ++ async->bbio = bbio; ++ async->bioc = bioc; ++ async->smap = *smap; ++ async->mirror_num = mirror_num; ++ ++ btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, ++ run_one_async_free); ++ if (op_is_sync(bbio->bio.bi_opf)) ++ btrfs_queue_work(fs_info->hipri_workers, &async->work); ++ else ++ btrfs_queue_work(fs_info->workers, &async->work); ++ return true; ++} ++ ++static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) ++{ ++ struct btrfs_bio *bbio = btrfs_bio(bio); ++ struct btrfs_inode *inode = bbio->inode; ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ struct btrfs_bio *orig_bbio = bbio; ++ u64 logical = bio->bi_iter.bi_sector << 9; ++ u64 length = bio->bi_iter.bi_size; ++ u64 map_length = length; ++ bool use_append = btrfs_use_zone_append(bbio); ++ struct btrfs_io_context *bioc = NULL; ++ struct btrfs_io_stripe smap; ++ blk_status_t ret; ++ int error; ++ ++ btrfs_bio_counter_inc_blocked(fs_info); ++ error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, ++ &bioc, &smap, &mirror_num, 1); ++ if (error) { ++ ret = errno_to_blk_status(error); ++ goto fail; ++ } ++ ++ map_length = min(map_length, length); ++ if (use_append) ++ map_length = min(map_length, fs_info->max_zone_append_size); ++ ++ if (map_length < length) { ++ bio = btrfs_split_bio(fs_info, bio, map_length, use_append); ++ bbio = btrfs_bio(bio); ++ } ++ ++ /* ++ * Save the iter for the end_io handler and preload the checksums for ++ * data reads. ++ */ ++ if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) { ++ bbio->saved_iter = bio->bi_iter; ++ ret = btrfs_lookup_bio_sums(bbio); ++ if (ret) ++ goto fail_put_bio; ++ } ++ ++ if (btrfs_op(bio) == BTRFS_MAP_WRITE) { ++ if (use_append) { ++ bio->bi_opf &= ~REQ_OP_WRITE; ++ bio->bi_opf |= REQ_OP_ZONE_APPEND; ++ ret = btrfs_extract_ordered_extent(btrfs_bio(bio)); ++ if (ret) ++ goto fail_put_bio; ++ } ++ ++ /* ++ * Csum items for reloc roots have already been cloned at this ++ * point, so they are handled as part of the no-checksum case. 
++ */ ++ if (!(inode->flags & BTRFS_INODE_NODATASUM) && ++ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && ++ !btrfs_is_data_reloc_root(inode->root)) { ++ if (should_async_write(bbio) && ++ btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) ++ goto done; ++ ++ ret = btrfs_bio_csum(bbio); ++ if (ret) ++ goto fail_put_bio; ++ } ++ } ++ ++ __btrfs_submit_bio(bio, bioc, &smap, mirror_num); ++done: ++ return map_length == length; ++ ++fail_put_bio: ++ if (map_length < length) ++ bio_put(bio); ++fail: ++ btrfs_bio_counter_dec(fs_info); ++ btrfs_bio_end_io(orig_bbio, ret); ++ /* Do not submit another chunk */ ++ return true; ++} ++ ++void btrfs_submit_bio(struct bio *bio, int mirror_num) ++{ ++ while (!btrfs_submit_chunk(bio, mirror_num)) ++ ; ++} ++ + /* + * Submit a repair write. + * +@@ -283,7 +699,7 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror + * RAID setup. Here we only want to write the one bad copy, so we do the + * mapping ourselves and submit the bio directly. + * +- * The I/O is issued sychronously to block the repair read completion from ++ * The I/O is issued synchronously to block the repair read completion from + * freeing the bio. + */ + int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, +@@ -381,10 +797,31 @@ int __init btrfs_bioset_init(void) + offsetof(struct btrfs_bio, bio), + BIOSET_NEED_BVECS)) + return -ENOMEM; ++ if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, ++ offsetof(struct btrfs_bio, bio), 0)) ++ goto out_free_bioset; ++ if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, ++ offsetof(struct btrfs_bio, bio), ++ BIOSET_NEED_BVECS)) ++ goto out_free_clone_bioset; ++ if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, ++ sizeof(struct btrfs_failed_bio))) ++ goto out_free_repair_bioset; + return 0; ++ ++out_free_repair_bioset: ++ bioset_exit(&btrfs_repair_bioset); ++out_free_clone_bioset: ++ bioset_exit(&btrfs_clone_bioset); ++out_free_bioset: ++ bioset_exit(&btrfs_bioset); ++ return -ENOMEM; + } + + void __cold btrfs_bioset_exit(void) + { ++ mempool_exit(&btrfs_failed_bio_pool); ++ bioset_exit(&btrfs_repair_bioset); ++ bioset_exit(&btrfs_clone_bioset); + bioset_exit(&btrfs_bioset); + } +diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h +index b12f84b3b341..873ff85817f0 100644 +--- a/fs/btrfs/bio.h ++++ b/fs/btrfs/bio.h +@@ -26,32 +26,23 @@ struct btrfs_fs_info; + typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); + + /* +- * Additional info to pass along bio. +- * +- * Mostly for btrfs specific features like csum and mirror_num. ++ * Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and ++ * passed to btrfs_submit_bio for mapping to the physical devices. + */ + struct btrfs_bio { +- unsigned int mirror_num:7; +- +- /* +- * Extra indicator for metadata bios. +- * For some btrfs bios they use pages without a mapping, thus +- * we can not rely on page->mapping->host to determine if +- * it's a metadata bio. +- */ +- unsigned int is_metadata:1; +- struct bvec_iter iter; +- +- /* for direct I/O */ ++ /* Inode and offset into it that this I/O operates on. */ ++ struct btrfs_inode *inode; + u64 file_offset; + +- /* @device is for stripe IO submission. */ +- struct btrfs_device *device; + union { +- /* For data checksum verification. */ ++ /* ++ * Data checksumming and original I/O information for internal ++ * use in the btrfs_submit_bio machinery. 
++ */ + struct { + u8 *csum; + u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; ++ struct bvec_iter saved_iter; + }; + + /* For metadata parentness verification. */ +@@ -62,7 +53,9 @@ struct btrfs_bio { + btrfs_bio_end_io_t end_io; + void *private; + +- /* For read end I/O handling */ ++ /* For internal use in read end I/O handling */ ++ unsigned int mirror_num; ++ atomic_t pending_ios; + struct work_struct end_io_work; + + /* +@@ -80,11 +73,11 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) + int __init btrfs_bioset_init(void); + void __cold btrfs_bioset_exit(void); + ++void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, ++ btrfs_bio_end_io_t end_io, void *private); + struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, ++ struct btrfs_inode *inode, + btrfs_bio_end_io_t end_io, void *private); +-struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, +- btrfs_bio_end_io_t end_io, void *private); +- + + static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) + { +@@ -92,34 +85,10 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) + bbio->end_io(bbio); + } + +-static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) +-{ +- if (bbio->is_metadata) +- return; +- if (bbio->csum != bbio->csum_inline) { +- kfree(bbio->csum); +- bbio->csum = NULL; +- } +-} ++/* Bio only refers to one ordered extent. */ ++#define REQ_BTRFS_ONE_ORDERED REQ_DRV + +-/* +- * Iterate through a btrfs_bio (@bbio) on a per-sector basis. +- * +- * bvl - struct bio_vec +- * bbio - struct btrfs_bio +- * iters - struct bvec_iter +- * bio_offset - unsigned int +- */ +-#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ +- for ((iter) = (bbio)->iter, (bio_offset) = 0; \ +- (iter).bi_size && \ +- (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ +- (bio_offset) += fs_info->sectorsize, \ +- bio_advance_iter_single(&(bbio)->bio, &(iter), \ +- (fs_info)->sectorsize)) +- +-void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, +- int mirror_num); ++void btrfs_submit_bio(struct bio *bio, int mirror_num); + int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num); +diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c +index 708d843daa72..5b10401d803b 100644 +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -1,5 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0 + ++#include + #include + #include "misc.h" + #include "ctree.h" +@@ -539,6 +540,153 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end + return total_added; + } + ++/* ++ * Get an arbitrary extent item index / max_index through the block group ++ * ++ * @block_group the block group to sample from ++ * @index: the integral step through the block group to grab from ++ * @max_index: the granularity of the sampling ++ * @key: return value parameter for the item we find ++ * ++ * Pre-conditions on indices: ++ * 0 <= index <= max_index ++ * 0 < max_index ++ * ++ * Returns: 0 on success, 1 if the search didn't yield a useful item, negative ++ * error code on error. 
++ */ ++static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, ++ struct btrfs_block_group *block_group, ++ int index, int max_index, ++ struct btrfs_key *key) ++{ ++ struct btrfs_fs_info *fs_info = block_group->fs_info; ++ struct btrfs_root *extent_root; ++ int ret = 0; ++ u64 search_offset; ++ u64 search_end = block_group->start + block_group->length; ++ struct btrfs_path *path; ++ ++ ASSERT(index >= 0); ++ ASSERT(index <= max_index); ++ ASSERT(max_index > 0); ++ lockdep_assert_held(&caching_ctl->mutex); ++ lockdep_assert_held_read(&fs_info->commit_root_sem); ++ ++ path = btrfs_alloc_path(); ++ if (!path) ++ return -ENOMEM; ++ ++ extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, ++ BTRFS_SUPER_INFO_OFFSET)); ++ ++ path->skip_locking = 1; ++ path->search_commit_root = 1; ++ path->reada = READA_FORWARD; ++ ++ search_offset = index * div_u64(block_group->length, max_index); ++ key->objectid = block_group->start + search_offset; ++ key->type = BTRFS_EXTENT_ITEM_KEY; ++ key->offset = 0; ++ ++ while (1) { ++ ret = btrfs_search_forward(extent_root, key, path, 0); ++ if (ret != 0) ++ goto out; ++ /* Success; sampled an extent item in the block group */ ++ if (key->type == BTRFS_EXTENT_ITEM_KEY && ++ key->objectid >= block_group->start && ++ key->objectid + key->offset <= search_end) ++ goto out; ++ ++ /* We can't possibly find a valid extent item anymore */ ++ if (key->objectid >= search_end) { ++ ret = 1; ++ break; ++ } ++ if (key->type < BTRFS_EXTENT_ITEM_KEY) ++ key->type = BTRFS_EXTENT_ITEM_KEY; ++ else ++ key->objectid++; ++ btrfs_release_path(path); ++ up_read(&fs_info->commit_root_sem); ++ mutex_unlock(&caching_ctl->mutex); ++ cond_resched(); ++ mutex_lock(&caching_ctl->mutex); ++ down_read(&fs_info->commit_root_sem); ++ } ++out: ++ lockdep_assert_held(&caching_ctl->mutex); ++ lockdep_assert_held_read(&fs_info->commit_root_sem); ++ btrfs_free_path(path); ++ return ret; ++} ++ ++/* ++ * Best effort attempt to compute a block group's size class while caching it. ++ * ++ * @block_group: the block group we are caching ++ * ++ * We cannot infer the size class while adding free space extents, because that ++ * logic doesn't care about contiguous file extents (it doesn't differentiate ++ * between a 100M extent and 100 contiguous 1M extents). So we need to read the ++ * file extent items. Reading all of them is quite wasteful, because usually ++ * only a handful are enough to give a good answer. Therefore, we just grab 5 of ++ * them at even steps through the block group and pick the smallest size class ++ * we see. Since size class is best effort, and not guaranteed in general, ++ * inaccuracy is acceptable. ++ * ++ * To be more explicit about why this algorithm makes sense: ++ * ++ * If we are caching in a block group from disk, then there are three major cases ++ * to consider: ++ * 1. the block group is well behaved and all extents in it are the same size ++ * class. ++ * 2. the block group is mostly one size class with rare exceptions for last ++ * ditch allocations ++ * 3. the block group was populated before size classes and can have a totally ++ * arbitrary mix of size classes. ++ * ++ * In case 1, looking at any extent in the block group will yield the correct ++ * result. For the mixed cases, taking the minimum size class seems like a good ++ * approximation, since gaps from frees will be usable to the size class. For ++ * 2., a small handful of file extents is likely to yield the right answer. 
For ++ * 3, we can either read every file extent, or admit that this is best effort ++ * anyway and try to stay fast. ++ * ++ * Returns: 0 on success, negative error code on error. ++ */ ++static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl, ++ struct btrfs_block_group *block_group) ++{ ++ struct btrfs_key key; ++ int i; ++ u64 min_size = block_group->length; ++ enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; ++ int ret; ++ ++ if (!btrfs_block_group_should_use_size_class(block_group)) ++ return 0; ++ ++ for (i = 0; i < 5; ++i) { ++ ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); ++ if (ret < 0) ++ goto out; ++ if (ret > 0) ++ continue; ++ min_size = min_t(u64, min_size, key.offset); ++ size_class = btrfs_calc_block_group_size_class(min_size); ++ } ++ if (size_class != BTRFS_BG_SZ_NONE) { ++ spin_lock(&block_group->lock); ++ block_group->size_class = size_class; ++ spin_unlock(&block_group->lock); ++ } ++ ++out: ++ return ret; ++} ++ + static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) + { + struct btrfs_block_group *block_group = caching_ctl->block_group; +@@ -683,6 +831,7 @@ static noinline void caching_thread(struct btrfs_work *work) + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + ++ load_block_group_size_class(caching_ctl, block_group); + if (btrfs_test_opt(fs_info, SPACE_CACHE)) { + ret = load_free_space_cache(block_group); + if (ret == 1) { +@@ -1816,7 +1965,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) + * + * @fs_info: the filesystem + * @chunk_start: logical address of block group +- * @bdev: physical device to resolve, can be NULL to indicate any device + * @physical: physical address to map to logical addresses + * @logical: return array of logical addresses which map to @physical + * @naddrs: length of @logical +@@ -1827,8 +1975,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) + * block copies. 
+ */ + int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, +- struct block_device *bdev, u64 physical, u64 **logical, +- int *naddrs, int *stripe_len) ++ u64 physical, u64 **logical, int *naddrs, int *stripe_len) + { + struct extent_map *em; + struct map_lookup *map; +@@ -1868,9 +2015,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + data_stripe_length)) + continue; + +- if (bdev && map->stripes[i].dev->bdev != bdev) +- continue; +- + stripe_nr = physical - map->stripes[i].physical; + stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); + +@@ -1927,7 +2071,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); +- ret = btrfs_rmap_block(fs_info, cache->start, NULL, ++ ret = btrfs_rmap_block(fs_info, cache->start, + bytenr, &logical, &nr, &stripe_len); + if (ret) + return ret; +@@ -3330,7 +3474,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, + spin_unlock(&info->delalloc_root_lock); + + while (total) { +- bool reclaim; ++ bool reclaim = false; + + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) { +@@ -3379,6 +3523,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, + cache->space_info->disk_used -= num_bytes * factor; + + reclaim = should_reclaim_block_group(cache, num_bytes); ++ + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + +@@ -3433,32 +3578,42 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, + * reservation and return -EAGAIN, otherwise this function always succeeds. + */ + int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, +- u64 ram_bytes, u64 num_bytes, int delalloc) ++ u64 ram_bytes, u64 num_bytes, int delalloc, ++ bool force_wrong_size_class) + { + struct btrfs_space_info *space_info = cache->space_info; ++ enum btrfs_block_group_size_class size_class; + int ret = 0; + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (cache->ro) { + ret = -EAGAIN; +- } else { +- cache->reserved += num_bytes; +- space_info->bytes_reserved += num_bytes; +- trace_btrfs_space_reservation(cache->fs_info, "space_info", +- space_info->flags, num_bytes, 1); +- btrfs_space_info_update_bytes_may_use(cache->fs_info, +- space_info, -ram_bytes); +- if (delalloc) +- cache->delalloc_bytes += num_bytes; ++ goto out; ++ } + +- /* +- * Compression can use less space than we reserved, so wake +- * tickets if that happens +- */ +- if (num_bytes < ram_bytes) +- btrfs_try_granting_tickets(cache->fs_info, space_info); ++ if (btrfs_block_group_should_use_size_class(cache)) { ++ size_class = btrfs_calc_block_group_size_class(num_bytes); ++ ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); ++ if (ret) ++ goto out; + } ++ cache->reserved += num_bytes; ++ space_info->bytes_reserved += num_bytes; ++ trace_btrfs_space_reservation(cache->fs_info, "space_info", ++ space_info->flags, num_bytes, 1); ++ btrfs_space_info_update_bytes_may_use(cache->fs_info, ++ space_info, -ram_bytes); ++ if (delalloc) ++ cache->delalloc_bytes += num_bytes; ++ ++ /* ++ * Compression can use less space than we reserved, so wake tickets if ++ * that happens. 
++ */ ++ if (num_bytes < ram_bytes) ++ btrfs_try_granting_tickets(cache->fs_info, space_info); ++out: + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + return ret; +@@ -4218,3 +4373,73 @@ void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount + bg->swap_extents -= amount; + spin_unlock(&bg->lock); + } ++ ++enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size) ++{ ++ if (size <= SZ_128K) ++ return BTRFS_BG_SZ_SMALL; ++ if (size <= SZ_8M) ++ return BTRFS_BG_SZ_MEDIUM; ++ return BTRFS_BG_SZ_LARGE; ++} ++ ++/* ++ * Handle a block group allocating an extent in a size class ++ * ++ * @bg: The block group we allocated in. ++ * @size_class: The size class of the allocation. ++ * @force_wrong_size_class: Whether we are desperate enough to allow ++ * mismatched size classes. ++ * ++ * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the ++ * case of a race that leads to the wrong size class without ++ * force_wrong_size_class set. ++ * ++ * find_free_extent will skip block groups with a mismatched size class until ++ * it really needs to avoid ENOSPC. In that case it will set ++ * force_wrong_size_class. However, if a block group is newly allocated and ++ * doesn't yet have a size class, then it is possible for two allocations of ++ * different sizes to race and both try to use it. The loser is caught here and ++ * has to retry. ++ */ ++int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, ++ enum btrfs_block_group_size_class size_class, ++ bool force_wrong_size_class) ++{ ++ ASSERT(size_class != BTRFS_BG_SZ_NONE); ++ ++ /* The new allocation is in the right size class, do nothing */ ++ if (bg->size_class == size_class) ++ return 0; ++ /* ++ * The new allocation is in a mismatched size class. ++ * This means one of two things: ++ * ++ * 1. Two tasks in find_free_extent for different size_classes raced ++ * and hit the same empty block_group. Make the loser try again. ++ * 2. A call to find_free_extent got desperate enough to set ++ * 'force_wrong_slab'. Don't change the size_class, but allow the ++ * allocation. ++ */ ++ if (bg->size_class != BTRFS_BG_SZ_NONE) { ++ if (force_wrong_size_class) ++ return 0; ++ return -EAGAIN; ++ } ++ /* ++ * The happy new block group case: the new allocation is the first ++ * one in the block_group so we set size_class. ++ */ ++ bg->size_class = size_class; ++ ++ return 0; ++} ++ ++bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) ++{ ++ if (btrfs_is_zoned(bg->fs_info)) ++ return false; ++ if (!btrfs_is_block_group_data_only(bg)) ++ return false; ++ return true; ++} +diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h +index a02ea76fd6cf..6e4a0b429ac3 100644 +--- a/fs/btrfs/block-group.h ++++ b/fs/btrfs/block-group.h +@@ -12,6 +12,17 @@ enum btrfs_disk_cache_state { + BTRFS_DC_SETUP, + }; + ++enum btrfs_block_group_size_class { ++ /* Unset */ ++ BTRFS_BG_SZ_NONE, ++ /* 0 < size <= 128K */ ++ BTRFS_BG_SZ_SMALL, ++ /* 128K < size <= 8M */ ++ BTRFS_BG_SZ_MEDIUM, ++ /* 8M < size < BG_LENGTH */ ++ BTRFS_BG_SZ_LARGE, ++}; ++ + /* + * This describes the state of the block_group for async discard. 
This is due + * to the two pass nature of it where extent discarding is prioritized over +@@ -233,6 +244,7 @@ struct btrfs_block_group { + struct list_head active_bg_list; + struct work_struct zone_finish_work; + struct extent_buffer *last_eb; ++ enum btrfs_block_group_size_class size_class; + }; + + static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) +@@ -302,7 +314,8 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); + int btrfs_update_block_group(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, bool alloc); + int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, +- u64 ram_bytes, u64 num_bytes, int delalloc); ++ u64 ram_bytes, u64 num_bytes, int delalloc, ++ bool force_wrong_size_class); + void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, + u64 num_bytes, int delalloc); + int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, +@@ -315,8 +328,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); + void btrfs_put_block_group_cache(struct btrfs_fs_info *info); + int btrfs_free_block_groups(struct btrfs_fs_info *info); + int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, +- struct block_device *bdev, u64 physical, u64 **logical, +- int *naddrs, int *stripe_len); ++ u64 physical, u64 **logical, int *naddrs, int *stripe_len); + + static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) + { +@@ -346,4 +358,10 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); + bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg); + void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount); + ++enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); ++int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, ++ enum btrfs_block_group_size_class size_class, ++ bool force_wrong_size_class); ++bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg); ++ + #endif /* BTRFS_BLOCK_GROUP_H */ +diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h +index 195c09e20609..49a92aa65de1 100644 +--- a/fs/btrfs/btrfs_inode.h ++++ b/fs/btrfs/btrfs_inode.h +@@ -93,12 +93,6 @@ struct btrfs_inode { + /* the io_tree does range state (DIRTY, LOCKED etc) */ + struct extent_io_tree io_tree; + +- /* special utility tree used to record which mirrors have already been +- * tried when checksums fail for a given block +- */ +- struct rb_root io_failure_tree; +- spinlock_t io_failure_lock; +- + /* + * Keep track of where the inode has extent items mapped in order to + * make sure the i_size adjustments are accurate +@@ -411,21 +405,11 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, + #define CSUM_FMT "0x%*phN" + #define CSUM_FMT_VALUE(size, bytes) size, bytes + +-void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); +-void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, +- int mirror_num, enum btrfs_compression_type compress_type); +-void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); +-blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); +-blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, +- struct bio *bio, +- u64 dio_file_offset); + int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected); +-int btrfs_check_data_csum(struct 
btrfs_inode *inode, struct btrfs_bio *bbio, +- u32 bio_offset, struct page *page, u32 pgoff); +-unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, +- u32 bio_offset, struct page *page, +- u64 start, u64 end); ++blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio); ++bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, ++ u32 bio_offset, struct bio_vec *bv); + noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes, bool nowait, bool strict); +diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c +index 5122ca79f7ea..f42f31f22d13 100644 +--- a/fs/btrfs/compression.c ++++ b/fs/btrfs/compression.c +@@ -141,12 +141,15 @@ static int compression_decompress(int type, struct list_head *ws, + + static int btrfs_decompress_bio(struct compressed_bio *cb); + +-static void finish_compressed_bio_read(struct compressed_bio *cb) ++static void end_compressed_bio_read(struct btrfs_bio *bbio) + { ++ struct compressed_bio *cb = bbio->private; + unsigned int index; + struct page *page; + +- if (cb->status == BLK_STS_OK) ++ if (bbio->bio.bi_status) ++ cb->status = bbio->bio.bi_status; ++ else + cb->status = errno_to_blk_status(btrfs_decompress_bio(cb)); + + /* Release the compressed pages */ +@@ -162,54 +165,6 @@ static void finish_compressed_bio_read(struct compressed_bio *cb) + /* Finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +-} +- +-/* +- * Verify the checksums and kick off repair if needed on the uncompressed data +- * before decompressing it into the original bio and freeing the uncompressed +- * pages. +- */ +-static void end_compressed_bio_read(struct btrfs_bio *bbio) +-{ +- struct compressed_bio *cb = bbio->private; +- struct inode *inode = cb->inode; +- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +- struct btrfs_inode *bi = BTRFS_I(inode); +- bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) && +- !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); +- blk_status_t status = bbio->bio.bi_status; +- struct bvec_iter iter; +- struct bio_vec bv; +- u32 offset; +- +- btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { +- u64 start = bbio->file_offset + offset; +- +- if (!status && +- (!csum || !btrfs_check_data_csum(bi, bbio, offset, +- bv.bv_page, bv.bv_offset))) { +- btrfs_clean_io_failure(bi, start, bv.bv_page, +- bv.bv_offset); +- } else { +- int ret; +- +- refcount_inc(&cb->pending_ios); +- ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, +- bv.bv_page, bv.bv_offset, +- true); +- if (ret) { +- refcount_dec(&cb->pending_ios); +- status = errno_to_blk_status(ret); +- } +- } +- } +- +- if (status) +- cb->status = status; +- +- if (refcount_dec_and_test(&cb->pending_ios)) +- finish_compressed_bio_read(cb); +- btrfs_bio_free_csum(bbio); + bio_put(&bbio->bio); + } + +@@ -303,68 +258,12 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) + static void end_compressed_bio_write(struct btrfs_bio *bbio) + { + struct compressed_bio *cb = bbio->private; +- +- if (bbio->bio.bi_status) +- cb->status = bbio->bio.bi_status; +- +- if (refcount_dec_and_test(&cb->pending_ios)) { +- struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); +- +- btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio); +- queue_work(fs_info->compressed_write_workers, &cb->write_end_work); +- } +- bio_put(&bbio->bio); +-} +- +-/* +- * Allocate a compressed_bio, which will be used to read/write on-disk +- * (aka, compressed) * 
data. +- * +- * @cb: The compressed_bio structure, which records all the needed +- * information to bind the compressed data to the uncompressed +- * page cache. +- * @disk_byten: The logical bytenr where the compressed data will be read +- * from or written to. +- * @endio_func: The endio function to call after the IO for compressed data +- * is finished. +- * @next_stripe_start: Return value of logical bytenr of where next stripe starts. +- * Let the caller know to only fill the bio up to the stripe +- * boundary. +- */ +- +- +-static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, +- blk_opf_t opf, +- btrfs_bio_end_io_t endio_func, +- u64 *next_stripe_start) +-{ + struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); +- struct btrfs_io_geometry geom; +- struct extent_map *em; +- struct bio *bio; +- int ret; + +- bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb); +- bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; ++ cb->status = bbio->bio.bi_status; ++ queue_work(fs_info->compressed_write_workers, &cb->write_end_work); + +- em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize); +- if (IS_ERR(em)) { +- bio_put(bio); +- return ERR_CAST(em); +- } +- +- if (bio_op(bio) == REQ_OP_ZONE_APPEND) +- bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev); +- +- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom); +- free_extent_map(em); +- if (ret < 0) { +- bio_put(bio); +- return ERR_PTR(ret); +- } +- *next_stripe_start = disk_bytenr + geom.len; +- refcount_inc(&cb->pending_ios); +- return bio; ++ bio_put(&bbio->bio); + } + + /* +@@ -389,18 +288,13 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, + struct bio *bio = NULL; + struct compressed_bio *cb; + u64 cur_disk_bytenr = disk_start; +- u64 next_stripe_start; + blk_status_t ret = BLK_STS_OK; +- int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; +- const bool use_append = btrfs_use_zone_append(inode, disk_start); +- const enum req_op bio_op = use_append ? 
REQ_OP_ZONE_APPEND : REQ_OP_WRITE; + + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(len, fs_info->sectorsize)); + cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); + if (!cb) + return BLK_STS_RESOURCE; +- refcount_set(&cb->pending_ios, 1); + cb->status = BLK_STS_OK; + cb->inode = &inode->vfs_inode; + cb->start = start; +@@ -411,8 +305,16 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, + INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); + cb->nr_pages = nr_pages; + +- if (blkcg_css) ++ if (blkcg_css) { + kthread_associate_blkcg(blkcg_css); ++ write_flags |= REQ_CGROUP_PUNT; ++ } ++ ++ write_flags |= REQ_BTRFS_ONE_ORDERED; ++ bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags, ++ BTRFS_I(cb->inode), end_compressed_bio_write, cb); ++ bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT; ++ btrfs_bio(bio)->file_offset = start; + + while (cur_disk_bytenr < disk_start + compressed_len) { + u64 offset = cur_disk_bytenr - disk_start; +@@ -420,77 +322,30 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, + unsigned int real_size; + unsigned int added; + struct page *page = compressed_pages[index]; +- bool submit = false; +- +- /* Allocate new bio if submitted or not yet allocated */ +- if (!bio) { +- bio = alloc_compressed_bio(cb, cur_disk_bytenr, +- bio_op | write_flags, end_compressed_bio_write, +- &next_stripe_start); +- if (IS_ERR(bio)) { +- ret = errno_to_blk_status(PTR_ERR(bio)); +- break; +- } +- if (blkcg_css) +- bio->bi_opf |= REQ_CGROUP_PUNT; +- } +- /* +- * We should never reach next_stripe_start start as we will +- * submit comp_bio when reach the boundary immediately. +- */ +- ASSERT(cur_disk_bytenr != next_stripe_start); + + /* + * We have various limits on the real read size: +- * - stripe boundary + * - page boundary + * - compressed length boundary + */ +- real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr); +- real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); ++ real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, real_size, compressed_len - offset); + ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); + +- if (use_append) +- added = bio_add_zone_append_page(bio, page, real_size, +- offset_in_page(offset)); +- else +- added = bio_add_page(bio, page, real_size, +- offset_in_page(offset)); +- /* Reached zoned boundary */ +- if (added == 0) +- submit = true; +- ++ added = bio_add_page(bio, page, real_size, offset_in_page(offset)); ++ /* ++ * Maximum compressed extent is smaller than bio size limit, ++ * thus bio_add_page() should always success. ++ */ ++ ASSERT(added == real_size); + cur_disk_bytenr += added; +- /* Reached stripe boundary */ +- if (cur_disk_bytenr == next_stripe_start) +- submit = true; +- +- /* Finished the range */ +- if (cur_disk_bytenr == disk_start + compressed_len) +- submit = true; +- +- if (submit) { +- if (!skip_sum) { +- ret = btrfs_csum_one_bio(inode, bio, start, true); +- if (ret) { +- btrfs_bio_end_io(btrfs_bio(bio), ret); +- break; +- } +- } +- +- ASSERT(bio->bi_iter.bi_size); +- btrfs_submit_bio(fs_info, bio, 0); +- bio = NULL; +- } +- cond_resched(); + } + ++ /* Finished the range. 
*/ ++ ASSERT(bio->bi_iter.bi_size); ++ btrfs_submit_bio(bio, 0); + if (blkcg_css) + kthread_associate_blkcg(NULL); +- +- if (refcount_dec_and_test(&cb->pending_ios)) +- finish_compressed_bio_write(cb); + return ret; + } + +@@ -667,10 +522,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + struct extent_map_tree *em_tree; + struct compressed_bio *cb; + unsigned int compressed_len; +- struct bio *comp_bio = NULL; ++ struct bio *comp_bio; + const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 cur_disk_byte = disk_bytenr; +- u64 next_stripe_start; + u64 file_offset; + u64 em_len; + u64 em_start; +@@ -703,7 +557,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + goto out; + } + +- refcount_set(&cb->pending_ios, 1); + cb->status = BLK_STS_OK; + cb->inode = inode; + +@@ -737,37 +590,23 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + /* include any pages we added in add_ra-bio_pages */ + cb->len = bio->bi_iter.bi_size; + ++ comp_bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, BTRFS_I(cb->inode), ++ end_compressed_bio_read, cb); ++ comp_bio->bi_iter.bi_sector = (cur_disk_byte >> SECTOR_SHIFT); ++ + while (cur_disk_byte < disk_bytenr + compressed_len) { + u64 offset = cur_disk_byte - disk_bytenr; + unsigned int index = offset >> PAGE_SHIFT; + unsigned int real_size; + unsigned int added; + struct page *page = cb->compressed_pages[index]; +- bool submit = false; +- +- /* Allocate new bio if submitted or not yet allocated */ +- if (!comp_bio) { +- comp_bio = alloc_compressed_bio(cb, cur_disk_byte, +- REQ_OP_READ, end_compressed_bio_read, +- &next_stripe_start); +- if (IS_ERR(comp_bio)) { +- cb->status = errno_to_blk_status(PTR_ERR(comp_bio)); +- break; +- } +- } +- /* +- * We should never reach next_stripe_start start as we will +- * submit comp_bio when reach the boundary immediately. +- */ +- ASSERT(cur_disk_byte != next_stripe_start); ++ + /* + * We have various limit on the real read size: +- * - stripe boundary + * - page boundary + * - compressed length boundary + */ +- real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte); +- real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); ++ real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, real_size, compressed_len - offset); + ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); + +@@ -778,45 +617,20 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + */ + ASSERT(added == real_size); + cur_disk_byte += added; +- +- /* Reached stripe boundary, need to submit */ +- if (cur_disk_byte == next_stripe_start) +- submit = true; +- +- /* Has finished the range, need to submit */ +- if (cur_disk_byte == disk_bytenr + compressed_len) +- submit = true; +- +- if (submit) { +- /* Save the original iter for read repair */ +- if (bio_op(comp_bio) == REQ_OP_READ) +- btrfs_bio(comp_bio)->iter = comp_bio->bi_iter; +- +- /* +- * Save the initial offset of this chunk, as there +- * is no direct correlation between compressed pages and +- * the original file offset. The field is only used for +- * priting error messages. 
+- */ +- btrfs_bio(comp_bio)->file_offset = file_offset; +- +- ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL); +- if (ret) { +- btrfs_bio_end_io(btrfs_bio(comp_bio), ret); +- break; +- } +- +- ASSERT(comp_bio->bi_iter.bi_size); +- btrfs_submit_bio(fs_info, comp_bio, mirror_num); +- comp_bio = NULL; +- } + } + + if (memstall) + psi_memstall_leave(&pflags); + +- if (refcount_dec_and_test(&cb->pending_ios)) +- finish_compressed_bio_read(cb); ++ /* ++ * Stash the initial offset of this chunk, as there is no direct ++ * correlation between compressed pages and the original file offset. ++ * The field is only used for printing error messages anyway. ++ */ ++ btrfs_bio(comp_bio)->file_offset = file_offset; ++ ++ ASSERT(comp_bio->bi_iter.bi_size); ++ btrfs_submit_bio(comp_bio, mirror_num); + return; + + fail: +@@ -1609,7 +1423,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, + index_end = end >> PAGE_SHIFT; + + /* Don't miss unaligned end */ +- if (!IS_ALIGNED(end, PAGE_SIZE)) ++ if (!PAGE_ALIGNED(end)) + index_end++; + + curr_sample_pos = 0; +@@ -1642,7 +1456,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, + * + * For now is's a naive and optimistic 'return true', we'll extend the logic to + * quickly (compared to direct compression) detect data characteristics +- * (compressible/uncompressible) to avoid wasting CPU time on uncompressible ++ * (compressible/incompressible) to avoid wasting CPU time on incompressible + * data. + * + * The following types of analysis can be performed: +diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h +index 6209d40a1e08..a5e3377db9ad 100644 +--- a/fs/btrfs/compression.h ++++ b/fs/btrfs/compression.h +@@ -31,9 +31,6 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); + #define BTRFS_ZLIB_DEFAULT_LEVEL 3 + + struct compressed_bio { +- /* Number of outstanding bios */ +- refcount_t pending_ios; +- + /* Number of compressed pages in the array */ + unsigned int nr_pages; + +diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c +index 4754c9101a4c..a5b6bb54545f 100644 +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -484,7 +484,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, + if (ret) + return ret; + } +- btrfs_clean_tree_block(buf); ++ btrfs_clear_buffer_dirty(trans, buf); + *last_ref = 1; + } + return 0; +@@ -853,8 +853,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, + /* + * Search for a key in the given extent_buffer. + * +- * The lower boundary for the search is specified by the slot number @low. Use a +- * value of 0 to search over the whole extent buffer. ++ * The lower boundary for the search is specified by the slot number @first_slot. ++ * Use a value of 0 to search over the whole extent buffer. + * + * The slot in the extent buffer is returned via @slot. If the key exists in the + * extent buffer, then @slot will point to the slot where the key is, otherwise +@@ -863,18 +863,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, + * Slot may point to the total number of items (i.e. one position beyond the last + * key) if the key is bigger than the last key in the extent buffer. 
+ */ +-static noinline int generic_bin_search(struct extent_buffer *eb, int low, +- const struct btrfs_key *key, int *slot) ++int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, ++ const struct btrfs_key *key, int *slot) + { + unsigned long p; + int item_size; +- int high = btrfs_header_nritems(eb); ++ /* ++ * Use unsigned types for the low and high slots, so that we get a more ++ * efficient division in the search loop below. ++ */ ++ u32 low = first_slot; ++ u32 high = btrfs_header_nritems(eb); + int ret; + const int key_size = sizeof(struct btrfs_disk_key); + +- if (low > high) { ++ if (unlikely(low > high)) { + btrfs_err(eb->fs_info, +- "%s: low (%d) > high (%d) eb %llu owner %llu level %d", ++ "%s: low (%u) > high (%u) eb %llu owner %llu level %d", + __func__, low, high, eb->start, + btrfs_header_owner(eb), btrfs_header_level(eb)); + return -EINVAL; +@@ -925,16 +930,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb, int low, + return 1; + } + +-/* +- * Simple binary search on an extent buffer. Works for both leaves and nodes, and +- * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). +- */ +-int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, +- int *slot) +-{ +- return generic_bin_search(eb, 0, key, slot); +-} +- + static void root_add_used(struct btrfs_root *root, u32 size) + { + spin_lock(&root->accounting_lock); +@@ -1054,7 +1049,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, + + path->locks[level] = 0; + path->nodes[level] = NULL; +- btrfs_clean_tree_block(mid); ++ btrfs_clear_buffer_dirty(trans, mid); + btrfs_tree_unlock(mid); + /* once for the path */ + free_extent_buffer(mid); +@@ -1115,7 +1110,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, + if (wret < 0 && wret != -ENOSPC) + ret = wret; + if (btrfs_header_nritems(right) == 0) { +- btrfs_clean_tree_block(right); ++ btrfs_clear_buffer_dirty(trans, right); + btrfs_tree_unlock(right); + del_ptr(root, path, level + 1, pslot + 1); + root_sub_used(root, right->len); +@@ -1161,7 +1156,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, + BUG_ON(wret == 1); + } + if (btrfs_header_nritems(mid) == 0) { +- btrfs_clean_tree_block(mid); ++ btrfs_clear_buffer_dirty(trans, mid); + btrfs_tree_unlock(mid); + del_ptr(root, path, level + 1, pslot); + root_sub_used(root, mid->len); +@@ -1869,7 +1864,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb, + return 0; + } + +- return generic_bin_search(eb, search_low_slot, key, slot); ++ return btrfs_generic_bin_search(eb, search_low_slot, key, slot); + } + + static int search_leaf(struct btrfs_trans_handle *trans, +@@ -3041,7 +3036,8 @@ noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) + * min slot controls the lowest index we're willing to push to the + * right. 
We'll push up to and including min_slot, but no lower + */ +-static noinline int __push_leaf_right(struct btrfs_path *path, ++static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, ++ struct btrfs_path *path, + int data_size, int empty, + struct extent_buffer *right, + int free_space, u32 left_nritems, +@@ -3139,7 +3135,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, + if (left_nritems) + btrfs_mark_buffer_dirty(left); + else +- btrfs_clean_tree_block(left); ++ btrfs_clear_buffer_dirty(trans, left); + + btrfs_mark_buffer_dirty(right); + +@@ -3151,7 +3147,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, + if (path->slots[0] >= left_nritems) { + path->slots[0] -= left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) +- btrfs_clean_tree_block(path->nodes[0]); ++ btrfs_clear_buffer_dirty(trans, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; +@@ -3243,8 +3239,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + return 0; + } + +- return __push_leaf_right(path, min_data_size, empty, +- right, free_space, left_nritems, min_slot); ++ return __push_leaf_right(trans, path, min_data_size, empty, right, ++ free_space, left_nritems, min_slot); + out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); +@@ -3259,7 +3255,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the + * items + */ +-static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, ++static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, ++ struct btrfs_path *path, int data_size, + int empty, struct extent_buffer *left, + int free_space, u32 right_nritems, + u32 max_slot) +@@ -3363,7 +3360,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, + if (right_nritems) + btrfs_mark_buffer_dirty(right); + else +- btrfs_clean_tree_block(right); ++ btrfs_clear_buffer_dirty(trans, right); + + btrfs_item_key(right, &disk_key, 0); + fixup_low_keys(path, &disk_key, 1); +@@ -3449,9 +3446,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + ret = -EUCLEAN; + goto out; + } +- return __push_leaf_left(path, min_data_size, +- empty, left, free_space, right_nritems, +- max_slot); ++ return __push_leaf_left(trans, path, min_data_size, empty, left, ++ free_space, right_nritems, max_slot); + out: + btrfs_tree_unlock(left); + free_extent_buffer(left); +@@ -4400,7 +4396,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + if (leaf == root->node) { + btrfs_set_header_level(leaf, 0); + } else { +- btrfs_clean_tree_block(leaf); ++ btrfs_clear_buffer_dirty(trans, leaf); + btrfs_del_leaf(trans, root, path, leaf); + } + } else { +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h +index 6965703a81b6..97897107fab5 100644 +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -507,6 +507,21 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); + /* ctree.c */ + int __init btrfs_ctree_init(void); + void __cold btrfs_ctree_exit(void); ++ ++int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, ++ const struct btrfs_key *key, int *slot); ++ ++/* ++ * Simple binary search on an extent buffer. Works for both leaves and nodes, and ++ * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). 
++ */ ++static inline int btrfs_bin_search(struct extent_buffer *eb, ++ const struct btrfs_key *key, ++ int *slot) ++{ ++ return btrfs_generic_bin_search(eb, 0, key, slot); ++} ++ + int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, + int *slot); + int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); +diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c +index d81b764a7644..8065341d831a 100644 +--- a/fs/btrfs/defrag.c ++++ b/fs/btrfs/defrag.c +@@ -765,7 +765,7 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t i + break; + + unlock_page(page); +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + lock_page(page); + /* +@@ -999,7 +999,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, + } + + #define CLUSTER_SIZE (SZ_256K) +-static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); ++static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); + + /* + * Defrag one contiguous target range. +diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c +index 573ebab886e2..886ffb232eac 100644 +--- a/fs/btrfs/delayed-ref.c ++++ b/fs/btrfs/delayed-ref.c +@@ -437,8 +437,7 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, + return 0; + } + +-static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, +- struct btrfs_delayed_ref_root *delayed_refs, ++static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *ref) + { +@@ -452,8 +451,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, + atomic_dec(&delayed_refs->num_entries); + } + +-static bool merge_ref(struct btrfs_trans_handle *trans, +- struct btrfs_delayed_ref_root *delayed_refs, ++static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *ref, + u64 seq) +@@ -482,10 +480,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, + mod = -next->ref_mod; + } + +- drop_delayed_ref(trans, delayed_refs, head, next); ++ drop_delayed_ref(delayed_refs, head, next); + ref->ref_mod += mod; + if (ref->ref_mod == 0) { +- drop_delayed_ref(trans, delayed_refs, head, ref); ++ drop_delayed_ref(delayed_refs, head, ref); + done = true; + } else { + /* +@@ -499,11 +497,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, + return done; + } + +-void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, ++void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) + { +- struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_node *ref; + struct rb_node *node; + u64 seq = 0; +@@ -524,7 +521,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); + if (seq && ref->seq >= seq) + continue; +- if (merge_ref(trans, delayed_refs, head, ref, seq)) ++ if (merge_ref(delayed_refs, head, ref, seq)) + goto again; + } + } +@@ -601,8 +598,7 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + * Return 0 for insert. + * Return >0 for merge. 
+ */ +-static int insert_delayed_ref(struct btrfs_trans_handle *trans, +- struct btrfs_delayed_ref_root *root, ++static int insert_delayed_ref(struct btrfs_delayed_ref_root *root, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *ref) + { +@@ -641,7 +637,7 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans, + + /* remove existing tail if its ref_mod is zero */ + if (exist->ref_mod == 0) +- drop_delayed_ref(trans, root, href, exist); ++ drop_delayed_ref(root, href, exist); + spin_unlock(&href->lock); + return ret; + inserted: +@@ -978,7 +974,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + head_ref = add_delayed_ref_head(trans, head_ref, record, + action, &qrecord_inserted); + +- ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); ++ ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + spin_unlock(&delayed_refs->lock); + + /* +@@ -1070,7 +1066,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + head_ref = add_delayed_ref_head(trans, head_ref, record, + action, &qrecord_inserted); + +- ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); ++ ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + spin_unlock(&delayed_refs->lock); + + /* +diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h +index d6304b690ec4..2eb34abf700f 100644 +--- a/fs/btrfs/delayed-ref.h ++++ b/fs/btrfs/delayed-ref.h +@@ -357,7 +357,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op); +-void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, ++void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); + +diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c +index ff2e524d9937..317aeff6c1da 100644 +--- a/fs/btrfs/discard.c ++++ b/fs/btrfs/discard.c +@@ -78,6 +78,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, + static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) + { ++ lockdep_assert_held(&discard_ctl->lock); + if (!btrfs_run_discard_work(discard_ctl)) + return; + +@@ -89,6 +90,8 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, + BTRFS_DISCARD_DELAY); + block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; + } ++ if (list_empty(&block_group->discard_list)) ++ btrfs_get_block_group(block_group); + + list_move_tail(&block_group->discard_list, + get_discard_list(discard_ctl, block_group)); +@@ -108,8 +111,12 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, + static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) + { ++ bool queued; ++ + spin_lock(&discard_ctl->lock); + ++ queued = !list_empty(&block_group->discard_list); ++ + if (!btrfs_run_discard_work(discard_ctl)) { + spin_unlock(&discard_ctl->lock); + return; +@@ -121,6 +128,8 @@ static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, + block_group->discard_eligible_time = (ktime_get_ns() + + BTRFS_DISCARD_UNUSED_DELAY); + block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; ++ if (!queued) ++ btrfs_get_block_group(block_group); + list_add_tail(&block_group->discard_list, + &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]); + +@@ 
-131,6 +140,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) + { + bool running = false; ++ bool queued = false; + + spin_lock(&discard_ctl->lock); + +@@ -140,7 +150,16 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, + } + + block_group->discard_eligible_time = 0; ++ queued = !list_empty(&block_group->discard_list); + list_del_init(&block_group->discard_list); ++ /* ++ * If the block group is currently running in the discard workfn, we ++ * don't want to deref it, since it's still being used by the workfn. ++ * The workfn will notice this case and deref the block group when it is ++ * finished. ++ */ ++ if (queued && !running) ++ btrfs_put_block_group(block_group); + + spin_unlock(&discard_ctl->lock); + +@@ -214,10 +233,12 @@ static struct btrfs_block_group *peek_discard_list( + if (block_group && now >= block_group->discard_eligible_time) { + if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && + block_group->used != 0) { +- if (btrfs_is_block_group_data_only(block_group)) ++ if (btrfs_is_block_group_data_only(block_group)) { + __add_to_discard_list(discard_ctl, block_group); +- else ++ } else { + list_del_init(&block_group->discard_list); ++ btrfs_put_block_group(block_group); ++ } + goto again; + } + if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { +@@ -511,6 +532,15 @@ static void btrfs_discard_workfn(struct work_struct *work) + spin_lock(&discard_ctl->lock); + discard_ctl->prev_discard = trimmed; + discard_ctl->prev_discard_time = now; ++ /* ++ * If the block group was removed from the discard list while it was ++ * running in this workfn, then we didn't deref it, since this function ++ * still owned that reference. But we set the discard_ctl->block_group ++ * back to NULL, so we can use that condition to know that now we need ++ * to deref the block_group. ++ */ ++ if (discard_ctl->block_group == NULL) ++ btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + __btrfs_discard_schedule_work(discard_ctl, now, false); + spin_unlock(&discard_ctl->lock); +@@ -651,8 +681,12 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) + list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, + bg_list) { + list_del_init(&block_group->bg_list); +- btrfs_put_block_group(block_group); + btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); ++ /* ++ * This put is for the get done by btrfs_mark_bg_unused. ++ * Queueing discard incremented it for discard's reference. ++ */ ++ btrfs_put_block_group(block_group); + } + spin_unlock(&fs_info->unused_bgs_lock); + } +@@ -683,6 +717,7 @@ static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) + if (block_group->used == 0) + btrfs_mark_bg_unused(block_group); + spin_lock(&discard_ctl->lock); ++ btrfs_put_block_group(block_group); + } + } + spin_unlock(&discard_ctl->lock); +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index 3aa04224315e..b53f0e30ce2b 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -78,23 +78,6 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) + crypto_free_shash(fs_info->csum_shash); + } + +-/* +- * async submit bios are used to offload expensive checksumming +- * onto the worker threads. They checksum file and metadata bios +- * just before they are sent down the IO stack. 
+- */ +-struct async_submit_bio { +- struct btrfs_inode *inode; +- struct bio *bio; +- enum btrfs_wq_submit_cmd submit_cmd; +- int mirror_num; +- +- /* Optional parameter for used by direct io */ +- u64 dio_file_offset; +- struct btrfs_work work; +- blk_status_t status; +-}; +- + /* + * Compute the csum of a btree block and store the result to provided buffer. + */ +@@ -455,6 +438,22 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec + return csum_one_extent_buffer(eb); + } + ++blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) ++{ ++ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; ++ struct bvec_iter iter; ++ struct bio_vec bv; ++ int ret = 0; ++ ++ bio_for_each_segment(bv, &bbio->bio, iter) { ++ ret = csum_dirty_buffer(fs_info, &bv); ++ if (ret) ++ break; ++ } ++ ++ return errno_to_blk_status(ret); ++} ++ + static int check_tree_block_fsid(struct extent_buffer *eb) + { + struct btrfs_fs_info *fs_info = eb->fs_info; +@@ -700,172 +699,6 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, + return ret; + } + +-static void run_one_async_start(struct btrfs_work *work) +-{ +- struct async_submit_bio *async; +- blk_status_t ret; +- +- async = container_of(work, struct async_submit_bio, work); +- switch (async->submit_cmd) { +- case WQ_SUBMIT_METADATA: +- ret = btree_submit_bio_start(async->bio); +- break; +- case WQ_SUBMIT_DATA: +- ret = btrfs_submit_bio_start(async->inode, async->bio); +- break; +- case WQ_SUBMIT_DATA_DIO: +- ret = btrfs_submit_bio_start_direct_io(async->inode, +- async->bio, async->dio_file_offset); +- break; +- } +- if (ret) +- async->status = ret; +-} +- +-/* +- * In order to insert checksums into the metadata in large chunks, we wait +- * until bio submission time. All the pages in the bio are checksummed and +- * sums are attached onto the ordered extent record. +- * +- * At IO completion time the csums attached on the ordered extent record are +- * inserted into the tree. +- */ +-static void run_one_async_done(struct btrfs_work *work) +-{ +- struct async_submit_bio *async = +- container_of(work, struct async_submit_bio, work); +- struct btrfs_inode *inode = async->inode; +- struct btrfs_bio *bbio = btrfs_bio(async->bio); +- +- /* If an error occurred we just want to clean up the bio and move on */ +- if (async->status) { +- btrfs_bio_end_io(bbio, async->status); +- return; +- } +- +- /* +- * All of the bios that pass through here are from async helpers. +- * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. +- * This changes nothing when cgroups aren't in use. +- */ +- async->bio->bi_opf |= REQ_CGROUP_PUNT; +- btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num); +-} +- +-static void run_one_async_free(struct btrfs_work *work) +-{ +- struct async_submit_bio *async; +- +- async = container_of(work, struct async_submit_bio, work); +- kfree(async); +-} +- +-/* +- * Submit bio to an async queue. 
+- * +- * Retrun: +- * - true if the work has been succesfuly submitted +- * - false in case of error +- */ +-bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, +- u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) +-{ +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct async_submit_bio *async; +- +- async = kmalloc(sizeof(*async), GFP_NOFS); +- if (!async) +- return false; +- +- async->inode = inode; +- async->bio = bio; +- async->mirror_num = mirror_num; +- async->submit_cmd = cmd; +- +- btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, +- run_one_async_free); +- +- async->dio_file_offset = dio_file_offset; +- +- async->status = 0; +- +- if (op_is_sync(bio->bi_opf)) +- btrfs_queue_work(fs_info->hipri_workers, &async->work); +- else +- btrfs_queue_work(fs_info->workers, &async->work); +- return true; +-} +- +-static blk_status_t btree_csum_one_bio(struct bio *bio) +-{ +- struct bio_vec *bvec; +- struct btrfs_root *root; +- int ret = 0; +- struct bvec_iter_all iter_all; +- +- ASSERT(!bio_flagged(bio, BIO_CLONED)); +- bio_for_each_segment_all(bvec, bio, iter_all) { +- root = BTRFS_I(bvec->bv_page->mapping->host)->root; +- ret = csum_dirty_buffer(root->fs_info, bvec); +- if (ret) +- break; +- } +- +- return errno_to_blk_status(ret); +-} +- +-blk_status_t btree_submit_bio_start(struct bio *bio) +-{ +- /* +- * when we're called for a write, we're already in the async +- * submission context. Just jump into btrfs_submit_bio. +- */ +- return btree_csum_one_bio(bio); +-} +- +-static bool should_async_write(struct btrfs_fs_info *fs_info, +- struct btrfs_inode *bi) +-{ +- if (btrfs_is_zoned(fs_info)) +- return false; +- if (atomic_read(&bi->sync_writers)) +- return false; +- if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) +- return false; +- return true; +-} +- +-void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) +-{ +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct btrfs_bio *bbio = btrfs_bio(bio); +- blk_status_t ret; +- +- bio->bi_opf |= REQ_META; +- bbio->is_metadata = 1; +- +- if (btrfs_op(bio) != BTRFS_MAP_WRITE) { +- btrfs_submit_bio(fs_info, bio, mirror_num); +- return; +- } +- +- /* +- * Kthread helpers are used to submit writes so that checksumming can +- * happen in parallel across all CPUs. 
+- */ +- if (should_async_write(fs_info, inode) && +- btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) +- return; +- +- ret = btree_csum_one_bio(bio); +- if (ret) { +- btrfs_bio_end_io(bbio, ret); +- return; +- } +- +- btrfs_submit_bio(fs_info, bio, mirror_num); +-} +- + #ifdef CONFIG_MIGRATION + static int btree_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) +@@ -1035,22 +868,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, + + } + +-void btrfs_clean_tree_block(struct extent_buffer *buf) +-{ +- struct btrfs_fs_info *fs_info = buf->fs_info; +- if (btrfs_header_generation(buf) == +- fs_info->running_transaction->transid) { +- btrfs_assert_tree_write_locked(buf); +- +- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { +- percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, +- -buf->len, +- fs_info->dirty_metadata_batch); +- clear_extent_buffer_dirty(buf); +- } +- } +-} +- + static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, + u64 objectid) + { +@@ -1910,6 +1727,9 @@ static int cleaner_kthread(void *arg) + goto sleep; + } + ++ if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags)) ++ btrfs_sysfs_feature_update(fs_info); ++ + btrfs_run_delayed_iputs(fs_info); + + again = btrfs_clean_one_deleted_snapshot(fs_info); +@@ -5159,11 +4979,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, + start += fs_info->nodesize; + if (!eb) + continue; ++ ++ btrfs_tree_lock(eb); + wait_on_extent_buffer_writeback(eb); ++ btrfs_clear_buffer_dirty(NULL, eb); ++ btrfs_tree_unlock(eb); + +- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, +- &eb->bflags)) +- clear_extent_buffer_dirty(eb); + free_extent_buffer_stale(eb); + } + } +diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h +index f2f295eb6103..4d5772330110 100644 +--- a/fs/btrfs/disk-io.h ++++ b/fs/btrfs/disk-io.h +@@ -39,7 +39,8 @@ struct extent_buffer *btrfs_find_create_tree_block( + struct btrfs_fs_info *fs_info, + u64 bytenr, u64 owner_root, + int level); +-void btrfs_clean_tree_block(struct extent_buffer *buf); ++void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, ++ struct extent_buffer *buf); + void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); + int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); + int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, +@@ -86,7 +87,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, + int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, + struct page *page, u64 start, u64 end, + int mirror); +-void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); + #endif +@@ -114,15 +114,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, + int btrfs_read_extent_buffer(struct extent_buffer *buf, + struct btrfs_tree_parent_check *check); + +-enum btrfs_wq_submit_cmd { +- WQ_SUBMIT_METADATA, +- WQ_SUBMIT_DATA, +- WQ_SUBMIT_DATA_DIO, +-}; +- +-bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, +- u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); +-blk_status_t btree_submit_bio_start(struct bio *bio); ++blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); + int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + int 
btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, +diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c +index 3c7766dfaa69..29a225836e28 100644 +--- a/fs/btrfs/extent-io-tree.c ++++ b/fs/btrfs/extent-io-tree.c +@@ -972,8 +972,8 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + { + struct extent_state *state; + struct extent_state *prealloc = NULL; +- struct rb_node **p; +- struct rb_node *parent; ++ struct rb_node **p = NULL; ++ struct rb_node *parent = NULL; + int err = 0; + u64 last_start; + u64 last_end; +@@ -1218,8 +1218,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + { + struct extent_state *state; + struct extent_state *prealloc = NULL; +- struct rb_node **p; +- struct rb_node *parent; ++ struct rb_node **p = NULL; ++ struct rb_node *parent = NULL; + int err = 0; + u64 last_start; + u64 last_end; +@@ -1625,7 +1625,7 @@ u64 count_range_bits(struct extent_io_tree *tree, + } + + /* +- * Searche a range in the state tree for a given mask. If 'filled' == 1, this ++ * Search a range in the state tree for a given mask. If 'filled' == 1, this + * returns 1 only if every extent in the tree has the bits set. Otherwise, 1 + * is returned if any bit in the range is found set. + */ +diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h +index e3eeec380844..21766e49ec02 100644 +--- a/fs/btrfs/extent-io-tree.h ++++ b/fs/btrfs/extent-io-tree.h +@@ -6,7 +6,6 @@ + #include "misc.h" + + struct extent_changeset; +-struct io_failure_record; + + /* Bits for the extent state */ + enum { +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 72ba13b027a9..824c657f59e8 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -16,7 +16,8 @@ + #include + #include + #include +-#include "misc.h" ++#include "ctree.h" ++#include "extent-tree.h" + #include "tree-log.h" + #include "disk-io.h" + #include "print-tree.h" +@@ -31,14 +32,12 @@ + #include "space-info.h" + #include "block-rsv.h" + #include "delalloc-space.h" +-#include "block-group.h" + #include "discard.h" + #include "rcu-string.h" + #include "zoned.h" + #include "dev-replace.h" + #include "fs.h" + #include "accessors.h" +-#include "extent-tree.h" + #include "root-tree.h" + #include "file-item.h" + #include "orphan.h" +@@ -1966,7 +1965,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, + cond_resched(); + + spin_lock(&locked_ref->lock); +- btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); ++ btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); + } + + return 0; +@@ -2013,7 +2012,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + * insert_inline_extent_backref()). 
+ */ + spin_lock(&locked_ref->lock); +- btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); ++ btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); + + ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, + &actual_count); +@@ -3385,7 +3384,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) + enum btrfs_loop_type { + LOOP_CACHING_NOWAIT, + LOOP_CACHING_WAIT, ++ LOOP_UNSET_SIZE_CLASS, + LOOP_ALLOC_CHUNK, ++ LOOP_WRONG_SIZE_CLASS, + LOOP_NO_EMPTY_SIZE, + }; + +@@ -3453,81 +3454,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, + btrfs_put_block_group(cache); + } + +-enum btrfs_extent_allocation_policy { +- BTRFS_EXTENT_ALLOC_CLUSTERED, +- BTRFS_EXTENT_ALLOC_ZONED, +-}; +- +-/* +- * Structure used internally for find_free_extent() function. Wraps needed +- * parameters. +- */ +-struct find_free_extent_ctl { +- /* Basic allocation info */ +- u64 ram_bytes; +- u64 num_bytes; +- u64 min_alloc_size; +- u64 empty_size; +- u64 flags; +- int delalloc; +- +- /* Where to start the search inside the bg */ +- u64 search_start; +- +- /* For clustered allocation */ +- u64 empty_cluster; +- struct btrfs_free_cluster *last_ptr; +- bool use_cluster; +- +- bool have_caching_bg; +- bool orig_have_caching_bg; +- +- /* Allocation is called for tree-log */ +- bool for_treelog; +- +- /* Allocation is called for data relocation */ +- bool for_data_reloc; +- +- /* RAID index, converted from flags */ +- int index; +- +- /* +- * Current loop number, check find_free_extent_update_loop() for details +- */ +- int loop; +- +- /* +- * Whether we're refilling a cluster, if true we need to re-search +- * current block group but don't try to refill the cluster again. +- */ +- bool retry_clustered; +- +- /* +- * Whether we're updating free space cache, if true we need to re-search +- * current block group but don't try updating free space cache again. +- */ +- bool retry_unclustered; +- +- /* If current block group is cached */ +- int cached; +- +- /* Max contiguous hole found */ +- u64 max_extent_size; +- +- /* Total free space from free space cache, not always contiguous */ +- u64 total_free_space; +- +- /* Found result */ +- u64 found_offset; +- +- /* Hint where to start looking for an empty space */ +- u64 hint_byte; +- +- /* Allocation policy */ +- enum btrfs_extent_allocation_policy policy; +-}; +- +- + /* + * Helper function for find_free_extent(). 
+ * +@@ -3559,8 +3485,7 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, + if (offset) { + /* We have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); +- trace_btrfs_reserve_extent_cluster(cluster_bg, +- ffe_ctl->search_start, ffe_ctl->num_bytes); ++ trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); + *cluster_bg_ret = cluster_bg; + ffe_ctl->found_offset = offset; + return 0; +@@ -3610,10 +3535,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, + if (offset) { + /* We found one, proceed */ + spin_unlock(&last_ptr->refill_lock); +- trace_btrfs_reserve_extent_cluster(bg, +- ffe_ctl->search_start, +- ffe_ctl->num_bytes); + ffe_ctl->found_offset = offset; ++ trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); + return 0; + } + } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && +@@ -4028,24 +3951,6 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, + } + } + +-static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) +-{ +- switch (ffe_ctl->policy) { +- case BTRFS_EXTENT_ALLOC_CLUSTERED: +- /* +- * If we can't allocate a new chunk we've already looped through +- * at least once, move on to the NO_EMPTY_SIZE case. +- */ +- ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; +- return 0; +- case BTRFS_EXTENT_ALLOC_ZONED: +- /* Give up here */ +- return -ENOSPC; +- default: +- BUG(); +- } +-} +- + /* + * Return >0 means caller needs to re-search for free extent + * Return 0 means we have the needed free extent. +@@ -4079,31 +3984,28 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, + * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking + * caching kthreads as we move along + * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching ++ * LOOP_UNSET_SIZE_CLASS, allow unset size class + * LOOP_ALLOC_CHUNK, force a chunk allocation and try again + * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try + * again + */ + if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { + ffe_ctl->index = 0; +- if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { +- /* +- * We want to skip the LOOP_CACHING_WAIT step if we +- * don't have any uncached bgs and we've already done a +- * full search through. +- */ +- if (ffe_ctl->orig_have_caching_bg || !full_search) +- ffe_ctl->loop = LOOP_CACHING_WAIT; +- else +- ffe_ctl->loop = LOOP_ALLOC_CHUNK; +- } else { ++ /* ++ * We want to skip the LOOP_CACHING_WAIT step if we don't have ++ * any uncached bgs and we've already done a full search ++ * through. ++ */ ++ if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && ++ (!ffe_ctl->orig_have_caching_bg && full_search)) + ffe_ctl->loop++; +- } ++ ffe_ctl->loop++; + + if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { + struct btrfs_trans_handle *trans; + int exist = 0; + +- /*Check if allocation policy allows to create a new chunk */ ++ /* Check if allocation policy allows to create a new chunk */ + ret = can_allocate_chunk(fs_info, ffe_ctl); + if (ret) + return ret; +@@ -4123,8 +4025,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, + CHUNK_ALLOC_FORCE_FOR_EXTENT); + + /* Do not bail out on ENOSPC since we can do more. 
*/ +- if (ret == -ENOSPC) +- ret = chunk_allocation_failed(ffe_ctl); ++ if (ret == -ENOSPC) { ++ ret = 0; ++ ffe_ctl->loop++; ++ } + else if (ret < 0) + btrfs_abort_transaction(trans, ret); + else +@@ -4154,6 +4058,21 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, + return -ENOSPC; + } + ++static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, ++ struct btrfs_block_group *bg) ++{ ++ if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) ++ return true; ++ if (!btrfs_block_group_should_use_size_class(bg)) ++ return true; ++ if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) ++ return true; ++ if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && ++ bg->size_class == BTRFS_BG_SZ_NONE) ++ return true; ++ return ffe_ctl->size_class == bg->size_class; ++} ++ + static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, +@@ -4288,6 +4207,7 @@ static noinline int find_free_extent(struct btrfs_root *root, + ffe_ctl->total_free_space = 0; + ffe_ctl->found_offset = 0; + ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; ++ ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); + + if (btrfs_is_zoned(fs_info)) + ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; +@@ -4296,8 +4216,7 @@ static noinline int find_free_extent(struct btrfs_root *root, + ins->objectid = 0; + ins->offset = 0; + +- trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size, +- ffe_ctl->flags); ++ trace_find_free_extent(root, ffe_ctl); + + space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); + if (!space_info) { +@@ -4340,6 +4259,7 @@ static noinline int find_free_extent(struct btrfs_root *root, + block_group->flags); + btrfs_lock_block_group(block_group, + ffe_ctl->delalloc); ++ ffe_ctl->hinted = true; + goto have_block_group; + } + } else if (block_group) { +@@ -4347,6 +4267,7 @@ static noinline int find_free_extent(struct btrfs_root *root, + } + } + search: ++ trace_find_free_extent_search_loop(root, ffe_ctl); + ffe_ctl->have_caching_bg = false; + if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || + ffe_ctl->index == 0) +@@ -4356,6 +4277,7 @@ static noinline int find_free_extent(struct btrfs_root *root, + &space_info->block_groups[ffe_ctl->index], list) { + struct btrfs_block_group *bg_ret; + ++ ffe_ctl->hinted = false; + /* If the block group is read-only, we can skip it entirely. 
*/ + if (unlikely(block_group->ro)) { + if (ffe_ctl->for_treelog) +@@ -4397,6 +4319,7 @@ static noinline int find_free_extent(struct btrfs_root *root, + } + + have_block_group: ++ trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); + ffe_ctl->cached = btrfs_block_group_done(block_group); + if (unlikely(!ffe_ctl->cached)) { + ffe_ctl->have_caching_bg = true; +@@ -4421,6 +4344,9 @@ static noinline int find_free_extent(struct btrfs_root *root, + if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) + goto loop; + ++ if (!find_free_extent_check_size_class(ffe_ctl, block_group)) ++ goto loop; ++ + bg_ret = NULL; + ret = do_allocation(block_group, ffe_ctl, &bg_ret); + if (ret == 0) { +@@ -4455,7 +4381,8 @@ static noinline int find_free_extent(struct btrfs_root *root, + + ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, + ffe_ctl->num_bytes, +- ffe_ctl->delalloc); ++ ffe_ctl->delalloc, ++ ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); + if (ret == -EAGAIN) { + btrfs_add_free_space_unused(block_group, + ffe_ctl->found_offset, +@@ -4468,8 +4395,7 @@ static noinline int find_free_extent(struct btrfs_root *root, + ins->objectid = ffe_ctl->search_start; + ins->offset = ffe_ctl->num_bytes; + +- trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start, +- ffe_ctl->num_bytes); ++ trace_btrfs_reserve_extent(block_group, ffe_ctl); + btrfs_release_block_group(block_group, ffe_ctl->delalloc); + break; + loop: +@@ -4912,7 +4838,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, + btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); + + __btrfs_tree_lock(buf, nest); +- btrfs_clean_tree_block(buf); ++ btrfs_clear_buffer_dirty(trans, buf); + clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); + clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); + +@@ -5542,13 +5468,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + } + } + } +- /* make block locked assertion in btrfs_clean_tree_block happy */ +- if (!path->locks[level] && +- btrfs_header_generation(eb) == trans->transid) { ++ /* Make block locked assertion in btrfs_clear_buffer_dirty happy. 
*/ ++ if (!path->locks[level]) { + btrfs_tree_lock(eb); + path->locks[level] = BTRFS_WRITE_LOCK; + } +- btrfs_clean_tree_block(eb); ++ btrfs_clear_buffer_dirty(trans, eb); + } + + if (eb == root->node) { +diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h +index ae5425253603..0c958fc1b3b8 100644 +--- a/fs/btrfs/extent-tree.h ++++ b/fs/btrfs/extent-tree.h +@@ -3,6 +3,87 @@ + #ifndef BTRFS_EXTENT_TREE_H + #define BTRFS_EXTENT_TREE_H + ++#include "misc.h" ++#include "block-group.h" ++ ++struct btrfs_free_cluster; ++ ++enum btrfs_extent_allocation_policy { ++ BTRFS_EXTENT_ALLOC_CLUSTERED, ++ BTRFS_EXTENT_ALLOC_ZONED, ++}; ++ ++struct find_free_extent_ctl { ++ /* Basic allocation info */ ++ u64 ram_bytes; ++ u64 num_bytes; ++ u64 min_alloc_size; ++ u64 empty_size; ++ u64 flags; ++ int delalloc; ++ ++ /* Where to start the search inside the bg */ ++ u64 search_start; ++ ++ /* For clustered allocation */ ++ u64 empty_cluster; ++ struct btrfs_free_cluster *last_ptr; ++ bool use_cluster; ++ ++ bool have_caching_bg; ++ bool orig_have_caching_bg; ++ ++ /* Allocation is called for tree-log */ ++ bool for_treelog; ++ ++ /* Allocation is called for data relocation */ ++ bool for_data_reloc; ++ ++ /* RAID index, converted from flags */ ++ int index; ++ ++ /* ++ * Current loop number, check find_free_extent_update_loop() for details ++ */ ++ int loop; ++ ++ /* ++ * Whether we're refilling a cluster, if true we need to re-search ++ * current block group but don't try to refill the cluster again. ++ */ ++ bool retry_clustered; ++ ++ /* ++ * Whether we're updating free space cache, if true we need to re-search ++ * current block group but don't try updating free space cache again. ++ */ ++ bool retry_unclustered; ++ ++ /* If current block group is cached */ ++ int cached; ++ ++ /* Max contiguous hole found */ ++ u64 max_extent_size; ++ ++ /* Total free space from free space cache, not always contiguous */ ++ u64 total_free_space; ++ ++ /* Found result */ ++ u64 found_offset; ++ ++ /* Hint where to start looking for an empty space */ ++ u64 hint_byte; ++ ++ /* Allocation policy */ ++ enum btrfs_extent_allocation_policy policy; ++ ++ /* Whether or not the allocator is currently following a hint */ ++ bool hinted; ++ ++ /* Size class of block groups to prefer in early loops */ ++ enum btrfs_block_group_size_class size_class; ++}; ++ + enum btrfs_inline_ref_type { + BTRFS_REF_TYPE_INVALID, + BTRFS_REF_TYPE_BLOCK, +diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c +index 3bbf8703db2a..c25fa74d7615 100644 +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -36,6 +36,7 @@ + #include "file.h" + #include "dev-replace.h" + #include "super.h" ++#include "transaction.h" + + static struct kmem_cache *extent_buffer_cache; + +@@ -99,7 +100,6 @@ struct btrfs_bio_ctrl { + struct bio *bio; + int mirror_num; + enum btrfs_compression_type compress_type; +- u32 len_to_stripe_boundary; + u32 len_to_oe_boundary; + btrfs_bio_end_io_t end_io_func; + +@@ -126,7 +126,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) + { + struct bio *bio; + struct bio_vec *bv; +- struct btrfs_inode *inode; ++ struct inode *inode; + int mirror_num; + + if (!bio_ctrl->bio) +@@ -134,15 +134,13 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) + + bio = bio_ctrl->bio; + bv = bio_first_bvec_all(bio); +- inode = BTRFS_I(bv->bv_page->mapping->host); ++ inode = bv->bv_page->mapping->host; + mirror_num = bio_ctrl->mirror_num; + + /* Caller should ensure the bio has at least some range added */ + 
ASSERT(bio->bi_iter.bi_size); + +- btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; +- +- if (!is_data_inode(&inode->vfs_inode)) { ++ if (!is_data_inode(inode)) { + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { + /* + * For metadata read, we should have the parent_check, +@@ -153,14 +151,15 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) + bio_ctrl->parent_check, + sizeof(struct btrfs_tree_parent_check)); + } +- btrfs_submit_metadata_bio(inode, bio, mirror_num); +- } else if (btrfs_op(bio) == BTRFS_MAP_WRITE) { +- btrfs_submit_data_write_bio(inode, bio, mirror_num); +- } else { +- btrfs_submit_data_read_bio(inode, bio, mirror_num, +- bio_ctrl->compress_type); ++ bio->bi_opf |= REQ_META; + } + ++ if (btrfs_op(bio) == BTRFS_MAP_READ && ++ bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) ++ btrfs_submit_compressed_read(inode, bio, mirror_num); ++ else ++ btrfs_submit_bio(bio, mirror_num); ++ + /* The bio is owned by the end_io handler now */ + bio_ctrl->bio = NULL; + } +@@ -515,266 +514,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, + start, end, page_ops, NULL); + } + +-static int insert_failrec(struct btrfs_inode *inode, +- struct io_failure_record *failrec) +-{ +- struct rb_node *exist; +- +- spin_lock(&inode->io_failure_lock); +- exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr, +- &failrec->rb_node); +- spin_unlock(&inode->io_failure_lock); +- +- return (exist == NULL) ? 0 : -EEXIST; +-} +- +-static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start) +-{ +- struct rb_node *node; +- struct io_failure_record *failrec = ERR_PTR(-ENOENT); +- +- spin_lock(&inode->io_failure_lock); +- node = rb_simple_search(&inode->io_failure_tree, start); +- if (node) +- failrec = rb_entry(node, struct io_failure_record, rb_node); +- spin_unlock(&inode->io_failure_lock); +- return failrec; +-} +- +-static void free_io_failure(struct btrfs_inode *inode, +- struct io_failure_record *rec) +-{ +- spin_lock(&inode->io_failure_lock); +- rb_erase(&rec->rb_node, &inode->io_failure_tree); +- spin_unlock(&inode->io_failure_lock); +- +- kfree(rec); +-} +- +-static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) +-{ +- if (cur_mirror == failrec->num_copies) +- return cur_mirror + 1 - failrec->num_copies; +- return cur_mirror + 1; +-} +- +-static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) +-{ +- if (cur_mirror == 1) +- return failrec->num_copies; +- return cur_mirror - 1; +-} +- +-/* +- * each time an IO finishes, we do a fast check in the IO failure tree +- * to see if we need to process or clean up an io_failure_record +- */ +-int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, +- struct page *page, unsigned int pg_offset) +-{ +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct extent_io_tree *io_tree = &inode->io_tree; +- u64 ino = btrfs_ino(inode); +- u64 locked_start, locked_end; +- struct io_failure_record *failrec; +- int mirror; +- int ret; +- +- failrec = get_failrec(inode, start); +- if (IS_ERR(failrec)) +- return 0; +- +- BUG_ON(!failrec->this_mirror); +- +- if (sb_rdonly(fs_info->sb)) +- goto out; +- +- ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start, +- &locked_end, EXTENT_LOCKED, NULL); +- if (ret || locked_start > failrec->bytenr || +- locked_end < failrec->bytenr + failrec->len - 1) +- goto out; +- +- mirror = failrec->this_mirror; +- do { +- mirror = prev_mirror(failrec, mirror); +- 
btrfs_repair_io_failure(fs_info, ino, start, failrec->len, +- failrec->logical, page, pg_offset, mirror); +- } while (mirror != failrec->failed_mirror); +- +-out: +- free_io_failure(inode, failrec); +- return 0; +-} +- +-/* +- * Can be called when +- * - hold extent lock +- * - under ordered extent +- * - the inode is freeing +- */ +-void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) +-{ +- struct io_failure_record *failrec; +- struct rb_node *node, *next; +- +- if (RB_EMPTY_ROOT(&inode->io_failure_tree)) +- return; +- +- spin_lock(&inode->io_failure_lock); +- node = rb_simple_search_first(&inode->io_failure_tree, start); +- while (node) { +- failrec = rb_entry(node, struct io_failure_record, rb_node); +- if (failrec->bytenr > end) +- break; +- +- next = rb_next(node); +- rb_erase(&failrec->rb_node, &inode->io_failure_tree); +- kfree(failrec); +- +- node = next; +- } +- spin_unlock(&inode->io_failure_lock); +-} +- +-static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, +- struct btrfs_bio *bbio, +- unsigned int bio_offset) +-{ +- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +- u64 start = bbio->file_offset + bio_offset; +- struct io_failure_record *failrec; +- const u32 sectorsize = fs_info->sectorsize; +- int ret; +- +- failrec = get_failrec(BTRFS_I(inode), start); +- if (!IS_ERR(failrec)) { +- btrfs_debug(fs_info, +- "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", +- failrec->logical, failrec->bytenr, failrec->len); +- /* +- * when data can be on disk more than twice, add to failrec here +- * (e.g. with a list for failed_mirror) to make +- * clean_io_failure() clean all those errors at once. +- */ +- ASSERT(failrec->this_mirror == bbio->mirror_num); +- ASSERT(failrec->len == fs_info->sectorsize); +- return failrec; +- } +- +- failrec = kzalloc(sizeof(*failrec), GFP_NOFS); +- if (!failrec) +- return ERR_PTR(-ENOMEM); +- +- RB_CLEAR_NODE(&failrec->rb_node); +- failrec->bytenr = start; +- failrec->len = sectorsize; +- failrec->failed_mirror = bbio->mirror_num; +- failrec->this_mirror = bbio->mirror_num; +- failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset; +- +- btrfs_debug(fs_info, +- "new io failure record logical %llu start %llu", +- failrec->logical, start); +- +- failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize); +- if (failrec->num_copies == 1) { +- /* +- * We only have a single copy of the data, so don't bother with +- * all the retry and error correction code that follows. No +- * matter what the error is, it is very likely to persist. 
+- */ +- btrfs_debug(fs_info, +- "cannot repair logical %llu num_copies %d", +- failrec->logical, failrec->num_copies); +- kfree(failrec); +- return ERR_PTR(-EIO); +- } +- +- /* Set the bits in the private failure tree */ +- ret = insert_failrec(BTRFS_I(inode), failrec); +- if (ret) { +- kfree(failrec); +- return ERR_PTR(ret); +- } +- +- return failrec; +-} +- +-int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, +- u32 bio_offset, struct page *page, unsigned int pgoff, +- bool submit_buffered) +-{ +- u64 start = failed_bbio->file_offset + bio_offset; +- struct io_failure_record *failrec; +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct bio *failed_bio = &failed_bbio->bio; +- const int icsum = bio_offset >> fs_info->sectorsize_bits; +- struct bio *repair_bio; +- struct btrfs_bio *repair_bbio; +- +- btrfs_debug(fs_info, +- "repair read error: read error at %llu", start); +- +- BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); +- +- failrec = btrfs_get_io_failure_record(&inode->vfs_inode, failed_bbio, bio_offset); +- if (IS_ERR(failrec)) +- return PTR_ERR(failrec); +- +- /* +- * There are two premises: +- * a) deliver good data to the caller +- * b) correct the bad sectors on disk +- * +- * Since we're only doing repair for one sector, we only need to get +- * a good copy of the failed sector and if we succeed, we have setup +- * everything for btrfs_repair_io_failure to do the rest for us. +- */ +- failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); +- if (failrec->this_mirror == failrec->failed_mirror) { +- btrfs_debug(fs_info, +- "failed to repair num_copies %d this_mirror %d failed_mirror %d", +- failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); +- free_io_failure(inode, failrec); +- return -EIO; +- } +- +- repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io, +- failed_bbio->private); +- repair_bbio = btrfs_bio(repair_bio); +- repair_bbio->file_offset = start; +- repair_bio->bi_iter.bi_sector = failrec->logical >> 9; +- +- if (failed_bbio->csum) { +- const u32 csum_size = fs_info->csum_size; +- +- repair_bbio->csum = repair_bbio->csum_inline; +- memcpy(repair_bbio->csum, +- failed_bbio->csum + csum_size * icsum, csum_size); +- } +- +- bio_add_page(repair_bio, page, failrec->len, pgoff); +- repair_bbio->iter = repair_bio->bi_iter; +- +- btrfs_debug(fs_info, +- "repair read error: submitting new read to mirror %d", +- failrec->this_mirror); +- +- /* +- * At this point we have a bio, so any errors from bio submission will +- * be handled by the endio on the repair_bio, so we can't return an +- * error here. 
+- */ +- if (submit_buffered) +- btrfs_submit_data_read_bio(inode, repair_bio, +- failrec->this_mirror, 0); +- else +- btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); +- +- return BLK_STS_OK; +-} +- + static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) + { + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); +@@ -803,79 +542,6 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) + btrfs_subpage_end_reader(fs_info, page, start, len); + } + +-static void end_sector_io(struct page *page, u64 offset, bool uptodate) +-{ +- struct btrfs_inode *inode = BTRFS_I(page->mapping->host); +- const u32 sectorsize = inode->root->fs_info->sectorsize; +- +- end_page_read(page, uptodate, offset, sectorsize); +- unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL); +-} +- +-static void submit_data_read_repair(struct inode *inode, +- struct btrfs_bio *failed_bbio, +- u32 bio_offset, const struct bio_vec *bvec, +- unsigned int error_bitmap) +-{ +- const unsigned int pgoff = bvec->bv_offset; +- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +- struct page *page = bvec->bv_page; +- const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset; +- const u64 end = start + bvec->bv_len - 1; +- const u32 sectorsize = fs_info->sectorsize; +- const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; +- int i; +- +- BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE); +- +- /* This repair is only for data */ +- ASSERT(is_data_inode(inode)); +- +- /* We're here because we had some read errors or csum mismatch */ +- ASSERT(error_bitmap); +- +- /* +- * We only get called on buffered IO, thus page must be mapped and bio +- * must not be cloned. +- */ +- ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED)); +- +- /* Iterate through all the sectors in the range */ +- for (i = 0; i < nr_bits; i++) { +- const unsigned int offset = i * sectorsize; +- bool uptodate = false; +- int ret; +- +- if (!(error_bitmap & (1U << i))) { +- /* +- * This sector has no error, just end the page read +- * and unlock the range. +- */ +- uptodate = true; +- goto next; +- } +- +- ret = btrfs_repair_one_sector(BTRFS_I(inode), failed_bbio, +- bio_offset + offset, page, pgoff + offset, +- true); +- if (!ret) { +- /* +- * We have submitted the read repair, the page release +- * will be handled by the endio function of the +- * submitted repair bio. +- * Thus we don't need to do any thing here. +- */ +- continue; +- } +- /* +- * Continue on failed repair, otherwise the remaining sectors +- * will not be properly unlocked. 
+- */ +-next: +- end_sector_io(page, start + offset, uptodate); +- } +-} +- + /* lots and lots of room for performance fixes in the end_bio funcs */ + + void end_extent_writepage(struct page *page, int err, u64 start, u64 end) +@@ -919,7 +585,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) + u64 start; + u64 end; + struct bvec_iter_all iter_all; +- bool first_bvec = true; + + ASSERT(!bio_flagged(bio, BIO_CLONED)); + bio_for_each_segment_all(bvec, bio, iter_all) { +@@ -941,11 +606,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) + start = page_offset(page) + bvec->bv_offset; + end = start + bvec->bv_len - 1; + +- if (first_bvec) { +- btrfs_record_physical_zoned(inode, start, bio); +- first_bvec = false; +- } +- + end_extent_writepage(page, error, start, end); + + btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); +@@ -1093,8 +753,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) + struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; +- unsigned int error_bitmap = (unsigned int)-1; +- bool repair = false; + u64 start; + u64 end; + u32 len; +@@ -1126,25 +784,14 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) + len = bvec->bv_len; + + mirror = bbio->mirror_num; +- if (likely(uptodate)) { +- if (is_data_inode(inode)) { +- error_bitmap = btrfs_verify_data_csum(bbio, +- bio_offset, page, start, end); +- if (error_bitmap) +- uptodate = false; +- } else { +- if (btrfs_validate_metadata_buffer(bbio, +- page, start, end, mirror)) +- uptodate = false; +- } +- } ++ if (uptodate && !is_data_inode(inode) && ++ btrfs_validate_metadata_buffer(bbio, page, start, end, mirror)) ++ uptodate = false; + + if (likely(uptodate)) { + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_SHIFT; + +- btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0); +- + /* + * Zero out the remaining part if this range straddles + * i_size. +@@ -1161,19 +808,7 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) + zero_user_segment(page, zero_start, + offset_in_page(end) + 1); + } +- } else if (is_data_inode(inode)) { +- /* +- * Only try to repair bios that actually made it to a +- * device. If the bio failed to be submitted mirror +- * is 0 and we need to fail it without retrying. +- * +- * This also includes the high level bios for compressed +- * extents - these never make it to a device and repair +- * is already handled on the lower compressed bio. +- */ +- if (mirror > 0) +- repair = true; +- } else { ++ } else if (!is_data_inode(inode)) { + struct extent_buffer *eb; + + eb = find_extent_buffer_readpage(fs_info, page, start); +@@ -1182,19 +817,10 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) + atomic_dec(&eb->io_pages); + } + +- if (repair) { +- /* +- * submit_data_read_repair() will handle all the good +- * and bad sectors, we just continue to the next bvec. +- */ +- submit_data_read_repair(inode, bbio, bio_offset, bvec, +- error_bitmap); +- } else { +- /* Update page status and unlock */ +- end_page_read(page, uptodate, start, len); +- endio_readpage_release_extent(&processed, BTRFS_I(inode), +- start, end, PageUptodate(page)); +- } ++ /* Update page status and unlock. 
*/ ++ end_page_read(page, uptodate, start, len); ++ endio_readpage_release_extent(&processed, BTRFS_I(inode), ++ start, end, PageUptodate(page)); + + ASSERT(bio_offset + len > bio_offset); + bio_offset += len; +@@ -1202,7 +828,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) + } + /* Release the last extent */ + endio_readpage_release_extent(&processed, NULL, 0, 0, false); +- btrfs_bio_free_csum(bbio); + bio_put(bio); + } + +@@ -1270,11 +895,10 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, + u32 real_size; + const sector_t sector = disk_bytenr >> SECTOR_SHIFT; + bool contig = false; +- int ret; + + ASSERT(bio); + /* The limit should be calculated when bio_ctrl->bio is allocated */ +- ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); ++ ASSERT(bio_ctrl->len_to_oe_boundary); + if (bio_ctrl->compress_type != compress_type) + return 0; + +@@ -1310,9 +934,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, + if (!contig) + return 0; + +- real_size = min(bio_ctrl->len_to_oe_boundary, +- bio_ctrl->len_to_stripe_boundary) - bio_size; +- real_size = min(real_size, size); ++ real_size = min(bio_ctrl->len_to_oe_boundary - bio_size, size); + + /* + * If real_size is 0, never call bio_add_*_page(), as even size is 0, +@@ -1321,82 +943,45 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, + if (real_size == 0) + return 0; + +- if (bio_op(bio) == REQ_OP_ZONE_APPEND) +- ret = bio_add_zone_append_page(bio, page, real_size, pg_offset); +- else +- ret = bio_add_page(bio, page, real_size, pg_offset); +- +- return ret; ++ return bio_add_page(bio, page, real_size, pg_offset); + } + +-static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, +- struct btrfs_inode *inode, u64 file_offset) ++static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, ++ struct btrfs_inode *inode, u64 file_offset) + { +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct btrfs_io_geometry geom; + struct btrfs_ordered_extent *ordered; +- struct extent_map *em; +- u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT); +- int ret; + + /* +- * Pages for compressed extent are never submitted to disk directly, +- * thus it has no real boundary, just set them to U32_MAX. +- * +- * The split happens for real compressed bio, which happens in +- * btrfs_submit_compressed_read/write(). ++ * Limit the extent to the ordered boundary for Zone Append. ++ * Compressed bios aren't submitted directly, so it doesn't apply to ++ * them. 
+ */ +- if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { +- bio_ctrl->len_to_oe_boundary = U32_MAX; +- bio_ctrl->len_to_stripe_boundary = U32_MAX; +- return 0; +- } +- em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); +- if (IS_ERR(em)) +- return PTR_ERR(em); +- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio), +- logical, &geom); +- free_extent_map(em); +- if (ret < 0) { +- return ret; +- } +- if (geom.len > U32_MAX) +- bio_ctrl->len_to_stripe_boundary = U32_MAX; +- else +- bio_ctrl->len_to_stripe_boundary = (u32)geom.len; +- +- if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { +- bio_ctrl->len_to_oe_boundary = U32_MAX; +- return 0; +- } +- +- /* Ordered extent not yet created, so we're good */ +- ordered = btrfs_lookup_ordered_extent(inode, file_offset); +- if (!ordered) { +- bio_ctrl->len_to_oe_boundary = U32_MAX; +- return 0; ++ if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && ++ btrfs_use_zone_append(btrfs_bio(bio_ctrl->bio))) { ++ ordered = btrfs_lookup_ordered_extent(inode, file_offset); ++ if (ordered) { ++ bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, ++ ordered->file_offset + ++ ordered->disk_num_bytes - file_offset); ++ btrfs_put_ordered_extent(ordered); ++ return; ++ } + } + +- bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, +- ordered->disk_bytenr + ordered->disk_num_bytes - logical); +- btrfs_put_ordered_extent(ordered); +- return 0; ++ bio_ctrl->len_to_oe_boundary = U32_MAX; + } + +-static int alloc_new_bio(struct btrfs_inode *inode, +- struct btrfs_bio_ctrl *bio_ctrl, +- struct writeback_control *wbc, +- blk_opf_t opf, +- u64 disk_bytenr, u32 offset, u64 file_offset, +- enum btrfs_compression_type compress_type) ++static void alloc_new_bio(struct btrfs_inode *inode, ++ struct btrfs_bio_ctrl *bio_ctrl, ++ struct writeback_control *wbc, blk_opf_t opf, ++ u64 disk_bytenr, u32 offset, u64 file_offset, ++ enum btrfs_compression_type compress_type) + { + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct bio *bio; +- int ret; + +- ASSERT(bio_ctrl->end_io_func); +- +- bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL); ++ bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, inode, bio_ctrl->end_io_func, ++ NULL); + /* + * For compressed page range, its disk_bytenr is always @disk_bytenr + * passed in, no matter if we have added any range into previous bio. +@@ -1405,48 +990,21 @@ static int alloc_new_bio(struct btrfs_inode *inode, + bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + else + bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; ++ btrfs_bio(bio)->file_offset = file_offset; + bio_ctrl->bio = bio; + bio_ctrl->compress_type = compress_type; +- ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); +- if (ret < 0) +- goto error; ++ calc_bio_boundaries(bio_ctrl, inode, file_offset); + + if (wbc) { + /* +- * For Zone append we need the correct block_device that we are +- * going to write to set in the bio to be able to respect the +- * hardware limitation. Look it up here: ++ * Pick the last added device to support cgroup writeback. For ++ * multi-device file systems this means blk-cgroup policies have ++ * to always be set on the last added/replaced device. ++ * This is a bit odd but has been like that for a long time. 
+ */ +- if (bio_op(bio) == REQ_OP_ZONE_APPEND) { +- struct btrfs_device *dev; +- +- dev = btrfs_zoned_get_device(fs_info, disk_bytenr, +- fs_info->sectorsize); +- if (IS_ERR(dev)) { +- ret = PTR_ERR(dev); +- goto error; +- } +- +- bio_set_dev(bio, dev->bdev); +- } else { +- /* +- * Otherwise pick the last added device to support +- * cgroup writeback. For multi-device file systems this +- * means blk-cgroup policies have to always be set on the +- * last added/replaced device. This is a bit odd but has +- * been like that for a long time. +- */ +- bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); +- } ++ bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); + wbc_init_bio(wbc, bio); +- } else { +- ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); + } +- return 0; +-error: +- bio_ctrl->bio = NULL; +- btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); +- return ret; + } + + /* +@@ -1472,7 +1030,6 @@ static int submit_extent_page(blk_opf_t opf, + enum btrfs_compression_type compress_type, + bool force_bio_submit) + { +- int ret = 0; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + unsigned int cur = pg_offset; + +@@ -1492,12 +1049,9 @@ static int submit_extent_page(blk_opf_t opf, + + /* Allocate new bio if needed */ + if (!bio_ctrl->bio) { +- ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, +- disk_bytenr, offset, +- page_offset(page) + cur, +- compress_type); +- if (ret < 0) +- return ret; ++ alloc_new_bio(inode, bio_ctrl, wbc, opf, disk_bytenr, ++ offset, page_offset(page) + cur, ++ compress_type); + } + /* + * We must go through btrfs_bio_add_page() to ensure each +@@ -2054,10 +1608,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, + * find_next_dirty_byte() are all exclusive + */ + iosize = min(min(em_end, end + 1), dirty_range_end) - cur; +- +- if (btrfs_use_zone_append(inode, em->block_start)) +- op = REQ_OP_ZONE_APPEND; +- + free_extent_map(em); + em = NULL; + +@@ -2360,13 +1910,6 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) + */ + mapping_set_error(page->mapping, -EIO); + +- /* +- * If we error out, we should add back the dirty_metadata_bytes +- * to make it consistent. +- */ +- percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, +- eb->len, fs_info->dirty_metadata_batch); +- + /* + * If writeback for a btree extent that doesn't belong to a log tree + * failed, increment the counter transaction->eb_write_errors. 
+@@ -4724,12 +4267,25 @@ static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) + WARN_ON(atomic_read(&eb->refs) == 0); + } + +-void clear_extent_buffer_dirty(const struct extent_buffer *eb) ++void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, ++ struct extent_buffer *eb) + { ++ struct btrfs_fs_info *fs_info = eb->fs_info; + int i; + int num_pages; + struct page *page; + ++ btrfs_assert_tree_write_locked(eb); ++ ++ if (trans && btrfs_header_generation(eb) != trans->transid) ++ return; ++ ++ if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) ++ return; ++ ++ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, ++ fs_info->dirty_metadata_batch); ++ + if (eb->fs_info->nodesize < PAGE_SIZE) + return clear_subpage_extent_buffer_dirty(eb); + +diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h +index a2c82448b2e0..4341ad978fb8 100644 +--- a/fs/btrfs/extent_io.h ++++ b/fs/btrfs/extent_io.h +@@ -11,6 +11,8 @@ + #include "ulist.h" + #include "misc.h" + ++struct btrfs_trans_handle; ++ + enum { + EXTENT_BUFFER_UPTODATE, + EXTENT_BUFFER_DIRTY, +@@ -60,11 +62,9 @@ enum { + #define BITMAP_LAST_BYTE_MASK(nbits) \ + (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) + +-struct btrfs_bio; + struct btrfs_root; + struct btrfs_inode; + struct btrfs_fs_info; +-struct io_failure_record; + struct extent_io_tree; + struct btrfs_tree_parent_check; + +@@ -262,7 +262,6 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star + void extent_buffer_bitmap_clear(const struct extent_buffer *eb, + unsigned long start, unsigned long pos, + unsigned long len); +-void clear_extent_buffer_dirty(const struct extent_buffer *eb); + bool set_extent_buffer_dirty(struct extent_buffer *eb); + void set_extent_buffer_uptodate(struct extent_buffer *eb); + void clear_extent_buffer_uptodate(struct extent_buffer *eb); +@@ -274,40 +273,13 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, + u32 bits_to_clear, unsigned long page_ops); + int extent_invalidate_folio(struct extent_io_tree *tree, + struct folio *folio, size_t offset); ++void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, ++ struct extent_buffer *buf); + + int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); + + void end_extent_writepage(struct page *page, int err, u64 start, u64 end); + +-/* +- * When IO fails, either with EIO or csum verification fails, we +- * try other mirrors that might have a good copy of the data. This +- * io_failure_record is used to record state as we go through all the +- * mirrors. If another mirror has good data, the sector is set up to date +- * and things continue. If a good mirror can't be found, the original +- * bio end_io callback is called to indicate things have failed. 
+- */ +-struct io_failure_record { +- /* Use rb_simple_node for search/insert */ +- struct { +- struct rb_node rb_node; +- u64 bytenr; +- }; +- struct page *page; +- u64 len; +- u64 logical; +- int this_mirror; +- int failed_mirror; +- int num_copies; +-}; +- +-int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, +- u32 bio_offset, struct page *page, unsigned int pgoff, +- bool submit_buffered); +-void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); +-int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, +- struct page *page, unsigned int pg_offset); +- + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + bool find_lock_delalloc_range(struct inode *inode, + struct page *locked_page, u64 *start, +diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c +index 5de73466b2ca..41c77a100853 100644 +--- a/fs/btrfs/file-item.c ++++ b/fs/btrfs/file-item.c +@@ -380,32 +380,25 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, + /* + * Lookup the checksum for the read bio in csum tree. + * +- * @inode: inode that the bio is for. +- * @bio: bio to look up. +- * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return +- * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If +- * NULL, the checksum buffer is allocated and returned in +- * btrfs_bio(bio)->csum instead. +- * + * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. + */ +-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst) ++blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) + { +- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; +- struct btrfs_bio *bbio = NULL; ++ struct btrfs_inode *inode = bbio->inode; ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ struct extent_io_tree *io_tree = &inode->io_tree; ++ struct bio *bio = &bbio->bio; + struct btrfs_path *path; + const u32 sectorsize = fs_info->sectorsize; + const u32 csum_size = fs_info->csum_size; + u32 orig_len = bio->bi_iter.bi_size; + u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 cur_disk_bytenr; +- u8 *csum; + const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; + int count = 0; + blk_status_t ret = BLK_STS_OK; + +- if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || ++ if ((inode->flags & BTRFS_INODE_NODATASUM) || + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) + return BLK_STS_OK; + +@@ -426,21 +419,14 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst + if (!path) + return BLK_STS_RESOURCE; + +- if (!dst) { +- bbio = btrfs_bio(bio); +- +- if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { +- bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); +- if (!bbio->csum) { +- btrfs_free_path(path); +- return BLK_STS_RESOURCE; +- } +- } else { +- bbio->csum = bbio->csum_inline; ++ if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { ++ bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); ++ if (!bbio->csum) { ++ btrfs_free_path(path); ++ return BLK_STS_RESOURCE; + } +- csum = bbio->csum; + } else { +- csum = dst; ++ bbio->csum = bbio->csum_inline; + } + + /* +@@ -456,7 +442,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst + * read from the commit root and sidestep a nasty deadlock + * between reading the free space cache and updating the csum tree. 
+ */ +- if (btrfs_is_free_space_inode(BTRFS_I(inode))) { ++ if (btrfs_is_free_space_inode(inode)) { + path->search_commit_root = 1; + path->skip_locking = 1; + } +@@ -479,14 +465,15 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst + ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX); + sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >> + fs_info->sectorsize_bits; +- csum_dst = csum + sector_offset * csum_size; ++ csum_dst = bbio->csum + sector_offset * csum_size; + + count = search_csum_tree(fs_info, path, cur_disk_bytenr, + search_len, csum_dst); + if (count < 0) { + ret = errno_to_blk_status(count); +- if (bbio) +- btrfs_bio_free_csum(bbio); ++ if (bbio->csum != bbio->csum_inline) ++ kfree(bbio->csum); ++ bbio->csum = NULL; + break; + } + +@@ -504,12 +491,13 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst + memset(csum_dst, 0, csum_size); + count = 1; + +- if (BTRFS_I(inode)->root->root_key.objectid == ++ if (inode->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + u64 file_offset; + int ret; + +- ret = search_file_offset_in_bio(bio, inode, ++ ret = search_file_offset_in_bio(bio, ++ &inode->vfs_inode, + cur_disk_bytenr, &file_offset); + if (ret) + set_extent_bits(io_tree, file_offset, +@@ -784,23 +772,16 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, + + /* + * Calculate checksums of the data contained inside a bio. +- * +- * @inode: Owner of the data inside the bio +- * @bio: Contains the data to be checksummed +- * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the +- * file offsets are determined from the page offsets in the bio. +- * Otherwise, this is the starting file offset of the bio vecs in +- * @bio, which must be contiguous. +- * @one_ordered: If true, @bio only refers to one ordered extent. 
+ */ +-blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, +- u64 offset, bool one_ordered) ++blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) + { ++ struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); ++ struct bio *bio = &bbio->bio; ++ u64 offset = bbio->file_offset; + struct btrfs_ordered_sum *sums; + struct btrfs_ordered_extent *ordered = NULL; +- const bool use_page_offsets = (offset == (u64)-1); + char *data; + struct bvec_iter iter; + struct bio_vec bvec; +@@ -828,9 +809,6 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, + shash->tfm = fs_info->csum_shash; + + bio_for_each_segment(bvec, bio, iter) { +- if (use_page_offsets) +- offset = page_offset(bvec.bv_page) + bvec.bv_offset; +- + if (!ordered) { + ordered = btrfs_lookup_ordered_extent(inode, offset); + /* +@@ -852,7 +830,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, + - 1); + + for (i = 0; i < blockcount; i++) { +- if (!one_ordered && ++ if (!(bio->bi_opf & REQ_BTRFS_ONE_ORDERED) && + !in_range(offset, ordered->file_offset, + ordered->num_bytes)) { + unsigned long bytes_left; +diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h +index 031225668434..cd7f2ae515c0 100644 +--- a/fs/btrfs/file-item.h ++++ b/fs/btrfs/file-item.h +@@ -38,7 +38,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) + + int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len); +-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); ++blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio); + int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 pos, + u64 num_bytes); +@@ -49,8 +49,10 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums); +-blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, +- u64 offset, bool one_ordered); ++blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio); ++int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, ++ struct list_head *list, int search_commit, ++ bool nowait); + int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); +diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c +index af046d22300e..5cc5a1faaef5 100644 +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -1017,7 +1017,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, + unlock_page(pages[i]); + put_page(pages[i]); + } +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + return -EAGAIN; + } +diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c +index c667e878ef1a..4d155a48ec59 100644 +--- a/fs/btrfs/free-space-tree.c ++++ b/fs/btrfs/free-space-tree.c +@@ -1283,7 +1283,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) + list_del(&free_space_root->dirty_list); + + btrfs_tree_lock(free_space_root->node); +- btrfs_clean_tree_block(free_space_root->node); ++ btrfs_clear_buffer_dirty(trans, free_space_root->node); + btrfs_tree_unlock(free_space_root->node); + btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), + free_space_root->node, 0, 
1); +diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c +index 5553e1f8afe8..31c1648bc0b4 100644 +--- a/fs/btrfs/fs.c ++++ b/fs/btrfs/fs.c +@@ -24,6 +24,7 @@ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + name, flag); + } + spin_unlock(&fs_info->super_lock); ++ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); + } + } + +@@ -46,6 +47,7 @@ void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + name, flag); + } + spin_unlock(&fs_info->super_lock); ++ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); + } + } + +@@ -68,6 +70,7 @@ void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + name, flag); + } + spin_unlock(&fs_info->super_lock); ++ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); + } + } + +@@ -90,5 +93,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + name, flag); + } + spin_unlock(&fs_info->super_lock); ++ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); + } + } +diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h +index 37b86acfcbcf..4c477eae6891 100644 +--- a/fs/btrfs/fs.h ++++ b/fs/btrfs/fs.h +@@ -3,6 +3,7 @@ + #ifndef BTRFS_FS_H + #define BTRFS_FS_H + ++#include + #include + #include + #include +@@ -125,6 +126,12 @@ enum { + */ + BTRFS_FS_NO_OVERCOMMIT, + ++ /* ++ * Indicate if we have some features changed, this is mostly for ++ * cleaner thread to update the sysfs interface. ++ */ ++ BTRFS_FS_FEATURE_CHANGED, ++ + #if BITS_PER_LONG == 32 + /* Indicate if we have error/warn message printed on 32bit systems */ + BTRFS_FS_32BIT_ERROR, +@@ -742,8 +749,10 @@ struct btrfs_fs_info { + */ + u64 zone_size; + +- /* Max size to emit ZONE_APPEND write command */ ++ /* Constraints for ZONE_APPEND commands: */ ++ struct queue_limits limits; + u64 max_zone_append_size; ++ + struct mutex zoned_meta_io_lock; + spinlock_t treelog_bg_lock; + u64 treelog_bg; +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 98a800b8bd43..44e9acc77a74 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -84,27 +84,12 @@ struct btrfs_dio_data { + }; + + struct btrfs_dio_private { +- struct btrfs_inode *inode; +- +- /* +- * Since DIO can use anonymous page, we cannot use page_offset() to +- * grab the file offset, thus need a dedicated member for file offset. +- */ ++ /* Range of I/O */ + u64 file_offset; +- /* Used for bio::bi_size */ + u32 bytes; + +- /* +- * References to this structure. There is one reference per in-flight +- * bio plus one while we're still setting up. +- */ +- refcount_t refs; +- +- /* Array of checksums */ +- u8 *csums; +- + /* This must be last */ +- struct bio bio; ++ struct btrfs_bio bbio; + }; + + static struct bio_set btrfs_dio_bioset; +@@ -228,7 +213,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, + { + unsigned long index = offset >> PAGE_SHIFT; + unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; +- u64 page_start, page_end; ++ u64 page_start = 0, page_end = 0; + struct page *page; + + if (locked_page) { +@@ -2535,19 +2520,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, + } + } + +-/* +- * in order to insert checksums into the metadata in large chunks, +- * we wait until bio submission time. All the pages in the bio are +- * checksummed and sums are attached onto the ordered extent record. 
+- * +- * At IO completion time the cums attached on the ordered extent record +- * are inserted into the btree +- */ +-blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio) +-{ +- return btrfs_csum_one_bio(inode, bio, (u64)-1, false); +-} +- + /* + * Split an extent_map at [start, start + len] + * +@@ -2663,19 +2635,19 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, + return ret; + } + +-static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, +- struct bio *bio, loff_t file_offset) ++blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio) + { ++ u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; ++ u64 len = bbio->bio.bi_iter.bi_size; ++ struct btrfs_inode *inode = bbio->inode; + struct btrfs_ordered_extent *ordered; +- u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 file_len; +- u64 len = bio->bi_iter.bi_size; + u64 end = start + len; + u64 ordered_end; + u64 pre, post; + int ret = 0; + +- ordered = btrfs_lookup_ordered_extent(inode, file_offset); ++ ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset); + if (WARN_ON_ONCE(!ordered)) + return BLK_STS_IOERR; + +@@ -2715,7 +2687,7 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, + ret = btrfs_split_ordered_extent(ordered, pre, post); + if (ret) + goto out; +- ret = split_zoned_em(inode, file_offset, file_len, pre, post); ++ ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post); + + out: + btrfs_put_ordered_extent(ordered); +@@ -2723,75 +2695,6 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, + return errno_to_blk_status(ret); + } + +-void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) +-{ +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- blk_status_t ret; +- +- if (bio_op(bio) == REQ_OP_ZONE_APPEND) { +- ret = extract_ordered_extent(inode, bio, +- page_offset(bio_first_bvec_all(bio)->bv_page)); +- if (ret) { +- btrfs_bio_end_io(btrfs_bio(bio), ret); +- return; +- } +- } +- +- /* +- * If we need to checksum, and the I/O is not issued by fsync and +- * friends, that is ->sync_writers != 0, defer the submission to a +- * workqueue to parallelize it. +- * +- * Csum items for reloc roots have already been cloned at this point, +- * so they are handled as part of the no-checksum case. +- */ +- if (!(inode->flags & BTRFS_INODE_NODATASUM) && +- !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && +- !btrfs_is_data_reloc_root(inode->root)) { +- if (!atomic_read(&inode->sync_writers) && +- btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) +- return; +- +- ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false); +- if (ret) { +- btrfs_bio_end_io(btrfs_bio(bio), ret); +- return; +- } +- } +- btrfs_submit_bio(fs_info, bio, mirror_num); +-} +- +-void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, +- int mirror_num, enum btrfs_compression_type compress_type) +-{ +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- blk_status_t ret; +- +- if (compress_type != BTRFS_COMPRESS_NONE) { +- /* +- * btrfs_submit_compressed_read will handle completing the bio +- * if there were any errors, so just return here. 
+- */ +- btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num); +- return; +- } +- +- /* Save the original iter for read repair */ +- btrfs_bio(bio)->iter = bio->bi_iter; +- +- /* +- * Lookup bio sums does extra checks around whether we need to csum or +- * not, which is why we ignore skip_sum here. +- */ +- ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); +- if (ret) { +- btrfs_bio_end_io(btrfs_bio(bio), ret); +- return; +- } +- +- btrfs_submit_bio(fs_info, bio, mirror_num); +-} +- + /* + * given a list of ordered sums record them in the inode. This happens + * at IO completion time based on sums calculated at bio submission time. +@@ -2969,7 +2872,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) + unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); + unlock_page(page); +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + goto again; + } +@@ -3259,15 +3162,13 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) + goto out; + } + +- /* A valid bdev implies a write on a sequential zone */ +- if (ordered_extent->bdev) { ++ /* A valid ->physical implies a write on a sequential zone. */ ++ if (ordered_extent->physical != (u64)-1) { + btrfs_rewrite_logical_zoned(ordered_extent); + btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); + } + +- btrfs_free_io_failure_record(inode, start, end); +- + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { + truncated = true; + logical_len = ordered_extent->truncated_len; +@@ -3474,109 +3375,55 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of + } + + /* +- * check_data_csum - verify checksum of one sector of uncompressed data +- * @inode: inode +- * @bbio: btrfs_bio which contains the csum ++ * Verify the checksum of a single data sector. ++ * ++ * @bbio: btrfs_io_bio which contains the csum ++ * @dev: device the sector is on + * @bio_offset: offset to the beginning of the bio (in bytes) +- * @page: page where is the data to be verified +- * @pgoff: offset inside the page ++ * @bv: bio_vec to check + * +- * The length of such check is always one sector size. ++ * Check if the checksum on a data block is valid. When a checksum mismatch is ++ * detected, report the error and fill the corrupted range with zero. + * +- * When csum mismatch is detected, we will also report the error and fill the +- * corrupted range with zero. (Thus it needs the extra parameters) ++ * Return %true if the sector is ok or had no checksum to start with, else %false. 
+ */ +-int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, +- u32 bio_offset, struct page *page, u32 pgoff) ++bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, ++ u32 bio_offset, struct bio_vec *bv) + { ++ struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; +- u32 len = fs_info->sectorsize; ++ u64 file_offset = bbio->file_offset + bio_offset; ++ u64 end = file_offset + bv->bv_len - 1; + u8 *csum_expected; + u8 csum[BTRFS_CSUM_SIZE]; + +- ASSERT(pgoff + len <= PAGE_SIZE); ++ ASSERT(bv->bv_len == fs_info->sectorsize); + +- csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); ++ if (!bbio->csum) ++ return true; + +- if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected)) ++ if (btrfs_is_data_reloc_root(inode->root) && ++ test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, ++ 1, NULL)) { ++ /* Skip the range without csum for data reloc inode */ ++ clear_extent_bits(&inode->io_tree, file_offset, end, ++ EXTENT_NODATASUM); ++ return true; ++ } ++ ++ csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); ++ if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, ++ csum_expected)) + goto zeroit; +- return 0; ++ return true; + + zeroit: +- btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset, +- csum, csum_expected, bbio->mirror_num); +- if (bbio->device) +- btrfs_dev_stat_inc_and_print(bbio->device, +- BTRFS_DEV_STAT_CORRUPTION_ERRS); +- memzero_page(page, pgoff, len); +- return -EIO; +-} +- +-/* +- * When reads are done, we need to check csums to verify the data is correct. +- * if there's a match, we allow the bio to finish. If not, the code in +- * extent_io.c will try to find good copies for us. +- * +- * @bio_offset: offset to the beginning of the bio (in bytes) +- * @start: file offset of the range start +- * @end: file offset of the range end (inclusive) +- * +- * Return a bitmap where bit set means a csum mismatch, and bit not set means +- * csum match. +- */ +-unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, +- u32 bio_offset, struct page *page, +- u64 start, u64 end) +-{ +- struct btrfs_inode *inode = BTRFS_I(page->mapping->host); +- struct btrfs_root *root = inode->root; +- struct btrfs_fs_info *fs_info = root->fs_info; +- struct extent_io_tree *io_tree = &inode->io_tree; +- const u32 sectorsize = root->fs_info->sectorsize; +- u32 pg_off; +- unsigned int result = 0; +- +- /* +- * This only happens for NODATASUM or compressed read. +- * Normally this should be covered by above check for compressed read +- * or the next check for NODATASUM. Just do a quicker exit here. 
+- */ +- if (bbio->csum == NULL) +- return 0; +- +- if (inode->flags & BTRFS_INODE_NODATASUM) +- return 0; +- +- if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) +- return 0; +- +- ASSERT(page_offset(page) <= start && +- end <= page_offset(page) + PAGE_SIZE - 1); +- for (pg_off = offset_in_page(start); +- pg_off < offset_in_page(end); +- pg_off += sectorsize, bio_offset += sectorsize) { +- u64 file_offset = pg_off + page_offset(page); +- int ret; +- +- if (btrfs_is_data_reloc_root(root) && +- test_range_bit(io_tree, file_offset, +- file_offset + sectorsize - 1, +- EXTENT_NODATASUM, 1, NULL)) { +- /* Skip the range without csum for data reloc inode */ +- clear_extent_bits(io_tree, file_offset, +- file_offset + sectorsize - 1, +- EXTENT_NODATASUM); +- continue; +- } +- ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); +- if (ret < 0) { +- const int nr_bit = (pg_off - offset_in_page(start)) >> +- root->fs_info->sectorsize_bits; +- +- result |= (1U << nr_bit); +- } +- } +- return result; ++ btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected, ++ bbio->mirror_num); ++ if (dev) ++ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); ++ memzero_bvec(bv); ++ return false; + } + + /* +@@ -4987,7 +4834,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, + unlock_extent(io_tree, block_start, block_end, &cached_state); + unlock_page(page); + put_page(page); +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + goto again; + } +@@ -5466,8 +5313,6 @@ void btrfs_evict_inode(struct inode *inode) + if (is_bad_inode(inode)) + goto no_delete; + +- btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); +- + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + goto no_delete; + +@@ -7392,7 +7237,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, + */ + if (writing || + test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + else + ret = nowait ? -EAGAIN : -ENOTBLK; + btrfs_put_ordered_extent(ordered); +@@ -7833,10 +7678,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + iomap->offset = start; + iomap->bdev = fs_info->fs_devices->latest_dev->bdev; + iomap->length = len; +- +- if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) +- iomap->flags |= IOMAP_F_ZONE_APPEND; +- + free_extent_map(em); + + return 0; +@@ -7888,267 +7729,47 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + return ret; + } + +-static void btrfs_dio_private_put(struct btrfs_dio_private *dip) +-{ +- /* +- * This implies a barrier so that stores to dio_bio->bi_status before +- * this and loads of dio_bio->bi_status after this are fully ordered. 
+- */ +- if (!refcount_dec_and_test(&dip->refs)) +- return; +- +- if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { +- btrfs_mark_ordered_io_finished(dip->inode, NULL, +- dip->file_offset, dip->bytes, +- !dip->bio.bi_status); +- } else { +- unlock_extent(&dip->inode->io_tree, +- dip->file_offset, +- dip->file_offset + dip->bytes - 1, NULL); +- } +- +- kfree(dip->csums); +- bio_endio(&dip->bio); +-} +- +-void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) +-{ +- struct btrfs_dio_private *dip = btrfs_bio(bio)->private; +- +- BUG_ON(bio_op(bio) == REQ_OP_WRITE); +- +- refcount_inc(&dip->refs); +- btrfs_submit_bio(inode->root->fs_info, bio, mirror_num); +-} +- +-static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, +- struct btrfs_bio *bbio, +- const bool uptodate) +-{ +- struct inode *inode = &dip->inode->vfs_inode; +- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; +- const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); +- blk_status_t err = BLK_STS_OK; +- struct bvec_iter iter; +- struct bio_vec bv; +- u32 offset; +- +- btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { +- u64 start = bbio->file_offset + offset; +- +- if (uptodate && +- (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset, +- bv.bv_page, bv.bv_offset))) { +- btrfs_clean_io_failure(BTRFS_I(inode), start, +- bv.bv_page, bv.bv_offset); +- } else { +- int ret; +- +- ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, +- bv.bv_page, bv.bv_offset, false); +- if (ret) +- err = errno_to_blk_status(ret); +- } +- } +- +- return err; +-} +- +-blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, +- struct bio *bio, +- u64 dio_file_offset) ++static void btrfs_dio_end_io(struct btrfs_bio *bbio) + { +- return btrfs_csum_one_bio(inode, bio, dio_file_offset, false); +-} +- +-static void btrfs_end_dio_bio(struct btrfs_bio *bbio) +-{ +- struct btrfs_dio_private *dip = bbio->private; ++ struct btrfs_dio_private *dip = ++ container_of(bbio, struct btrfs_dio_private, bbio); ++ struct btrfs_inode *inode = bbio->inode; + struct bio *bio = &bbio->bio; +- blk_status_t err = bio->bi_status; +- +- if (err) +- btrfs_warn(dip->inode->root->fs_info, +- "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", +- btrfs_ino(dip->inode), bio_op(bio), +- bio->bi_opf, bio->bi_iter.bi_sector, +- bio->bi_iter.bi_size, err); +- +- if (bio_op(bio) == REQ_OP_READ) +- err = btrfs_check_read_dio_bio(dip, bbio, !err); +- +- if (err) +- dip->bio.bi_status = err; +- +- btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio); +- +- bio_put(bio); +- btrfs_dio_private_put(dip); +-} + +-static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, +- u64 file_offset, int async_submit) +-{ +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct btrfs_dio_private *dip = btrfs_bio(bio)->private; +- blk_status_t ret; +- +- /* Save the original iter for read repair */ +- if (btrfs_op(bio) == BTRFS_MAP_READ) +- btrfs_bio(bio)->iter = bio->bi_iter; +- +- if (inode->flags & BTRFS_INODE_NODATASUM) +- goto map; ++ if (bio->bi_status) { ++ btrfs_warn(inode->root->fs_info, ++ "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", ++ btrfs_ino(inode), bio->bi_opf, ++ dip->file_offset, dip->bytes, bio->bi_status); ++ } + +- if (btrfs_op(bio) == BTRFS_MAP_WRITE) { +- /* Check btrfs_submit_data_write_bio() for async submit rules */ +- if (async_submit && !atomic_read(&inode->sync_writers) 
&& +- btrfs_wq_submit_bio(inode, bio, 0, file_offset, +- WQ_SUBMIT_DATA_DIO)) +- return; ++ if (btrfs_op(bio) == BTRFS_MAP_WRITE) ++ btrfs_mark_ordered_io_finished(inode, NULL, dip->file_offset, ++ dip->bytes, !bio->bi_status); ++ else ++ unlock_extent(&inode->io_tree, dip->file_offset, ++ dip->file_offset + dip->bytes - 1, NULL); + +- /* +- * If we aren't doing async submit, calculate the csum of the +- * bio now. +- */ +- ret = btrfs_csum_one_bio(inode, bio, file_offset, false); +- if (ret) { +- btrfs_bio_end_io(btrfs_bio(bio), ret); +- return; +- } +- } else { +- btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, +- file_offset - dip->file_offset); +- } +-map: +- btrfs_submit_bio(fs_info, bio, 0); ++ bbio->bio.bi_private = bbio->private; ++ iomap_dio_bio_end_io(bio); + } + +-static void btrfs_submit_direct(const struct iomap_iter *iter, +- struct bio *dio_bio, loff_t file_offset) ++static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, ++ loff_t file_offset) + { ++ struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_dio_private *dip = +- container_of(dio_bio, struct btrfs_dio_private, bio); +- struct inode *inode = iter->inode; +- const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); +- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +- const bool raid56 = (btrfs_data_alloc_profile(fs_info) & +- BTRFS_BLOCK_GROUP_RAID56_MASK); +- struct bio *bio; +- u64 start_sector; +- int async_submit = 0; +- u64 submit_len; +- u64 clone_offset = 0; +- u64 clone_len; +- u64 logical; +- int ret; +- blk_status_t status; +- struct btrfs_io_geometry geom; ++ container_of(bbio, struct btrfs_dio_private, bbio); + struct btrfs_dio_data *dio_data = iter->private; +- struct extent_map *em = NULL; +- +- dip->inode = BTRFS_I(inode); +- dip->file_offset = file_offset; +- dip->bytes = dio_bio->bi_iter.bi_size; +- refcount_set(&dip->refs, 1); +- dip->csums = NULL; +- +- if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { +- unsigned int nr_sectors = +- (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits); +- +- /* +- * Load the csums up front to reduce csum tree searches and +- * contention when submitting bios. +- */ +- status = BLK_STS_RESOURCE; +- dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); +- if (!dip->csums) +- goto out_err; +- +- status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); +- if (status != BLK_STS_OK) +- goto out_err; +- } +- +- start_sector = dio_bio->bi_iter.bi_sector; +- submit_len = dio_bio->bi_iter.bi_size; +- +- do { +- logical = start_sector << 9; +- em = btrfs_get_chunk_map(fs_info, logical, submit_len); +- if (IS_ERR(em)) { +- status = errno_to_blk_status(PTR_ERR(em)); +- em = NULL; +- goto out_err_em; +- } +- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), +- logical, &geom); +- if (ret) { +- status = errno_to_blk_status(ret); +- goto out_err_em; +- } + +- clone_len = min(submit_len, geom.len); +- ASSERT(clone_len <= UINT_MAX); ++ btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); ++ bbio->file_offset = file_offset; + +- /* +- * This will never fail as it's passing GPF_NOFS and +- * the allocation is backed by btrfs_bioset. 
+- */ +- bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len, +- btrfs_end_dio_bio, dip); +- btrfs_bio(bio)->file_offset = file_offset; +- +- if (bio_op(bio) == REQ_OP_ZONE_APPEND) { +- status = extract_ordered_extent(BTRFS_I(inode), bio, +- file_offset); +- if (status) { +- bio_put(bio); +- goto out_err; +- } +- } +- +- ASSERT(submit_len >= clone_len); +- submit_len -= clone_len; +- +- /* +- * Increase the count before we submit the bio so we know +- * the end IO handler won't happen before we increase the +- * count. Otherwise, the dip might get freed before we're +- * done setting it up. +- * +- * We transfer the initial reference to the last bio, so we +- * don't need to increment the reference count for the last one. +- */ +- if (submit_len > 0) { +- refcount_inc(&dip->refs); +- /* +- * If we are submitting more than one bio, submit them +- * all asynchronously. The exception is RAID 5 or 6, as +- * asynchronous checksums make it difficult to collect +- * full stripe writes. +- */ +- if (!raid56) +- async_submit = 1; +- } +- +- btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); +- +- dio_data->submitted += clone_len; +- clone_offset += clone_len; +- start_sector += clone_len >> 9; +- file_offset += clone_len; +- +- free_extent_map(em); +- } while (submit_len > 0); +- return; ++ dip->file_offset = file_offset; ++ dip->bytes = bio->bi_iter.bi_size; + +-out_err_em: +- free_extent_map(em); +-out_err: +- dio_bio->bi_status = status; +- btrfs_dio_private_put(dip); ++ dio_data->submitted += bio->bi_iter.bi_size; ++ btrfs_submit_bio(bio, 0); + } + + static const struct iomap_ops btrfs_dio_iomap_ops = { +@@ -8157,7 +7778,7 @@ static const struct iomap_ops btrfs_dio_iomap_ops = { + }; + + static const struct iomap_dio_ops btrfs_dio_ops = { +- .submit_io = btrfs_submit_direct, ++ .submit_io = btrfs_dio_submit_io, + .bio_set = &btrfs_dio_bioset, + }; + +@@ -8552,7 +8173,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) + unlock_extent(io_tree, page_start, page_end, &cached_state); + unlock_page(page); + up_read(&BTRFS_I(inode)->i_mmap_lock); +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + goto again; + } +@@ -8850,7 +8471,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) + ei->last_log_commit = 0; + + spin_lock_init(&ei->lock); +- spin_lock_init(&ei->io_failure_lock); + ei->outstanding_extents = 0; + if (sb->s_magic != BTRFS_TEST_MAGIC) + btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, +@@ -8870,7 +8490,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) + ei->io_tree.inode = ei; + extent_io_tree_init(fs_info, &ei->file_extent_tree, + IO_TREE_INODE_FILE_EXTENT); +- ei->io_failure_tree = RB_ROOT; + atomic_set(&ei->sync_writers, 0); + mutex_init(&ei->log_mutex); + btrfs_ordered_inode_tree_init(&ei->ordered_tree); +@@ -8994,7 +8613,7 @@ int __init btrfs_init_cachep(void) + goto fail; + + if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, +- offsetof(struct btrfs_dio_private, bio), ++ offsetof(struct btrfs_dio_private, bbio.bio), + BIOSET_NEED_BVECS)) + goto fail; + +@@ -10289,65 +9908,13 @@ struct btrfs_encoded_read_private { + wait_queue_head_t wait; + atomic_t pending; + blk_status_t status; +- bool skip_csum; + }; + +-static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, +- struct bio *bio, int mirror_num) +-{ +- struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private; +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- 
blk_status_t ret; +- +- if (!priv->skip_csum) { +- ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); +- if (ret) +- return ret; +- } +- +- atomic_inc(&priv->pending); +- btrfs_submit_bio(fs_info, bio, mirror_num); +- return BLK_STS_OK; +-} +- +-static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) +-{ +- const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); +- struct btrfs_encoded_read_private *priv = bbio->private; +- struct btrfs_inode *inode = priv->inode; +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- u32 sectorsize = fs_info->sectorsize; +- struct bio_vec *bvec; +- struct bvec_iter_all iter_all; +- u32 bio_offset = 0; +- +- if (priv->skip_csum || !uptodate) +- return bbio->bio.bi_status; +- +- bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { +- unsigned int i, nr_sectors, pgoff; +- +- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); +- pgoff = bvec->bv_offset; +- for (i = 0; i < nr_sectors; i++) { +- ASSERT(pgoff < PAGE_SIZE); +- if (btrfs_check_data_csum(inode, bbio, bio_offset, +- bvec->bv_page, pgoff)) +- return BLK_STS_IOERR; +- bio_offset += sectorsize; +- pgoff += sectorsize; +- } +- } +- return BLK_STS_OK; +-} +- + static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) + { + struct btrfs_encoded_read_private *priv = bbio->private; +- blk_status_t status; + +- status = btrfs_encoded_read_verify_csum(bbio); +- if (status) { ++ if (bbio->bio.bi_status) { + /* + * The memory barrier implied by the atomic_dec_return() here + * pairs with the memory barrier implied by the +@@ -10356,11 +9923,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) + * write is observed before the load of status in + * btrfs_encoded_read_regular_fill_pages(). + */ +- WRITE_ONCE(priv->status, status); ++ WRITE_ONCE(priv->status, bbio->bio.bi_status); + } + if (!atomic_dec_return(&priv->pending)) + wake_up(&priv->wait); +- btrfs_bio_free_csum(bbio); + bio_put(&bbio->bio); + } + +@@ -10368,47 +9934,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, struct page **pages) + { +- struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_encoded_read_private priv = { + .inode = inode, + .file_offset = file_offset, + .pending = ATOMIC_INIT(1), +- .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM), + }; + unsigned long i = 0; + u64 cur = 0; +- int ret; + + init_waitqueue_head(&priv.wait); +- /* +- * Submit bios for the extent, splitting due to bio or stripe limits as +- * necessary. +- */ ++ /* Submit bios for the extent, splitting due to bio limits as necessary. 
*/ + while (cur < disk_io_size) { +- struct extent_map *em; +- struct btrfs_io_geometry geom; + struct bio *bio = NULL; +- u64 remaining; ++ u64 remaining = disk_io_size - cur; + +- em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur, +- disk_io_size - cur); +- if (IS_ERR(em)) { +- ret = PTR_ERR(em); +- } else { +- ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ, +- disk_bytenr + cur, &geom); +- free_extent_map(em); +- } +- if (ret) { +- WRITE_ONCE(priv.status, errno_to_blk_status(ret)); +- break; +- } +- remaining = min(geom.len, disk_io_size - cur); + while (bio || remaining) { + size_t bytes = min_t(u64, remaining, PAGE_SIZE); + + if (!bio) { + bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, ++ inode, + btrfs_encoded_read_endio, + &priv); + bio->bi_iter.bi_sector = +@@ -10417,14 +9962,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + + if (!bytes || + bio_add_page(bio, pages[i], bytes, 0) < bytes) { +- blk_status_t status; +- +- status = submit_encoded_read_bio(inode, bio, 0); +- if (status) { +- WRITE_ONCE(priv.status, status); +- bio_put(bio); +- goto out; +- } ++ atomic_inc(&priv.pending); ++ btrfs_submit_bio(bio, 0); + bio = NULL; + continue; + } +@@ -10435,7 +9974,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + } + } + +-out: + if (atomic_dec_return(&priv.pending)) + io_wait_event(priv.wait, !atomic_read(&priv.pending)); + /* See btrfs_encoded_read_endio() for ordering. */ +@@ -10995,9 +10533,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, + return 0; + + max_pages = sis->max - bsi->nr_pages; +- first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; +- next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, +- PAGE_SIZE) >> PAGE_SHIFT; ++ first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; ++ next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; + + if (first_ppage >= next_ppage) + return 0; +diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c +index 7e348bd2ccde..8ea557e22252 100644 +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -707,7 +707,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, + * exists). + */ + btrfs_tree_lock(leaf); +- btrfs_clean_tree_block(leaf); ++ btrfs_clear_buffer_dirty(trans, leaf); + btrfs_tree_unlock(leaf); + btrfs_free_tree_block(trans, objectid, leaf, 0, 1); + free_extent_buffer(leaf); +diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c +new file mode 100644 +index 000000000000..0fe0ae54ac67 +--- /dev/null ++++ b/fs/btrfs/lru_cache.c +@@ -0,0 +1,166 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++#include "lru_cache.h" ++#include "messages.h" ++ ++/* ++ * Initialize a cache object. ++ * ++ * @cache: The cache. ++ * @max_size: Maximum size (number of entries) for the cache. ++ * Use 0 for unlimited size, it's the user's responsability to ++ * trim the cache in that case. ++ */ ++void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size) ++{ ++ INIT_LIST_HEAD(&cache->lru_list); ++ mt_init(&cache->entries); ++ cache->size = 0; ++ cache->max_size = max_size; ++} ++ ++static struct btrfs_lru_cache_entry *match_entry(struct list_head *head, u64 key, ++ u64 gen) ++{ ++ struct btrfs_lru_cache_entry *entry; ++ ++ list_for_each_entry(entry, head, list) { ++ if (entry->key == key && entry->gen == gen) ++ return entry; ++ } ++ ++ return NULL; ++} ++ ++/* ++ * Lookup for an entry in the cache. ++ * ++ * @cache: The cache. ++ * @key: The key of the entry we are looking for. 
++ * @gen: Generation associated to the key. ++ * ++ * Returns the entry associated with the key or NULL if none found. ++ */ ++struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, ++ u64 key, u64 gen) ++{ ++ struct list_head *head; ++ struct btrfs_lru_cache_entry *entry; ++ ++ head = mtree_load(&cache->entries, key); ++ if (!head) ++ return NULL; ++ ++ entry = match_entry(head, key, gen); ++ if (entry) ++ list_move_tail(&entry->lru_list, &cache->lru_list); ++ ++ return entry; ++} ++ ++/* ++ * Remove an entry from the cache. ++ * ++ * @cache: The cache to remove from. ++ * @entry: The entry to remove from the cache. ++ * ++ * Note: this also frees the memory used by the entry. ++ */ ++void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, ++ struct btrfs_lru_cache_entry *entry) ++{ ++ struct list_head *prev = entry->list.prev; ++ ++ ASSERT(cache->size > 0); ++ ASSERT(!mtree_empty(&cache->entries)); ++ ++ list_del(&entry->list); ++ list_del(&entry->lru_list); ++ ++ if (list_empty(prev)) { ++ struct list_head *head; ++ ++ /* ++ * If previous element in the list entry->list is now empty, it ++ * means it's a head entry not pointing to any cached entries, ++ * so remove it from the maple tree and free it. ++ */ ++ head = mtree_erase(&cache->entries, entry->key); ++ ASSERT(head == prev); ++ kfree(head); ++ } ++ ++ kfree(entry); ++ cache->size--; ++} ++ ++/* ++ * Store an entry in the cache. ++ * ++ * @cache: The cache. ++ * @entry: The entry to store. ++ * ++ * Returns 0 on success and < 0 on error. ++ */ ++int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, ++ struct btrfs_lru_cache_entry *new_entry, ++ gfp_t gfp) ++{ ++ const u64 key = new_entry->key; ++ struct list_head *head; ++ int ret; ++ ++ head = kmalloc(sizeof(*head), gfp); ++ if (!head) ++ return -ENOMEM; ++ ++ ret = mtree_insert(&cache->entries, key, head, gfp); ++ if (ret == 0) { ++ INIT_LIST_HEAD(head); ++ list_add_tail(&new_entry->list, head); ++ } else if (ret == -EEXIST) { ++ kfree(head); ++ head = mtree_load(&cache->entries, key); ++ ASSERT(head != NULL); ++ if (match_entry(head, key, new_entry->gen) != NULL) ++ return -EEXIST; ++ list_add_tail(&new_entry->list, head); ++ } else if (ret < 0) { ++ kfree(head); ++ return ret; ++ } ++ ++ if (cache->max_size > 0 && cache->size == cache->max_size) { ++ struct btrfs_lru_cache_entry *lru_entry; ++ ++ lru_entry = list_first_entry(&cache->lru_list, ++ struct btrfs_lru_cache_entry, ++ lru_list); ++ btrfs_lru_cache_remove(cache, lru_entry); ++ } ++ ++ list_add_tail(&new_entry->lru_list, &cache->lru_list); ++ cache->size++; ++ ++ return 0; ++} ++ ++/* ++ * Empty a cache. ++ * ++ * @cache: The cache to empty. ++ * ++ * Removes all entries from the cache. ++ */ ++void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache) ++{ ++ struct btrfs_lru_cache_entry *entry; ++ struct btrfs_lru_cache_entry *tmp; ++ ++ list_for_each_entry_safe(entry, tmp, &cache->lru_list, lru_list) ++ btrfs_lru_cache_remove(cache, entry); ++ ++ ASSERT(cache->size == 0); ++ ASSERT(mtree_empty(&cache->entries)); ++} +diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h +new file mode 100644 +index 000000000000..de3e18bce24a +--- /dev/null ++++ b/fs/btrfs/lru_cache.h +@@ -0,0 +1,80 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef BTRFS_LRU_CACHE_H ++#define BTRFS_LRU_CACHE_H ++ ++#include ++#include ++ ++/* ++ * A cache entry. This is meant to be embedded in a structure of a user of ++ * this module. Similar to how struct list_head and struct rb_node are used. 
++ * ++ * Note: it should be embedded as the first element in a struct (offset 0), and ++ * this module assumes it was allocated with kmalloc(), so it calls kfree() when ++ * it needs to free an entry. ++ */ ++struct btrfs_lru_cache_entry { ++ struct list_head lru_list; ++ u64 key; ++ /* ++ * Optional generation associated to a key. Use 0 if not needed/used. ++ * Entries with the same key and different generations are stored in a ++ * linked list, so use this only for cases where there's a small number ++ * of different generations. ++ */ ++ u64 gen; ++ /* ++ * The maple tree uses unsigned long type for the keys, which is 32 bits ++ * on 32 bits systems, and 64 bits on 64 bits systems. So if we want to ++ * use something like inode numbers as keys, which are always a u64, we ++ * have to deal with this in a special way - we store the key in the ++ * entry itself, as a u64, and the values inserted into the maple tree ++ * are linked lists of entries - so in case we are on a 64 bits system, ++ * that list always has a single entry, while on 32 bits systems it ++ * may have more than one, with each entry having the same value for ++ * their lower 32 bits of the u64 key. ++ */ ++ struct list_head list; ++}; ++ ++struct btrfs_lru_cache { ++ struct list_head lru_list; ++ struct maple_tree entries; ++ /* Number of entries stored in the cache. */ ++ unsigned int size; ++ /* Maximum number of entries the cache can have. */ ++ unsigned int max_size; ++}; ++ ++#define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \ ++ list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list) ++ ++static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache) ++{ ++ return cache->size; ++} ++ ++static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache) ++{ ++ return cache->size >= cache->max_size; ++} ++ ++static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( ++ struct btrfs_lru_cache *cache) ++{ ++ return list_first_entry_or_null(&cache->lru_list, ++ struct btrfs_lru_cache_entry, lru_list); ++} ++ ++void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size); ++struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache, ++ u64 key, u64 gen); ++int btrfs_lru_cache_store(struct btrfs_lru_cache *cache, ++ struct btrfs_lru_cache_entry *new_entry, ++ gfp_t gfp); ++void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache, ++ struct btrfs_lru_cache_entry *entry); ++void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache); ++ ++#endif +diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c +index d5e78cbc8fbc..71f6d8302d50 100644 +--- a/fs/btrfs/lzo.c ++++ b/fs/btrfs/lzo.c +@@ -280,7 +280,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, + } + + /* Check if we have reached page boundary */ +- if (IS_ALIGNED(cur_in, PAGE_SIZE)) { ++ if (PAGE_ALIGNED(cur_in)) { + put_page(page_in); + page_in = NULL; + } +diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c +index 625bbbbb2608..fde5aaa6e7c9 100644 +--- a/fs/btrfs/messages.c ++++ b/fs/btrfs/messages.c +@@ -292,36 +292,6 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) + } + #endif + +-/* +- * We only mark the transaction aborted and then set the file system read-only. +- * This will prevent new transactions from starting or trying to join this +- * one. 
+- * +- * This means that error recovery at the call site is limited to freeing +- * any local memory allocations and passing the error code up without +- * further cleanup. The transaction should complete as it normally would +- * in the call path but will return -EIO. +- * +- * We'll complete the cleanup in btrfs_end_transaction and +- * btrfs_commit_transaction. +- */ +-__cold +-void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, +- const char *function, +- unsigned int line, int errno, bool first_hit) +-{ +- struct btrfs_fs_info *fs_info = trans->fs_info; +- +- WRITE_ONCE(trans->aborted, errno); +- WRITE_ONCE(trans->transaction->aborted, errno); +- if (first_hit && errno == -ENOSPC) +- btrfs_dump_space_info_for_trans_abort(fs_info); +- /* Wake up anybody who may be waiting on this transaction */ +- wake_up(&fs_info->transaction_wait); +- wake_up(&fs_info->transaction_blocked_wait); +- __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); +-} +- + /* + * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an + * alert, and either panics or BUGs, depending on mount options. +diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h +index 190af1f698d9..8c516ee58ff9 100644 +--- a/fs/btrfs/messages.h ++++ b/fs/btrfs/messages.h +@@ -6,7 +6,6 @@ + #include + + struct btrfs_fs_info; +-struct btrfs_trans_handle; + + static inline __printf(2, 3) __cold + void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +@@ -178,39 +177,6 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function + + const char * __attribute_const__ btrfs_decode_error(int errno); + +-__cold +-void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, +- const char *function, +- unsigned int line, int errno, bool first_hit); +- +-bool __cold abort_should_print_stack(int errno); +- +-/* +- * Call btrfs_abort_transaction as early as possible when an error condition is +- * detected, that way the exact stack trace is reported for some errors. +- */ +-#define btrfs_abort_transaction(trans, errno) \ +-do { \ +- bool first = false; \ +- /* Report first abort since mount */ \ +- if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ +- &((trans)->fs_info->fs_state))) { \ +- first = true; \ +- if (WARN(abort_should_print_stack(errno), \ +- KERN_ERR \ +- "BTRFS: Transaction aborted (error %d)\n", \ +- (errno))) { \ +- /* Stack trace printed. */ \ +- } else { \ +- btrfs_err((trans)->fs_info, \ +- "Transaction aborted (error %d)", \ +- (errno)); \ +- } \ +- } \ +- __btrfs_abort_transaction((trans), __func__, \ +- __LINE__, (errno), first); \ +-} while (0) +- + #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args) +diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c +index 57d8c72737e1..6c24b69e2d0a 100644 +--- a/fs/btrfs/ordered-data.c ++++ b/fs/btrfs/ordered-data.c +@@ -616,7 +616,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) + struct btrfs_ordered_extent *ordered; + + ordered = container_of(work, struct btrfs_ordered_extent, flush_work); +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + complete(&ordered->completion); + } + +@@ -716,13 +716,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, + } + + /* +- * Used to start IO or wait for a given ordered extent to finish. ++ * Start IO and wait for a given ordered extent to finish. 
+ * +- * If wait is one, this effectively waits on page writeback for all the pages +- * in the extent, and it waits on the io completion code to insert +- * metadata into the btree corresponding to the extent ++ * Wait on page writeback for all the pages in the extent and the IO completion ++ * code to insert metadata into the btree corresponding to the extent. + */ +-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) ++void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) + { + u64 start = entry->file_offset; + u64 end = start + entry->num_bytes - 1; +@@ -744,12 +743,10 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) + */ + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); +- if (wait) { +- if (!freespace_inode) +- btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); +- wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, +- &entry->flags)); +- } ++ ++ if (!freespace_inode) ++ btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); ++ wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); + } + + /* +@@ -800,7 +797,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) + btrfs_put_ordered_extent(ordered); + break; + } +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + end = ordered->file_offset; + /* + * If the ordered extent had an error save the error but don't +@@ -1061,7 +1058,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, + break; + } + unlock_extent(&inode->io_tree, start, end, cachedp); +- btrfs_start_ordered_extent(ordered, 1); ++ btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + } +diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h +index 89f82b78f590..eb40cb39f842 100644 +--- a/fs/btrfs/ordered-data.h ++++ b/fs/btrfs/ordered-data.h +@@ -157,7 +157,6 @@ struct btrfs_ordered_extent { + * command in a workqueue context + */ + u64 physical; +- struct block_device *bdev; + }; + + static inline void +@@ -187,7 +186,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum); + struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, + u64 file_offset); +-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); ++void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); + int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); + struct btrfs_ordered_extent * + btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); +diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c +index af97413abcf4..52a7d2fa2284 100644 +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -1304,7 +1304,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) + list_del("a_root->dirty_list); + + btrfs_tree_lock(quota_root->node); +- btrfs_clean_tree_block(quota_root->node); ++ btrfs_clear_buffer_dirty(trans, quota_root->node); + btrfs_tree_unlock(quota_root->node); + btrfs_free_tree_block(trans, btrfs_root_id(quota_root), + quota_root->node, 0, 1); +diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c +index ff4b1d583788..642828c1b299 100644 +--- a/fs/btrfs/raid56.c ++++ b/fs/btrfs/raid56.c +@@ -998,7 +998,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) + } + + /* +- * Return the total numer of errors found in the vertical stripe 
of @sector_nr. ++ * Return the total number of errors found in the vertical stripe of @sector_nr. + * + * @faila and @failb will also be updated to the first and second stripe + * number of the errors. +@@ -1183,7 +1183,15 @@ static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, + trace_info->stripe_nr = -1; + } + +-/* Generate PQ for one veritical stripe. */ ++static inline void bio_list_put(struct bio_list *bio_list) ++{ ++ struct bio *bio; ++ ++ while ((bio = bio_list_pop(bio_list))) ++ bio_put(bio); ++} ++ ++/* Generate PQ for one vertical stripe. */ + static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) + { + void **pointers = rbio->finish_pointers; +@@ -1228,7 +1236,6 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) + static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) + { +- struct bio *bio; + /* The total sector number inside the full stripe. */ + int total_sector_nr; + int sectornr; +@@ -1317,8 +1324,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, + + return 0; + error: +- while ((bio = bio_list_pop(bio_list))) +- bio_put(bio); ++ bio_list_put(bio_list); + return -EIO; + } + +@@ -1357,7 +1363,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) + } + + /* +- * For subpage case, we can no longer set page Uptodate directly for ++ * For subpage case, we can no longer set page Up-to-date directly for + * stripe_pages[], thus we need to locate the sector. + */ + static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, +@@ -1425,10 +1431,9 @@ static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bi + int total_sector_nr = get_bio_sector_nr(rbio, bio); + u32 bio_size = 0; + struct bio_vec *bvec; +- struct bvec_iter_all iter_all; + int i; + +- bio_for_each_segment_all(bvec, bio, iter_all) ++ bio_for_each_bvec_all(bvec, bio, i) + bio_size += bvec->bv_len; + + /* +@@ -1498,7 +1503,7 @@ static void raid_wait_read_end_io(struct bio *bio) + wake_up(&rbio->io_wait); + } + +-static void submit_read_bios(struct btrfs_raid_bio *rbio, ++static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) + { + struct bio *bio; +@@ -1515,41 +1520,8 @@ static void submit_read_bios(struct btrfs_raid_bio *rbio, + } + submit_bio(bio); + } +-} +- +-static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, +- struct bio_list *bio_list) +-{ +- struct bio *bio; +- int total_sector_nr; +- int ret = 0; +- +- ASSERT(bio_list_size(bio_list) == 0); +- +- /* +- * Build a list of bios to read all sectors (including data and P/Q). +- * +- * This behaviro is to compensate the later csum verification and +- * recovery. 
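One small change in this hunk that is easy to miss: rbio_update_error_bitmap() now sums the bio's size with bio_for_each_bvec_all() (walking the raw bio_vec array) instead of bio_for_each_segment_all() (which splits every bio_vec into single-page segments). The byte total is identical either way, so the cheaper raw walk is enough here. A hedged sketch of the new iteration, lifted out as a stand-alone helper purely for clarity (the helper name is made up):

static u32 sketch_bio_payload_size(struct bio *bio)
{
    struct bio_vec *bvec;
    u32 size = 0;
    int i;

    /* Walk the raw bio_vec array; no per-page splitting is needed here. */
    bio_for_each_bvec_all(bvec, bio, i)
        size += bvec->bv_len;

    return size;
}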
+- */ +- for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; +- total_sector_nr++) { +- struct sector_ptr *sector; +- int stripe = total_sector_nr / rbio->stripe_nsectors; +- int sectornr = total_sector_nr % rbio->stripe_nsectors; +- +- sector = rbio_stripe_sector(rbio, stripe, sectornr); +- ret = rbio_add_io_sector(rbio, bio_list, sector, +- stripe, sectornr, REQ_OP_READ); +- if (ret) +- goto cleanup; +- } +- return 0; + +-cleanup: +- while ((bio = bio_list_pop(bio_list))) +- bio_put(bio); +- return ret; ++ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + } + + static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) +@@ -1668,12 +1640,12 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) + struct btrfs_raid_bio *rbio; + struct btrfs_plug_cb *plug = NULL; + struct blk_plug_cb *cb; +- int ret = 0; + + rbio = alloc_rbio(fs_info, bioc); + if (IS_ERR(rbio)) { +- ret = PTR_ERR(rbio); +- goto fail; ++ bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); ++ bio_endio(bio); ++ return; + } + rbio->operation = BTRFS_RBIO_WRITE; + rbio_add_bio(rbio, bio); +@@ -1682,31 +1654,24 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) + * Don't plug on full rbios, just get them out the door + * as quickly as we can + */ +- if (rbio_is_full(rbio)) +- goto queue_rbio; +- +- cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); +- if (cb) { +- plug = container_of(cb, struct btrfs_plug_cb, cb); +- if (!plug->info) { +- plug->info = fs_info; +- INIT_LIST_HEAD(&plug->rbio_list); ++ if (!rbio_is_full(rbio)) { ++ cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); ++ if (cb) { ++ plug = container_of(cb, struct btrfs_plug_cb, cb); ++ if (!plug->info) { ++ plug->info = fs_info; ++ INIT_LIST_HEAD(&plug->rbio_list); ++ } ++ list_add_tail(&rbio->plug_list, &plug->rbio_list); ++ return; + } +- list_add_tail(&rbio->plug_list, &plug->rbio_list); +- return; + } +-queue_rbio: ++ + /* + * Either we don't have any existing plug, or we're doing a full stripe, +- * can queue the rmw work now. ++ * queue the rmw work now. + */ + start_async_work(rbio, rmw_rbio_work); +- +- return; +- +-fail: +- bio->bi_status = errno_to_blk_status(ret); +- bio_endio(bio); + } + + static int verify_one_sector(struct btrfs_raid_bio *rbio, +@@ -1773,7 +1738,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, + &failb); + /* +- * No errors in the veritical stripe, skip it. Can happen for recovery ++ * No errors in the vertical stripe, skip it. Can happen for recovery + * which only part of a stripe failed csum check. + */ + if (!found_errors) +@@ -1949,14 +1914,25 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) + return ret; + } + +-static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, +- struct bio_list *bio_list) ++static void recover_rbio(struct btrfs_raid_bio *rbio) + { +- struct bio *bio; ++ struct bio_list bio_list = BIO_EMPTY_LIST; + int total_sector_nr; + int ret = 0; + +- ASSERT(bio_list_size(bio_list) == 0); ++ /* ++ * Either we're doing recover for a read failure or degraded write, ++ * caller should have set error bitmap correctly. ++ */ ++ ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); ++ ++ /* For recovery, we need to read all sectors including P/Q. */ ++ ret = alloc_rbio_pages(rbio); ++ if (ret < 0) ++ goto out; ++ ++ index_rbio_pages(rbio); ++ + /* + * Read everything that hasn't failed. 
However this time we will + * not trust any cached sector. +@@ -1987,78 +1963,32 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, + } + + sector = rbio_stripe_sector(rbio, stripe, sectornr); +- ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, ++ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_READ); +- if (ret < 0) +- goto error; ++ if (ret < 0) { ++ bio_list_put(&bio_list); ++ goto out; ++ } + } +- return 0; +-error: +- while ((bio = bio_list_pop(bio_list))) +- bio_put(bio); +- +- return -EIO; +-} +- +-static int recover_rbio(struct btrfs_raid_bio *rbio) +-{ +- struct bio_list bio_list; +- struct bio *bio; +- int ret; +- +- /* +- * Either we're doing recover for a read failure or degraded write, +- * caller should have set error bitmap correctly. +- */ +- ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); +- bio_list_init(&bio_list); +- +- /* For recovery, we need to read all sectors including P/Q. */ +- ret = alloc_rbio_pages(rbio); +- if (ret < 0) +- goto out; +- +- index_rbio_pages(rbio); +- +- ret = recover_assemble_read_bios(rbio, &bio_list); +- if (ret < 0) +- goto out; +- +- submit_read_bios(rbio, &bio_list); +- wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + ++ submit_read_wait_bio_list(rbio, &bio_list); + ret = recover_sectors(rbio); +- + out: +- while ((bio = bio_list_pop(&bio_list))) +- bio_put(bio); +- +- return ret; ++ rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } + + static void recover_rbio_work(struct work_struct *work) + { + struct btrfs_raid_bio *rbio; +- int ret; + + rbio = container_of(work, struct btrfs_raid_bio, work); +- +- ret = lock_stripe_add(rbio); +- if (ret == 0) { +- ret = recover_rbio(rbio); +- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); +- } ++ if (!lock_stripe_add(rbio)) ++ recover_rbio(rbio); + } + + static void recover_rbio_work_locked(struct work_struct *work) + { +- struct btrfs_raid_bio *rbio; +- int ret; +- +- rbio = container_of(work, struct btrfs_raid_bio, work); +- +- ret = recover_rbio(rbio); +- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); ++ recover_rbio(container_of(work, struct btrfs_raid_bio, work)); + } + + static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) +@@ -2204,11 +2134,9 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) + + static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) + { +- struct bio_list bio_list; +- struct bio *bio; +- int ret; +- +- bio_list_init(&bio_list); ++ struct bio_list bio_list = BIO_EMPTY_LIST; ++ int total_sector_nr; ++ int ret = 0; + + /* + * Fill the data csums we need for data verification. We need to fill +@@ -2217,24 +2145,32 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) + */ + fill_data_csums(rbio); + +- ret = rmw_assemble_read_bios(rbio, &bio_list); +- if (ret < 0) +- goto out; ++ /* ++ * Build a list of bios to read all sectors (including data and P/Q). ++ * ++ * This behavior is to compensate the later csum verification and recovery. 
++ */ ++ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; ++ total_sector_nr++) { ++ struct sector_ptr *sector; ++ int stripe = total_sector_nr / rbio->stripe_nsectors; ++ int sectornr = total_sector_nr % rbio->stripe_nsectors; + +- submit_read_bios(rbio, &bio_list); +- wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); ++ sector = rbio_stripe_sector(rbio, stripe, sectornr); ++ ret = rbio_add_io_sector(rbio, &bio_list, sector, ++ stripe, sectornr, REQ_OP_READ); ++ if (ret) { ++ bio_list_put(&bio_list); ++ return ret; ++ } ++ } + + /* + * We may or may not have any corrupted sectors (including missing dev + * and csum mismatch), just let recover_sectors() to handle them all. + */ +- ret = recover_sectors(rbio); +- return ret; +-out: +- while ((bio = bio_list_pop(&bio_list))) +- bio_put(bio); +- +- return ret; ++ submit_read_wait_bio_list(rbio, &bio_list); ++ return recover_sectors(rbio); + } + + static void raid_wait_write_end_io(struct bio *bio) +@@ -2290,7 +2226,7 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) + return false; + } + +-static int rmw_rbio(struct btrfs_raid_bio *rbio) ++static void rmw_rbio(struct btrfs_raid_bio *rbio) + { + struct bio_list bio_list; + int sectornr; +@@ -2302,30 +2238,28 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) + */ + ret = alloc_rbio_parity_pages(rbio); + if (ret < 0) +- return ret; ++ goto out; + + /* + * Either full stripe write, or we have every data sector already + * cached, can go to write path immediately. + */ +- if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio)) +- goto write; +- +- /* +- * Now we're doing sub-stripe write, also need all data stripes to do +- * the full RMW. +- */ +- ret = alloc_rbio_data_pages(rbio); +- if (ret < 0) +- return ret; ++ if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { ++ /* ++ * Now we're doing sub-stripe write, also need all data stripes ++ * to do the full RMW. ++ */ ++ ret = alloc_rbio_data_pages(rbio); ++ if (ret < 0) ++ goto out; + +- index_rbio_pages(rbio); ++ index_rbio_pages(rbio); + +- ret = rmw_read_wait_recover(rbio); +- if (ret < 0) +- return ret; ++ ret = rmw_read_wait_recover(rbio); ++ if (ret < 0) ++ goto out; ++ } + +-write: + /* + * At this stage we're not allowed to add any new bios to the + * bio list any more, anyone else that wants to change this stripe +@@ -2356,7 +2290,7 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) + bio_list_init(&bio_list); + ret = rmw_assemble_write_bios(rbio, &bio_list); + if (ret < 0) +- return ret; ++ goto out; + + /* We should have at least one bio assembled. 
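The read loops in this area (rmw_read_wait_recover() above, and the recovery and scrub paths elsewhere in this patch) iterate a single flat total_sector_nr and split it into a (stripe, sectornr) pair with a division and a modulo. A tiny stand-alone program, using a made-up 3-stripe-by-4-sector geometry, shows the mapping:

#include <stdio.h>

int main(void)
{
    const int nr_stripes = 3;       /* e.g. 2 data stripes + 1 parity */
    const int stripe_nsectors = 4;  /* sectors per stripe */
    const int nr_sectors = nr_stripes * stripe_nsectors;
    int total_sector_nr;

    for (total_sector_nr = 0; total_sector_nr < nr_sectors; total_sector_nr++) {
        int stripe = total_sector_nr / stripe_nsectors;
        int sectornr = total_sector_nr % stripe_nsectors;

        printf("sector %2d -> stripe %d, sectornr %d\n",
               total_sector_nr, stripe, sectornr);
    }
    return 0;
}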
*/ + ASSERT(bio_list_size(&bio_list)); +@@ -2373,32 +2307,22 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) + break; + } + } +- return ret; ++out: ++ rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } + + static void rmw_rbio_work(struct work_struct *work) + { + struct btrfs_raid_bio *rbio; +- int ret; + + rbio = container_of(work, struct btrfs_raid_bio, work); +- +- ret = lock_stripe_add(rbio); +- if (ret == 0) { +- ret = rmw_rbio(rbio); +- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); +- } ++ if (lock_stripe_add(rbio) == 0) ++ rmw_rbio(rbio); + } + + static void rmw_rbio_work_locked(struct work_struct *work) + { +- struct btrfs_raid_bio *rbio; +- int ret; +- +- rbio = container_of(work, struct btrfs_raid_bio, work); +- +- ret = rmw_rbio(rbio); +- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); ++ rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); + } + + /* +@@ -2506,7 +2430,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) + struct sector_ptr p_sector = { 0 }; + struct sector_ptr q_sector = { 0 }; + struct bio_list bio_list; +- struct bio *bio; + int is_replace = 0; + int ret; + +@@ -2637,8 +2560,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) + return 0; + + cleanup: +- while ((bio = bio_list_pop(&bio_list))) +- bio_put(bio); ++ bio_list_put(&bio_list); + return ret; + } + +@@ -2733,15 +2655,12 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) + return ret; + } + +-static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, +- struct bio_list *bio_list) ++static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) + { +- struct bio *bio; ++ struct bio_list bio_list = BIO_EMPTY_LIST; + int total_sector_nr; + int ret = 0; + +- ASSERT(bio_list_size(bio_list) == 0); +- + /* Build a list of bios to read all the missing parts. */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { +@@ -2770,45 +2689,38 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, + if (sector->uptodate) + continue; + +- ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, ++ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_READ); +- if (ret) +- goto error; ++ if (ret) { ++ bio_list_put(&bio_list); ++ return ret; ++ } + } ++ ++ submit_read_wait_bio_list(rbio, &bio_list); + return 0; +-error: +- while ((bio = bio_list_pop(bio_list))) +- bio_put(bio); +- return ret; + } + +-static int scrub_rbio(struct btrfs_raid_bio *rbio) ++static void scrub_rbio(struct btrfs_raid_bio *rbio) + { + bool need_check = false; +- struct bio_list bio_list; + int sector_nr; + int ret; +- struct bio *bio; +- +- bio_list_init(&bio_list); + + ret = alloc_rbio_essential_pages(rbio); + if (ret) +- goto cleanup; ++ goto out; + + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); + +- ret = scrub_assemble_read_bios(rbio, &bio_list); ++ ret = scrub_assemble_read_bios(rbio); + if (ret < 0) +- goto cleanup; +- +- submit_read_bios(rbio, &bio_list); +- wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); ++ goto out; + + /* We may have some failures, recover the failed sectors first. */ + ret = recover_scrub_rbio(rbio); + if (ret < 0) +- goto cleanup; ++ goto out; + + /* + * We have every sector properly prepared. 
Can finish the scrub +@@ -2825,23 +2737,13 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) + break; + } + } +- return ret; +- +-cleanup: +- while ((bio = bio_list_pop(&bio_list))) +- bio_put(bio); +- +- return ret; ++out: ++ rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } + + static void scrub_rbio_work_locked(struct work_struct *work) + { +- struct btrfs_raid_bio *rbio; +- int ret; +- +- rbio = container_of(work, struct btrfs_raid_bio, work); +- ret = scrub_rbio(rbio); +- rbio_orig_end_io(rbio, errno_to_blk_status(ret)); ++ scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); + } + + void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) +diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h +index 7c73a443939e..df0e0abdeb1f 100644 +--- a/fs/btrfs/raid56.h ++++ b/fs/btrfs/raid56.h +@@ -65,7 +65,7 @@ struct btrfs_raid_bio { + /* Number of data stripes (no p/q) */ + u8 nr_data; + +- /* Numer of all stripes (including P/Q) */ ++ /* Number of all stripes (including P/Q) */ + u8 real_stripes; + + /* How many pages there are for each stripe */ +@@ -132,7 +132,7 @@ struct btrfs_raid_bio { + + /* + * Checksum buffer if the rbio is for data. The buffer should cover +- * all data sectors (exlcuding P/Q sectors). ++ * all data sectors (excluding P/Q sectors). + */ + u8 *csum_buf; + +diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c +index 31ec4a7658ce..ef13a9d4e370 100644 +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -2825,7 +2825,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( + * + * Here we have to manually invalidate the range (i_size, PAGE_END + 1). + */ +- if (!IS_ALIGNED(i_size, PAGE_SIZE)) { ++ if (!PAGE_ALIGNED(i_size)) { + struct address_space *mapping = inode->vfs_inode.i_mapping; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; +diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c +index 52b346795f66..69c93ae333f6 100644 +--- a/fs/btrfs/scrub.c ++++ b/fs/btrfs/scrub.c +@@ -229,7 +229,7 @@ struct full_stripe_lock { + }; + + #ifndef CONFIG_64BIT +-/* This structure is for archtectures whose (void *) is smaller than u64 */ ++/* This structure is for architectures whose (void *) is smaller than u64 */ + struct scrub_page_private { + u64 logical; + }; +@@ -2053,20 +2053,33 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) + * a) don't have an extent buffer and + * b) the page is already kmapped + */ +- if (sblock->logical != btrfs_stack_header_bytenr(h)) ++ if (sblock->logical != btrfs_stack_header_bytenr(h)) { + sblock->header_error = 1; +- +- if (sector->generation != btrfs_stack_header_generation(h)) { +- sblock->header_error = 1; +- sblock->generation_error = 1; ++ btrfs_warn_rl(fs_info, ++ "tree block %llu mirror %u has bad bytenr, has %llu want %llu", ++ sblock->logical, sblock->mirror_num, ++ btrfs_stack_header_bytenr(h), ++ sblock->logical); ++ goto out; + } + +- if (!scrub_check_fsid(h->fsid, sector)) ++ if (!scrub_check_fsid(h->fsid, sector)) { + sblock->header_error = 1; ++ btrfs_warn_rl(fs_info, ++ "tree block %llu mirror %u has bad fsid, has %pU want %pU", ++ sblock->logical, sblock->mirror_num, ++ h->fsid, sblock->dev->fs_devices->fsid); ++ goto out; ++ } + +- if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, +- BTRFS_UUID_SIZE)) ++ if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { + sblock->header_error = 1; ++ btrfs_warn_rl(fs_info, ++ "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", ++ 
sblock->logical, sblock->mirror_num, ++ h->chunk_tree_uuid, fs_info->chunk_tree_uuid); ++ goto out; ++ } + + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); +@@ -2079,9 +2092,27 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) + } + + crypto_shash_final(shash, calculated_csum); +- if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) ++ if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) { + sblock->checksum_error = 1; ++ btrfs_warn_rl(fs_info, ++ "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, ++ sblock->logical, sblock->mirror_num, ++ CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), ++ CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); ++ goto out; ++ } ++ ++ if (sector->generation != btrfs_stack_header_generation(h)) { ++ sblock->header_error = 1; ++ sblock->generation_error = 1; ++ btrfs_warn_rl(fs_info, ++ "tree block %llu mirror %u has bad generation, has %llu want %llu", ++ sblock->logical, sblock->mirror_num, ++ btrfs_stack_header_generation(h), ++ sector->generation); ++ } + ++out: + return sblock->header_error || sblock->checksum_error; + } + +diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c +index d50182b6deec..e5c963bb873d 100644 +--- a/fs/btrfs/send.c ++++ b/fs/btrfs/send.c +@@ -32,6 +32,7 @@ + #include "file-item.h" + #include "ioctl.h" + #include "verity.h" ++#include "lru_cache.h" + + /* + * Maximum number of references an extent can have in order for us to attempt to +@@ -80,23 +81,23 @@ struct clone_root { + bool found_ref; + }; + +-#define SEND_CTX_MAX_NAME_CACHE_SIZE 128 +-#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) ++#define SEND_MAX_NAME_CACHE_SIZE 256 + + /* +- * Limit the root_ids array of struct backref_cache_entry to 12 elements. +- * This makes the size of a cache entry to be exactly 128 bytes on x86_64. ++ * Limit the root_ids array of struct backref_cache_entry to 17 elements. ++ * This makes the size of a cache entry to be exactly 192 bytes on x86_64, which ++ * can be satisfied from the kmalloc-192 slab, without wasting any space. + * The most common case is to have a single root for cloning, which corresponds +- * to the send root. Having the user specify more than 11 clone roots is not ++ * to the send root. Having the user specify more than 16 clone roots is not + * common, and in such rare cases we simply don't use caching if the number of +- * cloning roots that lead down to a leaf is more than 12. ++ * cloning roots that lead down to a leaf is more than 17. + */ +-#define SEND_MAX_BACKREF_CACHE_ROOTS 12 ++#define SEND_MAX_BACKREF_CACHE_ROOTS 17 + + /* + * Max number of entries in the cache. +- * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding +- * maple tree's internal nodes, is 16K. ++ * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding ++ * maple tree's internal nodes, is 24K. + */ + #define SEND_MAX_BACKREF_CACHE_SIZE 128 + +@@ -107,15 +108,31 @@ struct clone_root { + * x86_64). + */ + struct backref_cache_entry { +- /* List to link to the cache's lru list. */ +- struct list_head list; +- /* The key for this entry in the cache. */ +- u64 key; ++ struct btrfs_lru_cache_entry entry; + u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; + /* Number of valid elements in the root_ids array. */ + int num_roots; + }; + ++/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. 
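The static_assert just below this comment enforces the rule stated in lru_cache.h: the embedded btrfs_lru_cache_entry must sit at offset 0, because the cache calls kfree() on the entry pointer it holds, and that is only correct if the entry pointer equals the address returned by kmalloc() for the whole outer object. A small user-space demonstration of why offset 0 makes this safe (plain C with stand-in types; demo_entry is invented for illustration):

#include <assert.h>
#include <stddef.h>
#include <stdlib.h>

/* Minimal stand-ins for the kernel types, for illustration only. */
struct list_head { struct list_head *next, *prev; };

struct lru_cache_entry {
    struct list_head lru_list;
    unsigned long long key;
    unsigned long long gen;
    struct list_head list;
};

struct demo_entry {
    struct lru_cache_entry entry;    /* first member, so offset 0 */
    int payload;
};

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
    struct demo_entry *e = malloc(sizeof(*e));
    struct lru_cache_entry *raw;

    if (!e)
        return 1;
    raw = &e->entry;

    /* Offset 0 means the cache's entry pointer is the allocation pointer. */
    assert((void *)raw == (void *)e);
    assert(container_of(raw, struct demo_entry, entry) == e);

    free(raw);    /* releases the whole demo_entry, like kfree() in the cache */
    return 0;
}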
*/ ++static_assert(offsetof(struct backref_cache_entry, entry) == 0); ++ ++/* ++ * Max number of entries in the cache that stores directories that were already ++ * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses ++ * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but ++ * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). ++ */ ++#define SEND_MAX_DIR_CREATED_CACHE_SIZE 64 ++ ++/* ++ * Max number of entries in the cache that stores directories that were already ++ * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses ++ * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but ++ * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64). ++ */ ++#define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64 ++ + struct send_ctx { + struct file *send_filp; + loff_t send_off; +@@ -174,9 +191,7 @@ struct send_ctx { + struct list_head new_refs; + struct list_head deleted_refs; + +- struct radix_tree_root name_cache; +- struct list_head name_cache_list; +- int name_cache_size; ++ struct btrfs_lru_cache name_cache; + + /* + * The inode we are currently processing. It's not NULL only when we +@@ -285,13 +300,11 @@ struct send_ctx { + struct rb_root rbtree_new_refs; + struct rb_root rbtree_deleted_refs; + +- struct { +- u64 last_reloc_trans; +- struct list_head lru_list; +- struct maple_tree entries; +- /* Number of entries stored in the cache. */ +- int size; +- } backref_cache; ++ struct btrfs_lru_cache backref_cache; ++ u64 backref_cache_last_reloc_trans; ++ ++ struct btrfs_lru_cache dir_created_cache; ++ struct btrfs_lru_cache dir_utimes_cache; + }; + + struct pending_dir_move { +@@ -321,21 +334,15 @@ struct orphan_dir_info { + u64 ino; + u64 gen; + u64 last_dir_index_offset; ++ u64 dir_high_seq_ino; + }; + + struct name_cache_entry { +- struct list_head list; + /* +- * radix_tree has only 32bit entries but we need to handle 64bit inums. +- * We use the lower 32bit of the 64bit inum to store it in the tree. If +- * more then one inum would fall into the same entry, we use radix_list +- * to store the additional entries. radix_list is also used to store +- * entries where two entries have the same inum but different +- * generations. ++ * The key in the entry is an inode number, and the generation matches ++ * the inode's generation. + */ +- struct list_head radix_list; +- u64 ino; +- u64 gen; ++ struct btrfs_lru_cache_entry entry; + u64 parent_ino; + u64 parent_gen; + int ret; +@@ -344,6 +351,9 @@ struct name_cache_entry { + char name[]; + }; + ++/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. 
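The slab arithmetic in the cache-size comments above can be checked directly: struct btrfs_lru_cache_entry is two struct list_head (16 bytes each on 64-bit) plus two u64, i.e. 48 bytes, which kmalloc() rounds up to the 64-byte slab, so a full cache of 64 raw entries costs 64 * 64 = 4096 bytes. A quick user-space check with a stand-in layout (LP64 assumed):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

/* Same layout as btrfs_lru_cache_entry on a 64-bit kernel. */
struct lru_cache_entry {
    struct list_head lru_list;    /* 16 bytes */
    unsigned long long key;       /*  8 bytes */
    unsigned long long gen;       /*  8 bytes */
    struct list_head list;        /* 16 bytes */
};

int main(void)
{
    printf("entry size: %zu bytes\n", sizeof(struct lru_cache_entry));  /* 48 */
    printf("rounded to kmalloc-64, 64 entries: %d bytes\n", 64 * 64);   /* 4096 */
    return 0;
}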
*/ ++static_assert(offsetof(struct name_cache_entry, entry) == 0); ++ + #define ADVANCE 1 + #define ADVANCE_ONLY_NEXT -1 + +@@ -956,14 +966,12 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, + static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) + { + int ret; +- struct btrfs_inode_info info; ++ struct btrfs_inode_info info = { 0 }; + +- if (!gen) +- return -EPERM; ++ ASSERT(gen); + + ret = get_inode_info(root, ino, &info); +- if (!ret) +- *gen = info.gen; ++ *gen = info.gen; + return ret; + } + +@@ -1388,19 +1396,6 @@ static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, + return 0; + } + +-static void empty_backref_cache(struct send_ctx *sctx) +-{ +- struct backref_cache_entry *entry; +- struct backref_cache_entry *tmp; +- +- list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list) +- kfree(entry); +- +- INIT_LIST_HEAD(&sctx->backref_cache.lru_list); +- mtree_destroy(&sctx->backref_cache.entries); +- sctx->backref_cache.size = 0; +-} +- + static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, + const u64 **root_ids_ret, int *root_count_ret) + { +@@ -1408,9 +1403,10 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, + struct send_ctx *sctx = bctx->sctx; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; ++ struct btrfs_lru_cache_entry *raw_entry; + struct backref_cache_entry *entry; + +- if (sctx->backref_cache.size == 0) ++ if (btrfs_lru_cache_size(&sctx->backref_cache) == 0) + return false; + + /* +@@ -1424,18 +1420,18 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, + * transaction handle or holding fs_info->commit_root_sem, so no need + * to take any lock here. + */ +- if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) { +- empty_backref_cache(sctx); ++ if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) { ++ btrfs_lru_cache_clear(&sctx->backref_cache); + return false; + } + +- entry = mtree_load(&sctx->backref_cache.entries, key); +- if (!entry) ++ raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0); ++ if (!raw_entry) + return false; + ++ entry = container_of(raw_entry, struct backref_cache_entry, entry); + *root_ids_ret = entry->root_ids; + *root_count_ret = entry->num_roots; +- list_move_tail(&entry->list, &sctx->backref_cache.lru_list); + + return true; + } +@@ -1461,7 +1457,8 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, + if (!new_entry) + return; + +- new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits; ++ new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits; ++ new_entry->entry.gen = 0; + new_entry->num_roots = 0; + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(root_ids, &uiter)) != NULL) { +@@ -1489,23 +1486,12 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, + * none of the roots is part of the list of roots from which we are + * allowed to clone. Cache the new entry as it's still useful to avoid + * backref walking to determine which roots have a path to the leaf. ++ * ++ * Also use GFP_NOFS because we're called while holding a transaction ++ * handle or while holding fs_info->commit_root_sem. 
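lru_cache.c itself is not part of this excerpt, so the following is a mental model only, not the real implementation: the header comment says the maple tree is indexed by the (possibly truncated) unsigned long key and each slot holds a linked list of entries carrying the full u64 key and generation, which is what lets btrfs_lru_cache_lookup(cache, key, gen) work on 32-bit systems. A lookup consistent with that description might be shaped roughly like this (sketch_lookup is invented; details such as the per-slot list head and the LRU update are assumptions):

static struct btrfs_lru_cache_entry *sketch_lookup(struct btrfs_lru_cache *cache,
                                                   u64 key, u64 gen)
{
    struct list_head *head;
    struct btrfs_lru_cache_entry *entry;

    /* The maple tree index is an unsigned long, so the key may be truncated. */
    head = mtree_load(&cache->entries, key);
    if (!head)
        return NULL;

    list_for_each_entry(entry, head, list) {
        if (entry->key == key && entry->gen == gen) {
            /* Move to the most-recently-used end of the LRU list. */
            list_move_tail(&entry->lru_list, &cache->lru_list);
            return entry;
        }
    }

    return NULL;
}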
+ */ +- +- if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) { +- struct backref_cache_entry *lru_entry; +- struct backref_cache_entry *mt_entry; +- +- lru_entry = list_first_entry(&sctx->backref_cache.lru_list, +- struct backref_cache_entry, list); +- mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key); +- ASSERT(mt_entry == lru_entry); +- list_del(&mt_entry->list); +- kfree(mt_entry); +- sctx->backref_cache.size--; +- } +- +- ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key, +- new_entry, GFP_NOFS); ++ ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry, ++ GFP_NOFS); + ASSERT(ret == 0 || ret == -ENOMEM); + if (ret) { + /* Caching is optional, no worries. */ +@@ -1513,17 +1499,13 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, + return; + } + +- list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list); +- + /* + * We are called from iterate_extent_inodes() while either holding a + * transaction handle or holding fs_info->commit_root_sem, so no need + * to take any lock here. + */ +- if (sctx->backref_cache.size == 0) +- sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans; +- +- sctx->backref_cache.size++; ++ if (btrfs_lru_cache_size(&sctx->backref_cache) == 1) ++ sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; + } + + static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, +@@ -1886,7 +1868,8 @@ enum inode_state { + inode_state_did_delete, + }; + +-static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) ++static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, ++ u64 *send_gen, u64 *parent_gen) + { + int ret; + int left_ret; +@@ -1900,6 +1883,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) + goto out; + left_ret = (info.nlink == 0) ? -ENOENT : ret; + left_gen = info.gen; ++ if (send_gen) ++ *send_gen = ((left_ret == -ENOENT) ? 0 : info.gen); + + if (!sctx->parent_root) { + right_ret = -ENOENT; +@@ -1909,6 +1894,8 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) + goto out; + right_ret = (info.nlink == 0) ? -ENOENT : ret; + right_gen = info.gen; ++ if (parent_gen) ++ *parent_gen = ((right_ret == -ENOENT) ? 
0 : info.gen); + } + + if (!left_ret && !right_ret) { +@@ -1953,14 +1940,15 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) + return ret; + } + +-static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) ++static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, ++ u64 *send_gen, u64 *parent_gen) + { + int ret; + + if (ino == BTRFS_FIRST_FREE_OBJECTID) + return 1; + +- ret = get_cur_inode_state(sctx, ino, gen); ++ ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); + if (ret < 0) + goto out; + +@@ -2121,43 +2109,36 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, + const char *name, int name_len, + u64 *who_ino, u64 *who_gen, u64 *who_mode) + { +- int ret = 0; +- u64 gen; ++ int ret; ++ u64 parent_root_dir_gen; + u64 other_inode = 0; + struct btrfs_inode_info info; + + if (!sctx->parent_root) +- goto out; ++ return 0; + +- ret = is_inode_existent(sctx, dir, dir_gen); ++ ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen); + if (ret <= 0) +- goto out; ++ return 0; + + /* + * If we have a parent root we need to verify that the parent dir was + * not deleted and then re-created, if it was then we have no overwrite + * and we can just unlink this entry. ++ * ++ * @parent_root_dir_gen was set to 0 if the inode does not exist in the ++ * parent root. + */ +- if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { +- ret = get_inode_gen(sctx->parent_root, dir, &gen); +- if (ret < 0 && ret != -ENOENT) +- goto out; +- if (ret) { +- ret = 0; +- goto out; +- } +- if (gen != dir_gen) +- goto out; +- } ++ if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID && ++ parent_root_dir_gen != dir_gen) ++ return 0; + + ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, + &other_inode); +- if (ret < 0 && ret != -ENOENT) +- goto out; +- if (ret) { +- ret = 0; +- goto out; +- } ++ if (ret == -ENOENT) ++ return 0; ++ else if (ret < 0) ++ return ret; + + /* + * Check if the overwritten ref was already processed. If yes, the ref +@@ -2168,18 +2149,15 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, + is_waiting_for_move(sctx, other_inode)) { + ret = get_inode_info(sctx->parent_root, other_inode, &info); + if (ret < 0) +- goto out; ++ return ret; + +- ret = 1; + *who_ino = other_inode; + *who_gen = info.gen; + *who_mode = info.mode; +- } else { +- ret = 0; ++ return 1; + } + +-out: +- return ret; ++ return 0; + } + + /* +@@ -2194,47 +2172,43 @@ static int did_overwrite_ref(struct send_ctx *sctx, + u64 ino, u64 ino_gen, + const char *name, int name_len) + { +- int ret = 0; +- u64 gen; ++ int ret; + u64 ow_inode; ++ u64 ow_gen = 0; ++ u64 send_root_dir_gen; + + if (!sctx->parent_root) +- goto out; ++ return 0; + +- ret = is_inode_existent(sctx, dir, dir_gen); ++ ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL); + if (ret <= 0) +- goto out; ++ return ret; + +- if (dir != BTRFS_FIRST_FREE_OBJECTID) { +- ret = get_inode_gen(sctx->send_root, dir, &gen); +- if (ret < 0 && ret != -ENOENT) +- goto out; +- if (ret) { +- ret = 0; +- goto out; +- } +- if (gen != dir_gen) +- goto out; +- } ++ /* ++ * @send_root_dir_gen was set to 0 if the inode does not exist in the ++ * send root. 
++ */ ++ if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen) ++ return 0; + + /* check if the ref was overwritten by another ref */ + ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, + &ow_inode); +- if (ret < 0 && ret != -ENOENT) +- goto out; +- if (ret) { ++ if (ret == -ENOENT) { + /* was never and will never be overwritten */ +- ret = 0; +- goto out; ++ return 0; ++ } else if (ret < 0) { ++ return ret; + } + +- ret = get_inode_gen(sctx->send_root, ow_inode, &gen); +- if (ret < 0) +- goto out; ++ if (ow_inode == ino) { ++ ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); ++ if (ret < 0) ++ return ret; + +- if (ow_inode == ino && gen == ino_gen) { +- ret = 0; +- goto out; ++ /* It's the same inode, so no overwrite happened. */ ++ if (ow_gen == ino_gen) ++ return 0; + } + + /* +@@ -2243,15 +2217,20 @@ static int did_overwrite_ref(struct send_ctx *sctx, + * inode 'ino' to be orphanized, therefore check if ow_inode matches + * the current inode being processed. + */ +- if ((ow_inode < sctx->send_progress) || +- (ino != sctx->cur_ino && ow_inode == sctx->cur_ino && +- gen == sctx->cur_inode_gen)) +- ret = 1; +- else +- ret = 0; ++ if (ow_inode < sctx->send_progress) ++ return 1; + +-out: +- return ret; ++ if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) { ++ if (ow_gen == 0) { ++ ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); ++ if (ret < 0) ++ return ret; ++ } ++ if (ow_gen == sctx->cur_inode_gen) ++ return 1; ++ } ++ ++ return 0; + } + + /* +@@ -2285,113 +2264,16 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) + return ret; + } + +-/* +- * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, +- * so we need to do some special handling in case we have clashes. This function +- * takes care of this with the help of name_cache_entry::radix_list. +- * In case of error, nce is kfreed. 
+- */ +-static int name_cache_insert(struct send_ctx *sctx, +- struct name_cache_entry *nce) ++static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx, ++ u64 ino, u64 gen) + { +- int ret = 0; +- struct list_head *nce_head; +- +- nce_head = radix_tree_lookup(&sctx->name_cache, +- (unsigned long)nce->ino); +- if (!nce_head) { +- nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); +- if (!nce_head) { +- kfree(nce); +- return -ENOMEM; +- } +- INIT_LIST_HEAD(nce_head); +- +- ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); +- if (ret < 0) { +- kfree(nce_head); +- kfree(nce); +- return ret; +- } +- } +- list_add_tail(&nce->radix_list, nce_head); +- list_add_tail(&nce->list, &sctx->name_cache_list); +- sctx->name_cache_size++; +- +- return ret; +-} ++ struct btrfs_lru_cache_entry *entry; + +-static void name_cache_delete(struct send_ctx *sctx, +- struct name_cache_entry *nce) +-{ +- struct list_head *nce_head; +- +- nce_head = radix_tree_lookup(&sctx->name_cache, +- (unsigned long)nce->ino); +- if (!nce_head) { +- btrfs_err(sctx->send_root->fs_info, +- "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", +- nce->ino, sctx->name_cache_size); +- } +- +- list_del(&nce->radix_list); +- list_del(&nce->list); +- sctx->name_cache_size--; +- +- /* +- * We may not get to the final release of nce_head if the lookup fails +- */ +- if (nce_head && list_empty(nce_head)) { +- radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); +- kfree(nce_head); +- } +-} +- +-static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, +- u64 ino, u64 gen) +-{ +- struct list_head *nce_head; +- struct name_cache_entry *cur; +- +- nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); +- if (!nce_head) ++ entry = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen); ++ if (!entry) + return NULL; + +- list_for_each_entry(cur, nce_head, radix_list) { +- if (cur->ino == ino && cur->gen == gen) +- return cur; +- } +- return NULL; +-} +- +-/* +- * Remove some entries from the beginning of name_cache_list. +- */ +-static void name_cache_clean_unused(struct send_ctx *sctx) +-{ +- struct name_cache_entry *nce; +- +- if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE) +- return; +- +- while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) { +- nce = list_entry(sctx->name_cache_list.next, +- struct name_cache_entry, list); +- name_cache_delete(sctx, nce); +- kfree(nce); +- } +-} +- +-static void name_cache_free(struct send_ctx *sctx) +-{ +- struct name_cache_entry *nce; +- +- while (!list_empty(&sctx->name_cache_list)) { +- nce = list_entry(sctx->name_cache_list.next, +- struct name_cache_entry, list); +- name_cache_delete(sctx, nce); +- kfree(nce); +- } ++ return container_of(entry, struct name_cache_entry, entry); + } + + /* +@@ -2410,7 +2292,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, + { + int ret; + int nce_ret; +- struct name_cache_entry *nce = NULL; ++ struct name_cache_entry *nce; + + /* + * First check if we already did a call to this function with the same +@@ -2420,17 +2302,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, + nce = name_cache_search(sctx, ino, gen); + if (nce) { + if (ino < sctx->send_progress && nce->need_later_update) { +- name_cache_delete(sctx, nce); +- kfree(nce); ++ btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry); + nce = NULL; + } else { +- /* +- * Removes the entry from the list and adds it back to +- * the end. 
This marks the entry as recently used so +- * that name_cache_clean_unused does not remove it. +- */ +- list_move_tail(&nce->list, &sctx->name_cache_list); +- + *parent_ino = nce->parent_ino; + *parent_gen = nce->parent_gen; + ret = fs_path_add(dest, nce->name, nce->name_len); +@@ -2446,7 +2320,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, + * This should only happen for the parent dir that we determine in + * record_new_ref_if_needed(). + */ +- ret = is_inode_existent(sctx, ino, gen); ++ ret = is_inode_existent(sctx, ino, gen, NULL, NULL); + if (ret < 0) + goto out; + +@@ -2497,8 +2371,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, + goto out; + } + +- nce->ino = ino; +- nce->gen = gen; ++ nce->entry.key = ino; ++ nce->entry.gen = gen; + nce->parent_ino = *parent_ino; + nce->parent_gen = *parent_gen; + nce->name_len = fs_path_len(dest); +@@ -2510,10 +2384,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, + else + nce->need_later_update = 1; + +- nce_ret = name_cache_insert(sctx, nce); +- if (nce_ret < 0) ++ nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); ++ if (nce_ret < 0) { ++ kfree(nce); + ret = nce_ret; +- name_cache_clean_unused(sctx); ++ } + + out: + return ret; +@@ -2883,6 +2758,63 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) + return ret; + } + ++/* ++ * If the cache is full, we can't remove entries from it and do a call to ++ * send_utimes() for each respective inode, because we might be finishing ++ * processing an inode that is a directory and it just got renamed, and existing ++ * entries in the cache may refer to inodes that have the directory in their ++ * full path - in which case we would generate outdated paths (pre-rename) ++ * for the inodes that the cache entries point to. Instead of prunning the ++ * cache when inserting, do it after we finish processing each inode at ++ * finish_inode_if_needed(). ++ */ ++static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) ++{ ++ struct btrfs_lru_cache_entry *entry; ++ int ret; ++ ++ entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen); ++ if (entry != NULL) ++ return 0; ++ ++ /* Caching is optional, don't fail if we can't allocate memory. */ ++ entry = kmalloc(sizeof(*entry), GFP_KERNEL); ++ if (!entry) ++ return send_utimes(sctx, dir, gen); ++ ++ entry->key = dir; ++ entry->gen = gen; ++ ++ ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL); ++ ASSERT(ret != -EEXIST); ++ if (ret) { ++ kfree(entry); ++ return send_utimes(sctx, dir, gen); ++ } ++ ++ return 0; ++} ++ ++static int trim_dir_utimes_cache(struct send_ctx *sctx) ++{ ++ while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) > ++ SEND_MAX_DIR_UTIMES_CACHE_SIZE) { ++ struct btrfs_lru_cache_entry *lru; ++ int ret; ++ ++ lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache); ++ ASSERT(lru != NULL); ++ ++ ret = send_utimes(sctx, lru->key, lru->gen); ++ if (ret) ++ return ret; ++ ++ btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru); ++ } ++ ++ return 0; ++} ++ + /* + * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have + * a valid path yet because we did not process the refs yet. So, the inode +@@ -2971,6 +2903,23 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) + return ret; + } + ++static void cache_dir_created(struct send_ctx *sctx, u64 dir) ++{ ++ struct btrfs_lru_cache_entry *entry; ++ int ret; ++ ++ /* Caching is optional, ignore any failures. 
*/ ++ entry = kmalloc(sizeof(*entry), GFP_KERNEL); ++ if (!entry) ++ return; ++ ++ entry->key = dir; ++ entry->gen = 0; ++ ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL); ++ if (ret < 0) ++ kfree(entry); ++} ++ + /* + * We need some special handling for inodes that get processed before the parent + * directory got created. See process_recorded_refs for details. +@@ -2986,6 +2935,9 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) + struct btrfs_key di_key; + struct btrfs_dir_item *di; + ++ if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0)) ++ return 1; ++ + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; +@@ -3009,6 +2961,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) + if (di_key.type != BTRFS_ROOT_ITEM_KEY && + di_key.objectid < sctx->send_progress) { + ret = 1; ++ cache_dir_created(sctx, dir); + break; + } + } +@@ -3038,7 +2991,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx) + return 0; + } + +- return send_create_inode(sctx, sctx->cur_ino); ++ ret = send_create_inode(sctx, sctx->cur_ino); ++ ++ if (ret == 0 && S_ISDIR(sctx->cur_inode_mode)) ++ cache_dir_created(sctx, sctx->cur_ino); ++ ++ return ret; + } + + struct recorded_ref { +@@ -3166,6 +3124,7 @@ static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx, + odi->ino = dir_ino; + odi->gen = dir_gen; + odi->last_dir_index_offset = 0; ++ odi->dir_high_seq_ino = 0; + + rb_link_node(&odi->node, parent, p); + rb_insert_color(&odi->node, &sctx->orphan_dirs); +@@ -3215,8 +3174,7 @@ static void free_orphan_dir_info(struct send_ctx *sctx, + * We check this by iterating all dir items and checking if the inode behind + * the dir item was already processed. + */ +-static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, +- u64 send_progress) ++static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen) + { + int ret = 0; + int iter_ret = 0; +@@ -3227,6 +3185,8 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, + struct btrfs_key loc; + struct btrfs_dir_item *di; + struct orphan_dir_info *odi = NULL; ++ u64 dir_high_seq_ino = 0; ++ u64 last_dir_index_offset = 0; + + /* + * Don't try to rmdir the top/root subvolume dir. +@@ -3234,17 +3194,62 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, + if (dir == BTRFS_FIRST_FREE_OBJECTID) + return 0; + ++ odi = get_orphan_dir_info(sctx, dir, dir_gen); ++ if (odi && sctx->cur_ino < odi->dir_high_seq_ino) ++ return 0; ++ + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + ++ if (!odi) { ++ /* ++ * Find the inode number associated with the last dir index ++ * entry. This is very likely the inode with the highest number ++ * of all inodes that have an entry in the directory. We can ++ * then use it to avoid future calls to can_rmdir(), when ++ * processing inodes with a lower number, from having to search ++ * the parent root b+tree for dir index keys. ++ */ ++ key.objectid = dir; ++ key.type = BTRFS_DIR_INDEX_KEY; ++ key.offset = (u64)-1; ++ ++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); ++ if (ret < 0) { ++ goto out; ++ } else if (ret > 0) { ++ /* Can't happen, the root is never empty. */ ++ ASSERT(path->slots[0] > 0); ++ if (WARN_ON(path->slots[0] == 0)) { ++ ret = -EUCLEAN; ++ goto out; ++ } ++ path->slots[0]--; ++ } ++ ++ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ++ if (key.objectid != dir || key.type != BTRFS_DIR_INDEX_KEY) { ++ /* No index keys, dir can be removed. 
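The block above uses a common btrfs b-tree idiom to find the highest DIR_INDEX key of a directory: search for offset (u64)-1, and when btrfs_search_slot() returns > 0 (no exact match), step back one slot to land on the last key that sorts before it. Pulled out as a stand-alone helper purely for illustration (sketch_find_last_dir_index is not part of the patch; can_rmdir() open-codes this logic):

static int sketch_find_last_dir_index(struct btrfs_root *root, u64 dir,
                                      struct btrfs_path *path,
                                      struct btrfs_key *found)
{
    struct btrfs_key key;
    int ret;

    key.objectid = dir;
    key.type = BTRFS_DIR_INDEX_KEY;
    key.offset = (u64)-1;    /* larger than any real dir index */

    ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    if (ret < 0)
        return ret;
    if (ret > 0) {
        /* Exact key not found: step back to the last key that sorts before it. */
        if (path->slots[0] == 0)
            return -ENOENT;
        path->slots[0]--;
    }

    btrfs_item_key_to_cpu(path->nodes[0], found, path->slots[0]);
    if (found->objectid != dir || found->type != BTRFS_DIR_INDEX_KEY)
        return -ENOENT;    /* the directory has no index keys at all */
    return 0;
}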
*/ ++ ret = 1; ++ goto out; ++ } ++ ++ di = btrfs_item_ptr(path->nodes[0], path->slots[0], ++ struct btrfs_dir_item); ++ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); ++ dir_high_seq_ino = loc.objectid; ++ if (sctx->cur_ino < dir_high_seq_ino) { ++ ret = 0; ++ goto out; ++ } ++ ++ btrfs_release_path(path); ++ } ++ + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; +- key.offset = 0; +- +- odi = get_orphan_dir_info(sctx, dir, dir_gen); +- if (odi) +- key.offset = odi->last_dir_index_offset; ++ key.offset = (odi ? odi->last_dir_index_offset : 0); + + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { + struct waiting_dir_move *dm; +@@ -3257,29 +3262,18 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, + struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + ++ dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid); ++ last_dir_index_offset = found_key.offset; ++ + dm = get_waiting_dir_move(sctx, loc.objectid); + if (dm) { +- odi = add_orphan_dir_info(sctx, dir, dir_gen); +- if (IS_ERR(odi)) { +- ret = PTR_ERR(odi); +- goto out; +- } +- odi->gen = dir_gen; +- odi->last_dir_index_offset = found_key.offset; + dm->rmdir_ino = dir; + dm->rmdir_gen = dir_gen; + ret = 0; + goto out; + } + +- if (loc.objectid > send_progress) { +- odi = add_orphan_dir_info(sctx, dir, dir_gen); +- if (IS_ERR(odi)) { +- ret = PTR_ERR(odi); +- goto out; +- } +- odi->gen = dir_gen; +- odi->last_dir_index_offset = found_key.offset; ++ if (loc.objectid > sctx->cur_ino) { + ret = 0; + goto out; + } +@@ -3294,7 +3288,22 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, + + out: + btrfs_free_path(path); +- return ret; ++ ++ if (ret) ++ return ret; ++ ++ if (!odi) { ++ odi = add_orphan_dir_info(sctx, dir, dir_gen); ++ if (IS_ERR(odi)) ++ return PTR_ERR(odi); ++ ++ odi->gen = dir_gen; ++ } ++ ++ odi->last_dir_index_offset = last_dir_index_offset; ++ odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino); ++ ++ return 0; + } + + static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) +@@ -3579,7 +3588,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) + } + gen = odi->gen; + +- ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino); ++ ret = can_rmdir(sctx, rmdir_ino, gen); + if (ret < 0) + goto out; + if (!ret) +@@ -3599,7 +3608,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) + } + + finish: +- ret = send_utimes(sctx, pm->ino, pm->gen); ++ ret = cache_dir_utimes(sctx, pm->ino, pm->gen); + if (ret < 0) + goto out; + +@@ -3619,7 +3628,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) + if (ret < 0) + goto out; + +- ret = send_utimes(sctx, cur->dir, cur->dir_gen); ++ ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + } +@@ -4242,7 +4251,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) + * "testdir_2". + */ + list_for_each_entry(cur, &sctx->new_refs, list) { +- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); ++ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); + if (ret < 0) + goto out; + if (ret == inode_state_will_create) +@@ -4288,12 +4297,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) + * the source path when performing its rename + * operation. 
+ */ +- if (is_waiting_for_move(sctx, ow_inode)) { +- wdm = get_waiting_dir_move(sctx, +- ow_inode); +- ASSERT(wdm); ++ wdm = get_waiting_dir_move(sctx, ow_inode); ++ if (wdm) + wdm->orphanized = true; +- } + + /* + * Make sure we clear our orphanized inode's +@@ -4306,10 +4312,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) + * and get instead the orphan name. + */ + nce = name_cache_search(sctx, ow_inode, ow_gen); +- if (nce) { +- name_cache_delete(sctx, nce); +- kfree(nce); +- } ++ if (nce) ++ btrfs_lru_cache_remove(&sctx->name_cache, ++ &nce->entry); + + /* + * ow_inode might currently be an ancestor of +@@ -4358,7 +4363,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) + * parent directory out of order. But we need to check if this + * did already happen before due to other refs in the same dir. + */ +- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); ++ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); + if (ret < 0) + goto out; + if (ret == inode_state_will_create) { +@@ -4388,6 +4393,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) + ret = send_create_inode(sctx, cur->dir); + if (ret < 0) + goto out; ++ cache_dir_created(sctx, cur->dir); + } + } + +@@ -4470,8 +4476,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) + * later, we do this check again and rmdir it then if possible. + * See the use of check_dirs for more details. + */ +- ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, +- sctx->cur_ino); ++ ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) + goto out; + if (ret) { +@@ -4564,20 +4569,18 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) + if (cur->dir > sctx->cur_ino) + continue; + +- ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); ++ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); + if (ret < 0) + goto out; + + if (ret == inode_state_did_create || + ret == inode_state_no_change) { +- /* TODO delayed utimes */ +- ret = send_utimes(sctx, cur->dir, cur->dir_gen); ++ ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + } else if (ret == inode_state_did_delete && + cur->dir != last_dir_ino_rm) { +- ret = can_rmdir(sctx, cur->dir, cur->dir_gen, +- sctx->cur_ino); ++ ret = can_rmdir(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + if (ret) { +@@ -5635,7 +5638,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, + * boundary in the send buffer. This means that there may be a gap + * between the beginning of the command and the file data. + */ +- data_offset = ALIGN(sctx->send_size, PAGE_SIZE); ++ data_offset = PAGE_ALIGN(sctx->send_size); + if (data_offset > sctx->send_max_size || + sctx->send_max_size - data_offset < disk_num_bytes) { + ret = -EOVERFLOW; +@@ -5759,7 +5762,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, + sent += size; + } + +- if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { ++ if (sctx->clean_page_cache && PAGE_ALIGNED(end)) { + /* + * Always operate only on ranges that are a multiple of the page + * size. This is not only to prevent zeroing parts of a page in +@@ -6754,12 +6757,26 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) + * it's moved/renamed, therefore we don't need to do it here. 
+ */ + sctx->send_progress = sctx->cur_ino + 1; +- ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); ++ ++ /* ++ * If the current inode is a non-empty directory, delay issuing ++ * the utimes command for it, as it's very likely we have inodes ++ * with an higher number inside it. We want to issue the utimes ++ * command only after adding all dentries to it. ++ */ ++ if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0) ++ ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); ++ else ++ ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); ++ + if (ret < 0) + goto out; + } + + out: ++ if (!ret) ++ ret = trim_dir_utimes_cache(sctx); ++ + return ret; + } + +@@ -8044,6 +8061,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) + int clone_sources_to_rollback = 0; + size_t alloc_size; + int sort_clone_roots = 0; ++ struct btrfs_lru_cache_entry *entry; ++ struct btrfs_lru_cache_entry *tmp; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; +@@ -8094,11 +8113,22 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) + + INIT_LIST_HEAD(&sctx->new_refs); + INIT_LIST_HEAD(&sctx->deleted_refs); +- INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); +- INIT_LIST_HEAD(&sctx->name_cache_list); + +- INIT_LIST_HEAD(&sctx->backref_cache.lru_list); +- mt_init(&sctx->backref_cache.entries); ++ btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE); ++ btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE); ++ btrfs_lru_cache_init(&sctx->dir_created_cache, ++ SEND_MAX_DIR_CREATED_CACHE_SIZE); ++ /* ++ * This cache is periodically trimmed to a fixed size elsewhere, see ++ * cache_dir_utimes() and trim_dir_utimes_cache(). ++ */ ++ btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0); ++ ++ sctx->pending_dir_moves = RB_ROOT; ++ sctx->waiting_dir_moves = RB_ROOT; ++ sctx->orphan_dirs = RB_ROOT; ++ sctx->rbtree_new_refs = RB_ROOT; ++ sctx->rbtree_deleted_refs = RB_ROOT; + + sctx->flags = arg->flags; + +@@ -8165,12 +8195,6 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) + goto out; + } + +- sctx->pending_dir_moves = RB_ROOT; +- sctx->waiting_dir_moves = RB_ROOT; +- sctx->orphan_dirs = RB_ROOT; +- sctx->rbtree_new_refs = RB_ROOT; +- sctx->rbtree_deleted_refs = RB_ROOT; +- + sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), + arg->clone_sources_count + 1, + GFP_KERNEL); +@@ -8279,6 +8303,13 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) + if (ret < 0) + goto out; + ++ btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) { ++ ret = send_utimes(sctx, entry->key, entry->gen); ++ if (ret < 0) ++ goto out; ++ btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry); ++ } ++ + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { + ret = begin_cmd(sctx, BTRFS_SEND_C_END); + if (ret < 0) +@@ -8358,11 +8389,12 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) + kvfree(sctx->send_buf); + kvfree(sctx->verity_descriptor); + +- name_cache_free(sctx); +- + close_current_inode(sctx); + +- empty_backref_cache(sctx); ++ btrfs_lru_cache_clear(&sctx->name_cache); ++ btrfs_lru_cache_clear(&sctx->backref_cache); ++ btrfs_lru_cache_clear(&sctx->dir_created_cache); ++ btrfs_lru_cache_clear(&sctx->dir_utimes_cache); + + kfree(sctx); + } +diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c +index 433ce221dc5c..581845bc206a 100644 +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -58,6 +58,7 @@ + #include 
"scrub.h" + #include "verity.h" + #include "super.h" ++#include "extent-tree.h" + #define CREATE_TRACE_POINTS + #include + +@@ -2049,7 +2050,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) + } + + /* +- * Metadata in mixed block goup profiles are accounted in data ++ * Metadata in mixed block group profiles are accounted in data + */ + if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { + if (found->flags & BTRFS_BLOCK_GROUP_DATA) +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index 45615ce36498..8c5efa5813b3 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -702,7 +702,7 @@ static void release_raid_kobj(struct kobject *kobj) + kfree(to_raid_kobj(kobj)); + } + +-static struct kobj_type btrfs_raid_ktype = { ++static const struct kobj_type btrfs_raid_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = release_raid_kobj, + .default_groups = raid_groups, +@@ -900,7 +900,7 @@ static void space_info_release(struct kobject *kobj) + kfree(sinfo); + } + +-static struct kobj_type space_info_ktype = { ++static const struct kobj_type space_info_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = space_info_release, + .default_groups = space_info_groups, +@@ -1259,7 +1259,7 @@ static void btrfs_release_fsid_kobj(struct kobject *kobj) + complete(&fs_devs->kobj_unregister); + } + +-static struct kobj_type btrfs_ktype = { ++static const struct kobj_type btrfs_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = btrfs_release_fsid_kobj, + }; +@@ -1789,7 +1789,7 @@ static void btrfs_release_devid_kobj(struct kobject *kobj) + complete(&device->kobj_unregister); + } + +-static struct kobj_type devid_ktype = { ++static const struct kobj_type devid_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = devid_groups, + .release = btrfs_release_devid_kobj, +@@ -2103,7 +2103,7 @@ static void qgroups_release(struct kobject *kobj) + kfree(kobj); + } + +-static struct kobj_type qgroups_ktype = { ++static const struct kobj_type qgroups_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = qgroups_groups, + .release = qgroups_release, +@@ -2173,7 +2173,7 @@ static void qgroup_release(struct kobject *kobj) + memset(&qgroup->kobj, 0, sizeof(*kobj)); + } + +-static struct kobj_type qgroup_ktype = { ++static const struct kobj_type qgroup_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = qgroup_release, + .default_groups = qgroup_groups, +@@ -2272,36 +2272,23 @@ void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, + * Change per-fs features in /sys/fs/btrfs/UUID/features to match current + * values in superblock. Call after any changes to incompat/compat_ro flags + */ +-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, +- u64 bit, enum btrfs_feature_set set) ++void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info) + { +- struct btrfs_fs_devices *fs_devs; + struct kobject *fsid_kobj; +- u64 __maybe_unused features; +- int __maybe_unused ret; ++ int ret; + + if (!fs_info) + return; + +- /* +- * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not +- * safe when called from some contexts (eg. 
balance) +- */ +- features = get_features(fs_info, set); +- ASSERT(bit & supported_feature_masks[set]); +- +- fs_devs = fs_info->fs_devices; +- fsid_kobj = &fs_devs->fsid_kobj; +- ++ fsid_kobj = &fs_info->fs_devices->fsid_kobj; + if (!fsid_kobj->state_initialized) + return; + +- /* +- * FIXME: this is too heavy to update just one value, ideally we'd like +- * to use sysfs_update_group but some refactoring is needed first. +- */ +- sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); +- ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); ++ ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group); ++ if (ret < 0) ++ btrfs_warn(fs_info, ++ "failed to update /sys/fs/btrfs/%pU/features: %d", ++ fs_info->fs_devices->fsid, ret); + } + + int __init btrfs_init_sysfs(void) +diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h +index bacef43f7267..86c7eef12873 100644 +--- a/fs/btrfs/sysfs.h ++++ b/fs/btrfs/sysfs.h +@@ -19,8 +19,7 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device); + int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); + void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); + void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); +-void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, +- u64 bit, enum btrfs_feature_set set); ++void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info); + void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); + + int __init btrfs_init_sysfs(void); +diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c +index c5b3a631bf4f..f2f2e11dac4c 100644 +--- a/fs/btrfs/tests/extent-map-tests.c ++++ b/fs/btrfs/tests/extent-map-tests.c +@@ -509,7 +509,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, + goto out_free; + } + +- ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1), ++ ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), + &logical, &out_ndaddrs, &out_stripe_len); + if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { + test_err("didn't rmap anything but expected %d", +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index b8c52e89688c..18329ebcb1cb 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -2464,6 +2464,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) + wake_up(&fs_info->transaction_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); + ++ /* If we have features changed, wake up the cleaner to update sysfs. */ ++ if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) && ++ fs_info->cleaner_kthread) ++ wake_up_process(fs_info->cleaner_kthread); ++ + ret = btrfs_write_and_wait_transaction(trans); + if (ret) { + btrfs_handle_fs_error(fs_info, ret, +@@ -2604,6 +2609,35 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) + return (ret < 0) ? 0 : 1; + } + ++/* ++ * We only mark the transaction aborted and then set the file system read-only. ++ * This will prevent new transactions from starting or trying to join this ++ * one. ++ * ++ * This means that error recovery at the call site is limited to freeing ++ * any local memory allocations and passing the error code up without ++ * further cleanup. The transaction should complete as it normally would ++ * in the call path but will return -EIO. ++ * ++ * We'll complete the cleanup in btrfs_end_transaction and ++ * btrfs_commit_transaction. 
++ */ ++void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, ++ const char *function, ++ unsigned int line, int errno, bool first_hit) ++{ ++ struct btrfs_fs_info *fs_info = trans->fs_info; ++ ++ WRITE_ONCE(trans->aborted, errno); ++ WRITE_ONCE(trans->transaction->aborted, errno); ++ if (first_hit && errno == -ENOSPC) ++ btrfs_dump_space_info_for_trans_abort(fs_info); ++ /* Wake up anybody who may be waiting on this transaction */ ++ wake_up(&fs_info->transaction_wait); ++ wake_up(&fs_info->transaction_blocked_wait); ++ __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); ++} ++ + int __init btrfs_transaction_init(void) + { + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", +diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h +index 97f6c39f59c8..fa728ab80826 100644 +--- a/fs/btrfs/transaction.h ++++ b/fs/btrfs/transaction.h +@@ -202,6 +202,34 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) + delayed_refs->qgroup_to_skip = 0; + } + ++bool __cold abort_should_print_stack(int errno); ++ ++/* ++ * Call btrfs_abort_transaction as early as possible when an error condition is ++ * detected, that way the exact stack trace is reported for some errors. ++ */ ++#define btrfs_abort_transaction(trans, errno) \ ++do { \ ++ bool first = false; \ ++ /* Report first abort since mount */ \ ++ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ ++ &((trans)->fs_info->fs_state))) { \ ++ first = true; \ ++ if (WARN(abort_should_print_stack(errno), \ ++ KERN_ERR \ ++ "BTRFS: Transaction aborted (error %d)\n", \ ++ (errno))) { \ ++ /* Stack trace printed. */ \ ++ } else { \ ++ btrfs_debug((trans)->fs_info, \ ++ "Transaction aborted (error %d)", \ ++ (errno)); \ ++ } \ ++ } \ ++ __btrfs_abort_transaction((trans), __func__, \ ++ __LINE__, (errno), first); \ ++} while (0) ++ + int btrfs_end_transaction(struct btrfs_trans_handle *trans); + struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + unsigned int num_items); +@@ -236,6 +264,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); + void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); ++void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, ++ const char *function, ++ unsigned int line, int errno, bool first_hit); + + int __init btrfs_transaction_init(void); + void __cold btrfs_transaction_exit(void); +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 58599189bd18..200cea6e49e5 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -279,12 +279,6 @@ void btrfs_end_log_trans(struct btrfs_root *root) + } + } + +-static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +-{ +- filemap_fdatawait_range(buf->pages[0]->mapping, +- buf->start, buf->start + buf->len - 1); +-} +- + /* + * the walk control struct is used to pass state down the chain when + * processing the log tree. 
The stage field tells us which part +@@ -2623,11 +2617,12 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + return ret; + } + ++ btrfs_tree_lock(next); ++ btrfs_clear_buffer_dirty(trans, next); ++ wait_on_extent_buffer_writeback(next); ++ btrfs_tree_unlock(next); ++ + if (trans) { +- btrfs_tree_lock(next); +- btrfs_clean_tree_block(next); +- btrfs_wait_tree_block_writeback(next); +- btrfs_tree_unlock(next); + ret = btrfs_pin_reserved_extent(trans, + bytenr, blocksize); + if (ret) { +@@ -2637,8 +2632,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + btrfs_redirty_list_add( + trans->transaction, next); + } else { +- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) +- clear_extent_buffer_dirty(next); + unaccount_log_buffer(fs_info, bytenr); + } + } +@@ -2693,11 +2686,12 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + + next = path->nodes[*level]; + ++ btrfs_tree_lock(next); ++ btrfs_clear_buffer_dirty(trans, next); ++ wait_on_extent_buffer_writeback(next); ++ btrfs_tree_unlock(next); ++ + if (trans) { +- btrfs_tree_lock(next); +- btrfs_clean_tree_block(next); +- btrfs_wait_tree_block_writeback(next); +- btrfs_tree_unlock(next); + ret = btrfs_pin_reserved_extent(trans, + path->nodes[*level]->start, + path->nodes[*level]->len); +@@ -2706,9 +2700,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + btrfs_redirty_list_add(trans->transaction, + next); + } else { +- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) +- clear_extent_buffer_dirty(next); +- + unaccount_log_buffer(fs_info, + path->nodes[*level]->start); + } +@@ -2776,19 +2767,18 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, + + next = path->nodes[orig_level]; + ++ btrfs_tree_lock(next); ++ btrfs_clear_buffer_dirty(trans, next); ++ wait_on_extent_buffer_writeback(next); ++ btrfs_tree_unlock(next); ++ + if (trans) { +- btrfs_tree_lock(next); +- btrfs_clean_tree_block(next); +- btrfs_wait_tree_block_writeback(next); +- btrfs_tree_unlock(next); + ret = btrfs_pin_reserved_extent(trans, + next->start, next->len); + if (ret) + goto out; + btrfs_redirty_list_add(trans->transaction, next); + } else { +- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) +- clear_extent_buffer_dirty(next); + unaccount_log_buffer(fs_info, next->start); + } + } +@@ -3652,11 +3642,10 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, + + /* + * If for some unexpected reason the last item's index is not greater +- * than the last index we logged, warn and return an error to fallback +- * to a transaction commit. ++ * than the last index we logged, warn and force a transaction commit. 
+ */ + if (WARN_ON(last_index <= inode->last_dir_index_offset)) +- ret = -EUCLEAN; ++ ret = BTRFS_LOG_FORCE_COMMIT; + else + inode->last_dir_index_offset = last_index; + out: +@@ -3794,7 +3783,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + struct btrfs_key min_key; + struct btrfs_root *root = inode->root; + struct btrfs_root *log = root->log_root; +- int err = 0; + int ret; + u64 last_old_dentry_offset = min_offset - 1; + u64 last_offset = (u64)-1; +@@ -3835,8 +3823,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + path->slots[0]); + if (tmp.type == BTRFS_DIR_INDEX_KEY) + last_old_dentry_offset = tmp.offset; +- } else if (ret < 0) { +- err = ret; ++ } else if (ret > 0) { ++ ret = 0; + } + + goto done; +@@ -3859,7 +3847,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + if (tmp.type == BTRFS_DIR_INDEX_KEY) + last_old_dentry_offset = tmp.offset; + } else if (ret < 0) { +- err = ret; + goto done; + } + +@@ -3881,12 +3868,15 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + */ + search: + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); +- if (ret > 0) ++ if (ret > 0) { + ret = btrfs_next_item(root, path); ++ if (ret > 0) { ++ /* There are no more keys in the inode's root. */ ++ ret = 0; ++ goto done; ++ } ++ } + if (ret < 0) +- err = ret; +- /* If ret is 1, there are no more keys in the inode's root. */ +- if (ret != 0) + goto done; + + /* +@@ -3897,8 +3887,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, + &last_old_dentry_offset); + if (ret != 0) { +- if (ret < 0) +- err = ret; ++ if (ret > 0) ++ ret = 0; + goto done; + } + path->slots[0] = btrfs_header_nritems(path->nodes[0]); +@@ -3909,10 +3899,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + */ + ret = btrfs_next_leaf(root, path); + if (ret) { +- if (ret == 1) ++ if (ret == 1) { + last_offset = (u64)-1; +- else +- err = ret; ++ ret = 0; ++ } + goto done; + } + btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); +@@ -3943,7 +3933,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + btrfs_release_path(path); + btrfs_release_path(dst_path); + +- if (err == 0) { ++ if (ret == 0) { + *last_offset_ret = last_offset; + /* + * In case the leaf was changed in the current transaction but +@@ -3954,15 +3944,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, + * a range, last_old_dentry_offset is == to last_offset. + */ + ASSERT(last_old_dentry_offset <= last_offset); +- if (last_old_dentry_offset < last_offset) { ++ if (last_old_dentry_offset < last_offset) + ret = insert_dir_log_key(trans, log, path, ino, + last_old_dentry_offset + 1, + last_offset); +- if (ret) +- err = ret; +- } + } +- return err; ++ ++ return ret; + } + + /* +@@ -5604,10 +5592,8 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, + * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction + * commits. + */ +- if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) { +- btrfs_set_log_full_commit(trans); ++ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) + return BTRFS_LOG_FORCE_COMMIT; +- } + + inode = btrfs_iget(root->fs_info->sb, ino, root); + /* +@@ -6466,7 +6452,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, + * result in losing the file after a log replay. 
+ */ + if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { +- btrfs_set_log_full_commit(trans); + ret = BTRFS_LOG_FORCE_COMMIT; + goto out_unlock; + } +diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h +index 85cd24cb0540..bdeb5216718f 100644 +--- a/fs/btrfs/tree-log.h ++++ b/fs/btrfs/tree-log.h +@@ -13,8 +13,13 @@ + /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ + #define BTRFS_NO_LOG_SYNC 256 + +-/* We can't use the tree log for whatever reason, force a transaction commit */ +-#define BTRFS_LOG_FORCE_COMMIT (1) ++/* ++ * We can't use the tree log for whatever reason, force a transaction commit. ++ * We use a negative value because there are functions through the logging code ++ * that need to return an error (< 0 value), false (0) or true (1). Any negative ++ * value will do, as it will cause the log to be marked for a full sync. ++ */ ++#define BTRFS_LOG_FORCE_COMMIT (-(MAX_ERRNO + 1)) + + struct btrfs_log_ctx { + int log_ret; +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index df43093b7a46..7823168c08a6 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -728,7 +728,7 @@ static struct btrfs_fs_devices *find_fsid_reverted_metadata( + /* + * Handle the case where the scanned device is part of an fs whose last + * metadata UUID change reverted it to the original FSID. At the same +- * time * fs_devices was first created by another constitutent device ++ * time fs_devices was first created by another constituent device + * which didn't fully observe the operation. This results in an + * btrfs_fs_devices created with metadata/fsid different AND + * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the +@@ -6284,91 +6284,42 @@ static bool need_full_stripe(enum btrfs_map_op op) + return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); + } + +-/* +- * Calculate the geometry of a particular (address, len) tuple. This +- * information is used to calculate how big a particular bio can get before it +- * straddles a stripe. +- * +- * @fs_info: the filesystem +- * @em: mapping containing the logical extent +- * @op: type of operation - write or read +- * @logical: address that we want to figure out the geometry of +- * @io_geom: pointer used to return values +- * +- * Returns < 0 in case a chunk for the given logical address cannot be found, +- * usually shouldn't happen unless @logical is corrupted, 0 otherwise. +- */ +-int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, +- enum btrfs_map_op op, u64 logical, +- struct btrfs_io_geometry *io_geom) ++static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, ++ u64 offset, u64 *stripe_nr, u64 *stripe_offset, ++ u64 *full_stripe_start) + { +- struct map_lookup *map; +- u64 len; +- u64 offset; +- u64 stripe_offset; +- u64 stripe_nr; +- u32 stripe_len; +- u64 raid56_full_stripe_start = (u64)-1; +- int data_stripes; ++ u32 stripe_len = map->stripe_len; + + ASSERT(op != BTRFS_MAP_DISCARD); + +- map = em->map_lookup; +- /* Offset of this logical address in the chunk */ +- offset = logical - em->start; +- /* Len of a stripe in a chunk */ +- stripe_len = map->stripe_len; + /* +- * Stripe_nr is where this block falls in +- * stripe_offset is the offset of this block in its stripe. ++ * Stripe_nr is the stripe where this block falls. stripe_offset is ++ * the offset of this block in its stripe. 
+ */ +- stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset); +- ASSERT(stripe_offset < U32_MAX); ++ *stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset); ++ ASSERT(*stripe_offset < U32_MAX); + +- data_stripes = nr_data_stripes(map); ++ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ++ unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); + +- /* Only stripe based profiles needs to check against stripe length. */ +- if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { +- u64 max_len = stripe_len - stripe_offset; ++ *full_stripe_start = ++ div64_u64(offset, full_stripe_len) * full_stripe_len; + + /* +- * In case of raid56, we need to know the stripe aligned start ++ * For writes to RAID56, allow to write a full stripe set, but ++ * no straddling of stripe sets. + */ +- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { +- unsigned long full_stripe_len = stripe_len * data_stripes; +- raid56_full_stripe_start = offset; +- +- /* +- * Allow a write of a full stripe, but make sure we +- * don't allow straddling of stripes +- */ +- raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, +- full_stripe_len); +- raid56_full_stripe_start *= full_stripe_len; +- +- /* +- * For writes to RAID[56], allow a full stripeset across +- * all disks. For other RAID types and for RAID[56] +- * reads, just allow a single stripe (on a single disk). +- */ +- if (op == BTRFS_MAP_WRITE) { +- max_len = stripe_len * data_stripes - +- (offset - raid56_full_stripe_start); +- } +- } +- len = min_t(u64, em->len - offset, max_len); +- } else { +- len = em->len - offset; ++ if (op == BTRFS_MAP_WRITE) ++ return full_stripe_len - (offset - *full_stripe_start); + } + +- io_geom->len = len; +- io_geom->offset = offset; +- io_geom->stripe_len = stripe_len; +- io_geom->stripe_nr = stripe_nr; +- io_geom->stripe_offset = stripe_offset; +- io_geom->raid56_stripe_offset = raid56_full_stripe_start; +- +- return 0; ++ /* ++ * For other RAID types and for RAID56 reads, allow a single stripe (on ++ * a single disk). 
++ */ ++ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) ++ return stripe_len - *stripe_offset; ++ return U64_MAX; + } + + static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, +@@ -6387,6 +6338,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + { + struct extent_map *em; + struct map_lookup *map; ++ u64 map_offset; + u64 stripe_offset; + u64 stripe_nr; + u64 stripe_len; +@@ -6405,7 +6357,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + int patch_the_first_stripe_for_dev_replace = 0; + u64 physical_to_patch_in_first_stripe = 0; + u64 raid56_full_stripe_start = (u64)-1; +- struct btrfs_io_geometry geom; ++ u64 max_len; + + ASSERT(bioc_ret); + ASSERT(op != BTRFS_MAP_DISCARD); +@@ -6413,18 +6365,14 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + em = btrfs_get_chunk_map(fs_info, logical, *length); + ASSERT(!IS_ERR(em)); + +- ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); +- if (ret < 0) +- return ret; +- + map = em->map_lookup; +- +- *length = geom.len; +- stripe_len = geom.stripe_len; +- stripe_nr = geom.stripe_nr; +- stripe_offset = geom.stripe_offset; +- raid56_full_stripe_start = geom.raid56_stripe_offset; + data_stripes = nr_data_stripes(map); ++ stripe_len = map->stripe_len; ++ ++ map_offset = logical - em->start; ++ max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, ++ &stripe_offset, &raid56_full_stripe_start); ++ *length = min_t(u64, em->len - map_offset, max_len); + + down_read(&dev_replace->rwsem); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); +diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h +index 6b7a05f6cf82..7e51f2238f72 100644 +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -53,21 +53,6 @@ enum btrfs_raid_types { + BTRFS_NR_RAID_TYPES + }; + +-struct btrfs_io_geometry { +- /* remaining bytes before crossing a stripe */ +- u64 len; +- /* offset of logical address in chunk */ +- u64 offset; +- /* length of single IO stripe */ +- u32 stripe_len; +- /* offset of address in stripe */ +- u32 stripe_offset; +- /* number of stripe where address falls */ +- u64 stripe_nr; +- /* offset of raid56 stripe into the chunk */ +- u64 raid56_stripe_offset; +-}; +- + /* + * Use sequence counter to get consistent device stat data on + * 32-bit processors. 
+@@ -545,9 +530,6 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length_ret, + u32 *num_stripes); +-int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, +- enum btrfs_map_op op, u64 logical, +- struct btrfs_io_geometry *io_geom); + int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); + int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); + struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 1f503e8e42d4..f95b2c94d619 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -17,6 +17,7 @@ + #include "space-info.h" + #include "fs.h" + #include "accessors.h" ++#include "bio.h" + + /* Maximum number of zones to report per blkdev_report_zones() call */ + #define BTRFS_REPORT_NR_ZONES 4096 +@@ -160,7 +161,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, + */ + static inline u32 sb_zone_number(int shift, int mirror) + { +- u64 zone; ++ u64 zone = U64_MAX; + + ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); + switch (mirror) { +@@ -220,7 +221,6 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, + struct blk_zone *zones, unsigned int *nr_zones) + { + struct btrfs_zoned_device_info *zinfo = device->zone_info; +- u32 zno; + int ret; + + if (!*nr_zones) +@@ -235,6 +235,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, + /* Check cache */ + if (zinfo->zone_cache) { + unsigned int i; ++ u32 zno; + + ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); + zno = pos >> zinfo->zone_size_shift; +@@ -274,9 +275,12 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, + return -EIO; + + /* Populate cache */ +- if (zinfo->zone_cache) ++ if (zinfo->zone_cache) { ++ u32 zno = pos >> zinfo->zone_size_shift; ++ + memcpy(zinfo->zone_cache + zno, zones, + sizeof(*zinfo->zone_cache) * *nr_zones); ++ } + + return 0; + } +@@ -417,25 +421,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) + nr_sectors = bdev_nr_sectors(bdev); + zone_info->zone_size_shift = ilog2(zone_info->zone_size); + zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); +- /* +- * We limit max_zone_append_size also by max_segments * +- * PAGE_SIZE. Technically, we can have multiple pages per segment. But, +- * since btrfs adds the pages one by one to a bio, and btrfs cannot +- * increase the metadata reservation even if it increases the number of +- * extents, it is safe to stick with the limit. +- * +- * With the zoned emulation, we can have non-zoned device on the zoned +- * mode. In this case, we don't have a valid max zone append size. So, +- * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size. 
+- */ +- if (bdev_is_zoned(bdev)) { +- zone_info->max_zone_append_size = min_t(u64, +- (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, +- (u64)bdev_max_segments(bdev) << PAGE_SHIFT); +- } else { +- zone_info->max_zone_append_size = +- (u64)bdev_max_segments(bdev) << PAGE_SHIFT; +- } + if (!IS_ALIGNED(nr_sectors, zone_sectors)) + zone_info->nr_zones++; + +@@ -715,9 +700,9 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) + + int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) + { ++ struct queue_limits *lim = &fs_info->limits; + struct btrfs_device *device; + u64 zone_size = 0; +- u64 max_zone_append_size = 0; + int ret; + + /* +@@ -727,6 +712,8 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) + if (!btrfs_fs_incompat(fs_info, ZONED)) + return btrfs_check_for_zoned_device(fs_info); + ++ blk_set_stacking_limits(lim); ++ + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + struct btrfs_zoned_device_info *zone_info = device->zone_info; + +@@ -741,10 +728,17 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) + zone_info->zone_size, zone_size); + return -EINVAL; + } +- if (!max_zone_append_size || +- (zone_info->max_zone_append_size && +- zone_info->max_zone_append_size < max_zone_append_size)) +- max_zone_append_size = zone_info->max_zone_append_size; ++ ++ /* ++ * With the zoned emulation, we can have non-zoned device on the ++ * zoned mode. In this case, we don't have a valid max zone ++ * append size. ++ */ ++ if (bdev_is_zoned(device->bdev)) { ++ blk_stack_limits(lim, ++ &bdev_get_queue(device->bdev)->limits, ++ 0); ++ } + } + + /* +@@ -765,8 +759,18 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) + } + + fs_info->zone_size = zone_size; +- fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, +- fs_info->sectorsize); ++ /* ++ * Also limit max_zone_append_size by max_segments * PAGE_SIZE. ++ * Technically, we can have multiple pages per segment. But, since ++ * we add the pages one by one to a bio, and cannot increase the ++ * metadata reservation even if it increases the number of extents, it ++ * is safe to stick with the limit. ++ */ ++ fs_info->max_zone_append_size = ALIGN_DOWN( ++ min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT, ++ (u64)lim->max_sectors << SECTOR_SHIFT, ++ (u64)lim->max_segments << PAGE_SHIFT), ++ fs_info->sectorsize); + fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; + if (fs_info->max_zone_append_size < fs_info->max_extent_size) + fs_info->max_extent_size = fs_info->max_zone_append_size; +@@ -1623,8 +1627,10 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans) + spin_unlock(&trans->releasing_ebs_lock); + } + +-bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) ++bool btrfs_use_zone_append(struct btrfs_bio *bbio) + { ++ u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); ++ struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_block_group *cache; + bool ret = false; +@@ -1635,6 +1641,9 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) + if (!is_data_inode(&inode->vfs_inode)) + return false; + ++ if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) ++ return false; ++ + /* + * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the + * extent layout the relocation code has. 
+@@ -1657,22 +1666,16 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) + return ret; + } + +-void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, +- struct bio *bio) ++void btrfs_record_physical_zoned(struct btrfs_bio *bbio) + { ++ const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + struct btrfs_ordered_extent *ordered; +- const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + +- if (bio_op(bio) != REQ_OP_ZONE_APPEND) +- return; +- +- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset); ++ ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); + if (WARN_ON(!ordered)) + return; + + ordered->physical = physical; +- ordered->bdev = bio->bi_bdev; +- + btrfs_put_ordered_extent(ordered); + } + +@@ -1684,43 +1687,46 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) + struct extent_map *em; + struct btrfs_ordered_sum *sum; + u64 orig_logical = ordered->disk_bytenr; +- u64 *logical = NULL; +- int nr, stripe_len; ++ struct map_lookup *map; ++ u64 physical = ordered->physical; ++ u64 chunk_start_phys; ++ u64 logical; + +- /* Zoned devices should not have partitions. So, we can assume it is 0 */ +- ASSERT(!bdev_is_partition(ordered->bdev)); +- if (WARN_ON(!ordered->bdev)) ++ em = btrfs_get_chunk_map(fs_info, orig_logical, 1); ++ if (IS_ERR(em)) + return; ++ map = em->map_lookup; ++ chunk_start_phys = map->stripes[0].physical; + +- if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev, +- ordered->physical, &logical, &nr, +- &stripe_len))) +- goto out; +- +- WARN_ON(nr != 1); ++ if (WARN_ON_ONCE(map->num_stripes > 1) || ++ WARN_ON_ONCE((map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) || ++ WARN_ON_ONCE(physical < chunk_start_phys) || ++ WARN_ON_ONCE(physical > chunk_start_phys + em->orig_block_len)) { ++ free_extent_map(em); ++ return; ++ } ++ logical = em->start + (physical - map->stripes[0].physical); ++ free_extent_map(em); + +- if (orig_logical == *logical) +- goto out; ++ if (orig_logical == logical) ++ return; + +- ordered->disk_bytenr = *logical; ++ ordered->disk_bytenr = logical; + + em_tree = &inode->extent_tree; + write_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, ordered->file_offset, + ordered->num_bytes); +- em->block_start = *logical; ++ em->block_start = logical; + free_extent_map(em); + write_unlock(&em_tree->lock); + + list_for_each_entry(sum, &ordered->list, list) { +- if (*logical < orig_logical) +- sum->bytenr -= orig_logical - *logical; ++ if (logical < orig_logical) ++ sum->bytenr -= orig_logical - logical; + else +- sum->bytenr += *logical - orig_logical; ++ sum->bytenr += logical - orig_logical; + } +- +-out: +- kfree(logical); + } + + bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, +@@ -1845,26 +1851,6 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); + } + +-struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, +- u64 logical, u64 length) +-{ +- struct btrfs_device *device; +- struct extent_map *em; +- struct map_lookup *map; +- +- em = btrfs_get_chunk_map(fs_info, logical, length); +- if (IS_ERR(em)) +- return ERR_CAST(em); +- +- map = em->map_lookup; +- /* We only support single profile for now */ +- device = map->stripes[0].dev; +- +- free_extent_map(em); +- +- return device; +-} +- + /* + * Activate block group and underlying device zones + * +diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h 
+index f43990985d80..c0570d35fea2 100644 +--- a/fs/btrfs/zoned.h ++++ b/fs/btrfs/zoned.h +@@ -20,7 +20,6 @@ struct btrfs_zoned_device_info { + */ + u64 zone_size; + u8 zone_size_shift; +- u64 max_zone_append_size; + u32 nr_zones; + unsigned int max_active_zones; + atomic_t active_zones_left; +@@ -56,9 +55,8 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); + void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb); + void btrfs_free_redirty_list(struct btrfs_transaction *trans); +-bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start); +-void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, +- struct bio *bio); ++bool btrfs_use_zone_append(struct btrfs_bio *bbio); ++void btrfs_record_physical_zoned(struct btrfs_bio *bbio); + void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); + bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, +@@ -68,8 +66,6 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, + int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); + int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + u64 physical_start, u64 physical_pos); +-struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, +- u64 logical, u64 length); + bool btrfs_zone_activate(struct btrfs_block_group *block_group); + int btrfs_zone_finish(struct btrfs_block_group *block_group); + bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); +@@ -185,13 +181,12 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb) { } + static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } + +-static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) ++static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio) + { + return false; + } + +-static inline void btrfs_record_physical_zoned(struct inode *inode, +- u64 file_offset, struct bio *bio) ++static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) + { + } + +@@ -224,13 +219,6 @@ static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, + return -EOPNOTSUPP; + } + +-static inline struct btrfs_device *btrfs_zoned_get_device( +- struct btrfs_fs_info *fs_info, +- u64 logical, u64 length) +-{ +- return ERR_PTR(-EOPNOTSUPP); +-} +- + static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group) + { + return true; +diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c +index e7537fd305dd..e191ecfb1fde 100644 +--- a/fs/gfs2/bmap.c ++++ b/fs/gfs2/bmap.c +@@ -956,26 +956,40 @@ static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, + goto out; + } + +-static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos, +- unsigned len) ++static struct folio * ++gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len) + { ++ struct inode *inode = iter->inode; + unsigned int blockmask = i_blocksize(inode) - 1; + struct gfs2_sbd *sdp = GFS2_SB(inode); + unsigned int blocks; ++ struct folio *folio; ++ int status; + + blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits; +- return gfs2_trans_begin(sdp, RES_DINODE + blocks, 0); ++ status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0); ++ if (status) ++ return ERR_PTR(status); ++ ++ folio = iomap_get_folio(iter, pos); ++ if (IS_ERR(folio)) ++ gfs2_trans_end(sdp); ++ return folio; + } + +-static void 
gfs2_iomap_page_done(struct inode *inode, loff_t pos, +- unsigned copied, struct page *page) ++static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos, ++ unsigned copied, struct folio *folio) + { + struct gfs2_trans *tr = current->journal_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + +- if (page && !gfs2_is_stuffed(ip)) +- gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied); ++ if (!gfs2_is_stuffed(ip)) ++ gfs2_page_add_databufs(ip, &folio->page, offset_in_page(pos), ++ copied); ++ ++ folio_unlock(folio); ++ folio_put(folio); + + if (tr->tr_num_buf_new) + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); +@@ -983,9 +997,9 @@ static void gfs2_iomap_page_done(struct inode *inode, loff_t pos, + gfs2_trans_end(sdp); + } + +-static const struct iomap_page_ops gfs2_iomap_page_ops = { +- .page_prepare = gfs2_iomap_page_prepare, +- .page_done = gfs2_iomap_page_done, ++static const struct iomap_folio_ops gfs2_iomap_folio_ops = { ++ .get_folio = gfs2_iomap_get_folio, ++ .put_folio = gfs2_iomap_put_folio, + }; + + static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, +@@ -1061,7 +1075,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, + } + + if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip)) +- iomap->page_ops = &gfs2_iomap_page_ops; ++ iomap->folio_ops = &gfs2_iomap_folio_ops; + return 0; + + out_trans_end: +@@ -1277,7 +1291,7 @@ int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, + /* + * NOTE: Never call gfs2_block_zero_range with an open transaction because it + * uses iomap write to perform its actions, which begin their own transactions +- * (iomap_begin, page_prepare, etc.) ++ * (iomap_begin, get_folio, etc.) + */ + static int gfs2_block_zero_range(struct inode *inode, loff_t from, + unsigned int length) +diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c +index 356193e44cf0..d3c300563eb8 100644 +--- a/fs/iomap/buffered-io.c ++++ b/fs/iomap/buffered-io.c +@@ -457,6 +457,33 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) + } + EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); + ++/** ++ * iomap_get_folio - get a folio reference for writing ++ * @iter: iteration structure ++ * @pos: start offset of write ++ * ++ * Returns a locked reference to the folio at @pos, or an error pointer if the ++ * folio could not be obtained. 
++ */ ++struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) ++{ ++ unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; ++ struct folio *folio; ++ ++ if (iter->flags & IOMAP_NOWAIT) ++ fgp |= FGP_NOWAIT; ++ ++ folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, ++ fgp, mapping_gfp_mask(iter->inode->i_mapping)); ++ if (folio) ++ return folio; ++ ++ if (iter->flags & IOMAP_NOWAIT) ++ return ERR_PTR(-EAGAIN); ++ return ERR_PTR(-ENOMEM); ++} ++EXPORT_SYMBOL_GPL(iomap_get_folio); ++ + bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags) + { + trace_iomap_release_folio(folio->mapping->host, folio_pos(folio), +@@ -575,6 +602,30 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, + return 0; + } + ++static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos, ++ size_t len) ++{ ++ const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; ++ ++ if (folio_ops && folio_ops->get_folio) ++ return folio_ops->get_folio(iter, pos, len); ++ else ++ return iomap_get_folio(iter, pos); ++} ++ ++static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret, ++ struct folio *folio) ++{ ++ const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; ++ ++ if (folio_ops && folio_ops->put_folio) { ++ folio_ops->put_folio(iter->inode, pos, ret, folio); ++ } else { ++ folio_unlock(folio); ++ folio_put(folio); ++ } ++} ++ + static int iomap_write_begin_inline(const struct iomap_iter *iter, + struct folio *folio) + { +@@ -587,15 +638,11 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter, + static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, + size_t len, struct folio **foliop) + { +- const struct iomap_page_ops *page_ops = iter->iomap.page_ops; ++ const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + struct folio *folio; +- unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; + int status = 0; + +- if (iter->flags & IOMAP_NOWAIT) +- fgp |= FGP_NOWAIT; +- + BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); + if (srcmap != &iter->iomap) + BUG_ON(pos + len > srcmap->offset + srcmap->length); +@@ -606,18 +653,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, + if (!mapping_large_folio_support(iter->inode->i_mapping)) + len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); + +- if (page_ops && page_ops->page_prepare) { +- status = page_ops->page_prepare(iter->inode, pos, len); +- if (status) +- return status; +- } +- +- folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, +- fgp, mapping_gfp_mask(iter->inode->i_mapping)); +- if (!folio) { +- status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; +- goto out_no_page; +- } ++ folio = __iomap_get_folio(iter, pos, len); ++ if (IS_ERR(folio)) ++ return PTR_ERR(folio); + + /* + * Now we have a locked folio, before we do anything with it we need to +@@ -629,9 +667,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, + * could do the wrong thing here (zero a page range incorrectly or fail + * to zero) and corrupt data. 
+ */ +- if (page_ops && page_ops->iomap_valid) { +- bool iomap_valid = page_ops->iomap_valid(iter->inode, +- &iter->iomap); ++ if (folio_ops && folio_ops->iomap_valid) { ++ bool iomap_valid = folio_ops->iomap_valid(iter->inode, ++ &iter->iomap); + if (!iomap_valid) { + iter->iomap.flags |= IOMAP_F_STALE; + status = 0; +@@ -656,13 +694,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, + return 0; + + out_unlock: +- folio_unlock(folio); +- folio_put(folio); ++ __iomap_put_folio(iter, pos, 0, folio); + iomap_write_failed(iter->inode, pos, len); + +-out_no_page: +- if (page_ops && page_ops->page_done) +- page_ops->page_done(iter->inode, pos, 0, NULL); + return status; + } + +@@ -712,7 +746,6 @@ static size_t iomap_write_end_inline(const struct iomap_iter *iter, + static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, + size_t copied, struct folio *folio) + { +- const struct iomap_page_ops *page_ops = iter->iomap.page_ops; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t old_size = iter->inode->i_size; + size_t ret; +@@ -735,14 +768,10 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, + i_size_write(iter->inode, pos + ret); + iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; + } +- folio_unlock(folio); ++ __iomap_put_folio(iter, pos, ret, folio); + + if (old_size < pos) + pagecache_isize_extended(iter->inode, old_size, pos); +- if (page_ops && page_ops->page_done) +- page_ops->page_done(iter->inode, pos, ret, &folio->page); +- folio_put(folio); +- + if (ret < len) + iomap_write_failed(iter->inode, pos + ret, len - ret); + return ret; +diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c +index 9804714b1751..f771001574d0 100644 +--- a/fs/iomap/direct-io.c ++++ b/fs/iomap/direct-io.c +@@ -217,16 +217,10 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, + { + blk_opf_t opflags = REQ_SYNC | REQ_IDLE; + +- if (!(dio->flags & IOMAP_DIO_WRITE)) { +- WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND); ++ if (!(dio->flags & IOMAP_DIO_WRITE)) + return REQ_OP_READ; +- } +- +- if (iomap->flags & IOMAP_F_ZONE_APPEND) +- opflags |= REQ_OP_ZONE_APPEND; +- else +- opflags |= REQ_OP_WRITE; + ++ opflags |= REQ_OP_WRITE; + if (use_fua) + opflags |= REQ_FUA; + else +diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c +index 989cf341779b..f8ff81c3de76 100644 +--- a/fs/xfs/libxfs/xfs_alloc.c ++++ b/fs/xfs/libxfs/xfs_alloc.c +@@ -2472,20 +2472,20 @@ xfs_defer_agfl_block( + struct xfs_owner_info *oinfo) + { + struct xfs_mount *mp = tp->t_mountp; +- struct xfs_extent_free_item *new; /* new element */ ++ struct xfs_extent_free_item *xefi; + + ASSERT(xfs_extfree_item_cache != NULL); + ASSERT(oinfo != NULL); + +- new = kmem_cache_zalloc(xfs_extfree_item_cache, ++ xefi = kmem_cache_zalloc(xfs_extfree_item_cache, + GFP_KERNEL | __GFP_NOFAIL); +- new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); +- new->xefi_blockcount = 1; +- new->xefi_owner = oinfo->oi_owner; ++ xefi->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); ++ xefi->xefi_blockcount = 1; ++ xefi->xefi_owner = oinfo->oi_owner; + + trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); + +- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list); ++ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); + } + + /* +@@ -2500,7 +2500,7 @@ __xfs_free_extent_later( + const struct xfs_owner_info *oinfo, + bool skip_discard) + { +- struct xfs_extent_free_item *new; /* new element */ ++ struct xfs_extent_free_item *xefi; + 
#ifdef DEBUG + struct xfs_mount *mp = tp->t_mountp; + xfs_agnumber_t agno; +@@ -2519,27 +2519,27 @@ __xfs_free_extent_later( + #endif + ASSERT(xfs_extfree_item_cache != NULL); + +- new = kmem_cache_zalloc(xfs_extfree_item_cache, ++ xefi = kmem_cache_zalloc(xfs_extfree_item_cache, + GFP_KERNEL | __GFP_NOFAIL); +- new->xefi_startblock = bno; +- new->xefi_blockcount = (xfs_extlen_t)len; ++ xefi->xefi_startblock = bno; ++ xefi->xefi_blockcount = (xfs_extlen_t)len; + if (skip_discard) +- new->xefi_flags |= XFS_EFI_SKIP_DISCARD; ++ xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; + if (oinfo) { + ASSERT(oinfo->oi_offset == 0); + + if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) +- new->xefi_flags |= XFS_EFI_ATTR_FORK; ++ xefi->xefi_flags |= XFS_EFI_ATTR_FORK; + if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) +- new->xefi_flags |= XFS_EFI_BMBT_BLOCK; +- new->xefi_owner = oinfo->oi_owner; ++ xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK; ++ xefi->xefi_owner = oinfo->oi_owner; + } else { +- new->xefi_owner = XFS_RMAP_OWN_NULL; ++ xefi->xefi_owner = XFS_RMAP_OWN_NULL; + } + trace_xfs_bmap_free_defer(tp->t_mountp, + XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, + XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); +- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list); ++ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); + } + + #ifdef DEBUG +diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c +index 0d56a8d862e8..c8c65387136c 100644 +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -6146,39 +6146,37 @@ xfs_bmap_unmap_extent( + int + xfs_bmap_finish_one( + struct xfs_trans *tp, +- struct xfs_inode *ip, +- enum xfs_bmap_intent_type type, +- int whichfork, +- xfs_fileoff_t startoff, +- xfs_fsblock_t startblock, +- xfs_filblks_t *blockcount, +- xfs_exntst_t state) ++ struct xfs_bmap_intent *bi) + { ++ struct xfs_bmbt_irec *bmap = &bi->bi_bmap; + int error = 0; + + ASSERT(tp->t_firstblock == NULLFSBLOCK); + + trace_xfs_bmap_deferred(tp->t_mountp, +- XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, +- XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), +- ip->i_ino, whichfork, startoff, *blockcount, state); ++ XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock), ++ bi->bi_type, ++ XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock), ++ bi->bi_owner->i_ino, bi->bi_whichfork, ++ bmap->br_startoff, bmap->br_blockcount, ++ bmap->br_state); + +- if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK)) ++ if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK)) + return -EFSCORRUPTED; + + if (XFS_TEST_ERROR(false, tp->t_mountp, + XFS_ERRTAG_BMAP_FINISH_ONE)) + return -EIO; + +- switch (type) { ++ switch (bi->bi_type) { + case XFS_BMAP_MAP: +- error = xfs_bmapi_remap(tp, ip, startoff, *blockcount, +- startblock, 0); +- *blockcount = 0; ++ error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff, ++ bmap->br_blockcount, bmap->br_startblock, 0); ++ bmap->br_blockcount = 0; + break; + case XFS_BMAP_UNMAP: +- error = __xfs_bunmapi(tp, ip, startoff, blockcount, +- XFS_BMAPI_REMAP, 1); ++ error = __xfs_bunmapi(tp, bi->bi_owner, bmap->br_startoff, ++ &bmap->br_blockcount, XFS_BMAPI_REMAP, 1); + break; + default: + ASSERT(0); +diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h +index 16db95b11589..01c2df35c3e3 100644 +--- a/fs/xfs/libxfs/xfs_bmap.h ++++ b/fs/xfs/libxfs/xfs_bmap.h +@@ -234,10 +234,7 @@ struct xfs_bmap_intent { + struct xfs_bmbt_irec bi_bmap; + }; + +-int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip, +- enum xfs_bmap_intent_type type, int whichfork, +- xfs_fileoff_t 
startoff, xfs_fsblock_t startblock, +- xfs_filblks_t *blockcount, xfs_exntst_t state); ++int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi); + void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, + struct xfs_bmbt_irec *imap); + void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, +diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c +index 35f574421670..da8c769887fd 100644 +--- a/fs/xfs/libxfs/xfs_btree.c ++++ b/fs/xfs/libxfs/xfs_btree.c +@@ -2913,9 +2913,22 @@ xfs_btree_split_worker( + } + + /* +- * BMBT split requests often come in with little stack to work on. Push ++ * BMBT split requests often come in with little stack to work on so we push + * them off to a worker thread so there is lots of stack to use. For the other + * btree types, just call directly to avoid the context switch overhead here. ++ * ++ * Care must be taken here - the work queue rescuer thread introduces potential ++ * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new ++ * AGFs to allocate blocks. A task being run by the rescuer could attempt to ++ * lock an AGF that is already locked by a task queued to run by the rescuer, ++ * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to ++ * release it until the current thread it is running gains the lock. ++ * ++ * To avoid this issue, we only ever queue BMBT splits that don't have an AGF ++ * already locked to allocate from. The only place that doesn't hold an AGF ++ * locked is unwritten extent conversion at IO completion, but that has already ++ * been offloaded to a worker thread and hence has no stack consumption issues ++ * we have to worry about. + */ + STATIC int /* error */ + xfs_btree_split( +@@ -2929,7 +2942,8 @@ xfs_btree_split( + struct xfs_btree_split_args args; + DECLARE_COMPLETION_ONSTACK(done); + +- if (cur->bc_btnum != XFS_BTNUM_BMAP) ++ if (cur->bc_btnum != XFS_BTNUM_BMAP || ++ cur->bc_tp->t_firstblock == NULLFSBLOCK) + return __xfs_btree_split(cur, level, ptrp, key, curp, stat); + + args.cur = cur; +diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c +index 6f7ed9288fe4..bcf46aa0d08b 100644 +--- a/fs/xfs/libxfs/xfs_refcount.c ++++ b/fs/xfs/libxfs/xfs_refcount.c +@@ -1213,37 +1213,33 @@ xfs_refcount_adjust_extents( + STATIC int + xfs_refcount_adjust( + struct xfs_btree_cur *cur, +- xfs_agblock_t agbno, +- xfs_extlen_t aglen, +- xfs_agblock_t *new_agbno, +- xfs_extlen_t *new_aglen, ++ xfs_agblock_t *agbno, ++ xfs_extlen_t *aglen, + enum xfs_refc_adjust_op adj) + { + bool shape_changed; + int shape_changes = 0; + int error; + +- *new_agbno = agbno; +- *new_aglen = aglen; + if (adj == XFS_REFCOUNT_ADJUST_INCREASE) +- trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.pag->pag_agno, +- agbno, aglen); ++ trace_xfs_refcount_increase(cur->bc_mp, ++ cur->bc_ag.pag->pag_agno, *agbno, *aglen); + else +- trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.pag->pag_agno, +- agbno, aglen); ++ trace_xfs_refcount_decrease(cur->bc_mp, ++ cur->bc_ag.pag->pag_agno, *agbno, *aglen); + + /* + * Ensure that no rcextents cross the boundary of the adjustment range. 
+ */ + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, +- agbno, &shape_changed); ++ *agbno, &shape_changed); + if (error) + goto out_error; + if (shape_changed) + shape_changes++; + + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, +- agbno + aglen, &shape_changed); ++ *agbno + *aglen, &shape_changed); + if (error) + goto out_error; + if (shape_changed) +@@ -1253,7 +1249,7 @@ xfs_refcount_adjust( + * Try to merge with the left or right extents of the range. + */ + error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED, +- new_agbno, new_aglen, adj, &shape_changed); ++ agbno, aglen, adj, &shape_changed); + if (error) + goto out_error; + if (shape_changed) +@@ -1262,7 +1258,7 @@ xfs_refcount_adjust( + cur->bc_ag.refc.shape_changes++; + + /* Now that we've taken care of the ends, adjust the middle extents */ +- error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, adj); ++ error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj); + if (error) + goto out_error; + +@@ -1298,21 +1294,20 @@ xfs_refcount_finish_one_cleanup( + static inline int + xfs_refcount_continue_op( + struct xfs_btree_cur *cur, +- xfs_fsblock_t startblock, +- xfs_agblock_t new_agbno, +- xfs_extlen_t new_len, +- xfs_fsblock_t *new_fsbno) ++ struct xfs_refcount_intent *ri, ++ xfs_agblock_t new_agbno) + { + struct xfs_mount *mp = cur->bc_mp; + struct xfs_perag *pag = cur->bc_ag.pag; + +- if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len))) ++ if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, ++ ri->ri_blockcount))) + return -EFSCORRUPTED; + +- *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); ++ ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + +- ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len)); +- ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno)); ++ ASSERT(xfs_verify_fsbext(mp, ri->ri_startblock, ri->ri_blockcount)); ++ ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); + + return 0; + } +@@ -1327,11 +1322,7 @@ xfs_refcount_continue_op( + int + xfs_refcount_finish_one( + struct xfs_trans *tp, +- enum xfs_refcount_intent_type type, +- xfs_fsblock_t startblock, +- xfs_extlen_t blockcount, +- xfs_fsblock_t *new_fsb, +- xfs_extlen_t *new_len, ++ struct xfs_refcount_intent *ri, + struct xfs_btree_cur **pcur) + { + struct xfs_mount *mp = tp->t_mountp; +@@ -1339,17 +1330,16 @@ xfs_refcount_finish_one( + struct xfs_buf *agbp = NULL; + int error = 0; + xfs_agblock_t bno; +- xfs_agblock_t new_agbno; + unsigned long nr_ops = 0; + int shape_changes = 0; + struct xfs_perag *pag; + +- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); +- bno = XFS_FSB_TO_AGBNO(mp, startblock); ++ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); ++ bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock); + +- trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock), +- type, XFS_FSB_TO_AGBNO(mp, startblock), +- blockcount); ++ trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock), ++ ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock), ++ ri->ri_blockcount); + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) { + error = -EIO; +@@ -1380,42 +1370,42 @@ xfs_refcount_finish_one( + } + *pcur = rcur; + +- switch (type) { ++ switch (ri->ri_type) { + case XFS_REFCOUNT_INCREASE: +- error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, +- new_len, XFS_REFCOUNT_ADJUST_INCREASE); ++ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, ++ XFS_REFCOUNT_ADJUST_INCREASE); + if 
(error) + goto out_drop; +- if (*new_len > 0) +- error = xfs_refcount_continue_op(rcur, startblock, +- new_agbno, *new_len, new_fsb); ++ if (ri->ri_blockcount > 0) ++ error = xfs_refcount_continue_op(rcur, ri, bno); + break; + case XFS_REFCOUNT_DECREASE: +- error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, +- new_len, XFS_REFCOUNT_ADJUST_DECREASE); ++ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, ++ XFS_REFCOUNT_ADJUST_DECREASE); + if (error) + goto out_drop; +- if (*new_len > 0) +- error = xfs_refcount_continue_op(rcur, startblock, +- new_agbno, *new_len, new_fsb); ++ if (ri->ri_blockcount > 0) ++ error = xfs_refcount_continue_op(rcur, ri, bno); + break; + case XFS_REFCOUNT_ALLOC_COW: +- *new_fsb = startblock + blockcount; +- *new_len = 0; +- error = __xfs_refcount_cow_alloc(rcur, bno, blockcount); ++ error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount); ++ if (error) ++ goto out_drop; ++ ri->ri_blockcount = 0; + break; + case XFS_REFCOUNT_FREE_COW: +- *new_fsb = startblock + blockcount; +- *new_len = 0; +- error = __xfs_refcount_cow_free(rcur, bno, blockcount); ++ error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount); ++ if (error) ++ goto out_drop; ++ ri->ri_blockcount = 0; + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } +- if (!error && *new_len > 0) +- trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, type, +- bno, blockcount, new_agbno, *new_len); ++ if (!error && ri->ri_blockcount > 0) ++ trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, ++ ri->ri_type, bno, ri->ri_blockcount); + out_drop: + xfs_perag_put(pag); + return error; +diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h +index 452f30556f5a..c633477ce3ce 100644 +--- a/fs/xfs/libxfs/xfs_refcount.h ++++ b/fs/xfs/libxfs/xfs_refcount.h +@@ -75,9 +75,7 @@ void xfs_refcount_decrease_extent(struct xfs_trans *tp, + extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, + struct xfs_btree_cur *rcur, int error); + extern int xfs_refcount_finish_one(struct xfs_trans *tp, +- enum xfs_refcount_intent_type type, xfs_fsblock_t startblock, +- xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb, +- xfs_extlen_t *new_len, struct xfs_btree_cur **pcur); ++ struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); + + extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, + xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, +diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c +index b56aca1e7c66..df720041cd3d 100644 +--- a/fs/xfs/libxfs/xfs_rmap.c ++++ b/fs/xfs/libxfs/xfs_rmap.c +@@ -2390,13 +2390,7 @@ xfs_rmap_finish_one_cleanup( + int + xfs_rmap_finish_one( + struct xfs_trans *tp, +- enum xfs_rmap_intent_type type, +- uint64_t owner, +- int whichfork, +- xfs_fileoff_t startoff, +- xfs_fsblock_t startblock, +- xfs_filblks_t blockcount, +- xfs_exntst_t state, ++ struct xfs_rmap_intent *ri, + struct xfs_btree_cur **pcur) + { + struct xfs_mount *mp = tp->t_mountp; +@@ -2408,11 +2402,13 @@ xfs_rmap_finish_one( + xfs_agblock_t bno; + bool unwritten; + +- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); +- bno = XFS_FSB_TO_AGBNO(mp, startblock); ++ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock)); ++ bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock); + +- trace_xfs_rmap_deferred(mp, pag->pag_agno, type, bno, owner, whichfork, +- startoff, blockcount, state); ++ trace_xfs_rmap_deferred(mp, pag->pag_agno, ri->ri_type, bno, ++ ri->ri_owner, ri->ri_whichfork, ++ 
ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount, ++ ri->ri_bmap.br_state); + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) { + error = -EIO; +@@ -2448,35 +2444,37 @@ xfs_rmap_finish_one( + } + *pcur = rcur; + +- xfs_rmap_ino_owner(&oinfo, owner, whichfork, startoff); +- unwritten = state == XFS_EXT_UNWRITTEN; +- bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, startblock); ++ xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork, ++ ri->ri_bmap.br_startoff); ++ unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN; ++ bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock); + +- switch (type) { ++ switch (ri->ri_type) { + case XFS_RMAP_ALLOC: + case XFS_RMAP_MAP: +- error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo); ++ error = xfs_rmap_map(rcur, bno, ri->ri_bmap.br_blockcount, ++ unwritten, &oinfo); + break; + case XFS_RMAP_MAP_SHARED: +- error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten, +- &oinfo); ++ error = xfs_rmap_map_shared(rcur, bno, ++ ri->ri_bmap.br_blockcount, unwritten, &oinfo); + break; + case XFS_RMAP_FREE: + case XFS_RMAP_UNMAP: +- error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten, +- &oinfo); ++ error = xfs_rmap_unmap(rcur, bno, ri->ri_bmap.br_blockcount, ++ unwritten, &oinfo); + break; + case XFS_RMAP_UNMAP_SHARED: +- error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten, +- &oinfo); ++ error = xfs_rmap_unmap_shared(rcur, bno, ++ ri->ri_bmap.br_blockcount, unwritten, &oinfo); + break; + case XFS_RMAP_CONVERT: +- error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten, +- &oinfo); ++ error = xfs_rmap_convert(rcur, bno, ri->ri_bmap.br_blockcount, ++ !unwritten, &oinfo); + break; + case XFS_RMAP_CONVERT_SHARED: +- error = xfs_rmap_convert_shared(rcur, bno, blockcount, +- !unwritten, &oinfo); ++ error = xfs_rmap_convert_shared(rcur, bno, ++ ri->ri_bmap.br_blockcount, !unwritten, &oinfo); + break; + default: + ASSERT(0); +diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h +index 54741a591a17..2dac88cea28d 100644 +--- a/fs/xfs/libxfs/xfs_rmap.h ++++ b/fs/xfs/libxfs/xfs_rmap.h +@@ -179,10 +179,8 @@ void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, + + void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, + struct xfs_btree_cur *rcur, int error); +-int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, +- uint64_t owner, int whichfork, xfs_fileoff_t startoff, +- xfs_fsblock_t startblock, xfs_filblks_t blockcount, +- xfs_exntst_t state, struct xfs_btree_cur **pcur); ++int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri, ++ struct xfs_btree_cur **pcur); + + int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, + uint64_t owner, uint64_t offset, unsigned int flags, +diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c +index 41323da523d1..6e2f0013380a 100644 +--- a/fs/xfs/xfs_bmap_item.c ++++ b/fs/xfs/xfs_bmap_item.c +@@ -246,18 +246,11 @@ static int + xfs_trans_log_finish_bmap_update( + struct xfs_trans *tp, + struct xfs_bud_log_item *budp, +- enum xfs_bmap_intent_type type, +- struct xfs_inode *ip, +- int whichfork, +- xfs_fileoff_t startoff, +- xfs_fsblock_t startblock, +- xfs_filblks_t *blockcount, +- xfs_exntst_t state) ++ struct xfs_bmap_intent *bi) + { + int error; + +- error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff, +- startblock, blockcount, state); ++ error = xfs_bmap_finish_one(tp, bi); + + /* + * Mark the transaction dirty, even on error. 
This ensures the +@@ -290,24 +283,24 @@ xfs_bmap_update_diff_items( + /* Set the map extent flags for this mapping. */ + static void + xfs_trans_set_bmap_flags( +- struct xfs_map_extent *bmap, ++ struct xfs_map_extent *map, + enum xfs_bmap_intent_type type, + int whichfork, + xfs_exntst_t state) + { +- bmap->me_flags = 0; ++ map->me_flags = 0; + switch (type) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: +- bmap->me_flags = type; ++ map->me_flags = type; + break; + default: + ASSERT(0); + } + if (state == XFS_EXT_UNWRITTEN) +- bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; ++ map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; + if (whichfork == XFS_ATTR_FORK) +- bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; ++ map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; + } + + /* Log bmap updates in the intent item. */ +@@ -315,7 +308,7 @@ STATIC void + xfs_bmap_update_log_item( + struct xfs_trans *tp, + struct xfs_bui_log_item *buip, +- struct xfs_bmap_intent *bmap) ++ struct xfs_bmap_intent *bi) + { + uint next_extent; + struct xfs_map_extent *map; +@@ -331,12 +324,12 @@ xfs_bmap_update_log_item( + next_extent = atomic_inc_return(&buip->bui_next_extent) - 1; + ASSERT(next_extent < buip->bui_format.bui_nextents); + map = &buip->bui_format.bui_extents[next_extent]; +- map->me_owner = bmap->bi_owner->i_ino; +- map->me_startblock = bmap->bi_bmap.br_startblock; +- map->me_startoff = bmap->bi_bmap.br_startoff; +- map->me_len = bmap->bi_bmap.br_blockcount; +- xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork, +- bmap->bi_bmap.br_state); ++ map->me_owner = bi->bi_owner->i_ino; ++ map->me_startblock = bi->bi_bmap.br_startblock; ++ map->me_startoff = bi->bi_bmap.br_startoff; ++ map->me_len = bi->bi_bmap.br_blockcount; ++ xfs_trans_set_bmap_flags(map, bi->bi_type, bi->bi_whichfork, ++ bi->bi_bmap.br_state); + } + + static struct xfs_log_item * +@@ -348,15 +341,15 @@ xfs_bmap_update_create_intent( + { + struct xfs_mount *mp = tp->t_mountp; + struct xfs_bui_log_item *buip = xfs_bui_init(mp); +- struct xfs_bmap_intent *bmap; ++ struct xfs_bmap_intent *bi; + + ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); + + xfs_trans_add_item(tp, &buip->bui_item); + if (sort) + list_sort(mp, items, xfs_bmap_update_diff_items); +- list_for_each_entry(bmap, items, bi_list) +- xfs_bmap_update_log_item(tp, buip, bmap); ++ list_for_each_entry(bi, items, bi_list) ++ xfs_bmap_update_log_item(tp, buip, bi); + return &buip->bui_item; + } + +@@ -378,25 +371,17 @@ xfs_bmap_update_finish_item( + struct list_head *item, + struct xfs_btree_cur **state) + { +- struct xfs_bmap_intent *bmap; +- xfs_filblks_t count; ++ struct xfs_bmap_intent *bi; + int error; + +- bmap = container_of(item, struct xfs_bmap_intent, bi_list); +- count = bmap->bi_bmap.br_blockcount; +- error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), +- bmap->bi_type, +- bmap->bi_owner, bmap->bi_whichfork, +- bmap->bi_bmap.br_startoff, +- bmap->bi_bmap.br_startblock, +- &count, +- bmap->bi_bmap.br_state); +- if (!error && count > 0) { +- ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); +- bmap->bi_bmap.br_blockcount = count; ++ bi = container_of(item, struct xfs_bmap_intent, bi_list); ++ ++ error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi); ++ if (!error && bi->bi_bmap.br_blockcount > 0) { ++ ASSERT(bi->bi_type == XFS_BMAP_UNMAP); + return -EAGAIN; + } +- kmem_cache_free(xfs_bmap_intent_cache, bmap); ++ kmem_cache_free(xfs_bmap_intent_cache, bi); + return error; + } + +@@ -413,10 +398,10 @@ STATIC void + xfs_bmap_update_cancel_item( + struct list_head *item) + { +- 
struct xfs_bmap_intent *bmap; ++ struct xfs_bmap_intent *bi; + +- bmap = container_of(item, struct xfs_bmap_intent, bi_list); +- kmem_cache_free(xfs_bmap_intent_cache, bmap); ++ bi = container_of(item, struct xfs_bmap_intent, bi_list); ++ kmem_cache_free(xfs_bmap_intent_cache, bi); + } + + const struct xfs_defer_op_type xfs_bmap_update_defer_type = { +@@ -434,18 +419,18 @@ xfs_bui_validate( + struct xfs_mount *mp, + struct xfs_bui_log_item *buip) + { +- struct xfs_map_extent *bmap; ++ struct xfs_map_extent *map; + + /* Only one mapping operation per BUI... */ + if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) + return false; + +- bmap = &buip->bui_format.bui_extents[0]; ++ map = &buip->bui_format.bui_extents[0]; + +- if (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS) ++ if (map->me_flags & ~XFS_BMAP_EXTENT_FLAGS) + return false; + +- switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { ++ switch (map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: + break; +@@ -453,13 +438,13 @@ xfs_bui_validate( + return false; + } + +- if (!xfs_verify_ino(mp, bmap->me_owner)) ++ if (!xfs_verify_ino(mp, map->me_owner)) + return false; + +- if (!xfs_verify_fileext(mp, bmap->me_startoff, bmap->me_len)) ++ if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) + return false; + +- return xfs_verify_fsbext(mp, bmap->me_startblock, bmap->me_len); ++ return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); + } + + /* +@@ -471,17 +456,13 @@ xfs_bui_item_recover( + struct xfs_log_item *lip, + struct list_head *capture_list) + { +- struct xfs_bmbt_irec irec; ++ struct xfs_bmap_intent fake = { }; + struct xfs_bui_log_item *buip = BUI_ITEM(lip); + struct xfs_trans *tp; + struct xfs_inode *ip = NULL; + struct xfs_mount *mp = lip->li_log->l_mp; +- struct xfs_map_extent *bmap; ++ struct xfs_map_extent *map; + struct xfs_bud_log_item *budp; +- xfs_filblks_t count; +- xfs_exntst_t state; +- unsigned int bui_type; +- int whichfork; + int iext_delta; + int error = 0; + +@@ -491,14 +472,12 @@ xfs_bui_item_recover( + return -EFSCORRUPTED; + } + +- bmap = &buip->bui_format.bui_extents[0]; +- state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? +- XFS_EXT_UNWRITTEN : XFS_EXT_NORM; +- whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? ++ map = &buip->bui_format.bui_extents[0]; ++ fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? 
+ XFS_ATTR_FORK : XFS_DATA_FORK; +- bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; ++ fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + +- error = xlog_recover_iget(mp, bmap->me_owner, &ip); ++ error = xlog_recover_iget(mp, map->me_owner, &ip); + if (error) + return error; + +@@ -512,34 +491,34 @@ xfs_bui_item_recover( + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + +- if (bui_type == XFS_BMAP_MAP) ++ if (fake.bi_type == XFS_BMAP_MAP) + iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; + else + iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; + +- error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta); ++ error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, iext_delta); + if (error) + goto err_cancel; + +- count = bmap->me_len; +- error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, +- whichfork, bmap->me_startoff, bmap->me_startblock, +- &count, state); ++ fake.bi_owner = ip; ++ fake.bi_bmap.br_startblock = map->me_startblock; ++ fake.bi_bmap.br_startoff = map->me_startoff; ++ fake.bi_bmap.br_blockcount = map->me_len; ++ fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? ++ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; ++ ++ error = xfs_trans_log_finish_bmap_update(tp, budp, &fake); + if (error == -EFSCORRUPTED) +- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bmap, +- sizeof(*bmap)); ++ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, ++ sizeof(*map)); + if (error) + goto err_cancel; + +- if (count > 0) { +- ASSERT(bui_type == XFS_BMAP_UNMAP); +- irec.br_startblock = bmap->me_startblock; +- irec.br_blockcount = count; +- irec.br_startoff = bmap->me_startoff; +- irec.br_state = state; +- xfs_bmap_unmap_extent(tp, ip, &irec); ++ if (fake.bi_bmap.br_blockcount > 0) { ++ ASSERT(fake.bi_type == XFS_BMAP_UNMAP); ++ xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap); + } + + /* +@@ -579,18 +558,18 @@ xfs_bui_item_relog( + { + struct xfs_bud_log_item *budp; + struct xfs_bui_log_item *buip; +- struct xfs_map_extent *extp; ++ struct xfs_map_extent *map; + unsigned int count; + + count = BUI_ITEM(intent)->bui_format.bui_nextents; +- extp = BUI_ITEM(intent)->bui_format.bui_extents; ++ map = BUI_ITEM(intent)->bui_format.bui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); + + buip = xfs_bui_init(tp->t_mountp); +- memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp)); ++ memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map)); + atomic_set(&buip->bui_next_extent, count); + xfs_trans_add_item(tp, &buip->bui_item); + set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); +diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c +index ae082808cfed..b2cbbba3e15a 100644 +--- a/fs/xfs/xfs_error.c ++++ b/fs/xfs/xfs_error.c +@@ -228,7 +228,7 @@ static struct attribute *xfs_errortag_attrs[] = { + }; + ATTRIBUTE_GROUPS(xfs_errortag); + +-static struct kobj_type xfs_errortag_ktype = { ++static const struct kobj_type xfs_errortag_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_errortag_sysfs_ops, + .default_groups = xfs_errortag_groups, +diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h +index dbe6c37dc697..0b9c5ba8a598 100644 +--- a/fs/xfs/xfs_error.h ++++ b/fs/xfs/xfs_error.h +@@ -75,7 +75,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); + + /* + * XFS panic tags -- allow a call to xfs_alert_tag() be turned into +- * a panic by setting 
xfs_panic_mask in a sysctl. ++ * a panic by setting fs.xfs.panic_mask in a sysctl. + */ + #define XFS_NO_PTAG 0u + #define XFS_PTAG_IFLUSH (1u << 0) +@@ -88,6 +88,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); + #define XFS_PTAG_FSBLOCK_ZERO (1u << 7) + #define XFS_PTAG_VERIFIER_ERROR (1u << 8) + ++#define XFS_PTAG_MASK (XFS_PTAG_IFLUSH | \ ++ XFS_PTAG_LOGRES | \ ++ XFS_PTAG_AILDELETE | \ ++ XFS_PTAG_ERROR_REPORT | \ ++ XFS_PTAG_SHUTDOWN_CORRUPT | \ ++ XFS_PTAG_SHUTDOWN_IOERROR | \ ++ XFS_PTAG_SHUTDOWN_LOGERROR | \ ++ XFS_PTAG_FSBLOCK_ZERO | \ ++ XFS_PTAG_VERIFIER_ERROR) ++ + #define XFS_PTAG_STRINGS \ + { XFS_NO_PTAG, "none" }, \ + { XFS_PTAG_IFLUSH, "iflush" }, \ +diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c +index d5130d1fcfae..011b50469301 100644 +--- a/fs/xfs/xfs_extfree_item.c ++++ b/fs/xfs/xfs_extfree_item.c +@@ -345,23 +345,30 @@ static int + xfs_trans_free_extent( + struct xfs_trans *tp, + struct xfs_efd_log_item *efdp, +- xfs_fsblock_t start_block, +- xfs_extlen_t ext_len, +- const struct xfs_owner_info *oinfo, +- bool skip_discard) ++ struct xfs_extent_free_item *xefi) + { ++ struct xfs_owner_info oinfo = { }; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_extent *extp; + uint next_extent; +- xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block); ++ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, ++ xefi->xefi_startblock); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, +- start_block); ++ xefi->xefi_startblock); + int error; + +- trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); ++ oinfo.oi_owner = xefi->xefi_owner; ++ if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) ++ oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; ++ if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) ++ oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; ++ ++ trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ++ xefi->xefi_blockcount); + +- error = __xfs_free_extent(tp, start_block, ext_len, +- oinfo, XFS_AG_RESV_NONE, skip_discard); ++ error = __xfs_free_extent(tp, xefi->xefi_startblock, ++ xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE, ++ xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); + /* + * Mark the transaction dirty, even on error. 
This ensures the + * transaction is aborted, which: +@@ -375,8 +382,8 @@ xfs_trans_free_extent( + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); +- extp->ext_start = start_block; +- extp->ext_len = ext_len; ++ extp->ext_start = xefi->xefi_startblock; ++ extp->ext_len = xefi->xefi_blockcount; + efdp->efd_next_extent++; + + return error; +@@ -404,7 +411,7 @@ STATIC void + xfs_extent_free_log_item( + struct xfs_trans *tp, + struct xfs_efi_log_item *efip, +- struct xfs_extent_free_item *free) ++ struct xfs_extent_free_item *xefi) + { + uint next_extent; + struct xfs_extent *extp; +@@ -420,8 +427,8 @@ xfs_extent_free_log_item( + next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; + ASSERT(next_extent < efip->efi_format.efi_nextents); + extp = &efip->efi_format.efi_extents[next_extent]; +- extp->ext_start = free->xefi_startblock; +- extp->ext_len = free->xefi_blockcount; ++ extp->ext_start = xefi->xefi_startblock; ++ extp->ext_len = xefi->xefi_blockcount; + } + + static struct xfs_log_item * +@@ -433,15 +440,15 @@ xfs_extent_free_create_intent( + { + struct xfs_mount *mp = tp->t_mountp; + struct xfs_efi_log_item *efip = xfs_efi_init(mp, count); +- struct xfs_extent_free_item *free; ++ struct xfs_extent_free_item *xefi; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &efip->efi_item); + if (sort) + list_sort(mp, items, xfs_extent_free_diff_items); +- list_for_each_entry(free, items, xefi_list) +- xfs_extent_free_log_item(tp, efip, free); ++ list_for_each_entry(xefi, items, xefi_list) ++ xfs_extent_free_log_item(tp, efip, xefi); + return &efip->efi_item; + } + +@@ -463,21 +470,13 @@ xfs_extent_free_finish_item( + struct list_head *item, + struct xfs_btree_cur **state) + { +- struct xfs_owner_info oinfo = { }; +- struct xfs_extent_free_item *free; ++ struct xfs_extent_free_item *xefi; + int error; + +- free = container_of(item, struct xfs_extent_free_item, xefi_list); +- oinfo.oi_owner = free->xefi_owner; +- if (free->xefi_flags & XFS_EFI_ATTR_FORK) +- oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; +- if (free->xefi_flags & XFS_EFI_BMBT_BLOCK) +- oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; +- error = xfs_trans_free_extent(tp, EFD_ITEM(done), +- free->xefi_startblock, +- free->xefi_blockcount, +- &oinfo, free->xefi_flags & XFS_EFI_SKIP_DISCARD); +- kmem_cache_free(xfs_extfree_item_cache, free); ++ xefi = container_of(item, struct xfs_extent_free_item, xefi_list); ++ ++ error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); ++ kmem_cache_free(xfs_extfree_item_cache, xefi); + return error; + } + +@@ -494,10 +493,10 @@ STATIC void + xfs_extent_free_cancel_item( + struct list_head *item) + { +- struct xfs_extent_free_item *free; ++ struct xfs_extent_free_item *xefi; + +- free = container_of(item, struct xfs_extent_free_item, xefi_list); +- kmem_cache_free(xfs_extfree_item_cache, free); ++ xefi = container_of(item, struct xfs_extent_free_item, xefi_list); ++ kmem_cache_free(xfs_extfree_item_cache, xefi); + } + + const struct xfs_defer_op_type xfs_extent_free_defer_type = { +@@ -523,7 +522,7 @@ xfs_agfl_free_finish_item( + struct xfs_owner_info oinfo = { }; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_efd_log_item *efdp = EFD_ITEM(done); +- struct xfs_extent_free_item *free; ++ struct xfs_extent_free_item *xefi; + struct xfs_extent *extp; + struct xfs_buf *agbp; + int error; +@@ -532,13 +531,13 @@ xfs_agfl_free_finish_item( + uint next_extent; + struct xfs_perag *pag; + +- free = 
container_of(item, struct xfs_extent_free_item, xefi_list); +- ASSERT(free->xefi_blockcount == 1); +- agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock); +- agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock); +- oinfo.oi_owner = free->xefi_owner; ++ xefi = container_of(item, struct xfs_extent_free_item, xefi_list); ++ ASSERT(xefi->xefi_blockcount == 1); ++ agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock); ++ agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); ++ oinfo.oi_owner = xefi->xefi_owner; + +- trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount); ++ trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, xefi->xefi_blockcount); + + pag = xfs_perag_get(mp, agno); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); +@@ -559,11 +558,11 @@ xfs_agfl_free_finish_item( + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); +- extp->ext_start = free->xefi_startblock; +- extp->ext_len = free->xefi_blockcount; ++ extp->ext_start = xefi->xefi_startblock; ++ extp->ext_len = xefi->xefi_blockcount; + efdp->efd_next_extent++; + +- kmem_cache_free(xfs_extfree_item_cache, free); ++ kmem_cache_free(xfs_extfree_item_cache, xefi); + return error; + } + +@@ -599,7 +598,6 @@ xfs_efi_item_recover( + struct xfs_mount *mp = lip->li_log->l_mp; + struct xfs_efd_log_item *efdp; + struct xfs_trans *tp; +- struct xfs_extent *extp; + int i; + int error = 0; + +@@ -624,10 +622,17 @@ xfs_efi_item_recover( + efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); + + for (i = 0; i < efip->efi_format.efi_nextents; i++) { ++ struct xfs_extent_free_item fake = { ++ .xefi_owner = XFS_RMAP_OWN_UNKNOWN, ++ }; ++ struct xfs_extent *extp; ++ + extp = &efip->efi_format.efi_extents[i]; +- error = xfs_trans_free_extent(tp, efdp, extp->ext_start, +- extp->ext_len, +- &XFS_RMAP_OINFO_ANY_OWNER, false); ++ ++ fake.xefi_startblock = extp->ext_start; ++ fake.xefi_blockcount = extp->ext_len; ++ ++ error = xfs_trans_free_extent(tp, efdp, &fake); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + extp, sizeof(*extp)); +diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c +index 4d0a98f920ca..9edc1f2bc939 100644 +--- a/fs/xfs/xfs_globals.c ++++ b/fs/xfs/xfs_globals.c +@@ -4,6 +4,7 @@ + * All Rights Reserved. + */ + #include "xfs.h" ++#include "xfs_error.h" + + /* + * Tunable XFS parameters. 
xfs_params is required even when CONFIG_SYSCTL=n, +@@ -15,7 +16,7 @@ xfs_param_t xfs_params = { + /* MIN DFLT MAX */ + .sgid_inherit = { 0, 0, 1 }, + .symlink_mode = { 0, 0, 1 }, +- .panic_mask = { 0, 0, 256 }, ++ .panic_mask = { 0, 0, XFS_PTAG_MASK}, + .error_level = { 0, 3, 11 }, + .syncd_timer = { 1*100, 30*100, 7200*100}, + .stats_clear = { 0, 0, 1 }, +diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c +index fc1946f80a4a..69dbe7814128 100644 +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -83,7 +83,7 @@ xfs_iomap_valid( + return true; + } + +-static const struct iomap_page_ops xfs_iomap_page_ops = { ++static const struct iomap_folio_ops xfs_iomap_folio_ops = { + .iomap_valid = xfs_iomap_valid, + }; + +@@ -133,7 +133,7 @@ xfs_bmbt_to_iomap( + iomap->flags |= IOMAP_F_DIRTY; + + iomap->validity_cookie = sequence_cookie; +- iomap->page_ops = &xfs_iomap_page_ops; ++ iomap->folio_ops = &xfs_iomap_folio_ops; + return 0; + } + +diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c +index 858e3e9eb4a8..48d771a76add 100644 +--- a/fs/xfs/xfs_refcount_item.c ++++ b/fs/xfs/xfs_refcount_item.c +@@ -252,17 +252,12 @@ static int + xfs_trans_log_finish_refcount_update( + struct xfs_trans *tp, + struct xfs_cud_log_item *cudp, +- enum xfs_refcount_intent_type type, +- xfs_fsblock_t startblock, +- xfs_extlen_t blockcount, +- xfs_fsblock_t *new_fsb, +- xfs_extlen_t *new_len, ++ struct xfs_refcount_intent *ri, + struct xfs_btree_cur **pcur) + { + int error; + +- error = xfs_refcount_finish_one(tp, type, startblock, +- blockcount, new_fsb, new_len, pcur); ++ error = xfs_refcount_finish_one(tp, ri, pcur); + + /* + * Mark the transaction dirty, even on error. This ensures the +@@ -297,16 +292,16 @@ xfs_refcount_update_diff_items( + /* Set the phys extent flags for this reverse mapping. 
*/ + static void + xfs_trans_set_refcount_flags( +- struct xfs_phys_extent *refc, ++ struct xfs_phys_extent *pmap, + enum xfs_refcount_intent_type type) + { +- refc->pe_flags = 0; ++ pmap->pe_flags = 0; + switch (type) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: + case XFS_REFCOUNT_FREE_COW: +- refc->pe_flags |= type; ++ pmap->pe_flags |= type; + break; + default: + ASSERT(0); +@@ -318,10 +313,10 @@ STATIC void + xfs_refcount_update_log_item( + struct xfs_trans *tp, + struct xfs_cui_log_item *cuip, +- struct xfs_refcount_intent *refc) ++ struct xfs_refcount_intent *ri) + { + uint next_extent; +- struct xfs_phys_extent *ext; ++ struct xfs_phys_extent *pmap; + + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); +@@ -333,10 +328,10 @@ xfs_refcount_update_log_item( + */ + next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1; + ASSERT(next_extent < cuip->cui_format.cui_nextents); +- ext = &cuip->cui_format.cui_extents[next_extent]; +- ext->pe_startblock = refc->ri_startblock; +- ext->pe_len = refc->ri_blockcount; +- xfs_trans_set_refcount_flags(ext, refc->ri_type); ++ pmap = &cuip->cui_format.cui_extents[next_extent]; ++ pmap->pe_startblock = ri->ri_startblock; ++ pmap->pe_len = ri->ri_blockcount; ++ xfs_trans_set_refcount_flags(pmap, ri->ri_type); + } + + static struct xfs_log_item * +@@ -348,15 +343,15 @@ xfs_refcount_update_create_intent( + { + struct xfs_mount *mp = tp->t_mountp; + struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count); +- struct xfs_refcount_intent *refc; ++ struct xfs_refcount_intent *ri; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &cuip->cui_item); + if (sort) + list_sort(mp, items, xfs_refcount_update_diff_items); +- list_for_each_entry(refc, items, ri_list) +- xfs_refcount_update_log_item(tp, cuip, refc); ++ list_for_each_entry(ri, items, ri_list) ++ xfs_refcount_update_log_item(tp, cuip, ri); + return &cuip->cui_item; + } + +@@ -378,25 +373,20 @@ xfs_refcount_update_finish_item( + struct list_head *item, + struct xfs_btree_cur **state) + { +- struct xfs_refcount_intent *refc; +- xfs_fsblock_t new_fsb; +- xfs_extlen_t new_aglen; ++ struct xfs_refcount_intent *ri; + int error; + +- refc = container_of(item, struct xfs_refcount_intent, ri_list); +- error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), +- refc->ri_type, refc->ri_startblock, refc->ri_blockcount, +- &new_fsb, &new_aglen, state); ++ ri = container_of(item, struct xfs_refcount_intent, ri_list); ++ error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri, ++ state); + + /* Did we run out of reservation? Requeue what we didn't finish. 
*/ +- if (!error && new_aglen > 0) { +- ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE || +- refc->ri_type == XFS_REFCOUNT_DECREASE); +- refc->ri_startblock = new_fsb; +- refc->ri_blockcount = new_aglen; ++ if (!error && ri->ri_blockcount > 0) { ++ ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE || ++ ri->ri_type == XFS_REFCOUNT_DECREASE); + return -EAGAIN; + } +- kmem_cache_free(xfs_refcount_intent_cache, refc); ++ kmem_cache_free(xfs_refcount_intent_cache, ri); + return error; + } + +@@ -413,10 +403,10 @@ STATIC void + xfs_refcount_update_cancel_item( + struct list_head *item) + { +- struct xfs_refcount_intent *refc; ++ struct xfs_refcount_intent *ri; + +- refc = container_of(item, struct xfs_refcount_intent, ri_list); +- kmem_cache_free(xfs_refcount_intent_cache, refc); ++ ri = container_of(item, struct xfs_refcount_intent, ri_list); ++ kmem_cache_free(xfs_refcount_intent_cache, ri); + } + + const struct xfs_defer_op_type xfs_refcount_update_defer_type = { +@@ -433,15 +423,15 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { + static inline bool + xfs_cui_validate_phys( + struct xfs_mount *mp, +- struct xfs_phys_extent *refc) ++ struct xfs_phys_extent *pmap) + { + if (!xfs_has_reflink(mp)) + return false; + +- if (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS) ++ if (pmap->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS) + return false; + +- switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { ++ switch (pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: +@@ -451,7 +441,7 @@ xfs_cui_validate_phys( + return false; + } + +- return xfs_verify_fsbext(mp, refc->pe_startblock, refc->pe_len); ++ return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len); + } + + /* +@@ -463,18 +453,13 @@ xfs_cui_item_recover( + struct xfs_log_item *lip, + struct list_head *capture_list) + { +- struct xfs_bmbt_irec irec; + struct xfs_cui_log_item *cuip = CUI_ITEM(lip); +- struct xfs_phys_extent *refc; + struct xfs_cud_log_item *cudp; + struct xfs_trans *tp; + struct xfs_btree_cur *rcur = NULL; + struct xfs_mount *mp = lip->li_log->l_mp; +- xfs_fsblock_t new_fsb; +- xfs_extlen_t new_len; + unsigned int refc_type; + bool requeue_only = false; +- enum xfs_refcount_intent_type type; + int i; + int error = 0; + +@@ -513,14 +498,17 @@ xfs_cui_item_recover( + cudp = xfs_trans_get_cud(tp, cuip); + + for (i = 0; i < cuip->cui_format.cui_nextents; i++) { +- refc = &cuip->cui_format.cui_extents[i]; +- refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; ++ struct xfs_refcount_intent fake = { }; ++ struct xfs_phys_extent *pmap; ++ ++ pmap = &cuip->cui_format.cui_extents[i]; ++ refc_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; + switch (refc_type) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: + case XFS_REFCOUNT_FREE_COW: +- type = refc_type; ++ fake.ri_type = refc_type; + break; + default: + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, +@@ -529,13 +517,12 @@ xfs_cui_item_recover( + error = -EFSCORRUPTED; + goto abort_error; + } +- if (requeue_only) { +- new_fsb = refc->pe_startblock; +- new_len = refc->pe_len; +- } else ++ ++ fake.ri_startblock = pmap->pe_startblock; ++ fake.ri_blockcount = pmap->pe_len; ++ if (!requeue_only) + error = xfs_trans_log_finish_refcount_update(tp, cudp, +- type, refc->pe_startblock, refc->pe_len, +- &new_fsb, &new_len, &rcur); ++ &fake, &rcur); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, 
mp, + &cuip->cui_format, +@@ -544,10 +531,13 @@ xfs_cui_item_recover( + goto abort_error; + + /* Requeue what we didn't finish. */ +- if (new_len > 0) { +- irec.br_startblock = new_fsb; +- irec.br_blockcount = new_len; +- switch (type) { ++ if (fake.ri_blockcount > 0) { ++ struct xfs_bmbt_irec irec = { ++ .br_startblock = fake.ri_startblock, ++ .br_blockcount = fake.ri_blockcount, ++ }; ++ ++ switch (fake.ri_type) { + case XFS_REFCOUNT_INCREASE: + xfs_refcount_increase_extent(tp, &irec); + break; +@@ -596,18 +586,18 @@ xfs_cui_item_relog( + { + struct xfs_cud_log_item *cudp; + struct xfs_cui_log_item *cuip; +- struct xfs_phys_extent *extp; ++ struct xfs_phys_extent *pmap; + unsigned int count; + + count = CUI_ITEM(intent)->cui_format.cui_nextents; +- extp = CUI_ITEM(intent)->cui_format.cui_extents; ++ pmap = CUI_ITEM(intent)->cui_format.cui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); + + cuip = xfs_cui_init(tp->t_mountp, count); +- memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp)); ++ memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); + atomic_set(&cuip->cui_next_extent, count); + xfs_trans_add_item(tp, &cuip->cui_item); + set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); +diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c +index 534504ede1a3..a1619d67015f 100644 +--- a/fs/xfs/xfs_rmap_item.c ++++ b/fs/xfs/xfs_rmap_item.c +@@ -244,40 +244,40 @@ xfs_trans_get_rud( + /* Set the map extent flags for this reverse mapping. */ + static void + xfs_trans_set_rmap_flags( +- struct xfs_map_extent *rmap, ++ struct xfs_map_extent *map, + enum xfs_rmap_intent_type type, + int whichfork, + xfs_exntst_t state) + { +- rmap->me_flags = 0; ++ map->me_flags = 0; + if (state == XFS_EXT_UNWRITTEN) +- rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; ++ map->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; + if (whichfork == XFS_ATTR_FORK) +- rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; ++ map->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; + switch (type) { + case XFS_RMAP_MAP: +- rmap->me_flags |= XFS_RMAP_EXTENT_MAP; ++ map->me_flags |= XFS_RMAP_EXTENT_MAP; + break; + case XFS_RMAP_MAP_SHARED: +- rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; ++ map->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; + break; + case XFS_RMAP_UNMAP: +- rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP; ++ map->me_flags |= XFS_RMAP_EXTENT_UNMAP; + break; + case XFS_RMAP_UNMAP_SHARED: +- rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; ++ map->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; + break; + case XFS_RMAP_CONVERT: +- rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT; ++ map->me_flags |= XFS_RMAP_EXTENT_CONVERT; + break; + case XFS_RMAP_CONVERT_SHARED: +- rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; ++ map->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; + break; + case XFS_RMAP_ALLOC: +- rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC; ++ map->me_flags |= XFS_RMAP_EXTENT_ALLOC; + break; + case XFS_RMAP_FREE: +- rmap->me_flags |= XFS_RMAP_EXTENT_FREE; ++ map->me_flags |= XFS_RMAP_EXTENT_FREE; + break; + default: + ASSERT(0); +@@ -293,19 +293,12 @@ static int + xfs_trans_log_finish_rmap_update( + struct xfs_trans *tp, + struct xfs_rud_log_item *rudp, +- enum xfs_rmap_intent_type type, +- uint64_t owner, +- int whichfork, +- xfs_fileoff_t startoff, +- xfs_fsblock_t startblock, +- xfs_filblks_t blockcount, +- xfs_exntst_t state, ++ struct xfs_rmap_intent *ri, + struct xfs_btree_cur **pcur) + { + int error; + +- error = xfs_rmap_finish_one(tp, 
type, owner, whichfork, startoff, +- startblock, blockcount, state, pcur); ++ error = xfs_rmap_finish_one(tp, ri, pcur); + + /* + * Mark the transaction dirty, even on error. This ensures the +@@ -342,7 +335,7 @@ STATIC void + xfs_rmap_update_log_item( + struct xfs_trans *tp, + struct xfs_rui_log_item *ruip, +- struct xfs_rmap_intent *rmap) ++ struct xfs_rmap_intent *ri) + { + uint next_extent; + struct xfs_map_extent *map; +@@ -358,12 +351,12 @@ xfs_rmap_update_log_item( + next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1; + ASSERT(next_extent < ruip->rui_format.rui_nextents); + map = &ruip->rui_format.rui_extents[next_extent]; +- map->me_owner = rmap->ri_owner; +- map->me_startblock = rmap->ri_bmap.br_startblock; +- map->me_startoff = rmap->ri_bmap.br_startoff; +- map->me_len = rmap->ri_bmap.br_blockcount; +- xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork, +- rmap->ri_bmap.br_state); ++ map->me_owner = ri->ri_owner; ++ map->me_startblock = ri->ri_bmap.br_startblock; ++ map->me_startoff = ri->ri_bmap.br_startoff; ++ map->me_len = ri->ri_bmap.br_blockcount; ++ xfs_trans_set_rmap_flags(map, ri->ri_type, ri->ri_whichfork, ++ ri->ri_bmap.br_state); + } + + static struct xfs_log_item * +@@ -375,15 +368,15 @@ xfs_rmap_update_create_intent( + { + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count); +- struct xfs_rmap_intent *rmap; ++ struct xfs_rmap_intent *ri; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &ruip->rui_item); + if (sort) + list_sort(mp, items, xfs_rmap_update_diff_items); +- list_for_each_entry(rmap, items, ri_list) +- xfs_rmap_update_log_item(tp, ruip, rmap); ++ list_for_each_entry(ri, items, ri_list) ++ xfs_rmap_update_log_item(tp, ruip, ri); + return &ruip->rui_item; + } + +@@ -405,16 +398,14 @@ xfs_rmap_update_finish_item( + struct list_head *item, + struct xfs_btree_cur **state) + { +- struct xfs_rmap_intent *rmap; ++ struct xfs_rmap_intent *ri; + int error; + +- rmap = container_of(item, struct xfs_rmap_intent, ri_list); +- error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), +- rmap->ri_type, rmap->ri_owner, rmap->ri_whichfork, +- rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock, +- rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state, ++ ri = container_of(item, struct xfs_rmap_intent, ri_list); ++ ++ error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri, + state); +- kmem_cache_free(xfs_rmap_intent_cache, rmap); ++ kmem_cache_free(xfs_rmap_intent_cache, ri); + return error; + } + +@@ -431,10 +422,10 @@ STATIC void + xfs_rmap_update_cancel_item( + struct list_head *item) + { +- struct xfs_rmap_intent *rmap; ++ struct xfs_rmap_intent *ri; + +- rmap = container_of(item, struct xfs_rmap_intent, ri_list); +- kmem_cache_free(xfs_rmap_intent_cache, rmap); ++ ri = container_of(item, struct xfs_rmap_intent, ri_list); ++ kmem_cache_free(xfs_rmap_intent_cache, ri); + } + + const struct xfs_defer_op_type xfs_rmap_update_defer_type = { +@@ -451,15 +442,15 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = { + static inline bool + xfs_rui_validate_map( + struct xfs_mount *mp, +- struct xfs_map_extent *rmap) ++ struct xfs_map_extent *map) + { + if (!xfs_has_rmapbt(mp)) + return false; + +- if (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS) ++ if (map->me_flags & ~XFS_RMAP_EXTENT_FLAGS) + return false; + +- switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { ++ switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + case XFS_RMAP_EXTENT_MAP: + case XFS_RMAP_EXTENT_MAP_SHARED: + case 
XFS_RMAP_EXTENT_UNMAP: +@@ -473,14 +464,14 @@ xfs_rui_validate_map( + return false; + } + +- if (!XFS_RMAP_NON_INODE_OWNER(rmap->me_owner) && +- !xfs_verify_ino(mp, rmap->me_owner)) ++ if (!XFS_RMAP_NON_INODE_OWNER(map->me_owner) && ++ !xfs_verify_ino(mp, map->me_owner)) + return false; + +- if (!xfs_verify_fileext(mp, rmap->me_startoff, rmap->me_len)) ++ if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) + return false; + +- return xfs_verify_fsbext(mp, rmap->me_startblock, rmap->me_len); ++ return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); + } + + /* +@@ -493,15 +484,11 @@ xfs_rui_item_recover( + struct list_head *capture_list) + { + struct xfs_rui_log_item *ruip = RUI_ITEM(lip); +- struct xfs_map_extent *rmap; + struct xfs_rud_log_item *rudp; + struct xfs_trans *tp; + struct xfs_btree_cur *rcur = NULL; + struct xfs_mount *mp = lip->li_log->l_mp; +- enum xfs_rmap_intent_type type; +- xfs_exntst_t state; + int i; +- int whichfork; + int error = 0; + + /* +@@ -526,35 +513,34 @@ xfs_rui_item_recover( + rudp = xfs_trans_get_rud(tp, ruip); + + for (i = 0; i < ruip->rui_format.rui_nextents; i++) { +- rmap = &ruip->rui_format.rui_extents[i]; +- state = (rmap->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? +- XFS_EXT_UNWRITTEN : XFS_EXT_NORM; +- whichfork = (rmap->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? +- XFS_ATTR_FORK : XFS_DATA_FORK; +- switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { ++ struct xfs_rmap_intent fake = { }; ++ struct xfs_map_extent *map; ++ ++ map = &ruip->rui_format.rui_extents[i]; ++ switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + case XFS_RMAP_EXTENT_MAP: +- type = XFS_RMAP_MAP; ++ fake.ri_type = XFS_RMAP_MAP; + break; + case XFS_RMAP_EXTENT_MAP_SHARED: +- type = XFS_RMAP_MAP_SHARED; ++ fake.ri_type = XFS_RMAP_MAP_SHARED; + break; + case XFS_RMAP_EXTENT_UNMAP: +- type = XFS_RMAP_UNMAP; ++ fake.ri_type = XFS_RMAP_UNMAP; + break; + case XFS_RMAP_EXTENT_UNMAP_SHARED: +- type = XFS_RMAP_UNMAP_SHARED; ++ fake.ri_type = XFS_RMAP_UNMAP_SHARED; + break; + case XFS_RMAP_EXTENT_CONVERT: +- type = XFS_RMAP_CONVERT; ++ fake.ri_type = XFS_RMAP_CONVERT; + break; + case XFS_RMAP_EXTENT_CONVERT_SHARED: +- type = XFS_RMAP_CONVERT_SHARED; ++ fake.ri_type = XFS_RMAP_CONVERT_SHARED; + break; + case XFS_RMAP_EXTENT_ALLOC: +- type = XFS_RMAP_ALLOC; ++ fake.ri_type = XFS_RMAP_ALLOC; + break; + case XFS_RMAP_EXTENT_FREE: +- type = XFS_RMAP_FREE; ++ fake.ri_type = XFS_RMAP_FREE; + break; + default: + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, +@@ -563,13 +549,21 @@ xfs_rui_item_recover( + error = -EFSCORRUPTED; + goto abort_error; + } +- error = xfs_trans_log_finish_rmap_update(tp, rudp, type, +- rmap->me_owner, whichfork, +- rmap->me_startoff, rmap->me_startblock, +- rmap->me_len, state, &rcur); ++ ++ fake.ri_owner = map->me_owner; ++ fake.ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? ++ XFS_ATTR_FORK : XFS_DATA_FORK; ++ fake.ri_bmap.br_startblock = map->me_startblock; ++ fake.ri_bmap.br_startoff = map->me_startoff; ++ fake.ri_bmap.br_blockcount = map->me_len; ++ fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? 
++ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; ++ ++ error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake, ++ &rcur); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, +- rmap, sizeof(*rmap)); ++ map, sizeof(*map)); + if (error) + goto abort_error; + +@@ -600,18 +594,18 @@ xfs_rui_item_relog( + { + struct xfs_rud_log_item *rudp; + struct xfs_rui_log_item *ruip; +- struct xfs_map_extent *extp; ++ struct xfs_map_extent *map; + unsigned int count; + + count = RUI_ITEM(intent)->rui_format.rui_nextents; +- extp = RUI_ITEM(intent)->rui_format.rui_extents; ++ map = RUI_ITEM(intent)->rui_format.rui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); + + ruip = xfs_rui_init(tp->t_mountp, count); +- memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp)); ++ memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); + atomic_set(&ruip->rui_next_extent, count); + xfs_trans_add_item(tp, &ruip->rui_item); + set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); +diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c +index f7faf6e70d7f..a3c6b1548723 100644 +--- a/fs/xfs/xfs_sysfs.c ++++ b/fs/xfs/xfs_sysfs.c +@@ -69,7 +69,7 @@ static struct attribute *xfs_mp_attrs[] = { + }; + ATTRIBUTE_GROUPS(xfs_mp); + +-struct kobj_type xfs_mp_ktype = { ++const struct kobj_type xfs_mp_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_groups = xfs_mp_groups, +@@ -266,7 +266,7 @@ static struct attribute *xfs_dbg_attrs[] = { + }; + ATTRIBUTE_GROUPS(xfs_dbg); + +-struct kobj_type xfs_dbg_ktype = { ++const struct kobj_type xfs_dbg_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_groups = xfs_dbg_groups, +@@ -324,7 +324,7 @@ static struct attribute *xfs_stats_attrs[] = { + }; + ATTRIBUTE_GROUPS(xfs_stats); + +-struct kobj_type xfs_stats_ktype = { ++const struct kobj_type xfs_stats_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_groups = xfs_stats_groups, +@@ -410,7 +410,7 @@ static struct attribute *xfs_log_attrs[] = { + }; + ATTRIBUTE_GROUPS(xfs_log); + +-struct kobj_type xfs_log_ktype = { ++const struct kobj_type xfs_log_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_groups = xfs_log_groups, +@@ -564,13 +564,13 @@ static struct attribute *xfs_error_attrs[] = { + }; + ATTRIBUTE_GROUPS(xfs_error); + +-static struct kobj_type xfs_error_cfg_ktype = { ++static const struct kobj_type xfs_error_cfg_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_groups = xfs_error_groups, + }; + +-static struct kobj_type xfs_error_ktype = { ++static const struct kobj_type xfs_error_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + }; +diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h +index 513095e353a5..148893ebfdef 100644 +--- a/fs/xfs/xfs_sysfs.h ++++ b/fs/xfs/xfs_sysfs.h +@@ -7,10 +7,10 @@ + #ifndef __XFS_SYSFS_H__ + #define __XFS_SYSFS_H__ + +-extern struct kobj_type xfs_mp_ktype; /* xfs_mount */ +-extern struct kobj_type xfs_dbg_ktype; /* debug */ +-extern struct kobj_type xfs_log_ktype; /* xlog */ +-extern struct kobj_type xfs_stats_ktype; /* stats */ ++extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */ ++extern const struct kobj_type xfs_dbg_ktype; /* debug */ ++extern const struct kobj_type xfs_log_ktype; /* xlog */ ++extern const struct kobj_type xfs_stats_ktype; /* stats */ + + static inline 
struct xfs_kobj * + to_kobj(struct kobject *kobject) +@@ -28,7 +28,7 @@ xfs_sysfs_release(struct kobject *kobject) + static inline int + xfs_sysfs_init( + struct xfs_kobj *kobj, +- struct kobj_type *ktype, ++ const struct kobj_type *ktype, + struct xfs_kobj *parent_kobj, + const char *name) + { +diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h +index 421d1e504ac4..6b0e9ae7c513 100644 +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -3207,17 +3207,14 @@ DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred); + + TRACE_EVENT(xfs_refcount_finish_one_leftover, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, +- int type, xfs_agblock_t agbno, xfs_extlen_t len, +- xfs_agblock_t new_agbno, xfs_extlen_t new_len), +- TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len), ++ int type, xfs_agblock_t agbno, xfs_extlen_t len), ++ TP_ARGS(mp, agno, type, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, type) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) +- __field(xfs_agblock_t, new_agbno) +- __field(xfs_extlen_t, new_len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; +@@ -3225,17 +3222,13 @@ TRACE_EVENT(xfs_refcount_finish_one_leftover, + __entry->type = type; + __entry->agbno = agbno; + __entry->len = len; +- __entry->new_agbno = new_agbno; +- __entry->new_len = new_len; + ), +- TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x new_agbno 0x%x new_fsbcount 0x%x", ++ TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->agno, + __entry->agbno, +- __entry->len, +- __entry->new_agbno, +- __entry->new_len) ++ __entry->len) + ); + + /* simple inode-based error/%ip tracepoint class */ +diff --git a/include/linux/bio.h b/include/linux/bio.h +index c1da63f6c808..d766be7152e1 100644 +--- a/include/linux/bio.h ++++ b/include/linux/bio.h +@@ -12,6 +12,8 @@ + + #define BIO_MAX_VECS 256U + ++struct queue_limits; ++ + static inline unsigned int bio_max_segs(unsigned int nr_segs) + { + return min(nr_segs, BIO_MAX_VECS); +@@ -375,6 +377,8 @@ static inline void bip_set_seed(struct bio_integrity_payload *bip, + void bio_trim(struct bio *bio, sector_t offset, sector_t size); + extern struct bio *bio_split(struct bio *bio, int sectors, + gfp_t gfp, struct bio_set *bs); ++struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, ++ unsigned *segs, struct bio_set *bs, unsigned max_bytes); + + /** + * bio_next_split - get next @sectors from a bio, splitting if necessary +diff --git a/include/linux/iomap.h b/include/linux/iomap.h +index 0983dfc9a203..0f8123504e5e 100644 +--- a/include/linux/iomap.h ++++ b/include/linux/iomap.h +@@ -13,6 +13,7 @@ + struct address_space; + struct fiemap_extent_info; + struct inode; ++struct iomap_iter; + struct iomap_dio; + struct iomap_writepage_ctx; + struct iov_iter; +@@ -58,8 +59,7 @@ struct vm_fault; + #define IOMAP_F_SHARED (1U << 2) + #define IOMAP_F_MERGED (1U << 3) + #define IOMAP_F_BUFFER_HEAD (1U << 4) +-#define IOMAP_F_ZONE_APPEND (1U << 5) +-#define IOMAP_F_XATTR (1U << 6) ++#define IOMAP_F_XATTR (1U << 5) + + /* + * Flags set by the core iomap code during operations: +@@ -85,7 +85,7 @@ struct vm_fault; + */ + #define IOMAP_NULL_ADDR -1ULL /* addr is not valid */ + +-struct iomap_page_ops; ++struct iomap_folio_ops; + + struct iomap { + u64 addr; /* disk offset of mapping, bytes */ +@@ -97,7 +97,7 @@ struct iomap { + struct dax_device *dax_dev; /* dax_dev for dax operations 
*/ + void *inline_data; + void *private; /* filesystem private */ +- const struct iomap_page_ops *page_ops; ++ const struct iomap_folio_ops *folio_ops; + u64 validity_cookie; /* used with .iomap_valid() */ + }; + +@@ -125,19 +125,20 @@ static inline bool iomap_inline_data_valid(const struct iomap *iomap) + } + + /* +- * When a filesystem sets page_ops in an iomap mapping it returns, page_prepare +- * and page_done will be called for each page written to. This only applies to +- * buffered writes as unbuffered writes will not typically have pages ++ * When a filesystem sets folio_ops in an iomap mapping it returns, get_folio ++ * and put_folio will be called for each folio written to. This only applies ++ * to buffered writes as unbuffered writes will not typically have folios + * associated with them. + * +- * When page_prepare succeeds, page_done will always be called to do any +- * cleanup work necessary. In that page_done call, @page will be NULL if the +- * associated page could not be obtained. ++ * When get_folio succeeds, put_folio will always be called to do any ++ * cleanup work necessary. put_folio is responsible for unlocking and putting ++ * @folio. + */ +-struct iomap_page_ops { +- int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len); +- void (*page_done)(struct inode *inode, loff_t pos, unsigned copied, +- struct page *page); ++struct iomap_folio_ops { ++ struct folio *(*get_folio)(struct iomap_iter *iter, loff_t pos, ++ unsigned len); ++ void (*put_folio)(struct inode *inode, loff_t pos, unsigned copied, ++ struct folio *folio); + + /* + * Check that the cached iomap still maps correctly to the filesystem's +@@ -260,6 +261,7 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, + int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); + void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); + bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); ++struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos); + bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); + void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); + int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, +diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h +index 6548b5b5aa60..75d7d22c3a27 100644 +--- a/include/trace/events/btrfs.h ++++ b/include/trace/events/btrfs.h +@@ -32,6 +32,7 @@ struct prelim_ref; + struct btrfs_space_info; + struct btrfs_raid_bio; + struct raid56_bio_trace_info; ++struct find_free_extent_ctl; + + #define show_ref_type(type) \ + __print_symbolic(type, \ +@@ -1241,76 +1242,156 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, + + TRACE_EVENT(find_free_extent, + +- TP_PROTO(const struct btrfs_root *root, u64 num_bytes, +- u64 empty_size, u64 data), ++ TP_PROTO(const struct btrfs_root *root, ++ const struct find_free_extent_ctl *ffe_ctl), + +- TP_ARGS(root, num_bytes, empty_size, data), ++ TP_ARGS(root, ffe_ctl), + + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( u64, num_bytes ) + __field( u64, empty_size ) +- __field( u64, data ) ++ __field( u64, flags ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->root_key.objectid; +- __entry->num_bytes = num_bytes; +- __entry->empty_size = empty_size; +- __entry->data = data; ++ __entry->num_bytes = ffe_ctl->num_bytes; ++ __entry->empty_size = ffe_ctl->empty_size; ++ __entry->flags = ffe_ctl->flags; + ), + + 
TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)", + show_root_type(__entry->root_objectid), +- __entry->num_bytes, __entry->empty_size, __entry->data, +- __print_flags((unsigned long)__entry->data, "|", ++ __entry->num_bytes, __entry->empty_size, __entry->flags, ++ __print_flags((unsigned long)__entry->flags, "|", ++ BTRFS_GROUP_FLAGS)) ++); ++ ++TRACE_EVENT(find_free_extent_search_loop, ++ ++ TP_PROTO(const struct btrfs_root *root, ++ const struct find_free_extent_ctl *ffe_ctl), ++ ++ TP_ARGS(root, ffe_ctl), ++ ++ TP_STRUCT__entry_btrfs( ++ __field( u64, root_objectid ) ++ __field( u64, num_bytes ) ++ __field( u64, empty_size ) ++ __field( u64, flags ) ++ __field( u64, loop ) ++ ), ++ ++ TP_fast_assign_btrfs(root->fs_info, ++ __entry->root_objectid = root->root_key.objectid; ++ __entry->num_bytes = ffe_ctl->num_bytes; ++ __entry->empty_size = ffe_ctl->empty_size; ++ __entry->flags = ffe_ctl->flags; ++ __entry->loop = ffe_ctl->loop; ++ ), ++ ++ TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu", ++ show_root_type(__entry->root_objectid), ++ __entry->num_bytes, __entry->empty_size, __entry->flags, ++ __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), ++ __entry->loop) ++); ++ ++TRACE_EVENT(find_free_extent_have_block_group, ++ ++ TP_PROTO(const struct btrfs_root *root, ++ const struct find_free_extent_ctl *ffe_ctl, ++ const struct btrfs_block_group *block_group), ++ ++ TP_ARGS(root, ffe_ctl, block_group), ++ ++ TP_STRUCT__entry_btrfs( ++ __field( u64, root_objectid ) ++ __field( u64, num_bytes ) ++ __field( u64, empty_size ) ++ __field( u64, flags ) ++ __field( u64, loop ) ++ __field( bool, hinted ) ++ __field( u64, bg_start ) ++ __field( u64, bg_flags ) ++ ), ++ ++ TP_fast_assign_btrfs(root->fs_info, ++ __entry->root_objectid = root->root_key.objectid; ++ __entry->num_bytes = ffe_ctl->num_bytes; ++ __entry->empty_size = ffe_ctl->empty_size; ++ __entry->flags = ffe_ctl->flags; ++ __entry->loop = ffe_ctl->loop; ++ __entry->hinted = ffe_ctl->hinted; ++ __entry->bg_start = block_group->start; ++ __entry->bg_flags = block_group->flags; ++ ), ++ ++ TP_printk_btrfs( ++"root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu hinted=%d block_group=%llu bg_flags=%llu(%s)", ++ show_root_type(__entry->root_objectid), ++ __entry->num_bytes, __entry->empty_size, __entry->flags, ++ __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), ++ __entry->loop, __entry->hinted, ++ __entry->bg_start, __entry->bg_flags, ++ __print_flags((unsigned long)__entry->bg_flags, "|", + BTRFS_GROUP_FLAGS)) + ); + + DECLARE_EVENT_CLASS(btrfs__reserve_extent, + +- TP_PROTO(const struct btrfs_block_group *block_group, u64 start, +- u64 len), ++ TP_PROTO(const struct btrfs_block_group *block_group, ++ const struct find_free_extent_ctl *ffe_ctl), + +- TP_ARGS(block_group, start, len), ++ TP_ARGS(block_group, ffe_ctl), + + TP_STRUCT__entry_btrfs( + __field( u64, bg_objectid ) + __field( u64, flags ) ++ __field( int, bg_size_class ) + __field( u64, start ) + __field( u64, len ) ++ __field( u64, loop ) ++ __field( bool, hinted ) ++ __field( int, size_class ) + ), + + TP_fast_assign_btrfs(block_group->fs_info, + __entry->bg_objectid = block_group->start; + __entry->flags = block_group->flags; +- __entry->start = start; +- __entry->len = len; ++ __entry->bg_size_class = block_group->size_class; ++ __entry->start = ffe_ctl->search_start; ++ __entry->len = ffe_ctl->num_bytes; ++ __entry->loop = ffe_ctl->loop; ++ __entry->hinted = 
ffe_ctl->hinted; ++ __entry->size_class = ffe_ctl->size_class; + ), + +- TP_printk_btrfs("root=%llu(%s) block_group=%llu flags=%llu(%s) " +- "start=%llu len=%llu", ++ TP_printk_btrfs( ++"root=%llu(%s) block_group=%llu flags=%llu(%s) bg_size_class=%d start=%llu len=%llu loop=%llu hinted=%d size_class=%d", + show_root_type(BTRFS_EXTENT_TREE_OBJECTID), + __entry->bg_objectid, + __entry->flags, __print_flags((unsigned long)__entry->flags, + "|", BTRFS_GROUP_FLAGS), +- __entry->start, __entry->len) ++ __entry->bg_size_class, __entry->start, __entry->len, ++ __entry->loop, __entry->hinted, __entry->size_class) + ); + + DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, + +- TP_PROTO(const struct btrfs_block_group *block_group, u64 start, +- u64 len), ++ TP_PROTO(const struct btrfs_block_group *block_group, ++ const struct find_free_extent_ctl *ffe_ctl), + +- TP_ARGS(block_group, start, len) ++ TP_ARGS(block_group, ffe_ctl) + ); + + DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, + +- TP_PROTO(const struct btrfs_block_group *block_group, u64 start, +- u64 len), ++ TP_PROTO(const struct btrfs_block_group *block_group, ++ const struct find_free_extent_ctl *ffe_ctl), + +- TP_ARGS(block_group, start, len) ++ TP_ARGS(block_group, ffe_ctl) + ); + + TRACE_EVENT(btrfs_find_cluster, +-- +2.39.2 + +From 8ef75a5bf012b92f0642e7e288ce34cd247bc41e Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 17 Feb 2023 15:35:46 +0100 +Subject: [PATCH 08/15] Implement amd-pstate-epp and amd-pstate-guided driver + +Signed-off-by: Peter Jung +--- + .../admin-guide/kernel-parameters.txt | 33 +- + Documentation/admin-guide/pm/amd-pstate.rst | 95 ++- + drivers/acpi/cppc_acpi.c | 188 ++++- + drivers/cpufreq/amd-pstate.c | 794 +++++++++++++++++- + drivers/cpufreq/brcmstb-avs-cpufreq.c | 5 +- + drivers/cpufreq/cpufreq.c | 8 +- + drivers/cpufreq/davinci-cpufreq.c | 4 +- + drivers/cpufreq/mediatek-cpufreq-hw.c | 4 +- + drivers/cpufreq/omap-cpufreq.c | 4 +- + drivers/cpufreq/qcom-cpufreq-hw.c | 4 +- + include/acpi/cppc_acpi.h | 23 + + include/linux/amd-pstate.h | 34 + + include/linux/cpufreq.h | 2 +- + 13 files changed, 1139 insertions(+), 59 deletions(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 9595abf34974..f39b8f05392c 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -339,6 +339,29 @@ + This mode requires kvm-amd.avic=1. + (Default when IOMMU HW support is present.) + ++ amd_pstate= [X86] ++ disable ++ Do not enable amd_pstate as the default ++ scaling driver for the supported processors ++ passive ++ Use amd_pstate with passive mode as a scaling driver. ++ In this mode autonomous selection is disabled. ++ Driver requests a desired performance level and platform ++ tries to match the same performance level if it is ++ satisfied by guaranteed performance level. ++ active ++ Use amd_pstate_epp driver instance as the scaling driver, ++ driver provides a hint to the hardware if software wants ++ to bias toward performance (0x0) or energy efficiency (0xff) ++ to the CPPC firmware. then CPPC power algorithm will ++ calculate the runtime workload and adjust the realtime cores ++ frequency. ++ guided ++ Activate guided autonomous mode. Driver requests minimum and ++ maximum performance level and the platform autonomously ++ selects a performance level in this range and appropriate ++ to the current workload. 
++ + amijoy.map= [HW,JOY] Amiga joystick support + Map of devices attached to JOY0DAT and JOY1DAT + Format: , +@@ -7019,13 +7042,3 @@ + xmon commands. + off xmon is disabled. + +- amd_pstate= [X86] +- disable +- Do not enable amd_pstate as the default +- scaling driver for the supported processors +- passive +- Use amd_pstate as a scaling driver, driver requests a +- desired performance on this abstract scale and the power +- management firmware translates the requests into actual +- hardware states (core frequency, data fabric and memory +- clocks etc.) +diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst +index 5376d53faaa8..f24a90007e98 100644 +--- a/Documentation/admin-guide/pm/amd-pstate.rst ++++ b/Documentation/admin-guide/pm/amd-pstate.rst +@@ -230,8 +230,8 @@ with :c:macro:`MSR_AMD_CPPC_ENABLE` or ``cppc_set_enable``, it will respond + to the request from AMD P-States. + + +-User Space Interface in ``sysfs`` +-================================== ++User Space Interface in ``sysfs`` - Per-policy control ++====================================================== + + ``amd-pstate`` exposes several global attributes (files) in ``sysfs`` to + control its functionality at the system level. They are located in the +@@ -262,6 +262,25 @@ lowest non-linear performance in `AMD CPPC Performance Capability + `_.) + This attribute is read-only. + ++``energy_performance_available_preferences`` ++ ++A list of all the supported EPP preferences that could be used for ++``energy_performance_preference`` on this system. ++These profiles represent different hints that are provided ++to the low-level firmware about the user's desired energy vs efficiency ++tradeoff. ``default`` represents the epp value is set by platform ++firmware. This attribute is read-only. ++ ++``energy_performance_preference`` ++ ++The current energy performance preference can be read from this attribute. ++and user can change current preference according to energy or performance needs ++Please get all support profiles list from ++``energy_performance_available_preferences`` attribute, all the profiles are ++integer values defined between 0 to 255 when EPP feature is enabled by platform ++firmware, if EPP feature is disabled, driver will ignore the written value ++This attribute is read-write. ++ + Other performance and frequency values can be read back from + ``/sys/devices/system/cpu/cpuX/acpi_cppc/``, see :ref:`cppc_sysfs`. + +@@ -280,8 +299,35 @@ module which supports the new AMD P-States mechanism on most of the future AMD + platforms. The AMD P-States mechanism is the more performance and energy + efficiency frequency management method on AMD processors. + +-Kernel Module Options for ``amd-pstate`` +-========================================= ++ ++AMD Pstate Driver Operation Modes ++================================= ++ ++``amd_pstate`` CPPC has 3 operation modes: autonomous (active) mode, ++non-autonomous (passive) mode and guided autonomous (guided) mode. ++Active/passive/guided mode can be chosen by different kernel parameters. ++ ++- In autonomous mode, platform ignores the desired performance level request ++ and takes into account only the values set to the minimum, maximum and energy ++ performance preference registers. ++- In non-autonomous mode, platform gets desired performance level ++ from OS directly through Desired Performance Register. 
++- In guided-autonomous mode, platform sets operating performance level ++ autonomously according to the current workload and within the limits set by ++ OS through min and max performance registers. ++ ++Active Mode ++------------ ++ ++``amd_pstate=active`` ++ ++This is the low-level firmware control mode which is implemented by ``amd_pstate_epp`` ++driver with ``amd_pstate=active`` passed to the kernel in the command line. ++In this mode, ``amd_pstate_epp`` driver provides a hint to the hardware if software ++wants to bias toward performance (0x0) or energy efficiency (0xff) to the CPPC firmware. ++then CPPC power algorithm will calculate the runtime workload and adjust the realtime ++cores frequency according to the power supply and thermal, core voltage and some other ++hardware conditions. + + Passive Mode + ------------ +@@ -297,6 +343,47 @@ to the Performance Reduction Tolerance register. Above the nominal performance l + processor must provide at least nominal performance requested and go higher if current + operating conditions allow. + ++Guided Mode ++----------- ++ ++``amd_pstate=guided`` ++ ++If ``amd_pstate=guided`` is passed to kernel command line option then this mode ++is activated. In this mode, driver requests minimum and maximum performance ++level and the platform autonomously selects a performance level in this range ++and appropriate to the current workload. ++ ++User Space Interface in ``sysfs`` - General ++=========================================== ++ ++Global Attributes ++----------------- ++ ++``amd-pstate`` exposes several global attributes (files) in ``sysfs`` to ++control its functionality at the system level. They are located in the ++``/sys/devices/system/cpu/amd-pstate/`` directory and affect all CPUs. ++ ++``status`` ++ Operation mode of the driver: "active", "passive" or "disable". ++ ++ "active" ++ The driver is functional and in the ``active mode`` ++ ++ "passive" ++ The driver is functional and in the ``passive mode`` ++ ++ "guided" ++ The driver is functional and in the ``guided mode`` ++ ++ "disable" ++ The driver is unregistered and not functional now. ++ ++ This attribute can be written to in order to change the driver's ++ operation mode or to unregister it. The string written to it must be ++ one of the possible values of it and, if successful, writing one of ++ these values to the sysfs file will cause the driver to switch over ++ to the operation mode represented by that string - or to be ++ unregistered in the "disable" case. + + ``cpupower`` tool support for ``amd-pstate`` + =============================================== +diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c +index 0f17b1c32718..0efdbeed6ada 100644 +--- a/drivers/acpi/cppc_acpi.c ++++ b/drivers/acpi/cppc_acpi.c +@@ -1153,6 +1153,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) + return cppc_get_perf(cpunum, NOMINAL_PERF, nominal_perf); + } + ++/** ++ * cppc_get_epp_perf - Get the epp register value. ++ * @cpunum: CPU from which to get epp preference value. ++ * @epp_perf: Return address. ++ * ++ * Return: 0 for success, -EIO otherwise. ++ */ ++int cppc_get_epp_perf(int cpunum, u64 *epp_perf) ++{ ++ return cppc_get_perf(cpunum, ENERGY_PERF, epp_perf); ++} ++EXPORT_SYMBOL_GPL(cppc_get_epp_perf); ++ + /** + * cppc_get_perf_caps - Get a CPU's performance capabilities. + * @cpunum: CPU from which to get capabilities info. 
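The helper added above is paired with cppc_set_epp_perf() in the next hunk. A minimal in-kernel sketch of reading a CPU's EPP hint and writing it back unchanged, assuming a caller that already depends on CONFIG_ACPI_CPPC_LIB, with error handling trimmed:

  #include <linux/types.h>
  #include <acpi/cppc_acpi.h>

  /* Sketch only: read CPU 0's current EPP hint and write it back. */
  static int cppc_epp_roundtrip(void)
  {
          struct cppc_perf_ctrls ctrls = { 0 };
          u64 epp;
          int ret;

          ret = cppc_get_epp_perf(0, &epp);       /* helper added above */
          if (ret)
                  return ret;

          ctrls.energy_perf = epp;                /* 0x00 performance ... 0xff powersave */
          /* 'true' also writes the autonomous-selection enable register */
          return cppc_set_epp_perf(0, &ctrls, true);
  }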
+@@ -1365,6 +1378,157 @@ int cppc_get_perf_ctrs(int cpunum, struct cppc_perf_fb_ctrs *perf_fb_ctrs) + } + EXPORT_SYMBOL_GPL(cppc_get_perf_ctrs); + ++/* ++ * Set Energy Performance Preference Register value through ++ * Performance Controls Interface ++ */ ++int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) ++{ ++ int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); ++ struct cpc_register_resource *epp_set_reg; ++ struct cpc_register_resource *auto_sel_reg; ++ struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); ++ struct cppc_pcc_data *pcc_ss_data = NULL; ++ int ret; ++ ++ if (!cpc_desc) { ++ pr_debug("No CPC descriptor for CPU:%d\n", cpu); ++ return -ENODEV; ++ } ++ ++ auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE]; ++ epp_set_reg = &cpc_desc->cpc_regs[ENERGY_PERF]; ++ ++ if (CPC_IN_PCC(epp_set_reg) || CPC_IN_PCC(auto_sel_reg)) { ++ if (pcc_ss_id < 0) { ++ pr_debug("Invalid pcc_ss_id for CPU:%d\n", cpu); ++ return -ENODEV; ++ } ++ ++ if (CPC_SUPPORTED(auto_sel_reg)) { ++ ret = cpc_write(cpu, auto_sel_reg, enable); ++ if (ret) ++ return ret; ++ } ++ ++ if (CPC_SUPPORTED(epp_set_reg)) { ++ ret = cpc_write(cpu, epp_set_reg, perf_ctrls->energy_perf); ++ if (ret) ++ return ret; ++ } ++ ++ pcc_ss_data = pcc_data[pcc_ss_id]; ++ ++ down_write(&pcc_ss_data->pcc_lock); ++ /* after writing CPC, transfer the ownership of PCC to platform */ ++ ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE); ++ up_write(&pcc_ss_data->pcc_lock); ++ } else { ++ ret = -ENOTSUPP; ++ pr_debug("_CPC in PCC is not supported\n"); ++ } ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(cppc_set_epp_perf); ++ ++/* ++ * cppc_get_auto_sel_caps - Read autonomous selection register. ++ * @cpunum : CPU from which to read register. ++ * @perf_caps : struct where autonomous selection register value is updated. ++ */ ++int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps) ++{ ++ struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpunum); ++ struct cpc_register_resource *auto_sel_reg; ++ u64 auto_sel; ++ ++ if (!cpc_desc) { ++ pr_debug("No CPC descriptor for CPU:%d\n", cpunum); ++ return -ENODEV; ++ } ++ ++ auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE]; ++ ++ if (!CPC_SUPPORTED(auto_sel_reg)) ++ pr_warn_once("Autonomous mode is not unsupported!\n"); ++ ++ if (CPC_IN_PCC(auto_sel_reg)) { ++ int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpunum); ++ struct cppc_pcc_data *pcc_ss_data = NULL; ++ int ret = 0; ++ ++ if (pcc_ss_id < 0) ++ return -ENODEV; ++ ++ pcc_ss_data = pcc_data[pcc_ss_id]; ++ ++ down_write(&pcc_ss_data->pcc_lock); ++ ++ if (send_pcc_cmd(pcc_ss_id, CMD_READ) >= 0) { ++ cpc_read(cpunum, auto_sel_reg, &auto_sel); ++ perf_caps->auto_sel = (bool)auto_sel; ++ } else { ++ ret = -EIO; ++ } ++ ++ up_write(&pcc_ss_data->pcc_lock); ++ ++ return ret; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(cppc_get_auto_sel_caps); ++ ++/* ++ * cppc_set_auto_sel - Write autonomous selection register. ++ * @cpu : CPU to which to write register. ++ * @enable : the desired value of autonomous selection resiter to be updated. 
++ */ ++int cppc_set_auto_sel(int cpu, bool enable) ++{ ++ int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); ++ struct cpc_register_resource *auto_sel_reg; ++ struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); ++ struct cppc_pcc_data *pcc_ss_data = NULL; ++ int ret = -EINVAL; ++ ++ if (!cpc_desc) { ++ pr_debug("No CPC descriptor for CPU:%d\n", cpu); ++ return -ENODEV; ++ } ++ ++ auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE]; ++ ++ if (CPC_IN_PCC(auto_sel_reg)) { ++ if (pcc_ss_id < 0) { ++ pr_debug("Invalid pcc_ss_id\n"); ++ return -ENODEV; ++ } ++ ++ if (CPC_SUPPORTED(auto_sel_reg)) { ++ ret = cpc_write(cpu, auto_sel_reg, enable); ++ if (ret) ++ return ret; ++ } ++ ++ pcc_ss_data = pcc_data[pcc_ss_id]; ++ ++ down_write(&pcc_ss_data->pcc_lock); ++ /* after writing CPC, transfer the ownership of PCC to platform */ ++ ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE); ++ up_write(&pcc_ss_data->pcc_lock); ++ } else { ++ ret = -ENOTSUPP; ++ pr_debug("_CPC in PCC is not supported\n"); ++ } ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(cppc_set_auto_sel); ++ ++ + /** + * cppc_set_enable - Set to enable CPPC on the processor by writing the + * Continuous Performance Control package EnableRegister field. +@@ -1420,7 +1584,7 @@ EXPORT_SYMBOL_GPL(cppc_set_enable); + int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) + { + struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); +- struct cpc_register_resource *desired_reg; ++ struct cpc_register_resource *desired_reg, *min_perf_reg, *max_perf_reg; + int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); + struct cppc_pcc_data *pcc_ss_data = NULL; + int ret = 0; +@@ -1431,6 +1595,8 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) + } + + desired_reg = &cpc_desc->cpc_regs[DESIRED_PERF]; ++ min_perf_reg = &cpc_desc->cpc_regs[MIN_PERF]; ++ max_perf_reg = &cpc_desc->cpc_regs[MAX_PERF]; + + /* + * This is Phase-I where we want to write to CPC registers +@@ -1439,7 +1605,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) + * Since read_lock can be acquired by multiple CPUs simultaneously we + * achieve that goal here + */ +- if (CPC_IN_PCC(desired_reg)) { ++ if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg)) { + if (pcc_ss_id < 0) { + pr_debug("Invalid pcc_ss_id\n"); + return -ENODEV; +@@ -1462,13 +1628,19 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) + cpc_desc->write_cmd_status = 0; + } + +- /* +- * Skip writing MIN/MAX until Linux knows how to come up with +- * useful values. +- */ + cpc_write(cpu, desired_reg, perf_ctrls->desired_perf); + +- if (CPC_IN_PCC(desired_reg)) ++ /** ++ * Only write if min_perf and max_perf not zero. Some drivers pass zero ++ * value to min and max perf, but they don't mean to set the zero value, ++ * they just don't want to write to those registers. 
++ */ ++ if (perf_ctrls->min_perf) ++ cpc_write(cpu, min_perf_reg, perf_ctrls->min_perf); ++ if (perf_ctrls->max_perf) ++ cpc_write(cpu, max_perf_reg, perf_ctrls->max_perf); ++ ++ if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg)) + up_read(&pcc_ss_data->pcc_lock); /* END Phase-I */ + /* + * This is Phase-II where we transfer the ownership of PCC to Platform +@@ -1516,7 +1688,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) + * case during a CMD_READ and if there are pending writes it delivers + * the write command before servicing the read command + */ +- if (CPC_IN_PCC(desired_reg)) { ++ if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg)) { + if (down_write_trylock(&pcc_ss_data->pcc_lock)) {/* BEGIN Phase-II */ + /* Update only if there are pending write commands */ + if (pcc_ss_data->pending_pcc_write_cmd) +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index c17bd845f5fc..d4e60da7a544 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -59,8 +59,173 @@ + * we disable it by default to go acpi-cpufreq on these processors and add a + * module parameter to be able to enable it manually for debugging. + */ ++static struct cpufreq_driver *current_pstate_driver; + static struct cpufreq_driver amd_pstate_driver; +-static int cppc_load __initdata; ++static struct cpufreq_driver amd_pstate_epp_driver; ++static int cppc_state = AMD_PSTATE_DISABLE; ++struct kobject *amd_pstate_kobj; ++ ++/* ++ * AMD Energy Preference Performance (EPP) ++ * The EPP is used in the CCLK DPM controller to drive ++ * the frequency that a core is going to operate during ++ * short periods of activity. EPP values will be utilized for ++ * different OS profiles (balanced, performance, power savings) ++ * display strings corresponding to EPP index in the ++ * energy_perf_strings[] ++ * index String ++ *------------------------------------- ++ * 0 default ++ * 1 performance ++ * 2 balance_performance ++ * 3 balance_power ++ * 4 power ++ */ ++enum energy_perf_value_index { ++ EPP_INDEX_DEFAULT = 0, ++ EPP_INDEX_PERFORMANCE, ++ EPP_INDEX_BALANCE_PERFORMANCE, ++ EPP_INDEX_BALANCE_POWERSAVE, ++ EPP_INDEX_POWERSAVE, ++}; ++ ++static const char * const energy_perf_strings[] = { ++ [EPP_INDEX_DEFAULT] = "default", ++ [EPP_INDEX_PERFORMANCE] = "performance", ++ [EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance", ++ [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power", ++ [EPP_INDEX_POWERSAVE] = "power", ++ NULL ++}; ++ ++static unsigned int epp_values[] = { ++ [EPP_INDEX_DEFAULT] = 0, ++ [EPP_INDEX_PERFORMANCE] = AMD_CPPC_EPP_PERFORMANCE, ++ [EPP_INDEX_BALANCE_PERFORMANCE] = AMD_CPPC_EPP_BALANCE_PERFORMANCE, ++ [EPP_INDEX_BALANCE_POWERSAVE] = AMD_CPPC_EPP_BALANCE_POWERSAVE, ++ [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE, ++ }; ++ ++typedef int (*cppc_mode_transition_fn)(int); ++ ++static inline int get_mode_idx_from_str(const char *str, size_t size) ++{ ++ int i; ++ ++ for (i=0; i < AMD_PSTATE_MAX; i++) { ++ if (!strncmp(str, amd_pstate_mode_string[i], size)) ++ return i; ++ } ++ return -EINVAL; ++} ++ ++static DEFINE_MUTEX(amd_pstate_limits_lock); ++static DEFINE_MUTEX(amd_pstate_driver_lock); ++ ++static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) ++{ ++ u64 epp; ++ int ret; ++ ++ if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ if (!cppc_req_cached) { ++ epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, ++ &cppc_req_cached); ++ if (epp) ++ return epp; ++ } ++ 
epp = (cppc_req_cached >> 24) & 0xFF; ++ } else { ++ ret = cppc_get_epp_perf(cpudata->cpu, &epp); ++ if (ret < 0) { ++ pr_debug("Could not retrieve energy perf value (%d)\n", ret); ++ return -EIO; ++ } ++ } ++ ++ return (s16)(epp & 0xff); ++} ++ ++static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata) ++{ ++ s16 epp; ++ int index = -EINVAL; ++ ++ epp = amd_pstate_get_epp(cpudata, 0); ++ if (epp < 0) ++ return epp; ++ ++ switch (epp) { ++ case AMD_CPPC_EPP_PERFORMANCE: ++ index = EPP_INDEX_PERFORMANCE; ++ break; ++ case AMD_CPPC_EPP_BALANCE_PERFORMANCE: ++ index = EPP_INDEX_BALANCE_PERFORMANCE; ++ break; ++ case AMD_CPPC_EPP_BALANCE_POWERSAVE: ++ index = EPP_INDEX_BALANCE_POWERSAVE; ++ break; ++ case AMD_CPPC_EPP_POWERSAVE: ++ index = EPP_INDEX_POWERSAVE; ++ break; ++ default: ++ break; ++ } ++ ++ return index; ++} ++ ++static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) ++{ ++ int ret; ++ struct cppc_perf_ctrls perf_ctrls; ++ ++ if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ u64 value = READ_ONCE(cpudata->cppc_req_cached); ++ ++ value &= ~GENMASK_ULL(31, 24); ++ value |= (u64)epp << 24; ++ WRITE_ONCE(cpudata->cppc_req_cached, value); ++ ++ ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); ++ if (!ret) ++ cpudata->epp_cached = epp; ++ } else { ++ perf_ctrls.energy_perf = epp; ++ ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); ++ if (ret) { ++ pr_debug("failed to set energy perf value (%d)\n", ret); ++ return ret; ++ } ++ cpudata->epp_cached = epp; ++ } ++ ++ return ret; ++} ++ ++static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, ++ int pref_index) ++{ ++ int epp = -EINVAL; ++ int ret; ++ ++ if (!pref_index) { ++ pr_debug("EPP pref_index is invalid\n"); ++ return -EINVAL; ++ } ++ ++ if (epp == -EINVAL) ++ epp = epp_values[pref_index]; ++ ++ if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { ++ pr_debug("EPP cannot be set under performance policy\n"); ++ return -EBUSY; ++ } ++ ++ ret = amd_pstate_set_epp(cpudata, epp); ++ ++ return ret; ++} + + static inline int pstate_enable(bool enable) + { +@@ -70,11 +235,21 @@ static inline int pstate_enable(bool enable) + static int cppc_enable(bool enable) + { + int cpu, ret = 0; ++ struct cppc_perf_ctrls perf_ctrls; + + for_each_present_cpu(cpu) { + ret = cppc_set_enable(cpu, enable); + if (ret) + return ret; ++ ++ /* Enable autonomous mode for EPP */ ++ if (cppc_state == AMD_PSTATE_ACTIVE) { ++ /* Set desired perf as zero to allow EPP firmware control */ ++ perf_ctrls.desired_perf = 0; ++ ret = cppc_set_perf(cpu, &perf_ctrls); ++ if (ret) ++ return ret; ++ } + } + + return ret; +@@ -135,7 +310,22 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) + cppc_perf.lowest_nonlinear_perf); + WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf); + +- return 0; ++ if (cppc_state == AMD_PSTATE_ACTIVE) ++ return 0; ++ ++ ret = cppc_get_auto_sel_caps(cpudata->cpu, &cppc_perf); ++ if (ret) { ++ pr_warn("failed to get auto_sel, ret: %d\n", ret); ++ return 0; ++ } ++ ++ ret = cppc_set_auto_sel(cpudata->cpu, ++ (cppc_state == AMD_PSTATE_PASSIVE) ? 
0 : 1); ++ ++ if (ret) ++ pr_warn("failed to set auto_sel, ret: %d\n", ret); ++ ++ return ret; + } + + DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf); +@@ -212,12 +402,18 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) + } + + static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, +- u32 des_perf, u32 max_perf, bool fast_switch) ++ u32 des_perf, u32 max_perf, bool fast_switch, int gov_flags) + { + u64 prev = READ_ONCE(cpudata->cppc_req_cached); + u64 value = prev; + + des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); ++ ++ if ((cppc_state == AMD_PSTATE_GUIDED) && (gov_flags & CPUFREQ_GOV_DYNAMIC_SWITCHING)) { ++ min_perf = des_perf; ++ des_perf = 0; ++ } ++ + value &= ~AMD_CPPC_MIN_PERF(~0L); + value |= AMD_CPPC_MIN_PERF(min_perf); + +@@ -272,7 +468,7 @@ static int amd_pstate_target(struct cpufreq_policy *policy, + + cpufreq_freq_transition_begin(policy, &freqs); + amd_pstate_update(cpudata, min_perf, des_perf, +- max_perf, false); ++ max_perf, false, policy->governor->flags); + cpufreq_freq_transition_end(policy, &freqs, false); + + return 0; +@@ -306,7 +502,8 @@ static void amd_pstate_adjust_perf(unsigned int cpu, + if (max_perf < min_perf) + max_perf = min_perf; + +- amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true); ++ amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true, ++ policy->governor->flags); + cpufreq_cpu_put(policy); + } + +@@ -418,7 +615,7 @@ static void amd_pstate_boost_init(struct amd_cpudata *cpudata) + return; + + cpudata->boost_supported = true; +- amd_pstate_driver.boost_enabled = true; ++ current_pstate_driver->boost_enabled = true; + } + + static void amd_perf_ctl_reset(unsigned int cpu) +@@ -501,6 +698,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) + policy->driver_data = cpudata; + + amd_pstate_boost_init(cpudata); ++ if (!current_pstate_driver->adjust_perf) ++ current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; + + return 0; + +@@ -561,7 +760,7 @@ static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy, + if (max_freq < 0) + return max_freq; + +- return sprintf(&buf[0], "%u\n", max_freq); ++ return sysfs_emit(buf, "%u\n", max_freq); + } + + static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *policy, +@@ -574,7 +773,7 @@ static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *poli + if (freq < 0) + return freq; + +- return sprintf(&buf[0], "%u\n", freq); ++ return sysfs_emit(buf, "%u\n", freq); + } + + /* +@@ -589,13 +788,208 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, + + perf = READ_ONCE(cpudata->highest_perf); + +- return sprintf(&buf[0], "%u\n", perf); ++ return sysfs_emit(buf, "%u\n", perf); ++} ++ ++static ssize_t show_energy_performance_available_preferences( ++ struct cpufreq_policy *policy, char *buf) ++{ ++ int i = 0; ++ int offset = 0; ++ ++ while (energy_perf_strings[i] != NULL) ++ offset += sysfs_emit_at(buf, offset, "%s ", energy_perf_strings[i++]); ++ ++ sysfs_emit_at(buf, offset, "\n"); ++ ++ return offset; ++} ++ ++static ssize_t store_energy_performance_preference( ++ struct cpufreq_policy *policy, const char *buf, size_t count) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ char str_preference[21]; ++ ssize_t ret; ++ ++ ret = sscanf(buf, "%20s", str_preference); ++ if (ret != 1) ++ return -EINVAL; ++ ++ ret = match_string(energy_perf_strings, -1, str_preference); ++ if (ret < 0) ++ return -EINVAL; ++ ++ 
mutex_lock(&amd_pstate_limits_lock); ++ ret = amd_pstate_set_energy_pref_index(cpudata, ret); ++ mutex_unlock(&amd_pstate_limits_lock); ++ ++ return ret ?: count; ++} ++ ++static ssize_t show_energy_performance_preference( ++ struct cpufreq_policy *policy, char *buf) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ int preference; ++ ++ preference = amd_pstate_get_energy_pref_index(cpudata); ++ if (preference < 0) ++ return preference; ++ ++ return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]); ++} ++ ++static void amd_pstate_driver_cleanup(void) ++{ ++ amd_pstate_enable(false); ++ cppc_state = AMD_PSTATE_DISABLE; ++ current_pstate_driver = NULL; ++} ++ ++static int amd_pstate_register_driver(int mode) ++{ ++ int ret; ++ ++ if (mode == AMD_PSTATE_PASSIVE || mode == AMD_PSTATE_GUIDED) ++ current_pstate_driver = &amd_pstate_driver; ++ else if (mode == AMD_PSTATE_ACTIVE) ++ current_pstate_driver = &amd_pstate_epp_driver; ++ else ++ return -EINVAL; ++ ++ cppc_state = mode; ++ ret = cpufreq_register_driver(current_pstate_driver); ++ if (ret) { ++ amd_pstate_driver_cleanup(); ++ return ret; ++ } ++ return 0; ++} ++ ++static int amd_pstate_unregister_driver(int dummy) ++{ ++ cpufreq_unregister_driver(current_pstate_driver); ++ amd_pstate_driver_cleanup(); ++ return 0; ++} ++ ++static int amd_pstate_change_mode_without_dvr_change(int mode) ++{ ++ int cpu = 0; ++ ++ cppc_state = mode; ++ ++ if (boot_cpu_has(X86_FEATURE_CPPC) || cppc_state == AMD_PSTATE_ACTIVE) ++ return 0; ++ ++ for_each_present_cpu(cpu) { ++ cppc_set_auto_sel(cpu, (cppc_state == AMD_PSTATE_PASSIVE) ? 0 : 1); ++ } ++ ++ return 0; ++} ++ ++static int amd_pstate_change_driver_mode(int mode) ++{ ++ int ret; ++ ++ ret = amd_pstate_unregister_driver(0); ++ if (ret) ++ return ret; ++ ++ ret = amd_pstate_register_driver(mode); ++ if (ret) ++ return ret; ++ ++ return 0; ++} ++ ++cppc_mode_transition_fn mode_state_machine[AMD_PSTATE_MAX][AMD_PSTATE_MAX] = { ++ [AMD_PSTATE_DISABLE] = { ++ [AMD_PSTATE_DISABLE] = NULL, ++ [AMD_PSTATE_PASSIVE] = amd_pstate_register_driver, ++ [AMD_PSTATE_ACTIVE] = amd_pstate_register_driver, ++ [AMD_PSTATE_GUIDED] = amd_pstate_register_driver, ++ }, ++ [AMD_PSTATE_PASSIVE] = { ++ [AMD_PSTATE_DISABLE] = amd_pstate_unregister_driver, ++ [AMD_PSTATE_PASSIVE] = NULL, ++ [AMD_PSTATE_ACTIVE] = amd_pstate_change_driver_mode, ++ [AMD_PSTATE_GUIDED] = amd_pstate_change_mode_without_dvr_change, ++ }, ++ [AMD_PSTATE_ACTIVE] = { ++ [AMD_PSTATE_DISABLE] = amd_pstate_unregister_driver, ++ [AMD_PSTATE_PASSIVE] = amd_pstate_change_driver_mode, ++ [AMD_PSTATE_ACTIVE] = NULL, ++ [AMD_PSTATE_GUIDED] = amd_pstate_change_driver_mode, ++ }, ++ [AMD_PSTATE_GUIDED] = { ++ [AMD_PSTATE_DISABLE] = amd_pstate_unregister_driver, ++ [AMD_PSTATE_PASSIVE] = amd_pstate_change_mode_without_dvr_change, ++ [AMD_PSTATE_ACTIVE] = amd_pstate_change_driver_mode, ++ [AMD_PSTATE_GUIDED] = NULL, ++ }, ++}; ++ ++static ssize_t amd_pstate_show_status(char *buf) ++{ ++ if (!current_pstate_driver) ++ return sysfs_emit(buf, "disable\n"); ++ ++ return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]); ++} ++ ++static int amd_pstate_update_status(const char *buf, size_t size) ++{ ++ int mode_idx; ++ ++ if (size > strlen("passive") || size < strlen("active")) ++ return -EINVAL; ++ ++ mode_idx = get_mode_idx_from_str(buf, size); ++ ++ if (mode_idx < 0 || mode_idx >= AMD_PSTATE_MAX) ++ return -EINVAL; ++ ++ if (mode_state_machine[cppc_state][mode_idx]) ++ return mode_state_machine[cppc_state][mode_idx](mode_idx); ++ ++ return 0; 
++} ++ ++static ssize_t show_status(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ ssize_t ret; ++ ++ mutex_lock(&amd_pstate_driver_lock); ++ ret = amd_pstate_show_status(buf); ++ mutex_unlock(&amd_pstate_driver_lock); ++ ++ return ret; ++} ++ ++static ssize_t store_status(struct kobject *a, struct kobj_attribute *b, ++ const char *buf, size_t count) ++{ ++ char *p = memchr(buf, '\n', count); ++ int ret; ++ ++ mutex_lock(&amd_pstate_driver_lock); ++ ret = amd_pstate_update_status(buf, p ? p - buf : count); ++ mutex_unlock(&amd_pstate_driver_lock); ++ ++ return ret < 0 ? ret : count; + } + + cpufreq_freq_attr_ro(amd_pstate_max_freq); + cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); + + cpufreq_freq_attr_ro(amd_pstate_highest_perf); ++cpufreq_freq_attr_rw(energy_performance_preference); ++cpufreq_freq_attr_ro(energy_performance_available_preferences); ++define_one_global_rw(status); + + static struct freq_attr *amd_pstate_attr[] = { + &amd_pstate_max_freq, +@@ -604,6 +998,313 @@ static struct freq_attr *amd_pstate_attr[] = { + NULL, + }; + ++static struct freq_attr *amd_pstate_epp_attr[] = { ++ &amd_pstate_max_freq, ++ &amd_pstate_lowest_nonlinear_freq, ++ &amd_pstate_highest_perf, ++ &energy_performance_preference, ++ &energy_performance_available_preferences, ++ NULL, ++}; ++ ++static struct attribute *pstate_global_attributes[] = { ++ &status.attr, ++ NULL ++}; ++ ++static const struct attribute_group amd_pstate_global_attr_group = { ++ .attrs = pstate_global_attributes, ++}; ++ ++static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) ++{ ++ int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; ++ struct amd_cpudata *cpudata; ++ struct device *dev; ++ u64 value; ++ ++ /* ++ * Resetting PERF_CTL_MSR will put the CPU in P0 frequency, ++ * which is ideal for initialization process. ++ */ ++ amd_perf_ctl_reset(policy->cpu); ++ dev = get_cpu_device(policy->cpu); ++ if (!dev) ++ return -ENODEV; ++ ++ cpudata = kzalloc(sizeof(*cpudata), GFP_KERNEL); ++ if (!cpudata) ++ return -ENOMEM; ++ ++ cpudata->cpu = policy->cpu; ++ cpudata->epp_policy = 0; ++ ++ ret = amd_pstate_init_perf(cpudata); ++ if (ret) ++ goto free_cpudata1; ++ ++ min_freq = amd_get_min_freq(cpudata); ++ max_freq = amd_get_max_freq(cpudata); ++ nominal_freq = amd_get_nominal_freq(cpudata); ++ lowest_nonlinear_freq = amd_get_lowest_nonlinear_freq(cpudata); ++ if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) { ++ dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n", ++ min_freq, max_freq); ++ ret = -EINVAL; ++ goto free_cpudata1; ++ } ++ ++ policy->cpuinfo.min_freq = min_freq; ++ policy->cpuinfo.max_freq = max_freq; ++ /* It will be updated by governor */ ++ policy->cur = policy->cpuinfo.min_freq; ++ ++ /* Initial processor data capability frequencies */ ++ cpudata->max_freq = max_freq; ++ cpudata->min_freq = min_freq; ++ cpudata->nominal_freq = nominal_freq; ++ cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq; ++ ++ policy->driver_data = cpudata; ++ ++ cpudata->epp_cached = amd_pstate_get_epp(cpudata, 0); ++ ++ policy->min = policy->cpuinfo.min_freq; ++ policy->max = policy->cpuinfo.max_freq; ++ ++ /* ++ * Set the policy to powersave to provide a valid fallback value in case ++ * the default cpufreq governor is neither powersave nor performance. 
++ */ ++ policy->policy = CPUFREQ_POLICY_POWERSAVE; ++ ++ if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ policy->fast_switch_possible = true; ++ ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); ++ if (ret) ++ return ret; ++ WRITE_ONCE(cpudata->cppc_req_cached, value); ++ ++ ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &value); ++ if (ret) ++ return ret; ++ WRITE_ONCE(cpudata->cppc_cap1_cached, value); ++ } ++ amd_pstate_boost_init(cpudata); ++ ++ return 0; ++ ++free_cpudata1: ++ kfree(cpudata); ++ return ret; ++} ++ ++static int amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) ++{ ++ pr_debug("CPU %d exiting\n", policy->cpu); ++ policy->fast_switch_possible = false; ++ return 0; ++} ++ ++static void amd_pstate_epp_init(unsigned int cpu) ++{ ++ struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); ++ struct amd_cpudata *cpudata = policy->driver_data; ++ u32 max_perf, min_perf; ++ u64 value; ++ s16 epp; ++ ++ max_perf = READ_ONCE(cpudata->highest_perf); ++ min_perf = READ_ONCE(cpudata->lowest_perf); ++ ++ value = READ_ONCE(cpudata->cppc_req_cached); ++ ++ if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) ++ min_perf = max_perf; ++ ++ /* Initial min/max values for CPPC Performance Controls Register */ ++ value &= ~AMD_CPPC_MIN_PERF(~0L); ++ value |= AMD_CPPC_MIN_PERF(min_perf); ++ ++ value &= ~AMD_CPPC_MAX_PERF(~0L); ++ value |= AMD_CPPC_MAX_PERF(max_perf); ++ ++ /* CPPC EPP feature require to set zero to the desire perf bit */ ++ value &= ~AMD_CPPC_DES_PERF(~0L); ++ value |= AMD_CPPC_DES_PERF(0); ++ ++ if (cpudata->epp_policy == cpudata->policy) ++ goto skip_epp; ++ ++ cpudata->epp_policy = cpudata->policy; ++ ++ /* Get BIOS pre-defined epp value */ ++ epp = amd_pstate_get_epp(cpudata, value); ++ if (epp < 0) { ++ /** ++ * This return value can only be negative for shared_memory ++ * systems where EPP register read/write not supported. 
++ */ ++ goto skip_epp; ++ } ++ ++ if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) ++ epp = 0; ++ ++ /* Set initial EPP value */ ++ if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ value &= ~GENMASK_ULL(31, 24); ++ value |= (u64)epp << 24; ++ } ++ ++ WRITE_ONCE(cpudata->cppc_req_cached, value); ++ amd_pstate_set_epp(cpudata, epp); ++skip_epp: ++ cpufreq_cpu_put(policy); ++} ++ ++static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ ++ if (!policy->cpuinfo.max_freq) ++ return -ENODEV; ++ ++ pr_debug("set_policy: cpuinfo.max %u policy->max %u\n", ++ policy->cpuinfo.max_freq, policy->max); ++ ++ cpudata->policy = policy->policy; ++ ++ amd_pstate_epp_init(policy->cpu); ++ ++ return 0; ++} ++ ++static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) ++{ ++ struct cppc_perf_ctrls perf_ctrls; ++ u64 value, max_perf; ++ int ret; ++ ++ ret = amd_pstate_enable(true); ++ if (ret) ++ pr_err("failed to enable amd pstate during resume, return %d\n", ret); ++ ++ value = READ_ONCE(cpudata->cppc_req_cached); ++ max_perf = READ_ONCE(cpudata->highest_perf); ++ ++ if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); ++ } else { ++ perf_ctrls.max_perf = max_perf; ++ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached); ++ cppc_set_perf(cpudata->cpu, &perf_ctrls); ++ } ++} ++ ++static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ ++ pr_debug("AMD CPU Core %d going online\n", cpudata->cpu); ++ ++ if (cppc_state == AMD_PSTATE_ACTIVE) { ++ amd_pstate_epp_reenable(cpudata); ++ cpudata->suspended = false; ++ } ++ ++ return 0; ++} ++ ++static void amd_pstate_epp_offline(struct cpufreq_policy *policy) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ struct cppc_perf_ctrls perf_ctrls; ++ int min_perf; ++ u64 value; ++ ++ min_perf = READ_ONCE(cpudata->lowest_perf); ++ value = READ_ONCE(cpudata->cppc_req_cached); ++ ++ mutex_lock(&amd_pstate_limits_lock); ++ if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ cpudata->epp_policy = CPUFREQ_POLICY_UNKNOWN; ++ ++ /* Set max perf same as min perf */ ++ value &= ~AMD_CPPC_MAX_PERF(~0L); ++ value |= AMD_CPPC_MAX_PERF(min_perf); ++ value &= ~AMD_CPPC_MIN_PERF(~0L); ++ value |= AMD_CPPC_MIN_PERF(min_perf); ++ wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); ++ } else { ++ perf_ctrls.desired_perf = 0; ++ perf_ctrls.max_perf = min_perf; ++ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE); ++ cppc_set_perf(cpudata->cpu, &perf_ctrls); ++ } ++ mutex_unlock(&amd_pstate_limits_lock); ++} ++ ++static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ ++ pr_debug("AMD CPU Core %d going offline\n", cpudata->cpu); ++ ++ if (cpudata->suspended) ++ return 0; ++ ++ if (cppc_state == AMD_PSTATE_ACTIVE) ++ amd_pstate_epp_offline(policy); ++ ++ return 0; ++} ++ ++static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy) ++{ ++ cpufreq_verify_within_cpu_limits(policy); ++ pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min); ++ return 0; ++} ++ ++static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ int ret; ++ ++ /* avoid suspending when EPP is not enabled */ ++ if (cppc_state != AMD_PSTATE_ACTIVE) ++ return 0; ++ ++ /* set this flag to avoid setting core offline*/ ++ 
cpudata->suspended = true; ++ ++ /* disable CPPC in lowlevel firmware */ ++ ret = amd_pstate_enable(false); ++ if (ret) ++ pr_err("failed to suspend, return %d\n", ret); ++ ++ return 0; ++} ++ ++static int amd_pstate_epp_resume(struct cpufreq_policy *policy) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ ++ if (cpudata->suspended) { ++ mutex_lock(&amd_pstate_limits_lock); ++ ++ /* enable amd pstate from suspend state*/ ++ amd_pstate_epp_reenable(cpudata); ++ ++ mutex_unlock(&amd_pstate_limits_lock); ++ ++ cpudata->suspended = false; ++ } ++ ++ return 0; ++} ++ + static struct cpufreq_driver amd_pstate_driver = { + .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS, + .verify = amd_pstate_verify, +@@ -617,6 +1318,20 @@ static struct cpufreq_driver amd_pstate_driver = { + .attr = amd_pstate_attr, + }; + ++static struct cpufreq_driver amd_pstate_epp_driver = { ++ .flags = CPUFREQ_CONST_LOOPS, ++ .verify = amd_pstate_epp_verify_policy, ++ .setpolicy = amd_pstate_epp_set_policy, ++ .init = amd_pstate_epp_cpu_init, ++ .exit = amd_pstate_epp_cpu_exit, ++ .offline = amd_pstate_epp_cpu_offline, ++ .online = amd_pstate_epp_cpu_online, ++ .suspend = amd_pstate_epp_suspend, ++ .resume = amd_pstate_epp_resume, ++ .name = "amd_pstate_epp", ++ .attr = amd_pstate_epp_attr, ++}; ++ + static int __init amd_pstate_init(void) + { + int ret; +@@ -626,10 +1341,10 @@ static int __init amd_pstate_init(void) + /* + * by default the pstate driver is disabled to load + * enable the amd_pstate passive mode driver explicitly +- * with amd_pstate=passive in kernel command line ++ * with amd_pstate=passive or other modes in kernel command line + */ +- if (!cppc_load) { +- pr_debug("driver load is disabled, boot with amd_pstate=passive to enable this\n"); ++ if (cppc_state == AMD_PSTATE_DISABLE) { ++ pr_debug("driver load is disabled, boot with specific mode to enable this\n"); + return -ENODEV; + } + +@@ -645,7 +1360,8 @@ static int __init amd_pstate_init(void) + /* capability check */ + if (boot_cpu_has(X86_FEATURE_CPPC)) { + pr_debug("AMD CPPC MSR based functionality is supported\n"); +- amd_pstate_driver.adjust_perf = amd_pstate_adjust_perf; ++ if (cppc_state != AMD_PSTATE_ACTIVE) ++ current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; + } else { + pr_debug("AMD CPPC shared memory based functionality is supported\n"); + static_call_update(amd_pstate_enable, cppc_enable); +@@ -656,31 +1372,63 @@ static int __init amd_pstate_init(void) + /* enable amd pstate feature */ + ret = amd_pstate_enable(true); + if (ret) { +- pr_err("failed to enable amd-pstate with return %d\n", ret); ++ pr_err("failed to enable with return %d\n", ret); + return ret; + } + +- ret = cpufreq_register_driver(&amd_pstate_driver); ++ ret = cpufreq_register_driver(current_pstate_driver); + if (ret) +- pr_err("failed to register amd_pstate_driver with return %d\n", +- ret); ++ pr_err("failed to register with return %d\n", ret); ++ ++ amd_pstate_kobj = kobject_create_and_add("amd_pstate", &cpu_subsys.dev_root->kobj); ++ if (!amd_pstate_kobj) { ++ ret = -EINVAL; ++ pr_err("global sysfs registration failed.\n"); ++ goto kobject_free; ++ } + ++ ret = sysfs_create_group(amd_pstate_kobj, &amd_pstate_global_attr_group); ++ if (ret) { ++ pr_err("sysfs attribute export failed with error %d.\n", ret); ++ goto global_attr_free; ++ } ++ ++ return ret; ++ ++global_attr_free: ++ kobject_put(amd_pstate_kobj); ++kobject_free: ++ cpufreq_unregister_driver(current_pstate_driver); + return ret; + } + device_initcall(amd_pstate_init); + + 
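The per-policy attributes wired up above are ordinary sysfs files, so the EPP interface can be exercised from userspace without extra tooling. A hedged sketch; the cpu0 cpufreq path is an assumption based on the attribute names in this patch, and the write needs root:

  #include <stdio.h>

  /*
   * Sketch: list the EPP preferences exposed for CPU 0 and request
   * "balance_power". The path is an assumption.
   */
  #define EPP_DIR "/sys/devices/system/cpu/cpu0/cpufreq/"

  int main(void)
  {
          char buf[256];
          FILE *f = fopen(EPP_DIR "energy_performance_available_preferences", "r");

          if (f) {
                  if (fgets(buf, sizeof(buf), f))
                          printf("available: %s", buf);
                  fclose(f);
          }

          f = fopen(EPP_DIR "energy_performance_preference", "w");
          if (!f) {
                  perror("open preference");
                  return 1;
          }
          fputs("balance_power", f);
          if (fclose(f) != 0)     /* buffered write errors surface here */
                  perror("set preference");

          return 0;
  }

Per the store path above, writing "default" back is rejected with -EINVAL, since the driver reserves index 0 for the firmware-provided value, and a request other than "performance" is refused with -EBUSY while the policy is set to performance.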
static int __init amd_pstate_param(char *str) + { ++ size_t size; ++ int mode_idx; ++ + if (!str) + return -EINVAL; + +- if (!strcmp(str, "disable")) { +- cppc_load = 0; +- pr_info("driver is explicitly disabled\n"); +- } else if (!strcmp(str, "passive")) +- cppc_load = 1; ++ size = strlen(str); ++ mode_idx = get_mode_idx_from_str(str, size); + +- return 0; ++ if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) { ++ cppc_state = mode_idx; ++ if (cppc_state == AMD_PSTATE_DISABLE) ++ pr_info("driver is explicitly disabled\n"); ++ ++ if (cppc_state == AMD_PSTATE_ACTIVE) ++ current_pstate_driver = &amd_pstate_epp_driver; ++ ++ if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED) ++ current_pstate_driver = &amd_pstate_driver; ++ ++ return 0; ++ } ++ ++ return -EINVAL; + } + early_param("amd_pstate", amd_pstate_param); + +diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c +index 4153150e20db..ffea6402189d 100644 +--- a/drivers/cpufreq/brcmstb-avs-cpufreq.c ++++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c +@@ -751,10 +751,7 @@ static int brcm_avs_cpufreq_probe(struct platform_device *pdev) + + static int brcm_avs_cpufreq_remove(struct platform_device *pdev) + { +- int ret; +- +- ret = cpufreq_unregister_driver(&brcm_avs_driver); +- WARN_ON(ret); ++ cpufreq_unregister_driver(&brcm_avs_driver); + + brcm_avs_prepare_uninit(pdev); + +diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c +index 7e56a42750ea..85a0bea2dbf1 100644 +--- a/drivers/cpufreq/cpufreq.c ++++ b/drivers/cpufreq/cpufreq.c +@@ -2904,12 +2904,12 @@ EXPORT_SYMBOL_GPL(cpufreq_register_driver); + * Returns zero if successful, and -EINVAL if the cpufreq_driver is + * currently not initialised. + */ +-int cpufreq_unregister_driver(struct cpufreq_driver *driver) ++void cpufreq_unregister_driver(struct cpufreq_driver *driver) + { + unsigned long flags; + +- if (!cpufreq_driver || (driver != cpufreq_driver)) +- return -EINVAL; ++ if (WARN_ON(!cpufreq_driver || (driver != cpufreq_driver))) ++ return; + + pr_debug("unregistering driver %s\n", driver->name); + +@@ -2926,8 +2926,6 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) + + write_unlock_irqrestore(&cpufreq_driver_lock, flags); + cpus_read_unlock(); +- +- return 0; + } + EXPORT_SYMBOL_GPL(cpufreq_unregister_driver); + +diff --git a/drivers/cpufreq/davinci-cpufreq.c b/drivers/cpufreq/davinci-cpufreq.c +index 9e97f60f8199..2d23015e2abd 100644 +--- a/drivers/cpufreq/davinci-cpufreq.c ++++ b/drivers/cpufreq/davinci-cpufreq.c +@@ -138,7 +138,9 @@ static int __exit davinci_cpufreq_remove(struct platform_device *pdev) + if (cpufreq.asyncclk) + clk_put(cpufreq.asyncclk); + +- return cpufreq_unregister_driver(&davinci_driver); ++ cpufreq_unregister_driver(&davinci_driver); ++ ++ return 0; + } + + static struct platform_driver davinci_cpufreq_driver = { +diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c +index f80339779084..f21a9e3df53d 100644 +--- a/drivers/cpufreq/mediatek-cpufreq-hw.c ++++ b/drivers/cpufreq/mediatek-cpufreq-hw.c +@@ -317,7 +317,9 @@ static int mtk_cpufreq_hw_driver_probe(struct platform_device *pdev) + + static int mtk_cpufreq_hw_driver_remove(struct platform_device *pdev) + { +- return cpufreq_unregister_driver(&cpufreq_mtk_hw_driver); ++ cpufreq_unregister_driver(&cpufreq_mtk_hw_driver); ++ ++ return 0; + } + + static const struct of_device_id mtk_cpufreq_hw_match[] = { +diff --git a/drivers/cpufreq/omap-cpufreq.c 
b/drivers/cpufreq/omap-cpufreq.c +index 1b50df06c6bc..81649a1969b6 100644 +--- a/drivers/cpufreq/omap-cpufreq.c ++++ b/drivers/cpufreq/omap-cpufreq.c +@@ -184,7 +184,9 @@ static int omap_cpufreq_probe(struct platform_device *pdev) + + static int omap_cpufreq_remove(struct platform_device *pdev) + { +- return cpufreq_unregister_driver(&omap_driver); ++ cpufreq_unregister_driver(&omap_driver); ++ ++ return 0; + } + + static struct platform_driver omap_cpufreq_platdrv = { +diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c +index d3f55ca06ed3..2f581d2d617d 100644 +--- a/drivers/cpufreq/qcom-cpufreq-hw.c ++++ b/drivers/cpufreq/qcom-cpufreq-hw.c +@@ -770,7 +770,9 @@ static int qcom_cpufreq_hw_driver_probe(struct platform_device *pdev) + + static int qcom_cpufreq_hw_driver_remove(struct platform_device *pdev) + { +- return cpufreq_unregister_driver(&cpufreq_qcom_hw_driver); ++ cpufreq_unregister_driver(&cpufreq_qcom_hw_driver); ++ ++ return 0; + } + + static struct platform_driver qcom_cpufreq_hw_driver = { +diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h +index c5614444031f..6126c977ece0 100644 +--- a/include/acpi/cppc_acpi.h ++++ b/include/acpi/cppc_acpi.h +@@ -108,12 +108,15 @@ struct cppc_perf_caps { + u32 lowest_nonlinear_perf; + u32 lowest_freq; + u32 nominal_freq; ++ u32 energy_perf; ++ bool auto_sel; + }; + + struct cppc_perf_ctrls { + u32 max_perf; + u32 min_perf; + u32 desired_perf; ++ u32 energy_perf; + }; + + struct cppc_perf_fb_ctrs { +@@ -149,6 +152,10 @@ extern bool cpc_ffh_supported(void); + extern bool cpc_supported_by_cpu(void); + extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); + extern int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val); ++extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf); ++extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable); ++extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps); ++extern int cppc_set_auto_sel(int cpu, bool enable); + #else /* !CONFIG_ACPI_CPPC_LIB */ + static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) + { +@@ -202,6 +209,22 @@ static inline int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) + { + return -ENOTSUPP; + } ++static inline int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) ++{ ++ return -ENOTSUPP; ++} ++static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf) ++{ ++ return -ENOTSUPP; ++} ++static inline int cppc_set_auto_sel(int cpu, bool enable) ++{ ++ return -ENOTSUPP; ++} ++static inline int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps) ++{ ++ return -ENOTSUPP; ++} + #endif /* !CONFIG_ACPI_CPPC_LIB */ + + #endif /* _CPPC_ACPI_H*/ +diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h +index 1c4b8659f171..c10ebf8c42e6 100644 +--- a/include/linux/amd-pstate.h ++++ b/include/linux/amd-pstate.h +@@ -12,6 +12,11 @@ + + #include + ++#define AMD_CPPC_EPP_PERFORMANCE 0x00 ++#define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80 ++#define AMD_CPPC_EPP_BALANCE_POWERSAVE 0xBF ++#define AMD_CPPC_EPP_POWERSAVE 0xFF ++ + /********************************************************************* + * AMD P-state INTERFACE * + *********************************************************************/ +@@ -47,6 +52,10 @@ struct amd_aperf_mperf { + * @prev: Last Aperf/Mperf/tsc count value read from register + * @freq: current cpu frequency value + * @boost_supported: check whether the Processor or SBIOS supports boost 
mode ++ * @epp_policy: Last saved policy used to set energy-performance preference ++ * @epp_cached: Cached CPPC energy-performance preference value ++ * @policy: Cpufreq policy value ++ * @cppc_cap1_cached Cached MSR_AMD_CPPC_CAP1 register value + * + * The amd_cpudata is key private data for each CPU thread in AMD P-State, and + * represents all the attributes and goals that AMD P-State requests at runtime. +@@ -72,6 +81,31 @@ struct amd_cpudata { + + u64 freq; + bool boost_supported; ++ ++ /* EPP feature related attributes*/ ++ s16 epp_policy; ++ s16 epp_cached; ++ u32 policy; ++ u64 cppc_cap1_cached; ++ bool suspended; + }; + ++/* ++ * enum amd_pstate_mode - driver working mode of amd pstate ++ */ ++enum amd_pstate_mode { ++ AMD_PSTATE_DISABLE = 0, ++ AMD_PSTATE_PASSIVE, ++ AMD_PSTATE_ACTIVE, ++ AMD_PSTATE_GUIDED, ++ AMD_PSTATE_MAX, ++}; ++ ++static const char * const amd_pstate_mode_string[] = { ++ [AMD_PSTATE_DISABLE] = "disable", ++ [AMD_PSTATE_PASSIVE] = "passive", ++ [AMD_PSTATE_ACTIVE] = "active", ++ [AMD_PSTATE_GUIDED] = "guided", ++ NULL, ++}; + #endif /* _LINUX_AMD_PSTATE_H */ +diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h +index 6a94a6eaad27..65623233ab2f 100644 +--- a/include/linux/cpufreq.h ++++ b/include/linux/cpufreq.h +@@ -448,7 +448,7 @@ struct cpufreq_driver { + #define CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING BIT(6) + + int cpufreq_register_driver(struct cpufreq_driver *driver_data); +-int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); ++void cpufreq_unregister_driver(struct cpufreq_driver *driver_data); + + bool cpufreq_driver_test_flags(u16 flags); + const char *cpufreq_get_current_driver(void); +-- +2.39.2 + +From 087384681c8c010c8a826bc03b6aa7634f73a3bf Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sun, 22 Jan 2023 13:41:50 +0100 +Subject: [PATCH 09/15] ksm + +Signed-off-by: Peter Jung +--- + arch/alpha/kernel/syscalls/syscall.tbl | 1 + + arch/arm/tools/syscall.tbl | 1 + + arch/arm64/include/asm/unistd.h | 2 +- + arch/arm64/include/asm/unistd32.h | 2 + + arch/ia64/kernel/syscalls/syscall.tbl | 1 + + arch/m68k/kernel/syscalls/syscall.tbl | 1 + + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + + arch/parisc/kernel/syscalls/syscall.tbl | 1 + + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + + arch/s390/kernel/syscalls/syscall.tbl | 1 + + arch/sh/kernel/syscalls/syscall.tbl | 1 + + arch/sparc/kernel/syscalls/syscall.tbl | 1 + + arch/x86/entry/syscalls/syscall_32.tbl | 1 + + arch/x86/entry/syscalls/syscall_64.tbl | 1 + + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + + include/linux/ksm.h | 4 + + include/linux/syscalls.h | 1 + + include/uapi/asm-generic/unistd.h | 5 +- + kernel/sys_ni.c | 1 + + mm/ksm.c | 88 +++++++++------ + mm/madvise.c | 113 ++++++++++++++++++++ + 24 files changed, 198 insertions(+), 34 deletions(-) + +diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl +index 8ebacf37a8cf..c9d25f85d86d 100644 +--- a/arch/alpha/kernel/syscalls/syscall.tbl ++++ b/arch/alpha/kernel/syscalls/syscall.tbl +@@ -490,3 +490,4 @@ + 558 common process_mrelease sys_process_mrelease + 559 common futex_waitv sys_futex_waitv + 560 common set_mempolicy_home_node sys_ni_syscall ++561 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl +index ac964612d8b0..90933eabe115 100644 +--- a/arch/arm/tools/syscall.tbl ++++ 
b/arch/arm/tools/syscall.tbl +@@ -464,3 +464,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h +index 037feba03a51..64a514f90131 100644 +--- a/arch/arm64/include/asm/unistd.h ++++ b/arch/arm64/include/asm/unistd.h +@@ -39,7 +39,7 @@ + #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) + #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) + +-#define __NR_compat_syscalls 451 ++#define __NR_compat_syscalls 452 + #endif + + #define __ARCH_WANT_SYS_CLONE +diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h +index 604a2053d006..91f2bb7199af 100644 +--- a/arch/arm64/include/asm/unistd32.h ++++ b/arch/arm64/include/asm/unistd32.h +@@ -907,6 +907,8 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) + __SYSCALL(__NR_futex_waitv, sys_futex_waitv) + #define __NR_set_mempolicy_home_node 450 + __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) ++#define __NR_pmadv_ksm 451 ++__SYSCALL(__NR_pmadv_ksm, sys_pmadv_ksm) + + /* + * Please add new compat syscalls above this comment and update +diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl +index 72c929d9902b..0d5b1d14b2b5 100644 +--- a/arch/ia64/kernel/syscalls/syscall.tbl ++++ b/arch/ia64/kernel/syscalls/syscall.tbl +@@ -371,3 +371,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl +index b1f3940bc298..5ccf925567da 100644 +--- a/arch/m68k/kernel/syscalls/syscall.tbl ++++ b/arch/m68k/kernel/syscalls/syscall.tbl +@@ -450,3 +450,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl +index 820145e47350..6b76208597f3 100644 +--- a/arch/microblaze/kernel/syscalls/syscall.tbl ++++ b/arch/microblaze/kernel/syscalls/syscall.tbl +@@ -456,3 +456,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl +index 253ff994ed2e..e4aeedb17c38 100644 +--- a/arch/mips/kernel/syscalls/syscall_n32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n32.tbl +@@ -389,3 +389,4 @@ + 448 n32 process_mrelease sys_process_mrelease + 449 n32 futex_waitv sys_futex_waitv + 450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node ++451 n32 pmadv_ksm sys_pmadv_ksm +diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl +index 3f1886ad9d80..fe88db51efa0 100644 +--- a/arch/mips/kernel/syscalls/syscall_n64.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n64.tbl +@@ -365,3 +365,4 @@ + 448 n64 process_mrelease sys_process_mrelease + 449 n64 futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 n64 pmadv_ksm sys_pmadv_ksm +diff --git 
a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl +index 8f243e35a7b2..674cb940bd15 100644 +--- a/arch/mips/kernel/syscalls/syscall_o32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_o32.tbl +@@ -438,3 +438,4 @@ + 448 o32 process_mrelease sys_process_mrelease + 449 o32 futex_waitv sys_futex_waitv + 450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node ++451 o32 pmadv_ksm sys_pmadv_ksm +diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl +index 0e42fceb2d5e..5914aa460255 100644 +--- a/arch/parisc/kernel/syscalls/syscall.tbl ++++ b/arch/parisc/kernel/syscalls/syscall.tbl +@@ -448,3 +448,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl +index a0be127475b1..347894da4eb6 100644 +--- a/arch/powerpc/kernel/syscalls/syscall.tbl ++++ b/arch/powerpc/kernel/syscalls/syscall.tbl +@@ -537,3 +537,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl +index 799147658dee..1cd523748bd2 100644 +--- a/arch/s390/kernel/syscalls/syscall.tbl ++++ b/arch/s390/kernel/syscalls/syscall.tbl +@@ -453,3 +453,4 @@ + 448 common process_mrelease sys_process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm sys_pmadv_ksm +diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl +index 2de85c977f54..cfc75fa43eae 100644 +--- a/arch/sh/kernel/syscalls/syscall.tbl ++++ b/arch/sh/kernel/syscalls/syscall.tbl +@@ -453,3 +453,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl +index 4398cc6fb68d..d2c0a6426f6b 100644 +--- a/arch/sparc/kernel/syscalls/syscall.tbl ++++ b/arch/sparc/kernel/syscalls/syscall.tbl +@@ -496,3 +496,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl +index 320480a8db4f..331aaf1a782f 100644 +--- a/arch/x86/entry/syscalls/syscall_32.tbl ++++ b/arch/x86/entry/syscalls/syscall_32.tbl +@@ -455,3 +455,4 @@ + 448 i386 process_mrelease sys_process_mrelease + 449 i386 futex_waitv sys_futex_waitv + 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node ++451 i386 pmadv_ksm sys_pmadv_ksm +diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl +index c84d12608cd2..14902db4c01f 100644 +--- a/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/arch/x86/entry/syscalls/syscall_64.tbl +@@ -372,6 +372,7 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 
++451 common pmadv_ksm sys_pmadv_ksm + + # + # Due to a historical design error, certain syscalls are numbered differently +diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl +index 52c94ab5c205..1518e261d882 100644 +--- a/arch/xtensa/kernel/syscalls/syscall.tbl ++++ b/arch/xtensa/kernel/syscalls/syscall.tbl +@@ -421,3 +421,4 @@ + 448 common process_mrelease sys_process_mrelease + 449 common futex_waitv sys_futex_waitv + 450 common set_mempolicy_home_node sys_set_mempolicy_home_node ++451 common pmadv_ksm sys_pmadv_ksm +diff --git a/include/linux/ksm.h b/include/linux/ksm.h +index 7e232ba59b86..632a1a792ebb 100644 +--- a/include/linux/ksm.h ++++ b/include/linux/ksm.h +@@ -16,6 +16,10 @@ + #include + + #ifdef CONFIG_KSM ++int ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma, ++ unsigned long *vm_flags); ++int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start, ++ unsigned long end, unsigned long *vm_flags); + int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags); + int __ksm_enter(struct mm_struct *mm); +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index 33a0ee3bcb2e..62f14e800839 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -919,6 +919,7 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); + asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, + size_t vlen, int behavior, unsigned int flags); + asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); ++asmlinkage long sys_pmadv_ksm(int pidfd, int behavior, unsigned int flags); + asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, + unsigned long prot, unsigned long pgoff, + unsigned long flags); +diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h +index 45fa180cc56a..40f7e6d04af0 100644 +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -886,8 +886,11 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv) + #define __NR_set_mempolicy_home_node 450 + __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) + ++#define __NR_pmadv_ksm 451 ++__SYSCALL(__NR_pmadv_ksm, sys_pmadv_ksm) ++ + #undef __NR_syscalls +-#define __NR_syscalls 451 ++#define __NR_syscalls 452 + + /* + * 32 bit systems traditionally used different +diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c +index 860b2dcf3ac4..810e1fcaff94 100644 +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -292,6 +292,7 @@ COND_SYSCALL(mincore); + COND_SYSCALL(madvise); + COND_SYSCALL(process_madvise); + COND_SYSCALL(process_mrelease); ++COND_SYSCALL(pmadv_ksm); + COND_SYSCALL(remap_file_pages); + COND_SYSCALL(mbind); + COND_SYSCALL(get_mempolicy); +diff --git a/mm/ksm.c b/mm/ksm.c +index c267b92b837b..4474b7ac0cd6 100644 +--- a/mm/ksm.c ++++ b/mm/ksm.c +@@ -2575,54 +2575,78 @@ static int ksm_scan_thread(void *nothing) + return 0; + } + +-int ksm_madvise(struct vm_area_struct *vma, unsigned long start, +- unsigned long end, int advice, unsigned long *vm_flags) ++int ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma, ++ unsigned long *vm_flags) + { +- struct mm_struct *mm = vma->vm_mm; + int err; + +- switch (advice) { +- case MADV_MERGEABLE: +- /* +- * Be somewhat over-protective for now! 
+- */ +- if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | +- VM_PFNMAP | VM_IO | VM_DONTEXPAND | +- VM_HUGETLB | VM_MIXEDMAP)) +- return 0; /* just ignore the advice */ ++ /* ++ * Be somewhat over-protective for now! ++ */ ++ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | ++ VM_PFNMAP | VM_IO | VM_DONTEXPAND | ++ VM_HUGETLB | VM_MIXEDMAP)) ++ return 0; /* just ignore the advice */ + +- if (vma_is_dax(vma)) +- return 0; ++ if (vma_is_dax(vma)) ++ return 0; + + #ifdef VM_SAO + if (*vm_flags & VM_SAO) + return 0; + #endif + #ifdef VM_SPARC_ADI +- if (*vm_flags & VM_SPARC_ADI) +- return 0; ++ if (*vm_flags & VM_SPARC_ADI) ++ return 0; + #endif + +- if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { +- err = __ksm_enter(mm); +- if (err) +- return err; +- } ++ if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { ++ err = __ksm_enter(mm); ++ if (err) ++ return err; ++ } + +- *vm_flags |= VM_MERGEABLE; +- break; ++ *vm_flags |= VM_MERGEABLE; + +- case MADV_UNMERGEABLE: +- if (!(*vm_flags & VM_MERGEABLE)) +- return 0; /* just ignore the advice */ ++ return 0; ++} + +- if (vma->anon_vma) { +- err = unmerge_ksm_pages(vma, start, end); +- if (err) +- return err; +- } ++int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start, ++ unsigned long end, unsigned long *vm_flags) ++{ ++ int err; ++ ++ if (!(*vm_flags & VM_MERGEABLE)) ++ return 0; /* just ignore the advice */ ++ ++ if (vma->anon_vma) { ++ err = unmerge_ksm_pages(vma, start, end); ++ if (err) ++ return err; ++ } + +- *vm_flags &= ~VM_MERGEABLE; ++ *vm_flags &= ~VM_MERGEABLE; ++ ++ return 0; ++} ++ ++int ksm_madvise(struct vm_area_struct *vma, unsigned long start, ++ unsigned long end, int advice, unsigned long *vm_flags) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ int err; ++ ++ switch (advice) { ++ case MADV_MERGEABLE: ++ err = ksm_madvise_merge(mm, vma, vm_flags); ++ if (err) ++ return err; ++ break; ++ ++ case MADV_UNMERGEABLE: ++ err = ksm_madvise_unmerge(vma, start, end, vm_flags); ++ if (err) ++ return err; + break; + } + +diff --git a/mm/madvise.c b/mm/madvise.c +index b6ea204d4e23..0064dcafb812 100644 +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -1527,3 +1527,116 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, + out: + return ret; + } ++ ++SYSCALL_DEFINE3(pmadv_ksm, int, pidfd, int, behaviour, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ ssize_t ret; ++ struct pid *pid; ++ struct task_struct *task; ++ struct mm_struct *mm; ++ unsigned int f_flags; ++ struct vm_area_struct *vma; ++ struct vma_iterator vmi; ++ ++ if (flags != 0) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ switch (behaviour) { ++ case MADV_MERGEABLE: ++ case MADV_UNMERGEABLE: ++ break; ++ default: ++ ret = -EINVAL; ++ goto out; ++ break; ++ } ++ ++ pid = pidfd_get_pid(pidfd, &f_flags); ++ if (IS_ERR(pid)) { ++ ret = PTR_ERR(pid); ++ goto out; ++ } ++ ++ task = get_pid_task(pid, PIDTYPE_PID); ++ if (!task) { ++ ret = -ESRCH; ++ goto put_pid; ++ } ++ ++ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ ++ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); ++ if (IS_ERR_OR_NULL(mm)) { ++ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; ++ goto release_task; ++ } ++ ++ /* Require CAP_SYS_NICE for influencing process performance. 
*/ ++ if (!capable(CAP_SYS_NICE)) { ++ ret = -EPERM; ++ goto release_mm; ++ } ++ ++ if (mmap_write_lock_killable(mm)) { ++ ret = -EINTR; ++ goto release_mm; ++ } ++ ++ vma_iter_init(&vmi, mm, 0); ++ for_each_vma(vmi, vma) { ++ switch (behaviour) { ++ case MADV_MERGEABLE: ++ ret = ksm_madvise_merge(vma->vm_mm, vma, &vma->vm_flags); ++ break; ++ case MADV_UNMERGEABLE: ++ ret = ksm_madvise_unmerge(vma, vma->vm_start, vma->vm_end, &vma->vm_flags); ++ break; ++ default: ++ /* look, ma, no brain */ ++ break; ++ } ++ if (ret) ++ break; ++ } ++ ++ mmap_write_unlock(mm); ++ ++release_mm: ++ mmput(mm); ++release_task: ++ put_task_struct(task); ++put_pid: ++ put_pid(pid); ++out: ++ return ret; ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++#ifdef CONFIG_KSM ++static ssize_t ksm_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_pmadv_ksm); ++} ++static struct kobj_attribute pmadv_ksm_attr = __ATTR_RO(ksm); ++ ++static struct attribute *pmadv_sysfs_attrs[] = { ++ &pmadv_ksm_attr.attr, ++ NULL, ++}; ++ ++static const struct attribute_group pmadv_sysfs_attr_group = { ++ .attrs = pmadv_sysfs_attrs, ++ .name = "pmadv", ++}; ++ ++static int __init pmadv_sysfs_init(void) ++{ ++ return sysfs_create_group(kernel_kobj, &pmadv_sysfs_attr_group); ++} ++subsys_initcall(pmadv_sysfs_init); ++#endif /* CONFIG_KSM */ +-- +2.39.2 + +From 3fcdb0864bf3a1d90f3689ffa8acceec00a5926e Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 13 Feb 2023 09:25:47 +0100 +Subject: [PATCH 10/15] maple-lru + +Signed-off-by: Peter Jung +--- + Documentation/mm/multigen_lru.rst | 86 ++- + include/linux/fs.h | 2 + + include/linux/maple_tree.h | 6 - + include/linux/memcontrol.h | 10 + + include/linux/mm_inline.h | 19 +- + include/linux/mmzone.h | 122 +++- + lib/maple_tree.c | 113 ++- + mm/fadvise.c | 5 +- + mm/memcontrol.c | 12 + + mm/memory.c | 7 +- + mm/page_alloc.c | 1 + + mm/rmap.c | 42 +- + mm/vmscan.c | 1059 ++++++++++++++++++----------- + mm/workingset.c | 4 +- + tools/testing/radix-tree/maple.c | 18 +- + 15 files changed, 1002 insertions(+), 504 deletions(-) + +diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst +index d7062c6a8946..5f1f6ecbb79b 100644 +--- a/Documentation/mm/multigen_lru.rst ++++ b/Documentation/mm/multigen_lru.rst +@@ -89,15 +89,15 @@ variables are monotonically increasing. + + Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)`` + bits in order to fit into the gen counter in ``folio->flags``. Each +-truncated generation number is an index to ``lrugen->lists[]``. The ++truncated generation number is an index to ``lrugen->folios[]``. The + sliding window technique is used to track at least ``MIN_NR_GENS`` and + at most ``MAX_NR_GENS`` generations. The gen counter stores a value + within ``[1, MAX_NR_GENS]`` while a page is on one of +-``lrugen->lists[]``; otherwise it stores zero. ++``lrugen->folios[]``; otherwise it stores zero. + + Each generation is divided into multiple tiers. A page accessed ``N`` + times through file descriptors is in tier ``order_base_2(N)``. Unlike +-generations, tiers do not have dedicated ``lrugen->lists[]``. In ++generations, tiers do not have dedicated ``lrugen->folios[]``. In + contrast to moving across generations, which requires the LRU lock, + moving across tiers only involves atomic operations on + ``folio->flags`` and therefore has a negligible cost. A feedback loop +@@ -127,7 +127,7 @@ page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``. 
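To make the indexing arithmetic described earlier in this section concrete, the following is a small stand-alone C sketch of how a sequence number is truncated into a ``lrugen->folios[]`` index and how an access count maps to a tier. It is illustrative only: the values of ``MAX_NR_GENS`` and ``MAX_NR_TIERS`` are assumptions, ``order_base_2()`` is re-implemented naively, and the helpers are merely named after the kernel functions whose behaviour they mimic.

/*
 * Stand-alone sketch of the gen/tier indexing described above.
 * Assumed constants and naive helpers; not code from the patched headers.
 */
#include <stdio.h>

#define MAX_NR_GENS	4	/* assumed size of the generation window */
#define MAX_NR_TIERS	4	/* assumed number of tiers */

/* A truncated generation number is an index into lrugen->folios[]. */
static unsigned int lru_gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;
}

/* Smallest order such that (1 << order) >= n; 0 for n <= 1. */
static unsigned int order_base_2(unsigned int n)
{
	unsigned int order = 0;

	while ((1u << order) < n)
		order++;
	return order;
}

/* A page accessed N times through file descriptors is in tier order_base_2(N). */
static unsigned int tier_from_refs(unsigned int refs)
{
	unsigned int tier = order_base_2(refs);

	return tier < MAX_NR_TIERS ? tier : MAX_NR_TIERS - 1;
}

int main(void)
{
	unsigned long seq;
	unsigned int refs;

	for (seq = 7; seq < 11; seq++)
		printf("seq %lu -> folios[%u]\n", seq, lru_gen_from_seq(seq));

	for (refs = 1; refs <= 8; refs++)
		printf("%u reads -> tier %u\n", refs, tier_from_refs(refs));

	return 0;
}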
+ Eviction + -------- + The eviction consumes old generations. Given an ``lruvec``, it +-increments ``min_seq`` when ``lrugen->lists[]`` indexed by ++increments ``min_seq`` when ``lrugen->folios[]`` indexed by + ``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to + evict from, it first compares ``min_seq[]`` to select the older type. + If both types are equally old, it selects the one whose first tier has +@@ -141,9 +141,85 @@ loop has detected outlying refaults from the tier this page is in. To + this end, the feedback loop uses the first tier as the baseline, for + the reason stated earlier. + ++Working set protection ++---------------------- ++Each generation is timestamped at birth. If ``lru_gen_min_ttl`` is ++set, an ``lruvec`` is protected from the eviction when its oldest ++generation was born within ``lru_gen_min_ttl`` milliseconds. In other ++words, it prevents the working set of ``lru_gen_min_ttl`` milliseconds ++from getting evicted. The OOM killer is triggered if this working set ++cannot be kept in memory. ++ ++This time-based approach has the following advantages: ++ ++1. It is easier to configure because it is agnostic to applications ++ and memory sizes. ++2. It is more reliable because it is directly wired to the OOM killer. ++ ++Rmap/PT walk feedback ++--------------------- ++Searching the rmap for PTEs mapping each page on an LRU list (to test ++and clear the accessed bit) can be expensive because pages from ++different VMAs (PA space) are not cache friendly to the rmap (VA ++space). For workloads mostly using mapped pages, searching the rmap ++can incur the highest CPU cost in the reclaim path. ++ ++``lru_gen_look_around()`` exploits spatial locality to reduce the ++trips into the rmap. It scans the adjacent PTEs of a young PTE and ++promotes hot pages. If the scan was done cacheline efficiently, it ++adds the PMD entry pointing to the PTE table to the Bloom filter. This ++forms a feedback loop between the eviction and the aging. ++ ++Bloom Filters ++------------- ++Bloom filters are a space and memory efficient data structure for set ++membership test, i.e., test if an element is not in the set or may be ++in the set. ++ ++In the eviction path, specifically, in ``lru_gen_look_around()``, if a ++PMD has a sufficient number of hot pages, its address is placed in the ++filter. In the aging path, set membership means that the PTE range ++will be scanned for young pages. ++ ++Note that Bloom filters are probabilistic on set membership. If a test ++is false positive, the cost is an additional scan of a range of PTEs, ++which may yield hot pages anyway. Parameters of the filter itself can ++control the false positive rate in the limit. ++ ++Memcg LRU ++--------- ++An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs, ++since each node and memcg combination has an LRU of folios (see ++``mem_cgroup_lruvec()``). Its goal is to improve the scalability of ++global reclaim, which is critical to system-wide memory overcommit in ++data centers. Note that memcg LRU only applies to global reclaim. ++ ++The basic structure of an memcg LRU can be understood by an analogy to ++the active/inactive LRU (of folios): ++ ++1. It has the young and the old (generations), i.e., the counterparts ++ to the active and the inactive; ++2. The increment of ``max_seq`` triggers promotion, i.e., the ++ counterpart to activation; ++3. Other events trigger similar operations, e.g., offlining an memcg ++ triggers demotion, i.e., the counterpart to deactivation. 
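The following user-space C sketch models the old/young bookkeeping in the analogy above. The ``MEMCG_NR_GENS`` and ``MEMCG_NR_BINS`` values come from the definitions added later in this patch; everything else (the struct, the plain array standing in for the ``hlist_nulls`` lists, and the helper names) is a hypothetical illustration rather than the kernel implementation.

/*
 * Illustrative model only: an onlining memcg is placed into a random bin of
 * the old generation, and incrementing the per-node counter flips which
 * generation is "old". The array is a stand-in for hlist_nulls lists.
 */
#include <stdio.h>
#include <stdlib.h>

#define MEMCG_NR_GENS	2
#define MEMCG_NR_BINS	8

struct memcg_lru_model {
	unsigned long seq;			/* per-node memcg generation counter */
	unsigned long nr_memcgs[MEMCG_NR_GENS];
	int fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];	/* last memcg id added per bin */
};

/* The counter's remainder (mod MEMCG_NR_GENS) indexes the old generation. */
static int get_memcg_gen(unsigned long seq)
{
	return seq % MEMCG_NR_GENS;
}

/* Onlining adds a memcg to the tail of a random bin in the old generation. */
static void model_online_memcg(struct memcg_lru_model *lru, int memcg_id)
{
	int gen = get_memcg_gen(lru->seq);
	int bin = rand() % MEMCG_NR_BINS;

	lru->fifo[gen][bin] = memcg_id;		/* the real code appends to an hlist_nulls */
	lru->nr_memcgs[gen]++;
	printf("memcg %d -> generation %d (old), bin %d\n", memcg_id, gen, bin);
}

int main(void)
{
	struct memcg_lru_model lru = { .seq = 0 };

	model_online_memcg(&lru, 1);
	model_online_memcg(&lru, 2);

	/* Once every bin of the old generation is drained, seq is incremented
	 * and the roles of the two generations swap. */
	lru.seq++;
	model_online_memcg(&lru, 3);

	return 0;
}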
++ ++In terms of global reclaim, it has two distinct features: ++ ++1. Sharding, which allows each thread to start at a random memcg (in ++ the old generation) and improves parallelism; ++2. Eventual fairness, which allows direct reclaim to bail out at will ++ and reduces latency without affecting fairness over some time. ++ ++In terms of traversing memcgs during global reclaim, it improves the ++best-case complexity from O(n) to O(1) and does not affect the ++worst-case complexity O(n). Therefore, on average, it has a sublinear ++complexity. ++ + Summary + ------- +-The multi-gen LRU can be disassembled into the following parts: ++The multi-gen LRU (of folios) can be disassembled into the following ++parts: + + * Generations + * Rmap walks +diff --git a/include/linux/fs.h b/include/linux/fs.h +index c1769a2c5d70..d353c262d669 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -166,6 +166,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, + /* File supports DIRECT IO */ + #define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000) + ++#define FMODE_NOREUSE ((__force fmode_t)0x800000) ++ + /* File was opened by fanotify and shouldn't generate fanotify events */ + #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) + +diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h +index e594db58a0f1..815a27661517 100644 +--- a/include/linux/maple_tree.h ++++ b/include/linux/maple_tree.h +@@ -12,7 +12,6 @@ + #include + #include + /* #define CONFIG_MAPLE_RCU_DISABLED */ +-/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */ + + /* + * Allocated nodes are mutable until they have been inserted into the tree, +@@ -483,9 +482,6 @@ static inline bool mas_is_paused(struct ma_state *mas) + return mas->node == MAS_PAUSE; + } + +-void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas); +-void mas_dup_store(struct ma_state *mas, void *entry); +- + /* + * This finds an empty area from the highest address to the lowest. + * AKA "Topdown" version, +@@ -517,7 +513,6 @@ static inline void mas_reset(struct ma_state *mas) + * entry. + * + * Note: may return the zero entry. 
+- * + */ + #define mas_for_each(__mas, __entry, __max) \ + while (((__entry) = mas_find((__mas), (__max))) != NULL) +@@ -639,7 +634,6 @@ static inline void mt_set_in_rcu(struct maple_tree *mt) + } + + static inline unsigned int mt_height(const struct maple_tree *mt) +- + { + return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; + } +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index 85dc9b88ea37..8e0be0680005 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -794,6 +794,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) + percpu_ref_put(&objcg->refcnt); + } + ++static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) ++{ ++ return !memcg || css_tryget(&memcg->css); ++} ++ + static inline void mem_cgroup_put(struct mem_cgroup *memcg) + { + if (memcg) +@@ -1301,6 +1306,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) + { + } + ++static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) ++{ ++ return true; ++} ++ + static inline void mem_cgroup_put(struct mem_cgroup *memcg) + { + } +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index ff3f3f23f649..de1e622dd366 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -178,7 +178,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli + int zone = folio_zonenum(folio); + int delta = folio_nr_pages(folio); + enum lru_list lru = type * LRU_INACTIVE_FILE; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); +@@ -224,7 +224,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, + int gen = folio_lru_gen(folio); + int type = folio_is_file_lru(folio); + int zone = folio_zonenum(folio); +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); + +@@ -256,9 +256,9 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, + lru_gen_update_size(lruvec, folio, -1, gen); + /* for folio_rotate_reclaimable() */ + if (reclaiming) +- list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]); ++ list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); + else +- list_add(&folio->lru, &lrugen->lists[gen][type][zone]); ++ list_add(&folio->lru, &lrugen->folios[gen][type][zone]); + + return true; + } +@@ -577,4 +577,15 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, + #endif + } + ++static inline bool vma_has_recency(struct vm_area_struct *vma) ++{ ++ if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) ++ return false; ++ ++ if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) ++ return false; ++ ++ return true; ++} ++ + #endif +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index cd28a100d9e4..977be526c939 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -7,6 +7,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -312,7 +313,7 @@ enum lruvec_flags { + * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An + * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the + * corresponding generation. The gen counter in folio->flags stores gen+1 while +- * a page is on one of lrugen->lists[]. Otherwise it stores 0. 
++ * a page is on one of lrugen->folios[]. Otherwise it stores 0. + * + * A page is added to the youngest generation on faulting. The aging needs to + * check the accessed bit at least twice before handing this page over to the +@@ -324,8 +325,8 @@ enum lruvec_flags { + * rest of generations, if they exist, are considered inactive. See + * lru_gen_is_active(). + * +- * PG_active is always cleared while a page is on one of lrugen->lists[] so that +- * the aging needs not to worry about it. And it's set again when a page ++ * PG_active is always cleared while a page is on one of lrugen->folios[] so ++ * that the aging needs not to worry about it. And it's set again when a page + * considered active is isolated for non-reclaiming purposes, e.g., migration. + * See lru_gen_add_folio() and lru_gen_del_folio(). + * +@@ -404,7 +405,7 @@ enum { + * The number of pages in each generation is eventually consistent and therefore + * can be transiently negative when reset_batch_size() is pending. + */ +-struct lru_gen_struct { ++struct lru_gen_folio { + /* the aging increments the youngest generation number */ + unsigned long max_seq; + /* the eviction increments the oldest generation numbers */ +@@ -412,7 +413,7 @@ struct lru_gen_struct { + /* the birth time of each generation in jiffies */ + unsigned long timestamps[MAX_NR_GENS]; + /* the multi-gen LRU lists, lazily sorted on eviction */ +- struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the multi-gen LRU sizes, eventually consistent */ + long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the exponential moving average of refaulted */ +@@ -426,6 +427,14 @@ struct lru_gen_struct { + atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + /* whether the multi-gen LRU is enabled */ + bool enabled; ++#ifdef CONFIG_MEMCG ++ /* the memcg generation this lru_gen_folio belongs to */ ++ u8 gen; ++ /* the list segment this lru_gen_folio belongs to */ ++ u8 seg; ++ /* per-node lru_gen_folio list for global reclaim */ ++ struct hlist_nulls_node list; ++#endif + }; + + enum { +@@ -461,7 +470,7 @@ struct lru_gen_mm_state { + struct lru_gen_mm_walk { + /* the lruvec under reclaim */ + struct lruvec *lruvec; +- /* unstable max_seq from lru_gen_struct */ ++ /* unstable max_seq from lru_gen_folio */ + unsigned long max_seq; + /* the next address within an mm to scan */ + unsigned long next_addr; +@@ -479,12 +488,87 @@ void lru_gen_init_lruvec(struct lruvec *lruvec); + void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); + + #ifdef CONFIG_MEMCG ++ ++/* ++ * For each node, memcgs are divided into two generations: the old and the ++ * young. For each generation, memcgs are randomly sharded into multiple bins ++ * to improve scalability. For each bin, the hlist_nulls is virtually divided ++ * into three segments: the head, the tail and the default. ++ * ++ * An onlining memcg is added to the tail of a random bin in the old generation. ++ * The eviction starts at the head of a random bin in the old generation. The ++ * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes ++ * the old generation, is incremented when all its bins become empty. ++ * ++ * There are four operations: ++ * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its ++ * current generation (old or young) and updates its "seg" to "head"; ++ * 2. 
MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its ++ * current generation (old or young) and updates its "seg" to "tail"; ++ * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old ++ * generation, updates its "gen" to "old" and resets its "seg" to "default"; ++ * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the ++ * young generation, updates its "gen" to "young" and resets its "seg" to ++ * "default". ++ * ++ * The events that trigger the above operations are: ++ * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; ++ * 2. The first attempt to reclaim an memcg below low, which triggers ++ * MEMCG_LRU_TAIL; ++ * 3. The first attempt to reclaim an memcg below reclaimable size threshold, ++ * which triggers MEMCG_LRU_TAIL; ++ * 4. The second attempt to reclaim an memcg below reclaimable size threshold, ++ * which triggers MEMCG_LRU_YOUNG; ++ * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; ++ * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; ++ * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. ++ * ++ * Note that memcg LRU only applies to global reclaim, and the round-robin ++ * incrementing of their max_seq counters ensures the eventual fairness to all ++ * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). ++ */ ++#define MEMCG_NR_GENS 2 ++#define MEMCG_NR_BINS 8 ++ ++struct lru_gen_memcg { ++ /* the per-node memcg generation counter */ ++ unsigned long seq; ++ /* each memcg has one lru_gen_folio per node */ ++ unsigned long nr_memcgs[MEMCG_NR_GENS]; ++ /* per-node lru_gen_folio list for global reclaim */ ++ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; ++ /* protects the above */ ++ spinlock_t lock; ++}; ++ ++void lru_gen_init_pgdat(struct pglist_data *pgdat); ++ + void lru_gen_init_memcg(struct mem_cgroup *memcg); + void lru_gen_exit_memcg(struct mem_cgroup *memcg); +-#endif ++void lru_gen_online_memcg(struct mem_cgroup *memcg); ++void lru_gen_offline_memcg(struct mem_cgroup *memcg); ++void lru_gen_release_memcg(struct mem_cgroup *memcg); ++void lru_gen_soft_reclaim(struct lruvec *lruvec); ++ ++#else /* !CONFIG_MEMCG */ ++ ++#define MEMCG_NR_GENS 1 ++ ++struct lru_gen_memcg { ++}; ++ ++static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) ++{ ++} ++ ++#endif /* CONFIG_MEMCG */ + + #else /* !CONFIG_LRU_GEN */ + ++static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) ++{ ++} ++ + static inline void lru_gen_init_lruvec(struct lruvec *lruvec) + { + } +@@ -494,6 +578,7 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + } + + #ifdef CONFIG_MEMCG ++ + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) + { + } +@@ -501,7 +586,24 @@ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) + static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) + { + } +-#endif ++ ++static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) ++{ ++} ++ ++static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) ++{ ++} ++ ++static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) ++{ ++} ++ ++static inline void lru_gen_soft_reclaim(struct lruvec *lruvec) ++{ ++} ++ ++#endif /* CONFIG_MEMCG */ + + #endif /* CONFIG_LRU_GEN */ + +@@ -524,7 +626,7 @@ struct lruvec { + unsigned long flags; + #ifdef CONFIG_LRU_GEN + /* evictable pages divided into generations */ +- struct lru_gen_struct lrugen; ++ struct lru_gen_folio lrugen; + /* to 
concurrently iterate lru_gen_mm_list */ + struct lru_gen_mm_state mm_state; + #endif +@@ -1243,6 +1345,8 @@ typedef struct pglist_data { + #ifdef CONFIG_LRU_GEN + /* kswap mm walk data */ + struct lru_gen_mm_walk mm_walk; ++ /* lru_gen_folio list */ ++ struct lru_gen_memcg memcg_lru; + #endif + + CACHELINE_PADDING(_pad2_); +diff --git a/lib/maple_tree.c b/lib/maple_tree.c +index 5a976393c9ae..b95652b79b55 100644 +--- a/lib/maple_tree.c ++++ b/lib/maple_tree.c +@@ -149,13 +149,12 @@ struct maple_subtree_state { + /* Functions */ + static inline struct maple_node *mt_alloc_one(gfp_t gfp) + { +- return kmem_cache_alloc(maple_node_cache, gfp | __GFP_ZERO); ++ return kmem_cache_alloc(maple_node_cache, gfp); + } + + static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) + { +- return kmem_cache_alloc_bulk(maple_node_cache, gfp | __GFP_ZERO, size, +- nodes); ++ return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); + } + + static inline void mt_free_bulk(size_t size, void __rcu **nodes) +@@ -183,7 +182,6 @@ static void ma_free_rcu(struct maple_node *node) + call_rcu(&node->rcu, mt_free_rcu); + } + +- + static void mas_set_height(struct ma_state *mas) + { + unsigned int new_flags = mas->tree->ma_flags; +@@ -468,7 +466,7 @@ static inline + void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, + unsigned char slot) + { +- unsigned long val = (unsigned long) parent; ++ unsigned long val = (unsigned long)parent; + unsigned long shift; + unsigned long type; + enum maple_type p_type = mte_node_type(parent); +@@ -502,10 +500,9 @@ void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, + */ + static inline unsigned int mte_parent_slot(const struct maple_enode *enode) + { +- unsigned long val = (unsigned long) mte_to_node(enode)->parent; ++ unsigned long val = (unsigned long)mte_to_node(enode)->parent; + +- /* Root. */ +- if (val & 1) ++ if (val & MA_ROOT_PARENT) + return 0; + + /* +@@ -1128,9 +1125,10 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) + { + struct maple_alloc *ret, *node = mas->alloc; + unsigned long total = mas_allocated(mas); ++ unsigned int req = mas_alloc_req(mas); + + /* nothing or a request pending. */ +- if (unlikely(!total)) ++ if (WARN_ON(!total)) + return NULL; + + if (total == 1) { +@@ -1140,27 +1138,25 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) + goto single_node; + } + +- if (!node->node_count) { ++ if (node->node_count == 1) { + /* Single allocation in this node. 
*/ + mas->alloc = node->slot[0]; +- node->slot[0] = NULL; + mas->alloc->total = node->total - 1; + ret = node; + goto new_head; + } +- + node->total--; +- ret = node->slot[node->node_count]; +- node->slot[node->node_count--] = NULL; ++ ret = node->slot[--node->node_count]; ++ node->slot[node->node_count] = NULL; + + single_node: + new_head: +- ret->total = 0; +- ret->node_count = 0; +- if (ret->request_count) { +- mas_set_alloc_req(mas, ret->request_count + 1); +- ret->request_count = 0; ++ if (req) { ++ req++; ++ mas_set_alloc_req(mas, req); + } ++ ++ memset(ret, 0, sizeof(*ret)); + return (struct maple_node *)ret; + } + +@@ -1179,21 +1175,20 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) + unsigned long count; + unsigned int requested = mas_alloc_req(mas); + +- memset(reuse, 0, sizeof(*reuse)); + count = mas_allocated(mas); + +- if (count && (head->node_count < MAPLE_ALLOC_SLOTS - 1)) { +- if (head->slot[0]) +- head->node_count++; +- head->slot[head->node_count] = reuse; ++ reuse->request_count = 0; ++ reuse->node_count = 0; ++ if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) { ++ head->slot[head->node_count++] = reuse; + head->total++; + goto done; + } + + reuse->total = 1; + if ((head) && !((unsigned long)head & 0x1)) { +- head->request_count = 0; + reuse->slot[0] = head; ++ reuse->node_count = 1; + reuse->total += head->total; + } + +@@ -1212,7 +1207,6 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) + { + struct maple_alloc *node; + unsigned long allocated = mas_allocated(mas); +- unsigned long success = allocated; + unsigned int requested = mas_alloc_req(mas); + unsigned int count; + void **slots = NULL; +@@ -1228,24 +1222,29 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) + WARN_ON(!allocated); + } + +- if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS - 1) { ++ if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { + node = (struct maple_alloc *)mt_alloc_one(gfp); + if (!node) + goto nomem_one; + +- if (allocated) ++ if (allocated) { + node->slot[0] = mas->alloc; ++ node->node_count = 1; ++ } else { ++ node->node_count = 0; ++ } + +- success++; + mas->alloc = node; ++ node->total = ++allocated; + requested--; + } + + node = mas->alloc; ++ node->request_count = 0; + while (requested) { + max_req = MAPLE_ALLOC_SLOTS; +- if (node->slot[0]) { +- unsigned int offset = node->node_count + 1; ++ if (node->node_count) { ++ unsigned int offset = node->node_count; + + slots = (void **)&node->slot[offset]; + max_req -= offset; +@@ -1259,15 +1258,13 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) + goto nomem_bulk; + + node->node_count += count; +- /* zero indexed. 
*/ +- if (slots == (void **)&node->slot) +- node->node_count--; +- +- success += count; ++ allocated += count; + node = node->slot[0]; ++ node->node_count = 0; ++ node->request_count = 0; + requested -= count; + } +- mas->alloc->total = success; ++ mas->alloc->total = allocated; + return; + + nomem_bulk: +@@ -1276,10 +1273,8 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) + nomem_one: + mas_set_alloc_req(mas, requested); + if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) +- mas->alloc->total = success; ++ mas->alloc->total = allocated; + mas_set_err(mas, -ENOMEM); +- return; +- + } + + /* +@@ -1887,10 +1882,9 @@ static inline int mab_calc_split(struct ma_state *mas, + + /* Avoid ending a node on a NULL entry */ + split = mab_no_null_split(bn, split, slot_count); +- if (!(*mid_split)) +- return split; + +- *mid_split = mab_no_null_split(bn, *mid_split, slot_count); ++ if (unlikely(*mid_split)) ++ *mid_split = mab_no_null_split(bn, *mid_split, slot_count); + + return split; + } +@@ -2947,7 +2941,7 @@ static inline void *mtree_range_walk(struct ma_state *mas) + mas->min = prev_min; + mas->max = prev_max; + mas->node = last; +- return (void *) next; ++ return (void *)next; + + dead_node: + mas_reset(mas); +@@ -3467,7 +3461,6 @@ static inline bool mas_push_data(struct ma_state *mas, int height, + */ + static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) + { +- + struct maple_subtree_state mast; + int height = 0; + unsigned char mid_split, split = 0; +@@ -3893,7 +3886,7 @@ static inline void *mtree_lookup_walk(struct ma_state *mas) + goto dead_node; + } while (!ma_is_leaf(type)); + +- return (void *) next; ++ return (void *)next; + + dead_node: + mas_reset(mas); +@@ -4711,15 +4704,11 @@ static inline void *mas_next_nentry(struct ma_state *mas, + + static inline void mas_rewalk(struct ma_state *mas, unsigned long index) + { +- + retry: + mas_set(mas, index); + mas_state_walk(mas); + if (mas_is_start(mas)) + goto retry; +- +- return; +- + } + + /* +@@ -5590,8 +5579,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, + + /* + * mte_destroy_walk() - Free a tree or sub-tree. +- * @enode - the encoded maple node (maple_enode) to start +- * @mn - the tree to free - needed for node types. ++ * @enode: the encoded maple node (maple_enode) to start ++ * @mt: the tree to free - needed for node types. + * + * Must hold the write lock. 
+ */ +@@ -5620,7 +5609,6 @@ static void mas_wr_store_setup(struct ma_wr_state *wr_mas) + mas_reset(wr_mas->mas); + } + } +- + } + + /* Interface */ +@@ -5745,6 +5733,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) + void mas_destroy(struct ma_state *mas) + { + struct maple_alloc *node; ++ unsigned long total; + + /* + * When using mas_for_each() to insert an expected number of elements, +@@ -5767,14 +5756,20 @@ void mas_destroy(struct ma_state *mas) + } + mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); + +- while (mas->alloc && !((unsigned long)mas->alloc & 0x1)) { ++ total = mas_allocated(mas); ++ while (total) { + node = mas->alloc; + mas->alloc = node->slot[0]; +- if (node->node_count > 0) +- mt_free_bulk(node->node_count, +- (void __rcu **)&node->slot[1]); ++ if (node->node_count > 1) { ++ size_t count = node->node_count - 1; ++ ++ mt_free_bulk(count, (void __rcu **)&node->slot[1]); ++ total -= count; ++ } + kmem_cache_free(maple_node_cache, node); ++ total--; + } ++ + mas->alloc = NULL; + } + EXPORT_SYMBOL_GPL(mas_destroy); +@@ -6734,7 +6729,7 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, + + if (i < (MAPLE_RANGE64_SLOTS - 1)) + last = node->pivot[i]; +- else if (!node->slot[i] && max != mt_max[mte_node_type(entry)]) ++ else if (!node->slot[i] && max != mt_node_max(entry)) + break; + if (last == 0 && i > 0) + break; +@@ -6841,7 +6836,7 @@ void mt_dump(const struct maple_tree *mt) + if (!xa_is_node(entry)) + mt_dump_entry(entry, 0, 0, 0); + else if (entry) +- mt_dump_node(mt, entry, 0, mt_max[mte_node_type(entry)], 0); ++ mt_dump_node(mt, entry, 0, mt_node_max(entry), 0); + } + EXPORT_SYMBOL_GPL(mt_dump); + +diff --git a/mm/fadvise.c b/mm/fadvise.c +index bf04fec87f35..fb7c5f43fd2a 100644 +--- a/mm/fadvise.c ++++ b/mm/fadvise.c +@@ -80,7 +80,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) + case POSIX_FADV_NORMAL: + file->f_ra.ra_pages = bdi->ra_pages; + spin_lock(&file->f_lock); +- file->f_mode &= ~FMODE_RANDOM; ++ file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE); + spin_unlock(&file->f_lock); + break; + case POSIX_FADV_RANDOM: +@@ -107,6 +107,9 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) + force_page_cache_readahead(mapping, file, start_index, nrpages); + break; + case POSIX_FADV_NOREUSE: ++ spin_lock(&file->f_lock); ++ file->f_mode |= FMODE_NOREUSE; ++ spin_unlock(&file->f_lock); + break; + case POSIX_FADV_DONTNEED: + __filemap_fdatawrite_range(mapping, offset, endbyte, +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 73afff8062f9..7fe2f4f36cf4 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -477,6 +477,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) + struct mem_cgroup_per_node *mz; + struct mem_cgroup_tree_per_node *mctz; + ++ if (lru_gen_enabled()) { ++ if (soft_limit_excess(memcg)) ++ lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec); ++ return; ++ } ++ + mctz = soft_limit_tree.rb_tree_per_node[nid]; + if (!mctz) + return; +@@ -3526,6 +3532,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, + struct mem_cgroup_tree_per_node *mctz; + unsigned long excess; + ++ if (lru_gen_enabled()) ++ return 0; ++ + if (order > 0) + return 0; + +@@ -5382,6 +5391,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) + if (unlikely(mem_cgroup_is_root(memcg))) + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, + 2UL*HZ); ++ lru_gen_online_memcg(memcg); + return 0; + 
offline_kmem: + memcg_offline_kmem(memcg); +@@ -5413,6 +5423,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) + memcg_offline_kmem(memcg); + reparent_shrinker_deferred(memcg); + wb_memcg_offline(memcg); ++ lru_gen_offline_memcg(memcg); + + drain_all_stock(memcg); + +@@ -5424,6 +5435,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + invalidate_reclaim_iterators(memcg); ++ lru_gen_release_memcg(memcg); + } + + static void mem_cgroup_css_free(struct cgroup_subsys_state *css) +diff --git a/mm/memory.c b/mm/memory.c +index f526b9152bef..4ad62eba3cb7 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1392,8 +1392,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, + force_flush = 1; + } + } +- if (pte_young(ptent) && +- likely(!(vma->vm_flags & VM_SEQ_READ))) ++ if (pte_young(ptent) && likely(vma_has_recency(vma))) + mark_page_accessed(page); + } + rss[mm_counter(page)]--; +@@ -5140,8 +5139,8 @@ static inline void mm_account_fault(struct pt_regs *regs, + #ifdef CONFIG_LRU_GEN + static void lru_gen_enter_fault(struct vm_area_struct *vma) + { +- /* the LRU algorithm doesn't apply to sequential or random reads */ +- current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)); ++ /* the LRU algorithm only applies to accesses with recency */ ++ current->in_lru_fault = vma_has_recency(vma); + } + + static void lru_gen_exit_fault(void) +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 3aec9a6a9cb7..6658cbf43f5d 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -7943,6 +7943,7 @@ static void __init free_area_init_node(int nid) + pgdat_set_deferred_range(pgdat); + + free_area_init_core(pgdat); ++ lru_gen_init_pgdat(pgdat); + } + + static void __init free_area_init_memoryless_node(int nid) +diff --git a/mm/rmap.c b/mm/rmap.c +index b616870a09be..7b9205cb7d87 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -823,25 +823,14 @@ static bool folio_referenced_one(struct folio *folio, + } + + if (pvmw.pte) { +- if (lru_gen_enabled() && pte_young(*pvmw.pte) && +- !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) { ++ if (lru_gen_enabled() && pte_young(*pvmw.pte)) { + lru_gen_look_around(&pvmw); + referenced++; + } + + if (ptep_clear_flush_young_notify(vma, address, +- pvmw.pte)) { +- /* +- * Don't treat a reference through +- * a sequentially read mapping as such. +- * If the folio has been used in another mapping, +- * we will catch it; if this other mapping is +- * already gone, the unmap path will have set +- * the referenced flag or activated the folio. +- */ +- if (likely(!(vma->vm_flags & VM_SEQ_READ))) +- referenced++; +- } ++ pvmw.pte)) ++ referenced++; + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (pmdp_clear_flush_young_notify(vma, address, + pvmw.pmd)) +@@ -875,7 +864,20 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) + struct folio_referenced_arg *pra = arg; + struct mem_cgroup *memcg = pra->memcg; + +- if (!mm_match_cgroup(vma->vm_mm, memcg)) ++ /* ++ * Ignore references from this mapping if it has no recency. If the ++ * folio has been used in another mapping, we will catch it; if this ++ * other mapping is already gone, the unmap path will have set the ++ * referenced flag or activated the folio in zap_pte_range(). ++ */ ++ if (!vma_has_recency(vma)) ++ return true; ++ ++ /* ++ * If we are reclaiming on behalf of a cgroup, skip counting on behalf ++ * of references from different cgroups. 
++ */ ++ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) + return true; + + return false; +@@ -906,6 +908,7 @@ int folio_referenced(struct folio *folio, int is_locked, + .arg = (void *)&pra, + .anon_lock = folio_lock_anon_vma_read, + .try_lock = true, ++ .invalid_vma = invalid_folio_referenced_vma, + }; + + *vm_flags = 0; +@@ -921,15 +924,6 @@ int folio_referenced(struct folio *folio, int is_locked, + return 1; + } + +- /* +- * If we are reclaiming on behalf of a cgroup, skip +- * counting on behalf of references from different +- * cgroups +- */ +- if (memcg) { +- rwc.invalid_vma = invalid_folio_referenced_vma; +- } +- + rmap_walk(folio, &rwc); + *vm_flags = pra.vm_flags; + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 160acbbdf111..04a54656b6b7 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -55,6 +55,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -135,12 +137,6 @@ struct scan_control { + /* Always discard instead of demoting to lower tier memory */ + unsigned int no_demotion:1; + +-#ifdef CONFIG_LRU_GEN +- /* help kswapd make better choices among multiple memcgs */ +- unsigned int memcgs_need_aging:1; +- unsigned long last_reclaimed; +-#endif +- + /* Allocation order */ + s8 order; + +@@ -453,6 +449,11 @@ static bool cgroup_reclaim(struct scan_control *sc) + return sc->target_mem_cgroup; + } + ++static bool global_reclaim(struct scan_control *sc) ++{ ++ return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); ++} ++ + /** + * writeback_throttling_sane - is the usual dirty throttling mechanism available? + * @sc: scan_control in question +@@ -503,6 +504,11 @@ static bool cgroup_reclaim(struct scan_control *sc) + return false; + } + ++static bool global_reclaim(struct scan_control *sc) ++{ ++ return true; ++} ++ + static bool writeback_throttling_sane(struct scan_control *sc) + { + return true; +@@ -3184,6 +3190,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) + ++#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) ++#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) ++ + static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) + { + struct pglist_data *pgdat = NODE_DATA(nid); +@@ -3209,6 +3218,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + ++ if (!sc->may_swap) ++ return 0; ++ + if (!can_demote(pgdat->node_id, sc) && + mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) + return 0; +@@ -3223,12 +3235,104 @@ static int get_nr_gens(struct lruvec *lruvec, int type) + + static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) + { +- /* see the comment on lru_gen_struct */ ++ /* see the comment on lru_gen_folio */ + return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && + get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && + get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; + } + ++/****************************************************************************** ++ * Bloom filters ++ ******************************************************************************/ ++ ++/* ++ * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when ++ * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of ++ * bits in a bitmap, k is the number of hash functions and n is the number of ++ * inserted items. 
++ * ++ * Page table walkers use one of the two filters to reduce their search space. ++ * To get rid of non-leaf entries that no longer have enough leaf entries, the ++ * aging uses the double-buffering technique to flip to the other filter each ++ * time it produces a new generation. For non-leaf entries that have enough ++ * leaf entries, the aging carries them over to the next generation in ++ * walk_pmd_range(); the eviction also report them when walking the rmap ++ * in lru_gen_look_around(). ++ * ++ * For future optimizations: ++ * 1. It's not necessary to keep both filters all the time. The spare one can be ++ * freed after the RCU grace period and reallocated if needed again. ++ * 2. And when reallocating, it's worth scaling its size according to the number ++ * of inserted entries in the other filter, to reduce the memory overhead on ++ * small systems and false positives on large systems. ++ * 3. Jenkins' hash function is an alternative to Knuth's. ++ */ ++#define BLOOM_FILTER_SHIFT 15 ++ ++static inline int filter_gen_from_seq(unsigned long seq) ++{ ++ return seq % NR_BLOOM_FILTERS; ++} ++ ++static void get_item_key(void *item, int *key) ++{ ++ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); ++ ++ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); ++ ++ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); ++ key[1] = hash >> BLOOM_FILTER_SHIFT; ++} ++ ++static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) ++{ ++ int key[2]; ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = READ_ONCE(lruvec->mm_state.filters[gen]); ++ if (!filter) ++ return true; ++ ++ get_item_key(item, key); ++ ++ return test_bit(key[0], filter) && test_bit(key[1], filter); ++} ++ ++static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) ++{ ++ int key[2]; ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = READ_ONCE(lruvec->mm_state.filters[gen]); ++ if (!filter) ++ return; ++ ++ get_item_key(item, key); ++ ++ if (!test_bit(key[0], filter)) ++ set_bit(key[0], filter); ++ if (!test_bit(key[1], filter)) ++ set_bit(key[1], filter); ++} ++ ++static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) ++{ ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = lruvec->mm_state.filters[gen]; ++ if (filter) { ++ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); ++ return; ++ } ++ ++ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), ++ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); ++ WRITE_ONCE(lruvec->mm_state.filters[gen], filter); ++} ++ + /****************************************************************************** + * mm_struct list + ******************************************************************************/ +@@ -3348,94 +3452,6 @@ void lru_gen_migrate_mm(struct mm_struct *mm) + } + #endif + +-/* +- * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when +- * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of +- * bits in a bitmap, k is the number of hash functions and n is the number of +- * inserted items. +- * +- * Page table walkers use one of the two filters to reduce their search space. +- * To get rid of non-leaf entries that no longer have enough leaf entries, the +- * aging uses the double-buffering technique to flip to the other filter each +- * time it produces a new generation. 
For non-leaf entries that have enough +- * leaf entries, the aging carries them over to the next generation in +- * walk_pmd_range(); the eviction also report them when walking the rmap +- * in lru_gen_look_around(). +- * +- * For future optimizations: +- * 1. It's not necessary to keep both filters all the time. The spare one can be +- * freed after the RCU grace period and reallocated if needed again. +- * 2. And when reallocating, it's worth scaling its size according to the number +- * of inserted entries in the other filter, to reduce the memory overhead on +- * small systems and false positives on large systems. +- * 3. Jenkins' hash function is an alternative to Knuth's. +- */ +-#define BLOOM_FILTER_SHIFT 15 +- +-static inline int filter_gen_from_seq(unsigned long seq) +-{ +- return seq % NR_BLOOM_FILTERS; +-} +- +-static void get_item_key(void *item, int *key) +-{ +- u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); +- +- BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); +- +- key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); +- key[1] = hash >> BLOOM_FILTER_SHIFT; +-} +- +-static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) +-{ +- unsigned long *filter; +- int gen = filter_gen_from_seq(seq); +- +- filter = lruvec->mm_state.filters[gen]; +- if (filter) { +- bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); +- return; +- } +- +- filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), +- __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); +- WRITE_ONCE(lruvec->mm_state.filters[gen], filter); +-} +- +-static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +-{ +- int key[2]; +- unsigned long *filter; +- int gen = filter_gen_from_seq(seq); +- +- filter = READ_ONCE(lruvec->mm_state.filters[gen]); +- if (!filter) +- return; +- +- get_item_key(item, key); +- +- if (!test_bit(key[0], filter)) +- set_bit(key[0], filter); +- if (!test_bit(key[1], filter)) +- set_bit(key[1], filter); +-} +- +-static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +-{ +- int key[2]; +- unsigned long *filter; +- int gen = filter_gen_from_seq(seq); +- +- filter = READ_ONCE(lruvec->mm_state.filters[gen]); +- if (!filter) +- return true; +- +- get_item_key(item, key); +- +- return test_bit(key[0], filter) && test_bit(key[1], filter); +-} +- + static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) + { + int i; +@@ -3623,7 +3639,7 @@ struct ctrl_pos { + static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, + struct ctrl_pos *pos) + { +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + pos->refaulted = lrugen->avg_refaulted[type][tier] + +@@ -3638,7 +3654,7 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, + static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) + { + int hist, tier; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; + unsigned long seq = carryover ? 
lrugen->min_seq[type] : lrugen->max_seq + 1; + +@@ -3715,7 +3731,7 @@ static int folio_update_gen(struct folio *folio, int gen) + static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) + { + int type = folio_is_file_lru(folio); +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + +@@ -3760,7 +3776,7 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, + static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) + { + int gen, type, zone; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + walk->batched = 0; + +@@ -3793,7 +3809,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal + if (is_vm_hugetlb_page(vma)) + return true; + +- if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) ++ if (!vma_has_recency(vma)) ++ return true; ++ ++ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) + return true; + + if (vma == get_gate_vma(vma->vm_mm)) +@@ -3988,8 +4007,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, + } + + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) +-static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, +- struct mm_walk *args, unsigned long *bitmap, unsigned long *start) ++static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, ++ struct mm_walk *args, unsigned long *bitmap, unsigned long *first) + { + int i; + pmd_t *pmd; +@@ -4002,18 +4021,19 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area + VM_WARN_ON_ONCE(pud_leaf(*pud)); + + /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ +- if (*start == -1) { +- *start = next; ++ if (*first == -1) { ++ *first = addr; ++ bitmap_zero(bitmap, MIN_LRU_BATCH); + return; + } + +- i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); ++ i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first); + if (i && i <= MIN_LRU_BATCH) { + __set_bit(i - 1, bitmap); + return; + } + +- pmd = pmd_offset(pud, *start); ++ pmd = pmd_offset(pud, *first); + + ptl = pmd_lockptr(args->mm, pmd); + if (!spin_trylock(ptl)) +@@ -4024,15 +4044,16 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area + do { + unsigned long pfn; + struct folio *folio; +- unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start; ++ ++ /* don't round down the first address */ ++ addr = i ? 
(*first & PMD_MASK) + i * PMD_SIZE : *first; + + pfn = get_pmd_pfn(pmd[i], vma, addr); + if (pfn == -1) + goto next; + + if (!pmd_trans_huge(pmd[i])) { +- if (arch_has_hw_nonleaf_pmd_young() && +- get_cap(LRU_GEN_NONLEAF_YOUNG)) ++ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) + pmdp_test_and_clear_young(vma, addr, pmd + i); + goto next; + } +@@ -4061,12 +4082,11 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area + arch_leave_lazy_mmu_mode(); + spin_unlock(ptl); + done: +- *start = -1; +- bitmap_zero(bitmap, MIN_LRU_BATCH); ++ *first = -1; + } + #else +-static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, +- struct mm_walk *args, unsigned long *bitmap, unsigned long *start) ++static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, ++ struct mm_walk *args, unsigned long *bitmap, unsigned long *first) + { + } + #endif +@@ -4079,9 +4099,9 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, + unsigned long next; + unsigned long addr; + struct vm_area_struct *vma; +- unsigned long pos = -1; ++ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; ++ unsigned long first = -1; + struct lru_gen_mm_walk *walk = args->private; +- unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + + VM_WARN_ON_ONCE(pud_leaf(*pud)); + +@@ -4120,18 +4140,17 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + continue; + +- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); ++ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); + continue; + } + #endif + walk->mm_stats[MM_NONLEAF_TOTAL]++; + +- if (arch_has_hw_nonleaf_pmd_young() && +- get_cap(LRU_GEN_NONLEAF_YOUNG)) { ++ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) { + if (!pmd_young(val)) + continue; + +- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); ++ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); + } + + if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) +@@ -4148,7 +4167,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, + update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); + } + +- walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); ++ walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); + + if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) + goto restart; +@@ -4238,7 +4257,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ + } while (err == -EAGAIN); + } + +-static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) ++static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc) + { + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; + +@@ -4246,7 +4265,7 @@ static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) + VM_WARN_ON_ONCE(walk); + + walk = &pgdat->mm_walk; +- } else if (!pgdat && !walk) { ++ } else if (!walk && force_alloc) { + VM_WARN_ON_ONCE(current_is_kswapd()); + + walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); +@@ -4274,7 +4293,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) + { + int zone; + int remaining = MAX_LRU_BATCH; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + int new_gen, 
old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + + if (type == LRU_GEN_ANON && !can_swap) +@@ -4282,7 +4301,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) + + /* prevent cold/hot inversion if force_scan is true */ + for (zone = 0; zone < MAX_NR_ZONES; zone++) { +- struct list_head *head = &lrugen->lists[old_gen][type][zone]; ++ struct list_head *head = &lrugen->folios[old_gen][type][zone]; + + while (!list_empty(head)) { + struct folio *folio = lru_to_folio(head); +@@ -4293,7 +4312,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) + VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); + + new_gen = folio_inc_gen(lruvec, folio, false); +- list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]); ++ list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); + + if (!--remaining) + return false; +@@ -4310,7 +4329,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) + { + int gen, type, zone; + bool success = false; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + DEFINE_MIN_SEQ(lruvec); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); +@@ -4321,7 +4340,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) + gen = lru_gen_from_seq(min_seq[type]); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { +- if (!list_empty(&lrugen->lists[gen][type][zone])) ++ if (!list_empty(&lrugen->folios[gen][type][zone])) + goto next; + } + +@@ -4331,7 +4350,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) + ; + } + +- /* see the comment on lru_gen_struct */ ++ /* see the comment on lru_gen_folio */ + if (can_swap) { + min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); + min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); +@@ -4353,7 +4372,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) + { + int prev, next; + int type, zone; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + spin_lock_irq(&lruvec->lru_lock); + +@@ -4411,7 +4430,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + bool success; + struct lru_gen_mm_walk *walk; + struct mm_struct *mm = NULL; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); + +@@ -4427,12 +4446,12 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + * handful of PTEs. Spreading the work out over a period of time usually + * is less efficient, but it avoids bursty page faults. 
+ */ +- if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { ++ if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; + } + +- walk = set_mm_walk(NULL); ++ walk = set_mm_walk(NULL, true); + if (!walk) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; +@@ -4455,8 +4474,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + if (sc->priority <= DEF_PRIORITY - 2) + wait_event_killable(lruvec->mm_state.wait, + max_seq < READ_ONCE(lrugen->max_seq)); +- +- return max_seq < READ_ONCE(lrugen->max_seq); ++ return false; + } + + VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); +@@ -4469,97 +4487,56 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + return true; + } + +-static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, +- struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) ++/****************************************************************************** ++ * working set protection ++ ******************************************************************************/ ++ ++static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) + { + int gen, type, zone; +- unsigned long old = 0; +- unsigned long young = 0; + unsigned long total = 0; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ bool can_swap = get_swappiness(lruvec, sc); ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); + + for (type = !can_swap; type < ANON_AND_FILE; type++) { + unsigned long seq; + + for (seq = min_seq[type]; seq <= max_seq; seq++) { +- unsigned long size = 0; +- + gen = lru_gen_from_seq(seq); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) +- size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); +- +- total += size; +- if (seq == max_seq) +- young += size; +- else if (seq + MIN_NR_GENS == max_seq) +- old += size; ++ total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + } + } + +- /* try to scrape all its memory if this memcg was deleted */ +- *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; +- +- /* +- * The aging tries to be lazy to reduce the overhead, while the eviction +- * stalls when the number of generations reaches MIN_NR_GENS. Hence, the +- * ideal number of generations is MIN_NR_GENS+1. +- */ +- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) +- return true; +- if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) +- return false; +- +- /* +- * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) +- * of the total number of pages for each generation. A reasonable range +- * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The +- * aging cares about the upper bound of hot pages, while the eviction +- * cares about the lower bound of cold pages. +- */ +- if (young * MIN_NR_GENS > total) +- return true; +- if (old * (MIN_NR_GENS + 2) < total) +- return true; +- +- return false; ++ /* whether the size is big enough to be helpful */ ++ return mem_cgroup_online(memcg) ? 
(total >> sc->priority) : total; + } + +-static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) ++static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, ++ unsigned long min_ttl) + { +- bool need_aging; +- unsigned long nr_to_scan; +- int swappiness = get_swappiness(lruvec, sc); ++ int gen; ++ unsigned long birth; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); +- DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + +- VM_WARN_ON_ONCE(sc->memcg_low_reclaim); +- +- mem_cgroup_calculate_protection(NULL, memcg); ++ /* see the comment on lru_gen_folio */ ++ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); ++ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + +- if (mem_cgroup_below_min(NULL, memcg)) ++ if (time_is_after_jiffies(birth + min_ttl)) + return false; + +- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); +- +- if (min_ttl) { +- int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); +- unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); +- +- if (time_is_after_jiffies(birth + min_ttl)) +- return false; +- +- /* the size is likely too small to be helpful */ +- if (!nr_to_scan && sc->priority != DEF_PRIORITY) +- return false; +- } ++ if (!lruvec_is_sizable(lruvec, sc)) ++ return false; + +- if (need_aging) +- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); ++ mem_cgroup_calculate_protection(NULL, memcg); + +- return true; ++ return !mem_cgroup_below_min(NULL, memcg); + } + + /* to protect the working set of the last N jiffies */ +@@ -4572,46 +4549,30 @@ static unsigned long lru_gen_min_ttl __read_mostly; + static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + { + struct mem_cgroup *memcg; +- bool success = false; + unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); + + VM_WARN_ON_ONCE(!current_is_kswapd()); + +- sc->last_reclaimed = sc->nr_reclaimed; +- +- /* +- * To reduce the chance of going into the aging path, which can be +- * costly, optimistically skip it if the flag below was cleared in the +- * eviction path. This improves the overall performance when multiple +- * memcgs are available. +- */ +- if (!sc->memcgs_need_aging) { +- sc->memcgs_need_aging = true; ++ /* check the order to exclude compaction-induced reclaim */ ++ if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) + return; +- } +- +- set_mm_walk(pgdat); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + +- if (age_lruvec(lruvec, sc, min_ttl)) +- success = true; ++ if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) { ++ mem_cgroup_iter_break(NULL, memcg); ++ return; ++ } + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + +- clear_mm_walk(); +- +- /* check the order to exclude compaction-induced reclaim */ +- if (success || !min_ttl || sc->order) +- return; +- + /* + * The main goal is to OOM kill if every generation from all memcgs is + * younger than min_ttl. However, another possibility is all memcgs are +- * either below min or empty. ++ * either too small or below min. 
+ */ + if (mutex_trylock(&oom_lock)) { + struct oom_control oc = { +@@ -4624,6 +4585,10 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + } + } + ++/****************************************************************************** ++ * rmap/PT walk feedback ++ ******************************************************************************/ ++ + /* + * This function exploits spatial locality when shrink_folio_list() walks the + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If +@@ -4634,13 +4599,12 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + { + int i; +- pte_t *pte; + unsigned long start; + unsigned long end; +- unsigned long addr; + struct lru_gen_mm_walk *walk; + int young = 0; +- unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; ++ pte_t *pte = pvmw->pte; ++ unsigned long addr = pvmw->address; + struct folio *folio = pfn_folio(pvmw->pfn); + struct mem_cgroup *memcg = folio_memcg(folio); + struct pglist_data *pgdat = folio_pgdat(folio); +@@ -4657,25 +4621,28 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + /* avoid taking the LRU lock under the PTL when possible */ + walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; + +- start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); +- end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; ++ start = max(addr & PMD_MASK, pvmw->vma->vm_start); ++ end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; + + if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { +- if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) ++ if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) + end = start + MIN_LRU_BATCH * PAGE_SIZE; +- else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) ++ else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2) + start = end - MIN_LRU_BATCH * PAGE_SIZE; + else { +- start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; +- end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; ++ start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2; ++ end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2; + } + } + +- pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; ++ /* folio_update_gen() requires stable folio_memcg() */ ++ if (!mem_cgroup_trylock_pages(memcg)) ++ return; + +- rcu_read_lock(); + arch_enter_lazy_mmu_mode(); + ++ pte -= (addr - start) / PAGE_SIZE; ++ + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { + unsigned long pfn; + +@@ -4700,58 +4667,171 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + !folio_test_swapcache(folio))) + folio_mark_dirty(folio); + ++ if (walk) { ++ old_gen = folio_update_gen(folio, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ update_batch_size(walk, folio, old_gen, new_gen); ++ ++ continue; ++ } ++ + old_gen = folio_lru_gen(folio); + if (old_gen < 0) + folio_set_referenced(folio); + else if (old_gen != new_gen) +- __set_bit(i, bitmap); ++ folio_activate(folio); + } + + arch_leave_lazy_mmu_mode(); +- rcu_read_unlock(); ++ mem_cgroup_unlock_pages(); + + /* feedback from rmap walkers to page table walkers */ + if (suitable_to_scan(i, young)) + update_bloom_filter(lruvec, max_seq, pvmw->pmd); ++} + +- if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { +- for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { +- folio = pfn_folio(pte_pfn(pte[i])); +- folio_activate(folio); +- } +- return; 
++/****************************************************************************** ++ * memcg LRU ++ ******************************************************************************/ ++ ++/* see the comment on MEMCG_NR_GENS */ ++enum { ++ MEMCG_LRU_NOP, ++ MEMCG_LRU_HEAD, ++ MEMCG_LRU_TAIL, ++ MEMCG_LRU_OLD, ++ MEMCG_LRU_YOUNG, ++}; ++ ++#ifdef CONFIG_MEMCG ++ ++static int lru_gen_memcg_seg(struct lruvec *lruvec) ++{ ++ return READ_ONCE(lruvec->lrugen.seg); ++} ++ ++static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) ++{ ++ int seg; ++ int old, new; ++ int bin = get_random_u32_below(MEMCG_NR_BINS); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ spin_lock(&pgdat->memcg_lru.lock); ++ ++ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); ++ ++ seg = 0; ++ new = old = lruvec->lrugen.gen; ++ ++ /* see the comment on MEMCG_NR_GENS */ ++ if (op == MEMCG_LRU_HEAD) ++ seg = MEMCG_LRU_HEAD; ++ else if (op == MEMCG_LRU_TAIL) ++ seg = MEMCG_LRU_TAIL; ++ else if (op == MEMCG_LRU_OLD) ++ new = get_memcg_gen(pgdat->memcg_lru.seq); ++ else if (op == MEMCG_LRU_YOUNG) ++ new = get_memcg_gen(pgdat->memcg_lru.seq + 1); ++ else ++ VM_WARN_ON_ONCE(true); ++ ++ hlist_nulls_del_rcu(&lruvec->lrugen.list); ++ ++ if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) ++ hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); ++ else ++ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); ++ ++ pgdat->memcg_lru.nr_memcgs[old]--; ++ pgdat->memcg_lru.nr_memcgs[new]++; ++ ++ lruvec->lrugen.gen = new; ++ WRITE_ONCE(lruvec->lrugen.seg, seg); ++ ++ if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) ++ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); ++ ++ spin_unlock(&pgdat->memcg_lru.lock); ++} ++ ++void lru_gen_online_memcg(struct mem_cgroup *memcg) ++{ ++ int gen; ++ int nid; ++ int bin = get_random_u32_below(MEMCG_NR_BINS); ++ ++ for_each_node(nid) { ++ struct pglist_data *pgdat = NODE_DATA(nid); ++ struct lruvec *lruvec = get_lruvec(memcg, nid); ++ ++ spin_lock(&pgdat->memcg_lru.lock); ++ ++ VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); ++ ++ gen = get_memcg_gen(pgdat->memcg_lru.seq); ++ ++ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); ++ pgdat->memcg_lru.nr_memcgs[gen]++; ++ ++ lruvec->lrugen.gen = gen; ++ ++ spin_unlock(&pgdat->memcg_lru.lock); + } ++} + +- /* folio_update_gen() requires stable folio_memcg() */ +- if (!mem_cgroup_trylock_pages(memcg)) +- return; ++void lru_gen_offline_memcg(struct mem_cgroup *memcg) ++{ ++ int nid; + +- if (!walk) { +- spin_lock_irq(&lruvec->lru_lock); +- new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(memcg, nid); ++ ++ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); + } ++} + +- for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { +- folio = pfn_folio(pte_pfn(pte[i])); +- if (folio_memcg_rcu(folio) != memcg) +- continue; ++void lru_gen_release_memcg(struct mem_cgroup *memcg) ++{ ++ int gen; ++ int nid; + +- old_gen = folio_update_gen(folio, new_gen); +- if (old_gen < 0 || old_gen == new_gen) +- continue; ++ for_each_node(nid) { ++ struct pglist_data *pgdat = NODE_DATA(nid); ++ struct lruvec *lruvec = get_lruvec(memcg, nid); + +- if (walk) +- update_batch_size(walk, folio, old_gen, new_gen); +- else +- lru_gen_update_size(lruvec, folio, old_gen, new_gen); ++ spin_lock(&pgdat->memcg_lru.lock); ++ ++ 
VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); ++ ++ gen = lruvec->lrugen.gen; ++ ++ hlist_nulls_del_rcu(&lruvec->lrugen.list); ++ pgdat->memcg_lru.nr_memcgs[gen]--; ++ ++ if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) ++ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); ++ ++ spin_unlock(&pgdat->memcg_lru.lock); + } ++} ++ ++void lru_gen_soft_reclaim(struct lruvec *lruvec) ++{ ++ /* see the comment on MEMCG_NR_GENS */ ++ if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) ++ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); ++} + +- if (!walk) +- spin_unlock_irq(&lruvec->lru_lock); ++#else /* !CONFIG_MEMCG */ + +- mem_cgroup_unlock_pages(); ++static int lru_gen_memcg_seg(struct lruvec *lruvec) ++{ ++ return 0; + } + ++#endif ++ + /****************************************************************************** + * the eviction + ******************************************************************************/ +@@ -4765,7 +4845,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) + int delta = folio_nr_pages(folio); + int refs = folio_lru_refs(folio); + int tier = lru_tier_from_refs(refs); +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); + +@@ -4790,7 +4870,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) + + /* promoted */ + if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { +- list_move(&folio->lru, &lrugen->lists[gen][type][zone]); ++ list_move(&folio->lru, &lrugen->folios[gen][type][zone]); + return true; + } + +@@ -4799,7 +4879,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + gen = folio_inc_gen(lruvec, folio, false); +- list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]); ++ list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); + + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], + lrugen->protected[hist][type][tier - 1] + delta); +@@ -4811,7 +4891,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) + if (folio_test_locked(folio) || folio_test_writeback(folio) || + (type == LRU_GEN_FILE && folio_test_dirty(folio))) { + gen = folio_inc_gen(lruvec, folio, true); +- list_move(&folio->lru, &lrugen->lists[gen][type][zone]); ++ list_move(&folio->lru, &lrugen->folios[gen][type][zone]); + return true; + } + +@@ -4822,12 +4902,8 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca + { + bool success; + +- /* unmapping inhibited */ +- if (!sc->may_unmap && folio_mapped(folio)) +- return false; +- + /* swapping inhibited */ +- if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && ++ if (!(sc->gfp_mask & __GFP_IO) && + (folio_test_dirty(folio) || + (folio_test_anon(folio) && !folio_test_swapcache(folio)))) + return false; +@@ -4865,7 +4941,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, + int scanned = 0; + int isolated = 0; + int remaining = MAX_LRU_BATCH; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + VM_WARN_ON_ONCE(!list_empty(list)); +@@ -4878,7 +4954,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, + for (zone = sc->reclaim_idx; zone >= 0; zone--) { + LIST_HEAD(moved); + int skipped = 0; +- struct list_head *head = 
&lrugen->lists[gen][type][zone]; ++ struct list_head *head = &lrugen->folios[gen][type][zone]; + + while (!list_empty(head)) { + struct folio *folio = lru_to_folio(head); +@@ -4924,9 +5000,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, + __count_vm_events(PGSCAN_ANON + type, isolated); + + /* +- * There might not be eligible pages due to reclaim_idx, may_unmap and +- * may_writepage. Check the remaining to prevent livelock if it's not +- * making progress. ++ * There might not be eligible folios due to reclaim_idx. Check the ++ * remaining to prevent livelock if it's not making progress. + */ + return isolated || !remaining ? scanned : 0; + } +@@ -5021,8 +5096,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw + return scanned; + } + +-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, +- bool *need_swapping) ++static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) + { + int type; + int scanned; +@@ -5111,153 +5185,348 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap + goto retry; + } + +- if (need_swapping && type == LRU_GEN_ANON) +- *need_swapping = true; +- + return scanned; + } + ++static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, ++ struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) ++{ ++ int gen, type, zone; ++ unsigned long old = 0; ++ unsigned long young = 0; ++ unsigned long total = 0; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ /* whether this lruvec is completely out of cold folios */ ++ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { ++ *nr_to_scan = 0; ++ return true; ++ } ++ ++ for (type = !can_swap; type < ANON_AND_FILE; type++) { ++ unsigned long seq; ++ ++ for (seq = min_seq[type]; seq <= max_seq; seq++) { ++ unsigned long size = 0; ++ ++ gen = lru_gen_from_seq(seq); ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) ++ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); ++ ++ total += size; ++ if (seq == max_seq) ++ young += size; ++ else if (seq + MIN_NR_GENS == max_seq) ++ old += size; ++ } ++ } ++ ++ /* try to scrape all its memory if this memcg was deleted */ ++ *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; ++ ++ /* ++ * The aging tries to be lazy to reduce the overhead, while the eviction ++ * stalls when the number of generations reaches MIN_NR_GENS. Hence, the ++ * ideal number of generations is MIN_NR_GENS+1. ++ */ ++ if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) ++ return false; ++ ++ /* ++ * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) ++ * of the total number of pages for each generation. A reasonable range ++ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The ++ * aging cares about the upper bound of hot pages, while the eviction ++ * cares about the lower bound of cold pages. ++ */ ++ if (young * MIN_NR_GENS > total) ++ return true; ++ if (old * (MIN_NR_GENS + 2) < total) ++ return true; ++ ++ return false; ++} ++ + /* + * For future optimizations: + * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg + * reclaim. 
+ */ +-static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, +- bool can_swap, bool *need_aging) ++static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) + { + unsigned long nr_to_scan; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); +- DEFINE_MIN_SEQ(lruvec); + +- if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) || +- (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) && +- !sc->memcg_low_reclaim)) ++ if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) + return 0; + +- *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); +- if (!*need_aging) ++ if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) + return nr_to_scan; + + /* skip the aging path at the default priority */ + if (sc->priority == DEF_PRIORITY) +- goto done; ++ return nr_to_scan; + +- /* leave the work to lru_gen_age_node() */ +- if (current_is_kswapd()) +- return 0; ++ /* skip this lruvec as it's low on cold folios */ ++ return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0; ++} + +- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) +- return nr_to_scan; +-done: +- return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; ++static unsigned long get_nr_to_reclaim(struct scan_control *sc) ++{ ++ /* don't abort memcg reclaim to ensure fairness */ ++ if (!global_reclaim(sc)) ++ return -1; ++ ++ return max(sc->nr_to_reclaim, compact_gap(sc->order)); + } + +-static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, +- struct scan_control *sc, bool need_swapping) ++static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + { +- int i; +- DEFINE_MAX_SEQ(lruvec); ++ long nr_to_scan; ++ unsigned long scanned = 0; ++ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); ++ int swappiness = get_swappiness(lruvec, sc); + +- if (!current_is_kswapd()) { +- /* age each memcg at most once to ensure fairness */ +- if (max_seq - seq > 1) +- return true; ++ /* clean file folios are more likely to exist */ ++ if (swappiness && !(sc->gfp_mask & __GFP_IO)) ++ swappiness = 1; + +- /* over-swapping can increase allocation latency */ +- if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping) +- return true; ++ while (true) { ++ int delta; + +- /* give this thread a chance to exit and free its memory */ +- if (fatal_signal_pending(current)) { +- sc->nr_reclaimed += MIN_LRU_BATCH; +- return true; +- } ++ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); ++ if (nr_to_scan <= 0) ++ break; + +- if (cgroup_reclaim(sc)) +- return false; +- } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim) +- return false; ++ delta = evict_folios(lruvec, sc, swappiness); ++ if (!delta) ++ break; + +- /* keep scanning at low priorities to ensure fairness */ +- if (sc->priority > DEF_PRIORITY - 2) +- return false; ++ scanned += delta; ++ if (scanned >= nr_to_scan) ++ break; + +- /* +- * A minimum amount of work was done under global memory pressure. For +- * kswapd, it may be overshooting. For direct reclaim, the allocation +- * may succeed if all suitable zones are somewhat safe. In either case, +- * it's better to stop now, and restart later if necessary. 
+- */ +- for (i = 0; i <= sc->reclaim_idx; i++) { +- unsigned long wmark; +- struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; ++ if (sc->nr_reclaimed >= nr_to_reclaim) ++ break; + +- if (!managed_zone(zone)) ++ cond_resched(); ++ } ++ ++ /* whether try_to_inc_max_seq() was successful */ ++ return nr_to_scan < 0; ++} ++ ++static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ bool success; ++ unsigned long scanned = sc->nr_scanned; ++ unsigned long reclaimed = sc->nr_reclaimed; ++ int seg = lru_gen_memcg_seg(lruvec); ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ /* see the comment on MEMCG_NR_GENS */ ++ if (!lruvec_is_sizable(lruvec, sc)) ++ return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; ++ ++ mem_cgroup_calculate_protection(NULL, memcg); ++ ++ if (mem_cgroup_below_min(NULL, memcg)) ++ return MEMCG_LRU_YOUNG; ++ ++ if (mem_cgroup_below_low(NULL, memcg)) { ++ /* see the comment on MEMCG_NR_GENS */ ++ if (seg != MEMCG_LRU_TAIL) ++ return MEMCG_LRU_TAIL; ++ ++ memcg_memory_event(memcg, MEMCG_LOW); ++ } ++ ++ success = try_to_shrink_lruvec(lruvec, sc); ++ ++ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); ++ ++ if (!sc->proactive) ++ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, ++ sc->nr_reclaimed - reclaimed); ++ ++ sc->nr_reclaimed += current->reclaim_state->reclaimed_slab; ++ current->reclaim_state->reclaimed_slab = 0; ++ ++ return success ? MEMCG_LRU_YOUNG : 0; ++} ++ ++#ifdef CONFIG_MEMCG ++ ++static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ int op; ++ int gen; ++ int bin; ++ int first_bin; ++ struct lruvec *lruvec; ++ struct lru_gen_folio *lrugen; ++ struct mem_cgroup *memcg; ++ const struct hlist_nulls_node *pos; ++ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); ++ ++ bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); ++restart: ++ op = 0; ++ memcg = NULL; ++ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); ++ ++ rcu_read_lock(); ++ ++ hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { ++ if (op) ++ lru_gen_rotate_memcg(lruvec, op); ++ ++ mem_cgroup_put(memcg); ++ ++ lruvec = container_of(lrugen, struct lruvec, lrugen); ++ memcg = lruvec_memcg(lruvec); ++ ++ if (!mem_cgroup_tryget(memcg)) { ++ op = 0; ++ memcg = NULL; + continue; ++ } + +- wmark = current_is_kswapd() ? 
high_wmark_pages(zone) : low_wmark_pages(zone); +- if (wmark > zone_page_state(zone, NR_FREE_PAGES)) +- return false; ++ rcu_read_unlock(); ++ ++ op = shrink_one(lruvec, sc); ++ ++ rcu_read_lock(); ++ ++ if (sc->nr_reclaimed >= nr_to_reclaim) ++ break; + } + +- sc->nr_reclaimed += MIN_LRU_BATCH; ++ rcu_read_unlock(); + +- return true; ++ if (op) ++ lru_gen_rotate_memcg(lruvec, op); ++ ++ mem_cgroup_put(memcg); ++ ++ if (sc->nr_reclaimed >= nr_to_reclaim) ++ return; ++ ++ /* restart if raced with lru_gen_rotate_memcg() */ ++ if (gen != get_nulls_value(pos)) ++ goto restart; ++ ++ /* try the rest of the bins of the current generation */ ++ bin = get_memcg_bin(bin + 1); ++ if (bin != first_bin) ++ goto restart; + } + + static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + { + struct blk_plug plug; +- bool need_aging = false; +- bool need_swapping = false; +- unsigned long scanned = 0; +- unsigned long reclaimed = sc->nr_reclaimed; +- DEFINE_MAX_SEQ(lruvec); ++ ++ VM_WARN_ON_ONCE(global_reclaim(sc)); ++ VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); + + lru_add_drain(); + + blk_start_plug(&plug); + +- set_mm_walk(lruvec_pgdat(lruvec)); ++ set_mm_walk(NULL, sc->proactive); + +- while (true) { +- int delta; +- int swappiness; +- unsigned long nr_to_scan; ++ if (try_to_shrink_lruvec(lruvec, sc)) ++ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); + +- if (sc->may_swap) +- swappiness = get_swappiness(lruvec, sc); +- else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) +- swappiness = 1; +- else +- swappiness = 0; ++ clear_mm_walk(); + +- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging); +- if (!nr_to_scan) +- goto done; ++ blk_finish_plug(&plug); ++} + +- delta = evict_folios(lruvec, sc, swappiness, &need_swapping); +- if (!delta) +- goto done; ++#else /* !CONFIG_MEMCG */ + +- scanned += delta; +- if (scanned >= nr_to_scan) +- break; ++static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ BUILD_BUG(); ++} + +- if (should_abort_scan(lruvec, max_seq, sc, need_swapping)) +- break; ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ BUILD_BUG(); ++} + +- cond_resched(); +- } ++#endif ++ ++static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ int priority; ++ unsigned long reclaimable; ++ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); ++ ++ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) ++ return; ++ /* ++ * Determine the initial priority based on ((total / MEMCG_NR_GENS) >> ++ * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the ++ * estimated reclaimed_to_scanned_ratio = inactive / total. ++ */ ++ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); ++ if (get_swappiness(lruvec, sc)) ++ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); ++ ++ reclaimable /= MEMCG_NR_GENS; ++ ++ /* round down reclaimable and round up sc->nr_to_reclaim */ ++ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); ++ ++ sc->priority = clamp(priority, 0, DEF_PRIORITY); ++} ++ ++static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ struct blk_plug plug; ++ unsigned long reclaimed = sc->nr_reclaimed; ++ ++ VM_WARN_ON_ONCE(!global_reclaim(sc)); ++ ++ /* ++ * Unmapped clean folios are already prioritized. Scanning for more of ++ * them is likely futile and can cause high reclaim latency when there ++ * is a large number of memcgs. 
++ */ ++ if (!sc->may_writepage || !sc->may_unmap) ++ goto done; ++ ++ lru_add_drain(); ++ ++ blk_start_plug(&plug); ++ ++ set_mm_walk(pgdat, sc->proactive); ++ ++ set_initial_priority(pgdat, sc); ++ ++ if (current_is_kswapd()) ++ sc->nr_reclaimed = 0; ++ ++ if (mem_cgroup_disabled()) ++ shrink_one(&pgdat->__lruvec, sc); ++ else ++ shrink_many(pgdat, sc); ++ ++ if (current_is_kswapd()) ++ sc->nr_reclaimed += reclaimed; + +- /* see the comment in lru_gen_age_node() */ +- if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging) +- sc->memcgs_need_aging = false; +-done: + clear_mm_walk(); + + blk_finish_plug(&plug); ++done: ++ /* kswapd should never fail */ ++ pgdat->kswapd_failures = 0; + } + + /****************************************************************************** +@@ -5266,7 +5535,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + + static bool __maybe_unused state_is_valid(struct lruvec *lruvec) + { +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + if (lrugen->enabled) { + enum lru_list lru; +@@ -5279,7 +5548,7 @@ static bool __maybe_unused state_is_valid(struct lruvec *lruvec) + int gen, type, zone; + + for_each_gen_type_zone(gen, type, zone) { +- if (!list_empty(&lrugen->lists[gen][type][zone])) ++ if (!list_empty(&lrugen->folios[gen][type][zone])) + return false; + } + } +@@ -5324,7 +5593,7 @@ static bool drain_evictable(struct lruvec *lruvec) + int remaining = MAX_LRU_BATCH; + + for_each_gen_type_zone(gen, type, zone) { +- struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; ++ struct list_head *head = &lruvec->lrugen.folios[gen][type][zone]; + + while (!list_empty(head)) { + bool success; +@@ -5545,7 +5814,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, + int i; + int type, tier; + int hist = lru_hist_from_seq(seq); +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + seq_printf(m, " %10d", tier); +@@ -5595,7 +5864,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) + unsigned long seq; + bool full = !debugfs_real_fops(m->file)->write; + struct lruvec *lruvec = v; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + int nid = lruvec_pgdat(lruvec)->node_id; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); +@@ -5692,7 +5961,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co + if (sc->nr_reclaimed >= nr_to_reclaim) + return 0; + +- if (!evict_folios(lruvec, sc, swappiness, NULL)) ++ if (!evict_folios(lruvec, sc, swappiness)) + return 0; + + cond_resched(); +@@ -5713,11 +5982,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, + + if (!mem_cgroup_disabled()) { + rcu_read_lock(); ++ + memcg = mem_cgroup_from_id(memcg_id); +-#ifdef CONFIG_MEMCG +- if (memcg && !css_tryget(&memcg->css)) ++ if (!mem_cgroup_tryget(memcg)) + memcg = NULL; +-#endif ++ + rcu_read_unlock(); + + if (!memcg) +@@ -5777,7 +6046,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, + set_task_reclaim_state(current, &sc.reclaim_state); + flags = memalloc_noreclaim_save(); + blk_start_plug(&plug); +- if (!set_mm_walk(NULL)) { ++ if (!set_mm_walk(NULL, true)) { + err = -ENOMEM; + goto done; + } +@@ -5849,7 +6118,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) + { + int i; + int gen, type, 
zone; +- struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct lru_gen_folio *lrugen = &lruvec->lrugen; + + lrugen->max_seq = MIN_NR_GENS + 1; + lrugen->enabled = lru_gen_enabled(); +@@ -5858,13 +6127,26 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) + lrugen->timestamps[i] = jiffies; + + for_each_gen_type_zone(gen, type, zone) +- INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); ++ INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); + + lruvec->mm_state.seq = MIN_NR_GENS; + init_waitqueue_head(&lruvec->mm_state.wait); + } + + #ifdef CONFIG_MEMCG ++ ++void lru_gen_init_pgdat(struct pglist_data *pgdat) ++{ ++ int i, j; ++ ++ spin_lock_init(&pgdat->memcg_lru.lock); ++ ++ for (i = 0; i < MEMCG_NR_GENS; i++) { ++ for (j = 0; j < MEMCG_NR_BINS; j++) ++ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); ++ } ++} ++ + void lru_gen_init_memcg(struct mem_cgroup *memcg) + { + INIT_LIST_HEAD(&memcg->mm_list.fifo); +@@ -5876,19 +6158,25 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) + int i; + int nid; + ++ VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo)); ++ + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + ++ VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers); + VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, + sizeof(lruvec->lrugen.nr_pages))); + ++ lruvec->lrugen.list.next = LIST_POISON1; ++ + for (i = 0; i < NR_BLOOM_FILTERS; i++) { + bitmap_free(lruvec->mm_state.filters[i]); + lruvec->mm_state.filters[i] = NULL; + } + } + } +-#endif ++ ++#endif /* CONFIG_MEMCG */ + + static int __init init_lru_gen(void) + { +@@ -5915,6 +6203,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + { + } + ++static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++} ++ + #endif /* CONFIG_LRU_GEN */ + + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +@@ -5928,7 +6220,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + bool proportional_reclaim; + struct blk_plug plug; + +- if (lru_gen_enabled()) { ++ if (lru_gen_enabled() && !global_reclaim(sc)) { + lru_gen_shrink_lruvec(lruvec, sc); + return; + } +@@ -6171,6 +6463,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) + struct lruvec *target_lruvec; + bool reclaimable = false; + ++ if (lru_gen_enabled() && global_reclaim(sc)) { ++ lru_gen_shrink_node(pgdat, sc); ++ return; ++ } ++ + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + again: +diff --git a/mm/workingset.c b/mm/workingset.c +index 1a86645b7b3c..fd666584515c 100644 +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct folio *folio) + unsigned long token; + unsigned long min_seq; + struct lruvec *lruvec; +- struct lru_gen_struct *lrugen; ++ struct lru_gen_folio *lrugen; + int type = folio_is_file_lru(folio); + int delta = folio_nr_pages(folio); + int refs = folio_lru_refs(folio); +@@ -252,7 +252,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) + unsigned long token; + unsigned long min_seq; + struct lruvec *lruvec; +- struct lru_gen_struct *lrugen; ++ struct lru_gen_folio *lrugen; + struct mem_cgroup *memcg; + struct pglist_data *pgdat; + int type = folio_is_file_lru(folio); +diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c +index 81fa7ec2e66a..1f36bc1c5d36 100644 +--- a/tools/testing/radix-tree/maple.c ++++ b/tools/testing/radix-tree/maple.c +@@ -173,11 +173,11 @@ static noinline void 
check_new_node(struct maple_tree *mt) + + if (!MAPLE_32BIT) { + if (i >= 35) +- e = i - 35; ++ e = i - 34; + else if (i >= 5) +- e = i - 5; ++ e = i - 4; + else if (i >= 2) +- e = i - 2; ++ e = i - 1; + } else { + if (i >= 4) + e = i - 4; +@@ -305,17 +305,17 @@ static noinline void check_new_node(struct maple_tree *mt) + MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); + MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); +- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); ++ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); + + mn = mas_pop_node(&mas); /* get the next node. */ + MT_BUG_ON(mt, mn == NULL); + MT_BUG_ON(mt, not_empty(mn)); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS); +- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 2); ++ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + + mas_push_node(&mas, mn); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); +- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); ++ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); + + /* Check the limit of pop/push/pop */ + mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 2); /* Request */ +@@ -323,14 +323,14 @@ static noinline void check_new_node(struct maple_tree *mt) + MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); + MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); + MT_BUG_ON(mt, mas_alloc_req(&mas)); +- MT_BUG_ON(mt, mas.alloc->node_count); ++ MT_BUG_ON(mt, mas.alloc->node_count != 1); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, not_empty(mn)); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); +- MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); ++ MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); + mas_push_node(&mas, mn); +- MT_BUG_ON(mt, mas.alloc->node_count); ++ MT_BUG_ON(mt, mas.alloc->node_count != 1); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, not_empty(mn)); +-- +2.39.2 + +From d3f266dbba701440ba392ceaf1b4cad9194dcdc7 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 17 Feb 2023 13:41:20 +0100 +Subject: [PATCH 11/15] mm/kvm: lockless accessed bit harvest + +TLDR +==== +This patchset RCU-protects KVM page tables and compare-and-exchanges +KVM PTEs with the accessed bit set by hardware. It significantly +improves the performance of guests when the host is under heavy +memory pressure. + +ChromeOS has been using a similar approach [1] since mid 2021 and it +was proven successful on tens of millions devices. + +[1] https://crrev.com/c/2987928 + +Overview +======== +The goal of this patchset is to optimize the performance of guests +when the host memory is overcommitted. It focuses on the vast +majority of VMs that are not nested and run on hardware that sets the +accessed bit in KVM page tables. + +Note that nested VMs and hardware that does not support the accessed +bit are both out of scope. + +This patchset relies on two techniques, RCU and cmpxchg, to safely +test and clear the accessed bit without taking kvm->mmu_lock. The +former protects KVM page tables from being freed while the latter +clears the accessed bit atomically against both hardware and other +software page table walkers. + +A new MMU notifier API, mmu_notifier_test_clear_young(), is +introduced. It follows two design patterns: fallback and batching. 
+For any unsupported cases, it can optionally fall back to +mmu_notifier_ops->clear_young(). For a range of KVM PTEs, it can test +or test and clear their accessed bits according to a bitmap provided +by the caller. + +This patchset only applies mmu_notifier_test_clear_young() to MGLRU. +A follow-up patchset will apply it to /proc/PID/pagemap and +/prod/PID/clear_refs. + +Evaluation +========== +An existing selftest can quickly demonstrate the effectiveness of +this patchset. On a generic workstation equipped with 64 CPUs and +256GB DRAM: + + $ sudo max_guest_memory_test -c 64 -m 256 -s 256 + + MGLRU run2 + --------------- + Before ~600s + After ~50s + Off ~250s + + kswapd (MGLRU before) + 100.00% balance_pgdat + 100.00% shrink_node + 100.00% shrink_one + 99.97% try_to_shrink_lruvec + 99.06% evict_folios + 97.41% shrink_folio_list + 31.33% folio_referenced + 31.06% rmap_walk_file + 30.89% folio_referenced_one + 20.83% __mmu_notifier_clear_flush_young + 20.54% kvm_mmu_notifier_clear_flush_young + => 19.34% _raw_write_lock + + kswapd (MGLRU after) + 100.00% balance_pgdat + 100.00% shrink_node + 100.00% shrink_one + 99.97% try_to_shrink_lruvec + 99.51% evict_folios + 71.70% shrink_folio_list + 7.08% folio_referenced + 6.78% rmap_walk_file + 6.72% folio_referenced_one + 5.60% lru_gen_look_around + => 1.53% __mmu_notifier_test_clear_young + + kswapd (MGLRU off) + 100.00% balance_pgdat + 100.00% shrink_node + 99.92% shrink_lruvec + 69.95% shrink_folio_list + 19.35% folio_referenced + 18.37% rmap_walk_file + 17.88% folio_referenced_one + 13.20% __mmu_notifier_clear_flush_young + 11.64% kvm_mmu_notifier_clear_flush_young + => 9.93% _raw_write_lock + 26.23% shrink_active_list + 25.50% folio_referenced + 25.35% rmap_walk_file + 25.28% folio_referenced_one + 23.87% __mmu_notifier_clear_flush_young + 23.69% kvm_mmu_notifier_clear_flush_young + => 18.98% _raw_write_lock + +Comprehensive benchmarks are coming soon. 
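+
+A minimal sketch of the per-range harvesting loop described above. This is
+an editor-added illustration, not code from this patchset: pte_of() and
+ACCESSED_BIT are hypothetical stand-ins for the arch-specific lookups and
+accessed-bit masks, and only the RCU plus
+__test_and_change_bit()/cmpxchg64() idiom mirrors the real implementations
+further down.
+
+	static void test_clear_young_range(struct kvm_gfn_range *range,
+					   gfn_t lsb_gfn, unsigned long *bitmap)
+	{
+		gfn_t gfn;
+
+		/* RCU keeps the page tables from being freed under us */
+		rcu_read_lock();
+
+		for (gfn = range->start; gfn < range->end; gfn++) {
+			u64 *ptep = pte_of(gfn);	/* hypothetical PTE lookup */
+			u64 old, new;
+
+			if (!ptep)
+				continue;
+
+			old = READ_ONCE(*ptep);
+			new = old & ~ACCESSED_BIT;	/* hypothetical A-bit mask */
+			if (old == new)			/* not young, nothing to do */
+				continue;
+
+			/* report "young" to the caller via the shared bitmap... */
+			if (__test_and_change_bit(lsb_gfn - gfn, bitmap))
+				/* ...and clear the accessed bit atomically */
+				cmpxchg64(ptep, old, new);
+		}
+
+		rcu_read_unlock();
+	}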
+ +Yu Zhao (5): + mm/kvm: add mmu_notifier_test_clear_young() + kvm/x86: add kvm_arch_test_clear_young() + kvm/arm64: add kvm_arch_test_clear_young() + kvm/powerpc: add kvm_arch_test_clear_young() + mm: multi-gen LRU: use mmu_notifier_test_clear_young() + +Signed-off-by: Peter Jung +--- + arch/arm64/include/asm/kvm_host.h | 7 ++ + arch/arm64/include/asm/kvm_pgtable.h | 8 ++ + arch/arm64/include/asm/stage2_pgtable.h | 43 ++++++++ + arch/arm64/kvm/arm.c | 1 + + arch/arm64/kvm/hyp/pgtable.c | 51 ++-------- + arch/arm64/kvm/mmu.c | 77 +++++++++++++- + arch/powerpc/include/asm/kvm_host.h | 18 ++++ + arch/powerpc/include/asm/kvm_ppc.h | 14 +-- + arch/powerpc/kvm/book3s.c | 7 ++ + arch/powerpc/kvm/book3s.h | 2 + + arch/powerpc/kvm/book3s_64_mmu_radix.c | 78 ++++++++++++++- + arch/powerpc/kvm/book3s_hv.c | 10 +- + arch/x86/include/asm/kvm_host.h | 27 +++++ + arch/x86/kvm/mmu/spte.h | 12 --- + arch/x86/kvm/mmu/tdp_mmu.c | 41 ++++++++ + include/linux/kvm_host.h | 29 ++++++ + include/linux/mmu_notifier.h | 40 ++++++++ + include/linux/mmzone.h | 6 +- + mm/mmu_notifier.c | 26 +++++ + mm/rmap.c | 8 +- + mm/vmscan.c | 127 +++++++++++++++++++++--- + virt/kvm/kvm_main.c | 58 +++++++++++ + 22 files changed, 593 insertions(+), 97 deletions(-) + +diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h +index 35a159d131b5..572bcd321586 100644 +--- a/arch/arm64/include/asm/kvm_host.h ++++ b/arch/arm64/include/asm/kvm_host.h +@@ -1031,4 +1031,11 @@ static inline void kvm_hyp_reserve(void) { } + void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu); + bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu); + ++/* see the comments on the generic kvm_arch_has_test_clear_young() */ ++#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young ++static inline bool kvm_arch_has_test_clear_young(void) ++{ ++ return IS_ENABLED(CONFIG_KVM) && cpu_has_hw_af() && !is_protected_kvm_enabled(); ++} ++ + #endif /* __ARM64_KVM_HOST_H__ */ +diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h +index 63f81b27a4e3..8c9a04388c88 100644 +--- a/arch/arm64/include/asm/kvm_pgtable.h ++++ b/arch/arm64/include/asm/kvm_pgtable.h +@@ -105,6 +105,7 @@ static inline bool kvm_level_supports_block_mapping(u32 level) + * @put_page: Decrement the refcount on a page. When the + * refcount reaches 0 the page is automatically + * freed. ++ * @put_page_rcu: RCU variant of put_page(). + * @page_count: Return the refcount of a page. + * @phys_to_virt: Convert a physical address into a virtual + * address mapped in the current context. +@@ -122,6 +123,7 @@ struct kvm_pgtable_mm_ops { + void (*free_removed_table)(void *addr, u32 level); + void (*get_page)(void *addr); + void (*put_page)(void *addr); ++ void (*put_page_rcu)(void *addr); + int (*page_count)(void *addr); + void* (*phys_to_virt)(phys_addr_t phys); + phys_addr_t (*virt_to_phys)(void *addr); +@@ -188,6 +190,12 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, + * children. + * @KVM_PGTABLE_WALK_SHARED: Indicates the page-tables may be shared + * with other software walkers. ++ * ++ * kvm_arch_test_clear_young() is a special case. It relies on two ++ * techniques, RCU and cmpxchg, to safely test and clear the accessed ++ * bit without taking the MMU lock. The former protects KVM page tables ++ * from being freed while the latter clears the accessed bit atomically ++ * against both the hardware and other software page table walkers. 
+ */ + enum kvm_pgtable_walk_flags { + KVM_PGTABLE_WALK_LEAF = BIT(0), +diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h +index c8dca8ae359c..350437661d4b 100644 +--- a/arch/arm64/include/asm/stage2_pgtable.h ++++ b/arch/arm64/include/asm/stage2_pgtable.h +@@ -30,4 +30,47 @@ + */ + #define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) + ++#define KVM_PTE_TYPE BIT(1) ++#define KVM_PTE_TYPE_BLOCK 0 ++#define KVM_PTE_TYPE_PAGE 1 ++#define KVM_PTE_TYPE_TABLE 1 ++ ++#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) ++ ++#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) ++#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) ++#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 ++#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 ++#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) ++#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 ++#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) ++ ++#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) ++#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) ++#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) ++#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) ++#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 ++#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) ++ ++#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) ++ ++#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) ++ ++#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) ++ ++#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) ++ ++#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ ++ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ ++ KVM_PTE_LEAF_ATTR_HI_S2_XN) ++ ++#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) ++#define KVM_MAX_OWNER_ID 1 ++ ++/* ++ * Used to indicate a pte for which a 'break-before-make' sequence is in ++ * progress. ++ */ ++#define KVM_INVALID_PTE_LOCKED BIT(10) ++ + #endif /* __ARM64_S2_PGTABLE_H_ */ +diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c +index 9c5573bc4614..6770bc47f5c9 100644 +--- a/arch/arm64/kvm/arm.c ++++ b/arch/arm64/kvm/arm.c +@@ -191,6 +191,7 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) + */ + void kvm_arch_destroy_vm(struct kvm *kvm) + { ++ kvm_free_stage2_pgd(&kvm->arch.mmu); + bitmap_free(kvm->arch.pmu_filter); + free_cpumask_var(kvm->arch.supported_cpus); + +diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c +index b11cf2c618a6..8d65ee4767f1 100644 +--- a/arch/arm64/kvm/hyp/pgtable.c ++++ b/arch/arm64/kvm/hyp/pgtable.c +@@ -12,49 +12,6 @@ + #include + + +-#define KVM_PTE_TYPE BIT(1) +-#define KVM_PTE_TYPE_BLOCK 0 +-#define KVM_PTE_TYPE_PAGE 1 +-#define KVM_PTE_TYPE_TABLE 1 +- +-#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) +- +-#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) +-#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) +-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 +-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 +-#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) +-#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 +-#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) +- +-#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) +-#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) +-#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) +-#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) +-#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 +-#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) +- +-#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) +- +-#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) +- +-#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) +- +-#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) +- +-#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ +- 
KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ +- KVM_PTE_LEAF_ATTR_HI_S2_XN) +- +-#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) +-#define KVM_MAX_OWNER_ID 1 +- +-/* +- * Used to indicate a pte for which a 'break-before-make' sequence is in +- * progress. +- */ +-#define KVM_INVALID_PTE_LOCKED BIT(10) +- + struct kvm_pgtable_walk_data { + struct kvm_pgtable_walker *walker; + +@@ -994,8 +951,12 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, + mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), + kvm_granule_size(ctx->level)); + +- if (childp) +- mm_ops->put_page(childp); ++ if (childp) { ++ if (mm_ops->put_page_rcu) ++ mm_ops->put_page_rcu(childp); ++ else ++ mm_ops->put_page(childp); ++ } + + return 0; + } +diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c +index a3ee3b605c9b..761fffc788f5 100644 +--- a/arch/arm64/kvm/mmu.c ++++ b/arch/arm64/kvm/mmu.c +@@ -171,6 +171,21 @@ static int kvm_host_page_count(void *addr) + return page_count(virt_to_page(addr)); + } + ++static void kvm_s2_rcu_put_page(struct rcu_head *head) ++{ ++ put_page(container_of(head, struct page, rcu_head)); ++} ++ ++static void kvm_s2_put_page_rcu(void *addr) ++{ ++ struct page *page = virt_to_page(addr); ++ ++ if (kvm_host_page_count(addr) == 1) ++ kvm_account_pgtable_pages(addr, -1); ++ ++ call_rcu(&page->rcu_head, kvm_s2_rcu_put_page); ++} ++ + static phys_addr_t kvm_host_pa(void *addr) + { + return __pa(addr); +@@ -684,6 +699,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { + .free_removed_table = stage2_free_removed_table, + .get_page = kvm_host_get_page, + .put_page = kvm_s2_put_page, ++ .put_page_rcu = kvm_s2_put_page_rcu, + .page_count = kvm_host_page_count, + .phys_to_virt = kvm_host_va, + .virt_to_phys = kvm_host_pa, +@@ -1624,6 +1640,66 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) + return pte_valid(pte) && pte_young(pte); + } + ++struct test_clear_young_arg { ++ struct kvm_gfn_range *range; ++ gfn_t lsb_gfn; ++ unsigned long *bitmap; ++}; ++ ++static int stage2_test_clear_young(const struct kvm_pgtable_visit_ctx *ctx, ++ enum kvm_pgtable_walk_flags flags) ++{ ++ struct test_clear_young_arg *arg = ctx->arg; ++ gfn_t gfn = ctx->addr / PAGE_SIZE; ++ kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF; ++ ++ VM_WARN_ON_ONCE(!page_count(virt_to_page(ctx->ptep))); ++ VM_WARN_ON_ONCE(gfn < arg->range->start || gfn >= arg->range->end); ++ ++ if (!kvm_pte_valid(new)) ++ return 0; ++ ++ if (new == ctx->old) ++ return 0; ++ ++ /* see the comments on the generic kvm_arch_has_test_clear_young() */ ++ if (__test_and_change_bit(arg->lsb_gfn - gfn, arg->bitmap)) ++ cmpxchg64(ctx->ptep, ctx->old, new); ++ ++ return 0; ++} ++ ++bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, ++ gfn_t lsb_gfn, unsigned long *bitmap) ++{ ++ u64 start = range->start * PAGE_SIZE; ++ u64 end = range->end * PAGE_SIZE; ++ struct test_clear_young_arg arg = { ++ .range = range, ++ .lsb_gfn = lsb_gfn, ++ .bitmap = bitmap, ++ }; ++ struct kvm_pgtable_walker walker = { ++ .cb = stage2_test_clear_young, ++ .arg = &arg, ++ .flags = KVM_PGTABLE_WALK_LEAF, ++ }; ++ ++ BUILD_BUG_ON(is_hyp_code()); ++ ++ if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young())) ++ return false; ++ ++ /* see the comments on kvm_pgtable_walk_flags */ ++ rcu_read_lock(); ++ ++ kvm_pgtable_walk(kvm->arch.mmu.pgt, start, end - start, &walker); ++ ++ rcu_read_unlock(); ++ ++ return true; ++} ++ + bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) + { + if (!kvm->arch.mmu.pgt) 
+@@ -1848,7 +1924,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) + + void kvm_arch_flush_shadow_all(struct kvm *kvm) + { +- kvm_free_stage2_pgd(&kvm->arch.mmu); + } + + void kvm_arch_flush_shadow_memslot(struct kvm *kvm, +diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h +index caea15dcb91d..996850029ce0 100644 +--- a/arch/powerpc/include/asm/kvm_host.h ++++ b/arch/powerpc/include/asm/kvm_host.h +@@ -886,4 +886,22 @@ static inline void kvm_arch_exit(void) {} + static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} + static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} + ++static inline int kvmppc_radix_possible(void) ++{ ++ return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled(); ++} ++ ++static inline bool kvmhv_on_pseries(void) ++{ ++ return IS_ENABLED(CONFIG_PPC_PSERIES) && !cpu_has_feature(CPU_FTR_HVMODE); ++} ++ ++/* see the comments on the generic kvm_arch_has_test_clear_young() */ ++#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young ++static inline bool kvm_arch_has_test_clear_young(void) ++{ ++ return IS_ENABLED(CONFIG_KVM) && IS_ENABLED(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && ++ kvmppc_radix_possible() && !kvmhv_on_pseries(); ++} ++ + #endif /* __POWERPC_KVM_HOST_H__ */ +diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h +index eae9619b6190..0bb772fc12b1 100644 +--- a/arch/powerpc/include/asm/kvm_ppc.h ++++ b/arch/powerpc/include/asm/kvm_ppc.h +@@ -277,6 +277,8 @@ struct kvmppc_ops { + bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range); + bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); + bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); ++ bool (*test_clear_young)(struct kvm *kvm, struct kvm_gfn_range *range, ++ gfn_t lsb_gfn, unsigned long *bitmap); + bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); + void (*free_memslot)(struct kvm_memory_slot *slot); + int (*init_vm)(struct kvm *kvm); +@@ -580,18 +582,6 @@ static inline bool kvm_hv_mode_active(void) { return false; } + + #endif + +-#ifdef CONFIG_PPC_PSERIES +-static inline bool kvmhv_on_pseries(void) +-{ +- return !cpu_has_feature(CPU_FTR_HVMODE); +-} +-#else +-static inline bool kvmhv_on_pseries(void) +-{ +- return false; +-} +-#endif +- + #ifdef CONFIG_KVM_XICS + static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) + { +diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c +index 6d525285dbe8..f4cf330e3e81 100644 +--- a/arch/powerpc/kvm/book3s.c ++++ b/arch/powerpc/kvm/book3s.c +@@ -877,6 +877,13 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) + return kvm->arch.kvm_ops->test_age_gfn(kvm, range); + } + ++bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, ++ gfn_t lsb_gfn, unsigned long *bitmap) ++{ ++ return kvm->arch.kvm_ops->test_clear_young && ++ kvm->arch.kvm_ops->test_clear_young(kvm, range, lsb_gfn, bitmap); ++} ++ + bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) + { + return kvm->arch.kvm_ops->set_spte_gfn(kvm, range); +diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h +index 58391b4b32ed..fe9cac423817 100644 +--- a/arch/powerpc/kvm/book3s.h ++++ b/arch/powerpc/kvm/book3s.h +@@ -12,6 +12,8 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, + extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range); + extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); + extern 
bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); ++extern bool kvmhv_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, ++ gfn_t lsb_gfn, unsigned long *bitmap); + extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); + + extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu); +diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c +index 9d3743ca16d5..8476646c554c 100644 +--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c ++++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c +@@ -1083,6 +1083,78 @@ bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + return ref; + } + ++bool kvmhv_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, ++ gfn_t lsb_gfn, unsigned long *bitmap) ++{ ++ bool success; ++ gfn_t gfn = range->start; ++ ++ if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young())) ++ return false; ++ ++ /* ++ * This function relies on two techniques, RCU and cmpxchg, to safely ++ * test and clear the accessed bit without taking the MMU lock. The ++ * former protects KVM page tables from being freed while the latter ++ * clears the accessed bit atomically against both the hardware and ++ * other software page table walkers. ++ */ ++ rcu_read_lock(); ++ ++ success = kvm_is_radix(kvm); ++ if (!success) ++ goto unlock; ++ ++ /* ++ * case 1: this function kvmppc_switch_mmu_to_hpt() ++ * ++ * rcu_read_lock() ++ * test kvm_is_radix() kvm->arch.radix = 0 ++ * use kvm->arch.pgtable ++ * rcu_read_unlock() ++ * synchronize_rcu() ++ * kvmppc_free_radix() ++ * ++ * ++ * case 2: this function kvmppc_switch_mmu_to_radix() ++ * ++ * kvmppc_init_vm_radix() ++ * smp_wmb() ++ * test kvm_is_radix() kvm->arch.radix = 1 ++ * smp_rmb() ++ * use kvm->arch.pgtable ++ */ ++ smp_rmb(); ++ ++ while (gfn < range->end) { ++ pte_t *ptep; ++ pte_t old, new; ++ unsigned int shift; ++ ++ ptep = find_kvm_secondary_pte_unlocked(kvm, gfn * PAGE_SIZE, &shift); ++ if (!ptep) ++ goto next; ++ ++ VM_WARN_ON_ONCE(!page_count(virt_to_page(ptep))); ++ ++ old = READ_ONCE(*ptep); ++ if (!pte_present(old) || !pte_young(old)) ++ goto next; ++ ++ new = pte_mkold(old); ++ ++ /* see the comments on the generic kvm_arch_has_test_clear_young() */ ++ if (__test_and_change_bit(lsb_gfn - gfn, bitmap)) ++ pte_xchg(ptep, old, new); ++next: ++ gfn += shift ? 
BIT(shift - PAGE_SHIFT) : 1; ++ } ++unlock: ++ rcu_read_unlock(); ++ ++ return success; ++} ++ + /* Returns the number of PAGE_SIZE pages that are dirty */ + static int kvm_radix_test_clear_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot, int pagenum) +@@ -1464,13 +1536,15 @@ int kvmppc_radix_init(void) + { + unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE; + +- kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor); ++ kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, ++ SLAB_TYPESAFE_BY_RCU, pte_ctor); + if (!kvm_pte_cache) + return -ENOMEM; + + size = sizeof(void *) << RADIX_PMD_INDEX_SIZE; + +- kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor); ++ kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, ++ SLAB_TYPESAFE_BY_RCU, pmd_ctor); + if (!kvm_pmd_cache) { + kmem_cache_destroy(kvm_pte_cache); + return -ENOMEM; +diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c +index 6ba68dd6190b..17b415661282 100644 +--- a/arch/powerpc/kvm/book3s_hv.c ++++ b/arch/powerpc/kvm/book3s_hv.c +@@ -5242,6 +5242,8 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) + spin_lock(&kvm->mmu_lock); + kvm->arch.radix = 0; + spin_unlock(&kvm->mmu_lock); ++ /* see the comments in kvmhv_test_clear_young() */ ++ synchronize_rcu(); + kvmppc_free_radix(kvm); + + lpcr = LPCR_VPM1; +@@ -5266,6 +5268,8 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm) + if (err) + return err; + kvmppc_rmap_reset(kvm); ++ /* see the comments in kvmhv_test_clear_young() */ ++ smp_wmb(); + /* Mutual exclusion with kvm_unmap_gfn_range etc. */ + spin_lock(&kvm->mmu_lock); + kvm->arch.radix = 1; +@@ -6165,6 +6169,7 @@ static struct kvmppc_ops kvm_ops_hv = { + .unmap_gfn_range = kvm_unmap_gfn_range_hv, + .age_gfn = kvm_age_gfn_hv, + .test_age_gfn = kvm_test_age_gfn_hv, ++ .test_clear_young = kvmhv_test_clear_young, + .set_spte_gfn = kvm_set_spte_gfn_hv, + .free_memslot = kvmppc_core_free_memslot_hv, + .init_vm = kvmppc_core_init_vm_hv, +@@ -6225,11 +6230,6 @@ static int kvm_init_subcore_bitmap(void) + return 0; + } + +-static int kvmppc_radix_possible(void) +-{ +- return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled(); +-} +- + static int kvmppc_book3s_init_hv(void) + { + int r; +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 6aaae18f1854..d2995c9e8f07 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1367,6 +1367,12 @@ struct kvm_arch { + * the MMU lock in read mode + the tdp_mmu_pages_lock or + * the MMU lock in write mode + * ++ * kvm_arch_test_clear_young() is a special case. It relies on two ++ * techniques, RCU and cmpxchg, to safely test and clear the accessed ++ * bit without taking the MMU lock. The former protects KVM page tables ++ * from being freed while the latter clears the accessed bit atomically ++ * against both the hardware and other software page table walkers. ++ * + * Roots will remain in the list until their tdp_mmu_root_count + * drops to zero, at which point the thread that decremented the + * count to zero should removed the root from the list and clean +@@ -2171,4 +2177,25 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); + KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ + KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) + ++extern u64 __read_mostly shadow_accessed_mask; ++ ++/* ++ * Returns true if A/D bits are supported in hardware and are enabled by KVM. ++ * When enabled, KVM uses A/D bits for all non-nested MMUs. 
Because L1 can ++ * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the ++ * scenario where KVM is using A/D bits for L1, but not L2. ++ */ ++static inline bool kvm_ad_enabled(void) ++{ ++ return shadow_accessed_mask; ++} ++ ++/* see the comments on the generic kvm_arch_has_test_clear_young() */ ++#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young ++static inline bool kvm_arch_has_test_clear_young(void) ++{ ++ return IS_ENABLED(CONFIG_KVM) && IS_ENABLED(CONFIG_X86_64) && ++ (!IS_REACHABLE(CONFIG_KVM) || (kvm_ad_enabled() && tdp_enabled)); ++} ++ + #endif /* _ASM_X86_KVM_HOST_H */ +diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h +index 6f54dc9409c9..0dc7fed1f3fd 100644 +--- a/arch/x86/kvm/mmu/spte.h ++++ b/arch/x86/kvm/mmu/spte.h +@@ -153,7 +153,6 @@ extern u64 __read_mostly shadow_mmu_writable_mask; + extern u64 __read_mostly shadow_nx_mask; + extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ + extern u64 __read_mostly shadow_user_mask; +-extern u64 __read_mostly shadow_accessed_mask; + extern u64 __read_mostly shadow_dirty_mask; + extern u64 __read_mostly shadow_mmio_value; + extern u64 __read_mostly shadow_mmio_mask; +@@ -247,17 +246,6 @@ static inline bool is_shadow_present_pte(u64 pte) + return !!(pte & SPTE_MMU_PRESENT_MASK); + } + +-/* +- * Returns true if A/D bits are supported in hardware and are enabled by KVM. +- * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can +- * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the +- * scenario where KVM is using A/D bits for L1, but not L2. +- */ +-static inline bool kvm_ad_enabled(void) +-{ +- return !!shadow_accessed_mask; +-} +- + static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) + { + return sp->role.ad_disabled; +diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c +index d6df38d371a0..9028e09f1aab 100644 +--- a/arch/x86/kvm/mmu/tdp_mmu.c ++++ b/arch/x86/kvm/mmu/tdp_mmu.c +@@ -1309,6 +1309,47 @@ bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) + return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); + } + ++bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, ++ gfn_t lsb_gfn, unsigned long *bitmap) ++{ ++ struct kvm_mmu_page *root; ++ ++ if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young())) ++ return false; ++ ++ if (kvm_memslots_have_rmaps(kvm)) ++ return false; ++ ++ /* see the comments on kvm_arch->tdp_mmu_roots */ ++ rcu_read_lock(); ++ ++ list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) { ++ struct tdp_iter iter; ++ ++ if (kvm_mmu_page_as_id(root) != range->slot->as_id) ++ continue; ++ ++ tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) { ++ u64 *sptep = rcu_dereference(iter.sptep); ++ u64 new_spte = iter.old_spte & ~shadow_accessed_mask; ++ ++ VM_WARN_ON_ONCE(!page_count(virt_to_page(sptep))); ++ VM_WARN_ON_ONCE(iter.gfn < range->start || iter.gfn >= range->end); ++ ++ if (new_spte == iter.old_spte) ++ continue; ++ ++ /* see the comments on the generic kvm_arch_has_test_clear_young() */ ++ if (__test_and_change_bit(lsb_gfn - iter.gfn, bitmap)) ++ cmpxchg64(sptep, iter.old_spte, new_spte); ++ } ++ } ++ ++ rcu_read_unlock(); ++ ++ return true; ++} ++ + static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_gfn_range *range) + { +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index 4f26b244f6d0..df46fc815c8b 100644 +--- a/include/linux/kvm_host.h ++++ 
b/include/linux/kvm_host.h +@@ -2281,4 +2281,33 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) + /* Max number of entries allowed for each kvm dirty ring */ + #define KVM_DIRTY_RING_MAX_ENTRIES 65536 + ++/* ++ * Architectures that implement kvm_arch_test_clear_young() should override ++ * kvm_arch_has_test_clear_young(). ++ * ++ * kvm_arch_has_test_clear_young() is allowed to return false positive. It can ++ * return true if kvm_arch_test_clear_young() is supported but disabled due to ++ * some runtime constraint. In this case, kvm_arch_test_clear_young() should ++ * return false. ++ * ++ * The last parameter to kvm_arch_test_clear_young() is a bitmap with the ++ * following specifications: ++ * 1. The offset of each bit is relative to the second to the last parameter ++ * lsb_gfn. E.g., the offset corresponding to gfn is lsb_gfn-gfn. This is to ++ * better suit batching while forward looping. ++ * 2. For each KVM PTE with the accessed bit set, the implementation should flip ++ * the corresponding bit in the bitmap. It should only clear the accessed bit ++ * if the old value is 1. This allows the caller to test or test and clear ++ * the accessed bit. ++ */ ++#ifndef kvm_arch_has_test_clear_young ++static inline bool kvm_arch_has_test_clear_young(void) ++{ ++ return false; ++} ++#endif ++ ++bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, ++ gfn_t lsb_gfn, unsigned long *bitmap); ++ + #endif +diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h +index d6c06e140277..521f71ad0467 100644 +--- a/include/linux/mmu_notifier.h ++++ b/include/linux/mmu_notifier.h +@@ -122,6 +122,11 @@ struct mmu_notifier_ops { + struct mm_struct *mm, + unsigned long address); + ++ /* see the comments on mmu_notifier_test_clear_young() */ ++ bool (*test_clear_young)(struct mmu_notifier *mn, struct mm_struct *mm, ++ unsigned long start, unsigned long end, ++ unsigned long *bitmap); ++ + /* + * change_pte is called in cases that pte mapping to page is changed: + * for example, when ksm remaps pte to point to a new shared page. +@@ -391,6 +396,9 @@ extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, + extern int __mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, + unsigned long end); ++extern int __mmu_notifier_test_clear_young(struct mm_struct *mm, ++ unsigned long start, unsigned long end, ++ bool fallback, unsigned long *bitmap); + extern int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address); + extern void __mmu_notifier_change_pte(struct mm_struct *mm, +@@ -433,6 +441,31 @@ static inline int mmu_notifier_clear_young(struct mm_struct *mm, + return 0; + } + ++/* ++ * This function always returns 0 if fallback is not allowed. If fallback ++ * happens, its return value is similar to that of mmu_notifier_clear_young(). ++ * ++ * The bitmap has the following specifications: ++ * 1. The number of bits should be at least (end-start)/PAGE_SIZE. ++ * 2. The offset of each bit is relative to the end. E.g., the offset ++ * corresponding to addr is (end-addr)/PAGE_SIZE-1. This is to better suit ++ * batching while forward looping. ++ * 3. For each KVM PTE with the accessed bit set (young), this function flips ++ * the corresponding bit in the bitmap. It only clears the accessed bit if ++ * the old value is 1. A caller can test or test and clear the accessed bit ++ * by setting the corresponding bit in the bitmap to 0 or 1, and the new ++ * value will be 1 or 0 for a young KVM PTE. 
++ */ ++static inline int mmu_notifier_test_clear_young(struct mm_struct *mm, ++ unsigned long start, unsigned long end, ++ bool fallback, unsigned long *bitmap) ++{ ++ if (mm_has_notifiers(mm)) ++ return __mmu_notifier_test_clear_young(mm, start, end, fallback, bitmap); ++ ++ return 0; ++} ++ + static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) + { +@@ -687,6 +720,13 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, + return 0; + } + ++static inline int mmu_notifier_test_clear_young(struct mm_struct *mm, ++ unsigned long start, unsigned long end, ++ bool fallback, unsigned long *bitmap) ++{ ++ return 0; ++} ++ + static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) + { +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 977be526c939..beece92ce62e 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -379,6 +379,7 @@ enum { + LRU_GEN_CORE, + LRU_GEN_MM_WALK, + LRU_GEN_NONLEAF_YOUNG, ++ LRU_GEN_SPTE_WALK, + NR_LRU_GEN_CAPS + }; + +@@ -485,7 +486,7 @@ struct lru_gen_mm_walk { + }; + + void lru_gen_init_lruvec(struct lruvec *lruvec); +-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); ++bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw); + + #ifdef CONFIG_MEMCG + +@@ -573,8 +574,9 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) + { + } + +-static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) ++static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + { ++ return false; + } + + #ifdef CONFIG_MEMCG +diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c +index f45ff1b7626a..324799848fed 100644 +--- a/mm/mmu_notifier.c ++++ b/mm/mmu_notifier.c +@@ -402,6 +402,32 @@ int __mmu_notifier_clear_young(struct mm_struct *mm, + return young; + } + ++/* see the comments on mmu_notifier_test_clear_young() */ ++int __mmu_notifier_test_clear_young(struct mm_struct *mm, ++ unsigned long start, unsigned long end, ++ bool fallback, unsigned long *bitmap) ++{ ++ int key; ++ struct mmu_notifier *mn; ++ int young = 0; ++ ++ key = srcu_read_lock(&srcu); ++ ++ hlist_for_each_entry_srcu(mn, &mm->notifier_subscriptions->list, ++ hlist, srcu_read_lock_held(&srcu)) { ++ if (mn->ops->test_clear_young && ++ mn->ops->test_clear_young(mn, mm, start, end, bitmap)) ++ continue; ++ ++ if (fallback && mn->ops->clear_young) ++ young |= mn->ops->clear_young(mn, mm, start, end); ++ } ++ ++ srcu_read_unlock(&srcu, key); ++ ++ return young; ++} ++ + int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) + { +diff --git a/mm/rmap.c b/mm/rmap.c +index 7b9205cb7d87..82e3a0be1ada 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -822,12 +822,10 @@ static bool folio_referenced_one(struct folio *folio, + return false; /* To break the loop */ + } + +- if (pvmw.pte) { +- if (lru_gen_enabled() && pte_young(*pvmw.pte)) { +- lru_gen_look_around(&pvmw); ++ if (lru_gen_enabled() && pvmw.pte) { ++ if (lru_gen_look_around(&pvmw)) + referenced++; +- } +- ++ } else if (pvmw.pte) { + if (ptep_clear_flush_young_notify(vma, address, + pvmw.pte)) + referenced++; +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 04a54656b6b7..2fc436638dfe 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -57,6 +57,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -3927,6 +3929,55 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, + return folio; + } + ++static bool test_spte_young(struct 
mm_struct *mm, unsigned long addr, unsigned long end, ++ unsigned long *bitmap, unsigned long *last) ++{ ++ if (!kvm_arch_has_test_clear_young() || !get_cap(LRU_GEN_SPTE_WALK)) ++ return false; ++ ++ if (*last > addr) ++ goto done; ++ ++ *last = end - addr > MIN_LRU_BATCH * PAGE_SIZE ? ++ addr + MIN_LRU_BATCH * PAGE_SIZE - 1 : end - 1; ++ bitmap_zero(bitmap, MIN_LRU_BATCH); ++ ++ mmu_notifier_test_clear_young(mm, addr, *last + 1, false, bitmap); ++done: ++ return test_bit((*last - addr) / PAGE_SIZE, bitmap); ++} ++ ++static void clear_spte_young(struct mm_struct *mm, unsigned long addr, ++ unsigned long *bitmap, unsigned long *last) ++{ ++ int i; ++ unsigned long start, end = *last + 1; ++ ++ if (addr + PAGE_SIZE != end) ++ return; ++ ++ i = find_last_bit(bitmap, MIN_LRU_BATCH); ++ if (i == MIN_LRU_BATCH) ++ return; ++ ++ start = end - (i + 1) * PAGE_SIZE; ++ ++ i = find_first_bit(bitmap, MIN_LRU_BATCH); ++ ++ end -= i * PAGE_SIZE; ++ ++ mmu_notifier_test_clear_young(mm, start, end, false, bitmap); ++} ++ ++static void skip_spte_young(struct mm_struct *mm, unsigned long addr, ++ unsigned long *bitmap, unsigned long *last) ++{ ++ if (*last > addr) ++ __clear_bit((*last - addr) / PAGE_SIZE, bitmap); ++ ++ clear_spte_young(mm, addr, bitmap, last); ++} ++ + static bool suitable_to_scan(int total, int young) + { + int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); +@@ -3942,6 +3993,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, + pte_t *pte; + spinlock_t *ptl; + unsigned long addr; ++ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; ++ unsigned long last = 0; + int total = 0; + int young = 0; + struct lru_gen_mm_walk *walk = args->private; +@@ -3960,6 +4013,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, + pte = pte_offset_map(pmd, start & PMD_MASK); + restart: + for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { ++ bool success; + unsigned long pfn; + struct folio *folio; + +@@ -3967,20 +4021,27 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, + walk->mm_stats[MM_LEAF_TOTAL]++; + + pfn = get_pte_pfn(pte[i], args->vma, addr); +- if (pfn == -1) ++ if (pfn == -1) { ++ skip_spte_young(args->vma->vm_mm, addr, bitmap, &last); + continue; ++ } + +- if (!pte_young(pte[i])) { ++ success = test_spte_young(args->vma->vm_mm, addr, end, bitmap, &last); ++ if (!success && !pte_young(pte[i])) { ++ skip_spte_young(args->vma->vm_mm, addr, bitmap, &last); + walk->mm_stats[MM_LEAF_OLD]++; + continue; + } + + folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); +- if (!folio) ++ if (!folio) { ++ skip_spte_young(args->vma->vm_mm, addr, bitmap, &last); + continue; ++ } + +- if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) +- VM_WARN_ON_ONCE(true); ++ clear_spte_young(args->vma->vm_mm, addr, bitmap, &last); ++ if (pte_young(pte[i])) ++ ptep_test_and_clear_young(args->vma, addr, pte + i); + + young++; + walk->mm_stats[MM_LEAF_YOUNG]++; +@@ -4589,6 +4650,24 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + * rmap/PT walk feedback + ******************************************************************************/ + ++static bool should_look_around(struct vm_area_struct *vma, unsigned long addr, ++ pte_t *pte, int *young) ++{ ++ unsigned long old = true; ++ ++ *young = mmu_notifier_test_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE, true, &old); ++ if (!old) ++ *young = true; ++ ++ if (pte_young(*pte)) { ++ 
ptep_test_and_clear_young(vma, addr, pte); ++ *young = true; ++ return true; ++ } ++ ++ return !old && get_cap(LRU_GEN_SPTE_WALK); ++} ++ + /* + * This function exploits spatial locality when shrink_folio_list() walks the + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If +@@ -4596,12 +4675,14 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + * the PTE table to the Bloom filter. This forms a feedback loop between the + * eviction and the aging. + */ +-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) ++bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + { + int i; + unsigned long start; + unsigned long end; + struct lru_gen_mm_walk *walk; ++ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; ++ unsigned long last = 0; + int young = 0; + pte_t *pte = pvmw->pte; + unsigned long addr = pvmw->address; +@@ -4615,8 +4696,11 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + lockdep_assert_held(pvmw->ptl); + VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); + ++ if (!should_look_around(pvmw->vma, addr, pte, &young)) ++ return young; ++ + if (spin_is_contended(pvmw->ptl)) +- return; ++ return young; + + /* avoid taking the LRU lock under the PTL when possible */ + walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; +@@ -4624,6 +4708,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + start = max(addr & PMD_MASK, pvmw->vma->vm_start); + end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; + ++ if (end - start == PAGE_SIZE) ++ return young; ++ + if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { + if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) + end = start + MIN_LRU_BATCH * PAGE_SIZE; +@@ -4637,28 +4724,37 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + + /* folio_update_gen() requires stable folio_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) +- return; ++ return young; + + arch_enter_lazy_mmu_mode(); + + pte -= (addr - start) / PAGE_SIZE; + + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { ++ bool success; + unsigned long pfn; + + pfn = get_pte_pfn(pte[i], pvmw->vma, addr); +- if (pfn == -1) ++ if (pfn == -1) { ++ skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); + continue; ++ } + +- if (!pte_young(pte[i])) ++ success = test_spte_young(pvmw->vma->vm_mm, addr, end, bitmap, &last); ++ if (!success && !pte_young(pte[i])) { ++ skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); + continue; ++ } + + folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); +- if (!folio) ++ if (!folio) { ++ skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); + continue; ++ } + +- if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) +- VM_WARN_ON_ONCE(true); ++ clear_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last); ++ if (pte_young(pte[i])) ++ ptep_test_and_clear_young(pvmw->vma, addr, pte + i); + + young++; + +@@ -4688,6 +4784,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + /* feedback from rmap walkers to page table walkers */ + if (suitable_to_scan(i, young)) + update_bloom_filter(lruvec, max_seq, pvmw->pmd); ++ ++ return young; + } + + /****************************************************************************** +@@ -5707,6 +5805,9 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c + if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) + caps |= BIT(LRU_GEN_NONLEAF_YOUNG); + ++ if (kvm_arch_has_test_clear_young() && 
get_cap(LRU_GEN_SPTE_WALK)) ++ caps |= BIT(LRU_GEN_SPTE_WALK); ++ + return sysfs_emit(buf, "0x%04x\n", caps); + } + +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index 9c60384b5ae0..1b465df4a93d 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -875,6 +875,63 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, + return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn); + } + ++static bool kvm_test_clear_young(struct kvm *kvm, unsigned long start, ++ unsigned long end, unsigned long *bitmap) ++{ ++ int i; ++ int key; ++ bool success = true; ++ ++ trace_kvm_age_hva(start, end); ++ ++ key = srcu_read_lock(&kvm->srcu); ++ ++ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { ++ struct interval_tree_node *node; ++ struct kvm_memslots *slots = __kvm_memslots(kvm, i); ++ ++ kvm_for_each_memslot_in_hva_range(node, slots, start, end - 1) { ++ gfn_t lsb_gfn; ++ unsigned long hva_start, hva_end; ++ struct kvm_gfn_range range = { ++ .slot = container_of(node, struct kvm_memory_slot, ++ hva_node[slots->node_idx]), ++ }; ++ ++ hva_start = max(start, range.slot->userspace_addr); ++ hva_end = min(end - 1, range.slot->userspace_addr + ++ range.slot->npages * PAGE_SIZE - 1); ++ ++ range.start = hva_to_gfn_memslot(hva_start, range.slot); ++ range.end = hva_to_gfn_memslot(hva_end, range.slot) + 1; ++ ++ if (WARN_ON_ONCE(range.end <= range.start)) ++ continue; ++ ++ /* see the comments on the generic kvm_arch_has_test_clear_young() */ ++ lsb_gfn = hva_to_gfn_memslot(end - 1, range.slot); ++ ++ success = kvm_arch_test_clear_young(kvm, &range, lsb_gfn, bitmap); ++ if (!success) ++ break; ++ } ++ } ++ ++ srcu_read_unlock(&kvm->srcu, key); ++ ++ return success; ++} ++ ++static bool kvm_mmu_notifier_test_clear_young(struct mmu_notifier *mn, struct mm_struct *mm, ++ unsigned long start, unsigned long end, ++ unsigned long *bitmap) ++{ ++ if (kvm_arch_has_test_clear_young()) ++ return kvm_test_clear_young(mmu_notifier_to_kvm(mn), start, end, bitmap); ++ ++ return false; ++} ++ + static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +@@ -903,6 +960,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { + .clear_flush_young = kvm_mmu_notifier_clear_flush_young, + .clear_young = kvm_mmu_notifier_clear_young, + .test_young = kvm_mmu_notifier_test_young, ++ .test_clear_young = kvm_mmu_notifier_test_clear_young, + .change_pte = kvm_mmu_notifier_change_pte, + .release = kvm_mmu_notifier_release, + }; +-- +2.39.2 + +From 1c4ee6ec54d7431a95f829f518cb6b1f7154c6b7 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 13 Feb 2023 09:26:09 +0100 +Subject: [PATCH 12/15] objtool + +Signed-off-by: Peter Jung +--- + tools/objtool/.gitignore | 1 + + tools/objtool/Build | 2 - + tools/objtool/Documentation/objtool.txt | 8 +++ + tools/objtool/Makefile | 66 +++++++++++++++++-------- + tools/objtool/builtin-check.c | 2 +- + tools/objtool/check.c | 9 ++++ + tools/objtool/elf.c | 42 ++++++++-------- + tools/objtool/include/objtool/builtin.h | 2 - + tools/objtool/include/objtool/elf.h | 9 ++-- + tools/objtool/include/objtool/special.h | 2 +- + tools/objtool/special.c | 6 +-- + 11 files changed, 95 insertions(+), 54 deletions(-) + +diff --git a/tools/objtool/.gitignore b/tools/objtool/.gitignore +index 14236db3677f..4faa4dd72f35 100644 +--- a/tools/objtool/.gitignore ++++ b/tools/objtool/.gitignore +@@ -2,3 +2,4 @@ + arch/x86/lib/inat-tables.c + /objtool + fixdep ++libsubcmd/ +diff --git a/tools/objtool/Build 
b/tools/objtool/Build +index 33f2ee5a46d3..a3cdf8af6635 100644 +--- a/tools/objtool/Build ++++ b/tools/objtool/Build +@@ -16,8 +16,6 @@ objtool-y += libctype.o + objtool-y += str_error_r.o + objtool-y += librbtree.o + +-CFLAGS += -I$(srctree)/tools/lib +- + $(OUTPUT)libstring.o: ../lib/string.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_o_c) +diff --git a/tools/objtool/Documentation/objtool.txt b/tools/objtool/Documentation/objtool.txt +index 8a671902a187..8e53fc6735ef 100644 +--- a/tools/objtool/Documentation/objtool.txt ++++ b/tools/objtool/Documentation/objtool.txt +@@ -410,6 +410,14 @@ the objtool maintainers. + can remove this warning by putting the ANNOTATE_INTRA_FUNCTION_CALL + directive right before the call. + ++12. file.o: warning: func(): not an indirect call target ++ ++ This means that objtool is running with --ibt and a function expected ++ to be an indirect call target is not. In particular, this happens for ++ init_module() or cleanup_module() if a module relies on these special ++ names and does not use module_init() / module_exit() macros to create ++ them. ++ + + If the error doesn't seem to make sense, it could be a bug in objtool. + Feel free to ask the objtool maintainer for help. +diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile +index a3a9cc24e0e3..83b100c1e7f6 100644 +--- a/tools/objtool/Makefile ++++ b/tools/objtool/Makefile +@@ -2,19 +2,18 @@ + include ../scripts/Makefile.include + include ../scripts/Makefile.arch + +-# always use the host compiler +-AR = $(HOSTAR) +-CC = $(HOSTCC) +-LD = $(HOSTLD) +- + ifeq ($(srctree),) + srctree := $(patsubst %/,%,$(dir $(CURDIR))) + srctree := $(patsubst %/,%,$(dir $(srctree))) + endif + +-SUBCMD_SRCDIR = $(srctree)/tools/lib/subcmd/ +-LIBSUBCMD_OUTPUT = $(or $(OUTPUT),$(CURDIR)/) +-LIBSUBCMD = $(LIBSUBCMD_OUTPUT)libsubcmd.a ++LIBSUBCMD_DIR = $(srctree)/tools/lib/subcmd/ ++ifneq ($(OUTPUT),) ++ LIBSUBCMD_OUTPUT = $(abspath $(OUTPUT))/libsubcmd ++else ++ LIBSUBCMD_OUTPUT = $(CURDIR)/libsubcmd ++endif ++LIBSUBCMD = $(LIBSUBCMD_OUTPUT)/libsubcmd.a + + OBJTOOL := $(OUTPUT)objtool + OBJTOOL_IN := $(OBJTOOL)-in.o +@@ -28,16 +27,29 @@ INCLUDES := -I$(srctree)/tools/include \ + -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \ + -I$(srctree)/tools/arch/$(SRCARCH)/include \ + -I$(srctree)/tools/objtool/include \ +- -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include ++ -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include \ ++ -I$(LIBSUBCMD_OUTPUT)/include ++# Note, EXTRA_WARNINGS here was determined for CC and not HOSTCC, it ++# is passed here to match a legacy behavior. + WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed -Wno-nested-externs +-CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS) +-LDFLAGS += $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS) ++OBJTOOL_CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS) ++OBJTOOL_LDFLAGS := $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS) + + # Allow old libelf to be used: +-elfshdr := $(shell echo '$(pound)include ' | $(CC) $(CFLAGS) -x c -E - | grep elf_getshdr) +-CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED) ++elfshdr := $(shell echo '$(pound)include ' | $(HOSTCC) $(OBJTOOL_CFLAGS) -x c -E - | grep elf_getshdr) ++OBJTOOL_CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED) ++ ++# Always want host compilation. 
++HOST_OVERRIDES := CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" + + AWK = awk ++MKDIR = mkdir ++ ++ifeq ($(V),1) ++ Q = ++else ++ Q = @ ++endif + + BUILD_ORC := n + +@@ -49,21 +61,33 @@ export BUILD_ORC + export srctree OUTPUT CFLAGS SRCARCH AWK + include $(srctree)/tools/build/Makefile.include + +-$(OBJTOOL_IN): fixdep FORCE +- @$(CONFIG_SHELL) ./sync-check.sh +- @$(MAKE) $(build)=objtool ++$(OBJTOOL_IN): fixdep $(LIBSUBCMD) FORCE ++ $(Q)$(CONFIG_SHELL) ./sync-check.sh ++ $(Q)$(MAKE) $(build)=objtool $(HOST_OVERRIDES) CFLAGS="$(OBJTOOL_CFLAGS)" \ ++ LDFLAGS="$(OBJTOOL_LDFLAGS)" ++ + + $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN) +- $(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@ ++ $(QUIET_LINK)$(HOSTCC) $(OBJTOOL_IN) $(OBJTOOL_LDFLAGS) -o $@ ++ ++ ++$(LIBSUBCMD_OUTPUT): ++ $(Q)$(MKDIR) -p $@ + ++$(LIBSUBCMD): fixdep $(LIBSUBCMD_OUTPUT) FORCE ++ $(Q)$(MAKE) -C $(LIBSUBCMD_DIR) O=$(LIBSUBCMD_OUTPUT) \ ++ DESTDIR=$(LIBSUBCMD_OUTPUT) prefix= subdir= \ ++ $(HOST_OVERRIDES) EXTRA_CFLAGS="$(OBJTOOL_CFLAGS)" \ ++ $@ install_headers + +-$(LIBSUBCMD): fixdep FORCE +- $(Q)$(MAKE) -C $(SUBCMD_SRCDIR) OUTPUT=$(LIBSUBCMD_OUTPUT) ++$(LIBSUBCMD)-clean: ++ $(call QUIET_CLEAN, libsubcmd) ++ $(Q)$(RM) -r -- $(LIBSUBCMD_OUTPUT) + +-clean: ++clean: $(LIBSUBCMD)-clean + $(call QUIET_CLEAN, objtool) $(RM) $(OBJTOOL) + $(Q)find $(OUTPUT) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete +- $(Q)$(RM) $(OUTPUT)arch/x86/lib/inat-tables.c $(OUTPUT)fixdep $(LIBSUBCMD) ++ $(Q)$(RM) $(OUTPUT)arch/x86/lib/inat-tables.c $(OUTPUT)fixdep + + FORCE: + +diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c +index a4f39407bf59..7c175198d09f 100644 +--- a/tools/objtool/builtin-check.c ++++ b/tools/objtool/builtin-check.c +@@ -65,7 +65,7 @@ static int parse_hacks(const struct option *opt, const char *str, int unset) + return found ? 
0 : -1; + } + +-const struct option check_options[] = { ++static const struct option check_options[] = { + OPT_GROUP("Actions:"), + OPT_CALLBACK_OPTARG('h', "hacks", NULL, NULL, "jump_label,noinstr,skylake", "patch toolchain bugs/limitations", parse_hacks), + OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"), +diff --git a/tools/objtool/check.c b/tools/objtool/check.c +index 4b7c8b33069e..0678ba04fe22 100644 +--- a/tools/objtool/check.c ++++ b/tools/objtool/check.c +@@ -688,6 +688,7 @@ static int create_static_call_sections(struct objtool_file *file) + if (strncmp(key_name, STATIC_CALL_TRAMP_PREFIX_STR, + STATIC_CALL_TRAMP_PREFIX_LEN)) { + WARN("static_call: trampoline name malformed: %s", key_name); ++ free(key_name); + return -1; + } + tmp = key_name + STATIC_CALL_TRAMP_PREFIX_LEN - STATIC_CALL_KEY_PREFIX_LEN; +@@ -697,6 +698,7 @@ static int create_static_call_sections(struct objtool_file *file) + if (!key_sym) { + if (!opts.module) { + WARN("static_call: can't find static_call_key symbol: %s", tmp); ++ free(key_name); + return -1; + } + +@@ -854,8 +856,15 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file) + list_for_each_entry(insn, &file->endbr_list, call_node) { + + int *site = (int *)sec->data->d_buf + idx; ++ struct symbol *sym = insn->sym; + *site = 0; + ++ if (opts.module && sym && sym->type == STT_FUNC && ++ insn->offset == sym->offset && ++ (!strcmp(sym->name, "init_module") || ++ !strcmp(sym->name, "cleanup_module"))) ++ WARN("%s(): not an indirect call target", sym->name); ++ + if (elf_add_reloc_to_insn(file->elf, sec, + idx * sizeof(int), + R_X86_64_PC32, +diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c +index 64443a7f4bbf..6806ce01d933 100644 +--- a/tools/objtool/elf.c ++++ b/tools/objtool/elf.c +@@ -284,13 +284,13 @@ static int read_sections(struct elf *elf) + !elf_alloc_hash(section_name, sections_nr)) + return -1; + ++ elf->section_data = calloc(sections_nr, sizeof(*sec)); ++ if (!elf->section_data) { ++ perror("calloc"); ++ return -1; ++ } + for (i = 0; i < sections_nr; i++) { +- sec = malloc(sizeof(*sec)); +- if (!sec) { +- perror("malloc"); +- return -1; +- } +- memset(sec, 0, sizeof(*sec)); ++ sec = &elf->section_data[i]; + + INIT_LIST_HEAD(&sec->symbol_list); + INIT_LIST_HEAD(&sec->reloc_list); +@@ -422,13 +422,13 @@ static int read_symbols(struct elf *elf) + !elf_alloc_hash(symbol_name, symbols_nr)) + return -1; + ++ elf->symbol_data = calloc(symbols_nr, sizeof(*sym)); ++ if (!elf->symbol_data) { ++ perror("calloc"); ++ return -1; ++ } + for (i = 0; i < symbols_nr; i++) { +- sym = malloc(sizeof(*sym)); +- if (!sym) { +- perror("malloc"); +- return -1; +- } +- memset(sym, 0, sizeof(*sym)); ++ sym = &elf->symbol_data[i]; + + sym->idx = i; + +@@ -918,13 +918,13 @@ static int read_relocs(struct elf *elf) + sec->base->reloc = sec; + + nr_reloc = 0; ++ sec->reloc_data = calloc(sec->sh.sh_size / sec->sh.sh_entsize, sizeof(*reloc)); ++ if (!sec->reloc_data) { ++ perror("calloc"); ++ return -1; ++ } + for (i = 0; i < sec->sh.sh_size / sec->sh.sh_entsize; i++) { +- reloc = malloc(sizeof(*reloc)); +- if (!reloc) { +- perror("malloc"); +- return -1; +- } +- memset(reloc, 0, sizeof(*reloc)); ++ reloc = &sec->reloc_data[i]; + switch (sec->sh.sh_type) { + case SHT_REL: + if (read_rel_reloc(sec, i, reloc, &symndx)) +@@ -1453,16 +1453,16 @@ void elf_close(struct elf *elf) + list_for_each_entry_safe(sym, tmpsym, &sec->symbol_list, list) { + list_del(&sym->list); + hash_del(&sym->hash); +- free(sym); + } + list_for_each_entry_safe(reloc, 
tmpreloc, &sec->reloc_list, list) { + list_del(&reloc->list); + hash_del(&reloc->hash); +- free(reloc); + } + list_del(&sec->list); +- free(sec); ++ free(sec->reloc_data); + } + ++ free(elf->symbol_data); ++ free(elf->section_data); + free(elf); + } +diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h +index fa45044e3863..2a108e648b7a 100644 +--- a/tools/objtool/include/objtool/builtin.h ++++ b/tools/objtool/include/objtool/builtin.h +@@ -7,8 +7,6 @@ + + #include + +-extern const struct option check_options[]; +- + struct opts { + /* actions: */ + bool dump_orc; +diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h +index bb60fd42b46f..ad0024da262b 100644 +--- a/tools/objtool/include/objtool/elf.h ++++ b/tools/objtool/include/objtool/elf.h +@@ -39,6 +39,7 @@ struct section { + char *name; + int idx; + bool changed, text, rodata, noinstr, init, truncate; ++ struct reloc *reloc_data; + }; + + struct symbol { +@@ -49,12 +50,11 @@ struct symbol { + GElf_Sym sym; + struct section *sec; + char *name; +- unsigned int idx; +- unsigned char bind, type; ++ unsigned int idx, len; + unsigned long offset; +- unsigned int len; + unsigned long __subtree_last; + struct symbol *pfunc, *cfunc, *alias; ++ unsigned char bind, type; + u8 uaccess_safe : 1; + u8 static_call_tramp : 1; + u8 retpoline_thunk : 1; +@@ -104,6 +104,9 @@ struct elf { + struct hlist_head *section_hash; + struct hlist_head *section_name_hash; + struct hlist_head *reloc_hash; ++ ++ struct section *section_data; ++ struct symbol *symbol_data; + }; + + #define OFFSET_STRIDE_BITS 4 +diff --git a/tools/objtool/include/objtool/special.h b/tools/objtool/include/objtool/special.h +index dc4721e19002..86d4af9c5aa9 100644 +--- a/tools/objtool/include/objtool/special.h ++++ b/tools/objtool/include/objtool/special.h +@@ -19,6 +19,7 @@ struct special_alt { + bool skip_orig; + bool skip_alt; + bool jump_or_nop; ++ u8 key_addend; + + struct section *orig_sec; + unsigned long orig_off; +@@ -27,7 +28,6 @@ struct special_alt { + unsigned long new_off; + + unsigned int orig_len, new_len; /* group only */ +- u8 key_addend; + }; + + int special_get_alts(struct elf *elf, struct list_head *alts); +diff --git a/tools/objtool/special.c b/tools/objtool/special.c +index 9c8d827f69af..baa85c31526b 100644 +--- a/tools/objtool/special.c ++++ b/tools/objtool/special.c +@@ -26,7 +26,7 @@ struct special_entry { + unsigned char key; /* jump_label key */ + }; + +-struct special_entry entries[] = { ++static const struct special_entry entries[] = { + { + .sec = ".altinstructions", + .group = true, +@@ -65,7 +65,7 @@ static void reloc_to_sec_off(struct reloc *reloc, struct section **sec, + *off = reloc->sym->offset + reloc->addend; + } + +-static int get_alt_entry(struct elf *elf, struct special_entry *entry, ++static int get_alt_entry(struct elf *elf, const struct special_entry *entry, + struct section *sec, int idx, + struct special_alt *alt) + { +@@ -139,7 +139,7 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, + */ + int special_get_alts(struct elf *elf, struct list_head *alts) + { +- struct special_entry *entry; ++ const struct special_entry *entry; + struct section *sec; + unsigned int nr_entries; + struct special_alt *alt; +-- +2.39.2 + +From 66e965775ec9bee68e40b497c765bc03fc264ec8 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 17 Feb 2023 15:36:12 +0100 +Subject: [PATCH 13/15] sched + +Signed-off-by: Peter Jung +--- + arch/x86/kernel/itmt.c | 23 +- + 
arch/x86/kernel/smpboot.c | 2 +- + include/linux/sched.h | 3 + + include/linux/sched/sd_flags.h | 5 +- + kernel/sched/core.c | 77 ++-- + kernel/sched/cpufreq_schedutil.c | 43 +-- + kernel/sched/deadline.c | 42 ++- + kernel/sched/debug.c | 1 + + kernel/sched/fair.c | 581 ++++++++++++++++++++----------- + kernel/sched/features.h | 1 + + kernel/sched/pelt.c | 60 ++++ + kernel/sched/pelt.h | 42 ++- + kernel/sched/sched.h | 28 +- + 13 files changed, 591 insertions(+), 317 deletions(-) + +diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c +index 9ff480e94511..6510883c5e81 100644 +--- a/arch/x86/kernel/itmt.c ++++ b/arch/x86/kernel/itmt.c +@@ -174,32 +174,19 @@ int arch_asym_cpu_priority(int cpu) + + /** + * sched_set_itmt_core_prio() - Set CPU priority based on ITMT +- * @prio: Priority of cpu core +- * @core_cpu: The cpu number associated with the core ++ * @prio: Priority of @cpu ++ * @cpu: The CPU number + * + * The pstate driver will find out the max boost frequency + * and call this function to set a priority proportional +- * to the max boost frequency. CPU with higher boost ++ * to the max boost frequency. CPUs with higher boost + * frequency will receive higher priority. + * + * No need to rebuild sched domain after updating + * the CPU priorities. The sched domains have no + * dependency on CPU priorities. + */ +-void sched_set_itmt_core_prio(int prio, int core_cpu) ++void sched_set_itmt_core_prio(int prio, int cpu) + { +- int cpu, i = 1; +- +- for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) { +- int smt_prio; +- +- /* +- * Ensure that the siblings are moved to the end +- * of the priority chain and only used when +- * all other high priority cpus are out of capacity. +- */ +- smt_prio = prio * smp_num_siblings / (i * i); +- per_cpu(sched_core_priority, cpu) = smt_prio; +- i++; +- } ++ per_cpu(sched_core_priority, cpu) = prio; + } +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index 55cad72715d9..0213d066a9a9 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -547,7 +547,7 @@ static int x86_core_flags(void) + #ifdef CONFIG_SCHED_SMT + static int x86_smt_flags(void) + { +- return cpu_smt_flags() | x86_sched_itmt_flags(); ++ return cpu_smt_flags(); + } + #endif + #ifdef CONFIG_SCHED_CLUSTER +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 853d08f7562b..28ce1be0ba47 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -557,6 +557,9 @@ struct sched_entity { + u64 prev_sum_exec_runtime; + + u64 nr_migrations; ++ u64 prev_sleep_sum_runtime; ++ /* average duration of a task */ ++ u64 dur_avg; + + #ifdef CONFIG_FAIR_GROUP_SCHED + int depth; +diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h +index 57bde66d95f7..800238854ba5 100644 +--- a/include/linux/sched/sd_flags.h ++++ b/include/linux/sched/sd_flags.h +@@ -132,12 +132,9 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) + /* + * Place busy tasks earlier in the domain + * +- * SHARED_CHILD: Usually set on the SMT level. Technically could be set further +- * up, but currently assumed to be set from the base domain +- * upwards (see update_top_cache_domain()). + * NEEDS_GROUPS: Load balancing flag. 
+ */ +-SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) ++SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS) + + /* + * Prefer to place tasks in a sibling domain +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 2a4918a1faa9..5237639786b7 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -722,7 +722,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) + if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) + update_irq_load_avg(rq, irq_delta + steal); + #endif +- update_rq_clock_pelt(rq, delta); ++ update_rq_clock_task_mult(rq, delta); + } + + void update_rq_clock(struct rq *rq) +@@ -3675,14 +3675,39 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) + } + + /* +- * Mark the task runnable and perform wakeup-preemption. ++ * Mark the task runnable. + */ +-static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, +- struct rq_flags *rf) ++static inline void ttwu_do_wakeup(struct task_struct *p) + { +- check_preempt_curr(rq, p, wake_flags); + WRITE_ONCE(p->__state, TASK_RUNNING); + trace_sched_wakeup(p); ++} ++ ++static void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, ++ struct rq_flags *rf) ++{ ++ int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; ++ ++ lockdep_assert_rq_held(rq); ++ ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++ ++#ifdef CONFIG_SMP ++ if (wake_flags & WF_MIGRATED) ++ en_flags |= ENQUEUE_MIGRATED; ++ else ++#endif ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ activate_task(rq, p, en_flags); ++ check_preempt_curr(rq, p, wake_flags); ++ ++ ttwu_do_wakeup(p); + + #ifdef CONFIG_SMP + if (p->sched_class->task_woken) { +@@ -3712,31 +3737,6 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, + #endif + } + +-static void +-ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, +- struct rq_flags *rf) +-{ +- int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; +- +- lockdep_assert_rq_held(rq); +- +- if (p->sched_contributes_to_load) +- rq->nr_uninterruptible--; +- +-#ifdef CONFIG_SMP +- if (wake_flags & WF_MIGRATED) +- en_flags |= ENQUEUE_MIGRATED; +- else +-#endif +- if (p->in_iowait) { +- delayacct_blkio_end(p); +- atomic_dec(&task_rq(p)->nr_iowait); +- } +- +- activate_task(rq, p, en_flags); +- ttwu_do_wakeup(rq, p, wake_flags, rf); +-} +- + /* + * Consider @p being inside a wait loop: + * +@@ -3770,9 +3770,15 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) + + rq = __task_rq_lock(p, &rf); + if (task_on_rq_queued(p)) { +- /* check_preempt_curr() may use rq clock */ +- update_rq_clock(rq); +- ttwu_do_wakeup(rq, p, wake_flags, &rf); ++ if (!task_on_cpu(rq, p)) { ++ /* ++ * When on_rq && !on_cpu the task is preempted, see if ++ * it should preempt the task that is current now. 
++ */ ++ update_rq_clock(rq); ++ check_preempt_curr(rq, p, wake_flags); ++ } ++ ttwu_do_wakeup(p); + ret = 1; + } + __task_rq_unlock(rq, &rf); +@@ -4138,8 +4144,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) + goto out; + + trace_sched_waking(p); +- WRITE_ONCE(p->__state, TASK_RUNNING); +- trace_sched_wakeup(p); ++ ttwu_do_wakeup(p); + goto out; + } + +@@ -4424,6 +4429,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; + p->se.vruntime = 0; ++ p->se.dur_avg = 0; ++ p->se.prev_sleep_sum_runtime = 0; + INIT_LIST_HEAD(&p->se.group_node); + + #ifdef CONFIG_FAIR_GROUP_SCHED +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 1207c78f85c1..5c840151f3bb 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -48,7 +48,6 @@ struct sugov_cpu { + + unsigned long util; + unsigned long bw_dl; +- unsigned long max; + + /* The field below is for single-CPU policies only: */ + #ifdef CONFIG_NO_HZ_COMMON +@@ -158,7 +157,6 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) + { + struct rq *rq = cpu_rq(sg_cpu->cpu); + +- sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); + sg_cpu->bw_dl = cpu_bw_dl(rq); + sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), + FREQUENCY_UTIL, NULL); +@@ -238,6 +236,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, + * sugov_iowait_apply() - Apply the IO boost to a CPU. + * @sg_cpu: the sugov data for the cpu to boost + * @time: the update time from the caller ++ * @max_cap: the max CPU capacity + * + * A CPU running a task which woken up after an IO operation can have its + * utilization boosted to speed up the completion of those IO operations. +@@ -251,7 +250,8 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, + * This mechanism is designed to boost high frequently IO waiting tasks, while + * being more conservative on tasks which does sporadic IO operations. + */ +-static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) ++static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, ++ unsigned long max_cap) + { + unsigned long boost; + +@@ -280,7 +280,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) + * sg_cpu->util is already in capacity scale; convert iowait_boost + * into the same scale so we can compare. 
+ */ +- boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT; ++ boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; + boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); + if (sg_cpu->util < boost) + sg_cpu->util = boost; +@@ -310,7 +310,8 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) + } + + static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, +- u64 time, unsigned int flags) ++ u64 time, unsigned long max_cap, ++ unsigned int flags) + { + sugov_iowait_boost(sg_cpu, time, flags); + sg_cpu->last_update = time; +@@ -321,7 +322,7 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, + return false; + + sugov_get_util(sg_cpu); +- sugov_iowait_apply(sg_cpu, time); ++ sugov_iowait_apply(sg_cpu, time, max_cap); + + return true; + } +@@ -332,12 +333,15 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned int cached_freq = sg_policy->cached_raw_freq; ++ unsigned long max_cap; + unsigned int next_f; + +- if (!sugov_update_single_common(sg_cpu, time, flags)) ++ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); ++ ++ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) + return; + +- next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max); ++ next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); + /* + * Do not reduce the frequency if the CPU has not been idle + * recently, as the reduction is likely to be premature then. +@@ -374,6 +378,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, + { + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + unsigned long prev_util = sg_cpu->util; ++ unsigned long max_cap; + + /* + * Fall back to the "frequency" path if frequency invariance is not +@@ -385,7 +390,9 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, + return; + } + +- if (!sugov_update_single_common(sg_cpu, time, flags)) ++ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); ++ ++ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) + return; + + /* +@@ -399,7 +406,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, + sg_cpu->util = prev_util; + + cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), +- map_util_perf(sg_cpu->util), sg_cpu->max); ++ map_util_perf(sg_cpu->util), max_cap); + + sg_cpu->sg_policy->last_freq_update_time = time; + } +@@ -408,25 +415,21 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) + { + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; +- unsigned long util = 0, max = 1; ++ unsigned long util = 0, max_cap; + unsigned int j; + ++ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); ++ + for_each_cpu(j, policy->cpus) { + struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); +- unsigned long j_util, j_max; + + sugov_get_util(j_sg_cpu); +- sugov_iowait_apply(j_sg_cpu, time); +- j_util = j_sg_cpu->util; +- j_max = j_sg_cpu->max; ++ sugov_iowait_apply(j_sg_cpu, time, max_cap); + +- if (j_util * max > j_max * util) { +- util = j_util; +- max = j_max; +- } ++ util = max(j_sg_cpu->util, util); + } + +- return get_next_freq(sg_policy, util, max); ++ return get_next_freq(sg_policy, util, max_cap); + } + + static void +diff --git a/kernel/sched/deadline.c 
b/kernel/sched/deadline.c +index 0d97d54276cc..71b24371a6f7 100644 +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -2663,17 +2663,20 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) + static void prio_changed_dl(struct rq *rq, struct task_struct *p, + int oldprio) + { +- if (task_on_rq_queued(p) || task_current(rq, p)) { ++ if (!task_on_rq_queued(p)) ++ return; ++ + #ifdef CONFIG_SMP +- /* +- * This might be too much, but unfortunately +- * we don't have the old deadline value, and +- * we can't argue if the task is increasing +- * or lowering its prio, so... +- */ +- if (!rq->dl.overloaded) +- deadline_queue_pull_task(rq); ++ /* ++ * This might be too much, but unfortunately ++ * we don't have the old deadline value, and ++ * we can't argue if the task is increasing ++ * or lowering its prio, so... ++ */ ++ if (!rq->dl.overloaded) ++ deadline_queue_pull_task(rq); + ++ if (task_current(rq, p)) { + /* + * If we now have a earlier deadline task than p, + * then reschedule, provided p is still on this +@@ -2681,15 +2684,24 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, + */ + if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline)) + resched_curr(rq); +-#else ++ } else { + /* +- * Again, we don't know if p has a earlier +- * or later deadline, so let's blindly set a +- * (maybe not needed) rescheduling point. ++ * Current may not be deadline in case p was throttled but we ++ * have just replenished it (e.g. rt_mutex_setprio()). ++ * ++ * Otherwise, if p was given an earlier deadline, reschedule. + */ +- resched_curr(rq); +-#endif /* CONFIG_SMP */ ++ if (!dl_task(rq->curr) || ++ dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) ++ resched_curr(rq); + } ++#else ++ /* ++ * We don't know if p has a earlier or later deadline, so let's blindly ++ * set a (maybe not needed) rescheduling point. ++ */ ++ resched_curr(rq); ++#endif + } + + DEFINE_SCHED_CLASS(dl) = { +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 1637b65ba07a..8d64fba16cfe 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -1024,6 +1024,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + __PS("nr_involuntary_switches", p->nivcsw); + + P(se.load.weight); ++ P(se.dur_avg); + #ifdef CONFIG_SMP + P(se.avg.load_sum); + P(se.avg.runnable_sum); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 86a988c830ef..b38a1ce1be49 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1082,6 +1082,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) + * Scheduling class queueing methods: + */ + ++static inline bool is_core_idle(int cpu) ++{ ++#ifdef CONFIG_SCHED_SMT ++ int sibling; ++ ++ for_each_cpu(sibling, cpu_smt_mask(cpu)) { ++ if (cpu == sibling) ++ continue; ++ ++ if (!idle_cpu(sibling)) ++ return false; ++ } ++#endif ++ ++ return true; ++} ++ + #ifdef CONFIG_NUMA + #define NUMA_IMBALANCE_MIN 2 + +@@ -1718,23 +1735,6 @@ struct numa_stats { + int idle_cpu; + }; + +-static inline bool is_core_idle(int cpu) +-{ +-#ifdef CONFIG_SCHED_SMT +- int sibling; +- +- for_each_cpu(sibling, cpu_smt_mask(cpu)) { +- if (cpu == sibling) +- continue; +- +- if (!idle_cpu(sibling)) +- return false; +- } +-#endif +- +- return true; +-} +- + struct task_numa_env { + struct task_struct *p; + +@@ -4494,17 +4494,9 @@ static inline int util_fits_cpu(unsigned long util, + * + * For uclamp_max, we can tolerate a drop in performance level as the + * goal is to cap the task. 
So it's okay if it's getting less. +- * +- * In case of capacity inversion we should honour the inverted capacity +- * for both uclamp_min and uclamp_max all the time. + */ +- capacity_orig = cpu_in_capacity_inversion(cpu); +- if (capacity_orig) { +- capacity_orig_thermal = capacity_orig; +- } else { +- capacity_orig = capacity_orig_of(cpu); +- capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); +- } ++ capacity_orig = capacity_orig_of(cpu); ++ capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); + + /* + * We want to force a task to fit a cpu as implied by uclamp_max. +@@ -4579,8 +4571,8 @@ static inline int util_fits_cpu(unsigned long util, + * handle the case uclamp_min > uclamp_max. + */ + uclamp_min = min(uclamp_min, uclamp_max); +- if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE) +- fits = fits && (uclamp_min <= capacity_orig_thermal); ++ if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) ++ return -1; + + return fits; + } +@@ -4590,7 +4582,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu) + unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); + unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); + unsigned long util = task_util_est(p); +- return util_fits_cpu(util, uclamp_min, uclamp_max, cpu); ++ /* ++ * Return true only if the cpu fully fits the task requirements, which ++ * include the utilization but also the performance hints. ++ */ ++ return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0); + } + + static inline void update_misfit_status(struct task_struct *p, struct rq *rq) +@@ -4674,6 +4670,7 @@ static void + place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) + { + u64 vruntime = cfs_rq->min_vruntime; ++ u64 sleep_time; + + /* + * The 'current' period is already promised to the current tasks, +@@ -4703,8 +4700,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) + vruntime -= thresh; + } + +- /* ensure we never gain time by being placed backwards. */ +- se->vruntime = max_vruntime(se->vruntime, vruntime); ++ /* ++ * Pull vruntime of the entity being placed to the base level of ++ * cfs_rq, to prevent boosting it if placed backwards. If the entity ++ * slept for a long time, don't even try to compare its vruntime with ++ * the base as it may be too far off and the comparison may get ++ * inversed due to s64 overflow. ++ */ ++ sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; ++ if ((s64)sleep_time > 60LL * NSEC_PER_SEC) ++ se->vruntime = vruntime; ++ else ++ se->vruntime = max_vruntime(se->vruntime, vruntime); + } + + static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +@@ -4914,7 +4921,13 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + struct sched_entity *se; + s64 delta; + +- ideal_runtime = sched_slice(cfs_rq, curr); ++ /* ++ * When many tasks blow up the sched_period; it is possible that ++ * sched_slice() reports unusually large results (when many tasks are ++ * very light for example). Therefore impose a maximum. 
++ */ ++ ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); ++ + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) { + resched_curr(rq_of(cfs_rq)); +@@ -5479,22 +5492,105 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) + resched_curr(rq); + } + +-static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) ++#ifdef CONFIG_SMP ++static void __cfsb_csd_unthrottle(void *arg) + { +- struct cfs_rq *cfs_rq; ++ struct cfs_rq *cursor, *tmp; ++ struct rq *rq = arg; ++ struct rq_flags rf; ++ ++ rq_lock(rq, &rf); ++ ++ /* ++ * Since we hold rq lock we're safe from concurrent manipulation of ++ * the CSD list. However, this RCU critical section annotates the ++ * fact that we pair with sched_free_group_rcu(), so that we cannot ++ * race with group being freed in the window between removing it ++ * from the list and advancing to the next entry in the list. ++ */ ++ rcu_read_lock(); ++ ++ list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list, ++ throttled_csd_list) { ++ list_del_init(&cursor->throttled_csd_list); ++ ++ if (cfs_rq_throttled(cursor)) ++ unthrottle_cfs_rq(cursor); ++ } ++ ++ rcu_read_unlock(); ++ ++ rq_unlock(rq, &rf); ++} ++ ++static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) ++{ ++ struct rq *rq = rq_of(cfs_rq); ++ bool first; ++ ++ if (rq == this_rq()) { ++ unthrottle_cfs_rq(cfs_rq); ++ return; ++ } ++ ++ /* Already enqueued */ ++ if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) ++ return; ++ ++ first = list_empty(&rq->cfsb_csd_list); ++ list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list); ++ if (first) ++ smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); ++} ++#else ++static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) ++{ ++ unthrottle_cfs_rq(cfs_rq); ++} ++#endif ++ ++static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) ++{ ++ lockdep_assert_rq_held(rq_of(cfs_rq)); ++ ++ if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || ++ cfs_rq->runtime_remaining <= 0)) ++ return; ++ ++ __unthrottle_cfs_rq_async(cfs_rq); ++} ++ ++static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) ++{ ++ struct cfs_rq *local_unthrottle = NULL; ++ int this_cpu = smp_processor_id(); + u64 runtime, remaining = 1; ++ bool throttled = false; ++ struct cfs_rq *cfs_rq; ++ struct rq_flags rf; ++ struct rq *rq; + + rcu_read_lock(); + list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, + throttled_list) { +- struct rq *rq = rq_of(cfs_rq); +- struct rq_flags rf; ++ rq = rq_of(cfs_rq); ++ ++ if (!remaining) { ++ throttled = true; ++ break; ++ } + + rq_lock_irqsave(rq, &rf); + if (!cfs_rq_throttled(cfs_rq)) + goto next; + +- /* By the above check, this should never be true */ ++#ifdef CONFIG_SMP ++ /* Already queued for async unthrottle */ ++ if (!list_empty(&cfs_rq->throttled_csd_list)) ++ goto next; ++#endif ++ ++ /* By the above checks, this should never be true */ + SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + + raw_spin_lock(&cfs_b->lock); +@@ -5508,16 +5604,30 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) + cfs_rq->runtime_remaining += runtime; + + /* we check whether we're throttled above */ +- if (cfs_rq->runtime_remaining > 0) +- unthrottle_cfs_rq(cfs_rq); ++ if (cfs_rq->runtime_remaining > 0) { ++ if (cpu_of(rq) != this_cpu || ++ SCHED_WARN_ON(local_unthrottle)) ++ unthrottle_cfs_rq_async(cfs_rq); ++ else ++ local_unthrottle = cfs_rq; ++ } else { ++ throttled = true; ++ } + + next: + rq_unlock_irqrestore(rq, &rf); +- +- if 
(!remaining) +- break; + } + rcu_read_unlock(); ++ ++ if (local_unthrottle) { ++ rq = cpu_rq(this_cpu); ++ rq_lock_irqsave(rq, &rf); ++ if (cfs_rq_throttled(local_unthrottle)) ++ unthrottle_cfs_rq(local_unthrottle); ++ rq_unlock_irqrestore(rq, &rf); ++ } ++ ++ return throttled; + } + + /* +@@ -5562,10 +5672,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u + while (throttled && cfs_b->runtime > 0) { + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); + /* we can't nest cfs_b->lock while distributing bandwidth */ +- distribute_cfs_runtime(cfs_b); ++ throttled = distribute_cfs_runtime(cfs_b); + raw_spin_lock_irqsave(&cfs_b->lock, flags); +- +- throttled = !list_empty(&cfs_b->throttled_cfs_rq); + } + + /* +@@ -5842,6 +5950,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) + { + cfs_rq->runtime_enabled = 0; + INIT_LIST_HEAD(&cfs_rq->throttled_list); ++#ifdef CONFIG_SMP ++ INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); ++#endif + } + + void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +@@ -5858,12 +5969,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) + + static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) + { ++ int __maybe_unused i; ++ + /* init_cfs_bandwidth() was not called */ + if (!cfs_b->throttled_cfs_rq.next) + return; + + hrtimer_cancel(&cfs_b->period_timer); + hrtimer_cancel(&cfs_b->slack_timer); ++ ++ /* ++ * It is possible that we still have some cfs_rq's pending on a CSD ++ * list, though this race is very rare. In order for this to occur, we ++ * must have raced with the last task leaving the group while there ++ * exist throttled cfs_rq(s), and the period_timer must have queued the ++ * CSD item but the remote cpu has not yet processed it. To handle this, ++ * we can simply flush all pending CSD work inline here. We're ++ * guaranteed at this point that no additional cfs_rq of this group can ++ * join a CSD list. ++ */ ++#ifdef CONFIG_SMP ++ for_each_possible_cpu(i) { ++ struct rq *rq = cpu_rq(i); ++ unsigned long flags; ++ ++ if (list_empty(&rq->cfsb_csd_list)) ++ continue; ++ ++ local_irq_save(flags); ++ __cfsb_csd_unthrottle(rq); ++ local_irq_restore(flags); ++ } ++#endif + } + + /* +@@ -6026,6 +6163,7 @@ static inline bool cpu_overutilized(int cpu) + unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); + unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + ++ /* Return true only if the utilization doesn't fit CPU's capacity */ + return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); + } + +@@ -6159,6 +6297,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + + static void set_next_buddy(struct sched_entity *se); + ++static inline void dur_avg_update(struct task_struct *p, bool task_sleep) ++{ ++ u64 dur; ++ ++ if (!task_sleep) ++ return; ++ ++ dur = p->se.sum_exec_runtime - p->se.prev_sleep_sum_runtime; ++ p->se.prev_sleep_sum_runtime = p->se.sum_exec_runtime; ++ update_avg(&p->se.dur_avg, dur); ++} ++ + /* + * The dequeue_task method is called before nr_running is + * decreased. 
We remove the task from the rbtree and +@@ -6231,6 +6381,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + + dequeue_throttle: + util_est_update(&rq->cfs, p, task_sleep); ++ dur_avg_update(p, task_sleep); + hrtick_update(rq); + } + +@@ -6364,6 +6515,20 @@ static int wake_wide(struct task_struct *p) + return 1; + } + ++/* ++ * If a task switches in and then voluntarily relinquishes the ++ * CPU quickly, it is regarded as a short duration task. ++ * ++ * SIS_SHORT tries to wake up the short wakee on current CPU. This ++ * aims to avoid race condition among CPUs due to frequent context ++ * switch. ++ */ ++static inline int is_short_task(struct task_struct *p) ++{ ++ return sched_feat(SIS_SHORT) && p->se.dur_avg && ++ ((p->se.dur_avg * 8) < sysctl_sched_min_granularity); ++} ++ + /* + * The purpose of wake_affine() is to quickly determine on which CPU we can run + * soonest. For the purpose of speed we only consider the waking and previous +@@ -6400,6 +6565,11 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) + if (available_idle_cpu(prev_cpu)) + return prev_cpu; + ++ /* The only running task is a short duration one. */ ++ if (cpu_rq(this_cpu)->nr_running == 1 && ++ is_short_task(rcu_dereference(cpu_curr(this_cpu)))) ++ return this_cpu; ++ + return nr_cpumask_bits; + } + +@@ -6774,6 +6944,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool + /* overloaded LLC is unlikely to have idle cpu/core */ + if (nr == 1) + return -1; ++ ++ if (!has_idle_core && this == target && ++ (5 * nr < 3 * sd->span_weight) && ++ cpu_rq(target)->nr_running <= 1 && ++ is_short_task(p) && ++ is_short_task(rcu_dereference(cpu_curr(target)))) ++ return target; + } + } + +@@ -6819,6 +6996,7 @@ static int + select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) + { + unsigned long task_util, util_min, util_max, best_cap = 0; ++ int fits, best_fits = 0; + int cpu, best_cpu = -1; + struct cpumask *cpus; + +@@ -6834,12 +7012,28 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) + + if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) + continue; +- if (util_fits_cpu(task_util, util_min, util_max, cpu)) ++ ++ fits = util_fits_cpu(task_util, util_min, util_max, cpu); ++ ++ /* This CPU fits with all requirements */ ++ if (fits > 0) + return cpu; ++ /* ++ * Only the min performance hint (i.e. uclamp_min) doesn't fit. ++ * Look for the CPU with best capacity. ++ */ ++ else if (fits < 0) ++ cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu)); + +- if (cpu_cap > best_cap) { ++ /* ++ * First, select CPU which fits better (-1 being better than 0). ++ * Then, select the one with best capacity at same level. ++ */ ++ if ((fits < best_fits) || ++ ((fits == best_fits) && (cpu_cap > best_cap))) { + best_cap = cpu_cap; + best_cpu = cpu; ++ best_fits = fits; + } + } + +@@ -6852,7 +7046,11 @@ static inline bool asym_fits_cpu(unsigned long util, + int cpu) + { + if (sched_asym_cpucap_active()) +- return util_fits_cpu(util, util_min, util_max, cpu); ++ /* ++ * Return true only if the cpu fully fits the task requirements ++ * which include the utilization and the performance hints. ++ */ ++ return (util_fits_cpu(util, util_min, util_max, cpu) > 0); + + return true; + } +@@ -7219,6 +7417,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + unsigned long p_util_max = uclamp_is_used() ? 
uclamp_eff_value(p, UCLAMP_MAX) : 1024; + struct root_domain *rd = this_rq()->rd; + int cpu, best_energy_cpu, target = -1; ++ int prev_fits = -1, best_fits = -1; ++ unsigned long best_thermal_cap = 0; ++ unsigned long prev_thermal_cap = 0; + struct sched_domain *sd; + struct perf_domain *pd; + struct energy_env eenv; +@@ -7254,6 +7455,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + unsigned long prev_spare_cap = 0; + int max_spare_cap_cpu = -1; + unsigned long base_energy; ++ int fits, max_fits = -1; + + cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); + +@@ -7303,7 +7505,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + util_min = max(rq_util_min, p_util_min); + util_max = max(rq_util_max, p_util_max); + } +- if (!util_fits_cpu(util, util_min, util_max, cpu)) ++ ++ fits = util_fits_cpu(util, util_min, util_max, cpu); ++ if (!fits) + continue; + + lsub_positive(&cpu_cap, util); +@@ -7311,7 +7515,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + if (cpu == prev_cpu) { + /* Always use prev_cpu as a candidate. */ + prev_spare_cap = cpu_cap; +- } else if (cpu_cap > max_spare_cap) { ++ prev_fits = fits; ++ } else if ((fits > max_fits) || ++ ((fits == max_fits) && (cpu_cap > max_spare_cap))) { + /* + * Find the CPU with the maximum spare capacity + * among the remaining CPUs in the performance +@@ -7319,6 +7525,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + */ + max_spare_cap = cpu_cap; + max_spare_cap_cpu = cpu; ++ max_fits = fits; + } + } + +@@ -7337,26 +7544,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + if (prev_delta < base_energy) + goto unlock; + prev_delta -= base_energy; ++ prev_thermal_cap = cpu_thermal_cap; + best_delta = min(best_delta, prev_delta); + } + + /* Evaluate the energy impact of using max_spare_cap_cpu. */ + if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) { ++ /* Current best energy cpu fits better */ ++ if (max_fits < best_fits) ++ continue; ++ ++ /* ++ * Both don't fit performance hint (i.e. uclamp_min) ++ * but best energy cpu has better capacity. ++ */ ++ if ((max_fits < 0) && ++ (cpu_thermal_cap <= best_thermal_cap)) ++ continue; ++ + cur_delta = compute_energy(&eenv, pd, cpus, p, + max_spare_cap_cpu); + /* CPU utilization has changed */ + if (cur_delta < base_energy) + goto unlock; + cur_delta -= base_energy; +- if (cur_delta < best_delta) { +- best_delta = cur_delta; +- best_energy_cpu = max_spare_cap_cpu; +- } ++ ++ /* ++ * Both fit for the task but best energy cpu has lower ++ * energy impact. 
++ */ ++ if ((max_fits > 0) && (best_fits > 0) && ++ (cur_delta >= best_delta)) ++ continue; ++ ++ best_delta = cur_delta; ++ best_energy_cpu = max_spare_cap_cpu; ++ best_fits = max_fits; ++ best_thermal_cap = cpu_thermal_cap; + } + } + rcu_read_unlock(); + +- if (best_delta < prev_delta) ++ if ((best_fits > prev_fits) || ++ ((best_fits > 0) && (best_delta < prev_delta)) || ++ ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap))) + target = best_energy_cpu; + + return target; +@@ -8856,82 +9087,16 @@ static unsigned long scale_rt_capacity(int cpu) + + static void update_cpu_capacity(struct sched_domain *sd, int cpu) + { +- unsigned long capacity_orig = arch_scale_cpu_capacity(cpu); + unsigned long capacity = scale_rt_capacity(cpu); + struct sched_group *sdg = sd->groups; +- struct rq *rq = cpu_rq(cpu); + +- rq->cpu_capacity_orig = capacity_orig; ++ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); + + if (!capacity) + capacity = 1; + +- rq->cpu_capacity = capacity; +- +- /* +- * Detect if the performance domain is in capacity inversion state. +- * +- * Capacity inversion happens when another perf domain with equal or +- * lower capacity_orig_of() ends up having higher capacity than this +- * domain after subtracting thermal pressure. +- * +- * We only take into account thermal pressure in this detection as it's +- * the only metric that actually results in *real* reduction of +- * capacity due to performance points (OPPs) being dropped/become +- * unreachable due to thermal throttling. +- * +- * We assume: +- * * That all cpus in a perf domain have the same capacity_orig +- * (same uArch). +- * * Thermal pressure will impact all cpus in this perf domain +- * equally. +- */ +- if (sched_energy_enabled()) { +- unsigned long inv_cap = capacity_orig - thermal_load_avg(rq); +- struct perf_domain *pd; +- +- rcu_read_lock(); +- +- pd = rcu_dereference(rq->rd->pd); +- rq->cpu_capacity_inverted = 0; +- +- for (; pd; pd = pd->next) { +- struct cpumask *pd_span = perf_domain_span(pd); +- unsigned long pd_cap_orig, pd_cap; +- +- /* We can't be inverted against our own pd */ +- if (cpumask_test_cpu(cpu_of(rq), pd_span)) +- continue; +- +- cpu = cpumask_any(pd_span); +- pd_cap_orig = arch_scale_cpu_capacity(cpu); +- +- if (capacity_orig < pd_cap_orig) +- continue; +- +- /* +- * handle the case of multiple perf domains have the +- * same capacity_orig but one of them is under higher +- * thermal pressure. We record it as capacity +- * inversion. +- */ +- if (capacity_orig == pd_cap_orig) { +- pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu)); +- +- if (pd_cap > inv_cap) { +- rq->cpu_capacity_inverted = inv_cap; +- break; +- } +- } else if (pd_cap_orig > inv_cap) { +- rq->cpu_capacity_inverted = inv_cap; +- break; +- } +- } +- +- rcu_read_unlock(); +- } +- +- trace_sched_cpu_capacity_tp(rq); ++ cpu_rq(cpu)->cpu_capacity = capacity; ++ trace_sched_cpu_capacity_tp(cpu_rq(cpu)); + + sdg->sgc->capacity = capacity; + sdg->sgc->min_capacity = capacity; +@@ -9135,20 +9300,15 @@ group_type group_classify(unsigned int imbalance_pct, + * @sgs: Load-balancing statistics of the candidate busiest group + * @sg: The candidate busiest group + * +- * Check the state of the SMT siblings of both @sds::local and @sg and decide +- * if @dst_cpu can pull tasks. ++ * Check the state of the SMT siblings of @sg and decide if @dst_cpu can pull ++ * tasks. + * +- * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of +- * the SMT siblings of @sg are busy. 
If only one CPU in @sg is busy, pull tasks +- * only if @dst_cpu has higher priority. ++ * This function must be called only if all the SMT siblings of @dst_cpu are ++ * idle, if any. + * +- * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more +- * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority. +- * Bigger imbalances in the number of busy CPUs will be dealt with in +- * update_sd_pick_busiest(). +- * +- * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings +- * of @dst_cpu are idle and @sg has lower priority. ++ * @dst_cpu can pull tasks if @sg has exactly one busy CPU (i.e., one more than ++ * @sds::local) and has lower group priority than @sds::local. Bigger imbalances ++ * in the number of busy CPUs will be dealt with in find_busiest_group(). + * + * Return: true if @dst_cpu can pull tasks, false otherwise. + */ +@@ -9157,51 +9317,16 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, + struct sched_group *sg) + { + #ifdef CONFIG_SCHED_SMT +- bool local_is_smt, sg_is_smt; + int sg_busy_cpus; + +- local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; +- sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY; +- + sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; + +- if (!local_is_smt) { +- /* +- * If we are here, @dst_cpu is idle and does not have SMT +- * siblings. Pull tasks if candidate group has two or more +- * busy CPUs. +- */ +- if (sg_busy_cpus >= 2) /* implies sg_is_smt */ +- return true; +- +- /* +- * @dst_cpu does not have SMT siblings. @sg may have SMT +- * siblings and only one is busy. In such case, @dst_cpu +- * can help if it has higher priority and is idle (i.e., +- * it has no running tasks). +- */ +- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); +- } +- +- /* @dst_cpu has SMT siblings. */ +- +- if (sg_is_smt) { +- int local_busy_cpus = sds->local->group_weight - +- sds->local_stat.idle_cpus; +- int busy_cpus_delta = sg_busy_cpus - local_busy_cpus; +- +- if (busy_cpus_delta == 1) +- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); +- +- return false; +- } +- + /* +- * @sg does not have SMT siblings. Ensure that @sds::local does not end +- * up with more than one busy SMT sibling and only pull tasks if there +- * are not busy CPUs (i.e., no CPU has running tasks). ++ * If the difference in the number of busy CPUs is two or more, let ++ * find_busiest_group() take care of it. We only care if @sg has ++ * exactly one busy CPU. This covers SMT and non-SMT sched groups. + */ +- if (!sds->local_stat.sum_nr_running) ++ if (sg_busy_cpus == 1) + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + + return false; +@@ -9215,7 +9340,14 @@ static inline bool + sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, + struct sched_group *group) + { +- /* Only do SMT checks if either local or candidate have SMT siblings */ ++ /* ++ * If the destination CPU has SMT siblings, env->idle != CPU_NOT_IDLE ++ * is not sufficient. We need to make sure the whole core is idle. ++ */ ++ if (sds->local->flags & SD_SHARE_CPUCAPACITY && !is_core_idle(env->dst_cpu)) ++ return false; ++ ++ /* Only do SMT checks if either local or candidate have SMT siblings. */ + if ((sds->local->flags & SD_SHARE_CPUCAPACITY) || + (group->flags & SD_SHARE_CPUCAPACITY)) + return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); +@@ -9408,10 +9540,22 @@ static bool update_sd_pick_busiest(struct lb_env *env, + * contention when accessing shared HW resources. 
+ * + * XXX for now avg_load is not computed and always 0 so we +- * select the 1st one. ++ * select the 1st one, except if @sg is composed of SMT ++ * siblings. + */ +- if (sgs->avg_load <= busiest->avg_load) ++ ++ if (sgs->avg_load < busiest->avg_load) + return false; ++ ++ if (sgs->avg_load == busiest->avg_load) { ++ /* ++ * SMT sched groups need more help than non-SMT groups. ++ * If @sg happens to also be SMT, either choice is good. ++ */ ++ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) ++ return false; ++ } ++ + break; + + case group_has_spare: +@@ -9886,7 +10030,6 @@ static void update_idle_cpu_scan(struct lb_env *env, + + static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) + { +- struct sched_domain *child = env->sd->child; + struct sched_group *sg = env->sd->groups; + struct sg_lb_stats *local = &sds->local_stat; + struct sg_lb_stats tmp_sgs; +@@ -9927,9 +10070,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd + sg = sg->next; + } while (sg != env->sd->groups); + +- /* Tag domain that child domain prefers tasks go to siblings first */ +- sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; +- ++ /* ++ * Tag domain that @env::sd prefers to spread excess tasks among ++ * sibling sched groups. ++ */ ++ sds->prefer_sibling = env->sd->flags & SD_PREFER_SIBLING; + + if (env->sd->flags & SD_NUMA) + env->fbq_type = fbq_classify_group(&sds->busiest_stat); +@@ -10159,24 +10304,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env) + */ + update_sd_lb_stats(env, &sds); + +- if (sched_energy_enabled()) { +- struct root_domain *rd = env->dst_rq->rd; +- +- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) +- goto out_balanced; +- } +- +- local = &sds.local_stat; +- busiest = &sds.busiest_stat; +- + /* There is no busy sibling group to pull tasks from */ + if (!sds.busiest) + goto out_balanced; + ++ busiest = &sds.busiest_stat; ++ + /* Misfit tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) + goto force_balance; + ++ if (sched_energy_enabled()) { ++ struct root_domain *rd = env->dst_rq->rd; ++ ++ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) ++ goto out_balanced; ++ } ++ + /* ASYM feature bypasses nice load balance check */ + if (busiest->group_type == group_asym_packing) + goto force_balance; +@@ -10189,6 +10333,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) + if (busiest->group_type == group_imbalanced) + goto force_balance; + ++ local = &sds.local_stat; + /* + * If the local group is busier than the selected busiest group + * don't try and pull any tasks. +@@ -10228,7 +10373,6 @@ static struct sched_group *find_busiest_group(struct lb_env *env) + goto out_balanced; + } + +- /* Try to move all excess tasks to child's sibling domain */ + if (sds.prefer_sibling && local->group_type == group_has_spare && + busiest->sum_nr_running > local->sum_nr_running + 1) + goto force_balance; +@@ -10330,11 +10474,20 @@ static struct rq *find_busiest_queue(struct lb_env *env, + nr_running == 1) + continue; + +- /* Make sure we only pull tasks from a CPU of lower priority */ ++ /* ++ * Make sure we only pull tasks from a CPU of lower priority ++ * when balancing between SMT siblings. ++ * ++ * If balancing between cores, let lower priority CPUs help ++ * SMT cores with more than one busy sibling. 
++ */ + if ((env->sd->flags & SD_ASYM_PACKING) && + sched_asym_prefer(i, env->dst_cpu) && +- nr_running == 1) +- continue; ++ nr_running == 1) { ++ if (env->sd->flags & SD_SHARE_CPUCAPACITY || ++ (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && is_core_idle(i))) ++ continue; ++ } + + switch (env->migration_type) { + case migrate_load: +@@ -10424,8 +10577,20 @@ asym_active_balance(struct lb_env *env) + * lower priority CPUs in order to pack all tasks in the + * highest priority CPUs. + */ +- return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && +- sched_asym_prefer(env->dst_cpu, env->src_cpu); ++ if (env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING)) { ++ /* Always obey priorities between SMT siblings. */ ++ if (env->sd->flags & SD_SHARE_CPUCAPACITY) ++ return sched_asym_prefer(env->dst_cpu, env->src_cpu); ++ ++ /* ++ * A lower priority CPU can help an SMT core with more than one ++ * busy sibling. ++ */ ++ return sched_asym_prefer(env->dst_cpu, env->src_cpu) || ++ !is_core_idle(env->src_cpu); ++ } ++ ++ return false; + } + + static inline bool +@@ -11162,8 +11327,17 @@ static void nohz_balancer_kick(struct rq *rq) + */ + for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { + if (sched_asym_prefer(i, cpu)) { +- flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; +- goto unlock; ++ /* ++ * Always do ASYM_PACKING balance in the SMT ++ * domain. In upper domains, the core must be ++ * fully idle. ++ */ ++ if (sd->flags & SD_SHARE_CPUCAPACITY || ++ (!(sd->flags & SD_SHARE_CPUCAPACITY) && ++ is_core_idle(i))) { ++ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; ++ goto unlock; ++ } + } + } + } +@@ -12498,6 +12672,11 @@ __init void init_sched_fair_class(void) + for_each_possible_cpu(i) { + zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); + zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); ++ ++#ifdef CONFIG_CFS_BANDWIDTH ++ INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i)); ++ INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list); ++#endif + } + + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index ee7f23c76bd3..efdc29c42161 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -62,6 +62,7 @@ SCHED_FEAT(TTWU_QUEUE, true) + */ + SCHED_FEAT(SIS_PROP, false) + SCHED_FEAT(SIS_UTIL, true) ++SCHED_FEAT(SIS_SHORT, true) + + /* + * Issue a WARN when we do multiple update_rq_clock() calls +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c +index 0f310768260c..036b0e2cd2b4 100644 +--- a/kernel/sched/pelt.c ++++ b/kernel/sched/pelt.c +@@ -467,3 +467,63 @@ int update_irq_load_avg(struct rq *rq, u64 running) + return ret; + } + #endif ++ ++__read_mostly unsigned int sched_pelt_lshift; ++ ++#ifdef CONFIG_SYSCTL ++static unsigned int sysctl_sched_pelt_multiplier = 1; ++ ++int sched_pelt_multiplier(struct ctl_table *table, int write, void *buffer, ++ size_t *lenp, loff_t *ppos) ++{ ++ static DEFINE_MUTEX(mutex); ++ unsigned int old; ++ int ret; ++ ++ mutex_lock(&mutex); ++ old = sysctl_sched_pelt_multiplier; ++ ret = proc_dointvec(table, write, buffer, lenp, ppos); ++ if (ret) ++ goto undo; ++ if (!write) ++ goto done; ++ ++ switch (sysctl_sched_pelt_multiplier) { ++ case 1: ++ fallthrough; ++ case 2: ++ fallthrough; ++ case 4: ++ WRITE_ONCE(sched_pelt_lshift, ++ sysctl_sched_pelt_multiplier >> 1); ++ goto done; ++ default: ++ ret = -EINVAL; ++ } ++ ++undo: ++ sysctl_sched_pelt_multiplier = old; ++done: ++ 
mutex_unlock(&mutex); ++ ++ return ret; ++} ++ ++static struct ctl_table sched_pelt_sysctls[] = { ++ { ++ .procname = "sched_pelt_multiplier", ++ .data = &sysctl_sched_pelt_multiplier, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = sched_pelt_multiplier, ++ }, ++ {} ++}; ++ ++static int __init sched_pelt_sysctl_init(void) ++{ ++ register_sysctl_init("kernel", sched_pelt_sysctls); ++ return 0; ++} ++late_initcall(sched_pelt_sysctl_init); ++#endif +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h +index 3a0e0dc28721..9b35b5072bae 100644 +--- a/kernel/sched/pelt.h ++++ b/kernel/sched/pelt.h +@@ -61,6 +61,14 @@ static inline void cfs_se_util_change(struct sched_avg *avg) + WRITE_ONCE(avg->util_est.enqueued, enqueued); + } + ++static inline u64 rq_clock_task_mult(struct rq *rq) ++{ ++ lockdep_assert_rq_held(rq); ++ assert_clock_updated(rq); ++ ++ return rq->clock_task_mult; ++} ++ + static inline u64 rq_clock_pelt(struct rq *rq) + { + lockdep_assert_rq_held(rq); +@@ -72,7 +80,7 @@ static inline u64 rq_clock_pelt(struct rq *rq) + /* The rq is idle, we can sync to clock_task */ + static inline void _update_idle_rq_clock_pelt(struct rq *rq) + { +- rq->clock_pelt = rq_clock_task(rq); ++ rq->clock_pelt = rq_clock_task_mult(rq); + + u64_u32_store(rq->clock_idle, rq_clock(rq)); + /* Paired with smp_rmb in migrate_se_pelt_lag() */ +@@ -121,6 +129,27 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) + rq->clock_pelt += delta; + } + ++extern unsigned int sched_pelt_lshift; ++ ++/* ++ * absolute time |1 |2 |3 |4 |5 |6 | ++ * @ mult = 1 --------****************--------****************- ++ * @ mult = 2 --------********----------------********--------- ++ * @ mult = 4 --------****--------------------****------------- ++ * clock task mult ++ * @ mult = 2 | | |2 |3 | | | | |5 |6 | | | ++ * @ mult = 4 | | | | |2|3| | | | | | | | | | |5|6| | | | | | | ++ * ++ */ ++static inline void update_rq_clock_task_mult(struct rq *rq, s64 delta) ++{ ++ delta <<= READ_ONCE(sched_pelt_lshift); ++ ++ rq->clock_task_mult += delta; ++ ++ update_rq_clock_pelt(rq, delta); ++} ++ + /* + * When rq becomes idle, we have to check if it has lost idle time + * because it was fully busy. A rq is fully used when the /Sum util_sum +@@ -147,7 +176,7 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq) + * rq's clock_task. 
+ */ + if (util_sum >= divider) +- rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt; ++ rq->lost_idle_time += rq_clock_task_mult(rq) - rq->clock_pelt; + + _update_idle_rq_clock_pelt(rq); + } +@@ -218,13 +247,18 @@ update_irq_load_avg(struct rq *rq, u64 running) + return 0; + } + +-static inline u64 rq_clock_pelt(struct rq *rq) ++static inline u64 rq_clock_task_mult(struct rq *rq) + { + return rq_clock_task(rq); + } + ++static inline u64 rq_clock_pelt(struct rq *rq) ++{ ++ return rq_clock_task_mult(rq); ++} ++ + static inline void +-update_rq_clock_pelt(struct rq *rq, s64 delta) { } ++update_rq_clock_task_mult(struct rq *rq, s64 delta) { } + + static inline void + update_idle_rq_clock_pelt(struct rq *rq) { } +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 771f8ddb7053..9e8bb6278604 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -645,6 +645,9 @@ struct cfs_rq { + int throttled; + int throttle_count; + struct list_head throttled_list; ++#ifdef CONFIG_SMP ++ struct list_head throttled_csd_list; ++#endif + #endif /* CONFIG_CFS_BANDWIDTH */ + #endif /* CONFIG_FAIR_GROUP_SCHED */ + }; +@@ -1015,6 +1018,7 @@ struct rq { + u64 clock; + /* Ensure that all clocks are in the same cache line */ + u64 clock_task ____cacheline_aligned; ++ u64 clock_task_mult; + u64 clock_pelt; + unsigned long lost_idle_time; + u64 clock_pelt_idle; +@@ -1041,7 +1045,6 @@ struct rq { + + unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; +- unsigned long cpu_capacity_inverted; + + struct balance_callback *balance_callback; + +@@ -1154,6 +1157,11 @@ struct rq { + + /* Scratch cpumask to be temporarily used under rq_lock */ + cpumask_var_t scratch_mask; ++ ++#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP) ++ call_single_data_t cfsb_csd; ++ struct list_head cfsb_csd_list; ++#endif + }; + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -2893,24 +2901,6 @@ static inline unsigned long capacity_orig_of(int cpu) + return cpu_rq(cpu)->cpu_capacity_orig; + } + +-/* +- * Returns inverted capacity if the CPU is in capacity inversion state. +- * 0 otherwise. +- * +- * Capacity inversion detection only considers thermal impact where actual +- * performance points (OPPs) gets dropped. +- * +- * Capacity inversion state happens when another performance domain that has +- * equal or lower capacity_orig_of() becomes effectively larger than the perf +- * domain this CPU belongs to due to thermal pressure throttling it hard. +- * +- * See comment in update_cpu_capacity(). 
+- */ +-static inline unsigned long cpu_in_capacity_inversion(int cpu) +-{ +- return cpu_rq(cpu)->cpu_capacity_inverted; +-} +- + /** + * enum cpu_util_type - CPU utilization type + * @FREQUENCY_UTIL: Utilization used to select frequency +-- +2.39.2 + +From a98da743d79741ac811bca0a2704902a27604768 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 6 Feb 2023 09:53:13 +0100 +Subject: [PATCH 14/15] zram + +Signed-off-by: Peter Jung +--- + Documentation/admin-guide/blockdev/zram.rst | 2 + + drivers/block/zram/zram_drv.c | 319 +++++++++++++++++++- + drivers/block/zram/zram_drv.h | 7 + + 3 files changed, 322 insertions(+), 6 deletions(-) + +diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst +index e4551579cb12..a1dd202efca1 100644 +--- a/Documentation/admin-guide/blockdev/zram.rst ++++ b/Documentation/admin-guide/blockdev/zram.rst +@@ -209,6 +209,7 @@ compact WO trigger memory compaction + debug_stat RO this file is used for zram debugging purposes + backing_dev RW set up backend storage for zram to write out + idle WO mark allocated slot as idle ++merge WO trigger merge identical pages + ====================== ====== =============================================== + + +@@ -267,6 +268,7 @@ line of text and contains the following stats separated by whitespace: + pages_compacted the number of pages freed during compaction + huge_pages the number of incompressible pages + huge_pages_since the number of incompressible pages since zram set up ++ pages_merged the number of identical pages merged into single one + ================ ============================================================= + + File /sys/block/zram/bd_stat +diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c +index e290d6d97047..084f8f830bde 100644 +--- a/drivers/block/zram/zram_drv.c ++++ b/drivers/block/zram/zram_drv.c +@@ -33,12 +33,15 @@ + #include + #include + #include ++#include ++#include + + #include "zram_drv.h" + + static DEFINE_IDR(zram_index_idr); + /* idr index must be protected */ + static DEFINE_MUTEX(zram_index_mutex); ++static DEFINE_MUTEX(zram_rbtree_mutex); + + static int zram_major; + static const char *default_compressor = CONFIG_ZRAM_DEF_COMP; +@@ -57,6 +60,16 @@ static void zram_free_page(struct zram *zram, size_t index); + static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset, struct bio *bio); + ++struct zram_rbtree_node { ++ struct rb_node node; ++ unsigned long key; ++ unsigned long cnt; ++}; ++ ++struct zram_hash_node { ++ unsigned long index; ++ struct hlist_node next; ++}; + + static int zram_slot_trylock(struct zram *zram, u32 index) + { +@@ -1140,7 +1153,7 @@ static ssize_t recomp_algorithm_store(struct device *dev, + while (*args) { + args = next_arg(args, ¶m, &val); + +- if (!*val) ++ if (!val || !*val) + return -EINVAL; + + if (!strcmp(param, "algo")) { +@@ -1184,6 +1197,30 @@ static ssize_t compact_store(struct device *dev, + return len; + } + ++static int zram_do_scan(struct zram *zram); ++ ++static ssize_t merge_store(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t len) ++{ ++ struct zram *zram = dev_to_zram(dev); ++ int ret; ++ ++ down_read(&zram->init_lock); ++ if (!init_done(zram)) { ++ up_read(&zram->init_lock); ++ return -EINVAL; ++ } ++ ++ ret = zram_do_scan(zram); ++ if (ret != 0) { ++ up_read(&zram->init_lock); ++ return -ENOMEM; ++ } ++ ++ up_read(&zram->init_lock); ++ return len; ++} ++ + static ssize_t io_stat_show(struct device *dev, + 
struct device_attribute *attr, char *buf) + { +@@ -1223,7 +1260,7 @@ static ssize_t mm_stat_show(struct device *dev, + max_used = atomic_long_read(&zram->stats.max_used_pages); + + ret = scnprintf(buf, PAGE_SIZE, +- "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n", ++ "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu %8llu\n", + orig_size << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.compr_data_size), + mem_used << PAGE_SHIFT, +@@ -1232,7 +1269,8 @@ static ssize_t mm_stat_show(struct device *dev, + (u64)atomic64_read(&zram->stats.same_pages), + atomic_long_read(&pool_stats.pages_compacted), + (u64)atomic64_read(&zram->stats.huge_pages), +- (u64)atomic64_read(&zram->stats.huge_pages_since)); ++ (u64)atomic64_read(&zram->stats.huge_pages_since), ++ (u64)atomic64_read(&zram->stats.pages_merged)); + up_read(&zram->init_lock); + + return ret; +@@ -1283,6 +1321,248 @@ static DEVICE_ATTR_RO(bd_stat); + #endif + static DEVICE_ATTR_RO(debug_stat); + ++static bool zram_rbtree_insert(struct rb_root *root, struct zram_rbtree_node *data) ++{ ++ struct rb_node **new = &(root->rb_node), *parent = NULL; ++ struct zram_rbtree_node *this; ++ ++ while (*new) { ++ this = rb_entry(*new, struct zram_rbtree_node, node); ++ parent = *new; ++ if (data->key < this->key) ++ new = &((*new)->rb_left); ++ else if (data->key > this->key) ++ new = &((*new)->rb_right); ++ else ++ return false; ++ } ++ ++ rb_link_node(&data->node, parent, new); ++ rb_insert_color(&data->node, root); ++ return true; ++} ++ ++static struct zram_rbtree_node *zram_rbtree_search(struct rb_root *root, ++ unsigned long key) ++{ ++ struct rb_node *node = root->rb_node; ++ struct zram_rbtree_node *data; ++ ++ while (node) { ++ data = rb_entry(node, struct zram_rbtree_node, node); ++ if (key < data->key) ++ node = node->rb_left; ++ else if (key > data->key) ++ node = node->rb_right; ++ else ++ return data; ++ } ++ ++ return NULL; ++} ++ ++static unsigned long zram_calc_hash(void *src, size_t len) ++{ ++ return xxhash(src, len, 0); ++} ++ ++static int zram_cmp_obj_and_merge(struct zram *zram, struct hlist_head *htable, ++ size_t htable_size, size_t index) ++{ ++ struct zram_rbtree_node *rb_node; ++ struct zram_hash_node *node; ++ unsigned long handle, cur_handle; ++ size_t obj_size; ++ char *src, *buf; ++ unsigned long hash; ++ int ret = 0; ++ ++ handle = zram_get_handle(zram, index); ++ if (!handle) ++ return ret; ++ ++ obj_size = zram_get_obj_size(zram, index); ++ buf = kmalloc(obj_size, GFP_KERNEL); ++ if (!buf) { ++ pr_err("Failed to allocate zs_map_object buffer\n"); ++ return -ENOMEM; ++ } ++ ++ src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); ++ memcpy(buf, src, obj_size); ++ zs_unmap_object(zram->mem_pool, handle); ++ hash = zram_calc_hash(buf, obj_size); ++ ++ mutex_lock(&zram_rbtree_mutex); ++ hlist_for_each_entry(node, &htable[hash % htable_size], next) { ++ int cmp; ++ ++ zram_slot_lock(zram, node->index); ++ ++ /* ++ * Page may change as the hash table is being formed, ++ * so the checks below are necessary. 
++ */ ++ cur_handle = zram_get_handle(zram, node->index); ++ if (handle == cur_handle || ++ obj_size != zram_get_obj_size(zram, node->index)) { ++ zram_slot_unlock(zram, node->index); ++ continue; ++ } ++ ++ src = zs_map_object(zram->mem_pool, cur_handle, ZS_MM_RO); ++ cmp = memcmp(buf, src, obj_size); ++ zs_unmap_object(zram->mem_pool, cur_handle); ++ ++ if (!cmp) { ++ rb_node = zram_rbtree_search(&zram->sph_rbtree, handle); ++ ++ /* ++ * This check is necessary in order not to zs_free an object ++ * that another slot still refers to. This situation is possible ++ * with repeated calls to zram_do_scan(). For example: ++ * ++ * [slot0] [slot1] [slot2] [slot3] [slot4] ++ * [obj0] [obj1] [obj2] [obj3] [obj4] ++ * ++ * Let's imagine that obj2 and obj3 are equal, and we called ++ * the zram_do_scan() function: ++ * ++ * [slot0] [slot1] [slot2] [slot3] [slot4] ++ * [obj0] [obj1] [obj2] [obj2] [obj4] ++ * ++ * Now, slot2 and slot3 refer to the obj2 zsmalloc object. ++ * Time passed, and now slot0 refers to obj0_n, which is equal ++ * to obj2: ++ * ++ * [slot0] [slot1] [slot2] [slot3] [slot4] ++ * [obj0_n] [obj1] [obj2] [obj2] [obj4] ++ * ++ * Now we call the zram_do_scan() function again. We get to slot2, ++ * and we see that the obj2 and obj0_n hashes are the same. We ++ * try to zs_free(obj2), but slot3 still refers to it. ++ * ++ * This is not correct! ++ */ ++ if (unlikely(rb_node)) ++ if (rb_node->cnt > 1) { ++ zram_slot_unlock(zram, node->index); ++ continue; ++ } ++ ++ zram_set_handle(zram, index, cur_handle); ++ zs_free(zram->mem_pool, handle); ++ ++ rb_node = zram_rbtree_search(&zram->sph_rbtree, cur_handle); ++ ++ if (!rb_node) { ++ rb_node = kzalloc(sizeof(struct zram_rbtree_node), ++ GFP_KERNEL); ++ if (!rb_node) { ++ pr_err("Failed to allocate rb_node\n"); ++ ret = -ENOMEM; ++ zram_slot_unlock(zram, node->index); ++ mutex_unlock(&zram_rbtree_mutex); ++ goto merged_or_err; ++ } ++ ++ rb_node->key = cur_handle; ++ /* Two slots refer to the zsmalloc object with key cur_handle */ ++ rb_node->cnt = 2; ++ zram_rbtree_insert(&zram->sph_rbtree, rb_node); ++ } else { ++ rb_node->cnt++; ++ } ++ ++ atomic64_inc(&zram->stats.pages_merged); ++ atomic64_sub(obj_size, &zram->stats.compr_data_size); ++ zram_set_flag(zram, index, ZRAM_MERGED); ++ zram_set_flag(zram, node->index, ZRAM_MERGED); ++ ++ zram_slot_unlock(zram, node->index); ++ mutex_unlock(&zram_rbtree_mutex); ++ goto merged_or_err; ++ } ++ ++ zram_slot_unlock(zram, node->index); ++ } ++ ++ mutex_unlock(&zram_rbtree_mutex); ++ ++ node = kmalloc(sizeof(struct zram_hash_node), GFP_KERNEL); ++ if (!node) { ++ ret = -ENOMEM; ++ goto merged_or_err; ++ } ++ ++ node->index = index; ++ hlist_add_head(&node->next, &htable[hash % htable_size]); ++ ++merged_or_err: ++ kfree(buf); ++ return ret; ++} ++ ++static void zram_free_htable_entries(struct hlist_head *htable, ++ size_t htable_size) ++{ ++ struct hlist_node *n; ++ struct zram_hash_node *node; ++ ++ hlist_for_each_entry_safe(node, n, htable, next) { ++ hlist_del(&node->next); ++ kfree(node); ++ } ++} ++ ++static int zram_do_scan(struct zram *zram) ++{ ++ size_t num_pages = zram->disksize >> PAGE_SHIFT; ++ size_t htable_size = num_pages; ++ size_t index; ++ struct hlist_head *htable; ++ int i, ret = 0; ++ ++ htable = vzalloc(htable_size * sizeof(struct hlist_head)); ++ if (!htable) { ++ pr_err("Failed to allocate hash table\n"); ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < htable_size; i++) ++ INIT_HLIST_HEAD(&htable[i]); ++ ++ for (index = 0; index < num_pages; index++) { ++ zram_slot_lock(zram, 
index); ++ ++ if (!zram_allocated(zram, index)) { ++ zram_slot_unlock(zram, index); ++ continue; ++ } ++ ++ if (zram_test_flag(zram, index, ZRAM_UNDER_WB) || ++ zram_test_flag(zram, index, ZRAM_WB) || ++ zram_test_flag(zram, index, ZRAM_SAME)) { ++ zram_slot_unlock(zram, index); ++ continue; ++ } ++ ++ /* Ignore pages that have been recompressed */ ++ if (zram_get_priority(zram, index) != 0) ++ continue; ++ ++ ret = zram_cmp_obj_and_merge(zram, htable, htable_size, index); ++ zram_slot_unlock(zram, index); ++ if (ret != 0) ++ goto out; ++ } ++ ++out: ++ zram_free_htable_entries(htable, htable_size); ++ vfree(htable); ++ return ret; ++} ++ + static void zram_meta_free(struct zram *zram, u64 disksize) + { + size_t num_pages = disksize >> PAGE_SHIFT; +@@ -1324,6 +1604,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) + static void zram_free_page(struct zram *zram, size_t index) + { + unsigned long handle; ++ struct zram_rbtree_node *node; + + #ifdef CONFIG_ZRAM_MEMORY_TRACKING + zram->table[index].ac_time = 0; +@@ -1361,7 +1642,28 @@ static void zram_free_page(struct zram *zram, size_t index) + if (!handle) + return; + +- zs_free(zram->mem_pool, handle); ++ if (zram_test_flag(zram, index, ZRAM_MERGED)) { ++ zram_clear_flag(zram, index, ZRAM_MERGED); ++ mutex_lock(&zram_rbtree_mutex); ++ ++ node = zram_rbtree_search(&zram->sph_rbtree, handle); ++ BUG_ON(!node); ++ ++ node->cnt--; ++ if (node->cnt == 0) { ++ rb_erase(&node->node, &zram->sph_rbtree); ++ mutex_unlock(&zram_rbtree_mutex); ++ ++ zs_free(zram->mem_pool, handle); ++ kfree(node); ++ } else { ++ mutex_unlock(&zram_rbtree_mutex); ++ } ++ ++ atomic64_dec(&zram->stats.pages_merged); ++ } else { ++ zs_free(zram->mem_pool, handle); ++ } + + atomic64_sub(zram_get_obj_size(zram, index), + &zram->stats.compr_data_size); +@@ -1824,7 +2126,7 @@ static ssize_t recompress_store(struct device *dev, + while (*args) { + args = next_arg(args, ¶m, &val); + +- if (!*val) ++ if (!val || !*val) + return -EINVAL; + + if (!strcmp(param, "type")) { +@@ -1909,7 +2211,8 @@ static ssize_t recompress_store(struct device *dev, + if (zram_test_flag(zram, index, ZRAM_WB) || + zram_test_flag(zram, index, ZRAM_UNDER_WB) || + zram_test_flag(zram, index, ZRAM_SAME) || +- zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) ++ zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE) || ++ zram_test_flag(zram, index, ZRAM_MERGED)) + goto next; + + err = zram_recompress(zram, index, page, threshold, +@@ -2295,6 +2598,7 @@ static const struct block_device_operations zram_devops = { + }; + + static DEVICE_ATTR_WO(compact); ++static DEVICE_ATTR_WO(merge); + static DEVICE_ATTR_RW(disksize); + static DEVICE_ATTR_RO(initstate); + static DEVICE_ATTR_WO(reset); +@@ -2335,6 +2639,7 @@ static struct attribute *zram_disk_attrs[] = { + #ifdef CONFIG_ZRAM_WRITEBACK + &dev_attr_bd_stat.attr, + #endif ++ &dev_attr_merge.attr, + &dev_attr_debug_stat.attr, + #ifdef CONFIG_ZRAM_MULTI_COMP + &dev_attr_recomp_algorithm.attr, +@@ -2421,6 +2726,8 @@ static int zram_add(void) + + comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); + ++ zram->sph_rbtree = RB_ROOT; ++ + zram_debugfs_register(zram); + pr_info("Added device: %s\n", zram->disk->disk_name); + return device_id; +diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h +index c5254626f051..2afdbf76a1aa 100644 +--- a/drivers/block/zram/zram_drv.h ++++ b/drivers/block/zram/zram_drv.h +@@ -56,6 +56,7 @@ enum zram_pageflags { + + ZRAM_COMP_PRIORITY_BIT1, /* First bit of comp priority index */ + 
ZRAM_COMP_PRIORITY_BIT2, /* Second bit of comp priority index */ ++ ZRAM_MERGED, /* page was merged */ + + __NR_ZRAM_PAGEFLAGS, + }; +@@ -87,6 +88,7 @@ struct zram_stats { + atomic_long_t max_used_pages; /* no. of maximum pages stored */ + atomic64_t writestall; /* no. of write slow paths */ + atomic64_t miss_free; /* no. of missed free */ ++ atomic64_t pages_merged; /* no. of pages, which merged into single one */ + #ifdef CONFIG_ZRAM_WRITEBACK + atomic64_t bd_count; /* no. of pages in backing device */ + atomic64_t bd_reads; /* no. of reads from backing device */ +@@ -140,5 +142,10 @@ struct zram { + #ifdef CONFIG_ZRAM_MEMORY_TRACKING + struct dentry *debugfs_dir; + #endif ++ /* ++ * This is same pages handle's rb tree, where the key is a handle ++ * to same pages and the value is a link counter ++ */ ++ struct rb_root sph_rbtree; + }; + #endif +-- +2.39.2 + +From d28acbb9cafa5f1fa935147e0dc23e1a211848e7 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Tue, 14 Feb 2023 22:02:09 +0100 +Subject: [PATCH 15/15] zstd import v1.5.4 + +Signed-off-by: Peter Jung +--- + include/linux/zstd.h | 2 +- + include/linux/zstd_errors.h | 23 +- + include/linux/zstd_lib.h | 569 +++++-- + lib/zstd/Makefile | 2 +- + lib/zstd/common/bits.h | 124 ++ + lib/zstd/common/bitstream.h | 51 +- + lib/zstd/common/compiler.h | 14 +- + lib/zstd/common/cpu.h | 3 +- + lib/zstd/common/debug.c | 3 +- + lib/zstd/common/debug.h | 3 +- + lib/zstd/common/entropy_common.c | 42 +- + lib/zstd/common/error_private.c | 12 +- + lib/zstd/common/error_private.h | 3 +- + lib/zstd/common/fse.h | 89 +- + lib/zstd/common/fse_decompress.c | 94 +- + lib/zstd/common/huf.h | 222 +-- + lib/zstd/common/mem.h | 2 +- + lib/zstd/common/portability_macros.h | 26 +- + lib/zstd/common/zstd_common.c | 3 +- + lib/zstd/common/zstd_deps.h | 2 +- + lib/zstd/common/zstd_internal.h | 94 +- + lib/zstd/compress/clevels.h | 3 +- + lib/zstd/compress/fse_compress.c | 59 +- + lib/zstd/compress/hist.c | 3 +- + lib/zstd/compress/hist.h | 3 +- + lib/zstd/compress/huf_compress.c | 372 ++-- + lib/zstd/compress/zstd_compress.c | 1491 ++++++++++++----- + lib/zstd/compress/zstd_compress_internal.h | 267 +-- + lib/zstd/compress/zstd_compress_literals.c | 155 +- + lib/zstd/compress/zstd_compress_literals.h | 25 +- + lib/zstd/compress/zstd_compress_sequences.c | 7 +- + lib/zstd/compress/zstd_compress_sequences.h | 3 +- + lib/zstd/compress/zstd_compress_superblock.c | 47 +- + lib/zstd/compress/zstd_compress_superblock.h | 3 +- + lib/zstd/compress/zstd_cwksp.h | 5 +- + lib/zstd/compress/zstd_double_fast.c | 129 +- + lib/zstd/compress/zstd_double_fast.h | 6 +- + lib/zstd/compress/zstd_fast.c | 582 +++++-- + lib/zstd/compress/zstd_fast.h | 6 +- + lib/zstd/compress/zstd_lazy.c | 364 ++-- + lib/zstd/compress/zstd_lazy.h | 7 +- + lib/zstd/compress/zstd_ldm.c | 11 +- + lib/zstd/compress/zstd_ldm.h | 3 +- + lib/zstd/compress/zstd_ldm_geartab.h | 3 +- + lib/zstd/compress/zstd_opt.c | 185 +- + lib/zstd/compress/zstd_opt.h | 3 +- + lib/zstd/decompress/huf_decompress.c | 731 ++++---- + lib/zstd/decompress/zstd_ddict.c | 8 +- + lib/zstd/decompress/zstd_ddict.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 215 ++- + lib/zstd/decompress/zstd_decompress_block.c | 252 ++- + lib/zstd/decompress/zstd_decompress_block.h | 3 +- + .../decompress/zstd_decompress_internal.h | 7 +- + lib/zstd/decompress_sources.h | 2 +- + lib/zstd/zstd_common_module.c | 2 +- + lib/zstd/zstd_compress_module.c | 2 +- + lib/zstd/zstd_decompress_module.c | 4 +- + 57 files changed, 4086 insertions(+), 2268 deletions(-) 
+ create mode 100644 lib/zstd/common/bits.h + +diff --git a/include/linux/zstd.h b/include/linux/zstd.h +index 113408eef6ec..f109d49f43f8 100644 +--- a/include/linux/zstd.h ++++ b/include/linux/zstd.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h +index 58b6dd45a969..6d5cf55f0bf3 100644 +--- a/include/linux/zstd_errors.h ++++ b/include/linux/zstd_errors.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -17,8 +18,17 @@ + + + /* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +-#define ZSTDERRORLIB_VISIBILITY +-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY ++#define ZSTDERRORLIB_VISIBLE ++ ++#ifndef ZSTDERRORLIB_HIDDEN ++# if (__GNUC__ >= 4) && !defined(__MINGW32__) ++# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) ++# else ++# define ZSTDERRORLIB_HIDDEN ++# endif ++#endif ++ ++#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE + + /*-********************************************* + * Error codes list +@@ -43,14 +53,17 @@ typedef enum { + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, ++ ZSTD_error_literals_headerWrong = 24, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, ++ ZSTD_error_parameter_combination_unsupported = 41, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, ++ ZSTD_error_stabilityCondition_notRespected = 50, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, +@@ -58,11 +71,15 @@ typedef enum { + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, ++ ZSTD_error_noForwardProgress_destFull = 80, ++ ZSTD_error_noForwardProgress_inputEmpty = 82, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_srcBuffer_wrong = 105, ++ ZSTD_error_sequenceProducer_failed = 106, ++ ZSTD_error_externalSequences_invalid = 107, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ + } ZSTD_ErrorCode; + +diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h +index 79d55465d5c1..dc7e9605a624 100644 +--- a/include/linux/zstd_lib.h ++++ b/include/linux/zstd_lib.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,23 +12,42 @@ + #ifndef ZSTD_H_235446 + #define ZSTD_H_235446 + +-/* ====== Dependency ======*/ ++/* ====== Dependencies ======*/ + #include /* INT_MAX */ + #include /* size_t */ + + + /* ===== ZSTDLIB_API : control library symbols visibility ===== */ +-#ifndef ZSTDLIB_VISIBLE ++#define ZSTDLIB_VISIBLE ++ ++#ifndef ZSTDLIB_HIDDEN + # if (__GNUC__ >= 4) && !defined(__MINGW32__) +-# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) + # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) + # else +-# define ZSTDLIB_VISIBLE + # define ZSTDLIB_HIDDEN + # endif + #endif ++ + #define ZSTDLIB_API ZSTDLIB_VISIBLE + ++/* Deprecation warnings : ++ * Should these warnings be a problem, it is generally possible to disable them, ++ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. ++ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. ++ */ ++#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS ++# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ ++#else ++# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) ++# elif (__GNUC__ >= 3) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) ++# else ++# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") ++# define ZSTD_DEPRECATED(message) ++# endif ++#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ ++ + + /* ***************************************************************************** + Introduction +@@ -65,7 +85,7 @@ + /*------ Version ------*/ + #define ZSTD_VERSION_MAJOR 1 + #define ZSTD_VERSION_MINOR 5 +-#define ZSTD_VERSION_RELEASE 2 ++#define ZSTD_VERSION_RELEASE 4 + #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + + /*! ZSTD_versionNumber() : +@@ -156,7 +176,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +-ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); ++ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") ++ZSTDLIB_API ++unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + + /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ + * `src` should point to the start of a ZSTD frame or skippable frame. +@@ -168,8 +190,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) + + + /*====== Helper functions ======*/ +-#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +-ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_compressBound() : ++ * maximum compressed size in worst case single-pass scenario. 
++ * When invoking `ZSTD_compress()` or any other one-pass compression function, ++ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) ++ * as it eliminates one potential failure scenario, ++ * aka not enough room in dst buffer to write the compressed frame. ++ * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE . ++ * In which case, ZSTD_compressBound() will return an error code ++ * which can be tested using ZSTD_isError(). ++ * ++ * ZSTD_COMPRESSBOUND() : ++ * same as ZSTD_compressBound(), but as a macro. ++ * It can be used to produce constants, which can be useful for static allocation, ++ * for example to size a static array on stack. ++ * Will produce constant value 0 if srcSize too large. ++ */ ++#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00LLU : 0xFF00FF00U) ++#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ ++ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_isError() : ++ * Most ZSTD_* functions returning a size_t value can be tested for error, ++ * using ZSTD_isError(). ++ * @return 1 if error, 0 otherwise ++ */ + ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ + ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ + ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +@@ -412,6 +456,9 @@ typedef enum { + * ZSTD_c_validateSequences + * ZSTD_c_useBlockSplitter + * ZSTD_c_useRowMatchFinder ++ * ZSTD_c_prefetchCDictTables ++ * ZSTD_c_enableSeqProducerFallback ++ * ZSTD_c_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. +@@ -430,7 +477,11 @@ typedef enum { + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, +- ZSTD_c_experimentalParam15=1012 ++ ZSTD_c_experimentalParam15=1012, ++ ZSTD_c_experimentalParam16=1013, ++ ZSTD_c_experimentalParam17=1014, ++ ZSTD_c_experimentalParam18=1015, ++ ZSTD_c_experimentalParam19=1016 + } ZSTD_cParameter; + + typedef struct { +@@ -493,7 +544,7 @@ typedef enum { + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". +- * This removes any reference to any dictionary too. ++ * This also removes any reference to any dictionary or external sequence producer. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. +@@ -543,13 +594,15 @@ typedef enum { + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts ++ * ZSTD_d_disableHuffmanAssembly + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? 
names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, +- ZSTD_d_experimentalParam4=1003 ++ ZSTD_d_experimentalParam4=1003, ++ ZSTD_d_experimentalParam5=1004 + + } ZSTD_dParameter; + +@@ -728,8 +781,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * This following is a legacy streaming API, available since v1.0+ . + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported. +- * Streaming in combination with advanced parameters and dictionary compression +- * can only be used through the new API. + ******************************************************************************/ + + /*! +@@ -738,6 +789,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); ++ * ++ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API ++ * to compress with a dictionary. + */ + ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); + /*! +@@ -788,13 +842,31 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer + + /*===== Streaming decompression functions =====*/ + +-/* This function is redundant with the advanced API and equivalent to: ++/*! ZSTD_initDStream() : ++ * Initialize/reset DStream state for new decompression operation. ++ * Call before new decompression operation using same DStream. + * ++ * Note : This function is redundant with the advanced API and equivalent to: + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ + ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + ++/*! ZSTD_decompressStream() : ++ * Streaming decompression function. ++ * Call repetitively to consume full input updating it as necessary. ++ * Function will update both input and output `pos` fields exposing current state via these fields: ++ * - `input.pos < input.size`, some input remaining and caller should provide remaining input ++ * on the next call. ++ * - `output.pos < output.size`, decoder finished and flushed all remaining buffers. ++ * - `output.pos == output.size`, potentially uncflushed data present in the internal buffers, ++ * call ZSTD_decompressStream() again to flush remaining data to output. ++ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * @return : 0 when a frame is completely decoded and fully flushed, ++ * or an error code, which can be tested using ZSTD_isError(), ++ * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. ++ */ + ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + + ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +@@ -913,7 +985,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); + * If @return == 0, the dictID could not be decoded. + * This could for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). +- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. ++ * - The frame was built with dictID intentionally removed. 
Whatever dictionary is necessary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. +@@ -937,8 +1009,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". +- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. +- * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). ++ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, ++ * until parameters are reset, a new dictionary is loaded, or the dictionary ++ * is explicitly invalidated by loading a NULL dictionary. + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. + * Tables are dependent on compression parameters, and for this reason, +@@ -951,7 +1024,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + + /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ +- * Reference a prepared dictionary, to be used for all next compressed frames. ++ * Reference a prepared dictionary, to be used for all future compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. +@@ -986,9 +1059,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, + const void* prefix, size_t prefixSize); + + /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ +- * Create an internal DDict from dict buffer, +- * to be used to decompress next frames. +- * The dictionary remains valid for all future frames, until explicitly invalidated. ++ * Create an internal DDict from dict buffer, to be used to decompress all future frames. ++ * The dictionary remains valid for all future frames, until explicitly invalidated, or ++ * a new dictionary is loaded. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". +@@ -1012,9 +1085,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). + * ++ * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary ++ * will be managed, and referencing a dictionary effectively "discards" any previous one. ++ * + * @result : 0, or an error code (which can be tested with ZSTD_isError()). +- * Note 1 : Currently, only one dictionary can be managed. +- * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. 
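
For illustration only (not part of the upstream patch): a rough sketch of the dictionary round trip described above, combining ZSTD_compressBound(), ZSTD_CCtx_loadDictionary() and ZSTD_DCtx_loadDictionary(). The helper name dict_round_trip and its minimal error handling are assumptions of this sketch; the ZSTD_* calls are the ones documented in this hunk.

#include <stdlib.h>
#include <zstd.h>

/* Compress `src` with a caller-provided dictionary, then decompress it again.
 * Returns 0 on success, non-zero on any zstd error or size mismatch. */
static int dict_round_trip(const void* dict, size_t dictSize,
                           const void* src, size_t srcSize)
{
    int err = 1;
    size_t const dstCapacity = ZSTD_compressBound(srcSize);  /* worst-case single-pass size */
    void* const dst = malloc(dstCapacity);
    void* const rt  = malloc(srcSize);
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
    if (dst && rt && cctx && dctx) {
        /* The dictionary is sticky: it stays loaded for every following frame. */
        size_t const lc = ZSTD_CCtx_loadDictionary(cctx, dict, dictSize);
        size_t const csize = ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
        size_t const ld = ZSTD_DCtx_loadDictionary(dctx, dict, dictSize);
        size_t const dsize = ZSTD_isError(csize) ? csize
                           : ZSTD_decompressDCtx(dctx, rt, srcSize, dst, csize);
        err = ZSTD_isError(lc) || ZSTD_isError(csize) ||
              ZSTD_isError(ld) || ZSTD_isError(dsize) || (dsize != srcSize);
    }
    ZSTD_freeCCtx(cctx);
    ZSTD_freeDCtx(dctx);
    free(dst); free(rt);
    return err;  /* 0 on success */
}
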
+ */ +@@ -1071,24 +1145,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE + #endif + +-/* Deprecation warnings : +- * Should these warnings be a problem, it is generally possible to disable them, +- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. +- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. +- */ +-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */ +-#else +-# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message))) +-# elif (__GNUC__ >= 3) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated)) +-# else +-# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API +-# endif +-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ +- + /* ************************************************************************************** + * experimental API (static linking only) + **************************************************************************************** +@@ -1123,6 +1179,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ + #define ZSTD_STRATEGY_MIN ZSTD_fast + #define ZSTD_STRATEGY_MAX ZSTD_btultra2 ++#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */ + + + #define ZSTD_OVERLAPLOG_MIN 0 +@@ -1350,29 +1407,85 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size + * or an error code (if srcSize is too small) */ + ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); + ++/*! ZSTD_decompressionMargin() : ++ * Zstd supports in-place decompression, where the input and output buffers overlap. ++ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, ++ * and the input buffer must be at the end of the output buffer. ++ * ++ * _______________________ Output Buffer ________________________ ++ * | | ++ * | ____ Input Buffer ____| ++ * | | | ++ * v v v ++ * |---------------------------------------|-----------|----------| ++ * ^ ^ ^ ++ * |___________________ Output_Size ___________________|_ Margin _| ++ * ++ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN(). ++ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or ++ * ZSTD_decompressDCtx(). ++ * NOTE: This function supports multi-frame input. ++ * ++ * @param src The compressed frame(s) ++ * @param srcSize The size of the compressed frame(s) ++ * @returns The decompression margin or an error that can be checked with ZSTD_isError(). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize); ++ ++/*! ZSTD_DECOMPRESS_MARGIN() : ++ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from ++ * the compressed frame, compute it from the original size and the blockSizeLog. ++ * See ZSTD_decompressionMargin() for details. ++ * ++ * WARNING: This macro does not support multi-frame input, the input must be a single ++ * zstd frame. If you need that support use the function, or implement it yourself. 
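
As a hedged sketch of the in-place decompression layout pictured above (again, not part of the patch): the compressed frame is copied to the tail of a single allocation sized with ZSTD_decompressionMargin(), and ZSTD_decompress() writes the regenerated data from the front. The helper name and the requirement that the frame declares its content size are assumptions of this example.

#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_decompressionMargin() is part of the static-only API */
#include <stdlib.h>
#include <string.h>
#include <zstd.h>

/* Decompress `src` in place inside one buffer of (Output_Size + Margin) bytes.
 * Returns the buffer (decompressed bytes start at offset 0) or NULL on failure. */
static char* inplace_decompress(const void* src, size_t srcSize, size_t* outSize)
{
    unsigned long long const contentSize = ZSTD_getFrameContentSize(src, srcSize);
    size_t const margin = ZSTD_decompressionMargin(src, srcSize);
    size_t bufSize, dSize;
    char* buf;

    if (ZSTD_isError(margin)
        || contentSize == ZSTD_CONTENTSIZE_UNKNOWN
        || contentSize == ZSTD_CONTENTSIZE_ERROR)
        return NULL;                                   /* sketch: give up on anything unusual */

    bufSize = (size_t)contentSize + margin;            /* Output_Size + Margin, as in the diagram */
    buf = malloc(bufSize);
    if (buf == NULL) return NULL;

    memcpy(buf + bufSize - srcSize, src, srcSize);     /* input sits at the end of the output buffer */
    dSize = ZSTD_decompress(buf, bufSize, buf + bufSize - srcSize, srcSize);
    if (ZSTD_isError(dSize)) { free(buf); return NULL; }

    *outSize = dSize;
    return buf;
}
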
++ * ++ * @param originalSize The original uncompressed size of the data. ++ * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX). ++ * Unless you explicitly set the windowLog smaller than ++ * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX. ++ */ ++#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \ ++ ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \ ++ 4 /* checksum */ + \ ++ ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \ ++ (blockSize) /* One block of margin */ \ ++ )) ++ + typedef enum { + ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ + ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ + } ZSTD_sequenceFormat_e; + ++/*! ZSTD_sequenceBound() : ++ * `srcSize` : size of the input buffer ++ * @return : upper-bound for the number of sequences that can be generated ++ * from a buffer of srcSize bytes ++ * ++ * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize); ++ + /*! ZSTD_generateSequences() : +- * Generate sequences using ZSTD_compress2, given a source buffer. ++ * Generate sequences using ZSTD_compress2(), given a source buffer. + * + * Each block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. + * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) + * simply acts as a block delimiter. + * +- * zc can be used to insert custom compression params. +- * This function invokes ZSTD_compress2 ++ * @zc can be used to insert custom compression params. ++ * This function invokes ZSTD_compress2(). + * + * The output of this function can be fed into ZSTD_compressSequences() with CCtx + * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters + * @return : number of sequences generated + */ + +-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +- size_t outSeqsSize, const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ++ZSTD_generateSequences( ZSTD_CCtx* zc, ++ ZSTD_Sequence* outSeqs, size_t outSeqsSize, ++ const void* src, size_t srcSize); + + /*! ZSTD_mergeBlockDelimiters() : + * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals +@@ -1388,7 +1501,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o + ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + + /*! ZSTD_compressSequences() : +- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. ++ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. ++ * @src contains the entire input (not just the literals). ++ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals + * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) + * The entire source is compressed into a single frame. + * +@@ -1413,11 +1528,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si + * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. 
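
The following sketch (an illustration, not from the patch) strings together ZSTD_sequenceBound(), ZSTD_generateSequences() and ZSTD_compressSequences() as described in this hunk, with explicit block delimiters; the helper name and the choice to reuse a single CCtx are assumptions.

#define ZSTD_STATIC_LINKING_ONLY
#include <stdlib.h>
#include <zstd.h>

/* Re-compress `src` by first extracting its sequences, then feeding them back
 * through ZSTD_compressSequences(). Returns the compressed size or an error code. */
static size_t sequences_round_trip(void* dst, size_t dstCapacity,
                                   const void* src, size_t srcSize)
{
    size_t result = (size_t)-1;
    size_t const maxSeqs = ZSTD_sequenceBound(srcSize);
    ZSTD_Sequence* const seqs = malloc(maxSeqs * sizeof(ZSTD_Sequence));
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    if (seqs && cctx) {
        size_t const nbSeqs = ZSTD_generateSequences(cctx, seqs, maxSeqs, src, srcSize);
        if (!ZSTD_isError(nbSeqs)) {
            /* generateSequences() emits block delimiters, so declare them explicitly. */
            ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters);
            ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters,
                                   ZSTD_sf_explicitBlockDelimiters);
            result = ZSTD_compressSequences(cctx, dst, dstCapacity,
                                            seqs, nbSeqs, src, srcSize);
        }
    }
    ZSTD_freeCCtx(cctx);
    free(seqs);
    return result;   /* can be tested with ZSTD_isError() */
}
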
+ * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, + * and cannot emit an RLE block that disagrees with the repcode history +- * @return : final compressed size or a ZSTD error. ++ * @return : final compressed size, or a ZSTD error code. + */ +-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, +- const ZSTD_Sequence* inSeqs, size_t inSeqsSize, +- const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* src, size_t srcSize); + + + /*! ZSTD_writeSkippableFrame() : +@@ -1481,8 +1597,11 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * +- * Note 2 : only single-threaded compression is supported. ++ * Note : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * ++ * Note 2 : ZSTD_estimateCCtxSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. + */ + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); +@@ -1501,7 +1620,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); + * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. +- * In this case, get total size by adding ZSTD_estimate?DictSize */ ++ * In this case, get total size by adding ZSTD_estimate?DictSize ++ * Note 2 : only single-threaded compression is supported. ++ * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * Note 3 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. ++ */ + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +@@ -1649,22 +1773,31 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + * This function never fails (wide contract) */ + ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + ++/*! ZSTD_CCtx_setCParams() : ++ * Set all parameters provided within @cparams into the working @cctx. ++ * Note : if modifying parameters during compression (MT mode only), ++ * note that changes to the .windowLog parameter will be ignored. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); ++ + /*! 
ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- const void* dict,size_t dictSize, +- ZSTD_parameters params); ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ const void* dict,size_t dictSize, ++ ZSTD_parameters params); + + /*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -1808,13 +1941,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. + * +- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same +- * between calls, except for the modifications that zstd makes to pos (the +- * caller must not modify pos). This is checked by the compressor, and +- * compression will fail if it ever changes. This means the only flush +- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end +- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) +- * MUST not be modified during compression or you will get data corruption. ++ * Tells the compressor that input data presented with ZSTD_inBuffer ++ * will ALWAYS be the same between calls. ++ * Technically, the @src pointer must never be changed, ++ * and the @pos field can only be updated by zstd. ++ * However, it's possible to increase the @size field, ++ * allowing scenarios where more data can be appended after compressions starts. ++ * These conditions are checked by the compressor, ++ * and compression will fail if they are not respected. ++ * Also, data in the ZSTD_inBuffer within the range [src, src + pos) ++ * MUST not be modified during compression or it will result in data corruption. + * + * When this flag is enabled zstd won't allocate an input window buffer, + * because the user guarantees it can reference the ZSTD_inBuffer until +@@ -1822,18 +1958,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also + * avoid the memcpy() from the input buffer to the input window buffer. + * +- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. +- * That means this flag cannot be used with ZSTD_compressStream(). +- * + * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds +- * memory. However, compression WILL fail if you violate the preconditions. ++ * memory. However, compression WILL fail if conditions are not respected. + * +- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST +- * not be modified during compression or you will get data corruption. 
This +- * is because zstd needs to reference data in the ZSTD_inBuffer to find ++ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST ++ * not be modified during compression or it will result in data corruption. ++ * This is because zstd needs to reference data in the ZSTD_inBuffer to find + * matches. Normally zstd maintains its own window buffer for this purpose, +- * but passing this flag tells zstd to use the user provided buffer. ++ * but passing this flag tells zstd to rely on user provided buffer instead. + */ + #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 + +@@ -1878,7 +2011,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Without validation, providing a sequence that does not conform to the zstd spec will cause + * undefined behavior, and may produce a corrupted block. + * +- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for ++ * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and + * return an error. + * +@@ -1928,6 +2061,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + ++/* ZSTD_c_prefetchCDictTables ++ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto. ++ * ++ * In some situations, zstd uses CDict tables in-place rather than copying them ++ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). ++ * In such situations, compression speed is seriously impacted when CDict tables are ++ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables ++ * when they are used in-place. ++ * ++ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. ++ * For sufficiently large inputs, zstd will by default memcpy() CDict tables ++ * into the working context, so there is no need to prefetch. This parameter is ++ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be ++ * useful but memcpy() is too expensive. The exact range of input sizes where this ++ * makes sense is best determined by careful experimentation. ++ * ++ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, ++ * but in the future zstd may conditionally enable this feature via an auto-detection ++ * heuristic for cold CDicts. ++ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. ++ */ ++#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 ++ ++/* ZSTD_c_enableSeqProducerFallback ++ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. ++ * ++ * Controls whether zstd will fall back to an internal sequence producer if an ++ * external sequence producer is registered and returns an error code. This fallback ++ * is block-by-block: the internal sequence producer will only be called for blocks ++ * where the external sequence producer returns an error code. Fallback parsing will ++ * follow any other cParam settings, such as compression level, the same as in a ++ * normal (fully-internal) compression operation. ++ * ++ * The user is strongly encouraged to read the full Block-Level Sequence Producer API ++ * documentation (below) before setting this parameter. 
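
A small, purely illustrative snippet (not part of the patch) showing how the two new compression knobs documented above would be enabled through the regular parameter-setting entry point; the wrapper name is an assumption.

#define ZSTD_STATIC_LINKING_ONLY   /* the experimental ZSTD_c_* names below are static-only */
#include <zstd.h>

static void enable_new_cctx_knobs(ZSTD_CCtx* cctx)
{
    /* Prefetch in-place CDict tables that are likely to be cache-cold. */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_prefetchCDictTables, ZSTD_ps_enable);
    /* Let zstd's internal parser handle any block for which an external
     * sequence producer reports an error. */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);
}
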
*/ ++#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 ++ ++/* ZSTD_c_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * This parameter can be used to set an upper bound on the blocksize ++ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper ++ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make ++ * compressBound() innacurate). Only currently meant to be used for testing. ++ * ++ */ ++#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 ++ ++/* ZSTD_c_searchForExternalRepcodes ++ * This parameter affects how zstd parses external sequences, such as sequences ++ * provided through the compressSequences() API or from an external block-level ++ * sequence producer. ++ * ++ * If set to ZSTD_ps_enable, the library will check for repeated offsets in ++ * external sequences, even if those repcodes are not explicitly indicated in ++ * the "rep" field. Note that this is the only way to exploit repcode matches ++ * while using compressSequences() or an external sequence producer, since zstd ++ * currently ignores the "rep" field of external sequences. ++ * ++ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in ++ * external sequences, regardless of whether the "rep" field has been set. This ++ * reduces sequence compression overhead by about 25% while sacrificing some ++ * compression ratio. ++ * ++ * The default value is ZSTD_ps_auto, for which the library will enable/disable ++ * based on compression level. ++ * ++ * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is ++ * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future. ++ */ ++#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 ++ + /*! ZSTD_CCtx_getParameter() : + * Get the requested compression parameter value, selected by enum ZSTD_cParameter, + * and store it into int* value. +@@ -2084,7 +2290,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * in the range [dst, dst + pos) MUST not be modified during decompression + * or you will get data corruption. + * +- * When this flags is enabled zstd won't allocate an output buffer, because ++ * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer, but it will still allocate + * an input buffer large enough to fit any compressed block. This will also + * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. +@@ -2137,6 +2343,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + */ + #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 + ++/* ZSTD_d_disableHuffmanAssembly ++ * Set to 1 to disable the Huffman assembly implementation. ++ * The default value is 0, which allows zstd to use the Huffman assembly ++ * implementation if available. ++ * ++ * This parameter can be used to disable Huffman assembly at runtime. ++ * If you want to disable it at compile time you can define the macro ++ * ZSTD_DISABLE_ASM. ++ */ ++#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 ++ + + /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). +@@ -2145,6 +2362,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * such ZSTD_f_zstd1_magicless for example. 
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ + ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") ++ZSTDLIB_STATIC_API + size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + + /*! ZSTD_decompressStream_simpleArgs() : +@@ -2181,6 +2399,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); +@@ -2198,6 +2417,7 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); +@@ -2218,6 +2438,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, +@@ -2232,6 +2453,7 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + + /*! ZSTD_initCStream_usingCDict_advanced() : +@@ -2250,6 +2472,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, +@@ -2274,6 +2497,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +@@ -2319,8 +2543,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + + /*! +@@ -2330,20 +2554,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + +-/*! 
+- * This function is deprecated, and is equivalent to: +- * +- * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); +- * +- * re-use decompression parameters from previous init; saves dictionary loading +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x +- */ +-ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); +- + + /* ******************************************************************* + * Buffer-less and synchronous inner streaming functions +@@ -2362,7 +2576,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. +- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : +@@ -2387,15 +2600,20 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ ++ ++ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") ++ZSTDLIB_STATIC_API ++size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + + ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ + /* + Buffer-less streaming decompression (synchronous mode) +@@ -2408,8 +2626,8 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. 
+- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. +- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. ++ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. ++ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, +@@ -2428,7 +2646,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), +- which can @return an error code if required value is too large for current system (in 32-bits mode). ++ which can return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. +@@ -2448,7 +2666,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + +- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). ++ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + +@@ -2480,6 +2698,8 @@ typedef struct { + unsigned headerSize; + unsigned dictID; + unsigned checksumFlag; ++ unsigned _reserved1; ++ unsigned _reserved2; + } ZSTD_frameHeader; + + /*! ZSTD_getFrameHeader() : +@@ -2502,6 +2722,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); + ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* misc */ ++ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") + ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); + typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; + ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +@@ -2524,7 +2745,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary +- + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. 
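
Sketch of the buffer-less streaming decompression loop described above (illustrative, not part of the patch); it assumes the whole regenerated frame fits in the caller's dst and skips the round-buffer strategy. ZSTD_getFrameHeader() could be used beforehand to size dst from the frame content size; the sketch leaves that to the caller.

#define ZSTD_STATIC_LINKING_ONLY   /* the buffer-less API is static-linking only */
#include <zstd.h>

/* Decode one frame with ZSTD_decompressBegin()/ZSTD_decompressContinue().
 * Returns the number of bytes written to dst, or an error code. */
static size_t bufferless_decompress(void* dst, size_t dstCapacity,
                                    const void* src, size_t srcSize)
{
    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
    const char* ip = (const char*)src;
    const char* const iend = ip + srcSize;
    char* op = (char*)dst;
    char* const oend = op + dstCapacity;
    size_t ret;

    if (dctx == NULL) return (size_t)-1;              /* sketch: generic failure */
    ret = ZSTD_decompressBegin(dctx);

    while (!ZSTD_isError(ret)) {
        size_t const toRead = ZSTD_nextSrcSizeToDecompress(dctx);
        size_t regenerated;
        if (toRead == 0) { ret = (size_t)(op - (char*)dst); break; }   /* frame fully decoded */
        if (toRead > (size_t)(iend - ip)) { ret = (size_t)-1; break; } /* truncated input */
        regenerated = ZSTD_decompressContinue(dctx, op, (size_t)(oend - op), ip, toRead);
        if (ZSTD_isError(regenerated)) { ret = regenerated; break; }
        ip += toRead;        /* decompressContinue() consumes exactly toRead bytes */
        op += regenerated;   /* may be 0 when only a header/metadata item was decoded */
    }
    ZSTD_freeDCtx(dctx);
    return ret;              /* regenerated size, or an error code (ZSTD_isError()) */
}
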
+@@ -2547,5 +2767,166 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_ + ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + + ++/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* ++ * ++ * *** OVERVIEW *** ++ * The Block-Level Sequence Producer API allows users to provide their own custom ++ * sequence producer which libzstd invokes to process each block. The produced list ++ * of sequences (literals and matches) is then post-processed by libzstd to produce ++ * valid compressed blocks. ++ * ++ * This block-level offload API is a more granular complement of the existing ++ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers ++ * an easier migration story for applications already integrated with libzstd: the ++ * user application continues to invoke the same compression functions ++ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits ++ * from the specific advantages of the external sequence producer. For example, ++ * the sequence producer could be tuned to take advantage of known characteristics ++ * of the input, to offer better speed / ratio, or could leverage hardware ++ * acceleration not available within libzstd itself. ++ * ++ * See contrib/externalSequenceProducer for an example program employing the ++ * Block-Level Sequence Producer API. ++ * ++ * *** USAGE *** ++ * The user is responsible for implementing a function of type ++ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following ++ * arguments to the user-provided function: ++ * ++ * - sequenceProducerState: a pointer to a user-managed state for the sequence ++ * producer. ++ * ++ * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. ++ * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory ++ * backing outSeqs is managed by the CCtx. ++ * ++ * - src, srcSize: an input buffer for the sequence producer to parse. ++ * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * - dict, dictSize: a history buffer, which may be empty, which the sequence ++ * producer may reference as it parses the src buffer. Currently, zstd will ++ * always pass dictSize == 0 into external sequence producers, but this will ++ * change in the future. ++ * ++ * - compressionLevel: a signed integer representing the zstd compression level ++ * set by the user for the current operation. The sequence producer may choose ++ * to use this information to change its compression strategy and speed/ratio ++ * tradeoff. Note: the compression level does not reflect zstd parameters set ++ * through the advanced API. ++ * ++ * - windowSize: a size_t representing the maximum allowed offset for external ++ * sequences. Note that sequence offsets are sometimes allowed to exceed the ++ * windowSize if a dictionary is present, see doc/zstd_compression_format.md ++ * for details. ++ * ++ * The user-provided function shall return a size_t representing the number of ++ * sequences written to outSeqs. This return value will be treated as an error ++ * code if it is greater than outSeqsCapacity. The return value must be non-zero ++ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided ++ * for convenience, but any value greater than outSeqsCapacity will be treated as ++ * an error code. 
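
To make the contract concrete, here is a hedged sketch of a (deliberately useless) external sequence producer that declares every block as pure literals, which satisfies the parse-validity rules spelled out in the remainder of this comment; all names other than the ZSTD_* symbols are invented for the example.

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

/* Trivial producer: one literals-only sequence covering the whole block
 * (offset 0, matchLength 0), i.e. a valid parse that performs no matching. */
static size_t literals_only_producer(void* state,
                                     ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
                                     const void* src, size_t srcSize,
                                     const void* dict, size_t dictSize,
                                     int compressionLevel, size_t windowSize)
{
    (void)state; (void)src; (void)dict; (void)dictSize;
    (void)compressionLevel; (void)windowSize;
    if (outSeqsCapacity < 1 || srcSize == 0)
        return ZSTD_SEQUENCE_PRODUCER_ERROR;       /* let zstd fall back (or fail) */
    outSeqs[0].offset = 0;
    outSeqs[0].litLength = (unsigned)srcSize;      /* srcSize <= ZSTD_BLOCKSIZE_MAX per the docs */
    outSeqs[0].matchLength = 0;
    outSeqs[0].rep = 0;
    return 1;                                      /* number of sequences written */
}

/* Registration is sticky until the next parameter reset of the CCtx. */
static void use_external_producer(ZSTD_CCtx* cctx)
{
    ZSTD_registerSequenceProducer(cctx, NULL /* no state */, literals_only_producer);
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);
}

With fallback enabled, zstd quietly takes over for any block where the producer bails out with ZSTD_SEQUENCE_PRODUCER_ERROR.
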
++ * ++ * If the user-provided function does not return an error code, the sequences ++ * written to outSeqs must be a valid parse of the src buffer. Data corruption may ++ * occur if the parse is not valid. A parse is defined to be valid if the ++ * following conditions hold: ++ * - The sum of matchLengths and literalLengths must equal srcSize. ++ * - All sequences in the parse, except for the final sequence, must have ++ * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have ++ * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. ++ * - All offsets must respect the windowSize parameter as specified in ++ * doc/zstd_compression_format.md. ++ * - If the final sequence has matchLength == 0, it must also have offset == 0. ++ * ++ * zstd will only validate these conditions (and fail compression if they do not ++ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence ++ * validation has a performance cost. ++ * ++ * If the user-provided function returns an error, zstd will either fall back ++ * to an internal sequence producer or fail the compression operation. The user can ++ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback ++ * cParam. Fallback compression will follow any other cParam settings, such as ++ * compression level, the same as in a normal compression operation. ++ * ++ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F ++ * function by calling ++ * ZSTD_registerSequenceProducer(cctx, ++ * sequenceProducerState, ++ * sequenceProducer) ++ * This setting will persist until the next parameter reset of the CCtx. ++ * ++ * The sequenceProducerState must be initialized by the user before calling ++ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the ++ * sequenceProducerState. ++ * ++ * *** LIMITATIONS *** ++ * This API is compatible with all zstd compression APIs which respect advanced parameters. ++ * However, there are three limitations: ++ * ++ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. ++ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level ++ * external sequence producer. ++ * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some ++ * cases (see its documentation for details). Users must explicitly set ++ * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external ++ * sequence producer is registered. ++ * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default ++ * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should ++ * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence ++ * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). ++ * ++ * Second, history buffers are not currently supported. Concretely, zstd will always pass ++ * dictSize == 0 to the external sequence producer (for now). This has two implications: ++ * - Dictionaries are not currently supported. Compression will *not* fail if the user ++ * references a dictionary, but the dictionary won't have any effect. ++ * - Stream history is not currently supported. All advanced compression APIs, including ++ * streaming APIs, work with external sequence producers, but each block is treated as ++ * an independent chunk without history from previous blocks. ++ * ++ * Third, multi-threading within a single compression is not currently supported. 
In other words, ++ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. ++ * Multi-threading across compressions is fine: simply create one CCtx per thread. ++ * ++ * Long-term, we plan to overcome all three limitations. There is no technical blocker to ++ * overcoming them. It is purely a question of engineering effort. ++ */ ++ ++#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) ++ ++typedef size_t ZSTD_sequenceProducer_F ( ++ void* sequenceProducerState, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize, ++ const void* dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize ++); ++ ++/*! ZSTD_registerSequenceProducer() : ++ * Instruct zstd to use a block-level external sequence producer function. ++ * ++ * The sequenceProducerState must be initialized by the caller, and the caller is ++ * responsible for managing its lifetime. This parameter is sticky across ++ * compressions. It will remain set until the user explicitly resets compression ++ * parameters. ++ * ++ * Sequence producer registration is considered to be an "advanced parameter", ++ * part of the "advanced API". This means it will only have an effect on compression ++ * APIs which respect advanced parameters, such as compress2() and compressStream2(). ++ * Older compression APIs such as compressCCtx(), which predate the introduction of ++ * "advanced parameters", will ignore any external sequence producer setting. ++ * ++ * The sequence producer can be "cleared" by registering a NULL function pointer. This ++ * removes all limitations described above in the "LIMITATIONS" section of the API docs. ++ * ++ * The user is strongly encouraged to read the full API documentation (above) before ++ * calling this function. */ ++ZSTDLIB_STATIC_API void ++ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* cctx, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F* sequenceProducer ++); ++ + #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ + +diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile +index 20f08c644b71..464c410b2768 100644 +--- a/lib/zstd/Makefile ++++ b/lib/zstd/Makefile +@@ -1,6 +1,6 @@ + # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + # ################################################################ +-# Copyright (c) Facebook, Inc. ++# Copyright (c) Meta Platforms, Inc. and affiliates. + # All rights reserved. + # + # This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h +new file mode 100644 +index 000000000000..bb7967def569 +--- /dev/null ++++ b/lib/zstd/common/bits.h +@@ -0,0 +1,124 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */ ++ ++#ifndef ZSTD_BITS_H ++#define ZSTD_BITS_H ++ ++#include "mem.h" ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const int DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, ++ 30, 22, 20, 15, 25, 17, 4, 8, ++ 31, 27, 13, 23, 21, 19, 16, 7, ++ 26, 12, 18, 6, 11, 5, 10, 9}; ++ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_ctz(val); ++# else ++ return ZSTD_countTrailingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) { ++ assert(val != 0); ++ { ++ static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, ++ 11, 14, 16, 18, 22, 25, 3, 30, ++ 8, 12, 20, 28, 15, 17, 24, 7, ++ 19, 27, 23, 6, 26, 5, 4, 31}; ++ val |= val >> 1; ++ val |= val >> 2; ++ val |= val >> 4; ++ val |= val >> 8; ++ val |= val >> 16; ++ return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_clz(val); ++# else ++ return ZSTD_countLeadingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) && defined(__LP64__) ++ return (unsigned)__builtin_ctzll(val); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (leastSignificantWord == 0) { ++ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); ++ } else { ++ return ZSTD_countTrailingZeros32(leastSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)(__builtin_clzll(val)); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (mostSignificantWord == 0) { ++ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); ++ } else { ++ return ZSTD_countLeadingZeros32(mostSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) ++{ ++ if (MEM_isLittleEndian()) { ++ if (MEM_64bits()) { ++ return ZSTD_countTrailingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countTrailingZeros32((U32)val) >> 3; ++ } ++ } else { /* Big Endian CPU */ ++ if (MEM_64bits()) { ++ return ZSTD_countLeadingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countLeadingZeros32((U32)val) >> 3; ++ } ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ ++{ ++ assert(val != 0); ++ return 31 - ZSTD_countLeadingZeros32(val); ++} ++ ++#endif /* ZSTD_BITS_H */ +diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h +index feef3a1b1d60..83a180c65faf 100644 +--- a/lib/zstd/common/bitstream.h ++++ b/lib/zstd/common/bitstream.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * bitstream + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -27,6 +28,7 @@ + #include "compiler.h" /* UNLIKELY() */ + #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ + #include "error_private.h" /* error codes and messages */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /*========================================= +@@ -122,33 +124,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); + /* faster, but works only if nbBits >= 1 */ + +- +- +-/*-************************************************************** +-* Internal functions +-****************************************************************/ +-MEM_STATIC unsigned BIT_highbit32 (U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* Use GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, +- 11, 14, 16, 18, 22, 25, 3, 30, +- 8, 12, 20, 28, 15, 17, 24, 7, +- 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- + /*===== Local Constants =====*/ + static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, +@@ -178,6 +153,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, + return 0; + } + ++MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) ++{ ++ assert(nbBits < BIT_MASK_SIZE); ++ return bitContainer & BIT_mask[nbBits]; ++} ++ + /*! BIT_addBits() : + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! */ +@@ -187,7 +168,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, + DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +- bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; ++ bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; + bitC->bitPos += nbBits; + } + +@@ -266,7 +247,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + } else { + bitD->ptr = bitD->start; +@@ -294,7 +275,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + default: break; + } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ bitD->bitsConsumed = lastByte ? 
8 - ZSTD_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; +@@ -325,12 +306,6 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c + #endif + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +-{ +- assert(nbBits < BIT_MASK_SIZE); +- return bitContainer & BIT_mask[nbBits]; +-} +- + /*! BIT_lookBits() : + * Provides next n bits from local register. + * local register is not modified. +@@ -377,7 +352,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned n + } + + /*! BIT_readBitsFast() : +- * unsafe version; only works only if nbBits >= 1 */ ++ * unsafe version; only works if nbBits >= 1 */ + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + { + size_t const value = BIT_lookBitsFast(bitD, nbBits); +diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h +index c42d39faf9bd..c437e0975575 100644 +--- a/lib/zstd/common/compiler.h ++++ b/lib/zstd/common/compiler.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -179,6 +180,17 @@ + * Sanitizer + *****************************************************************/ + ++/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an ++ * abundance of caution, disable our custom poisoning on mingw. */ ++#ifdef __MINGW32__ ++#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE ++#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1 ++#endif ++#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE ++#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1 ++#endif ++#endif ++ + + + #endif /* ZSTD_COMPILER_H */ +diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h +index 0db7b42407ee..d8319a2bef4c 100644 +--- a/lib/zstd/common/cpu.h ++++ b/lib/zstd/common/cpu.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c +index bb863c9ea616..e56ff6464e91 100644 +--- a/lib/zstd/common/debug.c ++++ b/lib/zstd/common/debug.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h +index 6dd88d1fbd02..da0dbfc614b8 100644 +--- a/lib/zstd/common/debug.h ++++ b/lib/zstd/common/debug.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c +index fef67056f052..6cdd82233fb5 100644 +--- a/lib/zstd/common/entropy_common.c ++++ b/lib/zstd/common/entropy_common.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * Common functions of New Generation Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,8 +20,8 @@ + #include "error_private.h" /* ERR_*, ERROR */ + #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ + #include "huf.h" ++#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */ + + + /*=== Version ===*/ +@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); } + /*-************************************************************** + * FSE NCount encoding-decoding + ****************************************************************/ +-static U32 FSE_ctz(U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_ctz(val); +-# else /* Software version */ +- U32 count = 0; +- while ((val & 1) == 0) { +- val >>= 1; +- ++count; +- } +- return count; +-# endif +- } +-} +- + FORCE_INLINE_TEMPLATE + size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * repeat. + * Avoid UB by setting the high bit to 1. + */ +- int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + while (repeats >= 12) { + charnum += 3 * 12; + if (LIKELY(ip <= iend-7)) { +@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; +- repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + } + charnum += 3 * repeats; + bitStream >>= 2 * repeats; +@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * know that threshold > 1. 
+ */ + if (remaining <= 1) break; +- nbBits = BIT_highbit32(remaining) + 1; ++ nbBits = ZSTD_highbit32(remaining) + 1; + threshold = 1 << (nbBits - 1); + } + if (charnum >= maxSV1) break; +@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, + const void* src, size_t srcSize) + { + U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +- return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); ++ return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, + if (weightTotal == 0) return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ +- { U32 const tableLog = BIT_highbit32(weightTotal) + 1; ++ { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1; + if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); + *tableLogPtr = tableLog; + /* determine last weight */ + { U32 const total = 1 << tableLog; + U32 const rest = total - weightTotal; +- U32 const verif = 1 << BIT_highbit32(rest); +- U32 const lastWeight = BIT_highbit32(rest) + 1; ++ U32 const verif = 1 << ZSTD_highbit32(rest); ++ U32 const lastWeight = ZSTD_highbit32(rest) + 1; + if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; +@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, +- int bmi2) ++ int flags) + { + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } + #endif +- (void)bmi2; ++ (void)flags; + return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } +diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c +index 6d1135f8c373..a4062d30d170 100644 +--- a/lib/zstd/common/error_private.c ++++ b/lib/zstd/common/error_private.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(version_unsupported): return "Version not supported"; + case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; + case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; +- case PREFIX(corruption_detected): return "Corrupted block detected"; ++ case PREFIX(corruption_detected): return "Data corruption detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; ++ case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; ++ case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; +@@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; + case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; + case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; ++ case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; + case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; + case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; ++ case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full"; ++ case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; ++ case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code"; ++ case PREFIX(externalSequences_invalid): return "External sequences are not valid"; + case PREFIX(maxCode): + default: return notErrorCode; + } +diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h +index ca5101e542fa..9a4699a38a88 100644 +--- a/lib/zstd/common/error_private.h ++++ b/lib/zstd/common/error_private.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h +index 4507043b2287..c4e25a219142 100644 +--- a/lib/zstd/common/fse.h ++++ b/lib/zstd/common/fse.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -50,34 +51,6 @@ + FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */ + + +-/*-**************************************** +-* FSE simple functions +-******************************************/ +-/*! FSE_compress() : +- Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. +- 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). +- @return : size of compressed data (<= dstCapacity). +- Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. +- if FSE_isError(return), compression failed (more details using FSE_getErrorName()) +-*/ +-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/*! FSE_decompress(): +- Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', +- into already allocated destination buffer 'dst', of size 'dstCapacity'. +- @return : size of regenerated data (<= maxDstSize), +- or an error code, which can be tested using FSE_isError() . +- +- ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! +- Why ? : making this distinction requires a header. +- Header management is intentionally delegated to the user layer, which can better manage special cases. +-*/ +-FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, +- const void* cSrc, size_t cSrcSize); +- +- + /*-***************************************** + * Tool functions + ******************************************/ +@@ -88,20 +61,6 @@ FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return + FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ + + +-/*-***************************************** +-* FSE advanced functions +-******************************************/ +-/*! FSE_compress2() : +- Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' +- Both parameters can be defined as '0' to mean : use default value +- @return : size of compressed data +- Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. +- if FSE_isError(return), it's an error code. +-*/ +-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +- +- + /*-***************************************** + * FSE detailed API + ******************************************/ +@@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, + /*! 
Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ + typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); + + /*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). +@@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize, int bmi2); + +-/*! Constructor and Destructor of FSE_DTable. +- Note that its size depends on 'tableLog' */ + typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); +- +-/*! FSE_buildDTable(): +- Builds 'dt', which must be already allocated, using FSE_createDTable(). +- return : 0, or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +- +-/*! FSE_decompress_usingDTable(): +- Decompress compressed source `cSrc` of size `cSrcSize` using `dt` +- into `dst` which must be already allocated. +- @return : size of regenerated data (necessarily <= `dstCapacity`), +- or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); + + /*! + Tutorial : +@@ -317,16 +258,6 @@ If there is an error, the function will return an error code, which can be teste + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); + /*< same as FSE_optimalTableLog(), which used `minus==2` */ + +-/* FSE_compress_wksp() : +- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). +- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. +- */ +-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? 
(1 << (maxTableLog - 2)) : 1024) ) +-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +- +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); +-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ +- + size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); + /*< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +@@ -344,19 +275,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi + FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ + +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); +-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ +- +-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); +-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */ +- +-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) ++#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) + #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); +-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ +- + size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); +-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */ ++/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. ++ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ + + typedef enum { + FSE_repeat_none, /*< Cannot use the previous table */ +@@ -552,7 +475,7 @@ MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePt + + /* FSE_getMaxNbBits() : + * Approximate maximum cost of a symbol, in bits. +- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) ++ * Fractional get rounded up (i.e. 
a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ + MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c +index a0d06095be83..45cf457f31ef 100644 +--- a/lib/zstd/common/fse_decompress.c ++++ b/lib/zstd/common/fse_decompress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy decoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -24,6 +25,7 @@ + #include "error_private.h" + #define ZSTD_DEPS_NEED_MALLOC + #include "zstd_deps.h" ++#include "bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -55,19 +57,6 @@ + #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) + #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + +- +-/* Function templates */ +-FSE_DTable* FSE_createDTable (unsigned tableLog) +-{ +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); +-} +- +-void FSE_freeDTable (FSE_DTable* dt) +-{ +- ZSTD_free(dt); +-} +- + static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) + { + void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ +@@ -127,10 +116,10 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + } + } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ + { + size_t position = 0; +@@ -166,7 +155,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (u=0; utableLog = 0; +- DTableH->fastMode = 0; +- +- cell->newState = 0; +- cell->symbol = symbolValue; +- cell->nbBits = 0; +- +- return 0; +-} +- +- +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) +-{ +- void* ptr = dt; +- FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; +- void* dPtr = dt + 1; +- FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSV1 = tableMask+1; +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* Build Decoding Table */ +- DTableH->tableLog = (U16)nbBits; +- DTableH->fastMode = 1; +- for (s=0; sfastMode; +- +- /* select fast mode (static) */ +- if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); +- return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); +-} +- +- +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +-{ +- return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); +-} +- + typedef struct { + short ncount[FSE_MAX_SYMBOL_VALUE + 1]; + FSE_DTable dtable[1]; /* Dynamically sized */ +@@ -342,7 +268,8 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + } + + if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); +- workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog); ++ assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); ++ workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + + CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); +@@ -382,9 +309,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, + return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); + } + +- +-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; +- +- +- + #endif /* FSE_COMMONDEFS_ONLY */ +diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h +index 5042ff870308..8e7943092ed1 100644 +--- a/lib/zstd/common/huf.h ++++ b/lib/zstd/common/huf.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * huff0 huffman codec, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -18,99 +19,22 @@ + + /* *** Dependencies *** */ + #include "zstd_deps.h" /* size_t */ +- +- +-/* *** library symbols visibility *** */ +-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, +- * HUF symbols remain "private" (internal symbols for library only). 
+- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +-# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +-# define HUF_PUBLIC_API __declspec(dllexport) +-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +-# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +-#else +-# define HUF_PUBLIC_API +-#endif +- +- +-/* ========================== */ +-/* *** simple functions *** */ +-/* ========================== */ +- +-/* HUF_compress() : +- * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. +- * 'dst' buffer must be already allocated. +- * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). +- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. +- * @return : size of compressed data (<= `dstCapacity`). +- * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) +- */ +-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/* HUF_decompress() : +- * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', +- * into already allocated buffer 'dst', of minimum size 'dstSize'. +- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. +- * Note : in contrast with FSE, HUF_decompress can regenerate +- * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, +- * because it knows size to regenerate (originalSize). +- * @return : size of regenerated data (== originalSize), +- * or an error code, which can be tested using HUF_isError() +- */ +-HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, +- const void* cSrc, size_t cSrcSize); ++#include "mem.h" /* U32 */ ++#define FSE_STATIC_LINKING_ONLY ++#include "fse.h" + + + /* *** Tool functions *** */ +-#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ +-HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ ++#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ ++size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ + + /* Error Management */ +-HUF_PUBLIC_API unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ +-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ ++unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ ++const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ + + +-/* *** Advanced function *** */ +- +-/* HUF_compress2() : +- * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. +- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . +- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog); +- +-/* HUF_compress4X_wksp() : +- * Same as HUF_compress2(), but uses externally allocated `workSpace`. 
+- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ + #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) + #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) +-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog, +- void* workSpace, size_t wkspSize); +- +-#endif /* HUF_H_298734234 */ +- +-/* ****************************************************************** +- * WARNING !! +- * The following section contains advanced and experimental definitions +- * which shall never be used in the context of a dynamic library, +- * because they are not guaranteed to remain stable in the future. +- * Only consider them in association with static linking. +- * *****************************************************************/ +-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +-#define HUF_H_HUF_STATIC_LINKING_ONLY +- +-/* *** Dependencies *** */ +-#include "mem.h" /* U32 */ +-#define FSE_STATIC_LINKING_ONLY +-#include "fse.h" +- + + /* *** Constants *** */ + #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ +@@ -151,25 +75,49 @@ typedef U32 HUF_DTable; + /* **************************************** + * Advanced decompression functions + ******************************************/ +-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-#endif + +-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< decodes RLE and uncompressed */ +-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++/* ++ * Huffman flags bitset. ++ * For all flags, 0 is the default value. ++ */ ++typedef enum { ++ /* ++ * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime. ++ * Otherwise: Ignored. ++ */ ++ HUF_flags_bmi2 = (1 << 0), ++ /* ++ * If set: Test possible table depths to find the one that produces the smallest header + encoded size. ++ * If unset: Use heuristic to find the table depth. ++ */ ++ HUF_flags_optimalDepth = (1 << 1), ++ /* ++ * If set: If the previous table can encode the input, always reuse the previous table. 
++ * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output. ++ */ ++ HUF_flags_preferRepeat = (1 << 2), ++ /* ++ * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress. ++ * If unset: Always histogram the entire input. ++ */ ++ HUF_flags_suspectUncompressible = (1 << 3), ++ /* ++ * If set: Don't use assembly implementations ++ * If unset: Allow using assembly implementations ++ */ ++ HUF_flags_disableAsm = (1 << 4), ++ /* ++ * If set: Don't use the fast decoding loop, always use the fallback decoding loop. ++ * If unset: Use the fast decoding loop when possible. ++ */ ++ HUF_flags_disableFast = (1 << 5) ++} HUF_flags_e; + + + /* **************************************** + * HUF detailed API + * ****************************************/ ++#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra + + /*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") +@@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. + */ +-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); ++unsigned HUF_minTableLog(unsigned symbolCardinality); ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue); ++unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, ++ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */ + size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + +@@ -196,6 +144,7 @@ typedef enum { + HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */ + } HUF_repeat; ++ + /* HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. 
+@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. + */ +-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) ++#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192) + #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) + size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, +@@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workspace, size_t wkspSize, +- int bmi2); ++ int flags); + + /* HUF_readCTable() : + * Loading a CTable saved with HUF_writeCTable() */ +@@ -276,32 +225,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); + #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) + #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +- + + /* ====================== */ + /* single stream variants */ + /* ====================== */ + +-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + /* HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. 
+ * If it uses hufTable it does not modify hufTable or repeat. +@@ -312,47 +241,28 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); +- +-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ +-#endif +- +-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); +-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */ +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif ++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); ++size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */ + #endif + + /* BMI2 variants. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
+ */ +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #endif +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + +-#endif /* HUF_STATIC_LINKING_ONLY */ ++#endif /* HUF_H_298734234 */ + +diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h +index 1d9cc03924ca..a7231822b6e3 100644 +--- a/lib/zstd/common/mem.h ++++ b/lib/zstd/common/mem.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h +index 0e3b2c0a527d..7ede8cf1ffe5 100644 +--- a/lib/zstd/common/portability_macros.h ++++ b/lib/zstd/common/portability_macros.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,7 +13,7 @@ + #define ZSTD_PORTABILITY_MACROS_H + + /* +- * This header file contains macro defintions to support portability. ++ * This header file contains macro definitions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. It MUST not contain any C code. + * +@@ -65,7 +66,7 @@ + #endif + + /* +- * Only enable assembly for GNUC comptabile compilers, ++ * Only enable assembly for GNUC compatible compilers, + * because other platforms may not support GAS assembly syntax. 
+ * + * Only enable assembly for Linux / MacOS, other platforms may +@@ -90,4 +91,23 @@ + */ + #define ZSTD_ENABLE_ASM_X86_64_BMI2 0 + ++/* ++ * For x86 ELF targets, add .note.gnu.property section for Intel CET in ++ * assembly sources when CET is enabled. ++ * ++ * Additionally, any function that may be called indirectly must begin ++ * with ZSTD_CET_ENDBRANCH. ++ */ ++#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \ ++ && defined(__has_include) ++# if __has_include() ++# include ++# define ZSTD_CET_ENDBRANCH _CET_ENDBR ++# endif ++#endif ++ ++#ifndef ZSTD_CET_ENDBRANCH ++# define ZSTD_CET_ENDBRANCH ++#endif ++ + #endif /* ZSTD_PORTABILITY_MACROS_H */ +diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c +index 3d7e35b309b5..5a9abca10944 100644 +--- a/lib/zstd/common/zstd_common.c ++++ b/lib/zstd/common/zstd_common.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h +index 7a5bf44839c9..925161416033 100644 +--- a/lib/zstd/common/zstd_deps.h ++++ b/lib/zstd/common/zstd_deps.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h +index 93305d9b41bb..170cd1db4819 100644 +--- a/lib/zstd/common/zstd_internal.h ++++ b/lib/zstd/common/zstd_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -28,7 +29,6 @@ + #include + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "huf.h" + #include /* XXH_reset, update, digest */ + #define ZSTD_TRACE 0 +@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e; + #define ZSTD_FRAMECHECKSUMSIZE 4 + + #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ +-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ ++#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */ ++#define MIN_LITERALS_FOR_4_STREAMS 6 + +-#define HufLog 12 + typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; + + #define LONGNBSEQ 0x7F00 +@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy + #define MINMATCH 3 + + #define Litbits 8 ++#define LitHufLog 11 + #define MaxLit ((1<= length) return; + op += 16; +@@ -240,7 +237,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + COPY16(op, ip); + } + while (op < oend); +-#endif + } + } + +@@ -289,11 +285,11 @@ typedef enum { + typedef struct { + seqDef* sequencesStart; + seqDef* sequences; /* ptr to end of sequences */ +- BYTE* litStart; +- BYTE* lit; /* ptr to end of literals */ +- BYTE* llCode; +- BYTE* mlCode; +- BYTE* ofCode; ++ BYTE* litStart; ++ BYTE* lit; /* ptr to end of literals */ ++ BYTE* llCode; ++ BYTE* mlCode; ++ BYTE* ofCode; + size_t maxNbSeq; + size_t maxNbLit; + +@@ -301,8 +297,8 @@ typedef struct { + * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment + * the existing value of the litLength or matchLength by 0x10000. 
+ */ +- ZSTD_longLengthType_e longLengthType; +- U32 longLengthPos; /* Index of the sequence to apply long length modification to */ ++ ZSTD_longLengthType_e longLengthType; ++ U32 longLengthPos; /* Index of the sequence to apply long length modification to */ + } seqStore_t; + + typedef struct { +@@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + seqLen.matchLength = seq->mlBase + MINMATCH; + if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { +- seqLen.litLength += 0xFFFF; ++ seqLen.litLength += 0x10000; + } + if (seqStore->longLengthType == ZSTD_llt_matchLength) { +- seqLen.matchLength += 0xFFFF; ++ seqLen.matchLength += 0x10000; + } + } + return seqLen; +@@ -337,12 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` + */ + typedef struct { ++ size_t nbBlocks; + size_t compressedSize; + unsigned long long decompressedBound; + } ZSTD_frameSizeInfo; /* decompress & legacy */ + + const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ + + /* custom memory allocation functions */ + void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); +@@ -350,61 +347,6 @@ void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); + void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); + + +-MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- +-/* +- * Counts the number of trailing zeros of a `size_t`. +- * Most compilers should support CTZ as a builtin. A backup +- * implementation is provided if the builtin isn't supported, but +- * it may not be terribly efficient. 
+- */ +-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) +-{ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return __builtin_ctzll((U64)val); +-# else +- static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, +- 4, 25, 14, 28, 9, 34, 20, 56, +- 5, 17, 26, 54, 15, 41, 29, 43, +- 10, 31, 38, 35, 21, 45, 49, 57, +- 63, 6, 12, 18, 24, 27, 33, 55, +- 16, 53, 40, 42, 30, 37, 44, 48, +- 62, 11, 23, 32, 52, 39, 36, 47, +- 61, 22, 51, 46, 60, 50, 59, 58 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return __builtin_ctz((U32)val); +-# else +- static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, +- 30, 22, 20, 15, 25, 17, 4, 8, +- 31, 27, 13, 23, 21, 19, 16, 7, +- 26, 12, 18, 6, 11, 5, 10, 9 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +-} +- +- + /* ZSTD_invalidateRepCodes() : + * ensures next compression will not use repcodes from previous block. + * Note : only works with regular variant; +diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h +index d9a76112ec3a..6ab8be6532ef 100644 +--- a/lib/zstd/compress/clevels.h ++++ b/lib/zstd/compress/clevels.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c +index ec5b1ca6d71a..e46ca6621b48 100644 +--- a/lib/zstd/compress/fse_compress.c ++++ b/lib/zstd/compress/fse_compress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy encoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -26,6 +27,7 @@ + #define ZSTD_DEPS_NEED_MALLOC + #define ZSTD_DEPS_NEED_MATH64 + #include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + assert(tableLog < 16); /* required for threshold strategy to work */ + + /* For explanations on how to distribute symbol values over the table : +- * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ ++ * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + #ifdef __clang_analyzer__ + ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ +@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + break; + default : + assert(normalizedCounter[s] > 1); +- { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); ++ { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; + symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); +@@ -342,21 +344,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, + * FSE Compression Code + ****************************************************************/ + +-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) +-{ +- size_t size; +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); +- return (FSE_CTable*)ZSTD_malloc(size); +-} +- +-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); } +- + /* provides the minimum logSize to safely represent a distribution */ + static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + { +- U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; +- U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; ++ U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; ++ U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + return minBits; +@@ -364,7 +356,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) + { +- U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; ++ U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + assert(srcSize > 1); /* Not supported, RLE should be used instead */ +@@ -532,40 +524,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, + return tableLog; + } + +- +-/* fake FSE_CTable, for raw (uncompressed) input */ +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) +-{ +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSymbolValue = tableMask; +- void* const ptr = ct; +- U16* const tableU16 = ( (U16*) ptr) + 2; +- void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */ +- FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* header */ +- tableU16[-2] = (U16) nbBits; +- tableU16[-1] = (U16) maxSymbolValue; +- +- /* Build table */ +- for (s=0; s= 2 ++ ++static size_t showU32(const U32* arr, size_t size) ++{ ++ size_t u; ++ for (u=0; u= sizeof(HUF_WriteCTableWksp)); ++ + /* check conditions */ + if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); +@@ -204,16 +264,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, + return ((maxSymbolValue+1)/2) + 1; + } + +-/*! HUF_writeCTable() : +- `CTable` : Huffman tree to save, using huf representation. +- @return : size of saved CTable */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, +- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +-{ +- HUF_WriteCTableWksp wksp; +- return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); +-} +- + + size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) + { +@@ -269,68 +319,64 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + + U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue) + { +- const HUF_CElt* ct = CTable + 1; ++ const HUF_CElt* const ct = CTable + 1; + assert(symbolValue <= HUF_SYMBOLVALUE_MAX); + return (U32)HUF_getNbBits(ct[symbolValue]); + } + + +-typedef struct nodeElt_s { +- U32 count; +- U16 parent; +- BYTE byte; +- BYTE nbBits; +-} nodeElt; +- + /* + * HUF_setMaxHeight(): +- * Enforces maxNbBits on the Huffman tree described in huffNode. ++ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. + * +- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts +- * the tree to so that it is a valid canonical Huffman tree. ++ * It attempts to convert all nodes with nbBits > @targetNbBits ++ * to employ @targetNbBits instead. Then it adjusts the tree ++ * so that it remains a valid canonical Huffman tree. + * + * @pre The sum of the ranks of each symbol == 2^largestBits, + * where largestBits == huffNode[lastNonNull].nbBits. 
+ * @post The sum of the ranks of each symbol == 2^largestBits, +- * where largestBits is the return value <= maxNbBits. ++ * where largestBits is the return value (expected <= targetNbBits). + * +- * @param huffNode The Huffman tree modified in place to enforce maxNbBits. ++ * @param huffNode The Huffman tree modified in place to enforce targetNbBits. ++ * It's presumed sorted, from most frequent to rarest symbol. + * @param lastNonNull The symbol with the lowest count in the Huffman tree. +- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree ++ * @param targetNbBits The allowed number of bits, which the Huffman tree + * may not respect. After this function the Huffman tree will +- * respect maxNbBits. +- * @return The maximum number of bits of the Huffman tree after adjustment, +- * necessarily no more than maxNbBits. ++ * respect targetNbBits. ++ * @return The maximum number of bits of the Huffman tree after adjustment. + */ +-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) ++static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) + { + const U32 largestBits = huffNode[lastNonNull].nbBits; +- /* early exit : no elt > maxNbBits, so the tree is already valid. */ +- if (largestBits <= maxNbBits) return largestBits; ++ /* early exit : no elt > targetNbBits, so the tree is already valid. */ ++ if (largestBits <= targetNbBits) return largestBits; ++ ++ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); + + /* there are several too large elements (at least >= 2) */ + { int totalCost = 0; +- const U32 baseCost = 1 << (largestBits - maxNbBits); ++ const U32 baseCost = 1 << (largestBits - targetNbBits); + int n = (int)lastNonNull; + +- /* Adjust any ranks > maxNbBits to maxNbBits. ++ /* Adjust any ranks > targetNbBits to targetNbBits. + * Compute totalCost, which is how far the sum of the ranks is + * we are over 2^largestBits after adjust the offending ranks. + */ +- while (huffNode[n].nbBits > maxNbBits) { ++ while (huffNode[n].nbBits > targetNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); +- huffNode[n].nbBits = (BYTE)maxNbBits; ++ huffNode[n].nbBits = (BYTE)targetNbBits; + n--; + } +- /* n stops at huffNode[n].nbBits <= maxNbBits */ +- assert(huffNode[n].nbBits <= maxNbBits); +- /* n end at index of smallest symbol using < maxNbBits */ +- while (huffNode[n].nbBits == maxNbBits) --n; ++ /* n stops at huffNode[n].nbBits <= targetNbBits */ ++ assert(huffNode[n].nbBits <= targetNbBits); ++ /* n end at index of smallest symbol using < targetNbBits */ ++ while (huffNode[n].nbBits == targetNbBits) --n; + +- /* renorm totalCost from 2^largestBits to 2^maxNbBits ++ /* renorm totalCost from 2^largestBits to 2^targetNbBits + * note : totalCost is necessarily a multiple of baseCost */ +- assert((totalCost & (baseCost - 1)) == 0); +- totalCost >>= (largestBits - maxNbBits); ++ assert(((U32)totalCost & (baseCost - 1)) == 0); ++ totalCost >>= (largestBits - targetNbBits); + assert(totalCost > 0); + + /* repay normalized cost */ +@@ -339,19 +385,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + + /* Get pos of last (smallest = lowest cum. 
count) symbol per rank */ + ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); +- { U32 currentNbBits = maxNbBits; ++ { U32 currentNbBits = targetNbBits; + int pos; + for (pos=n ; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currentNbBits) continue; +- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ +- rankLast[maxNbBits-currentNbBits] = (U32)pos; ++ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ ++ rankLast[targetNbBits-currentNbBits] = (U32)pos; + } } + + while (totalCost > 0) { + /* Try to reduce the next power of 2 above totalCost because we + * gain back half the rank. + */ +- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; ++ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; +@@ -391,7 +437,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; +- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) ++ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ +@@ -403,11 +449,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + * TODO. + */ + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ +- /* special case : no rank 1 symbol (using maxNbBits-1); +- * let's create one from largest rank 0 (using maxNbBits). ++ /* special case : no rank 1 symbol (using targetNbBits-1); ++ * let's create one from largest rank 0 (using targetNbBits). + */ + if (rankLast[1] == noSymbol) { +- while (huffNode[n].nbBits == maxNbBits) n--; ++ while (huffNode[n].nbBits == targetNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); +@@ -421,7 +467,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + } /* repay normalized cost */ + } /* there are several too large elements (at least >= 2) */ + +- return maxNbBits; ++ return targetNbBits; + } + + typedef struct { +@@ -429,7 +475,7 @@ typedef struct { + U16 curr; + } rankPos; + +-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; ++typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; + + /* Number of buckets available for HUF_sort() */ + #define RANK_POSITION_TABLE_SIZE 192 +@@ -448,8 +494,8 @@ typedef struct { + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ + #define RANK_POSITION_MAX_COUNT_LOG 32 +-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ +-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ ++#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) ++#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) + + /* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. +@@ -457,7 +503,7 @@ typedef struct { + static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? 
count +- : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; ++ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; + } + + /* Helper swap function for HUF_quickSortPartition() */ +@@ -580,7 +626,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { +- U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; ++ int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); +@@ -591,6 +637,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + assert(HUF_isSorted(huffNode, maxSymbolValue1)); + } + ++ + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). +@@ -611,6 +658,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + int lowS, lowN; + int nodeNb = STARTNODE; + int n, nodeRoot; ++ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); + /* init for parents */ + nonNullRank = (int)maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; +@@ -637,6 +685,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + for (n=0; n<=nonNullRank; n++) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + ++ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); ++ + return nonNullRank; + } + +@@ -674,28 +724,36 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i + CTable[0] = maxNbBits; + } + +-size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize) ++size_t ++HUF_buildCTable_wksp(HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, ++ void* workSpace, size_t wkspSize) + { +- HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32)); ++ HUF_buildCTable_wksp_tables* const wksp_tables = ++ (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32)); + nodeElt* const huffNode0 = wksp_tables->huffNodeTbl; + nodeElt* const huffNode = huffNode0+1; + int nonNullRank; + ++ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); ++ ++ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); ++ + /* safety checks */ + if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) +- return ERROR(workSpace_tooSmall); ++ return ERROR(workSpace_tooSmall); + if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) +- return ERROR(maxSymbolValue_tooLarge); ++ return ERROR(maxSymbolValue_tooLarge); + ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); ++ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); + + /* build tree */ + nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); + +- /* enforce maxTableLog */ ++ /* determine and enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, 
maxNbBits); + if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ + +@@ -804,7 +862,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id + #if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); +- size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; ++ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. */ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); +@@ -884,7 +942,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC) + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (nbBits > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } + } + +@@ -1045,9 +1103,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +@@ -1058,28 +1116,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- (void)bmi2; ++ (void)flags; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); + } + + #endif + +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +-{ +- return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + static size_t + HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, int bmi2) ++ const HUF_CElt* CTable, int flags) + { + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; +@@ -1093,7 +1146,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + op += 6; /* jumpTable */ + + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; +@@ -1101,7 +1154,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, 
HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; +@@ -1109,7 +1162,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; +@@ -1118,7 +1171,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + op += cSize; + } +@@ -1126,14 +1179,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + return (size_t)(op-ostart); + } + +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +-{ +- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; +@@ -1141,11 +1189,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; + static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, +- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) ++ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) + { + size_t const cSize = (nbStreams==HUF_singleStream) ? 
+- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : +- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); ++ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : ++ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; +@@ -1168,6 +1216,79 @@ typedef struct { + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) ++{ ++ unsigned cardinality = 0; ++ unsigned i; ++ ++ for (i = 0; i < maxSymbolValue + 1; i++) { ++ if (count[i] != 0) cardinality += 1; ++ } ++ ++ return cardinality; ++} ++ ++unsigned HUF_minTableLog(unsigned symbolCardinality) ++{ ++ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; ++ return minBitsSymbols; ++} ++ ++unsigned HUF_optimalTableLog( ++ unsigned maxTableLog, ++ size_t srcSize, ++ unsigned maxSymbolValue, ++ void* workSpace, size_t wkspSize, ++ HUF_CElt* table, ++ const unsigned* count, ++ int flags) ++{ ++ assert(srcSize > 1); /* Not supported, RLE should be used instead */ ++ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); ++ ++ if (!(flags & HUF_flags_optimalDepth)) { ++ /* cheap evaluation, based on FSE */ ++ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ } ++ ++ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); ++ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); ++ size_t maxBits, hSize, newSize; ++ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); ++ const unsigned minTableLog = HUF_minTableLog(symbolCardinality); ++ size_t optSize = ((size_t) ~0) - 1; ++ unsigned optLog = maxTableLog, optLogGuess; ++ ++ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); ++ ++ /* Search until size increases */ ++ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { ++ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); ++ maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); ++ if (ERR_isError(maxBits)) continue; ++ ++ if (maxBits < optLogGuess && optLogGuess > minTableLog) break; ++ ++ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); ++ ++ if (ERR_isError(hSize)) continue; ++ ++ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; ++ ++ if (newSize > optSize + 1) { ++ break; ++ } ++ ++ if (newSize < optSize) { ++ optSize = newSize; ++ optLog = optLogGuess; ++ } ++ } ++ assert(optLog <= HUF_TABLELOG_MAX); ++ return optLog; ++ } ++} ++ + /* HUF_compress_internal() : + * `workSpace_align4` must be aligned on 4-bytes boundaries, + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ +@@ -1177,14 +1298,14 @@ HUF_compress_internal (void* dst, size_t dstSize, + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, +- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, +- const int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) + { + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); + BYTE* const ostart = 
(BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + ++ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ +@@ -1198,16 +1319,17 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ +- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); +- if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { ++ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; ++ DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; +@@ -1224,6 +1346,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } ++ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); + + /* Check validity of previous table */ + if ( repeat +@@ -1232,19 +1355,20 @@ HUF_compress_internal (void* dst, size_t dstSize, + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ +- if (preferRepeat && repeat && *repeat != HUF_repeat_none) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* Build Huffman Tree */ +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; ++ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); + } + /* Zero unused symbols in CTable, so we can check it for validity */ + { +@@ -1263,7 +1387,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } } + + /* Use the new huffman table */ +@@ -1275,46 +1399,20 @@ HUF_compress_internal (void* dst, size_t dstSize, + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, table->CTable, bmi2); +-} +- +- +-size_t HUF_compress1X_wksp (void* dst, size_t 
dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_singleStream, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ nbStreams, table->CTable, flags); + } + + size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, +- int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, hufTable, +- repeat, preferRepeat, bmi2, suspectUncompressible); +-} +- +-/* HUF_compress4X_repeat(): +- * compress input using 4 streams. +- * provide workspace to generate compression tables */ +-size_t HUF_compress4X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_fourStreams, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ repeat, flags); + } + + /* HUF_compress4X_repeat(): +@@ -1325,11 +1423,11 @@ size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, +- hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); ++ hufTable, repeat, flags); + } +- +diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c +index f620cafca633..81b8cd119cd8 100644 +--- a/lib/zstd/compress/zstd_compress.c ++++ b/lib/zstd/compress/zstd_compress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,6 @@ + #include "hist.h" /* HIST_countFast_wksp */ + #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_compress_internal.h" + #include "zstd_compress_sequences.h" +@@ -27,6 +27,7 @@ + #include "zstd_opt.h" + #include "zstd_ldm.h" + #include "zstd_compress_superblock.h" ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + /* *************************************************************** + * Tuning parameters +@@ -55,14 +56,17 @@ + * Helper functions + ***************************************/ + /* ZSTD_compressBound() +- * Note that the result from this function is only compatible with the "normal" +- * full-block strategy. 
+- * When there are a lot of small blocks due to frequent flush in streaming mode +- * the overhead of headers can make the compressed data to be larger than the +- * return value of ZSTD_compressBound(). ++ * Note that the result from this function is only valid for ++ * the one-pass compression functions. ++ * When employing the streaming mode, ++ * if flushes are frequently altering the size of blocks, ++ * the overhead from block headers can make the compressed data larger ++ * than the return value of ZSTD_compressBound(). + */ + size_t ZSTD_compressBound(size_t srcSize) { +- return ZSTD_COMPRESSBOUND(srcSize); ++ size_t const r = ZSTD_COMPRESSBOUND(srcSize); ++ if (r==0) return ERROR(srcSize_wrong); ++ return r; + } + + +@@ -171,12 +175,9 @@ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); +- { +- int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); ++ { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); +- if (!cctxInWorkspace) { +- ZSTD_customFree(cctx, cctx->customMem); +- } ++ if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); + } + return 0; + } +@@ -257,9 +258,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); + } + +-/* Returns 1 if compression parameters are such that we should ++/* Returns ZSTD_ps_enable if compression parameters are such that we should + * enable long distance matching (wlog >= 27, strategy >= btopt). +- * Returns 0 otherwise. ++ * Returns ZSTD_ps_disable otherwise. + */ + static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +@@ -267,6 +268,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; + } + ++static int ZSTD_resolveExternalSequenceValidation(int mode) { ++ return mode; ++} ++ ++/* Resolves maxBlockSize to the default if no value is present. */ ++static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { ++ if (maxBlockSize == 0) { ++ return ZSTD_BLOCKSIZE_MAX; ++ } else { ++ return maxBlockSize; ++ } ++} ++ ++static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) { ++ if (value != ZSTD_ps_auto) return value; ++ if (cLevel < 10) { ++ return ZSTD_ps_disable; ++ } else { ++ return ZSTD_ps_enable; ++ } ++} ++ ++/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. ++ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. 
*/ ++static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { ++ return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; ++} ++ + static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) + { +@@ -284,6 +313,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + } + cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); ++ cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); ++ cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); ++ cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, ++ cctxParams.compressionLevel); + assert(!ZSTD_checkCParams(cParams)); + return cctxParams; + } +@@ -329,10 +362,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) + #define ZSTD_NO_CLEVEL 0 + + /* +- * Initializes the cctxParams from params and compressionLevel. ++ * Initializes `cctxParams` from `params` and `compressionLevel`. + * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. + */ +-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel) ++static void ++ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ++ const ZSTD_parameters* params, ++ int compressionLevel) + { + assert(!ZSTD_checkCParams(params->cParams)); + ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); +@@ -345,6 +381,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par + cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); + cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); + cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); ++ cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); ++ cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); ++ cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", + cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); + } +@@ -359,7 +398,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete + + /* + * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. +- * @param param Validated zstd parameters. ++ * @param params Validated zstd parameters. 
+ */ + static void ZSTD_CCtxParams_setZstdParams( + ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +@@ -455,8 +494,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + return bounds; + + case ZSTD_c_enableLongDistanceMatching: +- bounds.lowerBound = 0; +- bounds.upperBound = 1; ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_ldmHashLog: +@@ -549,6 +588,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + bounds.upperBound = 1; + return bounds; + ++ case ZSTD_c_prefetchCDictTables: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ ++ case ZSTD_c_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; +@@ -613,6 +672,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + default: + return 0; + } +@@ -625,7 +688,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { +- RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); ++ RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); + } } + + switch(param) +@@ -668,6 +731,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); +@@ -723,12 +790,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); +- CCtxParams->cParams.minMatch = value; ++ CCtxParams->cParams.minMatch = (U32)value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); +- CCtxParams->cParams.targetLength = value; ++ CCtxParams->cParams.targetLength = (U32)value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : +@@ -741,12 +808,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + /* Content size written in frame header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; +- return CCtxParams->fParams.contentSizeFlag; ++ return (size_t)CCtxParams->fParams.contentSizeFlag; + + case ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; +- return CCtxParams->fParams.checksumFlag; ++ return (size_t)CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* 
When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); +@@ -755,18 +822,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); +- return CCtxParams->forceWindow; ++ return (size_t)CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; +- BOUNDCHECK(ZSTD_c_forceAttachDict, pref); ++ BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { + const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; +- BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); ++ BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); + CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } +@@ -789,47 +856,48 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_enableDedicatedDictSearch : + CCtxParams->enableDedicatedDictSearch = (value!=0); +- return CCtxParams->enableDedicatedDictSearch; ++ return (size_t)CCtxParams->enableDedicatedDictSearch; + + case ZSTD_c_enableLongDistanceMatching : ++ BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); + CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); +- CCtxParams->ldmParams.hashLog = value; ++ CCtxParams->ldmParams.hashLog = (U32)value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); +- CCtxParams->ldmParams.minMatchLength = value; ++ CCtxParams->ldmParams.minMatchLength = (U32)value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); +- CCtxParams->ldmParams.bucketSizeLog = value; ++ CCtxParams->ldmParams.bucketSizeLog = (U32)value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); +- CCtxParams->ldmParams.hashRateLog = value; ++ CCtxParams->ldmParams.hashRateLog = (U32)value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); +- CCtxParams->targetCBlockSize = value; ++ CCtxParams->targetCBlockSize = (U32)value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; +- return CCtxParams->srcSizeHint; ++ return (size_t)CCtxParams->srcSizeHint; + + case ZSTD_c_stableInBuffer: + BOUNDCHECK(ZSTD_c_stableInBuffer, value); +@@ -866,6 +934,27 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + CCtxParams->deterministicRefPrefix = !!value; + return CCtxParams->deterministicRefPrefix; + ++ case ZSTD_c_prefetchCDictTables: ++ BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); ++ CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->prefetchCDictTables; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); ++ CCtxParams->enableMatchFinderFallback = value; ++ return 
CCtxParams->enableMatchFinderFallback; ++ ++ case ZSTD_c_maxBlockSize: ++ if (value!=0) /* 0 ==> default */ ++ BOUNDCHECK(ZSTD_c_maxBlockSize, value); ++ CCtxParams->maxBlockSize = value; ++ return CCtxParams->maxBlockSize; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value); ++ CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->searchForExternalRepcodes; ++ + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + } +@@ -980,6 +1069,18 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; ++ case ZSTD_c_prefetchCDictTables: ++ *value = (int)CCtxParams->prefetchCDictTables; ++ break; ++ case ZSTD_c_enableSeqProducerFallback: ++ *value = CCtxParams->enableMatchFinderFallback; ++ break; ++ case ZSTD_c_maxBlockSize: ++ *value = (int)CCtxParams->maxBlockSize; ++ break; ++ case ZSTD_c_searchForExternalRepcodes: ++ *value = (int)CCtxParams->searchForExternalRepcodes; ++ break; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +@@ -1006,9 +1107,24 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( + return 0; + } + ++size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) ++{ ++ DEBUGLOG(4, "ZSTD_CCtx_setCParams"); ++ assert(cctx != NULL); ++ if (cctx->streamStage != zcss_init) { ++ /* All parameters in @cparams are allowed to be updated during MT compression. ++ * This must be signaled, so that MT compression picks up the changes */ ++ cctx->cParamsChanged = 1; ++ } ++ /* only update if parameters are valid */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); ++ cctx->requestedParams.cParams = cparams; ++ return 0; ++} ++ + size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) + { +- DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); ++ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; +@@ -1151,6 +1267,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't reset parameters only when not in init stage."); + ZSTD_clearAllDicts(cctx); ++ ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx)); + return ZSTD_CCtxParams_reset(&cctx->requestedParams); + } + return 0; +@@ -1247,7 +1364,8 @@ static ZSTD_compressionParameters + ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize, +- ZSTD_cParamMode_e mode) ++ ZSTD_cParamMode_e mode, ++ ZSTD_paramSwitch_e useRowMatchFinder) + { + const U64 minSrcSize = 513; /* (1<<9) + 1 */ + const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); +@@ -1281,8 +1399,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + } + + /* resize windowLog if input is small enough, to use less memory */ +- if ( (srcSize < maxWindowResize) +- && (dictSize < maxWindowResize) ) { ++ if ( (srcSize <= maxWindowResize) ++ && (dictSize <= maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? 
ZSTD_HASHLOG_MIN : +@@ -1300,6 +1418,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + ++ /* We can't use more than 32 bits of hash in total, so that means that we require: ++ * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 ++ */ ++ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { ++ U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; ++ if (cPar.hashLog > maxShortCacheHashLog) { ++ cPar.hashLog = maxShortCacheHashLog; ++ } ++ if (cPar.chainLog > maxShortCacheHashLog) { ++ cPar.chainLog = maxShortCacheHashLog; ++ } ++ } ++ ++ ++ /* At this point, we aren't 100% sure if we are using the row match finder. ++ * Unless it is explicitly disabled, conservatively assume that it is enabled. ++ * In this case it will only be disabled for small sources, so shrinking the ++ * hash log a little bit shouldn't result in any ratio loss. ++ */ ++ if (useRowMatchFinder == ZSTD_ps_auto) ++ useRowMatchFinder = ZSTD_ps_enable; ++ ++ /* We can't hash more than 32-bits in total. So that means that we require: ++ * (hashLog - rowLog + 8) <= 32 ++ */ ++ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { ++ /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); ++ U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; ++ U32 const maxHashLog = maxRowHashLog + rowLog; ++ assert(cPar.hashLog >= rowLog); ++ if (cPar.hashLog > maxHashLog) { ++ cPar.hashLog = maxHashLog; ++ } ++ } ++ + return cPar; + } + +@@ -1310,7 +1464,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, + { + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; +- return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); ++ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); + } + + static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); +@@ -1341,7 +1495,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( + ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ +- return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); + } + + static size_t +@@ -1386,6 +1540,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; + } + ++/* Helper function for calculating memory requirements. ++ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ ++static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) { ++ U32 const divider = (minMatch==3 || useSequenceProducer) ? 
3 : 4; ++ return blockSize / divider; ++} ++ + static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_compressionParameters* cParams, + const ldmParams_t* ldmParams, +@@ -1393,12 +1554,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_paramSwitch_e useRowMatchFinder, + const size_t buffInSize, + const size_t buffOutSize, +- const U64 pledgedSrcSize) ++ const U64 pledgedSrcSize, ++ int useSequenceProducer, ++ size_t maxBlockSize) + { + size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (cParams->minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); +@@ -1417,6 +1579,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + + size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ size_t const externalSeqSpace = useSequenceProducer ++ ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) ++ : 0; ++ + size_t const neededSpace = + cctxSpace + + entropySpace + +@@ -1425,7 +1592,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ldmSeqSpace + + matchStateSize + + tokenSpace + +- bufferSpace; ++ bufferSpace + ++ externalSeqSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + return neededSpace; +@@ -1443,7 +1611,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) + * be needed. However, we still allocate two 0-sized buffers, which can + * take space under ASAN. */ + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( +- &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); ++ &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize); + } + + size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) +@@ -1493,7 +1661,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog); + size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) + ? 
((size_t)1 << cParams.windowLog) + blockSize + : 0; +@@ -1504,7 +1672,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) + + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, +- ZSTD_CONTENTSIZE_UNKNOWN); ++ ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize); + } + } + +@@ -1768,6 +1936,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + assert(params->useRowMatchFinder != ZSTD_ps_auto); + assert(params->useBlockSplitter != ZSTD_ps_auto); + assert(params->ldmParams.enableLdm != ZSTD_ps_auto); ++ assert(params->maxBlockSize != 0); + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* Adjust long distance matching parameters */ + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); +@@ -1776,9 +1945,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + } + + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(params->maxBlockSize, windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, params->useSequenceProducer); + size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) + ? ZSTD_compressBound(blockSize) + 1 + : 0; +@@ -1795,7 +1963,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + size_t const neededSpace = + ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, +- buffInSize, buffOutSize, pledgedSrcSize); ++ buffInSize, buffOutSize, pledgedSrcSize, params->useSequenceProducer, params->maxBlockSize); + int resizeWorkspace; + + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); +@@ -1838,6 +2006,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + /* init params */ + zc->blockState.matchState.cParams = params->cParams; ++ zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable; + zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; + zc->consumedSrcSize = 0; + zc->producedCSize = 0; +@@ -1907,6 +2076,14 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + zc->ldmState.loadedDictEnd = 0; + } + ++ /* reserve space for block-level external sequences */ ++ if (params->useSequenceProducer) { ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ zc->externalMatchCtx.seqBufferCapacity = maxNbExternalSeq; ++ zc->externalMatchCtx.seqBuffer = ++ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); ++ } ++ + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); + assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); + +@@ -1980,7 +2157,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + } + + params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, +- cdict->dictContentSize, ZSTD_cpm_attachDict); ++ cdict->dictContentSize, ZSTD_cpm_attachDict, ++ params.useRowMatchFinder); + params.cParams.windowLog = windowLog; + params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, +@@ -2019,6 +2197,22 @@ 
ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + return 0; + } + ++static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize, ++ ZSTD_compressionParameters const* cParams) { ++ if (ZSTD_CDictIndicesAreTagged(cParams)){ ++ /* Remove tags from the CDict table if they are present. ++ * See docs on "short cache" in zstd_compress_internal.h for context. */ ++ size_t i; ++ for (i = 0; i < tableSize; i++) { ++ U32 const taggedIndex = src[i]; ++ U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; ++ dst[i] = index; ++ } ++ } else { ++ ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); ++ } ++} ++ + static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, +@@ -2054,14 +2248,15 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + : 0; + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; + +- ZSTD_memcpy(cctx->blockState.matchState.hashTable, +- cdict->matchState.hashTable, +- hSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, ++ cdict->matchState.hashTable, ++ hSize, cdict_cParams); ++ + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { +- ZSTD_memcpy(cctx->blockState.matchState.chainTable, +- cdict->matchState.chainTable, +- chainSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, ++ cdict->matchState.chainTable, ++ chainSize, cdict_cParams); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { +@@ -2147,6 +2342,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; + params.fParams = fParams; ++ params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; + ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff); +@@ -2294,7 +2490,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par + + /* See doc/zstd_compression_format.md for detailed format description */ + +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + { + const seqDef* const sequences = seqStorePtr->sequencesStart; + BYTE* const llCodeTable = seqStorePtr->llCode; +@@ -2302,18 +2498,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + BYTE* const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; ++ int longOffsets = 0; + assert(nbSeq <= seqStorePtr->maxNbSeq); + for (u=0; u= STREAM_ACCUMULATOR_MIN)); ++ if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) ++ longOffsets = 1; + } + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; ++ return longOffsets; + } + + /* ZSTD_useTargetCBlockSize(): +@@ -2347,6 +2549,7 @@ typedef struct { + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. 
More detail in ZSTD_entropyCompressSeqStore_internal() */ ++ int longOffsets; + } ZSTD_symbolEncodingTypeStats_t; + + /* ZSTD_buildSequencesStatistics(): +@@ -2357,11 +2560,13 @@ typedef struct { + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +- const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, +- BYTE* dst, const BYTE* const dstEnd, +- ZSTD_strategy strategy, unsigned* countWorkspace, +- void* entropyWorkspace, size_t entropyWkspSize) { ++ZSTD_buildSequencesStatistics( ++ const seqStore_t* seqStorePtr, size_t nbSeq, ++ const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, ++ BYTE* dst, const BYTE* const dstEnd, ++ ZSTD_strategy strategy, unsigned* countWorkspace, ++ void* entropyWorkspace, size_t entropyWkspSize) ++{ + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; +@@ -2375,7 +2580,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + + stats.lastCountSize = 0; + /* convert length/distances into codes */ +- ZSTD_seqToCodes(seqStorePtr); ++ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ +@@ -2480,22 +2685,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + */ + #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- void* entropyWorkspace, size_t entropyWkspSize, +- const int bmi2) ++ZSTD_entropyCompressSeqStore_internal( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ const int bmi2) + { +- const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned* count = (unsigned*)entropyWorkspace; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; + const seqDef* const sequences = seqStorePtr->sequencesStart; +- const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; +@@ -2503,29 +2708,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t lastCountSize; ++ int longOffsets = 0; + + entropyWorkspace = count + (MaxSeq + 1); + entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); + +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<= HUF_WORKSPACE_SIZE); + + /* Compress literals */ + { const BYTE* 
const literals = seqStorePtr->litStart; +- size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart; +- size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; ++ size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); + /* Base suspicion of uncompressibility on ratio of literals to sequences */ + unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); + size_t const litSize = (size_t)(seqStorePtr->lit - literals); ++ + size_t const cSize = ZSTD_compressLiterals( +- &prevEntropy->huf, &nextEntropy->huf, +- cctxParams->cParams.strategy, +- ZSTD_literalsCompressionIsDisabled(cctxParams), + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, +- bmi2, suspectUncompressible); ++ &prevEntropy->huf, &nextEntropy->huf, ++ cctxParams->cParams.strategy, ++ ZSTD_literalsCompressionIsDisabled(cctxParams), ++ suspectUncompressible, bmi2); + FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; +@@ -2551,11 +2758,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } +- { +- ZSTD_symbolEncodingTypeStats_t stats; +- BYTE* seqHead = op++; ++ { BYTE* const seqHead = op++; + /* build stats for sequences */ +- stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, ++ const ZSTD_symbolEncodingTypeStats_t stats = ++ ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, +@@ -2564,6 +2770,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; ++ longOffsets = stats.longOffsets; + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +@@ -2598,14 +2805,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + } + + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- size_t srcSize, +- void* entropyWorkspace, size_t entropyWkspSize, +- int bmi2) ++ZSTD_entropyCompressSeqStore( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ size_t srcSize, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) + { + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( + seqStorePtr, prevEntropy, nextEntropy, cctxParams, +@@ -2615,15 +2823,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. 
+ */ +- if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) ++ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) { ++ DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); + return 0; /* block not compressed */ ++ } + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); + + /* Check compressibility */ + { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. ++ * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. ++ */ ++ assert(cSize < ZSTD_BLOCKSIZE_MAX); + return cSize; + } + +@@ -2718,6 +2932,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) + ssPtr->longLengthType = ZSTD_llt_none; + } + ++/* ZSTD_postProcessSequenceProducerResult() : ++ * Validates and post-processes sequences obtained through the external matchfinder API: ++ * - Checks whether nbExternalSeqs represents an error condition. ++ * - Appends a block delimiter to outSeqs if one is not already present. ++ * See zstd.h for context regarding block delimiters. ++ * Returns the number of sequences after post-processing, or an error code. */ ++static size_t ZSTD_postProcessSequenceProducerResult( ++ ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize ++) { ++ RETURN_ERROR_IF( ++ nbExternalSeqs > outSeqsCapacity, ++ sequenceProducer_failed, ++ "External sequence producer returned error code %lu", ++ (unsigned long)nbExternalSeqs ++ ); ++ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == 0 && srcSize > 0, ++ sequenceProducer_failed, ++ "Got zero sequences from external sequence producer for a non-empty src buffer!" ++ ); ++ ++ if (srcSize == 0) { ++ ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); ++ return 1; ++ } ++ ++ { ++ ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; ++ ++ /* We can return early if lastSeq is already a block delimiter. */ ++ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { ++ return nbExternalSeqs; ++ } ++ ++ /* This error condition is only possible if the external matchfinder ++ * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == outSeqsCapacity, ++ sequenceProducer_failed, ++ "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" ++ ); ++ ++ /* lastSeq is not a block delimiter, so we need to append one. */ ++ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); ++ return nbExternalSeqs + 1; ++ } ++} ++ ++/* ZSTD_fastSequenceLengthSum() : ++ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. ++ * Similar to another function in zstd_compress.c (determine_blockSize), ++ * except it doesn't check for a block delimiter to end summation. ++ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). ++ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. 
*/ ++static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { ++ size_t matchLenSum, litLenSum, i; ++ matchLenSum = 0; ++ litLenSum = 0; ++ for (i = 0; i < seqBufSize; i++) { ++ litLenSum += seqBuf[i].litLength; ++ matchLenSum += seqBuf[i].matchLength; ++ } ++ return litLenSum + matchLenSum; ++} ++ + typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; + + static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2727,7 +3007,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); +- if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { + ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); + } else { +@@ -2763,6 +3045,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ zc->appliedParams.useSequenceProducer, ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." ++ ); ++ + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, +@@ -2774,6 +3065,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { + rawSeqStore_t ldmSeqStore = kNullRawSeqStore; + ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ zc->appliedParams.useSequenceProducer, ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." 
++ ); ++ + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; + /* Updates ldmSeqStore.size */ +@@ -2788,7 +3087,68 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); +- } else { /* not long range mode */ ++ } else if (zc->appliedParams.useSequenceProducer) { ++ assert( ++ zc->externalMatchCtx.seqBufferCapacity >= ZSTD_sequenceBound(srcSize) ++ ); ++ assert(zc->externalMatchCtx.mFinder != NULL); ++ ++ { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; ++ ++ size_t const nbExternalSeqs = (zc->externalMatchCtx.mFinder)( ++ zc->externalMatchCtx.mState, ++ zc->externalMatchCtx.seqBuffer, ++ zc->externalMatchCtx.seqBufferCapacity, ++ src, srcSize, ++ NULL, 0, /* dict and dictSize, currently not supported */ ++ zc->appliedParams.compressionLevel, ++ windowSize ++ ); ++ ++ size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( ++ zc->externalMatchCtx.seqBuffer, ++ nbExternalSeqs, ++ zc->externalMatchCtx.seqBufferCapacity, ++ srcSize ++ ); ++ ++ /* Return early if there is no error, since we don't need to worry about last literals */ ++ if (!ZSTD_isError(nbPostProcessedSeqs)) { ++ ZSTD_sequencePosition seqPos = {0,0,0}; ++ size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs); ++ RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); ++ FORWARD_IF_ERROR( ++ ZSTD_copySequencesToSeqStoreExplicitBlockDelim( ++ zc, &seqPos, ++ zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs, ++ src, srcSize, ++ zc->appliedParams.searchForExternalRepcodes ++ ), ++ "Failed to copy external sequences to seqStore!" ++ ); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); ++ return ZSTDbss_compress; ++ } ++ ++ /* Propagate the error if fallback is disabled */ ++ if (!zc->appliedParams.enableMatchFinderFallback) { ++ return nbPostProcessedSeqs; ++ } ++ ++ /* Fallback to software matchfinder */ ++ { ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG( ++ 5, ++ "External sequence producer returned error code %lu. 
Falling back to internal parser.", ++ (unsigned long)nbExternalSeqs ++ ); ++ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); ++ } } ++ } else { /* not long range mode and no external matchfinder */ + ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, + dictMode); +@@ -2849,7 +3209,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) + /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode + so we provide seqStoreSeqs[i].offset - 1 */ + ZSTD_updateRep(updatedRepcodes.rep, +- seqStoreSeqs[i].offBase - 1, ++ seqStoreSeqs[i].offBase, + seqStoreSeqs[i].litLength == 0); + literalsRead += outSeqs[i].litLength; + } +@@ -2865,6 +3225,10 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) + zc->seqCollector.seqIndex += seqStoreSeqSize; + } + ++size_t ZSTD_sequenceBound(size_t srcSize) { ++ return (srcSize / ZSTD_MINMATCH_MIN) + 1; ++} ++ + size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + size_t outSeqsSize, const void* src, size_t srcSize) + { +@@ -2910,19 +3274,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { + const size_t unrollMask = unrollSize - 1; + const size_t prefixLength = length & unrollMask; + size_t i; +- size_t u; + if (length == 1) return 1; + /* Check if prefix is RLE first before using unrolled loop */ + if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { + return 0; + } + for (i = prefixLength; i != length; i += unrollSize) { ++ size_t u; + for (u = 0; u < unrollSize; u += sizeof(size_t)) { + if (MEM_readST(ip + i + u) != valueST) { + return 0; +- } +- } +- } ++ } } } + return 1; + } + +@@ -2938,7 +3300,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) + return nbSeqs < 4 && nbLits < 10; + } + +-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) ++static void ++ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) + { + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; +@@ -2946,7 +3309,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c + } + + /* Writes the block header */ +-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { ++static void ++writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) ++{ + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); +@@ -2959,13 +3324,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. 
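The hunks above wire an external sequence producer into ZSTD_buildSeqStore(): the callback fills a ZSTD_Sequence buffer sized by the new ZSTD_sequenceBound() (srcSize / ZSTD_MINMATCH_MIN + 1), ZSTD_postProcessSequenceProducerResult() validates the result and appends a block delimiter (offset == 0, matchLength == 0) when one is missing, and on failure the code either falls back to the internal matchfinder or reports sequenceProducer_failed. The sketch below mirrors the argument shape of the call made in ZSTD_buildSeqStore(); the demo_* names and the choice to emit a single literals-only delimiter are illustrative, not taken from the patch.

#include <stddef.h>
#include <string.h>

/* Same field layout as the public ZSTD_Sequence (offset, litLength,
 * matchLength, rep); redeclared here only so the sketch is self-contained. */
typedef struct { unsigned offset, litLength, matchLength, rep; } demo_Sequence;

/* Hypothetical producer with the argument shape used by the call in
 * ZSTD_buildSeqStore() above: (state, outSeqs, outSeqsCapacity, src, srcSize,
 * dict, dictSize, compressionLevel, windowSize).  It finds no matches and
 * emits one block-delimiter sequence (offset == 0, matchLength == 0) whose
 * litLength covers the whole block, which is exactly the terminator that
 * ZSTD_postProcessSequenceProducerResult() checks for. */
static size_t demo_literalsOnlyProducer(void* state,
                                        demo_Sequence* outSeqs, size_t outSeqsCapacity,
                                        const void* src, size_t srcSize,
                                        const void* dict, size_t dictSize,
                                        int compressionLevel, size_t windowSize)
{
    (void)state; (void)src; (void)dict; (void)dictSize;
    (void)compressionLevel; (void)windowSize;
    /* The caller sizes outSeqs with ZSTD_sequenceBound(srcSize), so at least
     * one slot is available there; returning a value larger than the
     * capacity is how a producer signals failure. */
    if (outSeqsCapacity < 1) return outSeqsCapacity + 1;
    memset(&outSeqs[0], 0, sizeof(outSeqs[0]));
    outSeqs[0].litLength = (unsigned)srcSize;   /* everything as literals */
    return 1;                                   /* number of sequences written */
}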
+ * Requires ENTROPY_WORKSPACE_SIZE workspace +- * @return : size of huffman description table or error code */ +-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, +- const ZSTD_hufCTables_t* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_hufCTablesMetadata_t* hufMetadata, +- const int literalsCompressionIsDisabled, +- void* workspace, size_t wkspSize) ++ * @return : size of huffman description table, or an error code ++ */ ++static size_t ++ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const int literalsCompressionIsDisabled, ++ void* workspace, size_t wkspSize, ++ int hufFlags) + { + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; +@@ -2973,9 +3341,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; +- const size_t nodeWkspSize = wkspEnd-nodeWksp; ++ const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +- unsigned huffLog = HUF_TABLELOG_DEFAULT; ++ unsigned huffLog = LitHufLog; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + +@@ -2990,73 +3358,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + + /* small ? don't even attempt compression (speed opt) */ + #ifndef COMPRESS_LITERALS_SIZE_MIN +-#define COMPRESS_LITERALS_SIZE_MIN 63 ++# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ + #endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Scan input and build symbol stats */ +- { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); ++ { size_t const largest = ++ HIST_count_wksp (countWksp, &maxSymbolValue, ++ (const BYTE*)src, srcSize, ++ workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { ++ /* only one literal symbol */ + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { ++ /* heuristic: likely not compressible */ + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Validate the previous Huffman table */ +- if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { ++ if (repeat == HUF_repeat_check ++ && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); ++ assert(huffLog <= LitHufLog); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; +- { /* Build and write the CTable */ +- size_t const newCSize = HUF_estimateCompressedSize( +- (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); +- size_t const hSize = HUF_writeCTable_wksp( +- hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), +- (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, +- nodeWksp, nodeWkspSize); +- /* Check against repeating the previous CTable */ +- if (repeat != HUF_repeat_none) { +- size_t const oldCSize = HUF_estimateCompressedSize( +- (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); +- if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { +- DEBUGLOG(5, "set_repeat - smaller"); +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_repeat; +- return 0; +- } +- } +- if (newCSize + hSize >= srcSize) { +- DEBUGLOG(5, "set_basic - no gains"); ++ } ++ { /* Build and write the CTable */ ++ size_t const newCSize = HUF_estimateCompressedSize( ++ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); ++ size_t const hSize = HUF_writeCTable_wksp( ++ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), ++ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, ++ nodeWksp, nodeWkspSize); ++ /* Check against repeating the previous CTable */ ++ if (repeat != HUF_repeat_none) { ++ size_t const oldCSize = HUF_estimateCompressedSize( ++ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); ++ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { ++ DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_basic; ++ hufMetadata->hType = set_repeat; + return 0; +- } +- DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); +- hufMetadata->hType = set_compressed; +- nextHuf->repeatMode = HUF_repeat_check; +- return 
hSize; ++ } } ++ if (newCSize + hSize >= srcSize) { ++ DEBUGLOG(5, "set_basic - no gains"); ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ hufMetadata->hType = set_basic; ++ return 0; + } ++ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); ++ hufMetadata->hType = set_compressed; ++ nextHuf->repeatMode = HUF_repeat_check; ++ return hSize; + } + } + +@@ -3066,8 +3438,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + * and updates nextEntropy to the appropriate repeatMode. + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { +- ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; ++ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) ++{ ++ ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; +@@ -3078,16 +3451,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. +- * @return : size of fse tables or error code */ +-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, +- const ZSTD_fseCTables_t* prevEntropy, +- ZSTD_fseCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize) ++ * @return : size of fse tables or error code */ ++static size_t ++ZSTD_buildBlockEntropyStats_sequences( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_fseCTables_t* prevEntropy, ++ ZSTD_fseCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize) + { + ZSTD_strategy const strategy = cctxParams->cParams.strategy; +- size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; +@@ -3114,23 +3489,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * Requires workspace size ENTROPY_WORKSPACE_SIZE +- * +- * @return : 0 on success or error code ++ * @return : 0 on success, or an error code ++ * Note : also employed in superblock + */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize) +-{ +- size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize) ++{ ++ size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); ++ int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); ++ int const hufFlags = huf_useOptDepth ? 
HUF_flags_optimalDepth : 0; ++ + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), +- workspace, wkspSize); ++ workspace, wkspSize, hufFlags); ++ + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, +@@ -3143,11 +3523,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + } + + /* Returns the size estimate for the literals section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, +- const ZSTD_hufCTables_t* huf, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, ++ const ZSTD_hufCTables_t* huf, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +@@ -3169,12 +3550,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz + } + + /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, +- const BYTE* codeTable, size_t nbSeq, unsigned maxCode, +- const FSE_CTable* fseCTable, +- const U8* additionalBits, +- short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, +- void* workspace, size_t wkspSize) ++static size_t ++ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, ++ const BYTE* codeTable, size_t nbSeq, unsigned maxCode, ++ const FSE_CTable* fseCTable, ++ const U8* additionalBits, ++ short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, ++ void* workspace, size_t wkspSize) + { + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; +@@ -3206,99 +3588,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + } + + /* Returns the size estimate for the sequences section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, +- fseTables->offcodeCTable, NULL, +- OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, +- workspace, wkspSize); ++ fseTables->offcodeCTable, NULL, ++ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, +- 
fseTables->litlengthCTable, LL_bits, +- LL_defaultNorm, LL_defaultNormLog, MaxLL, +- workspace, wkspSize); ++ fseTables->litlengthCTable, LL_bits, ++ LL_defaultNorm, LL_defaultNormLog, MaxLL, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, +- fseTables->matchlengthCTable, ML_bits, +- ML_defaultNorm, ML_defaultNormLog, MaxML, +- workspace, wkspSize); ++ fseTables->matchlengthCTable, ML_bits, ++ ML_defaultNorm, ML_defaultNormLog, MaxML, ++ workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + + /* Returns the size estimate for a given stream of literals, of, ll, ml */ +-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, +- const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_entropyCTables_t* entropy, +- const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { ++static size_t ++ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, ++ const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_entropyCTables_t* entropy, ++ const ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize, ++ int writeLitEntropy, int writeSeqEntropy) ++{ + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, +- nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, +- workspace, wkspSize, writeSeqEntropy); ++ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, ++ workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; + } + + /* Builds entropy statistics and uses them for blocksize estimation. + * +- * Returns the estimated compressed size of the seqStore, or a zstd error. ++ * @return: estimated compressed size of the seqStore, or a zstd error. 
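Stepping back to the literals path above, ZSTD_buildBlockEntropyStats_literals() settles on one of four literal-section encodings through a cascade of cheap checks before committing to a fresh Huffman description. The condensed restatement below keeps the thresholds visible in the hunk (COMPRESS_LITERALS_SIZE_MIN, the (srcSize >> 7) + 4 flat-histogram cutoff, the repeat-versus-fresh-table comparison); the demo_* names are illustrative, and the measured costs are passed in directly instead of being computed with HIST_count_wksp() and HUF_estimateCompressedSize().

#include <stddef.h>

typedef enum { demo_basic, demo_rle, demo_repeat, demo_compressed } demo_litMode;

/* Condensed restatement of the decision order in
 * ZSTD_buildBlockEntropyStats_literals() above. */
static demo_litMode demo_chooseLiteralsMode(size_t srcSize,
                                            size_t largestSymbolCount,
                                            int prevTableUsable, /* previous Huffman table still valid */
                                            size_t oldCSize,     /* estimated cost with previous table */
                                            size_t newCSize,     /* estimated cost with a fresh table */
                                            size_t headerSize)   /* cost of describing the fresh table */
{
    if (srcSize <= (prevTableUsable ? 6u : 63u))            /* COMPRESS_LITERALS_SIZE_MIN */
        return demo_basic;                                  /* too small to bother */
    if (largestSymbolCount == srcSize)
        return demo_rle;                                    /* a single literal symbol */
    if (largestSymbolCount <= (srcSize >> 7) + 4)
        return demo_basic;                                  /* flat histogram: likely no gain */
    if (prevTableUsable && oldCSize < srcSize
        && (oldCSize <= headerSize + newCSize || headerSize + 12 >= srcSize))
        return demo_repeat;                                 /* reusing the previous table wins */
    if (newCSize + headerSize >= srcSize)
        return demo_basic;                                  /* no gain once the table is paid for */
    return demo_compressed;
}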
+ */ +-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { +- ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; ++static size_t ++ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) ++{ ++ ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); +- return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), ""); ++ return ZSTD_estimateBlockSize( ++ seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), +- &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, ++ &zc->blockState.nextCBlock->entropy, ++ entropyMetadata, ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); + } + + /* Returns literals bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) ++{ + size_t literalsBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ seqDef const seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; +- } +- } ++ } } + return literalsBytes; + } + + /* Returns match bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) ++{ + size_t matchBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { + seqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; +- } +- } ++ } } + return matchBytes; + } + +@@ -3307,15 +3697,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { + */ + static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + const seqStore_t* originalSeqStore, +- size_t startIdx, size_t endIdx) { +- BYTE* const litEnd = originalSeqStore->lit; +- size_t literalsBytes; +- size_t literalsBytesPreceding = 0; +- ++ size_t startIdx, size_t endIdx) ++{ + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; +- literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move 
longLengthPos into the correct position if necessary */ +@@ -3328,13 +3715,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; +- literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); +- resultSeqStore->litStart += literalsBytesPreceding; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ +- resultSeqStore->lit = litEnd; ++ assert(resultSeqStore->lit == originalSeqStore->lit); + } else { +- resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; ++ size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; +@@ -3342,20 +3728,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + + /* +- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. +- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). ++ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. ++ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ + static U32 +-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) +-{ +- U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ +- assert(STORED_IS_REPCODE(offCode)); +- if (adjustedOffCode == ZSTD_REP_NUM) { +- /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ +- assert(rep[0] > 0); ++ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) ++{ ++ U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ ++ assert(OFFBASE_IS_REPCODE(offBase)); ++ if (adjustedRepCode == ZSTD_REP_NUM) { ++ assert(ll0); ++ /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 ++ * This is only valid if it results in a valid offset value, aka > 0. ++ * Note : it may happen that `rep[0]==1` in exceptional circumstances. ++ * In which case this function will return 0, which is an invalid offset. ++ * It's not an issue though, since this value will be ++ * compared and discarded within ZSTD_seqStore_resolveOffCodes(). 
++ */ + return rep[0] - 1; + } +- return rep[adjustedOffCode]; ++ return rep[adjustedRepCode]; + } + + /* +@@ -3371,30 +3763,32 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, +- seqStore_t* const seqStore, U32 const nbSeq) { ++static void ++ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, ++ const seqStore_t* const seqStore, U32 const nbSeq) ++{ + U32 idx = 0; + for (; idx < nbSeq; ++idx) { + seqDef* const seq = seqStore->sequencesStart + idx; + U32 const ll0 = (seq->litLength == 0); +- U32 const offCode = OFFBASE_TO_STORED(seq->offBase); +- assert(seq->offBase > 0); +- if (STORED_IS_REPCODE(offCode)) { +- U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); +- U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); ++ U32 const offBase = seq->offBase; ++ assert(offBase > 0); ++ if (OFFBASE_IS_REPCODE(offBase)) { ++ U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); ++ U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { +- seq->offBase = cRawOffset + ZSTD_REP_NUM; ++ seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ +- ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); +- ZSTD_updateRep(cRepcodes->rep, offCode, ll0); ++ ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); ++ ZSTD_updateRep(cRepcodes->rep, offBase, ll0); + } + } + +@@ -3404,10 +3798,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_ + * Returns the total size of that block (including header) or a ZSTD error code. + */ + static size_t +-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, ++ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, ++ const seqStore_t* const seqStore, + repcodes_t* const dRep, repcodes_t* const cRep, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, ++ const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) + { + const U32 rleMaxLength = 25; +@@ -3481,45 +3876,49 @@ typedef struct { + + /* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. +- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then +- * we do not recurse. ++ * If advantageous to split, then we recurse down the two sub-blocks. ++ * If not, or if an error occurred in estimation, then we do not recurse. + * +- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. ++ * Note: The recursion depth is capped by a heuristic minimum number of sequences, ++ * defined by MIN_SEQUENCES_BLOCK_SPLITTING. 
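The repcode rewrite above hinges on a single mapping: the repcode number carried by offBase, shifted by the litLength == 0 flag, selects a slot of the 3-entry repcode history, with the combination (repcode 3, ll0 == 1) standing for "previous offset minus one". Below is a standalone restatement of ZSTD_resolveRepcodeToRawOffset(); DEMO_REP_NUM is assumed to equal ZSTD_REP_NUM (3), and the demo_ name is illustrative.

#include <assert.h>

#define DEMO_REP_NUM 3   /* assumed equal to ZSTD_REP_NUM */

/* repIdx is the repcode number (1..3) carried by offBase, ll0 is 1 when the
 * sequence has no literals: ll0 shifts the selection by one slot. */
static unsigned demo_repcodeToRawOffset(const unsigned rep[DEMO_REP_NUM],
                                        unsigned repIdx, unsigned ll0)
{
    unsigned const adjusted = repIdx - 1 + ll0;   /* 0..3 */
    assert(repIdx >= 1 && repIdx <= DEMO_REP_NUM);
    if (adjusted == DEMO_REP_NUM)
        return rep[0] - 1;   /* may legitimately be 0; the caller compares and discards it */
    return rep[adjusted];
}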
+ * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * +- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize ++ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. ++ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ + static void + ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, + ZSTD_CCtx* zc, const seqStore_t* origSeqStore) + { +- seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; +- seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; +- seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; ++ seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; ++ seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; ++ seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + ++ DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); ++ assert(endIdx >= startIdx); + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { +- DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); ++ DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); + return; + } +- DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); +- DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", ++ DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { ++ DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; +@@ -3527,14 +3926,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end + } + } + +-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. ++/* Base recursive function. ++ * Populates a table with intra-block partition indices that can improve compression ratio. + * +- * Returns the number of splits made (which equals the size of the partition table - 1). 
++ * @return: number of splits made (which equals the size of the partition table - 1). + */ +-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { +- seqStoreSplits splits = {partitions, 0}; ++static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) ++{ ++ seqStoreSplits splits; ++ splits.splitLocations = partitions; ++ splits.idx = 0; + if (nbSeq <= 4) { +- DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); ++ DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } +@@ -3550,18 +3953,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ + static size_t +-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, +- const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) ++ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t blockSize, ++ U32 lastBlock, U32 nbSeq) + { + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; +- U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ +- seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; +- seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; +- size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); ++ U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ ++ seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; ++ seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; ++ size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. 
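The splitter above is a plain divide and conquer over the sequence index range, driven entirely by the entropy-based size estimate: a range is split only when the two half estimates beat the whole-range estimate, recursion stops below a minimum sequence count or once ZSTD_MAX_NB_BLOCK_SPLITS (196) partitions exist, and ZSTD_deriveBlockSplits() refuses blocks with 4 or fewer sequences outright. The schematic below hides the estimator behind a callback (the real code derives seqStore chunks and calls ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()) and does not model its error returns; DEMO_MIN_SEQ_SPLIT is a placeholder for MIN_SEQUENCES_BLOCK_SPLITTING, whose value is not shown in this hunk.

#include <stddef.h>

#define DEMO_MAX_NB_SPLITS 196u   /* ZSTD_MAX_NB_BLOCK_SPLITS, per the comment above */
#define DEMO_MIN_SEQ_SPLIT 300u   /* placeholder for MIN_SEQUENCES_BLOCK_SPLITTING */

typedef size_t (*demo_costFn)(void* ctx, size_t startIdx, size_t endIdx);

typedef struct { unsigned splits[DEMO_MAX_NB_SPLITS]; size_t idx; } demo_splitTable;

/* Schematic of ZSTD_deriveBlockSplitsHelper(): recurse on [start,mid) and
 * [mid,end) only when the two half estimates beat the whole-range estimate. */
static void demo_deriveSplits(demo_splitTable* out, size_t startIdx, size_t endIdx,
                              demo_costFn estimate, void* ctx)
{
    size_t const midIdx = (startIdx + endIdx) / 2;
    if (endIdx - startIdx < DEMO_MIN_SEQ_SPLIT || out->idx >= DEMO_MAX_NB_SPLITS)
        return;
    {   size_t const whole  = estimate(ctx, startIdx, endIdx);
        size_t const first  = estimate(ctx, startIdx, midIdx);
        size_t const second = estimate(ctx, midIdx, endIdx);
        if (first + second < whole) {
            demo_deriveSplits(out, startIdx, midIdx, estimate, ctx);
            if (out->idx < DEMO_MAX_NB_SPLITS)   /* bound kept explicit in the sketch */
                out->splits[out->idx++] = (unsigned)midIdx;
            demo_deriveSplits(out, midIdx, endIdx, estimate, ctx);
        }
    }
}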
In order to reconcile potentially invalid repcodes, we keep track of two +@@ -3583,30 +3988,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); + ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); + +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { +- size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, +- &dRep, &cRep, +- op, dstCapacity, +- ip, blockSize, +- lastBlock, 0 /* isPartition */); ++ size_t cSizeSingleBlock = ++ ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, ++ &dRep, &cRep, ++ op, dstCapacity, ++ ip, blockSize, ++ lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); +- assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX); ++ assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { +- size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + +- srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); ++ size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ +@@ -3621,7 +4027,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); +- DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); ++ DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", ++ ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; +@@ -3629,10 +4036,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; +- assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize); + } +- /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes +- * for the next block. ++ /* cRep and dRep may have diverged during the compression. ++ * If so, we use the dRep repcodes for the next block. 
+ */ + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); + return cSize; +@@ -3643,8 +4050,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) + { +- const BYTE* ip = (const BYTE*)src; +- BYTE* op = (BYTE*)dst; + U32 nbSeq; + size_t cSize; + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); +@@ -3655,7 +4060,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; +- cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); ++ cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; +@@ -3673,9 +4078,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) + { +- /* This the upper bound for the length of an rle block. +- * This isn't the actual upper bound. Finding the real threshold +- * needs further investigation. ++ /* This is an estimated upper bound for the length of an rle block. ++ * This isn't the actual upper bound. ++ * Finding the real threshold needs further investigation. + */ + const U32 rleMaxLength = 25; + size_t cSize; +@@ -3767,10 +4172,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. + */ +- { +- size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); ++ { size_t const cSize = ++ ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { +- size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); ++ size_t const maxCSize = ++ srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); +@@ -3778,7 +4184,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + } + } + } +- } ++ } /* if (bss == ZSTDbss_compress)*/ + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. +@@ -3836,7 +4242,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, + * All blocks will be terminated, all input will be consumed. + * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. + * Frame is supposed already started (header already produced) +-* @return : compressed size, or an error code ++* @return : compressed size, or an error code + */ + static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, +@@ -3860,7 +4266,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + ZSTD_matchState_t* const ms = &cctx->blockState.matchState; + U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); + +- RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. 
We need to revisit and change this logic to be more consistent */ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, + dstSize_tooSmall, + "not enough space to store compressed block"); + if (remaining < blockSize) blockSize = remaining; +@@ -3899,7 +4307,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } +- } ++ } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ + + + ip += blockSize; +@@ -4091,7 +4499,7 @@ size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) + { + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); +- return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); ++ return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); + } + + size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) +@@ -4111,31 +4519,47 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* src, size_t srcSize, +- ZSTD_dictTableLoadMethod_e dtlm) ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) + { + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; + +- /* Assert that we the ms params match the params we're being given */ ++ /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + +- if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ { /* Ensure large dictionaries can't cause index overflow */ ++ + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ +- U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; +- /* We must have cleared our windows when our source is this large. */ +- assert(ZSTD_window_isEmpty(ms->window)); +- if (loadLdmDict) +- assert(ZSTD_window_isEmpty(ls->window)); ++ U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; ++ ++ int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); ++ if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { ++ /* Some dictionary matchfinders in zstd use "short cache", ++ * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each ++ * CDict hashtable entry as a tag rather than as part of an index. ++ * When short cache is used, we need to truncate the dictionary ++ * so that its indices don't overlap with the tag. */ ++ U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; ++ maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); ++ assert(!loadLdmDict); ++ } ++ + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; + src = ip; + srcSize = maxDictSize; +- } ++ } } ++ ++ if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ /* We must have cleared our windows when our source is this large. 
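The dictionary-loading change above reserves the low ZSTD_SHORT_CACHE_TAG_BITS of each CDict hashtable entry as a tag, so the usable index space shrinks and an oversized dictionary must be truncated to its suffix before being indexed. The sketch below shows just that suffix selection; both constants are placeholders, since their values are not visible in this hunk.

#include <stddef.h>
#include <stdint.h>

#define DEMO_SHORT_CACHE_TAG_BITS 8u   /* placeholder for ZSTD_SHORT_CACHE_TAG_BITS */
#define DEMO_WINDOW_START_INDEX   2u   /* placeholder for ZSTD_WINDOW_START_INDEX */

/* Sketch of the suffix selection in ZSTD_loadDictionaryContent() above: once
 * the low tag bits are reserved, indices above this limit would collide with
 * the tag, so only the most recent maxDictSize bytes of the dictionary are
 * indexed. */
static void demo_clampDictForShortCache(const uint8_t** dictPtr, size_t* dictSize)
{
    size_t const maxDictSize =
        ((size_t)1 << (32 - DEMO_SHORT_CACHE_TAG_BITS)) - DEMO_WINDOW_START_INDEX;
    if (*dictSize > maxDictSize) {
        *dictPtr += *dictSize - maxDictSize;   /* keep the dictionary tail */
        *dictSize = maxDictSize;
    }
}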
*/ ++ assert(ZSTD_window_isEmpty(ms->window)); ++ if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); + } + + DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); +@@ -4158,10 +4582,10 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + switch(params->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, dtlm); ++ ZSTD_fillHashTable(ms, iend, dtlm, tfp); + break; + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, dtlm); ++ ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); + break; + + case ZSTD_greedy: +@@ -4327,6 +4751,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + const BYTE* dictPtr = (const BYTE*)dict; +@@ -4345,7 +4770,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( +- ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); ++ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); + } + return dictID; + } +@@ -4361,6 +4786,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); +@@ -4373,13 +4799,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict restricted modes */ + if (dictContentType == ZSTD_dct_rawContent) +- return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); ++ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent( +- ms, ls, ws, params, dict, dictSize, dtlm); ++ ms, ls, ws, params, dict, dictSize, dtlm, tfp); + } + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + assert(0); /* impossible */ +@@ -4387,13 +4813,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary( +- bs, ms, ws, params, dict, dictSize, dtlm, workspace); ++ bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); + } + + #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) + #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL) + + /*! 
ZSTD_compressBegin_internal() : ++ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both + * @return : 0, or an error code */ + static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, +@@ -4426,11 +4853,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, cdict->dictContentType, dtlm, +- cctx->entropyWorkspace) ++ ZSTD_tfp_forCCtx, cctx->entropyWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, +- dictContentType, dtlm, cctx->entropyWorkspace); ++ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; +@@ -4471,11 +4898,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, + &cctxParams, pledgedSrcSize); + } + +-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++size_t ++ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) + { + ZSTD_CCtx_params cctxParams; +- { +- ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); ++ { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel); + } + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); +@@ -4709,7 +5136,7 @@ static size_t ZSTD_initCDict_internal( + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + ¶ms, cdict->dictContent, cdict->dictContentSize, +- dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); ++ dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; +@@ -5197,30 +5624,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) + + static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) + { +- size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; +- if (hintInSize==0) hintInSize = cctx->blockSize; +- return hintInSize; ++ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ return cctx->blockSize - cctx->stableIn_notConsumed; ++ } ++ assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); ++ { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; ++ if (hintInSize==0) hintInSize = cctx->blockSize; ++ return hintInSize; ++ } + } + + /* ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants +- * non-static, because can be called from zstdmt_compress.c +- * @return : hint size for next input */ ++ * @return : hint size for next input to complete ongoing block */ + static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) + { +- const char* const istart = (const char*)input->src; +- const char* const iend = input->size != 0 ? 
istart + input->size : istart; +- const char* ip = input->pos != 0 ? istart + input->pos : istart; +- char* const ostart = (char*)output->dst; +- char* const oend = output->size != 0 ? ostart + output->size : ostart; +- char* op = output->pos != 0 ? ostart + output->pos : ostart; ++ const char* const istart = (assert(input != NULL), (const char*)input->src); ++ const char* const iend = (istart != NULL) ? istart + input->size : istart; ++ const char* ip = (istart != NULL) ? istart + input->pos : istart; ++ char* const ostart = (assert(output != NULL), (char*)output->dst); ++ char* const oend = (ostart != NULL) ? ostart + output->size : ostart; ++ char* op = (ostart != NULL) ? ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ +- DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); ++ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); ++ assert(zcs != NULL); ++ if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ assert(input->pos >= zcs->stableIn_notConsumed); ++ input->pos -= zcs->stableIn_notConsumed; ++ ip -= zcs->stableIn_notConsumed; ++ zcs->stableIn_notConsumed = 0; ++ } + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); +@@ -5229,8 +5667,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + } +- assert(output->pos <= output->size); ++ if (input->src == NULL) assert(input->size == 0); + assert(input->pos <= input->size); ++ if (output->dst == NULL) assert(output->size == 0); ++ assert(output->pos <= output->size); + assert((U32)flushMode <= (U32)ZSTD_e_end); + + while (someMoreWork) { +@@ -5262,8 +5702,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + zcs->inBuff + zcs->inBuffPos, toLoad, + ip, iend-ip); + zcs->inBuffPos += loaded; +- if (loaded != 0) +- ip += loaded; ++ if (ip) ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ +@@ -5274,6 +5713,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + /* empty */ + someMoreWork = 0; break; + } ++ } else { ++ assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ if ( (flushMode == ZSTD_e_continue) ++ && ( (size_t)(iend - ip) < zcs->blockSize) ) { ++ /* can't compress a full block : stop here */ ++ zcs->stableIn_notConsumed = (size_t)(iend - ip); ++ ip = iend; /* pretend to have consumed input */ ++ someMoreWork = 0; break; ++ } ++ if ( (flushMode == ZSTD_e_flush) ++ && (ip == iend) ) { ++ /* empty */ ++ someMoreWork = 0; break; ++ } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); +@@ -5281,9 +5734,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + void* cDst; + size_t cSize; + size_t oSize = oend-op; +- size_t const iSize = inputBuffered +- ? zcs->inBuffPos - zcs->inToCompress +- : MIN((size_t)(iend - ip), zcs->blockSize); ++ size_t const iSize = inputBuffered ? 
zcs->inBuffPos - zcs->inToCompress ++ : MIN((size_t)(iend - ip), zcs->blockSize); + if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) + cDst = op; /* compress into output buffer, to skip flush stage */ + else +@@ -5306,19 +5758,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; +- } else { +- unsigned const lastBlock = (ip + iSize == iend); +- assert(flushMode == ZSTD_e_end /* Already validated */); ++ } else { /* !inputBuffered, hence ZSTD_bm_stable */ ++ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); + cSize = lastBlock ? + ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : + ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); + /* Consume the input prior to error checking to mirror buffered mode. */ +- if (iSize > 0) +- ip += iSize; ++ if (ip) ip += iSize; + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +- if (lastBlock) +- assert(ip == iend); ++ if (lastBlock) assert(ip == iend); + } + if (cDst == op) { /* no need to flush */ + op += cSize; +@@ -5388,8 +5837,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf + /* After a compression call set the expected input/output buffer. + * This is validated at the start of the next compression call. + */ +-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input) ++static void ++ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) + { ++ DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + cctx->expectedInBuffer = *input; + } +@@ -5408,22 +5859,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, + { + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + ZSTD_inBuffer const expect = cctx->expectedInBuffer; +- if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!"); +- if (endOp != ZSTD_e_end) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!"); ++ if (expect.src != input->src || expect.pos != input->pos) ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); + } ++ (void)endOp; + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + size_t const outBufferSize = output->size - output->pos; + if (cctx->expectedOutBufferSize != outBufferSize) +- RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!"); ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + } + return 0; + } + + static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + ZSTD_EndDirective endOp, +- size_t inSize) { ++ size_t inSize) ++{ + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. 
*/ +@@ -5437,9 +5888,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + params.compressionLevel = cctx->cdict->compressionLevel; + } + DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); +- if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ +- { +- size_t const dictSize = prefixDict.dict ++ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ ++ ++ { size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? cctx->cdict->dictContentSize : 0); + ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); +@@ -5451,6 +5902,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); ++ params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); ++ params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); ++ params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); + + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); +@@ -5477,6 +5931,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + return 0; + } + ++/* @return provides a minimum amount of data remaining to be flushed from internal buffers ++ */ + size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, +@@ -5491,8 +5947,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { +- FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed"); +- ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ ++ size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ ++ size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; ++ if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ ++ && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ ++ && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ ++ if (cctx->stableIn_notConsumed) { /* not the first time */ ++ /* check stable source guarantees */ ++ RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); ++ RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); ++ } ++ /* pretend input was consumed, to give a sense forward progress */ ++ input->pos = input->size; ++ /* save stable inBuffer, for later control, and flush/end */ ++ cctx->expectedInBuffer = *input; ++ /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ ++ cctx->stableIn_notConsumed += inputSize; ++ /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */ ++ return 
ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ ++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); ++ ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + } + /* end of transparent initialization stage */ + +@@ -5510,13 +5985,20 @@ size_t ZSTD_compressStream2_simpleArgs ( + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; + /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } + + size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5539,6 +6021,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + /* Reset to the original values. */ + cctx->requestedParams.inBufferMode = originalInBufferMode; + cctx->requestedParams.outBufferMode = originalOutBufferMode; ++ + FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); + if (result != 0) { /* compression not completed, due to lack of output space */ + assert(oPos == dstCapacity); +@@ -5549,64 +6032,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + } + } + +-typedef struct { +- U32 idx; /* Index in array of ZSTD_Sequence */ +- U32 posInSequence; /* Position within sequence at idx */ +- size_t posInSrc; /* Number of bytes given by sequences provided so far */ +-} ZSTD_sequencePosition; +- + /* ZSTD_validateSequence() : + * @offCode : is presumed to follow format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ + static size_t +-ZSTD_validateSequence(U32 offCode, U32 matchLength, +- size_t posInSrc, U32 windowLog, size_t dictSize) ++ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch, ++ size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) + { +- U32 const windowSize = 1 << windowLog; ++ U32 const windowSize = 1u << windowLog; + /* posInSrc represents the amount of data the decoder would decode up to this point. + * As long as the amount of data decoded is less than or equal to window size, offsets may be + * larger than the total length of output decoded in order to reference the dict, even larger than + * window size. After output surpasses windowSize, we're limited to windowSize offsets again. + */ + size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; +- RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); +- RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); ++ size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 
3 : 4; ++ RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); ++ /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ ++ RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); + return 0; + } + + /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ +-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) ++static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) + { +- U32 offCode = STORE_OFFSET(rawOffset); ++ U32 offBase = OFFSET_TO_OFFBASE(rawOffset); + + if (!ll0 && rawOffset == rep[0]) { +- offCode = STORE_REPCODE_1; ++ offBase = REPCODE1_TO_OFFBASE; + } else if (rawOffset == rep[1]) { +- offCode = STORE_REPCODE(2 - ll0); ++ offBase = REPCODE_TO_OFFBASE(2 - ll0); + } else if (rawOffset == rep[2]) { +- offCode = STORE_REPCODE(3 - ll0); ++ offBase = REPCODE_TO_OFFBASE(3 - ll0); + } else if (ll0 && rawOffset == rep[0] - 1) { +- offCode = STORE_REPCODE_3; ++ offBase = REPCODE3_TO_OFFBASE; + } +- return offCode; ++ return offBase; + } + +-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of +- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. +- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ++ ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; ++ U32 const startIdx = idx; + BYTE const* ip = (BYTE const*)(src); + const BYTE* const iend = ip + blockSize; + repcodes_t updatedRepcodes; + U32 dictSize; + ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize); ++ + if (cctx->cdict) { + dictSize = (U32)cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5615,25 +6095,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + dictSize = 0; + } + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { ++ for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { + U32 const litLength = inSeqs[idx].litLength; +- U32 const ll0 = (litLength == 0); + U32 const matchLength = inSeqs[idx].matchLength; +- U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ U32 offBase; + +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); ++ if (externalRepSearch == ZSTD_ps_disable) { ++ offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); ++ } else { ++ U32 const ll0 = (litLength == 0); ++ offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } ++ ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, 
matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer), + "Sequence validation failed"); + } +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ assert(externalRepSearch != ZSTD_ps_auto); ++ assert(idx >= startIdx); ++ if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { ++ U32* const rep = updatedRepcodes.rep; ++ U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ ++ ++ if (lastSeqIdx >= startIdx + 2) { ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (lastSeqIdx == startIdx + 1) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else { ++ assert(lastSeqIdx == startIdx); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } ++ } ++ + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); + + if (inSeqs[idx].litLength) { +@@ -5642,26 +6152,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ip += inSeqs[idx].litLength; + seqPos->posInSrc += inSeqs[idx].litLength; + } +- RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); ++ RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); + seqPos->idx = idx+1; + return 0; + } + +-/* Returns the number of bytes to move the current read position back by. Only non-zero +- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something +- * went wrong. +- * +- * This function will attempt to scan through blockSize bytes represented by the sequences +- * in inSeqs, storing any (partial) sequences. +- * +- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to +- * avoid splitting a match, or to avoid splitting a match such that it would produce a match +- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. 
+- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; + U32 startPosInSequence = seqPos->posInSequence; +@@ -5673,6 +6172,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + U32 bytesAdjustment = 0; + U32 finalMatchSplit = 0; + ++ /* TODO(embg) support fast parsing mode in noBlockDelim mode */ ++ (void)externalRepSearch; ++ + if (cctx->cdict) { + dictSize = cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5680,7 +6182,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } else { + dictSize = 0; + } +- DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); + DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); + while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { +@@ -5688,7 +6190,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + U32 litLength = currSeq.litLength; + U32 matchLength = currSeq.matchLength; + U32 const rawOffset = currSeq.offset; +- U32 offCode; ++ U32 offBase; + + /* Modify the sequence depending on where endPosInSequence lies */ + if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { +@@ -5702,7 +6204,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + /* Move to the next sequence */ + endPosInSequence -= currSeq.litLength + currSeq.matchLength; + startPosInSequence = 0; +- idx++; + } else { + /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence + does not reach the end of the match. So, we have to split the sequence */ +@@ -5742,21 +6243,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } + /* Check if this offset can be represented with a repcode */ + { U32 const ll0 = (litLength == 0); +- offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer), + "Sequence validation failed"); + } +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. 
Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; ++ if (!finalMatchSplit) ++ idx++; /* Next Sequence */ + } + DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); +@@ -5779,7 +6282,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + + typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize); ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); + static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + { + ZSTD_sequenceCopier sequenceCopier = NULL; +@@ -5793,6 +6296,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + return sequenceCopier; + } + ++/* Discover the size of next block by searching for the delimiter. ++ * Note that a block delimiter **must** exist in this mode, ++ * otherwise it's an input error. ++ * The block size retrieved will be later compared to ensure it remains within bounds */ ++static size_t ++blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ int end = 0; ++ size_t blockSize = 0; ++ size_t spos = seqPos.idx; ++ DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); ++ assert(spos <= inSeqsSize); ++ while (spos < inSeqsSize) { ++ end = (inSeqs[spos].offset == 0); ++ blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; ++ if (end) { ++ if (inSeqs[spos].matchLength != 0) ++ RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); ++ break; ++ } ++ spos++; ++ } ++ if (!end) ++ RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); ++ return blockSize; ++} ++ ++/* More a "target" block size */ ++static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining) ++{ ++ int const lastBlock = (remaining <= blockSize); ++ return lastBlock ? remaining : blockSize; ++} ++ ++static size_t determine_blockSize(ZSTD_sequenceFormat_e mode, ++ size_t blockSize, size_t remaining, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); ++ if (mode == ZSTD_sf_noBlockDelimiters) ++ return blockSize_noDelimiter(blockSize, remaining); ++ { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); ++ FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); ++ if (explicitBlockSize > blockSize) ++ RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); ++ if (explicitBlockSize > remaining) ++ RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); ++ return explicitBlockSize; ++ } ++} ++ + /* Compress, block-by-block, all of the sequences given. 
+ * + * Returns the cumulative size of all compressed blocks (including their headers), +@@ -5805,9 +6359,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + const void* src, size_t srcSize) + { + size_t cSize = 0; +- U32 lastBlock; +- size_t blockSize; +- size_t compressedSeqsSize; + size_t remaining = srcSize; + ZSTD_sequencePosition seqPos = {0, 0, 0}; + +@@ -5827,22 +6378,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + } + + while (remaining) { ++ size_t compressedSeqsSize; + size_t cBlockSize; + size_t additionalByteAdjustment; +- lastBlock = remaining <= cctx->blockSize; +- blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize; ++ size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, ++ cctx->blockSize, remaining, ++ inSeqs, inSeqsSize, seqPos); ++ U32 const lastBlock = (blockSize == remaining); ++ FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); ++ assert(blockSize <= remaining); + ZSTD_resetSeqStore(&cctx->seqStore); +- DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize); ++ DEBUGLOG(5, "Working on new block. Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize); + +- additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); ++ additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes); + FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); + blockSize -= additionalByteAdjustment; + + /* If blocks are too small, emit as a nocompress block */ +- if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); ++ DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); + cSize += cBlockSize; + ip += blockSize; + op += cBlockSize; +@@ -5851,6 +6409,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + continue; + } + ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, +@@ -5859,11 +6418,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); +- DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + + if (!cctx->isFirstBlock && + ZSTD_maybeRLE(&cctx->seqStore) && +- ZSTD_isRLE((BYTE const*)src, srcSize)) { ++ ZSTD_isRLE(ip, blockSize)) { + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." 
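For orientation, here is a user-space sketch of how the explicit-block-delimiter path above is driven from the caller's side. It targets upstream libzstd's experimental API (ZSTD_STATIC_LINKING_ONLY; the ZSTD_c_blockDelimiters and ZSTD_c_validateSequences parameter names are taken from upstream zstd.h) — the in-kernel copy does not export these entry points, so treat this purely as illustration. A block delimiter is a sequence with offset == 0 and matchLength == 0, whose litLength carries the block's trailing literals:

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    const char src[] = "abcabcabc";   /* 9 bytes: 3 literals, then a 6-byte match at distance 3 */
    ZSTD_Sequence seqs[2];
    char dst[128];
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();

    memset(seqs, 0, sizeof(seqs));
    seqs[0].litLength = 3;  seqs[0].offset = 3;  seqs[0].matchLength = 6;
    seqs[1].litLength = 0;  seqs[1].offset = 0;  seqs[1].matchLength = 0;  /* delimiter, no trailing literals */

    ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters, ZSTD_sf_explicitBlockDelimiters);
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_validateSequences, 1);

    {   size_t const cSize = ZSTD_compressSequences(cctx, dst, sizeof(dst),
                                                    seqs, 2, src, sizeof(src) - 1);
        if (ZSTD_isError(cSize))
            printf("error: %s\n", ZSTD_getErrorName(cSize));
        else
            printf("compressed %u -> %zu bytes\n", (unsigned)(sizeof(src) - 1), cSize);
    }
    ZSTD_freeCCtx(cctx);
    return 0;
}

blockSize_explicitDelimiter() above would sum 3 + 6 + 0 = 9 bytes for this block, which must match the source exactly or the externalSequences_invalid error paths fire.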
+ * This is only an issue for zstd <= v1.4.3 +@@ -5874,12 +6433,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + if (compressedSeqsSize == 0) { + /* ZSTD_noCompressBlock writes the block header as well */ + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); + } else if (compressedSeqsSize == 1) { + cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); + } else { + U32 cBlockHeader; + /* Error checking and repcodes update */ +@@ -5891,11 +6450,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; +- DEBUGLOG(4, "cSize running total: %zu", cSize); + + if (lastBlock) { + break; +@@ -5906,12 +6464,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + } ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + } + ++ DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; + } + +-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, ++size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) + { +@@ -5921,7 +6482,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + size_t frameHeaderSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ +- DEBUGLOG(3, "ZSTD_compressSequences()"); ++ DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity); + assert(cctx != NULL); + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); + /* Begin writing output, starting with frame header */ +@@ -5949,26 +6510,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + cSize += 4; + } + +- DEBUGLOG(3, "Final compressed size: %zu", cSize); ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; + } + + /*====== Finalize ======*/ + ++static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) ++{ ++ const ZSTD_inBuffer nullInput = { NULL, 0, 0 }; ++ const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ return stableInput ? zcs->expectedInBuffer : nullInput; ++} ++ + /*! 
ZSTD_flushStream() : + * @return : amount of data remaining to flush */ + size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); ++ input.size = input.pos; /* do not ingest more input during flush */ + return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); + } + + + size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); +- FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed"); ++ FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); + if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ + /* single thread mode : attempt to calculate remaining to flush more precisely */ + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; +@@ -6090,7 +6659,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, + cp.targetLength = (unsigned)(-clampedCompressionLevel); + } + /* refine parameters based on srcSize & dictSize */ +- return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); + } + } + +@@ -6125,3 +6694,21 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); + } ++ ++void ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* zc, void* mState, ++ ZSTD_sequenceProducer_F* mFinder ++) { ++ if (mFinder != NULL) { ++ ZSTD_externalMatchCtx emctx; ++ emctx.mState = mState; ++ emctx.mFinder = mFinder; ++ emctx.seqBuffer = NULL; ++ emctx.seqBufferCapacity = 0; ++ zc->externalMatchCtx = emctx; ++ zc->requestedParams.useSequenceProducer = 1; ++ } else { ++ ZSTD_memset(&zc->externalMatchCtx, 0, sizeof(zc->externalMatchCtx)); ++ zc->requestedParams.useSequenceProducer = 0; ++ } ++} +diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h +index 71697a11ae30..0198c8f5cac0 100644 +--- a/lib/zstd/compress/zstd_compress_internal.h ++++ b/lib/zstd/compress/zstd_compress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,6 +21,7 @@ + ***************************************/ + #include "../common/zstd_internal.h" + #include "zstd_cwksp.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ + + + /*-************************************* +@@ -111,12 +113,13 @@ typedef struct { + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
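ZSTD_registerSequenceProducer() above is the registration hook for upstream zstd's block-level external sequence producer. A sketch of attaching a producer follows; the callback parameter list, ZSTD_SEQUENCE_PRODUCER_ERROR, and ZSTD_c_enableSeqProducerFallback are the names upstream zstd.h documents for this experimental feature and should be treated as assumptions here (the kernel build does not expose them):

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>
#include <stddef.h>

/* Trivial producer: emits no sequences and reports failure, so compression
 * falls back to the internal matchfinders when fallback is enabled.
 * Parameter list assumed from upstream's ZSTD_sequenceProducer_F typedef. */
static size_t noopProducer(void* state,
                           ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
                           const void* src, size_t srcSize,
                           const void* dict, size_t dictSize,
                           int compressionLevel, size_t windowSize)
{
    (void)state; (void)outSeqs; (void)outSeqsCapacity;
    (void)src; (void)srcSize; (void)dict; (void)dictSize;
    (void)compressionLevel; (void)windowSize;
    return ZSTD_SEQUENCE_PRODUCER_ERROR;   /* upstream's "no sequences produced" sentinel */
}

void attachProducer(ZSTD_CCtx* cctx)
{
    /* NULL producer state; passing a NULL function pointer instead clears the
     * registration, matching the else branch of the implementation above. */
    ZSTD_registerSequenceProducer(cctx, NULL, noopProducer);
    /* assumed upstream parameter name for the fallback switch */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);
}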
+ * @return : 0 on success or error code */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize); ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize); + + /* ******************************* + * Compression internals structs * +@@ -142,6 +145,12 @@ typedef struct { + size_t capacity; /* The capacity starting from `seq` pointer */ + } rawSeqStore_t; + ++typedef struct { ++ U32 idx; /* Index in array of ZSTD_Sequence */ ++ U32 posInSequence; /* Position within sequence at idx */ ++ size_t posInSrc; /* Number of bytes given by sequences provided so far */ ++} ZSTD_sequencePosition; ++ + UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; + + typedef struct { +@@ -228,6 +237,11 @@ struct ZSTD_matchState_t { + const ZSTD_matchState_t* dictMatchState; + ZSTD_compressionParameters cParams; + const rawSeqStore_t* ldmSeqStore; ++ ++ /* Controls prefetching in some dictMatchState matchfinders. ++ * This behavior is controlled from the cctx ms. ++ * This parameter has no effect in the cdict ms. */ ++ int prefetchCDictTables; + }; + + typedef struct { +@@ -324,6 +338,24 @@ struct ZSTD_CCtx_params_s { + + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ + ZSTD_customMem customMem; ++ ++ /* Controls prefetching in some dictMatchState matchfinders */ ++ ZSTD_paramSwitch_e prefetchCDictTables; ++ ++ /* Controls whether zstd will fall back to an internal matchfinder ++ * if the external matchfinder returns an error code. */ ++ int enableMatchFinderFallback; ++ ++ /* Indicates whether an external matchfinder has been referenced. ++ * Users can't set this externally. ++ * It is set internally in ZSTD_registerSequenceProducer(). */ ++ int useSequenceProducer; ++ ++ /* Adjust the max block size*/ ++ size_t maxBlockSize; ++ ++ /* Controls repcode search in external sequence parsing */ ++ ZSTD_paramSwitch_e searchForExternalRepcodes; + }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ + + #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) +@@ -355,6 +387,14 @@ typedef struct { + ZSTD_entropyCTablesMetadata_t entropyMetadata; + } ZSTD_blockSplitCtx; + ++/* Context for block-level external matchfinder API */ ++typedef struct { ++ void* mState; ++ ZSTD_sequenceProducer_F* mFinder; ++ ZSTD_Sequence* seqBuffer; ++ size_t seqBufferCapacity; ++} ZSTD_externalMatchCtx; ++ + struct ZSTD_CCtx_s { + ZSTD_compressionStage_e stage; + int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. 
*/ +@@ -404,6 +444,7 @@ struct ZSTD_CCtx_s { + + /* Stable in/out buffer verification */ + ZSTD_inBuffer expectedInBuffer; ++ size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */ + size_t expectedOutBufferSize; + + /* Dictionary */ +@@ -417,9 +458,13 @@ struct ZSTD_CCtx_s { + + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; ++ ++ /* Workspace for external matchfinder */ ++ ZSTD_externalMatchCtx externalMatchCtx; + }; + + typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; ++typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e; + + typedef enum { + ZSTD_noDict = 0, +@@ -441,7 +486,7 @@ typedef enum { + * In this mode we take both the source size and the dictionary size + * into account when selecting and adjusting the parameters. + */ +- ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. ++ ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. + * We don't know what these parameters are for. We default to the legacy + * behavior of taking both the source size and the dict size into account + * when selecting and adjusting parameters. +@@ -500,9 +545,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) + /* ZSTD_noCompressBlock() : + * Writes uncompressed block to dst buffer from given src. + * Returns the size of the block */ +-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) + { + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); ++ DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); + RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, + dstSize_tooSmall, "dst buf too small for uncompressed block"); + MEM_writeLE24(dst, cBlockHeader24); +@@ -510,7 +557,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi + return ZSTD_blockHeaderSize + srcSize; + } + +-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) + { + BYTE* const op = (BYTE*)dst; + U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); +@@ -529,7 +577,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) + { + U32 const minlog = (strat>=ZSTD_btultra) ? 
(U32)(strat) - 1 : 6; + ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); + return (srcSize >> minlog) + 2; + } + +@@ -565,29 +613,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con + while (ip < iend) *op++ = *ip++; + } + +-#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +-#define STORE_REPCODE_1 STORE_REPCODE(1) +-#define STORE_REPCODE_2 STORE_REPCODE(2) +-#define STORE_REPCODE_3 STORE_REPCODE(3) +-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) +-#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) +-#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) +-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) +-#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ +-#define STORED_TO_OFFBASE(o) ((o)+1) +-#define OFFBASE_TO_STORED(o) ((o)-1) ++ ++#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) ++#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) ++#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) ++#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ ++#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) ++#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) ++#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) ++#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) ++#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ + + /*! ZSTD_storeSeq() : +- * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. +- * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t. ++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). + * @matchLength : must be >= MINMATCH +- * Allowed to overread literals up to litLimit. ++ * Allowed to over-read literals up to litLimit. + */ + HINT_INLINE UNUSED_ATTR void + ZSTD_storeSeq(seqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, +- U32 offBase_minus1, ++ U32 offBase, + size_t matchLength) + { + BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; +@@ -596,8 +642,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); +- DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", +- pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); ++ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", ++ pos, (U32)litLength, (U32)matchLength, (U32)offBase); + } + #endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); +@@ -607,9 +653,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + assert(literals + litLength <= litLimit); + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. +- * First copy 16 bytes, because literals are likely short. +- */ +- assert(WILDCOPY_OVERLENGTH >= 16); ++ * First copy 16 bytes, because literals are likely short. 
++ */ ++ ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) { + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); +@@ -628,7 +674,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + seqStorePtr->sequences[0].litLength = (U16)litLength; + + /* match offset */ +- seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); ++ seqStorePtr->sequences[0].offBase = offBase; + + /* match Length */ + assert(matchLength >= MINMATCH); +@@ -646,17 +692,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + + /* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) +- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() ++ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() + */ + MEM_STATIC void +-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ ++ if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; +- rep[0] = STORED_OFFSET(offBase_minus1); ++ rep[0] = OFFBASE_TO_OFFSET(offBase); + } else { /* repcode */ +- U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; ++ U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + rep[2] = (repCode >= 2) ? rep[1] : rep[2]; +@@ -673,11 +719,11 @@ typedef struct repcodes_s { + } repcodes_t; + + MEM_STATIC repcodes_t +-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { + repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); +- ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); ++ ZSTD_updateRep(newReps.rep, offBase, ll0); + return newReps; + } + +@@ -685,59 +731,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 + /*-************************************* + * Match length counter + ***************************************/ +-static unsigned ZSTD_NbCommonBytes (size_t val) +-{ +- if (MEM_isLittleEndian()) { +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_ctzll((U64)val) >> 3); +-# else +- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, +- 0, 3, 1, 3, 1, 4, 2, 7, +- 0, 2, 3, 6, 1, 5, 3, 5, +- 1, 3, 4, 4, 2, 5, 6, 7, +- 7, 0, 1, 2, 3, 3, 4, 6, +- 2, 6, 5, 5, 3, 4, 5, 6, +- 7, 1, 2, 4, 6, 4, 4, 5, +- 7, 2, 6, 5, 7, 6, 7, 7 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_ctz((U32)val) >> 3); +-# else +- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, +- 3, 2, 2, 1, 3, 2, 0, 1, +- 3, 3, 1, 2, 2, 2, 2, 0, +- 3, 1, 2, 0, 1, 0, 1, 1 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +- } else { /* Big Endian CPU */ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_clzll(val) >> 3); +-# else +- unsigned r; +- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ +- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } +- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } +- r += (!val); +- return r; +-# endif +- } else 
{ /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_clz((U32)val) >> 3); +-# else +- unsigned r; +- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } +- r += (!val); +- return r; +-# endif +- } } +-} +- +- + MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) + { + const BYTE* const pStart = pIn; +@@ -783,32 +776,36 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, + * Hashes + ***************************************/ + static const U32 prime3bytes = 506832829U; +-static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } ++static U32 ZSTD_hash3(U32 u, U32 h) { assert(h <= 32); return ((u << (32-24)) * prime3bytes) >> (32-h) ; } + MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ + + static const U32 prime4bytes = 2654435761U; +-static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } +-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } ++static U32 ZSTD_hash4(U32 u, U32 h) { assert(h <= 32); return (u * prime4bytes) >> (32-h) ; } ++static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h); } + + static const U64 prime5bytes = 889523592379ULL; +-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } ++static size_t ZSTD_hash5(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } + static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } + + static const U64 prime6bytes = 227718039650203ULL; +-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } ++static size_t ZSTD_hash6(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } + static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } + + static const U64 prime7bytes = 58295818150454627ULL; +-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } ++static size_t ZSTD_hash7(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } + static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } + + static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } ++static size_t ZSTD_hash8(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u) * prime8bytes) >> (64-h)) ; } + static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } + + MEM_STATIC FORCE_INLINE_ATTR + size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ + switch(mls) + { + default: +@@ -1167,10 +1164,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + assert(blockEndIdx >= loadedDictEnd); + +- if (blockEndIdx > loadedDictEnd + maxDist) { ++ if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) { + /* On reaching window size, dictionaries are invalidated. 
+ * For simplification, if window size is reached anywhere within next block, + * the dictionary is invalidated for the full block. ++ * ++ * We also have to invalidate the dictionary if ZSTD_window_update() has detected ++ * non-contiguous segments, which means that loadedDictEnd != window->dictLimit. ++ * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use ++ * dictMatchState, so setting it to NULL is not a problem. + */ + DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); + *loadedDictEndPtr = 0; +@@ -1302,6 +1304,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) + + #endif + ++/* Short Cache */ ++ ++/* Normally, zstd matchfinders follow this flow: ++ * 1. Compute hash at ip ++ * 2. Load index from hashTable[hash] ++ * 3. Check if *ip == *(base + index) ++ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. ++ * ++ * Short cache is an optimization which allows us to avoid step 3 most of the time ++ * when the data doesn't actually match. With short cache, the flow becomes: ++ * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. ++ * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. ++ * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. ++ * ++ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to ++ * dictMatchState matchfinders. ++ */ ++#define ZSTD_SHORT_CACHE_TAG_BITS 8 ++#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) ++ ++/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. ++ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */ ++MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { ++ size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); ++ assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); ++ hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; ++} ++ ++/* Helper function for short cache matchfinders. ++ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */ ++MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { ++ U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; ++ U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; ++ return tag1 == tag2; ++} + + + /* =============================================================== +@@ -1396,4 +1434,31 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); + */ + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); + ++/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of ++ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. ++ * Note that the block delimiter must include the last literals of the block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++/* Returns the number of bytes to move the current read position back by. ++ * Only non-zero if we ended up splitting a sequence. ++ * Otherwise, it may return a ZSTD error if something went wrong. 
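The short-cache layout described above can be pictured with a standalone snippet: the low 8 bits of each hash-table entry hold a small tag of the position, the remaining bits hold the match index, and a candidate is only dereferenced when the tags agree. Names below are illustrative; the real helpers are ZSTD_writeTaggedIndex() and ZSTD_comparePackedTags():

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TAG_BITS 8                       /* mirrors ZSTD_SHORT_CACHE_TAG_BITS */
#define TAG_MASK ((1u << TAG_BITS) - 1)

int main(void)
{
    uint32_t const index = 123456;       /* match index; must fit in 32 - TAG_BITS bits */
    uint32_t const tag   = 0xA7;         /* independent 8-bit hash of the same position */
    uint32_t const entry = (index << TAG_BITS) | tag;    /* what the hash table stores */

    assert((entry >> TAG_BITS) == index);            /* index recovered */
    assert((entry & TAG_MASK) == tag);               /* tag recovered */
    /* On lookup, *(base + index) is only loaded when the stored tag equals the
     * current position's tag, skipping most cache-missing probes. */
    printf("entry=0x%08x index=%u tag=0x%02x\n",
           (unsigned)entry, (unsigned)(entry >> TAG_BITS), (unsigned)(entry & TAG_MASK));
    return 0;
}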
++ * ++ * This function will attempt to scan through blockSize bytes ++ * represented by the sequences in @inSeqs, ++ * storing any (partial) sequences. ++ * ++ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to ++ * avoid splitting a match, or to avoid splitting a match such that it would produce a match ++ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ + #endif /* ZSTD_COMPRESS_H */ +diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c +index 52b0a8059aba..3e9ea46a670a 100644 +--- a/lib/zstd/compress/zstd_compress_literals.c ++++ b/lib/zstd/compress/zstd_compress_literals.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,11 +14,36 @@ + ***************************************/ + #include "zstd_compress_literals.h" + ++ ++/* ************************************************************** ++* Debug Traces ++****************************************************************/ ++#if DEBUGLEVEL >= 2 ++ ++static size_t showHexa(const void* src, size_t srcSize) ++{ ++ const BYTE* const ip = (const BYTE*)src; ++ size_t u; ++ for (u=0; u31) + (srcSize>4095); + ++ DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity); ++ + RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); + + switch(flSize) +@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, + } + + ZSTD_memcpy(ostart + flSize, src, srcSize); +- DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); ++ DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); + return srcSize + flSize; + } + ++static int allBytesIdentical(const void* src, size_t srcSize) ++{ ++ assert(srcSize >= 1); ++ assert(src != NULL); ++ { const BYTE b = ((const BYTE*)src)[0]; ++ size_t p; ++ for (p=1; p31) + (srcSize>4095); + +- (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ ++ assert(dstCapacity >= 4); (void)dstCapacity; ++ assert(allBytesIdentical(src, srcSize)); + + switch(flSize) + { +@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* + } + + ostart[flSize] = *(const BYTE*)src; +- DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1); ++ DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1); + return flSize+1; + } + +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible) ++/* ZSTD_minLiteralsToCompress() : ++ * returns minimal amount of literals ++ * for literal compression to even be attempted. 
++ * Minimum is made tighter as compression strategy increases. ++ */ ++static size_t ++ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) ++{ ++ assert((int)strategy >= 0); ++ assert((int)strategy <= 9); ++ /* btultra2 : min 8 bytes; ++ * then 2x larger for each successive compression strategy ++ * max threshold 64 bytes */ ++ { int const shift = MIN(9-(int)strategy, 3); ++ size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift; ++ DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); ++ return mintc; ++ } ++} ++ ++size_t ZSTD_compressLiterals ( ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ void* entropyWorkspace, size_t entropyWorkspaceSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, ++ int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2) + { +- size_t const minGain = ZSTD_minGain(srcSize, strategy); + size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); + BYTE* const ostart = (BYTE*)dst; + U32 singleStream = srcSize < 256; + symbolEncodingType_e hType = set_compressed; + size_t cLitSize; + +- DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", +- disableLiteralCompression, (U32)srcSize); ++ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)", ++ disableLiteralCompression, (U32)srcSize, dstCapacity); ++ ++ DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize)); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + if (disableLiteralCompression) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + +- /* small ? don't even attempt compression (speed opt) */ +-# define COMPRESS_LITERALS_SIZE_MIN 63 +- { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; +- if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ /* if too small, don't even attempt compression (speed opt) */ ++ if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode)) ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + + RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); + { HUF_repeat repeat = prevHuf->repeatMode; +- int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; ++ int const flags = 0 ++ | (bmi2 ? HUF_flags_bmi2 : 0) ++ | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) ++ | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) ++ | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0); ++ ++ typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int); ++ huf_compress_f huf_compress; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; +- cLitSize = singleStream ? 
+- HUF_compress1X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : +- HUF_compress4X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); ++ huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat; ++ cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, ++ src, srcSize, ++ HUF_SYMBOLVALUE_MAX, LitHufLog, ++ entropyWorkspace, entropyWorkspaceSize, ++ (HUF_CElt*)nextHuf->CTable, ++ &repeat, flags); ++ DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ +- DEBUGLOG(5, "Reusing previous huffman table"); ++ DEBUGLOG(5, "reusing statistics from previous huffman block"); + hType = set_repeat; + } + } + +- if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ { size_t const minGain = ZSTD_minGain(srcSize, strategy); ++ if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); ++ } } + if (cLitSize==1) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); +- } ++ /* A return value of 1 signals that the alphabet consists of a single symbol. ++ * However, in some rare circumstances, it could be the compressed size (a single byte). ++ * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. ++ * (it's also necessary to not generate statistics). ++ * Therefore, in such a case, actively check that all bytes are identical. 
*/ ++ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); ++ } } + + if (hType == set_compressed) { + /* using a newly constructed table */ +@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); ++ if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); +diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h +index 9775fb97cb70..a2a85d6b69e5 100644 +--- a/lib/zstd/compress/zstd_compress_literals.h ++++ b/lib/zstd/compress/zstd_compress_literals.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,16 +17,24 @@ + + size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + ++/* ZSTD_compressRleLiteralsBlock() : ++ * Conditions : ++ * - All bytes in @src are identical ++ * - dstCapacity >= 4 */ + size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, ++/* ZSTD_compressLiterals(): ++ * @entropyWorkspace: must be aligned on 4-bytes boundaries ++ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE ++ * @suspectUncompressible: sampling checks, to potentially skip huffman coding ++ */ ++size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible); ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2); + + #endif /* ZSTD_COMPRESS_LITERALS_H */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c +index 21ddc1b37acf..5c028c78d889 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.c ++++ b/lib/zstd/compress/zstd_compress_sequences.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq) + { + /* Heuristic: This should cover most blocks <= 16K and + * start to fade out after 16K to about 32K depending on +- * comprssibility. ++ * compressibility. + */ + return nbSeq >= 2048; + } +@@ -166,7 +167,7 @@ ZSTD_selectEncodingType( + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { +- /* Prefer set_basic over set_rle when there are 2 or less symbols, ++ /* Prefer set_basic over set_rle when there are 2 or fewer symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h +index 7991364c2f71..7fe6f4ff5cf2 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.h ++++ b/lib/zstd/compress/zstd_compress_sequences.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c +index 17d836cc84e8..dbacbaf72733 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.c ++++ b/lib/zstd/compress/zstd_compress_superblock.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -36,13 +37,14 @@ + * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block + * and the following sub-blocks' literals sections will be Treeless_Literals_Block. + * @return : compressed size of literals section of a sub-block +- * Or 0 if it unable to compress. ++ * Or 0 if unable to compress. + * Or error code */ +-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- const BYTE* literals, size_t litSize, +- void* dst, size_t dstSize, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const BYTE* literals, size_t litSize, ++ void* dst, size_t dstSize, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + size_t const header = writeEntropy ? 200 : 0; + size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); +@@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; + size_t cLitSize = 0; + +- (void)bmi2; /* TODO bmi2... */ +- + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; +@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + +- /* TODO bmi2 */ +- { const size_t cSize = singleStream ? 
HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) +- : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); ++ { int const flags = bmi2 ? HUF_flags_bmi2 : 0; ++ const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable, flags) ++ : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable, flags); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { +@@ -126,7 +126,11 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + return op-ostart; + } + +-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { ++static size_t ++ZSTD_seqDecompressedSize(seqStore_t const* seqStore, ++ const seqDef* sequences, size_t nbSeq, ++ size_t litSize, int lastSequence) ++{ + const seqDef* const sstart = sequences; + const seqDef* const send = sequences + nbSeq; + const seqDef* sp = sstart; +@@ -156,13 +160,14 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* + * @return : compressed size of sequences section of a sub-block + * Or 0 if it is unable to compress + * Or error code. */ +-static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- const seqDef* sequences, size_t nbSeq, +- const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ const seqDef* sequences, size_t nbSeq, ++ const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + BYTE* const ostart = (BYTE*)dst; +@@ -539,7 +544,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + repcodes_t rep; + ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { +- ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); ++ ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } +diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h +index 224ece79546e..826bbc9e029b 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.h ++++ b/lib/zstd/compress/zstd_compress_superblock.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h +index 349fc923c355..ef5e65cfcf9a 100644 +--- a/lib/zstd/compress/zstd_cwksp.h ++++ b/lib/zstd/compress/zstd_cwksp.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -451,7 +452,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { +- ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); ++ ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd)); + } + ZSTD_cwksp_mark_tables_clean(ws); + } +diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c +index 76933dea2624..ab9440a99603 100644 +--- a/lib/zstd/compress/zstd_double_fast.c ++++ b/lib/zstd/compress/zstd_double_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,43 @@ + #include "zstd_compress_internal.h" + #include "zstd_double_fast.h" + ++static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashLarge = ms->hashTable; ++ U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ U32* const hashSmall = ms->chainTable; ++ U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++ /* Always insert every fastHashFillStep position into the hash tables. ++ * Insert the other positions into the large hash table if their entry ++ * is empty. 
++ */ ++ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ U32 i; ++ for (i = 0; i < fastHashFillStep; ++i) { ++ size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); ++ size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); ++ if (i == 0) { ++ ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); ++ } ++ if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { ++ ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); ++ } ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ if (dtlm == ZSTD_dtlm_fast) ++ break; ++ } } ++} ++ ++static void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -43,7 +79,19 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; +- } } ++ } } ++} ++ ++void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); ++ } + } + + +@@ -67,7 +115,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t mLength; + U32 offset; +@@ -100,8 +148,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; +- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + /* Outer Loop: one iteration per match found and stored */ +@@ -131,7 +179,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -175,9 +223,13 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + } while (ip1 <= ilimit); + + _cleanup: ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -217,7 +269,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + hashLong[hl1] = (U32)(ip1 - base); + } + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -243,7 +295,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) */ +@@ -275,7 +327,6 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; +@@ -286,8 +337,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); +- const U32 dictHBitsL = dictCParams->hashLog; +- const U32 dictHBitsS = dictCParams->chainLog; ++ const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); +@@ -295,6 +346,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + /* if a dictionary is attached, it must be within window range */ + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashLong, hashTableBytes) ++ PREFETCH_AREA(dictHashSmall, chainTableBytes) ++ } ++ + /* init */ + ip += (dictAndPrefixLength == 0); + +@@ -309,8 +367,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); +- size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); +- size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); ++ size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); ++ int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); + U32 const curr = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; 
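/*
 * Editor's note (not part of the patch): a minimal standalone sketch of the
 * "short cache" tagged-index scheme that the dictMatchState matchfinders above
 * rely on. Each hash-table slot packs a position index with an 8-bit tag taken
 * from the low bits of the hash; a lookup only dereferences base+index when the
 * stored tag matches the probe tag, which filters out most cold dictionary
 * reads. It mirrors ZSTD_writeTaggedIndex / ZSTD_comparePackedTags from the
 * hunk earlier in this patch, but the helper names, table size, and demo
 * values below are illustrative assumptions, not code from zstd.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TAG_BITS 8
#define TAG_MASK ((1u << TAG_BITS) - 1)

/* Pack (index, tag) into one 32-bit slot, in the style of ZSTD_writeTaggedIndex. */
static void write_tagged_index(uint32_t *table, size_t hashAndTag, uint32_t index)
{
    size_t const hash = hashAndTag >> TAG_BITS;             /* table position */
    uint32_t const tag = (uint32_t)(hashAndTag & TAG_MASK); /* 8-bit fingerprint */
    assert(index >> (32 - TAG_BITS) == 0);                  /* index must fit in 24 bits */
    table[hash] = (index << TAG_BITS) | tag;
}

/* Return 1 only when the two packed values carry the same low-bit tag. */
static int tags_match(size_t packed1, size_t packed2)
{
    return (packed1 & TAG_MASK) == (packed2 & TAG_MASK);
}

int main(void)
{
    uint32_t table[16] = {0};

    /* Hypothetical hash values: high bits select the slot, low 8 bits are the tag. */
    size_t const storeHashAndTag = (5u << TAG_BITS) | 0xAB; /* slot 5, tag 0xAB */
    write_tagged_index(table, storeHashAndTag, 1234);       /* remember position 1234 */

    /* Probe with a matching tag: only now would a matchfinder touch base+index. */
    size_t const probeSame = (5u << TAG_BITS) | 0xAB;
    if (tags_match(table[probeSame >> TAG_BITS], probeSame)) {
        uint32_t const index = table[probeSame >> TAG_BITS] >> TAG_BITS;
        printf("tag hit, stored index = %u\n", index);      /* prints 1234 */
    }

    /* Probe with a different tag: rejected without reading the match bytes at all. */
    size_t const probeOther = (5u << TAG_BITS) | 0x17;
    if (!tags_match(table[probeOther >> TAG_BITS], probeOther))
        printf("tag miss, skip the (likely cold) memory compare\n");

    return 0;
}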
+@@ -328,7 +390,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -340,9 +402,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL) { + /* check dictMatchState long match */ +- U32 const dictMatchIndexL = dictHashLong[dictHL]; ++ U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + +@@ -358,9 +420,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } +- } else { ++ } else if (dictTagsMatchS) { + /* check dictMatchState short match */ +- U32 const dictMatchIndexS = dictHashSmall[dictHS]; ++ U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + +@@ -375,10 +437,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + continue; + + _search_next_long: +- + { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +- size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); ++ size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; ++ U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = curr + 1; + +@@ -391,9 +454,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL3) { + /* check dict long +1 match */ +- U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; ++ U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { +@@ -419,7 +482,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + offset_2 = offset_1; + offset_1 = offset; + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -448,7 +511,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -461,8 +524,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + } /* while (ip < ilimit) */ + + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -585,7 +648,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; +@@ -596,7 +659,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +@@ -621,7 +684,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + } + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; +@@ -653,7 +716,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h +index 6822bde65a1d..0204f12e4cf7 100644 +--- a/lib/zstd/compress/zstd_double_fast.h ++++ b/lib/zstd/compress/zstd_double_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,8 @@ + #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ + + void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_doubleFast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c +index a752e6beab52..3399b39c5dbc 100644 +--- a/lib/zstd/compress/zstd_fast.c ++++ b/lib/zstd/compress/zstd_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,42 @@ + #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ + #include "zstd_fast.h" + ++static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashTable = ms->hashTable; ++ U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_full); ++ ++ /* Always insert every fastHashFillStep position into the hash table. ++ * Insert the other positions if their hash entry is empty. ++ */ ++ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } ++ ++ if (dtlm == ZSTD_dtlm_fast) continue; ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ { U32 p; ++ for (p = 1; p < fastHashFillStep; ++p) { ++ size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); ++ if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); ++ } } } } ++} ++ ++static void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) + { +@@ -25,6 +60,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + ++ /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_fast); ++ + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. 
+ */ +@@ -42,6 +81,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + } } } } + } + ++void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillHashTableForCCtx(ms, end, dtlm); ++ } ++} ++ + + /* + * If you squint hard enough (and ignore repcodes), the search operation at any +@@ -117,7 +168,7 @@ ZSTD_compressBlock_fast_noDict_generic( + + U32 rep_offset1 = rep[0]; + U32 rep_offset2 = rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ +@@ -141,8 +192,8 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const curr = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); + U32 const maxRep = curr - windowLow; +- if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; +- if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; ++ if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; ++ if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; + } + + /* start each op */ +@@ -180,8 +231,14 @@ ZSTD_compressBlock_fast_noDict_generic( + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; +- offcode = STORE_REPCODE_1; ++ offcode = REPCODE1_TO_OFFBASE; + mLength += 4; ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 is before the ++ * repcode (ip2). */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _match; + } + +@@ -195,6 +252,12 @@ ZSTD_compressBlock_fast_noDict_generic( + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 == ip0 + 1, so ++ * we know we will resume searching after ip1 */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _offset; + } + +@@ -224,6 +287,21 @@ ZSTD_compressBlock_fast_noDict_generic( + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* first write next hash table entry; we've already calculated it */ ++ if (step <= 4) { ++ /* We need to avoid writing an index into the hash table >= the ++ * position at which we will pick up our searching after we've ++ * taken this match. ++ * ++ * The minimum possible match has length 4, so the earliest ip0 ++ * can be after we take this match will be the current ip0 + 4. ++ * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely ++ * write this position. ++ */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ + goto _offset; + } + +@@ -254,9 +332,24 @@ ZSTD_compressBlock_fast_noDict_generic( + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + ++ /* When the repcodes are outside of the prefix, we set them to zero before the loop. ++ * When the offsets are still zero, we need to restore them after the block to have a correct ++ * repcode history. If only one offset was invalid, it is easy. The tricky case is when both ++ * offsets were invalid. We need to figure out which offset to refill with. ++ * - If both offsets are zero they are in the same order. ++ * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. 
++ * - If only one is zero, we need to decide which offset to restore. ++ * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. ++ * - It is impossible for rep_offset2 to be non-zero. ++ * ++ * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then ++ * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. ++ */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; +- rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; ++ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; ++ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -267,7 +360,7 @@ ZSTD_compressBlock_fast_noDict_generic( + match0 = base + idx; + rep_offset2 = rep_offset1; + rep_offset1 = (U32)(ip0-match0); +- offcode = STORE_OFFSET(rep_offset1); ++ offcode = OFFSET_TO_OFFBASE(rep_offset1); + mLength = 4; + + /* Count the backwards match length. */ +@@ -287,11 +380,6 @@ ZSTD_compressBlock_fast_noDict_generic( + ip0 += mLength; + anchor = ip0; + +- /* write next hash table entry */ +- if (ip1 < ip0) { +- hashTable[hash1] = (U32)(ip1 - base); +- } +- + /* Fill table and check for immediate repcode. */ + if (ip0 <= ilimit) { + /* Fill Table */ +@@ -306,7 +394,7 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; +- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + anchor = ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) */ + } } } +@@ -380,14 +468,14 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; ++ const BYTE* ip0 = istart; ++ const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ + const BYTE* anchor = istart; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; +@@ -397,13 +485,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); +- const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); +- const U32 dictHLog = dictCParams->hashLog; ++ const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); ++ const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + + /* if a dictionary is still attached, it necessarily means that + * it is within window size. So we just check it. 
*/ + const U32 maxDistance = 1U << cParams->windowLog; +- const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); ++ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + assert(endIndex - prefixStartIndex <= maxDistance); + (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ + +@@ -413,106 +501,155 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + * when translating a dict index into a local index */ + assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashTable, hashTableBytes) ++ } ++ + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); +- ip += (dictAndPrefixLength == 0); ++ ip0 += (dictAndPrefixLength == 0); + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + +- /* Main Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ ++ /* Outer search loop */ ++ assert(stepSize >= 1); ++ while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ + size_t mLength; +- size_t const h = ZSTD_hashPtr(ip, hlog, mls); +- U32 const curr = (U32)(ip-base); +- U32 const matchIndex = hashTable[h]; +- const BYTE* match = base + matchIndex; +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* repMatch = (repIndex < prefixStartIndex) ? +- dictBase + (repIndex - dictIndexDelta) : +- base + repIndex; +- hashTable[h] = curr; /* update hash table */ +- +- if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); +- } else if ( (matchIndex <= prefixStartIndex) ) { +- size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); +- U32 const dictMatchIndex = dictHashTable[dictHash]; +- const BYTE* dictMatch = dictBase + dictMatchIndex; +- if (dictMatchIndex <= dictStartIndex || +- MEM_read32(dictMatch) != MEM_read32(ip)) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a dict match */ +- U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); +- mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; +- while (((ip>anchor) & (dictMatch>dictStart)) +- && (ip[-1] == dictMatch[-1])) { +- ip--; dictMatch--; mLength++; ++ size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ ++ size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); ++ U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); ++ ++ U32 matchIndex = hashTable[hash0]; ++ U32 curr = (U32)(ip0 - base); ++ size_t step = stepSize; ++ const size_t kStepIncr = 1 << kSearchStrength; ++ const BYTE* nextStep = ip0 + kStepIncr; ++ ++ /* Inner search loop */ ++ while (1) { ++ const BYTE* match = base + matchIndex; ++ const U32 repIndex = curr + 1 - offset_1; ++ const BYTE* repMatch = (repIndex < prefixStartIndex) ? 
++ dictBase + (repIndex - dictIndexDelta) : ++ base + repIndex; ++ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); ++ hashTable[hash0] = curr; /* update hash table */ ++ ++ if (((U32) ((prefixStartIndex - 1) - repIndex) >= ++ 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ ++ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { ++ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; ++ mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; ++ ip0++; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); ++ break; ++ } ++ ++ if (dictTagsMatch) { ++ /* Found a possible dict match */ ++ const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* dictMatch = dictBase + dictMatchIndex; ++ if (dictMatchIndex > dictStartIndex && ++ MEM_read32(dictMatch) == MEM_read32(ip0)) { ++ /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ ++ if (matchIndex <= prefixStartIndex) { ++ U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); ++ mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; ++ while (((ip0 > anchor) & (dictMatch > dictStart)) ++ && (ip0[-1] == dictMatch[-1])) { ++ ip0--; ++ dictMatch--; ++ mLength++; ++ } /* catch up */ ++ offset_2 = offset_1; ++ offset_1 = offset; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; ++ } ++ } ++ } ++ ++ if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) { ++ /* found a regular match */ ++ U32 const offset = (U32) (ip0 - match); ++ mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; ++ while (((ip0 > anchor) & (match > prefixStart)) ++ && (ip0[-1] == match[-1])) { ++ ip0--; ++ match--; ++ mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; + } +- } else if (MEM_read32(match) != MEM_read32(ip)) { +- /* it's not a match, and we're not going to check the dictionary */ +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a regular match */ +- U32 const offset = (U32)(ip-match); +- mLength = ZSTD_count(ip+4, match+4, iend) + 4; +- while (((ip>anchor) & (match>prefixStart)) +- && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; +- offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- } ++ ++ /* Prepare for next iteration */ ++ dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); ++ matchIndex = hashTable[hash1]; ++ ++ if (ip1 >= nextStep) { ++ step++; ++ nextStep += kStepIncr; ++ } ++ ip0 = ip1; ++ ip1 = ip1 + step; ++ if (ip1 > ilimit) goto _cleanup; ++ ++ curr = (U32)(ip0 - base); ++ hash0 = hash1; ++ } /* end inner search loop */ + + /* match found */ +- ip += mLength; +- anchor = ip; ++ assert(mLength); ++ ip0 += mLength; ++ anchor = ip0; + +- if (ip <= ilimit) { ++ if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+curr+2 > istart); 
/* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); ++ while (ip0 <= ilimit) { ++ U32 const current2 = (U32)(ip0-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { ++ && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; ++ ip0 += repLength2; ++ anchor = ip0; + continue; + } + break; + } + } ++ ++ /* Prepare for next iteration */ ++ assert(ip0 == anchor); ++ ip1 = ip0 + stepSize; + } + ++_cleanup: + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -553,11 +690,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ +- U32 const stepSize = cParams->targetLength + !(cParams->targetLength); ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); +@@ -570,6 +706,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; ++ ++ const BYTE* ip0 = istart; ++ const BYTE* ip1; ++ const BYTE* ip2; ++ const BYTE* ip3; ++ U32 current0; ++ ++ ++ size_t hash0; /* hash for ip0 */ ++ size_t hash1; /* hash for ip1 */ ++ U32 idx; /* match idx for ip0 */ ++ const BYTE* idxBase; /* base pointer for idx */ ++ ++ U32 offcode; ++ const BYTE* match0; ++ size_t mLength; ++ const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */ ++ ++ size_t step; ++ const BYTE* nextStep; ++ const size_t kStepIncr = (1 << (kSearchStrength - 1)); + + (void)hasStep; /* not currently specialized on whether it's accelerated */ + +@@ -579,75 +737,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + if (prefixStartIndex == dictStartIndex) + return 
ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); + +- /* Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because (ip+1) */ +- const size_t h = ZSTD_hashPtr(ip, hlog, mls); +- const U32 matchIndex = hashTable[h]; +- const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; +- const BYTE* match = matchBase + matchIndex; +- const U32 curr = (U32)(ip-base); +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; +- const BYTE* const repMatch = repBase + repIndex; +- hashTable[h] = curr; /* update hash table */ +- DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); +- +- if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ +- & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); +- ip += rLength; +- anchor = ip; +- } else { +- if ( (matchIndex < dictStartIndex) || +- (MEM_read32(match) != MEM_read32(ip)) ) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; ++ { U32 const curr = (U32)(ip0 - base); ++ U32 const maxRep = curr - dictStartIndex; ++ if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; ++ } ++ ++ /* start each op */ ++_start: /* Requires: ip0 */ ++ ++ step = stepSize; ++ nextStep = ip0 + kStepIncr; ++ ++ /* calculate positions, ip0 - anchor == 0, so we skip step calc */ ++ ip1 = ip0 + 1; ++ ip2 = ip0 + step; ++ ip3 = ip2 + 1; ++ ++ if (ip3 >= ilimit) { ++ goto _cleanup; ++ } ++ ++ hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ ++ idx = hashTable[hash0]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ do { ++ { /* load repcode match for ip[2] */ ++ U32 const current2 = (U32)(ip2 - base); ++ U32 const repIndex = current2 - offset_1; ++ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; ++ U32 rval; ++ if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ ++ & (offset_1 > 0) ) { ++ rval = MEM_read32(repBase + repIndex); ++ } else { ++ rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ + } +- { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; +- const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart; +- U32 const offset = curr - matchIndex; +- size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; +- while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = offset; /* update offset history */ +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- ip += mLength; +- anchor = ip; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ /* check repcode at ip[2] */ ++ if (MEM_read32(ip2) == rval) { ++ ip0 = ip2; ++ match0 = repBase + repIndex; ++ matchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; ++ assert((match0 != prefixStart) & (match0 != dictStart)); ++ mLength = ip0[-1] == match0[-1]; ++ ip0 -= mLength; ++ match0 -= mLength; ++ offcode = REPCODE1_TO_OFFBASE; ++ mLength += 4; ++ goto _match; + } } + +- if (ip <= ilimit) { +- /* Fill Table */ +- hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); +- /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); +- U32 const repIndex2 = current2 - offset_2; +- const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { +- const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; +- { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; +- continue; +- } +- break; +- } } } ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip3; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip0 + step; ++ ip3 = ip1 + step; ++ ++ /* calculate step */ ++ if (ip2 >= nextStep) { ++ step++; ++ PREFETCH_L1(ip1 + 64); ++ PREFETCH_L1(ip1 + 128); ++ nextStep += kStepIncr; ++ } ++ } while (ip3 < ilimit); ++ ++_cleanup: ++ /* Note that there are probably still a couple positions we could search. ++ * However, it seems to be a meaningful performance hit to try to search ++ * them. So let's not. */ ++ ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ +- rep[0] = offset_1; +- rep[1] = offset_2; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); ++ ++_offset: /* Requires: ip0, idx, idxBase */ ++ ++ /* Compute the offset code. 
*/ ++ { U32 const offset = current0 - idx; ++ const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart; ++ matchEnd = idx < prefixStartIndex ? dictEnd : iend; ++ match0 = idxBase + idx; ++ offset_2 = offset_1; ++ offset_1 = offset; ++ offcode = OFFSET_TO_OFFBASE(offset); ++ mLength = 4; ++ ++ /* Count the backwards match length. */ ++ while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { ++ ip0--; ++ match0--; ++ mLength++; ++ } } ++ ++_match: /* Requires: ip0, match0, offcode, matchEnd */ ++ ++ /* Count the forward length. */ ++ assert(matchEnd != 0); ++ mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); ++ ++ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); ++ ++ ip0 += mLength; ++ anchor = ip0; ++ ++ /* write next hash table entry */ ++ if (ip1 < ip0) { ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ ++ /* Fill table and check for immediate repcode. */ ++ if (ip0 <= ilimit) { ++ /* Fill Table */ ++ assert(base+current0+2 > istart); /* check base overflow */ ++ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); ++ ++ while (ip0 <= ilimit) { ++ U32 const repIndex2 = (U32)(ip0-base) - offset_2; ++ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; ++ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */ ++ && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { ++ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); ++ ip0 += repLength2; ++ anchor = ip0; ++ continue; ++ } ++ break; ++ } } ++ ++ goto _start; + } + + ZSTD_GEN_FAST_FN(extDict, 4, 0) +@@ -660,6 +945,7 @@ size_t ZSTD_compressBlock_fast_extDict( + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; ++ assert(ms->dictMatchState == NULL); + switch(mls) + { + default: /* includes case 3 */ +diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h +index fddc2f532d21..e64d9e1b2d39 100644 +--- a/lib/zstd/compress/zstd_fast.h ++++ b/lib/zstd/compress/zstd_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,8 @@ + #include "zstd_compress_internal.h" + + void ZSTD_fillHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_fast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c +index 0298a01a7504..83727cd46f91 100644 +--- a/lib/zstd/compress/zstd_lazy.c ++++ b/lib/zstd/compress/zstd_lazy.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -10,6 +11,7 @@ + + #include "zstd_compress_internal.h" + #include "zstd_lazy.h" ++#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ + + + /*-************************************* +@@ -197,8 +199,8 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", +- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); ++ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ +@@ -218,7 +220,7 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + } +@@ -230,7 +232,7 @@ ZSTD_DUBT_findBetterDictMatch ( + static size_t + ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +- size_t* offsetPtr, ++ size_t* offBasePtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +@@ -327,8 +329,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; +- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) ++ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode == ZSTD_dictMatchState) { + 
nbCompares = 0; /* in addition to avoiding checking any +@@ -361,16 +363,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, +- offsetPtr, bestLength, nbCompares, ++ offBasePtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", +- curr, (U32)bestLength, (U32)*offsetPtr, mIndex); ++ curr, (U32)bestLength, (U32)*offBasePtr, mIndex); + } + return bestLength; + } +@@ -381,14 +383,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + FORCE_INLINE_TEMPLATE size_t + ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +- size_t* offsetPtr, ++ size_t* offBasePtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) + { + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); +- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); ++ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); + } + + /* ********************************* +@@ -561,7 +563,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; +@@ -598,7 +600,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -691,7 +693,8 @@ size_t ZSTD_HcFindBestMatch( + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -703,7 +706,7 @@ size_t ZSTD_HcFindBestMatch( + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -739,7 +742,7 @@ size_t ZSTD_HcFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + 
dmsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -757,7 +760,6 @@ size_t ZSTD_HcFindBestMatch( + ***********************************/ + /* Constants for row-based hash */ + #define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ + #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) + #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +@@ -769,29 +771,8 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { +- assert(val != 0); +-# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) +- if (sizeof(size_t) == 4) { +- U32 mostSignificantWord = (U32)(val >> 32); +- U32 leastSignificantWord = (U32)val; +- if (leastSignificantWord == 0) { +- return 32 + (U32)__builtin_ctz(mostSignificantWord); +- } else { +- return (U32)__builtin_ctz(leastSignificantWord); +- } +- } else { +- return (U32)__builtin_ctzll(val); +- } +-# else +- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count +- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer +- */ +- val = ~val & (val - 1ULL); /* Lowest set bit mask */ +- val = val - ((val >> 1) & 0x5555555555555555); +- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); +- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); +-# endif ++MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { ++ return ZSTD_countTrailingZeros64(val); + } + + /* ZSTD_rotateRight_*(): +@@ -971,7 +952,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); ++} ++ ++/* Returns the mask width of bits group of which will be set to 1. Given not all ++ * architectures have easy movemask instruction, this helps to iterate over ++ * groups of bits easier and faster. ++ */ ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ (void)rowEntries; ++#if defined(ZSTD_ARCH_ARM_NEON) ++ /* NEON path only works for little endian */ ++ if (!MEM_isLittleEndian()) { ++ return 1; ++ } ++ if (rowEntries == 16) { ++ return 4; ++ } ++ if (rowEntries == 32) { ++ return 2; ++ } ++ if (rowEntries == 64) { ++ return 1; ++ } ++#endif ++ return 1; + } + + #if defined(ZSTD_ARCH_X86_SSE2) +@@ -994,71 +1003,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U + } + #endif + +-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches +- * the hash at the nth position in a row of the tagTable. +- * Each row is a circular buffer beginning at the value of "head". 
So we must rotate the "matches" bitfield +- * to match up with the actual layout of the entries within the hashTable */ ++#if defined(ZSTD_ARCH_ARM_NEON) + FORCE_INLINE_TEMPLATE ZSTD_VecMask +-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) ++ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ if (rowEntries == 16) { ++ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. ++ * After that groups of 4 bits represent the equalMask. We lower ++ * all bits except the highest in these groups by doing AND with ++ * 0x88 = 0b10001000. ++ */ ++ const uint8x16_t chunk = vld1q_u8(src); ++ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); ++ const uint8x8_t res = vshrn_n_u16(equalMask, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; ++ } else if (rowEntries == 32) { ++ /* Same idea as with rowEntries == 16 but doing AND with ++ * 0x55 = 0b01010101. ++ */ ++ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); ++ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); ++ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); ++ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); ++ const uint8x8_t res = vsli_n_u8(t0, t1, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; ++ } else { /* rowEntries == 64 */ ++ const uint8x16x4_t chunk = vld4q_u8(src); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); ++ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); ++ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); ++ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); ++ ++ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); ++ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); ++ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); ++ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); ++ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped); ++ } ++} ++#endif ++ ++/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by ++ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" ++ * matches the hash at the nth position in a row of the tagTable. ++ * Each row is a circular buffer beginning at the value of "headGrouped". 
So we ++ * must rotate the "matches" bitfield to match up with the actual layout of the ++ * entries within the hashTable */ ++FORCE_INLINE_TEMPLATE ZSTD_VecMask ++ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) + { + const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); + + #if defined(ZSTD_ARCH_X86_SSE2) + +- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); ++ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); + + #else /* SW or NEON-LE */ + + # if defined(ZSTD_ARCH_ARM_NEON) + /* This NEON path only works for little endian - otherwise use SWAR below */ + if (MEM_isLittleEndian()) { +- if (rowEntries == 16) { +- const uint8x16_t chunk = vld1q_u8(src); +- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); +- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); +- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); +- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); +- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); +- const U16 hi = (U16)vgetq_lane_u8(t3, 8); +- const U16 lo = (U16)vgetq_lane_u8(t3, 0); +- return ZSTD_rotateRight_U16((hi << 8) | lo, head); +- } else if (rowEntries == 32) { +- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); +- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); +- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); +- const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag)); +- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); +- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); +- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); +- const uint8x8_t t0 = vreinterpret_u8_s8(pack0); +- const uint8x8_t t1 = vreinterpret_u8_s8(pack1); +- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); +- const uint8x8x2_t t3 = vuzp_u8(t2, t0); +- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); +- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); +- return ZSTD_rotateRight_U32(matches, head); +- } else { /* rowEntries == 64 */ +- const uint8x16x4_t chunk = vld4q_u8(src); +- const uint8x16_t dup = vdupq_n_u8(tag); +- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); +- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); +- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); +- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); +- +- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); +- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); +- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); +- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); +- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); +- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); +- return ZSTD_rotateRight_U64(matches, head); +- } ++ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); + } + # endif /* ZSTD_ARCH_ARM_NEON */ + /* SWAR */ +- { const size_t chunkSize = sizeof(size_t); ++ { const int chunkSize = sizeof(size_t); + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); + const size_t xFF = ~((size_t)0); + const size_t x01 = xFF / 0xFF; +@@ -1091,11 +1111,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + } + matches = ~matches; + 
if (rowEntries == 16) { +- return ZSTD_rotateRight_U16((U16)matches, head); ++ return ZSTD_rotateRight_U16((U16)matches, headGrouped); + } else if (rowEntries == 32) { +- return ZSTD_rotateRight_U32((U32)matches, head); ++ return ZSTD_rotateRight_U32((U32)matches, headGrouped); + } else { +- return ZSTD_rotateRight_U64((U64)matches, head); ++ return ZSTD_rotateRight_U64((U64)matches, headGrouped); + } + } + #endif +@@ -1143,6 +1163,7 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ ++ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; + +@@ -1185,15 +1206,15 @@ size_t ZSTD_RowFindBestMatch( + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); +- U32 const head = *tagRow & rowMask; ++ U32 const headGrouped = (*tagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); + + /* Cycle through the matches and prefetch */ + for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = row[matchPos]; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) +@@ -1224,7 +1245,8 @@ size_t ZSTD_RowFindBestMatch( + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -1236,7 +1258,7 @@ size_t ZSTD_RowFindBestMatch( + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -1254,14 +1276,14 @@ size_t ZSTD_RowFindBestMatch( + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + +- { U32 const head = *dmsTagRow & rowMask; ++ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); + + for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; + if (matchIndex < dmsLowestIndex) + break; +@@ -1285,7 
+1307,7 @@ size_t ZSTD_RowFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } +@@ -1491,7 +1513,8 @@ ZSTD_compressBlock_lazy_generic( + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + +- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; ++ U32 offset_1 = rep[0], offset_2 = rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const int isDMS = dictMode == ZSTD_dictMatchState; + const int isDDS = dictMode == ZSTD_dedicatedDictSearch; +@@ -1512,8 +1535,8 @@ ZSTD_compressBlock_lazy_generic( + U32 const curr = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); + U32 const maxRep = curr - windowLow; +- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + if (isDxS) { + /* dictMatchState repCode checks don't currently handle repCode == 0 +@@ -1537,7 +1560,7 @@ ZSTD_compressBlock_lazy_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); + +@@ -1562,10 +1585,10 @@ ZSTD_compressBlock_lazy_generic( + } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); ++ { size_t offbaseFound = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = offbaseFound; + } + + if (matchLength < 4) { +@@ -1579,12 +1602,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 1"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1596,17 +1619,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1615,12 +1638,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 2"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1632,17 +1655,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ +@@ -1653,24 +1676,24 @@ ZSTD_compressBlock_lazy_generic( + * notably if `value` is unsigned, resulting in a large positive `-value`. + */ + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { ++ if (OFFBASE_IS_OFFSET(offBase)) { + if (dictMode == ZSTD_noDict) { +- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) +- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ ++ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) ++ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (isDxS) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } + +@@ -1686,8 +1709,8 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? 
dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; +@@ -1701,16 +1724,20 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + +- /* Save reps for next block */ +- rep[0] = offset_1 ? offset_1 : savedOffset; +- rep[1] = offset_2 ? offset_2 : savedOffset; ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ ++ /* save reps for next block */ ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -1903,7 +1930,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + U32 curr = (U32)(ip-base); + +@@ -1922,10 +1949,10 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + } } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = ofbCandidate; + } + + if (matchLength < 4) { +@@ -1939,7 +1966,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1951,18 +1978,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 1 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1971,7 +1998,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1983,36 +2010,36 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 2 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ if (OFFBASE_IS_OFFSET(offBase)) { ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } + +@@ -2029,8 +2056,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -2096,7 +2123,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row( + size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); + } +diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h +index e5bdf4df8dde..9505bed93c03 100644 +--- a/lib/zstd/compress/zstd_lazy.h ++++ b/lib/zstd/compress/zstd_lazy.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -22,6 +23,8 @@ + */ + #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 + ++#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ ++ + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); + void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); + +@@ -113,7 +116,7 @@ size_t ZSTD_compressBlock_lazy2_extDict_row( + size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- ++ + + + #endif /* ZSTD_LAZY_H */ +diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c +index dd86fc83e7dd..b7da76b0db7c 100644 +--- a/lib/zstd/compress/zstd_ldm.c ++++ b/lib/zstd/compress/zstd_ldm.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -242,11 +243,11 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, + switch(ms->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_greedy: +@@ -549,7 +550,7 @@ size_t ZSTD_ldm_generateSequences( + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the +- * the offset against maxDist directly. ++ * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may +@@ -711,7 +712,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, +- STORE_OFFSET(sequence.offset), ++ OFFSET_TO_OFFBASE(sequence.offset), + sequence.matchLength); + ip += sequence.matchLength; + } +diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h +index fbc6a5e88fd7..c540731abde7 100644 +--- a/lib/zstd/compress/zstd_ldm.h ++++ b/lib/zstd/compress/zstd_ldm.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h +index 647f865be290..cfccfc46f6f7 100644 +--- a/lib/zstd/compress/zstd_ldm_geartab.h ++++ b/lib/zstd/compress/zstd_ldm_geartab.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c +index fd82acfda62f..a6bf7f856437 100644 +--- a/lib/zstd/compress/zstd_opt.c ++++ b/lib/zstd/compress/zstd_opt.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,7 @@ + #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ + #define ZSTD_MAX_PRICE (1<<30) + +-#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ ++#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ + + + /*-************************************* +@@ -26,27 +27,35 @@ + #if 0 /* approximation at bit level (for tests) */ + # define BITCOST_ACCURACY 0 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) ++# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat)) + #elif 0 /* fractional bit accuracy (for tests) */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) ++# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat)) + #else /* opt==approx, ultra==accurate */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) ++# define WEIGHT(stat,opt) ((opt) ? 
ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) + #endif + ++/* ZSTD_bitWeight() : ++ * provide estimated "cost" of a stat in full bits only */ + MEM_STATIC U32 ZSTD_bitWeight(U32 stat) + { + return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); + } + ++/* ZSTD_fracWeight() : ++ * provide fractional-bit "cost" of a stat, ++ * using linear interpolation approximation */ + MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + { + U32 const stat = rawStat + 1; + U32 const hb = ZSTD_highbit32(stat); + U32 const BWeight = hb * BITCOST_MULTIPLIER; ++ /* Fweight was meant for "Fractional weight" ++ * but it's effectively a value between 1 and 2 ++ * using fixed point arithmetic */ + U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + BITCOST_ACCURACY < 31); +@@ -57,7 +66,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + /* debugging function, + * @return price in bytes as fractional value + * for debug messages only */ +-MEM_STATIC double ZSTD_fCost(U32 price) ++MEM_STATIC double ZSTD_fCost(int price) + { + return (double)price / (BITCOST_MULTIPLIER*8); + } +@@ -88,20 +97,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts) + return total; + } + +-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift) ++typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e; ++ ++static U32 ++ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1) + { + U32 s, sum=0; +- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift); ++ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", ++ (unsigned)lastEltIndex+1, (unsigned)shift ); + assert(shift < 30); + for (s=0; s> shift); +- sum += table[s]; ++ unsigned const base = base1 ? 
1 : (table[s]>0); ++ unsigned const newStat = base + (table[s] >> shift); ++ sum += newStat; ++ table[s] = newStat; + } + return sum; + } + + /* ZSTD_scaleStats() : +- * reduce all elements in table is sum too large ++ * reduce all elt frequencies in table if sum too large + * return the resulting sum of elements */ + static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + { +@@ -110,7 +125,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <= 1) return prevsum; +- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); ++ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); + } + + /* ZSTD_rescaleFreqs() : +@@ -129,18 +144,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); + optPtr->priceType = zop_dynamic; + +- if (optPtr->litLengthSum == 0) { /* first block : init */ +- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ +- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); ++ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ ++ ++ /* heuristic: use pre-defined stats for too small inputs */ ++ if (srcSize <= ZSTD_PREDEF_THRESHOLD) { ++ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); + optPtr->priceType = zop_predef; + } + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { +- /* huffman table presumed generated by dictionary */ ++ ++ /* huffman stats covering the full value set : table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; + + if (compressedLiterals) { ++ /* generate literals statistics from huffman table */ + unsigned lit; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; +@@ -188,13 +207,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + +- } else { /* not a dictionary */ ++ } else { /* first block, no dictionary */ + + assert(optPtr->litFreq != NULL); + if (compressedLiterals) { ++ /* base initial cost of literals on direct frequency within src */ + unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ +- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); ++ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); + } + + { unsigned const baseLLfreqs[MaxLL+1] = { +@@ -224,10 +244,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); + } + +- + } + +- } else { /* new block : re-use previous statistics, scaled down */ ++ } else { /* new block : scale down accumulated statistics */ + + if (compressedLiterals) + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); +@@ -255,11 +274,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ + + /* dynamic statistics */ +- { U32 price = litLength * optPtr->litSumBasePrice; ++ { U32 price = optPtr->litSumBasePrice * litLength; ++ U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; + U32 u; ++ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); + for (u=0; u < litLength; u++) { +- 
assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ +- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; ++ price -= litPrice; + } + return price; + } +@@ -272,10 +294,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return WEIGHT(litLength, optLevel); +- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX +- * because it isn't representable in the zstd format. So instead just +- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block +- * would be all literals. ++ ++ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX ++ * because it isn't representable in the zstd format. ++ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. ++ * In such a case, the block would be all literals. + */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); +@@ -289,24 +312,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + } + + /* ZSTD_getMatchPrice() : +- * Provides the cost of the match part (offset + matchLength) of a sequence ++ * Provides the cost of the match part (offset + matchLength) of a sequence. + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. +- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 ++ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ + FORCE_INLINE_TEMPLATE U32 +-ZSTD_getMatchPrice(U32 const offcode, ++ZSTD_getMatchPrice(U32 const offBase, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) + { + U32 price; +- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); ++ U32 const offCode = ZSTD_highbit32(offBase); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + +- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ +- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); ++ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ ++ return WEIGHT(mlBase, optLevel) ++ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); +@@ -325,10 +349,10 @@ ZSTD_getMatchPrice(U32 const offcode, + } + + /* ZSTD_updateStats() : +- * assumption : literals + litLengtn <= iend */ ++ * assumption : literals + litLength <= iend */ + static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, +- U32 offsetCode, U32 matchLength) ++ U32 offBase, U32 matchLength) + { + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { +@@ -344,8 +368,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, + optPtr->litLengthSum++; + } + +- /* offset code : expected to follow storeSeq() numeric representation */ +- { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); ++ /* offset code : follows 
storeSeq() numeric representation */ ++ { U32 const offCode = ZSTD_highbit32(offBase); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; +@@ -552,16 +576,17 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { + ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict); + } + +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_insertBtAndGetAllMatches ( +- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ +- ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, +- const U32 rep[ZSTD_REP_NUM], +- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ +- const U32 lengthToBeat, +- U32 const mls /* template */) ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_insertBtAndGetAllMatches ( ++ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ ++ ZSTD_matchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip, const BYTE* const iLimit, ++ const ZSTD_dictMode_e dictMode, ++ const U32 rep[ZSTD_REP_NUM], ++ const U32 ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ ++ const U32 lengthToBeat, ++ const U32 mls /* template */) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); +@@ -644,7 +669,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; +- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ ++ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) +@@ -673,7 +698,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + bestLength = mlen; + assert(curr > matchIndex3); + assert(mnum==0); /* no prior solution */ +- matches[0].off = STORE_OFFSET(curr - matchIndex3); ++ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | +@@ -706,13 +731,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + } + + if (matchLength > bestLength) { +- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -754,12 +779,12 @@ U32 ZSTD_insertBtAndGetAllMatches ( + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; +- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; 
+ bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -960,7 +985,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) + { + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; +- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ ++ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; + + /* Ensure that current block position is not outside of the match */ +@@ -971,11 +996,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + } + + if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { +- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); +- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", +- candidateOffCode, candidateMatchLength, currPosInBlock); ++ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); ++ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", ++ candidateOffBase, candidateMatchLength, currPosInBlock); + matches[*nbMatches].len = candidateMatchLength; +- matches[*nbMatches].off = candidateOffCode; ++ matches[*nbMatches].off = candidateOffBase; + (*nbMatches)++; + } + } +@@ -1098,14 +1123,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + /* large match -> immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; +- U32 const maxOffcode = matches[nbMatches-1].off; +- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", +- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); ++ U32 const maxOffBase = matches[nbMatches-1].off; ++ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", ++ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { + lastSequence.litlen = litlen; + lastSequence.mlen = maxML; +- lastSequence.off = maxOffcode; ++ lastSequence.off = maxOffBase; + DEBUGLOG(6, "large match (%u>%u), immediate encoding", + maxML, sufficient_len); + cur = 0; +@@ -1122,15 +1147,15 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { +- U32 const offcode = matches[matchNb].off; ++ U32 const offBase = matches[matchNb].off; + U32 const end = matches[matchNb].len; + for ( ; pos <= end ; pos++ ) { +- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); ++ U32 const matchPrice = ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); + U32 const sequencePrice = literalsPrice + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", +- pos, ZSTD_fCost(sequencePrice)); ++ pos, ZSTD_fCost((int)sequencePrice)); + opt[pos].mlen = pos; +- opt[pos].off = offcode; ++ opt[pos].off = offBase; + opt[pos].litlen = litlen; + opt[pos].price = (int)sequencePrice; + } } +@@ -1230,7 +1255,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const startML = 
(matchNb>0) ? matches[matchNb-1].len+1 : minMatch; + U32 mlen; + +- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", ++ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", + matchNb, matches[matchNb].off, lastML, litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ +@@ -1296,7 +1321,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; +- U32 const offCode = opt[storePos].off; ++ U32 const offBase = opt[storePos].off; + U32 const advance = llen + mlen; + DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", + anchor - istart, (unsigned)llen, (unsigned)mlen); +@@ -1308,8 +1333,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + } + + assert(anchor + llen <= iend); +- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); +- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); ++ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); ++ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); + anchor += advance; + ip = anchor; + } } +@@ -1349,7 +1374,7 @@ size_t ZSTD_compressBlock_btopt( + /* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate starting values. + * only works on first block, with no dictionary and no ldm. +- * this function cannot error, hence its contract must be respected. ++ * this function cannot error out, its narrow contract must be respected. + */ + static void + ZSTD_initStats_ultra(ZSTD_matchState_t* ms, +@@ -1368,7 +1393,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ + +- /* invalidate first scan from history */ ++ /* invalidate first scan from history, only keep entropy stats */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; +@@ -1392,20 +1417,20 @@ size_t ZSTD_compressBlock_btultra2( + U32 const curr = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + +- /* 2-pass strategy: ++ /* 2-passes strategy: + * this strategy makes a first pass over first block to collect statistics +- * and seed next round's statistics with it. +- * After 1st pass, function forgets everything, and starts a new block. ++ * in order to seed next round's statistics with it. ++ * After 1st pass, function forgets history, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. + * The compression ratio gain is generally small (~0.5% on first block), +- * the cost is 2x cpu time on first block. */ ++ ** the cost is 2x cpu time on first block. 
*/ + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ +- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ +- && (srcSize > ZSTD_PREDEF_THRESHOLD) ++ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ ++ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } +diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h +index 22b862858ba7..faa73ff4b03d 100644 +--- a/lib/zstd/compress/zstd_opt.h ++++ b/lib/zstd/compress/zstd_opt.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c +index 60958afebc41..28a036f7543b 100644 +--- a/lib/zstd/decompress/huf_decompress.c ++++ b/lib/zstd/decompress/huf_decompress.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,10 +20,10 @@ + #include "../common/compiler.h" + #include "../common/bitstream.h" /* BIT_* */ + #include "../common/fse.h" /* to compress headers */ +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/error_private.h" + #include "../common/zstd_internal.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ + + /* ************************************************************** + * Constants +@@ -43,27 +44,25 @@ + #error "Cannot force the use of the X1 and X2 decoders at the same time!" + #endif + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE ++/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is ++ * supported at runtime, so we can add the BMI2 target attribute. ++ * When it is disabled, we will still get BMI2 if it is enabled statically. 
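
The reworked decoders in the hunks that follow also replace the old int bmi2 argument with a single flags word, tested further down as HUF_flags_bmi2, HUF_flags_disableAsm and HUF_flags_disableFast. A hypothetical caller-side sketch of how such a flags word might be assembled; the helper name and the CPU-detection parameter are invented for illustration and are not part of this patch.

    /* Sketch only: cpu_has_bmi2 stands in for whatever runtime CPU detection
     * the caller already performs.
     */
    static int huf_decoder_flags(int cpu_has_bmi2, int allow_asm)
    {
        int flags = 0;
        if (cpu_has_bmi2)
            flags |= HUF_flags_bmi2;       /* permit the BMI2-targeted code paths */
        if (!allow_asm)
            flags |= HUF_flags_disableAsm; /* force the portable C fast loop */
        return flags;
    }
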
++ */ ++#if DYNAMIC_BMI2 ++# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE + #else +-# define HUF_ASM_X86_64_BMI2_ATTRS ++# define HUF_FAST_BMI2_ATTRS + #endif + + #define HUF_EXTERN_C + #define HUF_ASM_DECL HUF_EXTERN_C + +-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) ++#if DYNAMIC_BMI2 + # define HUF_NEED_BMI2_FUNCTION 1 + #else + # define HUF_NEED_BMI2_FUNCTION 0 + #endif + +-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +-# define HUF_NEED_DEFAULT_FUNCTION 1 +-#else +-# define HUF_NEED_DEFAULT_FUNCTION 0 +-#endif +- + /* ************************************************************** + * Error Management + ****************************************************************/ +@@ -80,6 +79,11 @@ + /* ************************************************************** + * BMI2 Variant Wrappers + ****************************************************************/ ++typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, ++ const void *cSrc, ++ size_t cSrcSize, ++ const HUF_DTable *DTable); ++ + #if DYNAMIC_BMI2 + + #define HUF_DGEN(fn) \ +@@ -101,9 +105,9 @@ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- if (bmi2) { \ ++ if (flags & HUF_flags_bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ +@@ -113,9 +117,9 @@ + + #define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- (void)bmi2; \ ++ (void)flags; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +@@ -134,15 +138,28 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) + return dtd; + } + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 +- +-static size_t HUF_initDStream(BYTE const* ip) { ++static size_t HUF_initFastDStream(BYTE const* ip) { + BYTE const lastByte = ip[7]; +- size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); ++ assert(sizeof(size_t) == 8); + return value << bitsConsumed; + } ++ ++ ++/* ++ * The input/output arguments to the Huffman fast decoding loop: ++ * ++ * ip [in/out] - The input pointers, must be updated to reflect what is consumed. ++ * op [in/out] - The output pointers, must be updated to reflect what is written. ++ * bits [in/out] - The bitstream containers, must be updated to reflect the current state. ++ * dt [in] - The decoding table. ++ * ilimit [in] - The input limit, stop when any input pointer is below ilimit. ++ * oend [in] - The end of the output stream. op[3] must not cross oend. ++ * iend [in] - The end of each input stream. ip[i] may cross iend[i], ++ * as long as it is above ilimit, but that indicates corruption. ++ */ + typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; +@@ -151,15 +168,17 @@ typedef struct { + BYTE const* ilimit; + BYTE* oend; + BYTE const* iend[4]; +-} HUF_DecompressAsmArgs; ++} HUF_DecompressFastArgs; ++ ++typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); + + /* +- * Initializes args for the asm decoding loop. +- * @returns 0 on success +- * 1 if the fallback implementation should be used. ++ * Initializes args for the fast decoding loop. 
++ * @returns 1 on success ++ * 0 if the fallback implementation should be used. + * Or an error code on failure. + */ +-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) ++static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) + { + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; +@@ -168,9 +187,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + + BYTE* const oend = (BYTE*)dst + dstSize; + +- /* The following condition is false on x32 platform, +- * but HUF_asm is not compatible with this ABI */ +- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; ++ /* The fast decoding loop assumes 64-bit little-endian. ++ * This condition is false on x32. ++ */ ++ if (!MEM_isLittleEndian() || MEM_32bits()) ++ return 0; + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) +@@ -181,7 +202,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) +- return 1; ++ return 0; + + /* Read the jump table. */ + { +@@ -195,13 +216,13 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + +- /* HUF_initDStream() requires this, and this small of an input ++ /* HUF_initFastDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. + * length1 must be >= 16 so that ip[0] >= ilimit before the loop + * starts. + */ + if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) +- return 1; ++ return 0; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. */ +@@ -218,7 +239,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) +- return 1; ++ return 0; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. +@@ -227,10 +248,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ +- args->bits[0] = HUF_initDStream(args->ip[0]); +- args->bits[1] = HUF_initDStream(args->ip[1]); +- args->bits[2] = HUF_initDStream(args->ip[2]); +- args->bits[3] = HUF_initDStream(args->ip[3]); ++ args->bits[0] = HUF_initFastDStream(args->ip[0]); ++ args->bits[1] = HUF_initFastDStream(args->ip[1]); ++ args->bits[2] = HUF_initFastDStream(args->ip[2]); ++ args->bits[3] = HUF_initFastDStream(args->ip[3]); + + /* If ip[] >= ilimit, it is guaranteed to be safe to + * reload bits[]. 
It may be beyond its section, but is +@@ -241,10 +262,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + args->oend = oend; + args->dt = dt; + +- return 0; ++ return 1; + } + +-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) ++static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) + { + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) +@@ -258,15 +279,15 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ +- bit->bitContainer = MEM_readLE64(args->ip[stream]); +- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); ++ assert(sizeof(size_t) == 8); ++ bit->bitContainer = MEM_readLEST(args->ip[stream]); ++ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); + bit->start = (const char*)args->iend[0]; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; + } +-#endif + + + #ifndef HUF_FORCE_DECOMPRESS_X2 +@@ -283,10 +304,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi + static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { +- D4 = (symbol << 8) + nbBits; ++ D4 = (U64)((symbol << 8) + nbBits); + } else { +- D4 = symbol + (nbBits << 8); ++ D4 = (U64)(symbol + (nbBits << 8)); + } ++ assert(D4 < (1U << 16)); + D4 *= 0x0001000100010001ULL; + return D4; + } +@@ -329,13 +351,7 @@ typedef struct { + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; + } HUF_ReadDTableX1_Workspace; + +- +-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog = 0; + U32 nbSymbols = 0; +@@ -350,7 +366,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ + +- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); + if (HUF_isError(iSize)) return iSize; + + +@@ -377,9 +393,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * rankStart[0] is not filled because there are no entries in the table for + * weight 0. 
+ */ +- { +- int n; +- int nextRankStart = 0; ++ { int n; ++ U32 nextRankStart = 0; + int const unroll = 4; + int const nLimit = (int)nbSymbols - unroll + 1; + for (n=0; n<(int)tableLog+1; n++) { +@@ -406,10 +421,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. + */ +- { +- U32 w; +- int symbol=wksp->rankVal[0]; +- int rankStart=0; ++ { U32 w; ++ int symbol = wksp->rankVal[0]; ++ int rankStart = 0; + for (w=1; wrankVal[w]; + int const length = (1 << w) >> 1; +@@ -519,7 +533,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons + while (p < pEnd) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + +- return pEnd-pStart; ++ return (size_t)(pEnd-pStart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -545,6 +559,10 @@ HUF_decompress1X1_usingDTable_internal_body( + return dstSize; + } + ++/* HUF_decompress4X1_usingDTable_internal_body(): ++ * Conditions : ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X1_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -588,6 +606,7 @@ HUF_decompress4X1_usingDTable_internal_body( + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -650,38 +669,142 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ U16 const* const dtable = (U16 const*)args->dt; ++ BYTE* const oend = args->oend; ++ BYTE const* const ilimit = args->ilimit; ++ ++ /* Copy the arguments to local variables */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy(&ip, &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ int symbol; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); ++ assert(ip[stream] >= ilimit); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each iteration produces 5 output symbols per stream */ ++ size_t const oiters = (size_t)(oend - op[3]) / 5; ++ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes ++ * per stream. 
++ */ ++ size_t const iiters = (size_t)(ip[0] - ilimit) / 7; ++ /* We can safely run iters iterations before running bounds checks */ ++ size_t const iters = MIN(oiters, iiters); ++ size_t const symbols = iters * 5; ++ ++ /* We can simply check that op[3] < olimit, instead of checking all ++ * of our bounds, since we can't hit the other bounds until we've run ++ * iters iterations, which only happens when op[3] == olimit. ++ */ ++ olimit = op[3] + symbols; ++ ++ /* Exit fast decoding loop once we get close to the end. */ ++ if (op[3] + 20 > olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. ++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++ do { ++ /* Decode 5 symbols in each of the 4 streams */ ++ for (symbol = 0; symbol < 5; ++symbol) { ++ for (stream = 0; stream < 4; ++stream) { ++ int const index = (int)(bits[stream] >> 53); ++ int const entry = (int)dtable[index]; ++ bits[stream] <<= (entry & 63); ++ op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF); ++ } ++ } ++ /* Reload the bitstreams */ ++ for (stream = 0; stream < 4; ++stream) { ++ int const ctz = ZSTD_countTrailingZeros64(bits[stream]); ++ int const nbBits = ctz & 7; ++ int const nbBytes = ctz >> 3; ++ op[stream] += 5; ++ ip[stream] -= nbBytes; ++ bits[stream] = MEM_read64(ip[stream]) | 1; ++ bits[stream] <<= nbBits; ++ } ++ } while (op[3] < olimit); ++ } ++ ++_out: + +-static HUF_ASM_X86_64_BMI2_ATTRS ++ /* Save the final values of each of the state variables back to args. */ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy(&args->ip, &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++/* ++ * @returns @p dstSize on success (>= 6) ++ * 0 if the fallback implementation should be used ++ * An error if an error occurred ++ */ ++static HUF_FAST_BMI2_ATTRS + size_t +-HUF_decompress4X1_usingDTable_internal_bmi2_asm( ++HUF_decompress4X1_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) + { + void const* dt = DTable + 1; + const BYTE* const iend = (const BYTE*)cSrc + 6; + BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; +- { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); +- FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ HUF_DecompressFastArgs args; ++ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); ++ if (ret == 0) ++ return 0; + } + + assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); ++ loopFn(&args); + + /* Our loop guarantees that ip[] >= ilimit and that we haven't + * overwritten any op[]. +@@ -694,8 +817,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( + (void)iend; + + /* finish bit streams one by one. 
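
A quick worked example of the olimit bound computed above, with illustrative figures rather than numbers from a real trace: if stream 3 still has 1,000 bytes of output space (oiters = 1000/5 = 200) but ip[0] sits only 140 bytes above ilimit (iiters = 140/7 = 20), then iters = 20, symbols = 100 and olimit = op[3] + 100. Since op[3] + 20 <= olimit, the fast path is taken and the inner do/while decodes 5 symbols per stream for 20 iterations with no per-symbol bounds checks before the bound is recomputed.
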
*/ +- { +- size_t const segmentSize = (dstSize+3) / 4; ++ { size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { +@@ -712,97 +834,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( + } + + /* decoded size */ ++ assert(dstSize != 0); + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ +- +-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, +- const void *cSrc, +- size_t cSrcSize, +- const HUF_DTable *DTable); + + HUF_DGEN(HUF_decompress1X1_usingDTable_internal) + + static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + #endif +-} +- +- +-size_t HUF_decompress1X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} + +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- const BYTE* ip = (const BYTE*) cSrc; +- +- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); +- if (HUF_isError(hSize)) return hSize; +- if (hSize >= cSrcSize) return ERROR(srcSize_wrong); +- ip += hSize; cSrcSize -= hSize; +- +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ if (!(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + +-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t 
HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +@@ -985,7 +1069,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 + + static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, +- const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, ++ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) + { + U32* const rankVal = rankValOrigin[0]; +@@ -1040,14 +1124,7 @@ typedef struct { + + size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, +- const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog, maxW, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); +@@ -1069,7 +1146,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); + if (HUF_isError(iSize)) return iSize; + + /* check result */ +@@ -1240,6 +1317,11 @@ HUF_decompress1X2_usingDTable_internal_body( + /* decoded size */ + return dstSize; + } ++ ++/* HUF_decompress4X2_usingDTable_internal_body(): ++ * Conditions: ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -1280,8 +1362,9 @@ HUF_decompress4X2_usingDTable_internal_body( + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + +- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ +- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ ++ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -1366,36 +1449,177 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; + +-static HUF_ASM_X86_64_BMI2_ATTRS size_t +-HUF_decompress4X2_usingDTable_internal_bmi2_asm( ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ BYTE* oend[4]; ++ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; ++ BYTE const* const ilimit = args->ilimit; ++ ++ /* Copy the arguments to local registers. */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy(&ip, &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ oend[0] = op[1]; ++ oend[1] = op[2]; ++ oend[2] = op[3]; ++ oend[3] = args->oend; ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ int symbol; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= oend[stream]); ++ assert(ip[stream] >= ilimit); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each loop does 5 table lookups for each of the 4 streams. ++ * Each table lookup consumes up to 11 bits of input, and produces ++ * up to 2 bytes of output. ++ */ ++ /* We can consume up to 7 bytes of input per iteration per stream. ++ * We also know that each input pointer is >= ip[0]. So we can run ++ * iters loops before running out of input. 
++ */ ++ size_t iters = (size_t)(ip[0] - ilimit) / 7; ++ /* Each iteration can produce up to 10 bytes of output per stream. ++ * Each output stream my advance at different rates. So take the ++ * minimum number of safe iterations among all the output streams. ++ */ ++ for (stream = 0; stream < 4; ++stream) { ++ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; ++ iters = MIN(iters, oiters); ++ } ++ ++ /* Each iteration produces at least 5 output symbols. So until ++ * op[3] crosses olimit, we know we haven't executed iters ++ * iterations yet. This saves us maintaining an iters counter, ++ * at the expense of computing the remaining # of iterations ++ * more frequently. ++ */ ++ olimit = op[3] + (iters * 5); ++ ++ /* Exit the fast decoding loop if we are too close to the end. */ ++ if (op[3] + 10 > olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. ++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++ do { ++ /* Do 5 table lookups for each of the first 3 streams */ ++ for (symbol = 0; symbol < 5; ++symbol) { ++ for (stream = 0; stream < 3; ++stream) { ++ int const index = (int)(bits[stream] >> 53); ++ HUF_DEltX2 const entry = dtable[index]; ++ MEM_write16(op[stream], entry.sequence); ++ bits[stream] <<= (entry.nbBits); ++ op[stream] += (entry.length); ++ } ++ } ++ /* Do 1 table lookup from the final stream */ ++ { ++ int const index = (int)(bits[3] >> 53); ++ HUF_DEltX2 const entry = dtable[index]; ++ MEM_write16(op[3], entry.sequence); ++ bits[3] <<= (entry.nbBits); ++ op[3] += (entry.length); ++ } ++ /* Do 4 table lookups from the final stream & reload bitstreams */ ++ for (stream = 0; stream < 4; ++stream) { ++ /* Do a table lookup from the final stream. ++ * This is interleaved with the reloading to reduce register ++ * pressure. This shouldn't be necessary, but compilers can ++ * struggle with codegen with high register pressure. ++ */ ++ { ++ int const index = (int)(bits[3] >> 53); ++ HUF_DEltX2 const entry = dtable[index]; ++ MEM_write16(op[3], entry.sequence); ++ bits[3] <<= (entry.nbBits); ++ op[3] += (entry.length); ++ } ++ /* Reload the bistreams. The final bitstream must be reloaded ++ * after the 5th symbol was decoded. ++ */ ++ { ++ int const ctz = ZSTD_countTrailingZeros64(bits[stream]); ++ int const nbBits = ctz & 7; ++ int const nbBytes = ctz >> 3; ++ ip[stream] -= nbBytes; ++ bits[stream] = MEM_read64(ip[stream]) | 1; ++ bits[stream] <<= nbBits; ++ } ++ } ++ } while (op[3] < olimit); ++ } ++ ++_out: ++ ++ /* Save the final values of each of the state variables back to args. 
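
Both fast C loops above rely on the same marker-bit trick for their reload step: the 64-bit container is consumed MSB-first with a left shift, and because the loader ORs a 1 into bit 0, ZSTD_countTrailingZeros64() recovers exactly how many bits have been eaten since the last 8-byte load. A simplified standalone paraphrase of that refill, assuming a 64-bit little-endian host just as the real loops do; the names are invented for illustration.

    #include <string.h> /* memcpy */

    typedef struct { const unsigned char* ip; unsigned long long bits; } MiniStream;

    /* Consuming n bits is simply 'bits <<= n'. The refill assumes fewer than
     * 64 bits were consumed since the last load (at most 55 per iteration in
     * the loops above), so the marker bit is still present.
     */
    static void miniRefill(MiniStream* s)
    {
        int const consumed = __builtin_ctzll(s->bits); /* bits eaten since the last load */
        s->ip -= consumed >> 3;                        /* streams are read backwards, whole bytes only */
        memcpy(&s->bits, s->ip, sizeof(s->bits));      /* reload 8 little-endian bytes */
        s->bits |= 1;                                  /* re-plant the marker bit */
        s->bits <<= (consumed & 7);                    /* re-consume the leftover sub-byte bits */
    }
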
*/ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy(&args->ip, &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++ ++static HUF_FAST_BMI2_ATTRS size_t ++HUF_decompress4X2_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) { ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) { + void const* dt = DTable + 1; + const BYTE* const iend = (const BYTE*)cSrc + 6; + BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; ++ HUF_DecompressFastArgs args; + { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (ret == 0) ++ return 0; + } + + assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); ++ loopFn(&args); + + /* note : op4 already verified within main loop */ + assert(args.ip[0] >= iend); +@@ -1426,91 +1650,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm( + /* decoded size */ + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + + static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + #endif ++ ++ if (!(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) + +-size_t HUF_decompress1X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- + size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* 
workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); + } + +-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +@@ -1518,44 +1723,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + /* Universal decompression selectors */ + /* ***********************************/ + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- + + #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) + typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +@@ -1610,36 +1777,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) + #endif + } + +- +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, +- size_t dstSize, const void* cSrc, +- size_t cSrcSize, void* workSpace, +- size_t wkspSize) +-{ +- /* validation checks */ +- if (dstSize == 0) return ERROR(dstSize_tooSmall); +- if (cSrcSize == 0) return ERROR(corruption_detected); +- +- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)algoNb; +- assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)algoNb; +- assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#else +- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): +- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#endif +- } +-} +- + size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1652,71 +1792,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #else + return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): ++ cSrcSize, workSpace, wkspSize, flags): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #endif + } + } + + +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + #endif + +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1726,15 +1866,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #else +- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : +- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : ++ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #endif + } + } +- +diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c +index dbbc7919de53..4f801e0dd564 100644 +--- a/lib/zstd/decompress/zstd_ddict.c ++++ b/lib/zstd/decompress/zstd_ddict.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -19,7 +20,6 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_decompress_internal.h" + #include "zstd_ddict.h" +@@ -131,7 +131,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, + ZSTD_memcpy(internalBuffer, dict, dictSize); + } + ddict->dictSize = dictSize; +- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); +@@ -237,5 +237,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) + unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) + { + if (ddict==NULL) return 0; +- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); ++ return ddict->dictID; + } +diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h +index 8c1a79d666f8..de459a0dacd1 100644 +--- a/lib/zstd/decompress/zstd_ddict.h ++++ b/lib/zstd/decompress/zstd_ddict.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c +index b9b935a9f5c0..d7eebb17a2c5 100644 +--- a/lib/zstd/decompress/zstd_decompress.c ++++ b/lib/zstd/decompress/zstd_decompress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -56,13 +57,13 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */ + #include "../common/zstd_internal.h" /* blockProperties_t */ + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + +@@ -72,11 +73,11 @@ + *************************************/ + + #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. +- * Currently, that means a 0.75 load factor. +- * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded +- * the load factor of the ddict hash set. +- */ ++#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. ++ * Currently, that means a 0.75 load factor. ++ * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded ++ * the load factor of the ddict hash set. 
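
As a worked example of the integer load-factor check described in the comment above: with COUNT_MULT = 4 and SIZE_MULT = 3 the threshold is 3/4, so for the 64-slot base table the quotient count * 4 / (64 * 3) first becomes non-zero at count = 48, which is exactly a 0.75 load factor and the point at which the hash set grows by DDICT_HASHSET_RESIZE_FACTOR.
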
++ */ + + #define DDICT_HASHSET_TABLE_BASE_SIZE 64 + #define DDICT_HASHSET_RESIZE_FACTOR 2 +@@ -237,6 +238,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) + dctx->outBufferMode = ZSTD_bm_buffered; + dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; ++ dctx->disableHufAsm = 0; + } + + static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +@@ -421,16 +423,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ ++** or an error code, which can be tested using ZSTD_isError() */ + size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) + { + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + +- ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */ +- if (srcSize < minInputSize) return minInputSize; +- RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); ++ DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); ++ ++ if (srcSize > 0) { ++ /* note : technically could be considered an assert(), since it's an invalid entry */ ++ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); ++ } ++ if (srcSize < minInputSize) { ++ if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { ++ /* when receiving less than @minInputSize bytes, ++ * control these bytes at least correspond to a supported magic number ++ * in order to error out early if they don't. ++ **/ ++ size_t const toCopy = MIN(4, srcSize); ++ unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); ++ assert(src != NULL); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { ++ /* not a zstd frame : let's check if it's a skippable frame */ ++ MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { ++ RETURN_ERROR(prefix_unknown, ++ "first bytes don't correspond to any supported magic number"); ++ } } } ++ return minInputSize; ++ } + ++ ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { +@@ -730,10 +756,11 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + ip += 4; + } + ++ frameSizeInfo.nbBlocks = nbBlocks; + frameSizeInfo.compressedSize = (size_t)(ip - ipstart); + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? 
zfh.frameContentSize +- : nbBlocks * zfh.blockSizeMax; ++ : (unsigned long long)nbBlocks * zfh.blockSizeMax; + return frameSizeInfo; + } + } +@@ -773,6 +800,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + return bound; + } + ++size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) ++{ ++ size_t margin = 0; ++ unsigned maxBlockSize = 0; ++ ++ /* Iterate over each frame */ ++ while (srcSize > 0) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); ++ size_t const compressedSize = frameSizeInfo.compressedSize; ++ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; ++ ZSTD_frameHeader zfh; ++ ++ FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); ++ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) ++ return ERROR(corruption_detected); ++ ++ if (zfh.frameType == ZSTD_frame) { ++ /* Add the frame header to our margin */ ++ margin += zfh.headerSize; ++ /* Add the checksum to our margin */ ++ margin += zfh.checksumFlag ? 4 : 0; ++ /* Add 3 bytes per block */ ++ margin += 3 * frameSizeInfo.nbBlocks; ++ ++ /* Compute the max block size */ ++ maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); ++ } else { ++ assert(zfh.frameType == ZSTD_skippableFrame); ++ /* Add the entire skippable frame size to our margin. */ ++ margin += compressedSize; ++ } ++ ++ assert(srcSize >= compressedSize); ++ src = (const BYTE*)src + compressedSize; ++ srcSize -= compressedSize; ++ } ++ ++ /* Add the max block size back to the margin. */ ++ margin += maxBlockSize; ++ ++ return margin; ++} + + /*-************************************************************* + * Frame decoding +@@ -798,7 +867,7 @@ static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity, + if (srcSize == 0) return 0; + RETURN_ERROR(dstBuffer_null, ""); + } +- ZSTD_memcpy(dst, src, srcSize); ++ ZSTD_memmove(dst, src, srcSize); + return srcSize; + } + +@@ -858,6 +927,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + + /* Loop on each block */ + while (1) { ++ BYTE* oBlockEnd = oend; + size_t decodedSize; + blockProperties_t blockProperties; + size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSrcSize, &blockProperties); +@@ -867,16 +937,34 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + remainingSrcSize -= ZSTD_blockHeaderSize; + RETURN_ERROR_IF(cBlockSize > remainingSrcSize, srcSize_wrong, ""); + ++ if (ip >= op && ip < oBlockEnd) { ++ /* We are decompressing in-place. Limit the output pointer so that we ++ * don't overwrite the block that we are currently reading. This will ++ * fail decompression if the input & output pointers aren't spaced ++ * far enough apart. ++ * ++ * This is important to set, even when the pointers are far enough ++ * apart, because ZSTD_decompressBlock_internal() can decide to store ++ * literals in the output buffer, after the block it is decompressing. ++ * Since we don't want anything to overwrite our input, we have to tell ++ * ZSTD_decompressBlock_internal to never write past ip. ++ * ++ * See ZSTD_allocateLiteralsBuffer() for reference. 
++ */ ++ oBlockEnd = op + (ip - op); ++ } ++ + switch(blockProperties.blockType) + { + case bt_compressed: +- decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oend-op), ip, cBlockSize, /* frame */ 1, not_streaming); ++ decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming); + break; + case bt_raw : ++ /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */ + decodedSize = ZSTD_copyRawBlock(op, (size_t)(oend-op), ip, cBlockSize); + break; + case bt_rle : +- decodedSize = ZSTD_setRleBlock(op, (size_t)(oend-op), *ip, blockProperties.origSize); ++ decodedSize = ZSTD_setRleBlock(op, (size_t)(oBlockEnd-op), *ip, blockProperties.origSize); + break; + case bt_reserved : + default: +@@ -911,6 +999,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + } + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); + /* Allow caller to get size read */ ++ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr); + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return (size_t)(op-ostart); +@@ -1042,8 +1131,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr + size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } + + /* +- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, +- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can ++ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we ++ * allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. + * + * For blocks that can be streamed, this allows us to reduce the latency until we produce +@@ -1243,7 +1332,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } + } + +@@ -1284,11 +1373,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, (size_t)(dictEnd - dictPtr), +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + dictPtr += hSize; +@@ -1384,7 +1473,7 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; +- dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; +@@ -1446,7 +1535,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * This could for one of the following reasons : + * - The frame does not require a dictionary (most common case). 
+ * - The frame was built with dictID intentionally removed. +- * Needed dictionary is a hidden information. ++ * Needed dictionary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. +@@ -1455,7 +1544,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * ZSTD_getFrameHeader(), which will provide a more precise error code. */ + unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) + { +- ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; ++ ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); + if (ZSTD_isError(hError)) return 0; + return zfp.dictID; +@@ -1562,7 +1651,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di + size_t ZSTD_initDStream(ZSTD_DStream* zds) + { + DEBUGLOG(4, "ZSTD_initDStream"); +- return ZSTD_initDStream_usingDDict(zds, NULL); ++ FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); ++ FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); ++ return ZSTD_startingInputLength(zds->format); + } + + /* ZSTD_initDStream_usingDDict() : +@@ -1570,20 +1661,12 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) + * this function cannot fail */ + size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + { ++ DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); + } + +-/* ZSTD_resetDStream() : +- * return : expected size, aka ZSTD_startingInputLength(). 
+- * this function cannot fail */ +-size_t ZSTD_resetDStream(ZSTD_DStream* dctx) +-{ +- FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); +- return ZSTD_startingInputLength(dctx->format); +-} +- + + size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) + { +@@ -1651,6 +1734,11 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; ++ case ZSTD_d_disableHuffmanAssembly: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ + default:; + } + bounds.error = ERROR(parameter_unsupported); +@@ -1691,6 +1779,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ *value = (int)dctx->disableHufAsm; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1724,6 +1815,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); ++ dctx->disableHufAsm = value != 0; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1899,7 +1994,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } +- DEBUGLOG(5, "header size : %u", (U32)hSize); + if (ZSTD_isError(hSize)) { + return hSize; /* error */ + } +@@ -1913,6 +2007,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->lhSize += remainingInput; + } + input->pos = input->size; ++ /* check first few bytes */ ++ FORWARD_IF_ERROR( ++ ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), ++ "First few bytes detected incorrect" ); ++ /* return hint input size */ + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); +@@ -1930,8 +2029,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; + DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") ++ assert(istart != NULL); + ip = istart + cSize; +- op += decompressedSize; ++ op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; +@@ -2015,6 +2115,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); ++ assert(ip != NULL); + ip += neededInSize; + /* Function modifies the stage so we must break */ + break; +@@ -2029,7 +2130,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. 
*/ +- assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); ++ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { +@@ -2038,8 +2139,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); + } +- ip += loadedSize; +- zds->inPos += loadedSize; ++ if (loadedSize != 0) { ++ /* ip may be NULL */ ++ ip += loadedSize; ++ zds->inPos += loadedSize; ++ } + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ +@@ -2049,14 +2153,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + break; + } + case zdss_flush: +- { size_t const toFlushSize = zds->outEnd - zds->outStart; ++ { ++ size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); +- op += flushedSize; ++ ++ op = op ? op + flushedSize : op; ++ + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) +- && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { ++ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); +@@ -2070,7 +2177,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } } + + /* result */ +@@ -2083,8 +2190,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { +- RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); +- RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); ++ RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); ++ RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); + assert(0); + } + } else { +@@ -2121,11 +2228,17 @@ size_t ZSTD_decompressStream_simpleArgs ( + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; +- /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; ++ { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } +diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c +index c1913b8e7c89..ffbe53ba0346 100644 +--- 
a/lib/zstd/decompress/zstd_decompress_block.c ++++ b/lib/zstd/decompress/zstd_decompress_block.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,12 +21,12 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/zstd_internal.h" + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + /*_******************************************************* + * Macros +@@ -89,7 +90,7 @@ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; + } + else { +- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ ++ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } +@@ -134,13 +135,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + ZSTD_FALLTHROUGH; + + case set_compressed: +- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); ++ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; + size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ int const flags = 0 ++ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) ++ | (dctx->disableHufAsm ? 
HUF_flags_disableAsm : 0); + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -165,6 +169,10 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ if (!singleStream) ++ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, ++ "Not enough literals (%zu) for the 4-streams mode (min %u)", ++ litSize, MIN_LITERALS_FOR_4_STREAMS); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); +@@ -176,13 +184,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + + if (litEncType==set_repeat) { + if (singleStream) { +- hufSuccess = HUF_decompress1X_usingDTable_bmi2( ++ hufSuccess = HUF_decompress1X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } else { +- hufSuccess = HUF_decompress4X_usingDTable_bmi2( ++ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); ++ hufSuccess = HUF_decompress4X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } + } else { + if (singleStream) { +@@ -190,18 +199,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace)); ++ sizeof(dctx->workspace), flags); + #else +- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( ++ hufSuccess = HUF_decompress1X1_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + #endif + } else { +- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( ++ hufSuccess = HUF_decompress4X_hufOnly_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + } + } + if (dctx->litBufferLocation == ZSTD_split) +@@ -237,6 +246,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); + litSize = MEM_readLE24(istart) >> 4; + break; + } +@@ -279,12 +289,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 1: + lhSize = 2; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); + litSize = MEM_readLE24(istart) >> 4; +- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +@@ -506,14 +517,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; ++ assert(n>=0); ++ pos += (size_t)n; + } + } + /* Now we spread 
those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. + */ + { + size_t position = 0; +@@ -540,7 +552,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i=0; i highThreshold) position = (position + step) & tableMask; /* lowprob area */ ++ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } +@@ -551,7 +563,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (u=0; ustateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); ++#else + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; ++#endif + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; +@@ -1186,9 +1221,13 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; ++ ++ assert(llBits <= MaxLLBits); ++ assert(mlBits <= MaxMLBits); ++ assert(ofBits <= MaxOff); + /* + * As gcc has better branch and block analyzers, sometimes it is only +- * valuable to mark likelyness for clang, it gives around 3-4% of ++ * valuable to mark likeliness for clang, it gives around 3-4% of + * performance. + */ + +@@ -1201,13 +1240,16 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); +- assert(ofBits <= MaxOff); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { +- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); ++ /* Always read extra bits, this keeps the logic simple, ++ * avoids branches, and avoids accidentally reading 0 bits. 
++ */ ++ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); +- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); +- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ ++ offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); +@@ -1552,7 +1594,7 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_body"); ++ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); + (void)frame; + + /* Regen sequences */ +@@ -1945,34 +1987,79 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + ++/* ++ * @returns The total size of the history referencable by zstd, including ++ * both the prefix and the extDict. At @p op any offset larger than this ++ * is invalid. ++ */ ++static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) ++{ ++ return (size_t)(op - virtualStart); ++} + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +-/* ZSTD_getLongOffsetsShare() : ++typedef struct { ++ unsigned longOffsetShare; ++ unsigned maxNbAdditionalBits; ++} ZSTD_OffsetInfo; ++ ++/* ZSTD_getOffsetInfo() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) +- * compared to maximum possible of (1< 22) total += 1; ++ ZSTD_OffsetInfo info = {0, 0}; ++ /* If nbSeq == 0, then the offTable is uninitialized, but we have ++ * no sequences, so both values should be 0. ++ */ ++ if (nbSeq != 0) { ++ const void* ptr = offTable; ++ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; ++ const ZSTD_seqSymbol* table = offTable + 1; ++ U32 const max = 1 << tableLog; ++ U32 u; ++ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); ++ ++ assert(max <= (1 << OffFSELog)); /* max not too large */ ++ for (u=0; u 22) info.longOffsetShare += 1; ++ } ++ ++ assert(tableLog <= OffFSELog); ++ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + } + +- assert(tableLog <= OffFSELog); +- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ ++ return info; ++} + +- return total; ++/* ++ * @returns The maximum offset we can decode in one read of our bitstream, without ++ * reloading more bits in the middle of the offset bits read. Any offsets larger ++ * than this must use the long offset decoder. ++ */ ++static size_t ZSTD_maxShortOffset(void) ++{ ++ if (MEM_64bits()) { ++ /* We can decode any offset without reloading bits. ++ * This might change if the max window size grows. ++ */ ++ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); ++ return (size_t)-1; ++ } else { ++ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. ++ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. ++ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. 
++ */ ++ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; ++ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; ++ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); ++ return maxOffset; ++ } + } +-#endif + + size_t + ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, +@@ -1980,20 +2067,21 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, const int frame, const streaming_operation streaming) + { /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; +- /* isLongOffset must be true if there are long offsets. +- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. +- * We don't expect that to be the case in 64-bit mode. +- * In block mode, window size is not known, so we have to be conservative. +- * (note: but it could be evaluated from current-lowLimit) +- */ +- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); + DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); + +- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); ++ /* Note : the wording of the specification ++ * allows compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX. ++ * This generally does not happen, as it makes little sense, ++ * since an uncompressed block would feature same size and have no decompression cost. ++ * Also, note that decoder from reference libzstd before < v1.5.4 ++ * would consider this edge case as an error. ++ * As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX ++ * for broader compatibility with the deployed ecosystem of zstd decoders */ ++ RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); +- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); ++ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; +@@ -2001,6 +2089,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + + /* Build Decoding Tables */ + { ++ /* Compute the maximum block size, which must also work when !frame and fParams are unset. ++ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. ++ */ ++ size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX)); ++ size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const*)dctx->virtualStart); ++ /* isLongOffset must be true if there are long offsets. ++ * Offsets are long if they are larger than ZSTD_maxShortOffset(). ++ * We don't expect that to be the case in 64-bit mode. ++ * ++ * We check here to see if our history is large enough to allow long offsets. ++ * If it isn't, then we can't possible have (valid) long offsets. If the offset ++ * is invalid, then it is okay to read it incorrectly. ++ * ++ * If isLongOffsets is true, then we will later check our decoding table to see ++ * if it is even possible to generate long offsets. ++ */ ++ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. 
+@@ -2008,6 +2113,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; ++#else ++ /* Set to 1 to avoid computing offset info if we don't need to. ++ * Otherwise this value is ignored. ++ */ ++ int usePrefetchDecoder = 1; + #endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); +@@ -2017,26 +2127,38 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + + RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if ( !usePrefetchDecoder +- && (!frame || (dctx->fParams.windowSize > (1<<24))) +- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ +- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); +- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ +- usePrefetchDecoder = (shareLongOffsets >= minShare); ++ /* If we could potentially have long offsets, or we might want to use the prefetch decoder, ++ * compute information about the share of long offsets, and the maximum nbAdditionalBits. ++ * NOTE: could probably use a larger nbSeq limit ++ */ ++ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { ++ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); ++ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { ++ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small ++ * enough, then we know it is impossible to have too long an offset in this block, so we can ++ * use the regular offset decoder. ++ */ ++ isLongOffset = ZSTD_lo_isRegularOffset; ++ } ++ if (!usePrefetchDecoder) { ++ U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ ++ usePrefetchDecoder = (info.longOffsetShare >= minShare); ++ } + } +-#endif + + dctx->ddictIsCold = 0; + + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if (usePrefetchDecoder) ++ if (usePrefetchDecoder) { ++#else ++ (void)usePrefetchDecoder; ++ { + #endif + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); + #endif ++ } + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ +diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h +index 3d2d57a5d25a..e372f048d186 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.h ++++ b/lib/zstd/decompress/zstd_decompress_block.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h +index 98102edb6a83..32f79fb2873d 100644 +--- a/lib/zstd/decompress/zstd_decompress_internal.h ++++ b/lib/zstd/decompress/zstd_decompress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. 
and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = { + + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) ++#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 + + typedef struct { + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ +- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ ++ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ + U32 rep[ZSTD_REP_NUM]; + U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; + } ZSTD_entropyDTables_t; +@@ -164,6 +166,7 @@ struct ZSTD_DCtx_s + ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */ ++ int disableHufAsm; + + /* streaming */ + ZSTD_dStreamStage streamStage; +diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h +index a06ca187aab5..8a47eb2a4514 100644 +--- a/lib/zstd/decompress_sources.h ++++ b/lib/zstd/decompress_sources.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c +index 22686e367e6f..2fead39eb743 100644 +--- a/lib/zstd/zstd_common_module.c ++++ b/lib/zstd/zstd_common_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c +index 04e1b5c01d9b..8ecf43226af2 100644 +--- a/lib/zstd/zstd_compress_module.c ++++ b/lib/zstd/zstd_compress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c +index f4ed952ed485..7d31518e9d5a 100644 +--- a/lib/zstd/zstd_decompress_module.c ++++ b/lib/zstd/zstd_decompress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -77,7 +77,7 @@ EXPORT_SYMBOL(zstd_init_dstream); + + size_t zstd_reset_dstream(zstd_dstream *dstream) + { +- return ZSTD_resetDStream(dstream); ++ return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only); + } + EXPORT_SYMBOL(zstd_reset_dstream); + +-- +2.39.2 diff --git a/patches/0002-cfs-nice.patch b/patches/0002-cfs-nice.patch new file mode 100644 index 0000000..fdb957a --- /dev/null +++ b/patches/0002-cfs-nice.patch @@ -0,0 +1,1029 @@ +From 78440b24f24a021daf660c0bd212c936e50e5f0a Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 17 Feb 2023 15:38:09 +0100 +Subject: [PATCH] Add latency priority for CFS class + +This patchset restarts the work about adding a latency priority to describe +the latency tolerance of cfs tasks. + +Patch [1] is a new one that has been added with v6. It fixes an +unfairness for low prio tasks because of wakeup_gran() being bigger +than the maximum vruntime credit that a waking task can keep after +sleeping. + +The patches [2-4] have been done by Parth: +https://lore.kernel.org/lkml/20200228090755.22829-1-parth@linux.ibm.com/ + +I have just rebased and moved the set of latency priority outside the +priority update. I have removed the reviewed tag because the patches +are 2 years old. + +This aims to be a generic interface and the following patches is one use +of it to improve the scheduling latency of cfs tasks. + +Patch [5] uses latency nice priority to define a latency offset +and then decide if a cfs task can or should preempt the current +running task. The patch gives some tests results with cyclictests and +hackbench to highlight the benefit of latency priority for short +interactive task or long intensive tasks. + +Patch [6] adds the support of latency nice priority to task group by +adding a cpu.latency.nice field. The range is [-20:19] as for setting task +latency priority. + +Patch [7] makes sched_core taking into account the latency offset. + +Patch [8] adds a rb tree to cover some corner cases where the latency +sensitive task (priority < 0) is preempted by high priority task (RT/DL) +or fails to preempt them. This patch ensures that tasks will have at least +a slice of sched_min_granularity in priority at wakeup. + +Patch [9] removes useless check after adding a latency rb tree. + +I have also backported the patchset on a dragonboard RB3 with an android +mainline kernel based on v5.18 for a quick test. I have used the +TouchLatency app which is part of AOSP and described to be a very good +test to highlight jitter and jank frame sources of a system [1]. +In addition to the app, I have added some short running tasks waking-up +regularly (to use the 8 cpus for 4 ms every 37777us) to stress the system +without overloading it (and disabling EAS). The 1st results shows that the +patchset helps to reduce the missed deadline frames from 5% to less than +0.1% when the cpu.latency.nice of task group are set. I haven't rerun the +test with latest version. + +I have also tested the patchset with the modified version of the alsa +latency test that has been shared by Tim. The test quickly xruns with +default latency nice priority 0 but is able to run without underuns with +a latency -20 and hackbench running simultaneously. + +While preparing the version 8, I have evaluated the benefit of using an +augmented rbtree instead of adding a rbtree for latency sensitive entities, +which was a relevant suggestion done by PeterZ. 
Although the augmented +rbtree enables to sort additional information in the tree with a limited +overhead, it has more impact on legacy use cases (latency_nice >= 0) +because the augmented callbacks are always called to maintain this +additional information even when there is no sensitive tasks. In such +cases, the dedicated rbtree remains empty and the overhead is reduced to +loading a cached null node pointer. Nevertheless, we might want to +reconsider the augmented rbtree once the use of negative latency_nice will +be more widlely deployed. At now, the different tests that I have done, +have not shown improvements with augmented rbtree. + +Below are some hackbench results: + 2 rbtrees augmented rbtree augmented rbtree + sorted by vruntime sorted by wakeup_vruntime +sched pipe +avg 26311,000 25976,667 25839,556 +stdev 0,15 % 0,28 % 0,24 % +vs tip 0,50 % -0,78 % -1,31 % +hackbench 1 group +avg 1,315 1,344 1,359 +stdev 0,88 % 1,55 % 1,82 % +vs tip -0,47 % -2,68 % -3,87 % +hackbench 4 groups +avg 1,339 1,365 1,367 +stdev 2,39 % 2,26 % 3,58 % +vs tip -0,08 % -2,01 % -2,22 % +hackbench 8 groups +avg 1,233 1,286 1,301 +stdev 0,74 % 1,09 % 1,52 % +vs tip 0,29 % -4,05 % -5,27 % +hackbench 16 groups +avg 1,268 1,313 1,319 +stdev 0,85 % 1,60 % 0,68 % +vs tip -0,02 % -3,56 % -4,01 % + +[1] https://source.android.com/docs/core/debug/eval_perf#touchlatency + +Change since v9: +- Rebase +- add tags + +Change since v8: +- Rename get_sched_latency by get_sleep_latency +- move latency nice defines in sched/prio.h and fix latency_prio init value +- Fix typo and comments + +Change since v7: +- Replaced se->on_latency by using RB_CLEAR_NODE() and RB_EMPTY_NODE() +- Clarify the limit behavior fo the cgroup cpu.latenyc_nice + +Change since v6: +- Fix compilation error for !CONFIG_SCHED_DEBUG + +Change since v5: +- Add patch 1 to fix unfairness for low prio task. This has been + discovered while studying Youssef's tests results with latency nice + which were hitting the same problem. +- Fixed latency_offset computation to take into account + GENTLE_FAIR_SLEEPERS. This has diseappeared with v2and has been raised + by Youssef's tests. +- Reworked and optimized how latency_offset in used to check for + preempting current task at wakeup and tick. This cover more cases too. +- Add patch 9 to remove check_preempt_from_others() which is not needed + anymore with the rb tree. + +Change since v4: +- Removed permission checks to set latency priority. This enables user + without elevated privilege like audio application to set their latency + priority as requested by Tim. +- Removed cpu.latency and replaced it by cpu.latency.nice so we keep a + generic interface not tied to latency_offset which can be used to + implement other latency features. +- Added an entry in Documentation/admin-guide/cgroup-v2.rst to describe + cpu.latency.nice. +- Fix some typos. + +Change since v3: +- Fix 2 compilation warnings raised by kernel test robot + +Change since v2: +- Set a latency_offset field instead of saving a weight and computing it + on the fly. +- Make latency_offset available for task group: cpu.latency +- Fix some corner cases to make latency sensitive tasks schedule first and + add a rb tree for latency sensitive task. 
+ +Change since v1: +- fix typo +- move some codes in the right patch to make bisect happy +- simplify and fixed how the weight is computed +- added support of sched core patch 7 + +Parth Shah (3): + sched: Introduce latency-nice as a per-task attribute + sched/core: Propagate parent task's latency requirements to the child + task + sched: Allow sched_{get,set}attr to change latency_nice of the task + +Vincent Guittot (6): + sched/fair: fix unfairness at wakeup + sched/fair: Take into account latency priority at wakeup + sched/fair: Add sched group latency support + sched/core: Support latency priority with sched core + sched/fair: Add latency list + sched/fair: remove check_preempt_from_others + +Signed-off-by: Peter Jung +--- + Documentation/admin-guide/cgroup-v2.rst | 10 ++ + include/linux/sched.h | 4 + + include/linux/sched/prio.h | 27 +++ + include/uapi/linux/sched.h | 4 +- + include/uapi/linux/sched/types.h | 19 +++ + init/init_task.c | 1 + + kernel/sched/core.c | 106 ++++++++++++ + kernel/sched/debug.c | 1 + + kernel/sched/fair.c | 209 ++++++++++++++++++++---- + kernel/sched/sched.h | 45 ++++- + tools/include/uapi/linux/sched.h | 4 +- + 11 files changed, 394 insertions(+), 36 deletions(-) + +diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst +index 74cec76be9f2..2e511d4a4c6a 100644 +--- a/Documentation/admin-guide/cgroup-v2.rst ++++ b/Documentation/admin-guide/cgroup-v2.rst +@@ -1118,6 +1118,16 @@ All time durations are in microseconds. + values similar to the sched_setattr(2). This maximum utilization + value is used to clamp the task specific maximum utilization clamp. + ++ cpu.latency.nice ++ A read-write single value file which exists on non-root ++ cgroups. The default is "0". ++ ++ The nice value is in the range [-20, 19]. ++ ++ This interface file allows reading and setting latency using the ++ same values used by sched_setattr(2). The latency_nice of a group is ++ used to limit the impact of the latency_nice of a task outside the ++ group. + + + Memory +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 28ce1be0ba47..df219c7cd6aa 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -548,6 +548,7 @@ struct sched_entity { + /* For load-balancing: */ + struct load_weight load; + struct rb_node run_node; ++ struct rb_node latency_node; + struct list_head group_node; + unsigned int on_rq; + +@@ -571,6 +572,8 @@ struct sched_entity { + /* cached value of my_q->h_nr_running */ + unsigned long runnable_weight; + #endif ++ /* preemption offset in ns */ ++ long latency_offset; + + #ifdef CONFIG_SMP + /* +@@ -787,6 +790,7 @@ struct task_struct { + int static_prio; + int normal_prio; + unsigned int rt_priority; ++ int latency_prio; + + struct sched_entity se; + struct sched_rt_entity rt; +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index ab83d85e1183..be79503d86af 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -42,4 +42,31 @@ static inline long rlimit_to_nice(long prio) + return (MAX_NICE - prio + 1); + } + ++/* ++ * Latency nice is meant to provide scheduler hints about the relative ++ * latency requirements of a task with respect to other tasks. ++ * Thus a task with latency_nice == 19 can be hinted as the task with no ++ * latency requirements, in contrast to the task with latency_nice == -20 ++ * which should be given priority in terms of lower latency. 
++ */ ++#define MAX_LATENCY_NICE 19 ++#define MIN_LATENCY_NICE -20 ++ ++#define LATENCY_NICE_WIDTH \ ++ (MAX_LATENCY_NICE - MIN_LATENCY_NICE + 1) ++ ++/* ++ * Default tasks should be treated as a task with latency_nice = 0. ++ */ ++#define DEFAULT_LATENCY_NICE 0 ++#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2) ++ ++/* ++ * Convert user-nice values [ -20 ... 0 ... 19 ] ++ * to static latency [ 0..39 ], ++ * and back. ++ */ ++#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO) ++#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO) ++ + #endif /* _LINUX_SCHED_PRIO_H */ +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 3bac0a8ceab2..b2e932c25be6 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -132,6 +132,7 @@ struct clone_args { + #define SCHED_FLAG_KEEP_PARAMS 0x10 + #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 + #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 ++#define SCHED_FLAG_LATENCY_NICE 0x80 + + #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ + SCHED_FLAG_KEEP_PARAMS) +@@ -143,6 +144,7 @@ struct clone_args { + SCHED_FLAG_RECLAIM | \ + SCHED_FLAG_DL_OVERRUN | \ + SCHED_FLAG_KEEP_ALL | \ +- SCHED_FLAG_UTIL_CLAMP) ++ SCHED_FLAG_UTIL_CLAMP | \ ++ SCHED_FLAG_LATENCY_NICE) + + #endif /* _UAPI_LINUX_SCHED_H */ +diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h +index f2c4589d4dbf..db1e8199e8c8 100644 +--- a/include/uapi/linux/sched/types.h ++++ b/include/uapi/linux/sched/types.h +@@ -10,6 +10,7 @@ struct sched_param { + + #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ + #define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */ ++#define SCHED_ATTR_SIZE_VER2 60 /* add: latency_nice */ + + /* + * Extended scheduling parameters data structure. +@@ -98,6 +99,22 @@ struct sched_param { + * scheduled on a CPU with no more capacity than the specified value. + * + * A task utilization boundary can be reset by setting the attribute to -1. ++ * ++ * Latency Tolerance Attributes ++ * =========================== ++ * ++ * A subset of sched_attr attributes allows to specify the relative latency ++ * requirements of a task with respect to the other tasks running/queued in the ++ * system. ++ * ++ * @ sched_latency_nice task's latency_nice value ++ * ++ * The latency_nice of a task can have any value in a range of ++ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE]. ++ * ++ * A task with latency_nice with the value of LATENCY_NICE_MIN can be ++ * taken for a task requiring a lower latency as opposed to the task with ++ * higher latency_nice. 
+ */ + struct sched_attr { + __u32 size; +@@ -120,6 +137,8 @@ struct sched_attr { + __u32 sched_util_min; + __u32 sched_util_max; + ++ /* latency requirement hints */ ++ __s32 sched_latency_nice; + }; + + #endif /* _UAPI_LINUX_SCHED_TYPES_H */ +diff --git a/init/init_task.c b/init/init_task.c +index ff6c4b9bfe6b..071deff8dbd1 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -78,6 +78,7 @@ struct task_struct init_task + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++ .latency_prio = DEFAULT_LATENCY_PRIO, + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .user_cpus_ptr = NULL, +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 5237639786b7..5d6a283a4da9 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1283,6 +1283,16 @@ static void set_load_weight(struct task_struct *p, bool update_load) + } + } + ++static void set_latency_offset(struct task_struct *p) ++{ ++ long weight = sched_latency_to_weight[p->latency_prio]; ++ s64 offset; ++ ++ offset = weight * get_sleep_latency(false); ++ offset = div_s64(offset, NICE_LATENCY_WEIGHT_MAX); ++ p->se.latency_offset = (long)offset; ++} ++ + #ifdef CONFIG_UCLAMP_TASK + /* + * Serializes updates of utilization clamp values +@@ -4432,6 +4442,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.dur_avg = 0; + p->se.prev_sleep_sum_runtime = 0; + INIT_LIST_HEAD(&p->se.group_node); ++ RB_CLEAR_NODE(&p->se.latency_node); + + #ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; +@@ -4684,6 +4695,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + p->prio = p->normal_prio = p->static_prio; + set_load_weight(p, false); + ++ p->latency_prio = NICE_TO_LATENCY(0); ++ set_latency_offset(p); ++ + /* + * We don't need the reset flag anymore after the fork. 
It has + * fulfilled its duty: +@@ -7444,6 +7458,16 @@ static void __setscheduler_params(struct task_struct *p, + p->rt_priority = attr->sched_priority; + p->normal_prio = normal_prio(p); + set_load_weight(p, true); ++ ++} ++ ++static void __setscheduler_latency(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { ++ p->latency_prio = NICE_TO_LATENCY(attr->sched_latency_nice); ++ set_latency_offset(p); ++ } + } + + /* +@@ -7586,6 +7610,13 @@ static int __sched_setscheduler(struct task_struct *p, + return retval; + } + ++ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { ++ if (attr->sched_latency_nice > MAX_LATENCY_NICE) ++ return -EINVAL; ++ if (attr->sched_latency_nice < MIN_LATENCY_NICE) ++ return -EINVAL; ++ } ++ + if (pi) + cpuset_read_lock(); + +@@ -7620,6 +7651,9 @@ static int __sched_setscheduler(struct task_struct *p, + goto change; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) + goto change; ++ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE && ++ attr->sched_latency_nice != LATENCY_TO_NICE(p->latency_prio)) ++ goto change; + + p->sched_reset_on_fork = reset_on_fork; + retval = 0; +@@ -7708,6 +7742,7 @@ static int __sched_setscheduler(struct task_struct *p, + __setscheduler_params(p, attr); + __setscheduler_prio(p, newprio); + } ++ __setscheduler_latency(p, attr); + __setscheduler_uclamp(p, attr); + + if (queued) { +@@ -7918,6 +7953,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a + size < SCHED_ATTR_SIZE_VER1) + return -EINVAL; + ++ if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) && ++ size < SCHED_ATTR_SIZE_VER2) ++ return -EINVAL; + /* + * XXX: Do we want to be lenient like existing syscalls; or do we want + * to be strict and return an error on out-of-bounds values? 
+@@ -8155,6 +8193,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; + ++ kattr.sched_latency_nice = LATENCY_TO_NICE(p->latency_prio); ++ + #ifdef CONFIG_UCLAMP_TASK + /* + * This could race with another potential updater, but this is fine +@@ -11027,6 +11067,47 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, + { + return sched_group_set_idle(css_tg(css), idle); + } ++ ++static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css, ++ struct cftype *cft) ++{ ++ int prio, delta, last_delta = INT_MAX; ++ s64 weight; ++ ++ weight = css_tg(css)->latency_offset * NICE_LATENCY_WEIGHT_MAX; ++ weight = div_s64(weight, get_sleep_latency(false)); ++ ++ /* Find the closest nice value to the current weight */ ++ for (prio = 0; prio < ARRAY_SIZE(sched_latency_to_weight); prio++) { ++ delta = abs(sched_latency_to_weight[prio] - weight); ++ if (delta >= last_delta) ++ break; ++ last_delta = delta; ++ } ++ ++ return LATENCY_TO_NICE(prio-1); ++} ++ ++static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css, ++ struct cftype *cft, s64 nice) ++{ ++ s64 latency_offset; ++ long weight; ++ int idx; ++ ++ if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE) ++ return -ERANGE; ++ ++ idx = NICE_TO_LATENCY(nice); ++ idx = array_index_nospec(idx, LATENCY_NICE_WIDTH); ++ weight = sched_latency_to_weight[idx]; ++ ++ latency_offset = weight * get_sleep_latency(false); ++ latency_offset = div_s64(latency_offset, NICE_LATENCY_WEIGHT_MAX); ++ ++ return sched_group_set_latency(css_tg(css), latency_offset); ++} ++ + #endif + + static struct cftype cpu_legacy_files[] = { +@@ -11041,6 +11122,11 @@ static struct cftype cpu_legacy_files[] = { + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, ++ { ++ .name = "latency.nice", ++ .read_s64 = cpu_latency_nice_read_s64, ++ .write_s64 = cpu_latency_nice_write_s64, ++ }, + #endif + #ifdef CONFIG_CFS_BANDWIDTH + { +@@ -11258,6 +11344,12 @@ static struct cftype cpu_files[] = { + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, ++ { ++ .name = "latency.nice", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .read_s64 = cpu_latency_nice_read_s64, ++ .write_s64 = cpu_latency_nice_write_s64, ++ }, + #endif + #ifdef CONFIG_CFS_BANDWIDTH + { +@@ -11368,6 +11460,20 @@ const u32 sched_prio_to_wmult[40] = { + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, + }; + ++/* ++ * latency weight for wakeup preemption ++ */ ++const int sched_latency_to_weight[40] = { ++ /* -20 */ -1024, -973, -922, -870, -819, ++ /* -15 */ -768, -717, -666, -614, -563, ++ /* -10 */ -512, -461, -410, -358, -307, ++ /* -5 */ -256, -205, -154, -102, -51, ++ /* 0 */ 0, 51, 102, 154, 205, ++ /* 5 */ 256, 307, 358, 410, 461, ++ /* 10 */ 512, 563, 614, 666, 717, ++ /* 15 */ 768, 819, 870, 922, 973, ++}; ++ + void call_trace_sched_update_nr_running(struct rq *rq, int count) + { + trace_sched_update_nr_running_tp(rq, count); +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 8d64fba16cfe..177934290ec4 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -1044,6 +1044,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + #endif + P(policy); + P(prio); ++ P(latency_prio); + if (task_has_dl_policy(p)) { + P(dl.runtime); + P(dl.deadline); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b38a1ce1be49..5ef893ce5734 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -698,7 
+698,76 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) + + return __node_2_se(last); + } ++#endif ++ ++/************************************************************** ++ * Scheduling class tree data structure manipulation methods: ++ * for latency ++ */ ++ ++static inline bool latency_before(struct sched_entity *a, ++ struct sched_entity *b) ++{ ++ return (s64)(a->vruntime + a->latency_offset - b->vruntime - b->latency_offset) < 0; ++} ++ ++#define __latency_node_2_se(node) \ ++ rb_entry((node), struct sched_entity, latency_node) ++ ++static inline bool __latency_less(struct rb_node *a, const struct rb_node *b) ++{ ++ return latency_before(__latency_node_2_se(a), __latency_node_2_se(b)); ++} ++ ++/* ++ * Enqueue an entity into the latency rb-tree: ++ */ ++static void __enqueue_latency(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) ++{ ++ ++ /* Only latency sensitive entity can be added to the list */ ++ if (se->latency_offset >= 0) ++ return; ++ ++ if (!RB_EMPTY_NODE(&se->latency_node)) ++ return; ++ ++ /* ++ * An execution time less than sysctl_sched_min_granularity means that ++ * the entity has been preempted by a higher sched class or an entity ++ * with higher latency constraint. ++ * Put it back in the list so it gets a chance to run 1st during the ++ * next slice. ++ */ ++ if (!(flags & ENQUEUE_WAKEUP)) { ++ u64 delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; ++ ++ if (delta_exec >= sysctl_sched_min_granularity) ++ return; ++ } ++ ++ rb_add_cached(&se->latency_node, &cfs_rq->latency_timeline, __latency_less); ++} ++ ++static void __dequeue_latency(struct cfs_rq *cfs_rq, struct sched_entity *se) ++{ ++ if (!RB_EMPTY_NODE(&se->latency_node)) { ++ rb_erase_cached(&se->latency_node, &cfs_rq->latency_timeline); ++ RB_CLEAR_NODE(&se->latency_node); ++ } ++} ++ ++static struct sched_entity *__pick_first_latency(struct cfs_rq *cfs_rq) ++{ ++ struct rb_node *left = rb_first_cached(&cfs_rq->latency_timeline); ++ ++ if (!left) ++ return NULL; ++ ++ return __latency_node_2_se(left); ++} + ++#ifdef CONFIG_SCHED_DEBUG + /************************************************************** + * Scheduling class statistics methods: + */ +@@ -4672,33 +4741,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) + u64 vruntime = cfs_rq->min_vruntime; + u64 sleep_time; + +- /* +- * The 'current' period is already promised to the current tasks, +- * however the extra weight of the new task will slow them down a +- * little, place the new task so that it fits in the slot that +- * stays open at the end. +- */ +- if (initial && sched_feat(START_DEBIT)) +- vruntime += sched_vslice(cfs_rq, se); +- +- /* sleeps up to a single latency don't count. */ +- if (!initial) { +- unsigned long thresh; +- +- if (se_is_idle(se)) +- thresh = sysctl_sched_min_granularity; +- else +- thresh = sysctl_sched_latency; +- ++ if (!initial) ++ /* sleeps up to a single latency don't count. */ ++ vruntime -= get_sleep_latency(se_is_idle(se)); ++ else if (sched_feat(START_DEBIT)) + /* +- * Halve their sleep time's effect, to allow +- * for a gentler effect of sleepers: ++ * The 'current' period is already promised to the current tasks, ++ * however the extra weight of the new task will slow them down a ++ * little, place the new task so that it fits in the slot that ++ * stays open at the end. 
+ */ +- if (sched_feat(GENTLE_FAIR_SLEEPERS)) +- thresh >>= 1; +- +- vruntime -= thresh; +- } ++ vruntime += sched_vslice(cfs_rq, se); + + /* + * Pull vruntime of the entity being placed to the base level of +@@ -4792,8 +4845,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + check_schedstat_required(); + update_stats_enqueue_fair(cfs_rq, se, flags); + check_spread(cfs_rq, se); +- if (!curr) ++ if (!curr) { + __enqueue_entity(cfs_rq, se); ++ __enqueue_latency(cfs_rq, se, flags); ++ } + se->on_rq = 1; + + if (cfs_rq->nr_running == 1) { +@@ -4879,8 +4934,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + + clear_buddies(cfs_rq, se); + +- if (se != cfs_rq->curr) ++ if (se != cfs_rq->curr) { + __dequeue_entity(cfs_rq, se); ++ __dequeue_latency(cfs_rq, se); ++ } + se->on_rq = 0; + account_entity_dequeue(cfs_rq, se); + +@@ -4911,6 +4968,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + update_idle_cfs_rq_clock_pelt(cfs_rq); + } + ++static long wakeup_latency_gran(struct sched_entity *curr, struct sched_entity *se); ++ + /* + * Preempt the current task with a newly woken task if needed: + */ +@@ -4919,7 +4978,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + { + unsigned long ideal_runtime, delta_exec; + struct sched_entity *se; +- s64 delta; ++ s64 delta, offset; + + /* + * When many tasks blow up the sched_period; it is possible that +@@ -4950,10 +5009,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + se = __pick_first_entity(cfs_rq); + delta = curr->vruntime - se->vruntime; + +- if (delta < 0) ++ offset = wakeup_latency_gran(curr, se); ++ if (delta < offset) + return; + +- if (delta > ideal_runtime) ++ if ((delta > ideal_runtime) || ++ (delta > get_latency_max())) + resched_curr(rq_of(cfs_rq)); + } + +@@ -4971,6 +5032,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + */ + update_stats_wait_end_fair(cfs_rq, se); + __dequeue_entity(cfs_rq, se); ++ __dequeue_latency(cfs_rq, se); + update_load_avg(cfs_rq, se, UPDATE_TG); + } + +@@ -5009,7 +5071,7 @@ static struct sched_entity * + pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) + { + struct sched_entity *left = __pick_first_entity(cfs_rq); +- struct sched_entity *se; ++ struct sched_entity *latency, *se; + + /* + * If curr is set we have to see if its left of the leftmost entity +@@ -5051,6 +5113,12 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) + se = cfs_rq->last; + } + ++ /* Check for latency sensitive entity waiting for running */ ++ latency = __pick_first_latency(cfs_rq); ++ if (latency && (latency != se) && ++ wakeup_preempt_entity(latency, se) < 1) ++ se = latency; ++ + return se; + } + +@@ -5074,6 +5142,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) + update_stats_wait_start_fair(cfs_rq, prev); + /* Put 'current' back into the tree. */ + __enqueue_entity(cfs_rq, prev); ++ __enqueue_latency(cfs_rq, prev, 0); + /* in !on_rq case, update occurred at dequeue */ + update_load_avg(cfs_rq, prev, 0); + } +@@ -7735,6 +7804,23 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + } + #endif /* CONFIG_SMP */ + ++static long wakeup_latency_gran(struct sched_entity *curr, struct sched_entity *se) ++{ ++ long latency_offset = se->latency_offset; ++ ++ /* ++ * A negative latency offset means that the sched_entity has latency ++ * requirement that needs to be evaluated versus other entity. 
++ * Otherwise, use the latency weight to evaluate how much scheduling ++ * delay is acceptable by se. ++ */ ++ if ((latency_offset < 0) || (curr->latency_offset < 0)) ++ latency_offset -= curr->latency_offset; ++ latency_offset = min_t(long, latency_offset, get_latency_max()); ++ ++ return latency_offset; ++} ++ + static unsigned long wakeup_gran(struct sched_entity *se) + { + unsigned long gran = sysctl_sched_wakeup_granularity; +@@ -7773,11 +7859,24 @@ static int + wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) + { + s64 gran, vdiff = curr->vruntime - se->vruntime; ++ s64 offset = wakeup_latency_gran(curr, se); + +- if (vdiff <= 0) ++ if (vdiff < offset) + return -1; + +- gran = wakeup_gran(se); ++ gran = offset + wakeup_gran(se); ++ ++ /* ++ * At wake up, the vruntime of a task is capped to not be older than ++ * a sched_latency period compared to min_vruntime. This prevents long ++ * sleeping task to get unlimited credit at wakeup. Such waking up task ++ * has to preempt current in order to not lose its share of CPU ++ * bandwidth but wakeup_gran() can become higher than scheduling period ++ * for low priority task. Make sure that long sleeping task will get a ++ * chance to preempt current. ++ */ ++ gran = min_t(s64, gran, get_latency_max()); ++ + if (vdiff > gran) + return 1; + +@@ -11995,6 +12094,9 @@ bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) + delta = (s64)(sea->vruntime - seb->vruntime) + + (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); + ++ /* Take into account latency prio */ ++ delta -= wakeup_latency_gran(sea, seb); ++ + return delta > 0; + } + #else +@@ -12265,6 +12367,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + void init_cfs_rq(struct cfs_rq *cfs_rq) + { + cfs_rq->tasks_timeline = RB_ROOT_CACHED; ++ cfs_rq->latency_timeline = RB_ROOT_CACHED; + u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); + #ifdef CONFIG_SMP + raw_spin_lock_init(&cfs_rq->removed.lock); +@@ -12320,6 +12423,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) + goto err; + + tg->shares = NICE_0_LOAD; ++ tg->latency_offset = 0; + + init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + +@@ -12418,6 +12522,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + } + + se->my_q = cfs_rq; ++ ++ se->latency_offset = tg->latency_offset; ++ + /* guarantee group entities always have weight */ + update_load_set(&se->load, NICE_0_LOAD); + se->parent = parent; +@@ -12548,6 +12655,42 @@ int sched_group_set_idle(struct task_group *tg, long idle) + return 0; + } + ++int sched_group_set_latency(struct task_group *tg, s64 latency) ++{ ++ int i; ++ ++ if (tg == &root_task_group) ++ return -EINVAL; ++ ++ if (abs(latency) > sysctl_sched_latency) ++ return -EINVAL; ++ ++ mutex_lock(&shares_mutex); ++ ++ if (tg->latency_offset == latency) { ++ mutex_unlock(&shares_mutex); ++ return 0; ++ } ++ ++ tg->latency_offset = latency; ++ ++ for_each_possible_cpu(i) { ++ struct sched_entity *se = tg->se[i]; ++ struct rq *rq = cpu_rq(i); ++ struct rq_flags rf; ++ ++ rq_lock_irqsave(rq, &rf); ++ ++ __dequeue_latency(se->cfs_rq, se); ++ WRITE_ONCE(se->latency_offset, latency); ++ ++ rq_unlock_irqrestore(rq, &rf); ++ } ++ ++ mutex_unlock(&shares_mutex); ++ return 0; ++} ++ + #else /* CONFIG_FAIR_GROUP_SCHED */ + + void free_fair_sched_group(struct task_group *tg) { } +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 9e8bb6278604..c47198dbf740 100644 +--- 
a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -125,6 +125,11 @@ extern int sched_rr_timeslice; + */ + #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) + ++/* Maximum nice latency weight used to scale the latency_offset */ ++ ++#define NICE_LATENCY_SHIFT (SCHED_FIXEDPOINT_SHIFT) ++#define NICE_LATENCY_WEIGHT_MAX (1L << NICE_LATENCY_SHIFT) ++ + /* + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of +@@ -378,6 +383,8 @@ struct task_group { + + /* A positive value indicates that this is a SCHED_IDLE group. */ + int idle; ++ /* latency constraint of the group. */ ++ int latency_offset; + + #ifdef CONFIG_SMP + /* +@@ -488,6 +495,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); + + extern int sched_group_set_idle(struct task_group *tg, long idle); + ++extern int sched_group_set_latency(struct task_group *tg, s64 latency); ++ + #ifdef CONFIG_SMP + extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +@@ -566,6 +575,7 @@ struct cfs_rq { + #endif + + struct rb_root_cached tasks_timeline; ++ struct rb_root_cached latency_timeline; + + /* + * 'curr' points to currently running entity on this cfs_rq. +@@ -2123,6 +2133,7 @@ static_assert(WF_TTWU == SD_BALANCE_WAKE); + + extern const int sched_prio_to_weight[40]; + extern const u32 sched_prio_to_wmult[40]; ++extern const int sched_latency_to_weight[40]; + + /* + * {de,en}queue flags: +@@ -2461,9 +2472,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); + extern const_debug unsigned int sysctl_sched_nr_migrate; + extern const_debug unsigned int sysctl_sched_migration_cost; + +-#ifdef CONFIG_SCHED_DEBUG + extern unsigned int sysctl_sched_latency; + extern unsigned int sysctl_sched_min_granularity; ++#ifdef CONFIG_SCHED_DEBUG + extern unsigned int sysctl_sched_idle_min_granularity; + extern unsigned int sysctl_sched_wakeup_granularity; + extern int sysctl_resched_latency_warn_ms; +@@ -2478,6 +2489,38 @@ extern unsigned int sysctl_numa_balancing_scan_size; + extern unsigned int sysctl_numa_balancing_hot_threshold; + #endif + ++static inline unsigned long get_sleep_latency(bool idle) ++{ ++ unsigned long thresh; ++ ++ if (idle) ++ thresh = sysctl_sched_min_granularity; ++ else ++ thresh = sysctl_sched_latency; ++ ++ /* ++ * Halve their sleep time's effect, to allow ++ * for a gentler effect of sleepers: ++ */ ++ if (sched_feat(GENTLE_FAIR_SLEEPERS)) ++ thresh >>= 1; ++ ++ return thresh; ++} ++ ++static inline unsigned long get_latency_max(void) ++{ ++ unsigned long thresh = get_sleep_latency(false); ++ ++ /* ++ * If the waking task failed to preempt current it could to wait up to ++ * sysctl_sched_min_granularity before preempting it during next tick. 
++ */ ++ thresh -= sysctl_sched_min_granularity; ++ ++ return thresh; ++} ++ + #ifdef CONFIG_SCHED_HRTICK + + /* +diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h +index 3bac0a8ceab2..b2e932c25be6 100644 +--- a/tools/include/uapi/linux/sched.h ++++ b/tools/include/uapi/linux/sched.h +@@ -132,6 +132,7 @@ struct clone_args { + #define SCHED_FLAG_KEEP_PARAMS 0x10 + #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 + #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 ++#define SCHED_FLAG_LATENCY_NICE 0x80 + + #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ + SCHED_FLAG_KEEP_PARAMS) +@@ -143,6 +144,7 @@ struct clone_args { + SCHED_FLAG_RECLAIM | \ + SCHED_FLAG_DL_OVERRUN | \ + SCHED_FLAG_KEEP_ALL | \ +- SCHED_FLAG_UTIL_CLAMP) ++ SCHED_FLAG_UTIL_CLAMP | \ ++ SCHED_FLAG_LATENCY_NICE) + + #endif /* _UAPI_LINUX_SCHED_H */ +-- +2.39.2 diff --git a/patches/0003-bore.patch b/patches/0003-bore.patch new file mode 100644 index 0000000..9e5bc88 --- /dev/null +++ b/patches/0003-bore.patch @@ -0,0 +1,388 @@ +From f169eabeb1ba8f339ab9bebec8d503c70c5f5879 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 17 Feb 2023 15:39:23 +0100 +Subject: [PATCH] bore-cachy + +Signed-off-by: Peter Jung +--- + include/linux/sched.h | 5 ++ + init/Kconfig | 20 ++++++ + kernel/sched/core.c | 29 +++++++++ + kernel/sched/debug.c | 3 + + kernel/sched/fair.c | 132 +++++++++++++++++++++++++++++++++++++++- + kernel/sched/features.h | 4 ++ + 6 files changed, 190 insertions(+), 3 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index df219c7cd6aa..a3538eacb095 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -556,6 +556,11 @@ struct sched_entity { + u64 sum_exec_runtime; + u64 vruntime; + u64 prev_sum_exec_runtime; ++#ifdef CONFIG_SCHED_BORE ++ u64 prev_burst_time; ++ u64 burst_time; ++ u8 burst_score; ++#endif // CONFIG_SCHED_BORE + + u64 nr_migrations; + u64 prev_sleep_sum_runtime; +diff --git a/init/Kconfig b/init/Kconfig +index 85a602dba878..bc69f062ca76 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1318,6 +1318,26 @@ config CHECKPOINT_RESTORE + + If unsure, say N here. + ++config SCHED_BORE ++ bool "Burst-Oriented Response Enhancer" ++ default y ++ help ++ In Desktop and Mobile computing, one might prefer interactive ++ tasks to keep responsive no matter what they run in the background. ++ ++ Enabling this kernel feature modifies the scheduler to discriminate ++ tasks by their burst time (runtime since it last went sleeping or ++ yielding state) and prioritize those that run less bursty. ++ Such tasks usually include window compositor, widgets backend, ++ terminal emulator, video playback, games and so on. ++ With a little impact to scheduling fairness, it may improve ++ responsiveness especially under heavy background workload. ++ ++ You can turn it off by setting the sysctl kernel.sched_bore = 0. ++ Enabling this feature implies NO_GENTLE_FAIR_SLEEPERS by default. ++ ++ If unsure say Y here. 
++ + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select CGROUPS +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 919edb034108..fd52870a002f 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4420,6 +4420,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) + return try_to_wake_up(p, state, 0); + } + ++#ifdef CONFIG_SCHED_BORE ++static inline void sched_fork_update_prev_burst(struct task_struct *p) ++{ ++ struct task_struct *sib; ++ u32 cnt = 0; ++ u64 sum = 0, avg = 0; ++ list_for_each_entry(sib, &p->sibling, sibling) { ++ cnt++; ++ sum += sib->se.prev_burst_time >> 8; ++ } ++ if (cnt) avg = div_u64(sum, cnt) << 8; ++ if (p->se.prev_burst_time < avg) p->se.prev_burst_time = avg; ++} ++#endif // CONFIG_SCHED_BORE ++ + /* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. +@@ -4438,6 +4453,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.vruntime = 0; + p->se.dur_avg = 0; + p->se.prev_sleep_sum_runtime = 0; ++#ifdef CONFIG_SCHED_BORE ++ p->se.burst_time = 0; ++#endif // CONFIG_SCHED_BORE + INIT_LIST_HEAD(&p->se.group_node); + RB_CLEAR_NODE(&p->se.latency_node); + +@@ -4664,6 +4682,10 @@ late_initcall(sched_core_sysctl_init); + int sched_fork(unsigned long clone_flags, struct task_struct *p) + { + __sched_fork(clone_flags, p); ++#ifdef CONFIG_SCHED_BORE ++ sched_fork_update_prev_burst(p); ++ p->se.burst_time = 0; ++#endif // CONFIG_SCHED_BORE + /* + * We mark the process as NEW here. This guarantees that + * nobody will actually run it, and a signal or other external +@@ -9154,6 +9176,9 @@ void __init init_idle(struct task_struct *idle, int cpu) + + idle->__state = TASK_RUNNING; + idle->se.exec_start = sched_clock(); ++#ifdef CONFIG_SCHED_BORE ++ idle->se.prev_burst_time = 0; ++#endif //CONFIG_SCHED_BORE + /* + * PF_KTHREAD should already be set at this point; regardless, make it + * look like a proper per-CPU kthread. 
+@@ -9821,6 +9846,10 @@ void __init sched_init(void) + BUG_ON(&dl_sched_class != &stop_sched_class + 1); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 1.7.10 by Masahito Suzuki"); ++#endif // CONFIG_SCHED_BORE ++ + wait_bit_init(); + + #ifdef CONFIG_FAIR_GROUP_SCHED +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 177934290ec4..2f40a238cdad 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -547,6 +547,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); + ++#ifdef CONFIG_SCHED_BORE ++ SEQ_printf(m, " %2d", p->se.burst_score); ++#endif + #ifdef CONFIG_NUMA_BALANCING + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); + #endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 5ef893ce5734..590adb9a3e37 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -19,6 +19,9 @@ + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra ++ * ++ * Burst-Oriented Response Enhancer (BORE) CPU Scheduler ++ * Copyright (C) 2021 Masahito Suzuki + */ + #include + #include +@@ -140,6 +143,16 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; + + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + ++#ifdef CONFIG_SCHED_BORE ++unsigned int __read_mostly sched_bore = 1; ++unsigned int __read_mostly sched_burst_penalty_scale = 1280; ++unsigned int __read_mostly sched_burst_granularity = 12; ++unsigned int __read_mostly sched_burst_smoothness = 2; ++static int three = 3; ++static int sixty_four = 64; ++static int maxval_12_bits = 4095; ++#endif // CONFIG_SCHED_BORE ++ + int sched_thermal_decay_shift; + static int __init setup_sched_thermal_decay_shift(char *str) + { +@@ -203,6 +216,44 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; + + #ifdef CONFIG_SYSCTL + static struct ctl_table sched_fair_sysctls[] = { ++#ifdef CONFIG_SCHED_BORE ++ { ++ .procname = "sched_bore", ++ .data = &sched_bore, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &three, ++ }, ++ { ++ .procname = "sched_burst_penalty_scale", ++ .data = &sched_burst_penalty_scale, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &maxval_12_bits, ++ }, ++ { ++ .procname = "sched_burst_granularity", ++ .data = &sched_burst_granularity, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &sixty_four, ++ }, ++ { ++ .procname = "sched_burst_smoothness", ++ .data = &sched_burst_smoothness, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &three, ++ }, ++#endif // CONFIG_SCHED_BORE + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -978,6 +1029,39 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) + } + #endif /* CONFIG_SMP */ + ++#ifdef CONFIG_SCHED_BORE ++static inline void update_burst_score(struct sched_entity *se) { ++ u64 burst_time; ++ s32 bits; ++ u32 intgr, fdigs, dec10; ++ ++ burst_time = max(se->burst_time, se->prev_burst_time); ++ bits = 
fls64(burst_time); ++ intgr = max((u32)bits, sched_burst_granularity) - sched_burst_granularity; ++ fdigs = max(bits - 1, (s32)sched_burst_granularity); ++ dec10 = (intgr << 10) | (burst_time << (64 - fdigs) >> 54); ++ se->burst_score = min((u32)39, dec10 * sched_burst_penalty_scale >> 20); ++} ++ ++static u64 burst_scale(u64 delta, struct sched_entity *se) { ++ return mul_u64_u32_shr(delta, sched_prio_to_wmult[se->burst_score], 22); ++} ++ ++static u64 calc_delta_fair_bscale(u64 delta, struct sched_entity *se) { ++ return burst_scale(calc_delta_fair(delta, se), se); ++} ++ ++static inline u64 binary_smooth(u64 old, u64 new, unsigned int smoothness) { ++ return (new + old * ((1 << smoothness) - 1)) >> smoothness; ++} ++ ++static inline void reset_burst(struct sched_entity *se) { ++ se->prev_burst_time = binary_smooth( ++ se->prev_burst_time, se->burst_time, sched_burst_smoothness); ++ se->burst_time = 0; ++} ++#endif // CONFIG_SCHED_BORE ++ + /* + * Update the current task's runtime statistics. + */ +@@ -1007,6 +1091,13 @@ static void update_curr(struct cfs_rq *cfs_rq) + curr->sum_exec_runtime += delta_exec; + schedstat_add(cfs_rq->exec_clock, delta_exec); + ++#ifdef CONFIG_SCHED_BORE ++ curr->burst_time += delta_exec; ++ update_burst_score(curr); ++ if (sched_bore & 1) ++ curr->vruntime += calc_delta_fair_bscale(delta_exec, curr); ++ else ++#endif // CONFIG_SCHED_BORE + curr->vruntime += calc_delta_fair(delta_exec, curr); + update_min_vruntime(cfs_rq); + +@@ -5057,6 +5148,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + se->prev_sum_exec_runtime = se->sum_exec_runtime; + } + ++#ifdef CONFIG_SCHED_BORE ++static int ++wakeup_preempt_entity_bscale(struct sched_entity *curr, ++ struct sched_entity *se, bool do_scale); ++#endif // CONFIG_SCHED_BORE + static int + wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); + +@@ -5101,7 +5197,13 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) + se = second; + } + +- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { ++#ifdef CONFIG_SCHED_BORE ++ if (cfs_rq->next && wakeup_preempt_entity_bscale( ++ cfs_rq->next, left, sched_bore & 2) < 1) ++#else // CONFIG_SCHED_BORE ++ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) ++#endif // CONFIG_SCHED_BORE ++ { + /* + * Someone really wants this to run. If it's not unfair, run it. + */ +@@ -6394,6 +6496,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + util_est_dequeue(&rq->cfs, p); + + for_each_sched_entity(se) { ++#ifdef CONFIG_SCHED_BORE ++ if (task_sleep) reset_burst(se); ++#endif // CONFIG_SCHED_BORE + cfs_rq = cfs_rq_of(se); + dequeue_entity(cfs_rq, se, flags); + +@@ -7856,7 +7961,12 @@ static unsigned long wakeup_gran(struct sched_entity *se) + * + */ + static int ++#ifdef CONFIG_SCHED_BORE ++wakeup_preempt_entity_bscale(struct sched_entity *curr, ++ struct sched_entity *se, bool do_scale) ++#else // CONFIG_SCHED_BORE + wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) ++#endif // CONFIG_SCHED_BORE + { + s64 gran, vdiff = curr->vruntime - se->vruntime; + s64 offset = wakeup_latency_gran(curr, se); +@@ -7876,12 +7986,20 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) + * chance to preempt current. 
+ */ + gran = min_t(s64, gran, get_latency_max()); +- ++#ifdef CONFIG_SCHED_BORE ++ if (do_scale) gran = burst_scale(gran, se); ++#endif // CONFIG_SCHED_BORE + if (vdiff > gran) + return 1; + + return 0; + } ++#ifdef CONFIG_SCHED_BORE ++static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) ++{ ++ return wakeup_preempt_entity_bscale(curr, se, false); ++} ++#endif // CONFIG_SCHED_BORE + + static void set_last_buddy(struct sched_entity *se) + { +@@ -7981,7 +8099,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + return; + + update_curr(cfs_rq_of(se)); +- if (wakeup_preempt_entity(se, pse) == 1) { ++#ifdef CONFIG_SCHED_BORE ++ if (wakeup_preempt_entity_bscale(se, pse, sched_bore & 2) == 1) ++#else // CONFIG_SCHED_BORE ++ if (wakeup_preempt_entity(se, pse) == 1) ++#endif // CONFIG_SCHED_BORE ++ { + /* + * Bias pick_next to pick the sched entity that is + * triggering this preemption. +@@ -8217,6 +8340,9 @@ static void yield_task_fair(struct rq *rq) + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct sched_entity *se = &curr->se; ++#ifdef CONFIG_SCHED_BORE ++ reset_burst(se); ++#endif // CONFIG_SCHED_BORE + + /* + * Are we the only task in the tree? +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index efdc29c42161..0f28637ce1aa 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -4,7 +4,11 @@ + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ ++#ifdef CONFIG_SCHED_BORE ++SCHED_FEAT(GENTLE_FAIR_SLEEPERS, false) ++#else // CONFIG_SCHED_BORE + SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) ++#endif // CONFIG_SCHED_BORE + + /* + * Place new tasks ahead so that they do not starve already running +-- +2.39.2 diff --git a/patches/0004-hdr.patch b/patches/0004-hdr.patch new file mode 100644 index 0000000..7e467ac --- /dev/null +++ b/patches/0004-hdr.patch @@ -0,0 +1,912 @@ +From 9cab14aa7f6828572f808d1bea60def5f883522c Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sun, 22 Jan 2023 23:10:03 +0100 +Subject: [PATCH 08/16] hdr + +Signed-off-by: Peter Jung +--- + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 110 ++++++++--- + .../amd/display/amdgpu_dm/amdgpu_dm_debugfs.c | 57 ++++++ + .../gpu/drm/amd/display/dc/core/dc_resource.c | 100 ++++------ + drivers/gpu/drm/amd/display/dc/dc_stream.h | 2 +- + drivers/gpu/drm/amd/display/dc/dc_types.h | 14 -- + drivers/gpu/drm/display/drm_hdmi_helper.c | 8 +- + drivers/gpu/drm/drm_atomic.c | 2 + + drivers/gpu/drm/drm_connector.c | 181 ++++++++++-------- + .../gpu/drm/i915/display/intel_connector.c | 4 +- + drivers/gpu/drm/vc4/vc4_hdmi.c | 2 +- + include/drm/display/drm_dp.h | 2 +- + include/drm/drm_connector.h | 57 +++--- + 12 files changed, 327 insertions(+), 212 deletions(-) + +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +index 93dee3d1a483..b5eb33a97590 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +@@ -5172,21 +5172,46 @@ get_aspect_ratio(const struct drm_display_mode *mode_in) + } + + static enum dc_color_space +-get_output_color_space(const struct dc_crtc_timing *dc_crtc_timing) ++get_output_color_space(const struct dc_crtc_timing *dc_crtc_timing, ++ const struct drm_connector_state *connector_state) + { + enum dc_color_space color_space = COLOR_SPACE_SRGB; + +- switch (dc_crtc_timing->pixel_encoding) { +- case 
PIXEL_ENCODING_YCBCR422: +- case PIXEL_ENCODING_YCBCR444: +- case PIXEL_ENCODING_YCBCR420: +- { ++ switch (connector_state->colorspace) { ++ case DRM_MODE_COLORIMETRY_BT601_YCC: ++ if (dc_crtc_timing->flags.Y_ONLY) ++ color_space = COLOR_SPACE_YCBCR601_LIMITED; ++ else ++ color_space = COLOR_SPACE_YCBCR601; ++ break; ++ case DRM_MODE_COLORIMETRY_BT709_YCC: ++ if (dc_crtc_timing->flags.Y_ONLY) ++ color_space = COLOR_SPACE_YCBCR709_LIMITED; ++ else ++ color_space = COLOR_SPACE_YCBCR709; ++ break; ++ case DRM_MODE_COLORIMETRY_OPRGB: ++ color_space = COLOR_SPACE_ADOBERGB; ++ break; ++ case DRM_MODE_COLORIMETRY_BT2020_RGB: ++ if (dc_crtc_timing->pixel_encoding == PIXEL_ENCODING_RGB) ++ color_space = COLOR_SPACE_2020_RGB_FULLRANGE; ++ else ++ color_space = COLOR_SPACE_2020_YCBCR; ++ break; ++ case DRM_MODE_COLORIMETRY_BT2020_YCC: ++ color_space = COLOR_SPACE_2020_YCBCR; ++ break; ++ case DRM_MODE_COLORIMETRY_DEFAULT: // ITU601 ++ default: ++ if (dc_crtc_timing->pixel_encoding == PIXEL_ENCODING_RGB) { ++ color_space = COLOR_SPACE_SRGB; + /* + * 27030khz is the separation point between HDTV and SDTV + * according to HDMI spec, we use YCbCr709 and YCbCr601 + * respectively + */ +- if (dc_crtc_timing->pix_clk_100hz > 270300) { ++ } else if (dc_crtc_timing->pix_clk_100hz > 270300) { + if (dc_crtc_timing->flags.Y_ONLY) + color_space = + COLOR_SPACE_YCBCR709_LIMITED; +@@ -5199,21 +5224,30 @@ get_output_color_space(const struct dc_crtc_timing *dc_crtc_timing) + else + color_space = COLOR_SPACE_YCBCR601; + } +- +- } +- break; +- case PIXEL_ENCODING_RGB: +- color_space = COLOR_SPACE_SRGB; +- break; +- +- default: +- WARN_ON(1); + break; + } + + return color_space; + } + ++static enum display_content_type ++get_output_content_type(const struct drm_connector_state *connector_state) ++{ ++ switch (connector_state->content_type) { ++ default: ++ case DRM_MODE_CONTENT_TYPE_NO_DATA: ++ return DISPLAY_CONTENT_TYPE_NO_DATA; ++ case DRM_MODE_CONTENT_TYPE_GRAPHICS: ++ return DISPLAY_CONTENT_TYPE_GRAPHICS; ++ case DRM_MODE_CONTENT_TYPE_PHOTO: ++ return DISPLAY_CONTENT_TYPE_PHOTO; ++ case DRM_MODE_CONTENT_TYPE_CINEMA: ++ return DISPLAY_CONTENT_TYPE_CINEMA; ++ case DRM_MODE_CONTENT_TYPE_GAME: ++ return DISPLAY_CONTENT_TYPE_GAME; ++ } ++} ++ + static bool adjust_colour_depth_from_display_info( + struct dc_crtc_timing *timing_out, + const struct drm_display_info *info) +@@ -5307,6 +5341,7 @@ static void fill_stream_properties_from_drm_display_mode( + if (stream->signal == SIGNAL_TYPE_HDMI_TYPE_A) { + drm_hdmi_avi_infoframe_from_display_mode(&avi_frame, (struct drm_connector *)connector, mode_in); + timing_out->vic = avi_frame.video_code; ++ drm_hdmi_avi_infoframe_colorimetry(&avi_frame, connector_state); + drm_hdmi_vendor_infoframe_from_display_mode(&hv_frame, (struct drm_connector *)connector, mode_in); + timing_out->hdmi_vic = hv_frame.vic; + } +@@ -5346,7 +5381,8 @@ static void fill_stream_properties_from_drm_display_mode( + } + } + +- stream->output_color_space = get_output_color_space(timing_out); ++ stream->output_color_space = get_output_color_space(timing_out, connector_state); ++ stream->content_type = get_output_content_type(connector_state); + } + + static void fill_audio_info(struct audio_info *audio_info, +@@ -5786,15 +5822,14 @@ create_stream_for_sink(struct amdgpu_dm_connector *aconnector, + { + struct drm_display_mode *preferred_mode = NULL; + struct drm_connector *drm_connector; +- const struct drm_connector_state *con_state = +- dm_state ? 
&dm_state->base : NULL; ++ const struct drm_connector_state *con_state = &dm_state->base; + struct dc_stream_state *stream = NULL; + struct drm_display_mode mode; + struct drm_display_mode saved_mode; + struct drm_display_mode *freesync_mode = NULL; + bool native_mode_found = false; + bool recalculate_timing = false; +- bool scale = dm_state ? (dm_state->scaling != RMX_OFF) : false; ++ bool scale = dm_state->scaling != RMX_OFF; + int mode_refresh; + int preferred_refresh = 0; + enum color_transfer_func tf = TRANSFER_FUNC_UNKNOWN; +@@ -5875,7 +5910,7 @@ create_stream_for_sink(struct amdgpu_dm_connector *aconnector, + + if (recalculate_timing) + drm_mode_set_crtcinfo(&saved_mode, 0); +- else if (!dm_state) ++ else + drm_mode_set_crtcinfo(&mode, 0); + + /* +@@ -6404,7 +6439,9 @@ enum drm_mode_status amdgpu_dm_connector_mode_valid(struct drm_connector *connec + goto fail; + } + +- stream = create_validate_stream_for_sink(aconnector, mode, NULL, NULL); ++ stream = create_validate_stream_for_sink(aconnector, mode, ++ to_dm_connector_state(connector->state), ++ NULL); + if (stream) { + dc_stream_release(stream); + result = MODE_OK; +@@ -6498,6 +6535,14 @@ amdgpu_dm_connector_atomic_check(struct drm_connector *conn, + if (!crtc) + return 0; + ++ if (new_con_state->colorspace != old_con_state->colorspace) { ++ new_crtc_state = drm_atomic_get_crtc_state(state, crtc); ++ if (IS_ERR(new_crtc_state)) ++ return PTR_ERR(new_crtc_state); ++ ++ new_crtc_state->mode_changed = true; ++ } ++ + if (!drm_connector_atomic_hdr_metadata_equal(old_con_state, new_con_state)) { + struct dc_info_packet hdr_infopacket; + +@@ -6520,7 +6565,7 @@ amdgpu_dm_connector_atomic_check(struct drm_connector *conn, + * set is permissible, however. So only force a + * modeset if we're entering or exiting HDR. + */ +- new_crtc_state->mode_changed = ++ new_crtc_state->mode_changed = new_crtc_state->mode_changed || + !old_con_state->hdr_output_metadata || + !new_con_state->hdr_output_metadata; + } +@@ -7041,6 +7086,12 @@ static int amdgpu_dm_connector_get_modes(struct drm_connector *connector) + return amdgpu_dm_connector->num_modes; + } + ++static const u32 supported_colorspaces = ++ BIT(DRM_MODE_COLORIMETRY_BT709_YCC) | ++ BIT(DRM_MODE_COLORIMETRY_OPRGB) | ++ BIT(DRM_MODE_COLORIMETRY_BT2020_RGB) | ++ BIT(DRM_MODE_COLORIMETRY_BT2020_YCC); ++ + void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm, + struct amdgpu_dm_connector *aconnector, + int connector_type, +@@ -7109,7 +7160,7 @@ void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm, + drm_connector_attach_max_bpc_property(&aconnector->base, 8, 16); + + /* This defaults to the max in the range, but we want 8bpc for non-edp. */ +- aconnector->base.state->max_bpc = (connector_type == DRM_MODE_CONNECTOR_eDP) ? 
16 : 8; ++ aconnector->base.state->max_bpc = 16; + aconnector->base.state->max_requested_bpc = aconnector->base.state->max_bpc; + + if (connector_type == DRM_MODE_CONNECTOR_eDP && +@@ -7118,6 +7169,17 @@ void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm, + adev->mode_info.abm_level_property, 0); + } + ++ drm_connector_attach_content_type_property(&aconnector->base); ++ ++ if (connector_type == DRM_MODE_CONNECTOR_HDMIA) { ++ if (!drm_mode_create_hdmi_colorspace_property(&aconnector->base, supported_colorspaces)) ++ drm_connector_attach_colorspace_property(&aconnector->base); ++ } else if (connector_type == DRM_MODE_CONNECTOR_DisplayPort || ++ connector_type == DRM_MODE_CONNECTOR_eDP) { ++ if (!drm_mode_create_dp_colorspace_property(&aconnector->base, supported_colorspaces)) ++ drm_connector_attach_colorspace_property(&aconnector->base); ++ } ++ + if (connector_type == DRM_MODE_CONNECTOR_HDMIA || + connector_type == DRM_MODE_CONNECTOR_DisplayPort || + connector_type == DRM_MODE_CONNECTOR_eDP) { +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c +index 461037a3dd75..d95d1c9f4805 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c +@@ -935,6 +935,61 @@ static int amdgpu_current_bpc_show(struct seq_file *m, void *data) + } + DEFINE_SHOW_ATTRIBUTE(amdgpu_current_bpc); + ++/* ++ * Returns the current bpc for the crtc. ++ * Example usage: cat /sys/kernel/debug/dri/0/crtc-0/amdgpu_current_colorspace ++ */ ++static int amdgpu_current_colorspace_show(struct seq_file *m, void *data) ++{ ++ struct drm_crtc *crtc = m->private; ++ struct drm_device *dev = crtc->dev; ++ struct dm_crtc_state *dm_crtc_state = NULL; ++ int res = -ENODEV; ++ ++ mutex_lock(&dev->mode_config.mutex); ++ drm_modeset_lock(&crtc->mutex, NULL); ++ if (crtc->state == NULL) ++ goto unlock; ++ ++ dm_crtc_state = to_dm_crtc_state(crtc->state); ++ if (dm_crtc_state->stream == NULL) ++ goto unlock; ++ ++ switch (dm_crtc_state->stream->output_color_space) { ++ case COLOR_SPACE_SRGB: ++ seq_printf(m, "RGB"); ++ break; ++ case COLOR_SPACE_YCBCR601: ++ case COLOR_SPACE_YCBCR601_LIMITED: ++ seq_printf(m, "BT601_YCC"); ++ break; ++ case COLOR_SPACE_YCBCR709: ++ case COLOR_SPACE_YCBCR709_LIMITED: ++ seq_printf(m, "BT709_YCC"); ++ break; ++ case COLOR_SPACE_ADOBERGB: ++ seq_printf(m, "opRGB"); ++ break; ++ case COLOR_SPACE_2020_RGB_FULLRANGE: ++ seq_printf(m, "BT2020_RGB"); ++ break; ++ case COLOR_SPACE_2020_YCBCR: ++ seq_printf(m, "BT2020_YCC"); ++ break; ++ default: ++ goto unlock; ++ } ++ res = 0; ++ ++unlock: ++ drm_modeset_unlock(&crtc->mutex); ++ mutex_unlock(&dev->mode_config.mutex); ++ ++ return res; ++} ++DEFINE_SHOW_ATTRIBUTE(amdgpu_current_colorspace); ++ ++ + /* + * Example usage: + * Disable dsc passthrough, i.e.,: have dsc decoding at converver, not external RX +@@ -3326,6 +3381,8 @@ void crtc_debugfs_init(struct drm_crtc *crtc) + #endif + debugfs_create_file("amdgpu_current_bpc", 0644, crtc->debugfs_entry, + crtc, &amdgpu_current_bpc_fops); ++ debugfs_create_file("amdgpu_current_colorspace", 0644, crtc->debugfs_entry, ++ crtc, &amdgpu_current_colorspace_fops); + } + + /* +diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +index da164685547d..e00fadf9d0ff 100644 +--- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c ++++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +@@ 
-2943,14 +2943,9 @@ static void set_avi_info_frame( + uint32_t pixel_encoding = 0; + enum scanning_type scan_type = SCANNING_TYPE_NODATA; + enum dc_aspect_ratio aspect = ASPECT_RATIO_NO_DATA; +- bool itc = false; +- uint8_t itc_value = 0; +- uint8_t cn0_cn1 = 0; +- unsigned int cn0_cn1_value = 0; + uint8_t *check_sum = NULL; + uint8_t byte_index = 0; + union hdmi_info_packet hdmi_info; +- union display_content_support support = {0}; + unsigned int vic = pipe_ctx->stream->timing.vic; + unsigned int rid = pipe_ctx->stream->timing.rid; + unsigned int fr_ind = pipe_ctx->stream->timing.fr_index; +@@ -3010,23 +3005,32 @@ static void set_avi_info_frame( + hdmi_info.bits.S0_S1 = scan_type; + + /* C0, C1 : Colorimetry */ +- if (color_space == COLOR_SPACE_YCBCR709 || +- color_space == COLOR_SPACE_YCBCR709_LIMITED) ++ switch (color_space) { ++ case COLOR_SPACE_YCBCR709: ++ case COLOR_SPACE_YCBCR709_LIMITED: + hdmi_info.bits.C0_C1 = COLORIMETRY_ITU709; +- else if (color_space == COLOR_SPACE_YCBCR601 || +- color_space == COLOR_SPACE_YCBCR601_LIMITED) ++ break; ++ case COLOR_SPACE_YCBCR601: ++ case COLOR_SPACE_YCBCR601_LIMITED: + hdmi_info.bits.C0_C1 = COLORIMETRY_ITU601; +- else { +- hdmi_info.bits.C0_C1 = COLORIMETRY_NO_DATA; +- } +- if (color_space == COLOR_SPACE_2020_RGB_FULLRANGE || +- color_space == COLOR_SPACE_2020_RGB_LIMITEDRANGE || +- color_space == COLOR_SPACE_2020_YCBCR) { ++ break; ++ case COLOR_SPACE_2020_RGB_FULLRANGE: ++ case COLOR_SPACE_2020_RGB_LIMITEDRANGE: + hdmi_info.bits.EC0_EC2 = COLORIMETRYEX_BT2020RGBYCBCR; + hdmi_info.bits.C0_C1 = COLORIMETRY_EXTENDED; +- } else if (color_space == COLOR_SPACE_ADOBERGB) { ++ break; ++ case COLOR_SPACE_2020_YCBCR: ++ hdmi_info.bits.EC0_EC2 = COLORIMETRYEX_BT2020YCC; ++ hdmi_info.bits.C0_C1 = COLORIMETRY_EXTENDED; ++ break; ++ case COLOR_SPACE_ADOBERGB: + hdmi_info.bits.EC0_EC2 = COLORIMETRYEX_ADOBERGB; + hdmi_info.bits.C0_C1 = COLORIMETRY_EXTENDED; ++ break; ++ case COLOR_SPACE_SRGB: ++ default: ++ hdmi_info.bits.C0_C1 = COLORIMETRY_NO_DATA; ++ break; + } + + if (pixel_encoding && color_space == COLOR_SPACE_2020_YCBCR && +@@ -3054,49 +3058,27 @@ static void set_avi_info_frame( + /* Active Format Aspect ratio - same as Picture Aspect Ratio. 
*/ + hdmi_info.bits.R0_R3 = ACTIVE_FORMAT_ASPECT_RATIO_SAME_AS_PICTURE; + +- /* TODO: un-hardcode cn0_cn1 and itc */ +- +- cn0_cn1 = 0; +- cn0_cn1_value = 0; +- +- itc = true; +- itc_value = 1; +- +- support = stream->content_support; +- +- if (itc) { +- if (!support.bits.valid_content_type) { +- cn0_cn1_value = 0; +- } else { +- if (cn0_cn1 == DISPLAY_CONTENT_TYPE_GRAPHICS) { +- if (support.bits.graphics_content == 1) { +- cn0_cn1_value = 0; +- } +- } else if (cn0_cn1 == DISPLAY_CONTENT_TYPE_PHOTO) { +- if (support.bits.photo_content == 1) { +- cn0_cn1_value = 1; +- } else { +- cn0_cn1_value = 0; +- itc_value = 0; +- } +- } else if (cn0_cn1 == DISPLAY_CONTENT_TYPE_CINEMA) { +- if (support.bits.cinema_content == 1) { +- cn0_cn1_value = 2; +- } else { +- cn0_cn1_value = 0; +- itc_value = 0; +- } +- } else if (cn0_cn1 == DISPLAY_CONTENT_TYPE_GAME) { +- if (support.bits.game_content == 1) { +- cn0_cn1_value = 3; +- } else { +- cn0_cn1_value = 0; +- itc_value = 0; +- } +- } +- } +- hdmi_info.bits.CN0_CN1 = cn0_cn1_value; +- hdmi_info.bits.ITC = itc_value; ++ switch (stream->content_type) { ++ case DISPLAY_CONTENT_TYPE_NO_DATA: ++ hdmi_info.bits.CN0_CN1 = 0; ++ hdmi_info.bits.ITC = 0; ++ break; ++ case DISPLAY_CONTENT_TYPE_GRAPHICS: ++ hdmi_info.bits.CN0_CN1 = 0; ++ hdmi_info.bits.ITC = 1; ++ break; ++ case DISPLAY_CONTENT_TYPE_PHOTO: ++ hdmi_info.bits.CN0_CN1 = 1; ++ hdmi_info.bits.ITC = 1; ++ break; ++ case DISPLAY_CONTENT_TYPE_CINEMA: ++ hdmi_info.bits.CN0_CN1 = 2; ++ hdmi_info.bits.ITC = 1; ++ break; ++ case DISPLAY_CONTENT_TYPE_GAME: ++ hdmi_info.bits.CN0_CN1 = 3; ++ hdmi_info.bits.ITC = 1; ++ break; + } + + if (stream->qs_bit == 1) { +diff --git a/drivers/gpu/drm/amd/display/dc/dc_stream.h b/drivers/gpu/drm/amd/display/dc/dc_stream.h +index dfd3df1d2f7e..f78d49e33a6e 100644 +--- a/drivers/gpu/drm/amd/display/dc/dc_stream.h ++++ b/drivers/gpu/drm/amd/display/dc/dc_stream.h +@@ -182,7 +182,6 @@ struct dc_stream_state { + */ + struct link_encoder *link_enc; + struct dc_panel_patch sink_patches; +- union display_content_support content_support; + struct dc_crtc_timing timing; + struct dc_crtc_timing_adjust adjust; + struct dc_info_packet vrr_infopacket; +@@ -205,6 +204,7 @@ struct dc_stream_state { + struct dc_csc_transform csc_color_matrix; + + enum dc_color_space output_color_space; ++ enum display_content_type content_type; + enum dc_dither_option dither_option; + + enum view_3d_format view_format; +diff --git a/drivers/gpu/drm/amd/display/dc/dc_types.h b/drivers/gpu/drm/amd/display/dc/dc_types.h +index dc78e2404b48..fdf58a2e3a75 100644 +--- a/drivers/gpu/drm/amd/display/dc/dc_types.h ++++ b/drivers/gpu/drm/amd/display/dc/dc_types.h +@@ -174,18 +174,6 @@ struct dc_edid { + + #define AUDIO_INFO_DISPLAY_NAME_SIZE_IN_CHARS 20 + +-union display_content_support { +- unsigned int raw; +- struct { +- unsigned int valid_content_type :1; +- unsigned int game_content :1; +- unsigned int cinema_content :1; +- unsigned int photo_content :1; +- unsigned int graphics_content :1; +- unsigned int reserved :27; +- } bits; +-}; +- + struct dc_panel_patch { + unsigned int dppowerup_delay; + unsigned int extra_t12_ms; +@@ -218,8 +206,6 @@ struct dc_edid_caps { + uint32_t audio_latency; + uint32_t video_latency; + +- union display_content_support content_support; +- + uint8_t qs_bit; + uint8_t qy_bit; + +diff --git a/drivers/gpu/drm/display/drm_hdmi_helper.c b/drivers/gpu/drm/display/drm_hdmi_helper.c +index 0264abe55278..c1e6851b2606 100644 +--- a/drivers/gpu/drm/display/drm_hdmi_helper.c ++++ 
b/drivers/gpu/drm/display/drm_hdmi_helper.c +@@ -44,10 +44,8 @@ int drm_hdmi_infoframe_set_hdr_metadata(struct hdmi_drm_infoframe *frame, + + /* Sink EOTF is Bit map while infoframe is absolute values */ + if (!is_eotf_supported(hdr_metadata->hdmi_metadata_type1.eotf, +- connector->hdr_sink_metadata.hdmi_type1.eotf)) { +- DRM_DEBUG_KMS("EOTF Not Supported\n"); +- return -EINVAL; +- } ++ connector->hdr_sink_metadata.hdmi_type1.eotf)) ++ DRM_DEBUG_KMS("Unknown EOTF %d\n", hdr_metadata->hdmi_metadata_type1.eotf); + + err = hdmi_drm_infoframe_init(frame); + if (err < 0) +@@ -105,7 +103,7 @@ EXPORT_SYMBOL(drm_hdmi_infoframe_set_hdr_metadata); + #define HDMI_COLORIMETRY_DCI_P3_RGB_THEATER (C(3) | EC(7) | ACE(1)) + + static const u32 hdmi_colorimetry_val[] = { +- [DRM_MODE_COLORIMETRY_NO_DATA] = HDMI_COLORIMETRY_NO_DATA, ++ [DRM_MODE_COLORIMETRY_DEFAULT] = HDMI_COLORIMETRY_NO_DATA, + [DRM_MODE_COLORIMETRY_SMPTE_170M_YCC] = HDMI_COLORIMETRY_SMPTE_170M_YCC, + [DRM_MODE_COLORIMETRY_BT709_YCC] = HDMI_COLORIMETRY_BT709_YCC, + [DRM_MODE_COLORIMETRY_XVYCC_601] = HDMI_COLORIMETRY_XVYCC_601, +diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c +index f197f59f6d99..d6d04c4ccfc0 100644 +--- a/drivers/gpu/drm/drm_atomic.c ++++ b/drivers/gpu/drm/drm_atomic.c +@@ -1070,6 +1070,8 @@ static void drm_atomic_connector_print_state(struct drm_printer *p, + drm_printf(p, "connector[%u]: %s\n", connector->base.id, connector->name); + drm_printf(p, "\tcrtc=%s\n", state->crtc ? state->crtc->name : "(null)"); + drm_printf(p, "\tself_refresh_aware=%d\n", state->self_refresh_aware); ++ drm_printf(p, "\tmax_requested_bpc=%d\n", state->max_requested_bpc); ++ drm_printf(p, "\tcolorspace=%s\n", drm_get_colorspace_name(state->colorspace)); + + if (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK) + if (state->writeback_job && state->writeback_job->fb) +diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c +index 547356e00341..e99d397cd228 100644 +--- a/drivers/gpu/drm/drm_connector.c ++++ b/drivers/gpu/drm/drm_connector.c +@@ -1016,64 +1016,72 @@ static const struct drm_prop_enum_list drm_dp_subconnector_enum_list[] = { + DRM_ENUM_NAME_FN(drm_get_dp_subconnector_name, + drm_dp_subconnector_enum_list) + +-static const struct drm_prop_enum_list hdmi_colorspaces[] = { +- /* For Default case, driver will set the colorspace */ +- { DRM_MODE_COLORIMETRY_DEFAULT, "Default" }, +- /* Standard Definition Colorimetry based on CEA 861 */ +- { DRM_MODE_COLORIMETRY_SMPTE_170M_YCC, "SMPTE_170M_YCC" }, +- { DRM_MODE_COLORIMETRY_BT709_YCC, "BT709_YCC" }, +- /* Standard Definition Colorimetry based on IEC 61966-2-4 */ +- { DRM_MODE_COLORIMETRY_XVYCC_601, "XVYCC_601" }, +- /* High Definition Colorimetry based on IEC 61966-2-4 */ +- { DRM_MODE_COLORIMETRY_XVYCC_709, "XVYCC_709" }, +- /* Colorimetry based on IEC 61966-2-1/Amendment 1 */ +- { DRM_MODE_COLORIMETRY_SYCC_601, "SYCC_601" }, +- /* Colorimetry based on IEC 61966-2-5 [33] */ +- { DRM_MODE_COLORIMETRY_OPYCC_601, "opYCC_601" }, +- /* Colorimetry based on IEC 61966-2-5 */ +- { DRM_MODE_COLORIMETRY_OPRGB, "opRGB" }, +- /* Colorimetry based on ITU-R BT.2020 */ +- { DRM_MODE_COLORIMETRY_BT2020_CYCC, "BT2020_CYCC" }, +- /* Colorimetry based on ITU-R BT.2020 */ +- { DRM_MODE_COLORIMETRY_BT2020_RGB, "BT2020_RGB" }, +- /* Colorimetry based on ITU-R BT.2020 */ +- { DRM_MODE_COLORIMETRY_BT2020_YCC, "BT2020_YCC" }, +- /* Added as part of Additional Colorimetry Extension in 861.G */ +- { DRM_MODE_COLORIMETRY_DCI_P3_RGB_D65, "DCI-P3_RGB_D65" }, +- { 
DRM_MODE_COLORIMETRY_DCI_P3_RGB_THEATER, "DCI-P3_RGB_Theater" }, ++static const char * const colorspace_names[] = { ++ [DRM_MODE_COLORIMETRY_DEFAULT] = "Default", ++ [DRM_MODE_COLORIMETRY_SMPTE_170M_YCC] = "SMPTE_170M_YCC", ++ [DRM_MODE_COLORIMETRY_BT709_YCC] = "BT709_YCC", ++ [DRM_MODE_COLORIMETRY_XVYCC_601] = "XVYCC_601", ++ [DRM_MODE_COLORIMETRY_XVYCC_709] = "XVYCC_709", ++ [DRM_MODE_COLORIMETRY_SYCC_601] = "SYCC_601", ++ [DRM_MODE_COLORIMETRY_OPYCC_601] = "opYCC_601", ++ [DRM_MODE_COLORIMETRY_OPRGB] = "opRGB", ++ [DRM_MODE_COLORIMETRY_BT2020_CYCC] = "BT2020_CYCC", ++ [DRM_MODE_COLORIMETRY_BT2020_RGB] = "BT2020_RGB", ++ [DRM_MODE_COLORIMETRY_BT2020_YCC] = "BT2020_YCC", ++ [DRM_MODE_COLORIMETRY_DCI_P3_RGB_D65] = "P3_RGB_D65", ++ [DRM_MODE_COLORIMETRY_DCI_P3_RGB_THEATER] = "P3_RGB_Theater", ++ [DRM_MODE_COLORIMETRY_RGB_WIDE_FIXED] = "RGB_WIDE_FIXED", ++ [DRM_MODE_COLORIMETRY_RGB_WIDE_FLOAT] = "RGB_WIDE_FLOAT", ++ [DRM_MODE_COLORIMETRY_BT601_YCC] = "BT601_YCC", + }; + ++/** ++ * drm_get_color_encoding_name - return a string for color encoding ++ * @encoding: color encoding to compute name of ++ * ++ * In contrast to the other drm_get_*_name functions this one here returns a ++ * const pointer and hence is threadsafe. ++ */ ++const char *drm_get_colorspace_name(enum drm_colorspace colorspace) ++{ ++ if (WARN_ON(colorspace >= ARRAY_SIZE(colorspace_names))) ++ return "unknown"; ++ ++ return colorspace_names[colorspace]; ++} ++ ++static const u32 hdmi_colorspaces = ++ BIT(DRM_MODE_COLORIMETRY_SMPTE_170M_YCC) | ++ BIT(DRM_MODE_COLORIMETRY_BT709_YCC) | ++ BIT(DRM_MODE_COLORIMETRY_XVYCC_601) | ++ BIT(DRM_MODE_COLORIMETRY_XVYCC_709) | ++ BIT(DRM_MODE_COLORIMETRY_SYCC_601) | ++ BIT(DRM_MODE_COLORIMETRY_OPYCC_601) | ++ BIT(DRM_MODE_COLORIMETRY_OPRGB) | ++ BIT(DRM_MODE_COLORIMETRY_BT2020_CYCC) | ++ BIT(DRM_MODE_COLORIMETRY_BT2020_RGB) | ++ BIT(DRM_MODE_COLORIMETRY_BT2020_YCC) | ++ BIT(DRM_MODE_COLORIMETRY_DCI_P3_RGB_D65) | ++ BIT(DRM_MODE_COLORIMETRY_DCI_P3_RGB_THEATER); ++ + /* + * As per DP 1.4a spec, 2.2.5.7.5 VSC SDP Payload for Pixel Encoding/Colorimetry + * Format Table 2-120 + */ +-static const struct drm_prop_enum_list dp_colorspaces[] = { +- /* For Default case, driver will set the colorspace */ +- { DRM_MODE_COLORIMETRY_DEFAULT, "Default" }, +- { DRM_MODE_COLORIMETRY_RGB_WIDE_FIXED, "RGB_Wide_Gamut_Fixed_Point" }, +- /* Colorimetry based on scRGB (IEC 61966-2-2) */ +- { DRM_MODE_COLORIMETRY_RGB_WIDE_FLOAT, "RGB_Wide_Gamut_Floating_Point" }, +- /* Colorimetry based on IEC 61966-2-5 */ +- { DRM_MODE_COLORIMETRY_OPRGB, "opRGB" }, +- /* Colorimetry based on SMPTE RP 431-2 */ +- { DRM_MODE_COLORIMETRY_DCI_P3_RGB_D65, "DCI-P3_RGB_D65" }, +- /* Colorimetry based on ITU-R BT.2020 */ +- { DRM_MODE_COLORIMETRY_BT2020_RGB, "BT2020_RGB" }, +- { DRM_MODE_COLORIMETRY_BT601_YCC, "BT601_YCC" }, +- { DRM_MODE_COLORIMETRY_BT709_YCC, "BT709_YCC" }, +- /* Standard Definition Colorimetry based on IEC 61966-2-4 */ +- { DRM_MODE_COLORIMETRY_XVYCC_601, "XVYCC_601" }, +- /* High Definition Colorimetry based on IEC 61966-2-4 */ +- { DRM_MODE_COLORIMETRY_XVYCC_709, "XVYCC_709" }, +- /* Colorimetry based on IEC 61966-2-1/Amendment 1 */ +- { DRM_MODE_COLORIMETRY_SYCC_601, "SYCC_601" }, +- /* Colorimetry based on IEC 61966-2-5 [33] */ +- { DRM_MODE_COLORIMETRY_OPYCC_601, "opYCC_601" }, +- /* Colorimetry based on ITU-R BT.2020 */ +- { DRM_MODE_COLORIMETRY_BT2020_CYCC, "BT2020_CYCC" }, +- /* Colorimetry based on ITU-R BT.2020 */ +- { DRM_MODE_COLORIMETRY_BT2020_YCC, "BT2020_YCC" }, +-}; ++static const u32 dp_colorspaces = 
++ BIT(DRM_MODE_COLORIMETRY_RGB_WIDE_FIXED) | ++ BIT(DRM_MODE_COLORIMETRY_RGB_WIDE_FLOAT) | ++ BIT(DRM_MODE_COLORIMETRY_OPRGB) | ++ BIT(DRM_MODE_COLORIMETRY_DCI_P3_RGB_D65) | ++ BIT(DRM_MODE_COLORIMETRY_BT2020_RGB) | ++ BIT(DRM_MODE_COLORIMETRY_BT601_YCC) | ++ BIT(DRM_MODE_COLORIMETRY_BT709_YCC) | ++ BIT(DRM_MODE_COLORIMETRY_XVYCC_601) | ++ BIT(DRM_MODE_COLORIMETRY_XVYCC_709) | ++ BIT(DRM_MODE_COLORIMETRY_SYCC_601) | ++ BIT(DRM_MODE_COLORIMETRY_OPYCC_601) | ++ BIT(DRM_MODE_COLORIMETRY_BT2020_CYCC) | ++ BIT(DRM_MODE_COLORIMETRY_BT2020_YCC); + + /** + * DOC: standard connector properties +@@ -1975,33 +1983,58 @@ EXPORT_SYMBOL(drm_mode_create_aspect_ratio_property); + * drm_mode_create_dp_colorspace_property() is used for DP connector. + */ + +-/** +- * drm_mode_create_hdmi_colorspace_property - create hdmi colorspace property +- * @connector: connector to create the Colorspace property on. +- * +- * Called by a driver the first time it's needed, must be attached to desired +- * HDMI connectors. +- * +- * Returns: +- * Zero on success, negative errno on failure. +- */ +-int drm_mode_create_hdmi_colorspace_property(struct drm_connector *connector) ++static int drm_mode_create_colorspace_property(struct drm_connector *connector, ++ u32 supported_colorspaces) + { + struct drm_device *dev = connector->dev; ++ u32 colorspaces = supported_colorspaces | BIT(DRM_MODE_COLORIMETRY_DEFAULT); ++ struct drm_prop_enum_list enum_list[DRM_MODE_COLORIMETRY_MAX]; ++ int i, len; + + if (connector->colorspace_property) + return 0; + ++ if (WARN_ON(supported_colorspaces == 0 || ++ (supported_colorspaces & -BIT(DRM_MODE_COLORIMETRY_MAX)) != 0)) ++ return -EINVAL; ++ ++ len = 0; ++ for (i = 0; i < DRM_MODE_COLORIMETRY_MAX; i++) { ++ if ((colorspaces & BIT(i)) == 0) ++ continue; ++ ++ enum_list[len].type = i; ++ enum_list[len].name = colorspace_names[i]; ++ len++; ++ } ++ + connector->colorspace_property = + drm_property_create_enum(dev, DRM_MODE_PROP_ENUM, "Colorspace", +- hdmi_colorspaces, +- ARRAY_SIZE(hdmi_colorspaces)); ++ enum_list, ++ len); + + if (!connector->colorspace_property) + return -ENOMEM; + + return 0; + } ++/** ++ * drm_mode_create_hdmi_colorspace_property - create hdmi colorspace property ++ * @connector: connector to create the Colorspace property on. ++ * ++ * Called by a driver the first time it's needed, must be attached to desired ++ * HDMI connectors. ++ * ++ * Returns: ++ * Zero on success, negative errno on failure. ++ */ ++int drm_mode_create_hdmi_colorspace_property(struct drm_connector *connector, ++ u32 supported_colorspaces) ++{ ++ u32 colorspaces = supported_colorspaces & hdmi_colorspaces; ++ ++ return drm_mode_create_colorspace_property(connector, colorspaces); ++} + EXPORT_SYMBOL(drm_mode_create_hdmi_colorspace_property); + + /** +@@ -2014,22 +2047,12 @@ EXPORT_SYMBOL(drm_mode_create_hdmi_colorspace_property); + * Returns: + * Zero on success, negative errno on failure. 
+ */ +-int drm_mode_create_dp_colorspace_property(struct drm_connector *connector) ++int drm_mode_create_dp_colorspace_property(struct drm_connector *connector, ++ u32 supported_colorspaces) + { +- struct drm_device *dev = connector->dev; ++ u32 colorspaces = supported_colorspaces & dp_colorspaces; + +- if (connector->colorspace_property) +- return 0; +- +- connector->colorspace_property = +- drm_property_create_enum(dev, DRM_MODE_PROP_ENUM, "Colorspace", +- dp_colorspaces, +- ARRAY_SIZE(dp_colorspaces)); +- +- if (!connector->colorspace_property) +- return -ENOMEM; +- +- return 0; ++ return drm_mode_create_colorspace_property(connector, colorspaces); + } + EXPORT_SYMBOL(drm_mode_create_dp_colorspace_property); + +diff --git a/drivers/gpu/drm/i915/display/intel_connector.c b/drivers/gpu/drm/i915/display/intel_connector.c +index 6205ddd3ded0..843a669afd59 100644 +--- a/drivers/gpu/drm/i915/display/intel_connector.c ++++ b/drivers/gpu/drm/i915/display/intel_connector.c +@@ -283,14 +283,14 @@ intel_attach_aspect_ratio_property(struct drm_connector *connector) + void + intel_attach_hdmi_colorspace_property(struct drm_connector *connector) + { +- if (!drm_mode_create_hdmi_colorspace_property(connector)) ++ if (!drm_mode_create_hdmi_colorspace_property(connector, 0xffffffff)) + drm_connector_attach_colorspace_property(connector); + } + + void + intel_attach_dp_colorspace_property(struct drm_connector *connector) + { +- if (!drm_mode_create_dp_colorspace_property(connector)) ++ if (!drm_mode_create_dp_colorspace_property(connector, 0xffffffff)) + drm_connector_attach_colorspace_property(connector); + } + +diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c +index 55744216392b..280d11648712 100644 +--- a/drivers/gpu/drm/vc4/vc4_hdmi.c ++++ b/drivers/gpu/drm/vc4/vc4_hdmi.c +@@ -618,7 +618,7 @@ static int vc4_hdmi_connector_init(struct drm_device *dev, + if (ret) + return ret; + +- ret = drm_mode_create_hdmi_colorspace_property(connector); ++ ret = drm_mode_create_hdmi_colorspace_property(connector, 0xffffffff); + if (ret) + return ret; + +diff --git a/include/drm/display/drm_dp.h b/include/drm/display/drm_dp.h +index e934aab357be..4fc5120392e3 100644 +--- a/include/drm/display/drm_dp.h ++++ b/include/drm/display/drm_dp.h +@@ -1617,7 +1617,7 @@ enum dp_pixelformat { + * + * This enum is used to indicate DP VSC SDP Colorimetry formats. + * It is based on DP 1.4 spec [Table 2-117: VSC SDP Payload for DB16 through +- * DB18] and a name of enum member follows DRM_MODE_COLORIMETRY definition. ++ * DB18] and a name of enum member follows &enum drm_colorimetry definition. + * + * @DP_COLORIMETRY_DEFAULT: sRGB (IEC 61966-2-1) or + * ITU-R BT.601 colorimetry format +diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h +index 565cf9d3c550..68f26a3ebb99 100644 +--- a/include/drm/drm_connector.h ++++ b/include/drm/drm_connector.h +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + + #include + +@@ -371,29 +372,30 @@ enum drm_privacy_screen_status { + * a colorspace property which will be created and exposed to + * userspace. 
+ */
+-
+-/* For Default case, driver will set the colorspace */
+-#define DRM_MODE_COLORIMETRY_DEFAULT 0
+-/* CEA 861 Normal Colorimetry options */
+-#define DRM_MODE_COLORIMETRY_NO_DATA 0
+-#define DRM_MODE_COLORIMETRY_SMPTE_170M_YCC 1
+-#define DRM_MODE_COLORIMETRY_BT709_YCC 2
+-/* CEA 861 Extended Colorimetry Options */
+-#define DRM_MODE_COLORIMETRY_XVYCC_601 3
+-#define DRM_MODE_COLORIMETRY_XVYCC_709 4
+-#define DRM_MODE_COLORIMETRY_SYCC_601 5
+-#define DRM_MODE_COLORIMETRY_OPYCC_601 6
+-#define DRM_MODE_COLORIMETRY_OPRGB 7
+-#define DRM_MODE_COLORIMETRY_BT2020_CYCC 8
+-#define DRM_MODE_COLORIMETRY_BT2020_RGB 9
+-#define DRM_MODE_COLORIMETRY_BT2020_YCC 10
+-/* Additional Colorimetry extension added as part of CTA 861.G */
+-#define DRM_MODE_COLORIMETRY_DCI_P3_RGB_D65 11
+-#define DRM_MODE_COLORIMETRY_DCI_P3_RGB_THEATER 12
+-/* Additional Colorimetry Options added for DP 1.4a VSC Colorimetry Format */
+-#define DRM_MODE_COLORIMETRY_RGB_WIDE_FIXED 13
+-#define DRM_MODE_COLORIMETRY_RGB_WIDE_FLOAT 14
+-#define DRM_MODE_COLORIMETRY_BT601_YCC 15
++enum drm_colorspace {
++ /* For Default case, driver will set the colorspace */
++ DRM_MODE_COLORIMETRY_DEFAULT,
++ /* CEA 861 Normal Colorimetry options */
++ DRM_MODE_COLORIMETRY_SMPTE_170M_YCC,
++ DRM_MODE_COLORIMETRY_BT709_YCC,
++ /* CEA 861 Extended Colorimetry Options */
++ DRM_MODE_COLORIMETRY_XVYCC_601,
++ DRM_MODE_COLORIMETRY_XVYCC_709,
++ DRM_MODE_COLORIMETRY_SYCC_601,
++ DRM_MODE_COLORIMETRY_OPYCC_601,
++ DRM_MODE_COLORIMETRY_OPRGB,
++ DRM_MODE_COLORIMETRY_BT2020_CYCC,
++ DRM_MODE_COLORIMETRY_BT2020_RGB,
++ DRM_MODE_COLORIMETRY_BT2020_YCC,
++ /* Additional Colorimetry extension added as part of CTA 861.G */
++ DRM_MODE_COLORIMETRY_DCI_P3_RGB_D65,
++ DRM_MODE_COLORIMETRY_DCI_P3_RGB_THEATER,
++ /* Additional Colorimetry Options added for DP 1.4a VSC Colorimetry Format */
++ DRM_MODE_COLORIMETRY_RGB_WIDE_FIXED,
++ DRM_MODE_COLORIMETRY_RGB_WIDE_FLOAT,
++ DRM_MODE_COLORIMETRY_BT601_YCC,
++ DRM_MODE_COLORIMETRY_MAX
++};
+
+ /**
+ * enum drm_bus_flags - bus_flags info for &drm_display_info
+@@ -828,7 +830,7 @@ struct drm_connector_state {
+ * colorspace change on Sink. This is most commonly used to switch
+ * to wider color gamuts like BT2020.
+ */
+- u32 colorspace;
++ enum drm_colorspace colorspace;
+
+ /**
+ * @writeback_job: Writeback job for writeback connectors
+@@ -1835,8 +1837,10 @@ int drm_connector_attach_hdr_output_metadata_property(struct drm_connector *conn
+ bool drm_connector_atomic_hdr_metadata_equal(struct drm_connector_state *old_state,
+ struct drm_connector_state *new_state);
+ int drm_mode_create_aspect_ratio_property(struct drm_device *dev);
+-int drm_mode_create_hdmi_colorspace_property(struct drm_connector *connector);
+-int drm_mode_create_dp_colorspace_property(struct drm_connector *connector);
++int drm_mode_create_hdmi_colorspace_property(struct drm_connector *connector,
++ u32 supported_colorspaces);
++int drm_mode_create_dp_colorspace_property(struct drm_connector *connector,
++ u32 supported_colorspaces);
+ int drm_mode_create_content_type_property(struct drm_device *dev);
+ int drm_mode_create_suggested_offset_properties(struct drm_device *dev);
+
+@@ -1919,6 +1923,7 @@ void drm_connector_list_iter_end(struct drm_connector_list_iter *iter);
+
+ bool drm_connector_has_possible_encoder(struct drm_connector *connector,
+ struct drm_encoder *encoder);
++const char *drm_get_colorspace_name(enum drm_colorspace colorspace);
+
+ /**
+ * drm_for_each_connector_iter - connector_list iterator macro
+--
+2.39.2
diff --git a/scripts/build.sh b/scripts/build.sh
new file mode 100644
index 0000000..8a48730
--- /dev/null
+++ b/scripts/build.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+echo "Pika Kernel - Building"
+
+make -j`nproc` bindeb-pkg LOCALVERSION=-pikaos
\ No newline at end of file
diff --git a/scripts/config.sh b/scripts/config.sh
new file mode 100644
index 0000000..0973a12
--- /dev/null
+++ b/scripts/config.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+echo "Pika Kernel - Applying configuration"
+
+cp ../config .config
+
+scripts/config -k -e CONFIG_GENERIC_CPU
+scripts/config -e CACHY
+scripts/config -e SCHED_BORE
+
+scripts/config -e HZ_300 --set-val HZ 1000
+scripts/config -d HZ_PERIODIC -d NO_HZ_FULL -e NO_HZ_IDLE -e NO_HZ -e NO_HZ_COMMON
+scripts/config -e PREEMPT_BUILD -d PREEMPT_NONE -d PREEMPT_VOLUNTARY -e PREEMPT -e PREEMPT_COUNT -e PREEMPTION -e PREEMPT_DYNAMIC
+
+scripts/config -d CC_OPTIMIZE_FOR_PERFORMANCE \
+ -e CC_OPTIMIZE_FOR_PERFORMANCE_O3
+
+scripts/config -m TCP_CONG_CUBIC \
+ -d DEFAULT_CUBIC \
+ -e TCP_CONG_BBR2 \
+ -e DEFAULT_BBR2 \
+ --set-str DEFAULT_TCP_CONG bbr2
+
+scripts/config -e LRU_GEN -e LRU_GEN_ENABLED -d LRU_GEN_STATS
+
+scripts/config -d TRANSPARENT_HUGEPAGE_ALWAYS -e TRANSPARENT_HUGEPAGE_MADVISE
+
+scripts/config -e DAMON \
+ -e DAMON_VADDR \
+ -e DAMON_DBGFS \
+ -e DAMON_SYSFS \
+ -e DAMON_PADDR \
+ -e DAMON_RECLAIM \
+ -e DAMON_LRU_SORT
+
+scripts/config -d ZRAM_DEF_COMP_LZORLE \
+ -e ZRAM_DEF_COMP_ZSTD \
+ --set-str ZRAM_DEF_COMP zstd \
+ -d ZSWAP_COMPRESSOR_DEFAULT_LZ4 \
+ -e ZSWAP_COMPRESSOR_DEFAULT_ZSTD \
+ --set-str ZSWAP_COMPRESSOR_DEFAULT zstd
+
+scripts/config --set-val MODULE_COMPRESS_ZSTD_LEVEL 19 -e MODULE_COMPRESS_ZSTD_ULTRA --set-val MODULE_COMPRESS_ZSTD_LEVEL_ULTRA 22 --set-val ZSTD_COMP_VAL 22
+
+scripts/config -e USER_NS
+
+scripts/config -d DEBUG_INFO \
+ -d DEBUG_INFO_DWARF4 \
+ -d DEBUG_INFO_DWARF5 \
+ -d SLUB_DEBUG \
+ -d PM_DEBUG \
+ -d PM_ADVANCED_DEBUG \
+ -d PM_SLEEP_DEBUG \
+ -d ACPI_DEBUG \
+ -d SCHED_DEBUG \
+ -d LATENCYTOP \
+ -d DEBUG_PREEMPT
+
+make prepare
\ No newline at end of file
diff --git a/scripts/output.sh b/scripts/output.sh
new file mode 100644
index 0000000..008c076
--- /dev/null
+++ b/scripts/output.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+echo "Pika Kernel - Copying Output"
+
+cd ..
+mkdir ./output
+rm ./linux-libc*.deb
+cp ./*.deb ./output/
\ No newline at end of file
diff --git a/scripts/patch.sh b/scripts/patch.sh
new file mode 100644
index 0000000..c62732f
--- /dev/null
+++ b/scripts/patch.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+echo "Pika Kernel - Applying patches"
+
+# Cachy patches are here: https://github.com/CachyOS/kernel-patches/
+# orig patch from cachy - 0001-cachyos-base-all.patch
+patch -Np1 < "../patches/0001-cachy-all.patch"
+# orig patch from cachy - 0001-Add-latency-priority-for-CFS-class.patch
+patch -Np1 < "../patches/0002-cfs-nice.patch"
+# orig patch from cachy - 0001-bore-cachy.patch
+patch -Np1 < "../patches/0003-bore.patch"
+# HDR patch - from cachy (but they deleted it)
+patch -Np1 < "../patches/0004-hdr.patch"
\ No newline at end of file
diff --git a/scripts/release.sh b/scripts/release.sh
new file mode 100644
index 0000000..c3ca429
--- /dev/null
+++ b/scripts/release.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+echo "Pika Kernel - Releasing Kernel"
\ No newline at end of file
diff --git a/scripts/source.sh b/scripts/source.sh
new file mode 100644
index 0000000..7006289
--- /dev/null
+++ b/scripts/source.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+echo "Pika Kernel - Getting source"
+
+wget -nv https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.2.tar.gz
+tar -zxf ./linux-6.2.tar.gz
+
+cd linux-6.2
\ No newline at end of file