From 0ca55b20120a052c587868cb3199edaa41634a3b Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Mon, 6 Mar 2023 18:43:03 +0100
Subject: [PATCH 01/10] bbr2

Signed-off-by: Peter Jung <admin@ptr1337.dev>
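
Usage note (not part of the patch itself): with CONFIG_TCP_CONG_BBR2
enabled, the algorithm registers under the name "bbr2" and can be
selected system-wide via the net.ipv4.tcp_congestion_control sysctl, or
per connection with the standard TCP_CONGESTION socket option. A minimal
userspace sketch (the helper name is hypothetical):

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <string.h>
	#include <sys/socket.h>

	/* Switch an existing TCP socket to the "bbr2" congestion control. */
	static int use_bbr2(int fd)
	{
		static const char name[] = "bbr2";

		return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
				  name, sizeof(name) - 1);
	}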
---
 include/linux/tcp.h                |    3 +-
 include/net/inet_connection_sock.h |    3 +-
 include/net/tcp.h                  |   41 +-
 include/uapi/linux/inet_diag.h     |   33 +
 net/ipv4/Kconfig                   |   22 +
 net/ipv4/Makefile                  |    1 +
 net/ipv4/tcp.c                     |    1 +
 net/ipv4/tcp_bbr.c                 |   38 +-
 net/ipv4/tcp_bbr2.c                | 2674 ++++++++++++++++++++++++++++
 net/ipv4/tcp_cong.c                |    1 +
 net/ipv4/tcp_input.c               |   27 +-
 net/ipv4/tcp_output.c              |   26 +-
 net/ipv4/tcp_rate.c                |   30 +-
 net/ipv4/tcp_timer.c               |    1 +
 14 files changed, 2867 insertions(+), 34 deletions(-)
 create mode 100644 net/ipv4/tcp_bbr2.c
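
Reviewer note on units (illustrative, not part of the patch): tcp_bbr2.c
below keeps bandwidth samples in packets per microsecond left-shifted by
BW_SCALE (24) bits, and gains as fractions scaled by BBR_UNIT (1 << 8).
The standalone sketch below mirrors the multiply/shift order that
bbr_rate_bytes_per_sec() uses to convert such a sample into a pacing
rate without overflowing u64; the input values are made up:

	#include <stdint.h>
	#include <stdio.h>

	#define BW_SCALE	24
	#define BBR_SCALE	8
	#define BBR_UNIT	(1 << BBR_SCALE)
	#define USEC_PER_SEC	1000000ULL

	/* bw is in (packets << BW_SCALE) per usec; gain is BBR_UNIT-scaled;
	 * margin is a percentage to pace below the estimated bandwidth.
	 */
	static uint64_t rate_bytes_per_sec(uint64_t bw, unsigned int mss,
					   int gain, int margin)
	{
		uint64_t rate = bw;

		rate *= mss;				/* packets -> bytes */
		rate *= gain;
		rate >>= BBR_SCALE;			/* drop gain scaling */
		rate *= USEC_PER_SEC / 100 * (100 - margin);
		rate >>= BW_SCALE;			/* drop bw scaling */
		return rate ? rate : 1;
	}

	int main(void)
	{
		/* 1 pkt/usec (~12 Gbit/s at mss 1500), 1.25x gain, 1% margin */
		uint64_t bw = 1ULL << BW_SCALE;

		printf("%llu bytes/sec\n",
		       (unsigned long long)rate_bytes_per_sec(bw, 1500,
					BBR_UNIT * 5 / 4, 1));
		return 0;
	}
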
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index ca7f05a130d2..09dbcd67ee8e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -255,7 +255,8 @@ struct tcp_sock {
 	u8	compressed_ack;
 	u8	dup_ack_counter:2,
 		tlp_retrans:1,	/* TLP is a retransmission */
-		unused:5;
+		fast_ack_mode:2, /* which fast ack mode ? */
+		unused:3;
 	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
 	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
 	u8	chrono_type:2,	/* current chronograph type */
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index c2b15f7e5516..d85858efa571 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -135,7 +135,8 @@ struct inet_connection_sock {
 	u32			  icsk_probes_tstamp;
 	u32			  icsk_user_timeout;
 
-	u64			  icsk_ca_priv[104 / sizeof(u64)];
+/* XXX inflated by temporary internal debugging info */
+	u64			  icsk_ca_priv[216 / sizeof(u64)];
 #define ICSK_CA_PRIV_SIZE	  sizeof_field(struct inet_connection_sock, icsk_ca_priv)
 };
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index db9f828e9d1e..e1f05c2b4707 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -375,6 +375,7 @@ static inline void tcp_dec_quickack_mode(struct sock *sk,
 #define	TCP_ECN_QUEUE_CWR	2
 #define	TCP_ECN_DEMAND_CWR	4
 #define	TCP_ECN_SEEN		8
+#define	TCP_ECN_ECT_PERMANENT	16
 
 enum tcp_tw_status {
 	TCP_TW_SUCCESS = 0,
@@ -823,6 +824,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
 	return max_t(s64, t1 - t0, 0);
 }
 
+static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0)
+{
+	return max_t(s32, t1 - t0, 0);
+}
+
 static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
 {
 	return tcp_ns_to_ts(skb->skb_mstamp_ns);
@@ -898,9 +904,14 @@ struct tcp_skb_cb {
 			/* pkts S/ACKed so far upon tx of skb, incl retrans: */
 			__u32 delivered;
 			/* start of send pipeline phase */
-			u64 first_tx_mstamp;
+			u32 first_tx_mstamp;
 			/* when we reached the "delivered" count */
-			u64 delivered_mstamp;
+			u32 delivered_mstamp;
+#define TCPCB_IN_FLIGHT_BITS 20
+#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1)
+			u32 in_flight:20,   /* packets in flight at transmit */
+			    unused2:12;
+			u32 lost;	/* packets lost so far upon tx of skb */
 		} tx;   /* only used for outgoing skbs */
 		union {
 			struct inet_skb_parm	h4;
@@ -1026,7 +1037,11 @@ enum tcp_ca_ack_event_flags {
 #define TCP_CONG_NON_RESTRICTED 0x1
 /* Requires ECN/ECT set on all packets */
 #define TCP_CONG_NEEDS_ECN	0x2
-#define TCP_CONG_MASK	(TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)
+/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */
+#define TCP_CONG_WANTS_CE_EVENTS	0x4
+#define TCP_CONG_MASK	(TCP_CONG_NON_RESTRICTED | \
+			 TCP_CONG_NEEDS_ECN | \
+			 TCP_CONG_WANTS_CE_EVENTS)
 
 union tcp_cc_info;
 
@@ -1046,8 +1061,11 @@ struct ack_sample {
  */
 struct rate_sample {
 	u64  prior_mstamp; /* starting timestamp for interval */
+	u32  prior_lost;	/* tp->lost at "prior_mstamp" */
 	u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
 	u32  prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */
+	u32  tx_in_flight;	/* packets in flight at starting timestamp */
+	s32  lost;		/* number of packets lost over interval */
 	s32  delivered;		/* number of packets delivered over interval */
 	s32  delivered_ce;	/* number of packets delivered w/ CE marks */
 	long interval_us;	/* time for tp->delivered to incr "delivered" */
@@ -1061,6 +1079,7 @@ struct rate_sample {
 	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
 	bool is_retrans;	/* is sample from retransmission? */
 	bool is_ack_delayed;	/* is this (likely) a delayed ACK? */
+	bool is_ece;		/* did this ACK have ECN marked? */
 };
 
 struct tcp_congestion_ops {
@@ -1084,8 +1103,11 @@ struct tcp_congestion_ops {
 	/* hook for packet ack accounting (optional) */
 	void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
 
-	/* override sysctl_tcp_min_tso_segs */
-	u32 (*min_tso_segs)(struct sock *sk);
+	/* pick target number of segments per TSO/GSO skb (optional): */
+	u32 (*tso_segs)(struct sock *sk, unsigned int mss_now);
+
+	/* react to a specific lost skb (optional) */
+	void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb);
 
 	/* call when packets are delivered to update cwnd and pacing rate,
 	 * after all the ca_state processing. (optional)
@@ -1148,6 +1170,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer)
 }
 #endif
 
+static inline bool tcp_ca_wants_ce_events(const struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN |
+					   TCP_CONG_WANTS_CE_EVENTS);
+}
+
 static inline bool tcp_ca_needs_ecn(const struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1167,6 +1197,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
 void tcp_set_ca_state(struct sock *sk, const u8 ca_state);
 
 /* From tcp_rate.c */
+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb);
 void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
 void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
 			    struct rate_sample *rs);
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 50655de04c9b..0e24f11627d5 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -231,9 +231,42 @@ struct tcp_bbr_info {
 	__u32	bbr_cwnd_gain;		/* cwnd gain shifted left 8 bits */
 };
 
+/* Phase as reported in netlink/ss stats. */
+enum tcp_bbr2_phase {
+	BBR2_PHASE_INVALID		= 0,
+	BBR2_PHASE_STARTUP		= 1,
+	BBR2_PHASE_DRAIN		= 2,
+	BBR2_PHASE_PROBE_RTT		= 3,
+	BBR2_PHASE_PROBE_BW_UP		= 4,
+	BBR2_PHASE_PROBE_BW_DOWN	= 5,
+	BBR2_PHASE_PROBE_BW_CRUISE	= 6,
+	BBR2_PHASE_PROBE_BW_REFILL	= 7
+};
+
+struct tcp_bbr2_info {
+	/* u64 bw: bandwidth (app throughput) estimate in Byte per sec: */
+	__u32	bbr_bw_lsb;		/* lower 32 bits of bw */
+	__u32	bbr_bw_msb;		/* upper 32 bits of bw */
+	__u32	bbr_min_rtt;		/* min-filtered RTT in uSec */
+	__u32	bbr_pacing_gain;	/* pacing gain shifted left 8 bits */
+	__u32	bbr_cwnd_gain;		/* cwnd gain shifted left 8 bits */
+	__u32	bbr_bw_hi_lsb;		/* lower 32 bits of bw_hi */
+	__u32	bbr_bw_hi_msb;		/* upper 32 bits of bw_hi */
+	__u32	bbr_bw_lo_lsb;		/* lower 32 bits of bw_lo */
+	__u32	bbr_bw_lo_msb;		/* upper 32 bits of bw_lo */
+	__u8	bbr_mode;		/* current bbr_mode in state machine */
+	__u8	bbr_phase;		/* current state machine phase */
+	__u8	unused1;		/* alignment padding; not used yet */
+	__u8	bbr_version;		/* MUST be at this offset in struct */
+	__u32	bbr_inflight_lo;	/* lower/short-term data volume bound */
+	__u32	bbr_inflight_hi;	/* higher/long-term data volume bound */
+	__u32	bbr_extra_acked;	/* max excess packets ACKed in epoch */
+};
+
 union tcp_cc_info {
 	struct tcpvegas_info	vegas;
 	struct tcp_dctcp_info	dctcp;
 	struct tcp_bbr_info	bbr;
+	struct tcp_bbr2_info	bbr2;
 };
 #endif /* _UAPI_INET_DIAG_H_ */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 2dfb12230f08..b6bec331a82e 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -678,6 +678,24 @@ config TCP_CONG_BBR
 	  AQM schemes that do not provide a delay signal. It requires the fq
 	  ("Fair Queue") pacing packet scheduler.
 
+config TCP_CONG_BBR2
+	tristate "BBR2 TCP"
+	default n
+	help
+
+	BBR2 TCP congestion control is a model-based congestion control
+	algorithm that aims to maximize network utilization, keep queues and
+	retransmit rates low, and to be able to coexist with Reno/CUBIC in
+	common scenarios. It builds an explicit model of the network path. It
+	tolerates a targeted degree of random packet loss and delay that are
+	unrelated to congestion. It can operate over LAN, WAN, cellular, wifi,
+	or cable modem links, and can use DCTCP-L4S-style ECN signals. It can
+	coexist with flows that use loss-based congestion control, and can
+	operate with shallow buffers, deep buffers, bufferbloat, policers, or
+	AQM schemes that do not provide a delay signal. It requires pacing,
+	using either TCP internal pacing or the fq ("Fair Queue") pacing packet
+	scheduler.
+
 choice
 	prompt "Default TCP congestion control"
 	default DEFAULT_CUBIC
@@ -715,6 +733,9 @@ choice
 	config DEFAULT_BBR
 		bool "BBR" if TCP_CONG_BBR=y
 
+	config DEFAULT_BBR2
+		bool "BBR2" if TCP_CONG_BBR2=y
+
 	config DEFAULT_RENO
 		bool "Reno"
 endchoice
@@ -739,6 +760,7 @@ config DEFAULT_TCP_CONG
 	default "dctcp" if DEFAULT_DCTCP
 	default "cdg" if DEFAULT_CDG
 	default "bbr" if DEFAULT_BBR
+	default "bbr2" if DEFAULT_BBR2
 	default "cubic"
 
 config TCP_MD5SIG
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 880277c9fd07..ef1da49d20a6 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
 obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
 obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
+obj-$(CONFIG_TCP_CONG_BBR2) += tcp_bbr2.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
 obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 288693981b00..1d530667b172 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3192,6 +3192,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->rx_opt.dsack = 0;
 	tp->rx_opt.num_sacks = 0;
 	tp->rcv_ooopack = 0;
+	tp->fast_ack_mode = 0;
 
 
 	/* Clean up fastopen related fields */
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 146792cd26fe..16038f6ee52a 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -294,26 +294,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
 		sk->sk_pacing_rate = rate;
 }
 
-/* override sysctl_tcp_min_tso_segs */
 __bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
 {
 	return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
 }
 
+/* Return the number of segments BBR would like in a TSO/GSO skb, given
+ * a particular max gso size as a constraint.
+ */
+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now,
+				u32 gso_max_size)
+{
+	u32 segs;
+	u64 bytes;
+
+	/* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */
+	bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift;
+
+	bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER);
+	segs = max_t(u32, div_u64(bytes, mss_now), bbr_min_tso_segs(sk));
+	return segs;
+}
+
+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */
+static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now)
+{
+	return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size);
+}
+
+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */
 static u32 bbr_tso_segs_goal(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 segs, bytes;
-
-	/* Sort of tcp_tso_autosize() but ignoring
-	 * driver provided sk_gso_max_size.
-	 */
-	bytes = min_t(unsigned long,
-		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
-		      GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
-	segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
 
-	return min(segs, 0x7FU);
+	return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE);
 }
 
 /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
@@ -1149,7 +1163,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
 	.undo_cwnd	= bbr_undo_cwnd,
 	.cwnd_event	= bbr_cwnd_event,
 	.ssthresh	= bbr_ssthresh,
-	.min_tso_segs	= bbr_min_tso_segs,
+	.tso_segs	= bbr_tso_segs,
 	.get_info	= bbr_get_info,
 	.set_state	= bbr_set_state,
 };
diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c
new file mode 100644
index 000000000000..85f8052144d1
--- /dev/null
+++ b/net/ipv4/tcp_bbr2.c
@@ -0,0 +1,2674 @@
+/* BBR (Bottleneck Bandwidth and RTT) congestion control, v2
+ *
+ * BBRv2 is a model-based congestion control algorithm that aims for low
+ * queues, low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model
+ * of the network path, it uses measurements of bandwidth and RTT, as well as
+ * (if they occur) packet loss and/or DCTCP/L4S-style ECN signals. Note that
+ * although it can use ECN or loss signals explicitly, it does not require
+ * either; it can bound its in-flight data based on its estimate of the BDP.
+ *
+ * The model has both higher and lower bounds for the operating range:
+ *   lo: bw_lo, inflight_lo: conservative short-term lower bound
+ *   hi: bw_hi, inflight_hi: robust long-term upper bound
+ * The bandwidth-probing time scale is (a) extended dynamically based on
+ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by
+ * an interactive wall-clock time-scale to be more scalable and responsive
+ * than Reno and CUBIC.
+ *
+ * Here is a state transition diagram for BBR:
+ *
+ *             |
+ *             V
+ *    +---> STARTUP  ----+
+ *    |        |         |
+ *    |        V         |
+ *    |      DRAIN   ----+
+ *    |        |         |
+ *    |        V         |
+ *    +---> PROBE_BW ----+
+ *    |      ^    |      |
+ *    |      |    |      |
+ *    |      +----+      |
+ *    |                  |
+ *    +---- PROBE_RTT <--+
+ *
+ * A BBR flow starts in STARTUP, and ramps up its sending rate quickly.
+ * When it estimates the pipe is full, it enters DRAIN to drain the queue.
+ * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT.
+ * A long-lived BBR flow spends the vast majority of its time remaining
+ * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth
+ * in a fair manner, with a small, bounded queue. *If* a flow has been
+ * continuously sending for the entire min_rtt window, and hasn't seen an RTT
+ * sample that matches or decreases its min_rtt estimate for 10 seconds, then
+ * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe
+ * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if
+ * we estimated that we reached the full bw of the pipe then we enter PROBE_BW;
+ * otherwise we enter STARTUP to try to fill the pipe.
+ *
+ * BBR is described in detail in:
+ *   "BBR: Congestion-Based Congestion Control",
+ *   Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
+ *   Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
+ *
+ * There is a public e-mail list for discussing BBR development and testing:
+ *   https://groups.google.com/forum/#!forum/bbr-dev
+ *
+ * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled,
+ * otherwise TCP stack falls back to an internal pacing using one high
+ * resolution timer per TCP socket and may use more resources.
+ */
+#include <linux/module.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+#include <linux/inet.h>
+#include <linux/random.h>
+
+#include "tcp_dctcp.h"
+
+/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
+ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
+ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
+ * Since the minimum window is >=4 packets, the lower bound isn't
+ * an issue. The upper bound isn't an issue with existing technologies.
+ */
+#define BW_SCALE 24
+#define BW_UNIT (1 << BW_SCALE)
+
+#define BBR_SCALE 8	/* scaling factor for fractions in BBR (e.g. gains) */
+#define BBR_UNIT (1 << BBR_SCALE)
+
+#define FLAG_DEBUG_VERBOSE	0x1	/* Verbose debugging messages */
+#define FLAG_DEBUG_LOOPBACK	0x2	/* Do NOT skip loopback addr */
+
+#define CYCLE_LEN	8	/* number of phases in a pacing gain cycle */
+
+/* BBR has the following modes for deciding how fast to send: */
+enum bbr_mode {
+	BBR_STARTUP,	/* ramp up sending rate rapidly to fill pipe */
+	BBR_DRAIN,	/* drain any queue created during startup */
+	BBR_PROBE_BW,	/* discover, share bw: pace around estimated bw */
+	BBR_PROBE_RTT,	/* cut inflight to min to probe min_rtt */
+};
+
+/* How does the incoming ACK stream relate to our bandwidth probing? */
+enum bbr_ack_phase {
+	BBR_ACKS_INIT,		  /* not probing; not getting probe feedback */
+	BBR_ACKS_REFILLING,	  /* sending at est. bw to fill pipe */
+	BBR_ACKS_PROBE_STARTING,  /* inflight rising to probe bw */
+	BBR_ACKS_PROBE_FEEDBACK,  /* getting feedback from bw probing */
+	BBR_ACKS_PROBE_STOPPING,  /* stopped probing; still getting feedback */
+};
+
+/* BBR congestion control block */
+struct bbr {
+	u32	min_rtt_us;	        /* min RTT in min_rtt_win_sec window */
+	u32	min_rtt_stamp;	        /* timestamp of min_rtt_us */
+	u32	probe_rtt_done_stamp;   /* end time for BBR_PROBE_RTT mode */
+	u32	probe_rtt_min_us;	/* min RTT in bbr_probe_rtt_win_ms window */
+	u32	probe_rtt_min_stamp;	/* timestamp of probe_rtt_min_us */
+	u32     next_rtt_delivered; /* scb->tx.delivered at end of round */
+	u32	prior_rcv_nxt;	/* tp->rcv_nxt when CE state last changed */
+	u64	cycle_mstamp;	     /* time of this cycle phase start */
+	u32     mode:3,		     /* current bbr_mode in state machine */
+		prev_ca_state:3,     /* CA state on previous ACK */
+		packet_conservation:1,  /* use packet conservation? */
+		round_start:1,	     /* start of packet-timed tx->ack round? */
+		ce_state:1,          /* If most recent data has CE bit set */
+		bw_probe_up_rounds:5,   /* cwnd-limited rounds in PROBE_UP */
+		try_fast_path:1,     /* can we take fast path? */
+		unused2:11,
+		idle_restart:1,	     /* restarting after idle? */
+		probe_rtt_round_done:1,  /* a BBR_PROBE_RTT round at 4 pkts? */
+		cycle_idx:3,	/* current index in pacing_gain cycle array */
+		has_seen_rtt:1;      /* have we seen an RTT sample yet? */
+	u32	pacing_gain:11,	/* current gain for setting pacing rate */
+		cwnd_gain:11,	/* current gain for setting cwnd */
+		full_bw_reached:1,   /* reached full bw in Startup? */
+		full_bw_cnt:2,	/* number of rounds without large bw gains */
+		init_cwnd:7;	/* initial cwnd */
+	u32	prior_cwnd;	/* prior cwnd upon entering loss recovery */
+	u32	full_bw;	/* recent bw, to estimate if pipe is full */
+
+	/* For tracking ACK aggregation: */
+	u64	ack_epoch_mstamp;	/* start of ACK sampling epoch */
+	u16	extra_acked[2];		/* max excess data ACKed in epoch */
+	u32	ack_epoch_acked:20,	/* packets (S)ACKed in sampling epoch */
+		extra_acked_win_rtts:5,	/* age of extra_acked, in round trips */
+		extra_acked_win_idx:1,	/* current index in extra_acked array */
+		/* BBR v2 state: */
+		unused1:2,
+		startup_ecn_rounds:2,	/* consecutive hi ECN STARTUP rounds */
+		loss_in_cycle:1,	/* packet loss in this cycle? */
+		ecn_in_cycle:1;		/* ECN in this cycle? */
+	u32	loss_round_delivered; /* scb->tx.delivered ending loss round */
+	u32	undo_bw_lo;	     /* bw_lo before latest losses */
+	u32	undo_inflight_lo;    /* inflight_lo before latest losses */
+	u32	undo_inflight_hi;    /* inflight_hi before latest losses */
+	u32	bw_latest;	 /* max delivered bw in last round trip */
+	u32	bw_lo;		 /* lower bound on sending bandwidth */
+	u32	bw_hi[2];	 /* upper bound of sending bandwidth range */
+	u32	inflight_latest; /* max delivered data in last round trip */
+	u32	inflight_lo;	 /* lower bound of inflight data range */
+	u32	inflight_hi;	 /* upper bound of inflight data range */
+	u32	bw_probe_up_cnt; /* packets delivered per inflight_hi incr */
+	u32	bw_probe_up_acks;  /* packets (S)ACKed since inflight_hi incr */
+	u32	probe_wait_us;	 /* PROBE_DOWN until next clock-driven probe */
+	u32	ecn_eligible:1,	/* sender can use ECN (RTT, handshake)? */
+		ecn_alpha:9,	/* EWMA delivered_ce/delivered; 0..256 */
+		bw_probe_samples:1,    /* rate samples reflect bw probing? */
+		prev_probe_too_high:1, /* did last PROBE_UP go too high? */
+		stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */
+		rounds_since_probe:8,  /* packet-timed rounds since probed bw */
+		loss_round_start:1,    /* loss_round_delivered round trip? */
+		loss_in_round:1,       /* loss marked in this round trip? */
+		ecn_in_round:1,	       /* ECN marked in this round trip? */
+		ack_phase:3,	       /* bbr_ack_phase: meaning of ACKs */
+		loss_events_in_round:4,/* losses in STARTUP round */
+		initialized:1;	       /* has bbr_init() been called? */
+	u32	alpha_last_delivered;	 /* tp->delivered at alpha update */
+	u32	alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */
+
+	/* Params configurable using setsockopt. Refer to the corresponding
+	 * module param for a detailed description of each param.
+	 */
|
|
+ struct bbr_params {
|
|
+ u32 high_gain:11, /* max allowed value: 2047 */
|
|
+ drain_gain:10, /* max allowed value: 1023 */
|
|
+ cwnd_gain:11; /* max allowed value: 2047 */
|
|
+ u32 cwnd_min_target:4, /* max allowed value: 15 */
|
|
+ min_rtt_win_sec:5, /* max allowed value: 31 */
|
|
+ probe_rtt_mode_ms:9, /* max allowed value: 511 */
|
|
+ full_bw_cnt:3, /* max allowed value: 7 */
|
|
+ cwnd_tso_budget:1, /* allowed values: {0, 1} */
|
|
+ unused3:6,
|
|
+ drain_to_target:1, /* boolean */
|
|
+ precise_ece_ack:1, /* boolean */
|
|
+ extra_acked_in_startup:1, /* allowed values: {0, 1} */
|
|
+ fast_path:1; /* boolean */
|
|
+ u32 full_bw_thresh:10, /* max allowed value: 1023 */
|
|
+ startup_cwnd_gain:11, /* max allowed value: 2047 */
|
|
+ bw_probe_pif_gain:9, /* max allowed value: 511 */
|
|
+ usage_based_cwnd:1, /* boolean */
|
|
+ unused2:1;
|
|
+ u16 probe_rtt_win_ms:14, /* max allowed value: 16383 */
|
|
+ refill_add_inc:2; /* max allowed value: 3 */
|
|
+ u16 extra_acked_gain:11, /* max allowed value: 2047 */
|
|
+ extra_acked_win_rtts:5; /* max allowed value: 31*/
|
|
+ u16 pacing_gain[CYCLE_LEN]; /* max allowed value: 1023 */
|
|
+ /* Mostly BBR v2 parameters below here: */
|
|
+ u32 ecn_alpha_gain:8, /* max allowed value: 255 */
|
|
+ ecn_factor:8, /* max allowed value: 255 */
|
|
+ ecn_thresh:8, /* max allowed value: 255 */
|
|
+ beta:8; /* max allowed value: 255 */
|
|
+ u32 ecn_max_rtt_us:19, /* max allowed value: 524287 */
|
|
+ bw_probe_reno_gain:9, /* max allowed value: 511 */
|
|
+ full_loss_cnt:4; /* max allowed value: 15 */
|
|
+ u32 probe_rtt_cwnd_gain:8, /* max allowed value: 255 */
|
|
+ inflight_headroom:8, /* max allowed value: 255 */
|
|
+ loss_thresh:8, /* max allowed value: 255 */
|
|
+ bw_probe_max_rounds:8; /* max allowed value: 255 */
|
|
+ u32 bw_probe_rand_rounds:4, /* max allowed value: 15 */
|
|
+ bw_probe_base_us:26, /* usecs: 0..2^26-1 (67 secs) */
|
|
+ full_ecn_cnt:2; /* max allowed value: 3 */
|
|
+ u32 bw_probe_rand_us:26, /* usecs: 0..2^26-1 (67 secs) */
|
|
+ undo:1, /* boolean */
|
|
+ tso_rtt_shift:4, /* max allowed value: 15 */
|
|
+ unused5:1;
|
|
+ u32 ecn_reprobe_gain:9, /* max allowed value: 511 */
|
|
+ unused1:14,
|
|
+ ecn_alpha_init:9; /* max allowed value: 256 */
|
|
+ } params;
|
|
+
|
|
+ struct {
|
|
+ u32 snd_isn; /* Initial sequence number */
|
|
+ u32 rs_bw; /* last valid rate sample bw */
|
|
+ u32 target_cwnd; /* target cwnd, based on BDP */
|
|
+		u8	undo:1,		/* Undo event happened but not yet logged */
+			unused:7;
+		char	event;		/* single-letter event debug codes */
+		u16	unused2;
+	} debug;
+};
+
+struct bbr_context {
+	u32 sample_bw;
+	u32 target_cwnd;
+	u32 log:1;
+};
+
+/* Window length of min_rtt filter (in sec). Max allowed value is 31 (0x1F) */
+static u32 bbr_min_rtt_win_sec = 10;
+/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode.
+ * Max allowed value is 511 (0x1FF).
+ */
+static u32 bbr_probe_rtt_mode_ms = 200;
+/* Window length of probe_rtt_min_us filter (in ms), and consequently the
+ * typical interval between PROBE_RTT mode entries.
+ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC
+ */
+static u32 bbr_probe_rtt_win_ms = 5000;
+/* Skip TSO below the following bandwidth (bits/sec): */
+static int bbr_min_tso_rate = 1200000;
+
+/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
+ * in bigger TSO bursts. By default we cut the RTT-based allowance in half
+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
+ * is below 1500 bytes after 6 * ~500 usec = 3ms.
+ */
+static u32 bbr_tso_rtt_shift = 9;  /* halve allowance per 2^9 usecs, 512us */
+
+/* Select cwnd TSO budget approach:
+ *  0: padding
+ *  1: flooring
+ */
+static uint bbr_cwnd_tso_budget = 1;
+
+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
+ * In order to help drive the network toward lower queues and low latency while
+ * maintaining high utilization, the average pacing rate aims to be slightly
+ * lower than the estimated bandwidth. This is an important aspect of the
+ * design.
+ */
+static const int bbr_pacing_margin_percent = 1;
+
+/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
+ * that will allow a smoothly increasing pacing rate that will double each RTT
+ * and send the same number of packets per RTT that an un-paced, slow-starting
+ * Reno or CUBIC flow would. Max allowed value is 2047 (0x7FF).
+ */
+static int bbr_high_gain  = BBR_UNIT * 2885 / 1000 + 1;
+/* The gain for deriving startup cwnd. Max allowed value is 2047 (0x7FF). */
+static int bbr_startup_cwnd_gain  = BBR_UNIT * 2885 / 1000 + 1;
+/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
+ * the queue created in BBR_STARTUP in a single round. Max allowed value
+ * is 1023 (0x3FF).
+ */
+static int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
+/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs.
+ * Max allowed value is 2047 (0x7FF).
+ */
+static int bbr_cwnd_gain  = BBR_UNIT * 2;
+/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw.
+ * Max allowed value for each element is 1023 (0x3FF).
+ */
+enum bbr_pacing_gain_phase {
+	BBR_BW_PROBE_UP		= 0,  /* push up inflight to probe for bw/vol */
+	BBR_BW_PROBE_DOWN	= 1,  /* drain excess inflight from the queue */
+	BBR_BW_PROBE_CRUISE	= 2,  /* use pipe, w/ headroom in queue/pipe */
+	BBR_BW_PROBE_REFILL	= 3,  /* v2: refill the pipe again to 100% */
+};
+static int bbr_pacing_gain[] = {
+	BBR_UNIT * 5 / 4,	/* probe for more available bw */
+	BBR_UNIT * 3 / 4,	/* drain queue and/or yield bw to other flows */
+	BBR_UNIT, BBR_UNIT, BBR_UNIT,	/* cruise at 1.0*bw to utilize pipe, */
+	BBR_UNIT, BBR_UNIT, BBR_UNIT	/* without creating excess queue... */
+};
+
+/* Try to keep at least this many packets in flight, if things go smoothly. For
+ * smooth functioning, a sliding window protocol ACKing every other packet
+ * needs at least 4 packets in flight. Max allowed value is 15 (0xF).
+ */
+static u32 bbr_cwnd_min_target = 4;
+
+/* Cwnd to BDP proportion in PROBE_RTT mode scaled by BBR_UNIT. Default: 50%.
+ * Use 0 to disable. Max allowed value is 255.
+ */
+static u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2;
+
+/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
+/* If bw has increased significantly (1.25x), there may be more bw available.
+ * Max allowed value is 1023 (0x3FF).
+ */
+static u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
+/* But after 3 rounds w/o significant bw growth, estimate pipe is full.
+ * Max allowed value is 7 (0x7).
+ */
+static u32 bbr_full_bw_cnt = 3;
+
+static u32 bbr_flags;		/* Debugging related stuff */
+
+/* Whether to debug using printk.
+ */
+static bool bbr_debug_with_printk;
+
+/* Whether to debug using ftrace event tcp:tcp_bbr_event.
+ * Ignored when bbr_debug_with_printk is set.
+ */
+static bool bbr_debug_ftrace;
+
+/* Experiment: each cycle, try to hold sub-unity gain until inflight <= BDP. */
+static bool bbr_drain_to_target = true;		/* default: enabled */
+
+/* Experiment: Flags to control BBR with ECN behavior.
+ */
+static bool bbr_precise_ece_ack = true;		/* default: enabled */
+
+/* The max rwin scaling shift factor is 14 (RFC 1323), so the max sane rwin is
+ * (2^(16+14) B)/(1024 B/packet) = 1M packets.
+ */
+static u32 bbr_cwnd_warn_val	= 1U << 20;
+
+static u16 bbr_debug_port_mask;
+
+/* BBR module parameters. These are module parameters only in Google prod.
+ * Upstream these are intentionally not module parameters.
+ */
+static int bbr_pacing_gain_size = CYCLE_LEN;
+
+/* Gain factor for adding extra_acked to target cwnd: */
+static int bbr_extra_acked_gain = 256;
+
+/* Window length of extra_acked window. Max allowed val is 31. */
+static u32 bbr_extra_acked_win_rtts = 5;
+
+/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */
+static u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
+
+/* Time period for clamping cwnd increment due to ack aggregation */
+static u32 bbr_extra_acked_max_us = 100 * 1000;
+
+/* Use extra acked in startup ?
+ * 0: disabled
+ * 1: use latest extra_acked value from 1-2 rtt in startup
+ */
+static int bbr_extra_acked_in_startup = 1;	/* default: enabled */
+
+/* Experiment: don't grow cwnd beyond twice of what we just probed. */
+static bool bbr_usage_based_cwnd;		/* default: disabled */
+
+/* For lab testing, researchers can enable BBRv2 ECN support with this flag,
+ * when they know that any ECN marks that the connections experience will be
+ * DCTCP/L4S-style ECN marks, rather than RFC3168 ECN marks.
+ * TODO(ncardwell): Production use of the BBRv2 ECN functionality depends on
+ * negotiation or configuration that is outside the scope of the BBRv2
+ * alpha release.
+ */
+static bool bbr_ecn_enable = false;
+
+module_param_named(min_tso_rate,      bbr_min_tso_rate,      int,    0644);
+module_param_named(tso_rtt_shift,     bbr_tso_rtt_shift,     int,    0644);
+module_param_named(high_gain,         bbr_high_gain,         int,    0644);
+module_param_named(drain_gain,        bbr_drain_gain,        int,    0644);
+module_param_named(startup_cwnd_gain, bbr_startup_cwnd_gain, int,    0644);
+module_param_named(cwnd_gain,         bbr_cwnd_gain,         int,    0644);
+module_param_array_named(pacing_gain, bbr_pacing_gain,       int,
+			 &bbr_pacing_gain_size, 0644);
+module_param_named(cwnd_min_target,   bbr_cwnd_min_target,   uint,   0644);
+module_param_named(probe_rtt_cwnd_gain,
+		   bbr_probe_rtt_cwnd_gain,		     uint,   0664);
+module_param_named(cwnd_warn_val,     bbr_cwnd_warn_val,     uint,   0664);
+module_param_named(debug_port_mask,   bbr_debug_port_mask,   ushort, 0644);
+module_param_named(flags,             bbr_flags,             uint,   0644);
+module_param_named(debug_ftrace,      bbr_debug_ftrace,      bool,   0644);
+module_param_named(debug_with_printk, bbr_debug_with_printk, bool,   0644);
+module_param_named(min_rtt_win_sec,   bbr_min_rtt_win_sec,   uint,   0644);
+module_param_named(probe_rtt_mode_ms, bbr_probe_rtt_mode_ms, uint,   0644);
+module_param_named(probe_rtt_win_ms,  bbr_probe_rtt_win_ms,  uint,   0644);
+module_param_named(full_bw_thresh,    bbr_full_bw_thresh,    uint,   0644);
+module_param_named(full_bw_cnt,       bbr_full_bw_cnt,       uint,   0644);
+module_param_named(cwnd_tso_budget,   bbr_cwnd_tso_budget,   uint,   0664);
+module_param_named(extra_acked_gain,  bbr_extra_acked_gain,  int,    0664);
+module_param_named(extra_acked_win_rtts,
+		   bbr_extra_acked_win_rtts, uint,   0664);
+module_param_named(extra_acked_max_us,
+		   bbr_extra_acked_max_us, uint,   0664);
+module_param_named(ack_epoch_acked_reset_thresh,
+		   bbr_ack_epoch_acked_reset_thresh, uint,   0664);
+module_param_named(drain_to_target,   bbr_drain_to_target,   bool,   0664);
+module_param_named(precise_ece_ack,   bbr_precise_ece_ack,   bool,   0664);
+module_param_named(extra_acked_in_startup,
+		   bbr_extra_acked_in_startup, int, 0664);
+module_param_named(usage_based_cwnd,  bbr_usage_based_cwnd,  bool,   0664);
+module_param_named(ecn_enable,        bbr_ecn_enable,        bool,   0664);
+
+static void bbr2_exit_probe_rtt(struct sock *sk);
+static void bbr2_reset_congestion_signals(struct sock *sk);
+
+static void bbr_check_probe_rtt_done(struct sock *sk);
+
+/* Do we estimate that STARTUP filled the pipe? */
+static bool bbr_full_bw_reached(const struct sock *sk)
+{
+	const struct bbr *bbr = inet_csk_ca(sk);
+
+	return bbr->full_bw_reached;
+}
+
+/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
+static u32 bbr_max_bw(const struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return max(bbr->bw_hi[0], bbr->bw_hi[1]);
+}
+
+/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
+static u32 bbr_bw(const struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return min(bbr_max_bw(sk), bbr->bw_lo);
+}
+
+/* Return maximum extra acked in past k-2k round trips,
+ * where k = bbr_extra_acked_win_rtts.
+ */
+static u16 bbr_extra_acked(const struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return max(bbr->extra_acked[0], bbr->extra_acked[1]);
+}
+
+/* Return rate in bytes per second, optionally with a gain.
+ * The order here is chosen carefully to avoid overflow of u64. This should
+ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
+ */
+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain,
+				  int margin)
+{
+	unsigned int mss = tcp_sk(sk)->mss_cache;
+
+	rate *= mss;
+	rate *= gain;
+	rate >>= BBR_SCALE;
+	rate *= USEC_PER_SEC / 100 * (100 - margin);
+	rate >>= BW_SCALE;
+	rate = max(rate, 1ULL);
+	return rate;
+}
+
+static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate)
+{
+	return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0);
+}
+
+static u64 bbr_rate_kbps(struct sock *sk, u64 rate)
+{
+	rate = bbr_bw_bytes_per_sec(sk, rate);
+	rate *= 8;
+	do_div(rate, 1000);
+	return rate;
+}
+
+static u32 bbr_tso_segs_goal(struct sock *sk);
+static void bbr_debug(struct sock *sk, u32 acked,
+		      const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	static const char ca_states[] = {
+		[TCP_CA_Open]		= 'O',
+		[TCP_CA_Disorder]	= 'D',
+		[TCP_CA_CWR]		= 'C',
+		[TCP_CA_Recovery]	= 'R',
+		[TCP_CA_Loss]		= 'L',
+	};
+	static const char mode[] = {
+		'G',  /* Growing   - BBR_STARTUP */
+		'D',  /* Drain     - BBR_DRAIN */
+		'W',  /* Window    - BBR_PROBE_BW */
+		'M',  /* Min RTT   - BBR_PROBE_RTT */
+	};
+	static const char ack_phase[] = { /* bbr_ack_phase strings */
+		'I',	/* BBR_ACKS_INIT	   - 'Init' */
+		'R',	/* BBR_ACKS_REFILLING	   - 'Refilling' */
+		'B',	/* BBR_ACKS_PROBE_STARTING - 'Before' */
+		'F',	/* BBR_ACKS_PROBE_FEEDBACK - 'Feedback' */
+		'A',	/* BBR_ACKS_PROBE_STOPPING - 'After' */
+	};
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	const u32 una = tp->snd_una - bbr->debug.snd_isn;
+	const u32 fack = tcp_highest_sack_seq(tp);
+	const u16 dport = ntohs(inet_sk(sk)->inet_dport);
+	bool is_port_match = (bbr_debug_port_mask &&
+			      ((dport & bbr_debug_port_mask) == 0));
+	char debugmsg[320];
+
+	if (sk->sk_state == TCP_SYN_SENT)
+		return;  /* no bbr_init() yet if SYN retransmit -> CA_Loss */
+
+	if (!tp->snd_cwnd || tp->snd_cwnd > bbr_cwnd_warn_val) {
+		char addr[INET6_ADDRSTRLEN + 10] = { 0 };
+
+		if (sk->sk_family == AF_INET)
+			snprintf(addr, sizeof(addr), "%pI4:%u",
+				 &inet_sk(sk)->inet_daddr, dport);
+		else if (sk->sk_family == AF_INET6)
+			snprintf(addr, sizeof(addr), "%pI6:%u",
+				 &sk->sk_v6_daddr, dport);
+
+		WARN_ONCE(1,
+			"BBR %s cwnd alert: %u "
+			"snd_una: %u ca: %d pacing_gain: %u cwnd_gain: %u "
+			"bw: %u rtt: %u min_rtt: %u "
+			"acked: %u tso_segs: %u "
+			"bw: %d %ld %d pif: %u\n",
+			addr, tp->snd_cwnd,
+			una, inet_csk(sk)->icsk_ca_state,
+			bbr->pacing_gain, bbr->cwnd_gain,
+			bbr_max_bw(sk), (tp->srtt_us >> 3), bbr->min_rtt_us,
+			acked, bbr_tso_segs_goal(sk),
+			rs->delivered, rs->interval_us, rs->is_retrans,
+			tcp_packets_in_flight(tp));
+	}
+
+	if (likely(!bbr_debug_with_printk && !bbr_debug_ftrace))
+		return;
+
+	if (!sock_flag(sk, SOCK_DBG) && !is_port_match)
+		return;
+
+	if (!ctx->log && !tp->app_limited && !(bbr_flags & FLAG_DEBUG_VERBOSE))
+		return;
+
+	if (ipv4_is_loopback(inet_sk(sk)->inet_daddr) &&
+	    !(bbr_flags & FLAG_DEBUG_LOOPBACK))
+		return;
+
+	snprintf(debugmsg, sizeof(debugmsg) - 1,
+		 "BBR %pI4:%-5u %5u,%03u:%-7u %c "
+		 "%c %2u br %2u cr %2d rtt %5ld d %2d i %5ld mrtt %d %cbw %llu "
+		 "bw %llu lb %llu ib %llu qb %llu "
+		 "a %u if %2u %c %c dl %u l %u al %u # %u t %u %c %c "
+		 "lr %d er %d ea %d bwl %lld il %d ih %d c %d "
+		 "v %d %c %u %c %s\n",
+		 &inet_sk(sk)->inet_daddr, dport,
+		 una / 1000, una % 1000, fack - tp->snd_una,
+		 ca_states[inet_csk(sk)->icsk_ca_state],
+		 bbr->debug.undo ? '@' : mode[bbr->mode],
+		 tp->snd_cwnd,
+		 bbr_extra_acked(sk),	/* br (legacy): extra_acked */
+		 rs->tx_in_flight,	/* cr (legacy): tx_inflight */
+		 rs->rtt_us,
+		 rs->delivered,
+		 rs->interval_us,
+		 bbr->min_rtt_us,
+		 rs->is_app_limited ? '_' : 'l',
+		 bbr_rate_kbps(sk, ctx->sample_bw), /* lbw: latest sample bw */
+		 bbr_rate_kbps(sk, bbr_max_bw(sk)), /* bw: max bw */
+		 0ULL,				    /* lb: [obsolete] */
+		 0ULL,				    /* ib: [obsolete] */
+		 div_u64((u64)sk->sk_pacing_rate * 8, 1000),
+		 acked,
+		 tcp_packets_in_flight(tp),
+		 rs->is_ack_delayed ? 'd' : '.',
+		 bbr->round_start ? '*' : '.',
+		 tp->delivered, tp->lost,
+		 tp->app_limited,
+		 0,				    /* #: [obsolete] */
+		 ctx->target_cwnd,
+		 tp->reord_seen ? 'r' : '.',  /* r: reordering seen? */
+		 ca_states[bbr->prev_ca_state],
+		 (rs->lost + rs->delivered) > 0 ?
+		 (1000 * rs->lost /
+		  (rs->lost + rs->delivered)) : 0,    /* lr: loss rate x1000 */
+		 (rs->delivered) > 0 ?
+		 (1000 * rs->delivered_ce /
+		  (rs->delivered)) : 0,		      /* er: ECN rate x1000 */
+		 1000 * bbr->ecn_alpha >> BBR_SCALE,  /* ea: ECN alpha x1000 */
+		 bbr->bw_lo == ~0U ?
+		   -1 : (s64)bbr_rate_kbps(sk, bbr->bw_lo), /* bwl */
+		 bbr->inflight_lo,	/* il */
+		 bbr->inflight_hi,	/* ih */
+		 bbr->bw_probe_up_cnt,	/* c */
+		 2,			/* v: version */
+		 bbr->debug.event,
+		 bbr->cycle_idx,
+		 ack_phase[bbr->ack_phase],
+		 bbr->bw_probe_samples ? "Y" : "N");
+	debugmsg[sizeof(debugmsg) - 1] = 0;
+
+	/* printk takes a higher precedence. */
+	if (bbr_debug_with_printk)
+		printk(KERN_DEBUG "%s", debugmsg);
+
+	if (unlikely(bbr->debug.undo))
+		bbr->debug.undo = 0;
+}
+
+/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
+{
+	u64 rate = bw;
+
+	rate = bbr_rate_bytes_per_sec(sk, rate, gain,
+				      bbr_pacing_margin_percent);
+	rate = min_t(u64, rate, sk->sk_max_pacing_rate);
+	return rate;
+}
+
+/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
+static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u64 bw;
+	u32 rtt_us;
+
+	if (tp->srtt_us) {		/* any RTT sample yet? */
+		rtt_us = max(tp->srtt_us >> 3, 1U);
+		bbr->has_seen_rtt = 1;
+	} else {			 /* no RTT sample yet */
+		rtt_us = USEC_PER_MSEC;	 /* use nominal default RTT */
+	}
+	bw = (u64)tp->snd_cwnd * BW_UNIT;
+	do_div(bw, rtt_us);
+	sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr->params.high_gain);
+}
+
+/* Pace using current bw estimate and a gain factor. */
+static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);
+
+	if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
+		bbr_init_pacing_rate_from_rtt(sk);
+	if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate)
+		sk->sk_pacing_rate = rate;
+}
+
+static u32 bbr_min_tso_segs(struct sock *sk)
+{
+	return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
+}
+
+/* Return the number of segments BBR would like in a TSO/GSO skb, given
+ * a particular max gso size as a constraint.
+ */
+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now,
+				u32 gso_max_size)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 segs, r;
+	u64 bytes;
+
+	/* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */
+	bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift;
+
+	/* Budget a TSO/GSO burst size allowance based on min_rtt. For every
+	 * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst.
+	 * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K)
+	 */
+	if (bbr->params.tso_rtt_shift) {
+		r = bbr->min_rtt_us >> bbr->params.tso_rtt_shift;
+		if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */
+			bytes += GSO_MAX_SIZE >> r;
+	}
+
+	bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER);
+	segs = max_t(u32, div_u64(bytes, mss_now), bbr_min_tso_segs(sk));
+	return segs;
+}
+
+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */
+static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now)
+{
+	return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size);
+}
+
+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */
+static u32 bbr_tso_segs_goal(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE);
+}
+
+/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
+static void bbr_save_cwnd(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
+		bbr->prior_cwnd = tp->snd_cwnd;  /* this cwnd is good enough */
+	else  /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
+		bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd);
+}
+
+static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (event == CA_EVENT_TX_START && tp->app_limited) {
+		bbr->idle_restart = 1;
+		bbr->ack_epoch_mstamp = tp->tcp_mstamp;
+		bbr->ack_epoch_acked = 0;
+		/* Avoid pointless buffer overflows: pace at est. bw if we don't
+		 * need more speed (we're restarting from idle and app-limited).
+		 */
+		if (bbr->mode == BBR_PROBE_BW)
+			bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
+		else if (bbr->mode == BBR_PROBE_RTT)
+			bbr_check_probe_rtt_done(sk);
+	} else if ((event == CA_EVENT_ECN_IS_CE ||
+		    event == CA_EVENT_ECN_NO_CE) &&
+		    bbr_ecn_enable &&
+		    bbr->params.precise_ece_ack) {
+		u32 state = bbr->ce_state;
+		dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state);
+		bbr->ce_state = state;
+		if (tp->fast_ack_mode == 2 && event == CA_EVENT_ECN_IS_CE)
+			tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
+	}
+}
+
+/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth:
+ *
+ * bdp = ceil(bw * min_rtt * gain)
+ *
+ * The key factor, gain, controls the amount of queue. While a small gain
+ * builds a smaller queue, it becomes more vulnerable to noise in RTT
+ * measurements (e.g., delayed ACKs or other ACK compression effects). This
+ * noise may cause BBR to under-estimate the rate.
+ */
+static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 bdp;
+	u64 w;
+
+	/* If we've never had a valid RTT sample, cap cwnd at the initial
+	 * default. This should only happen when the connection is not using TCP
+	 * timestamps and has retransmitted all of the SYN/SYNACK/data packets
+	 * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
+	 * case we need to slow-start up toward something safe: initial cwnd.
+	 */
+	if (unlikely(bbr->min_rtt_us == ~0U))	 /* no valid RTT samples yet? */
+		return bbr->init_cwnd;  /* be safe: cap at initial cwnd */
+
+	w = (u64)bw * bbr->min_rtt_us;
+
+	/* Apply a gain to the given value, remove the BW_SCALE shift, and
+	 * round the value up to avoid a negative feedback loop.
+	 */
+	bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+
+	return bdp;
+}
+
+/* To achieve full performance in high-speed paths, we budget enough cwnd to
+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
+ *   - one skb in sending host Qdisc,
+ *   - one skb in sending host TSO/GSO engine
+ *   - one skb being received by receiver host LRO/GRO/delayed-ACK engine
+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe
+ * full even with ACK-every-other-packet delayed ACKs.
+ */
+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 tso_segs_goal;
+
+	tso_segs_goal = 3 * bbr_tso_segs_goal(sk);
+
+	/* Allow enough full-sized skbs in flight to utilize end systems. */
+	if (bbr->params.cwnd_tso_budget == 1) {
+		cwnd = max_t(u32, cwnd, tso_segs_goal);
+		cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target);
+	} else {
+		cwnd += tso_segs_goal;
+		cwnd = (cwnd + 1) & ~1U;
+	}
+	/* Ensure gain cycling gets inflight above BDP even for small BDPs. */
+	if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP)
+		cwnd += 2;
+
+	return cwnd;
+}
+
+/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */
+static u32 bbr_inflight(struct sock *sk, u32 bw, int gain)
+{
+	u32 inflight;
+
+	inflight = bbr_bdp(sk, bw, gain);
+	inflight = bbr_quantization_budget(sk, inflight);
+
+	return inflight;
+}
+
+/* With pacing at lower layers, there's often less data "in the network" than
+ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
+ * we often have several skbs queued in the pacing layer with a pre-scheduled
+ * earliest departure time (EDT). BBR adapts its pacing rate based on the
+ * inflight level that it estimates has already been "baked in" by previous
+ * departure time decisions. We calculate a rough estimate of the number of our
+ * packets that might be in the network at the earliest departure time for the
+ * next skb scheduled:
+ *   in_network_at_edt = inflight_at_edt - (EDT - now) * bw
+ * If we're increasing inflight, then we want to know if the transmit of the
+ * EDT skb will push inflight above the target, so inflight_at_edt includes
+ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight,
+ * then estimate if inflight will sink too low just before the EDT transmit.
+ */
+static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u64 now_ns, edt_ns, interval_us;
+	u32 interval_delivered, inflight_at_edt;
+
+	now_ns = tp->tcp_clock_cache;
+	edt_ns = max(tp->tcp_wstamp_ns, now_ns);
+	interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC);
+	interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE;
+	inflight_at_edt = inflight_now;
+	if (bbr->pacing_gain > BBR_UNIT)              /* increasing inflight */
+		inflight_at_edt += bbr_tso_segs_goal(sk);  /* include EDT skb */
+	if (interval_delivered >= inflight_at_edt)
+		return 0;
+	return inflight_at_edt - interval_delivered;
+}
+
+/* Find the cwnd increment based on estimate of ack aggregation */
+static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 max_aggr_cwnd, aggr_cwnd = 0;
+
+	if (bbr->params.extra_acked_gain &&
+	    (bbr_full_bw_reached(sk) || bbr->params.extra_acked_in_startup)) {
+		max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
+				/ BW_UNIT;
+		aggr_cwnd = (bbr->params.extra_acked_gain * bbr_extra_acked(sk))
+			     >> BBR_SCALE;
+		aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd);
+	}
+
+	return aggr_cwnd;
+}
+
+/* Returns the cwnd for PROBE_RTT mode. */
+static u32 bbr_probe_rtt_cwnd(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr->params.probe_rtt_cwnd_gain == 0)
+		return bbr->params.cwnd_min_target;
+	return max_t(u32, bbr->params.cwnd_min_target,
+		     bbr_bdp(sk, bbr_bw(sk), bbr->params.probe_rtt_cwnd_gain));
+}
+
+/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
+ * has drawn us down below target), or snap down to target if we're above it.
+ */
+static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
+			 u32 acked, u32 bw, int gain, u32 cwnd,
+			 struct bbr_context *ctx)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 target_cwnd = 0, prev_cwnd = tp->snd_cwnd, max_probe;
+
+	if (!acked)
+		goto done;  /* no packet fully ACKed; just apply caps */
+
+	target_cwnd = bbr_bdp(sk, bw, gain);
+
+	/* Increment the cwnd to account for excess ACKed data that seems
+	 * due to aggregation (of data and/or ACKs) visible in the ACK stream.
+	 */
+	target_cwnd += bbr_ack_aggregation_cwnd(sk);
+	target_cwnd = bbr_quantization_budget(sk, target_cwnd);
+
+	/* If we're below target cwnd, slow start cwnd toward target cwnd. */
+	bbr->debug.target_cwnd = target_cwnd;
+
+	/* Update cwnd and enable fast path if cwnd reaches target_cwnd. */
+	bbr->try_fast_path = 0;
+	if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */
+		cwnd += acked;
+		if (cwnd >= target_cwnd) {
+			cwnd = target_cwnd;
+			bbr->try_fast_path = 1;
+		}
+	} else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) {
+		cwnd += acked;
+	} else {
+		bbr->try_fast_path = 1;
+	}
+
+	/* When growing cwnd, don't grow beyond twice what we just probed. */
+	if (bbr->params.usage_based_cwnd) {
+		max_probe = max(2 * tp->max_packets_out, tp->snd_cwnd);
+		cwnd = min(cwnd, max_probe);
+	}
+
+	cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target);
+done:
+	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);	/* apply global cap */
+	if (bbr->mode == BBR_PROBE_RTT)  /* drain queue, refresh min_rtt */
+		tp->snd_cwnd = min_t(u32, tp->snd_cwnd, bbr_probe_rtt_cwnd(sk));
+
+	ctx->target_cwnd = target_cwnd;
+	ctx->log = (tp->snd_cwnd != prev_cwnd);
+}
+
+/* See if we have reached next round trip */
+static void bbr_update_round_start(struct sock *sk,
+		const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->round_start = 0;
+
+	/* See if we've reached the next RTT */
+	if (rs->interval_us > 0 &&
+	    !before(rs->prior_delivered, bbr->next_rtt_delivered)) {
+		bbr->next_rtt_delivered = tp->delivered;
+		bbr->round_start = 1;
+	}
+}
+
+/* Calculate the bandwidth based on how fast packets are delivered */
+static void bbr_calculate_bw_sample(struct sock *sk,
+			const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u64 bw = 0;
+
+	/* Divide delivered by the interval to find a (lower bound) bottleneck
+	 * bandwidth sample. Delivered is in packets and interval_us in uS and
+	 * ratio will be <<1 for most connections. So delivered is first scaled.
+	 * Round up to allow growth at low rates, even with integer division.
+	 */
+	if (rs->interval_us > 0) {
+		if (WARN_ONCE(rs->delivered < 0,
+			      "negative delivered: %d interval_us: %ld\n",
+			      rs->delivered, rs->interval_us))
+			return;
+
+		bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us);
+	}
+
+	ctx->sample_bw = bw;
+	bbr->debug.rs_bw = bw;
+}
+
+/* Estimates the windowed max degree of ack aggregation.
+ * This is used to provision extra in-flight data to keep sending during
+ * inter-ACK silences.
+ *
+ * Degree of ack aggregation is estimated as extra data acked beyond expected.
+ *
+ * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval"
+ * cwnd += max_extra_acked
+ *
+ * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms).
+ * Max filter is an approximate sliding window of 5-10 (packet timed) round
+ * trips for non-startup phase, and 1-2 round trips for startup.
+ */
+static void bbr_update_ack_aggregation(struct sock *sk,
+				       const struct rate_sample *rs)
+{
+	u32 epoch_us, expected_acked, extra_acked;
+	struct bbr *bbr = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 extra_acked_win_rtts_thresh = bbr->params.extra_acked_win_rtts;
+
+	if (!bbr->params.extra_acked_gain || rs->acked_sacked <= 0 ||
+	    rs->delivered < 0 || rs->interval_us <= 0)
+		return;
+
+	if (bbr->round_start) {
+		bbr->extra_acked_win_rtts = min(0x1F,
+						bbr->extra_acked_win_rtts + 1);
+		if (bbr->params.extra_acked_in_startup &&
+		    !bbr_full_bw_reached(sk))
+			extra_acked_win_rtts_thresh = 1;
+		if (bbr->extra_acked_win_rtts >=
+		    extra_acked_win_rtts_thresh) {
+			bbr->extra_acked_win_rtts = 0;
+			bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ?
+						   0 : 1;
+			bbr->extra_acked[bbr->extra_acked_win_idx] = 0;
+		}
+	}
+
+	/* Compute how many packets we expected to be delivered over epoch. */
+	epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp,
+				      bbr->ack_epoch_mstamp);
+	expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT;
+
+ /* Reset the aggregation epoch if ACK rate is below expected rate or
|
|
+ * significantly large no. of ack received since epoch (potentially
|
|
+ * quite old epoch).
|
|
+ */
|
|
+ if (bbr->ack_epoch_acked <= expected_acked ||
|
|
+ (bbr->ack_epoch_acked + rs->acked_sacked >=
|
|
+ bbr_ack_epoch_acked_reset_thresh)) {
|
|
+ bbr->ack_epoch_acked = 0;
|
|
+ bbr->ack_epoch_mstamp = tp->delivered_mstamp;
|
|
+ expected_acked = 0;
|
|
+ }
|
|
+
|
|
+ /* Compute excess data delivered, beyond what was expected. */
|
|
+ bbr->ack_epoch_acked = min_t(u32, 0xFFFFF,
|
|
+ bbr->ack_epoch_acked + rs->acked_sacked);
|
|
+ extra_acked = bbr->ack_epoch_acked - expected_acked;
|
|
+ extra_acked = min(extra_acked, tp->snd_cwnd);
|
|
+ if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx])
|
|
+ bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;
|
|
+}
|
|
+
|
|
+/* Estimate when the pipe is full, using the change in delivery rate: BBR
|
|
+ * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
|
|
+ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
|
|
+ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
|
|
+ * higher rwin, 3: we get higher delivery rate samples. Or transient
|
|
+ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
|
|
+ * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
|
|
+ */
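+/* Example: with full_bw_thresh = 1.25 and full_bw = 1000 pkt/sec, a round
+ * must observe max_bw >= 1250 pkt/sec to reset the count; after 3
+ * consecutive non-app-limited rounds below that, full_bw_reached is set.
+ */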
+static void bbr_check_full_bw_reached(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 bw_thresh;
+
+ if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
+ return;
+
+ bw_thresh = (u64)bbr->full_bw * bbr->params.full_bw_thresh >> BBR_SCALE;
+ if (bbr_max_bw(sk) >= bw_thresh) {
+ bbr->full_bw = bbr_max_bw(sk);
+ bbr->full_bw_cnt = 0;
+ return;
+ }
+ ++bbr->full_bw_cnt;
+ bbr->full_bw_reached = bbr->full_bw_cnt >= bbr->params.full_bw_cnt;
+}
+
+/* If pipe is probably full, drain the queue and then enter steady-state. */
+static bool bbr_check_drain(struct sock *sk, const struct rate_sample *rs,
+ struct bbr_context *ctx)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
+ bbr->mode = BBR_DRAIN; /* drain queue we created */
+ tcp_sk(sk)->snd_ssthresh =
+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
+ bbr2_reset_congestion_signals(sk);
+ } /* fall through to check if in-flight is already small: */
+ if (bbr->mode == BBR_DRAIN &&
+ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
+ return true; /* exiting DRAIN now */
+ return false;
+}
+
+static void bbr_check_probe_rtt_done(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (!(bbr->probe_rtt_done_stamp &&
+ after(tcp_jiffies32, bbr->probe_rtt_done_stamp)))
+ return;
+
+ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */
+ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd);
+ bbr2_exit_probe_rtt(sk);
+}
+
+/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
+ * periodically drain the bottleneck queue, to converge to measure the true
+ * min_rtt (unloaded propagation delay). This allows the flows to keep queues
+ * small (reducing queuing delay and packet loss) and achieve fairness among
+ * BBR flows.
+ *
+ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
+ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
+ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
+ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
+ * re-enter the previous mode. BBR uses 200ms to approximately bound the
+ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
+ *
+ * Note that flows need only pay 2% if they are busy sending over the last 10
+ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
+ * natural silences or low-rate periods within 10 seconds where the rate is low
+ * enough for long enough to drain its queue in the bottleneck. We pick up
+ * these min RTT measurements opportunistically with our min_rtt filter. :-)
+ */
+static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ bool probe_rtt_expired, min_rtt_expired;
+ u32 expire;
+
+ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */
+ expire = bbr->probe_rtt_min_stamp +
+ msecs_to_jiffies(bbr->params.probe_rtt_win_ms);
+ probe_rtt_expired = after(tcp_jiffies32, expire);
+ if (rs->rtt_us >= 0 &&
+ (rs->rtt_us <= bbr->probe_rtt_min_us ||
+ (probe_rtt_expired && !rs->is_ack_delayed))) {
+ bbr->probe_rtt_min_us = rs->rtt_us;
+ bbr->probe_rtt_min_stamp = tcp_jiffies32;
+ }
+ /* Track min RTT seen in the min_rtt_win_sec filter window: */
+ expire = bbr->min_rtt_stamp + bbr->params.min_rtt_win_sec * HZ;
+ min_rtt_expired = after(tcp_jiffies32, expire);
+ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us ||
+ min_rtt_expired) {
+ bbr->min_rtt_us = bbr->probe_rtt_min_us;
+ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp;
+ }
+
+ if (bbr->params.probe_rtt_mode_ms > 0 && probe_rtt_expired &&
+ !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
+ bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */
+ bbr_save_cwnd(sk); /* note cwnd so we can restore it */
+ bbr->probe_rtt_done_stamp = 0;
+ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING;
+ bbr->next_rtt_delivered = tp->delivered;
+ }
+
+ if (bbr->mode == BBR_PROBE_RTT) {
+ /* Ignore low rate samples during this mode. */
+ tp->app_limited =
+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+ /* Maintain min packets in flight for max(200 ms, 1 round). */
+ if (!bbr->probe_rtt_done_stamp &&
+ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) {
+ bbr->probe_rtt_done_stamp = tcp_jiffies32 +
+ msecs_to_jiffies(bbr->params.probe_rtt_mode_ms);
+ bbr->probe_rtt_round_done = 0;
+ bbr->next_rtt_delivered = tp->delivered;
+ } else if (bbr->probe_rtt_done_stamp) {
+ if (bbr->round_start)
+ bbr->probe_rtt_round_done = 1;
+ if (bbr->probe_rtt_round_done)
+ bbr_check_probe_rtt_done(sk);
+ }
+ }
+ /* Restart after idle ends only once we process a new S/ACK for data */
+ if (rs->delivered > 0)
+ bbr->idle_restart = 0;
+}
+
+static void bbr_update_gains(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ switch (bbr->mode) {
+ case BBR_STARTUP:
+ bbr->pacing_gain = bbr->params.high_gain;
+ bbr->cwnd_gain = bbr->params.startup_cwnd_gain;
+ break;
+ case BBR_DRAIN:
+ bbr->pacing_gain = bbr->params.drain_gain; /* slow, to drain */
+ bbr->cwnd_gain = bbr->params.startup_cwnd_gain; /* keep cwnd */
+ break;
+ case BBR_PROBE_BW:
+ bbr->pacing_gain = bbr->params.pacing_gain[bbr->cycle_idx];
+ bbr->cwnd_gain = bbr->params.cwnd_gain;
+ break;
+ case BBR_PROBE_RTT:
+ bbr->pacing_gain = BBR_UNIT;
+ bbr->cwnd_gain = BBR_UNIT;
+ break;
+ default:
+ WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
+ break;
+ }
+}
+
+static void bbr_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ int i;
+
+ WARN_ON_ONCE(tp->snd_cwnd >= bbr_cwnd_warn_val);
+
+ bbr->initialized = 1;
+ bbr->params.high_gain = min(0x7FF, bbr_high_gain);
+ bbr->params.drain_gain = min(0x3FF, bbr_drain_gain);
+ bbr->params.startup_cwnd_gain = min(0x7FF, bbr_startup_cwnd_gain);
+ bbr->params.cwnd_gain = min(0x7FF, bbr_cwnd_gain);
+ bbr->params.cwnd_tso_budget = min(0x1U, bbr_cwnd_tso_budget);
+ bbr->params.cwnd_min_target = min(0xFU, bbr_cwnd_min_target);
+ bbr->params.min_rtt_win_sec = min(0x1FU, bbr_min_rtt_win_sec);
+ bbr->params.probe_rtt_mode_ms = min(0x1FFU, bbr_probe_rtt_mode_ms);
+ bbr->params.full_bw_cnt = min(0x7U, bbr_full_bw_cnt);
+ bbr->params.full_bw_thresh = min(0x3FFU, bbr_full_bw_thresh);
+ bbr->params.extra_acked_gain = min(0x7FF, bbr_extra_acked_gain);
+ bbr->params.extra_acked_win_rtts = min(0x1FU, bbr_extra_acked_win_rtts);
+ bbr->params.drain_to_target = bbr_drain_to_target ? 1 : 0;
+ bbr->params.precise_ece_ack = bbr_precise_ece_ack ? 1 : 0;
+ bbr->params.extra_acked_in_startup = bbr_extra_acked_in_startup ? 1 : 0;
+ bbr->params.probe_rtt_cwnd_gain = min(0xFFU, bbr_probe_rtt_cwnd_gain);
+ bbr->params.probe_rtt_win_ms =
+ min(0x3FFFU,
+ min_t(u32, bbr_probe_rtt_win_ms,
+ bbr->params.min_rtt_win_sec * MSEC_PER_SEC));
+ for (i = 0; i < CYCLE_LEN; i++)
+ bbr->params.pacing_gain[i] = min(0x3FF, bbr_pacing_gain[i]);
+ bbr->params.usage_based_cwnd = bbr_usage_based_cwnd ? 1 : 0;
+ bbr->params.tso_rtt_shift = min(0xFU, bbr_tso_rtt_shift);
+
+ bbr->debug.snd_isn = tp->snd_una;
+ bbr->debug.target_cwnd = 0;
+ bbr->debug.undo = 0;
+
+ bbr->init_cwnd = min(0x7FU, tp->snd_cwnd);
+ bbr->prior_cwnd = tp->prior_cwnd;
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ bbr->next_rtt_delivered = 0;
+ bbr->prev_ca_state = TCP_CA_Open;
+ bbr->packet_conservation = 0;
+
+ bbr->probe_rtt_done_stamp = 0;
+ bbr->probe_rtt_round_done = 0;
+ bbr->probe_rtt_min_us = tcp_min_rtt(tp);
+ bbr->probe_rtt_min_stamp = tcp_jiffies32;
+ bbr->min_rtt_us = tcp_min_rtt(tp);
+ bbr->min_rtt_stamp = tcp_jiffies32;
+
+ bbr->has_seen_rtt = 0;
+ bbr_init_pacing_rate_from_rtt(sk);
+
+ bbr->round_start = 0;
+ bbr->idle_restart = 0;
+ bbr->full_bw_reached = 0;
+ bbr->full_bw = 0;
+ bbr->full_bw_cnt = 0;
+ bbr->cycle_mstamp = 0;
+ bbr->cycle_idx = 0;
+ bbr->mode = BBR_STARTUP;
+ bbr->debug.rs_bw = 0;
+
+ bbr->ack_epoch_mstamp = tp->tcp_mstamp;
+ bbr->ack_epoch_acked = 0;
+ bbr->extra_acked_win_rtts = 0;
+ bbr->extra_acked_win_idx = 0;
+ bbr->extra_acked[0] = 0;
+ bbr->extra_acked[1] = 0;
+
+ bbr->ce_state = 0;
+ bbr->prior_rcv_nxt = tp->rcv_nxt;
+ bbr->try_fast_path = 0;
+
+ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
+}
+
+static u32 bbr_sndbuf_expand(struct sock *sk)
+{
+ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */
+ return 3;
+}
+
+/* __________________________________________________________________________
+ *
+ * Functions new to BBR v2 ("bbr") congestion control are below here.
+ * __________________________________________________________________________
+ */
+
+/* Incorporate a new bw sample into the current window of our max filter. */
+static void bbr2_take_bw_hi_sample(struct sock *sk, u32 bw)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]);
+}
+
+/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */
+static void bbr2_advance_bw_hi_filter(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (!bbr->bw_hi[1])
+ return; /* no samples in this window; remember old window */
+ bbr->bw_hi[0] = bbr->bw_hi[1];
+ bbr->bw_hi[1] = 0;
+}
+
+/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */
+static u32 bbr2_target_inflight(struct sock *sk)
+{
+ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT);
+
+ return min(bdp, tcp_sk(sk)->snd_cwnd);
+}
+
+static bool bbr2_is_probing_bandwidth(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return (bbr->mode == BBR_STARTUP) ||
+ (bbr->mode == BBR_PROBE_BW &&
+ (bbr->cycle_idx == BBR_BW_PROBE_REFILL ||
+ bbr->cycle_idx == BBR_BW_PROBE_UP));
+}
+
+/* Has the given amount of time elapsed since we marked the phase start? */
+static bool bbr2_has_elapsed_in_phase(const struct sock *sk, u32 interval_us)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct bbr *bbr = inet_csk_ca(sk);
+
+ return tcp_stamp_us_delta(tp->tcp_mstamp,
+ bbr->cycle_mstamp + interval_us) > 0;
+}
+
+static void bbr2_handle_queue_too_high_in_startup(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->full_bw_reached = 1;
+ bbr->inflight_hi = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
+}
+
+/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */
+static void bbr2_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible ||
+ !bbr->params.full_ecn_cnt || !bbr->params.ecn_thresh)
+ return;
+
+ if (ce_ratio >= bbr->params.ecn_thresh)
+ bbr->startup_ecn_rounds++;
+ else
+ bbr->startup_ecn_rounds = 0;
+
+ if (bbr->startup_ecn_rounds >= bbr->params.full_ecn_cnt) {
+ bbr->debug.event = 'E'; /* ECN caused STARTUP exit */
+ bbr2_handle_queue_too_high_in_startup(sk);
+ return;
+ }
+}
+
+static void bbr2_update_ecn_alpha(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ s32 delivered, delivered_ce;
+ u64 alpha, ce_ratio;
+ u32 gain;
+
+ if (bbr->params.ecn_factor == 0)
+ return;
+
+ delivered = tp->delivered - bbr->alpha_last_delivered;
+ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce;
+
+ if (delivered == 0 || /* avoid divide by zero */
+ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */
+ return;
+
+ /* See if we should use ECN sender logic for this connection. */
+ if (!bbr->ecn_eligible && bbr_ecn_enable &&
+ (bbr->min_rtt_us <= bbr->params.ecn_max_rtt_us ||
+ !bbr->params.ecn_max_rtt_us))
+ bbr->ecn_eligible = 1;
+
+ ce_ratio = (u64)delivered_ce << BBR_SCALE;
+ do_div(ce_ratio, delivered);
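+ /* EWMA: alpha <- (1 - gain) * alpha + gain * ce_ratio. With the
+ * default gain of 1/16, the CE mark ratio is averaged over roughly
+ * the last 16 round trips.
+ */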
+ gain = bbr->params.ecn_alpha_gain;
+ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE;
+ alpha += (gain * ce_ratio) >> BBR_SCALE;
+ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT);
+
+ bbr->alpha_last_delivered = tp->delivered;
+ bbr->alpha_last_delivered_ce = tp->delivered_ce;
+
+ bbr2_check_ecn_too_high_in_startup(sk, ce_ratio);
+}
+
+/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */
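+/* Example: with snd_cwnd = 100, successive PROBE_UP rounds use
+ * bw_probe_up_cnt = 100, 50, 25, ..., so inflight_hi grows by roughly
+ * 1, 2, 4, ... packets per round trip: exponentially faster probing.
+ */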
+static void bbr2_raise_inflight_hi_slope(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 growth_this_round, cnt;
+
+ /* Calculate "slope": packets S/Acked per inflight_hi increment. */
+ growth_this_round = 1 << bbr->bw_probe_up_rounds;
+ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30);
+ cnt = tp->snd_cwnd / growth_this_round;
+ cnt = max(cnt, 1U);
+ bbr->bw_probe_up_cnt = cnt;
+ bbr->debug.event = 'G'; /* Grow inflight_hi slope */
+}
+
+/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */
+static void bbr2_probe_inflight_hi_upward(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 delta;
+
+ if (!tp->is_cwnd_limited || tp->snd_cwnd < bbr->inflight_hi) {
+ bbr->bw_probe_up_acks = 0; /* don't accumulate unused credits */
+ return; /* not fully using inflight_hi, so don't grow it */
+ }
+
+ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */
+ bbr->bw_probe_up_acks += rs->acked_sacked;
+ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) {
+ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt;
+ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt;
+ bbr->inflight_hi += delta;
+ bbr->debug.event = 'I'; /* Increment inflight_hi */
+ }
+
+ if (bbr->round_start)
+ bbr2_raise_inflight_hi_slope(sk);
+}
+
+/* Does loss/ECN rate for this sample say inflight is "too high"?
+ * This is used by both the bbr2_check_loss_too_high_in_startup() function,
+ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which
+ * uses it to notice when loss/ECN rates suggest inflight is too high.
+ */
+static bool bbr2_is_inflight_too_high(const struct sock *sk,
+ const struct rate_sample *rs)
+{
+ const struct bbr *bbr = inet_csk_ca(sk);
+ u32 loss_thresh, ecn_thresh;
+
+ if (rs->lost > 0 && rs->tx_in_flight) {
+ loss_thresh = (u64)rs->tx_in_flight * bbr->params.loss_thresh >>
+ BBR_SCALE;
+ if (rs->lost > loss_thresh)
+ return true;
+ }
+
+ if (rs->delivered_ce > 0 && rs->delivered > 0 &&
+ bbr->ecn_eligible && bbr->params.ecn_thresh) {
+ ecn_thresh = (u64)rs->delivered * bbr->params.ecn_thresh >>
+ BBR_SCALE;
+ if (rs->delivered_ce >= ecn_thresh)
+ return true;
+ }
+
+ return false;
+}
+
+/* Calculate the tx_in_flight level that corresponded to excessive loss.
+ * We find "lost_prefix" segs of the skb where loss rate went too high,
+ * by solving for "lost_prefix" in the following equation:
+ * lost / inflight >= loss_thresh
+ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh
+ * Then we take that equation, convert it to fixed point, and
+ * round up to the nearest packet.
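+ *
+ * Solving the equality for lost_prefix gives:
+ * lost_prefix = (loss_thresh * inflight_prev - lost_prev) / (1 - loss_thresh)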
+ */
+static u32 bbr2_inflight_hi_from_lost_skb(const struct sock *sk,
+ const struct rate_sample *rs,
+ const struct sk_buff *skb)
+{
+ const struct bbr *bbr = inet_csk_ca(sk);
+ u32 loss_thresh = bbr->params.loss_thresh;
+ u32 pcount, divisor, inflight_hi;
+ s32 inflight_prev, lost_prev;
+ u64 loss_budget, lost_prefix;
+
+ pcount = tcp_skb_pcount(skb);
+
+ /* How much data was in flight before this skb? */
+ inflight_prev = rs->tx_in_flight - pcount;
+ if (WARN_ONCE(inflight_prev < 0,
+ "tx_in_flight: %u pcount: %u reneg: %u",
+ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg))
+ return ~0U;
+
+ /* How much inflight data was marked lost before this skb? */
+ lost_prev = rs->lost - pcount;
+ if (WARN_ON_ONCE(lost_prev < 0))
+ return ~0U;
+
+ /* At what prefix of this lost skb did loss rate exceed loss_thresh? */
+ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1;
+ loss_budget >>= BBR_SCALE;
+ if (lost_prev >= loss_budget) {
+ lost_prefix = 0; /* previous losses crossed loss_thresh */
+ } else {
+ lost_prefix = loss_budget - lost_prev;
+ lost_prefix <<= BBR_SCALE;
+ divisor = BBR_UNIT - loss_thresh;
+ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */
+ return ~0U;
+ do_div(lost_prefix, divisor);
+ }
+
+ inflight_hi = inflight_prev + lost_prefix;
+ return inflight_hi;
+}
+
+/* If loss/ECN rates during probing indicated we may have overfilled a
+ * buffer, return an operating point that tries to leave unutilized headroom in
+ * the path for other flows, for fairness convergence and lower RTTs and loss.
+ */
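+/* Example: inflight_hi = 100 with the default 15% headroom yields a
+ * cruising target of max(100 - 15, cwnd_min_target) = 85 packets.
+ */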
+static u32 bbr2_inflight_with_headroom(const struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 headroom, headroom_fraction;
+
+ if (bbr->inflight_hi == ~0U)
+ return ~0U;
+
+ headroom_fraction = bbr->params.inflight_headroom;
+ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE;
+ headroom = max(headroom, 1U);
+ return max_t(s32, bbr->inflight_hi - headroom,
+ bbr->params.cwnd_min_target);
+}
+
+/* Bound cwnd to a sensible level, based on our current probing state
+ * machine phase and model of a good inflight level (inflight_lo, inflight_hi).
+ */
+static void bbr2_bound_cwnd_for_inflight_model(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 cap;
+
+ /* tcp_rcv_synsent_state_process() currently calls tcp_ack()
+ * and thus cong_control() without first initializing us(!).
+ */
+ if (!bbr->initialized)
+ return;
+
+ cap = ~0U;
+ if (bbr->mode == BBR_PROBE_BW &&
+ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) {
+ /* Probe to see if more packets fit in the path. */
+ cap = bbr->inflight_hi;
+ } else {
+ if (bbr->mode == BBR_PROBE_RTT ||
+ (bbr->mode == BBR_PROBE_BW &&
+ bbr->cycle_idx == BBR_BW_PROBE_CRUISE))
+ cap = bbr2_inflight_with_headroom(sk);
+ }
+ /* Adapt to any loss/ECN since our last bw probe. */
+ cap = min(cap, bbr->inflight_lo);
+
+ cap = max_t(u32, cap, bbr->params.cwnd_min_target);
+ tp->snd_cwnd = min(cap, tp->snd_cwnd);
+}
+
+/* Estimate a short-term lower bound on the capacity available now, based
+ * on measurements of the current delivery process and recent history. When we
+ * are seeing loss/ECN at times when we are not probing bw, then conservatively
+ * move toward flow balance by multiplicatively cutting our short-term
+ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a
+ * multiplicative decrease in order to converge to a lower capacity in time
+ * logarithmic in the magnitude of the decrease.
+ *
+ * However, we do not cut our short-term estimates lower than the current rate
+ * and volume of delivered data from this round trip, since from the current
+ * delivery process we can estimate the measured capacity available now.
+ *
+ * Anything faster than that approach would knowingly risk high loss, which can
+ * cause low bw for Reno/CUBIC and high loss recovery latency for
+ * request/response flows using any congestion control.
+ */
+static void bbr2_adapt_lower_bounds(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 ecn_cut, ecn_inflight_lo, beta;
+
+ /* We only use lower-bound estimates when not probing bw.
+ * When probing we need to push inflight higher to probe bw.
+ */
+ if (bbr2_is_probing_bandwidth(sk))
+ return;
+
+ /* ECN response. */
+ if (bbr->ecn_in_round && bbr->ecn_eligible && bbr->params.ecn_factor) {
+ /* Reduce inflight to (1 - alpha*ecn_factor). */
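+ /* e.g. with ecn_alpha = 1.0 and the default ecn_factor = 1/3,
+ * ecn_cut = 2/3, so inflight_lo is cut by one third.
+ */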
+ ecn_cut = (BBR_UNIT -
+ ((bbr->ecn_alpha * bbr->params.ecn_factor) >>
+ BBR_SCALE));
+ if (bbr->inflight_lo == ~0U)
+ bbr->inflight_lo = tp->snd_cwnd;
+ ecn_inflight_lo = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE;
+ } else {
+ ecn_inflight_lo = ~0U;
+ }
+
+ /* Loss response. */
+ if (bbr->loss_in_round) {
+ /* Reduce bw and inflight to (1 - beta). */
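+ /* e.g. with the default beta = 0.3, each loss round cuts bw_lo
+ * and inflight_lo to 70%, floored at the latest round's
+ * delivered rate and volume.
+ */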
+ if (bbr->bw_lo == ~0U)
+ bbr->bw_lo = bbr_max_bw(sk);
+ if (bbr->inflight_lo == ~0U)
+ bbr->inflight_lo = tp->snd_cwnd;
+ beta = bbr->params.beta;
+ bbr->bw_lo =
+ max_t(u32, bbr->bw_latest,
+ (u64)bbr->bw_lo *
+ (BBR_UNIT - beta) >> BBR_SCALE);
+ bbr->inflight_lo =
+ max_t(u32, bbr->inflight_latest,
+ (u64)bbr->inflight_lo *
+ (BBR_UNIT - beta) >> BBR_SCALE);
+ }
+
+ /* Adjust to the lower of the levels implied by loss or ECN. */
+ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo);
+}
+
+/* Reset any short-term lower-bound adaptation to congestion, so that we can
+ * push our inflight up.
+ */
+static void bbr2_reset_lower_bounds(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->bw_lo = ~0U;
+ bbr->inflight_lo = ~0U;
+}
+
+/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state
+ * machine phase where we adapt our lower bound based on congestion signals.
+ */
+static void bbr2_reset_congestion_signals(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->loss_in_round = 0;
+ bbr->ecn_in_round = 0;
+ bbr->loss_in_cycle = 0;
+ bbr->ecn_in_cycle = 0;
+ bbr->bw_latest = 0;
+ bbr->inflight_latest = 0;
+}
+
+/* Update (most of) our congestion signals: track the recent rate and volume of
+ * delivered data, presence of loss, and EWMA degree of ECN marking.
+ */
+static void bbr2_update_congestion_signals(
+ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw;
+
+ bbr->loss_round_start = 0;
+ if (rs->interval_us <= 0 || !rs->acked_sacked)
+ return; /* Not a valid observation */
+ bw = ctx->sample_bw;
+
+ if (!rs->is_app_limited || bw >= bbr_max_bw(sk))
+ bbr2_take_bw_hi_sample(sk, bw);
+
+ bbr->loss_in_round |= (rs->losses > 0);
+
+ /* Update rate and volume of delivered data from latest round trip: */
+ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw);
+ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered);
+
+ if (before(rs->prior_delivered, bbr->loss_round_delivered))
+ return; /* skip the per-round-trip updates */
+ /* Now do per-round-trip updates. */
+ bbr->loss_round_delivered = tp->delivered; /* mark round trip */
+ bbr->loss_round_start = 1;
+ bbr2_adapt_lower_bounds(sk);
+
+ /* Update windowed "latest" (single-round-trip) filters. */
+ bbr->loss_in_round = 0;
+ bbr->ecn_in_round = 0;
+ bbr->bw_latest = ctx->sample_bw;
+ bbr->inflight_latest = rs->delivered;
+}
+
+/* Bandwidth probing can cause loss. To help coexistence with loss-based
+ * congestion control we spread out our probing in a Reno-conscious way. Due to
+ * the shape of the Reno sawtooth, the time required between loss epochs for an
+ * idealized Reno flow is a number of round trips that is the BDP of that
+ * flow. We count packet-timed round trips directly, since measured RTT can
+ * vary widely, and Reno is driven by packet-timed round trips.
+ */
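+/* Example: a flow with target inflight 50 packets and the default
+ * bw_probe_reno_gain of 1.0 probes after min(bw_probe_max_rounds, 50) =
+ * 50 packet-timed round trips.
+ */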
+static bool bbr2_is_reno_coexistence_probe_time(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 inflight, rounds, reno_gain, reno_rounds;
+
+ /* Random loss can shave some small percentage off of our inflight
+ * in each round. To survive this, flows need robust periodic probes.
+ */
+ rounds = bbr->params.bw_probe_max_rounds;
+
+ reno_gain = bbr->params.bw_probe_reno_gain;
+ if (reno_gain) {
+ inflight = bbr2_target_inflight(sk);
+ reno_rounds = ((u64)inflight * reno_gain) >> BBR_SCALE;
+ rounds = min(rounds, reno_rounds);
+ }
+ return bbr->rounds_since_probe >= rounds;
+}
+
+/* How long do we want to wait before probing for bandwidth (and risking
+ * loss)? We randomize the wait, for better mixing and fairness convergence.
+ *
+ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips.
+ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow
+ * (e.g., 4K video to a broadband user):
+ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets
+ *
+ * We bound the BBR-native inter-bw-probe wall clock time to be:
+ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time
+ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must
+ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9 secs
+ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable
+ * amount of time to discover unutilized bw on human-scale interactive
+ * time-scales (e.g. perhaps traffic from a web page download that we
+ * were competing with is now complete).
+ */
+static void bbr2_pick_probe_wait(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ /* Decide the random round-trip bound for wait until probe: */
+ bbr->rounds_since_probe =
+ get_random_u32_below(bbr->params.bw_probe_rand_rounds);
+ /* Decide the random wall clock bound for wait until probe: */
+ bbr->probe_wait_us = bbr->params.bw_probe_base_us +
+ get_random_u32_below(bbr->params.bw_probe_rand_us);
+}
+
+static void bbr2_set_cycle_idx(struct sock *sk, int cycle_idx)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->cycle_idx = cycle_idx;
+ /* New phase, so need to update cwnd and pacing rate. */
+ bbr->try_fast_path = 0;
+}
+
+/* Send at estimated bw to fill the pipe, but not queue. We need this phase
+ * before PROBE_UP, because as soon as we send faster than the available bw
+ * we will start building a queue, and if the buffer is shallow we can cause
+ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and
+ * inflight_hi estimates will underestimate.
+ */
+static void bbr2_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr2_reset_lower_bounds(sk);
+ if (bbr->inflight_hi != ~0U)
+ bbr->inflight_hi += bbr->params.refill_add_inc;
+ bbr->bw_probe_up_rounds = bw_probe_up_rounds;
+ bbr->bw_probe_up_acks = 0;
+ bbr->stopped_risky_probe = 0;
+ bbr->ack_phase = BBR_ACKS_REFILLING;
+ bbr->next_rtt_delivered = tp->delivered;
+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_REFILL);
+}
+
+/* Now probe max deliverable data rate and volume. */
+static void bbr2_start_bw_probe_up(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->ack_phase = BBR_ACKS_PROBE_STARTING;
+ bbr->next_rtt_delivered = tp->delivered;
+ bbr->cycle_mstamp = tp->tcp_mstamp;
+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_UP);
+ bbr2_raise_inflight_hi_slope(sk);
+}
+
+/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall
+ * clock time at which to probe beyond an inflight that we think to be
+ * safe. This will knowingly risk packet loss, so we want to do this rarely, to
+ * keep packet loss rates low. Also start a round-trip counter, to probe faster
+ * if we estimate a Reno flow at our BDP would probe faster.
+ */
+static void bbr2_start_bw_probe_down(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr2_reset_congestion_signals(sk);
+ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */
+ bbr2_pick_probe_wait(sk);
+ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */
+ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING;
+ bbr->next_rtt_delivered = tp->delivered;
+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_DOWN);
+}
+
+/* Cruise: maintain what we estimate to be a neutral, conservative
+ * operating point, without attempting to probe up for bandwidth or down for
+ * RTT, and only reducing inflight in response to loss/ECN signals.
+ */
+static void bbr2_start_bw_probe_cruise(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (bbr->inflight_lo != ~0U)
+ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi);
+
+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE);
+}
+
+/* Loss and/or ECN rate is too high while probing.
+ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle.
+ */
+static void bbr2_handle_inflight_too_high(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ const u32 beta = bbr->params.beta;
+
+ bbr->prev_probe_too_high = 1;
+ bbr->bw_probe_samples = 0; /* only react once per probe */
+ bbr->debug.event = 'L'; /* Loss/ECN too high */
+ /* If we are app-limited then we are not robustly
+ * probing the max volume of inflight data we think
+ * might be safe (analogous to how app-limited bw
+ * samples are not known to be robustly probing bw).
+ */
+ if (!rs->is_app_limited)
+ bbr->inflight_hi = max_t(u32, rs->tx_in_flight,
+ (u64)bbr2_target_inflight(sk) *
+ (BBR_UNIT - beta) >> BBR_SCALE);
+ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP)
+ bbr2_start_bw_probe_down(sk);
+}
+
+/* If we're seeing bw and loss samples reflecting our bw probing, adapt
+ * using the signals we see. If loss or ECN mark rate gets too high, then adapt
+ * inflight_hi downward. If we're able to push inflight higher without such
+ * signals, push higher: adapt inflight_hi upward.
+ */
+static bool bbr2_adapt_upper_bounds(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ /* Track when we'll see bw/loss samples resulting from our bw probes. */
+ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start)
+ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK;
+ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) {
+ /* End of samples from bw probing phase. */
+ bbr->bw_probe_samples = 0;
+ bbr->ack_phase = BBR_ACKS_INIT;
+ /* At this point in the cycle, our current bw sample is also
+ * our best recent chance at finding the highest available bw
+ * for this flow. So now is the best time to forget the bw
+ * samples from the previous cycle, by advancing the window.
+ */
+ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited)
+ bbr2_advance_bw_hi_filter(sk);
+ /* If we had an inflight_hi, then probed and pushed inflight all
+ * the way up to hit that inflight_hi without seeing any
+ * high loss/ECN in all the resulting ACKs from that probing,
+ * then probe up again, this time letting inflight persist at
+ * inflight_hi for a round trip, then accelerating beyond.
+ */
+ if (bbr->mode == BBR_PROBE_BW &&
+ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) {
+ bbr->debug.event = 'R'; /* reprobe */
+ bbr2_start_bw_probe_refill(sk, 0);
+ return true; /* yes, decided state transition */
+ }
+ }
+
+ if (bbr2_is_inflight_too_high(sk, rs)) {
+ if (bbr->bw_probe_samples) /* sample is from bw probing? */
+ bbr2_handle_inflight_too_high(sk, rs);
+ } else {
+ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */
+ if (bbr->inflight_hi == ~0U) /* no excess queue signals yet? */
+ return false;
+
+ /* To be resilient to random loss, we must raise inflight_hi
+ * if we observe in any phase that a higher level is safe.
+ */
+ if (rs->tx_in_flight > bbr->inflight_hi) {
+ bbr->inflight_hi = rs->tx_in_flight;
+ bbr->debug.event = 'U'; /* raise up inflight_hi */
+ }
+
+ if (bbr->mode == BBR_PROBE_BW &&
+ bbr->cycle_idx == BBR_BW_PROBE_UP)
+ bbr2_probe_inflight_hi_upward(sk, rs);
+ }
+
+ return false;
+}
+
+/* Check if it's time to probe for bandwidth now, and if so, kick it off. */
+static bool bbr2_check_time_to_probe_bw(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 n;
+
+ /* If we seem to be at an operating point where we are not seeing loss
+ * but we are seeing ECN marks, then when the ECN marks cease we reprobe
+ * quickly (in case a burst of cross-traffic has ceased and freed up bw,
+ * or in case we are sharing with multiplicatively probing traffic).
+ */
+ if (bbr->params.ecn_reprobe_gain && bbr->ecn_eligible &&
+ bbr->ecn_in_cycle && !bbr->loss_in_cycle &&
+ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) {
+ bbr->debug.event = 'A'; /* *A*ll clear to probe *A*gain */
+ /* Calculate n so that when bbr2_raise_inflight_hi_slope()
+ * computes growth_this_round as 2^n it will be roughly the
+ * desired volume of data (inflight_hi*ecn_reprobe_gain).
+ */
+ n = ilog2((((u64)bbr->inflight_hi *
+ bbr->params.ecn_reprobe_gain) >> BBR_SCALE));
+ bbr2_start_bw_probe_refill(sk, n);
+ return true;
+ }
+
+ if (bbr2_has_elapsed_in_phase(sk, bbr->probe_wait_us) ||
+ bbr2_is_reno_coexistence_probe_time(sk)) {
+ bbr2_start_bw_probe_refill(sk, 0);
+ return true;
+ }
+ return false;
+}
+
+/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */
+static bool bbr2_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ bool is_under_bdp, is_long_enough;
+
+ /* Always need to pull inflight down to leave headroom in queue. */
+ if (inflight > bbr2_inflight_with_headroom(sk))
+ return false;
+
+ is_under_bdp = inflight <= bbr_inflight(sk, bw, BBR_UNIT);
+ if (bbr->params.drain_to_target)
+ return is_under_bdp;
+
+ is_long_enough = bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us);
+ return is_under_bdp || is_long_enough;
+}
+
+/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */
+static void bbr2_update_cycle_phase(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ bool is_risky = false, is_queuing = false;
+ u32 inflight, bw;
+
+ if (!bbr_full_bw_reached(sk))
+ return;
+
+ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */
+ if (bbr2_adapt_upper_bounds(sk, rs))
+ return; /* already decided state transition */
+
+ if (bbr->mode != BBR_PROBE_BW)
+ return;
+
+ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
+ bw = bbr_max_bw(sk);
+
+ switch (bbr->cycle_idx) {
+ /* First we spend most of our time cruising with a pacing_gain of 1.0,
+ * which paces at the estimated bw, to try to fully use the pipe
+ * without building queue. If we encounter loss/ECN marks, we adapt
+ * by slowing down.
+ */
+ case BBR_BW_PROBE_CRUISE:
+ if (bbr2_check_time_to_probe_bw(sk))
+ return; /* already decided state transition */
+ break;
+
+ /* After cruising, when it's time to probe, we first "refill": we send
+ * at the estimated bw to fill the pipe, before probing higher and
+ * knowingly risking overflowing the bottleneck buffer (causing loss).
+ */
+ case BBR_BW_PROBE_REFILL:
+ if (bbr->round_start) {
+ /* After one full round trip of sending in REFILL, we
+ * start to see bw samples reflecting our REFILL, which
+ * may be putting too much data in flight.
+ */
+ bbr->bw_probe_samples = 1;
+ bbr2_start_bw_probe_up(sk);
+ }
+ break;
+
+ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to
+ * probe for bw. If we have not seen loss/ECN, we try to raise inflight
+ * to at least pacing_gain*BDP; note that this may take more than
+ * min_rtt if min_rtt is small (e.g. on a LAN).
+ *
+ * We terminate PROBE_UP bandwidth probing upon any of the following:
+ *
+ * (1) We've pushed inflight up to hit the inflight_hi target set in the
+ * most recent previous bw probe phase. Thus we want to start
+ * draining the queue immediately because it's very likely the most
+ * recently sent packets will fill the queue and cause drops.
+ * (checked here)
+ * (2) We have probed for at least 1*min_rtt_us, and the
+ * estimated queue is high enough (inflight > 1.25 * estimated_bdp).
+ * (checked here)
+ * (3) Loss filter says loss rate is "too high".
+ * (checked in bbr2_is_inflight_too_high())
+ * (4) ECN filter says ECN mark rate is "too high".
+ * (checked in bbr2_is_inflight_too_high())
+ */
+ case BBR_BW_PROBE_UP:
+ if (bbr->prev_probe_too_high &&
+ inflight >= bbr->inflight_hi) {
+ bbr->stopped_risky_probe = 1;
+ is_risky = true;
+ bbr->debug.event = 'D'; /* D for danger */
+ } else if (bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us) &&
+ inflight >=
+ bbr_inflight(sk, bw,
+ bbr->params.bw_probe_pif_gain)) {
+ is_queuing = true;
+ bbr->debug.event = 'Q'; /* building Queue */
+ }
+ if (is_risky || is_queuing) {
+ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */
+ bbr2_start_bw_probe_down(sk); /* restart w/ down */
+ }
+ break;
+
+ /* After probing in PROBE_UP, we have usually accumulated some data in
+ * the bottleneck buffer (if bw probing didn't find more bw). We next
+ * enter PROBE_DOWN to try to drain any excess data from the queue. To
+ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until
+ * our inflight is less than that target cruising point, which is the
+ * minimum of (a) the amount needed to leave headroom, and (b) the
+ * estimated BDP. Once inflight falls to match the target, we estimate
+ * the queue is drained; persisting would underutilize the pipe.
+ */
+ case BBR_BW_PROBE_DOWN:
+ if (bbr2_check_time_to_probe_bw(sk))
+ return; /* already decided state transition */
+ if (bbr2_check_time_to_cruise(sk, inflight, bw))
+ bbr2_start_bw_probe_cruise(sk);
+ break;
+
+ default:
+ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx);
+ }
+}
+
+/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */
+static void bbr2_exit_probe_rtt(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr2_reset_lower_bounds(sk);
+ if (bbr_full_bw_reached(sk)) {
+ bbr->mode = BBR_PROBE_BW;
+ /* Raising inflight after PROBE_RTT may cause loss, so reset
+ * the PROBE_BW clock and schedule the next bandwidth probe for
+ * a friendly and randomized future point in time.
+ */
+ bbr2_start_bw_probe_down(sk);
+ /* Since we are exiting PROBE_RTT, we know inflight is
+ * below our estimated BDP, so it is reasonable to cruise.
+ */
+ bbr2_start_bw_probe_cruise(sk);
+ } else {
+ bbr->mode = BBR_STARTUP;
+ }
+}
+
+/* Exit STARTUP based on loss rate > loss_thresh and loss gaps in round >= N.
+ * Wait until the end of the round in recovery to get a good estimate of how
+ * many packets have been lost, and how many we need to drain with a low
+ * pacing rate.
+ */
+static void bbr2_check_loss_too_high_in_startup(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (bbr_full_bw_reached(sk))
+ return;
+
+ /* For STARTUP exit, check the loss rate at the end of each round trip
+ * of Recovery episodes in STARTUP. We check the loss rate at the end
+ * of the round trip to filter out noisy/low loss and have a better
+ * sense of inflight (extent of loss), so we can drain more accurately.
+ */
+ if (rs->losses && bbr->loss_events_in_round < 0xf)
+ bbr->loss_events_in_round++; /* update saturating counter */
+ if (bbr->params.full_loss_cnt && bbr->loss_round_start &&
+ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery &&
+ bbr->loss_events_in_round >= bbr->params.full_loss_cnt &&
+ bbr2_is_inflight_too_high(sk, rs)) {
+ bbr->debug.event = 'P'; /* Packet loss caused STARTUP exit */
+ bbr2_handle_queue_too_high_in_startup(sk);
+ return;
+ }
+ if (bbr->loss_round_start)
+ bbr->loss_events_in_round = 0;
+}
+
+/* If we are done draining, advance into steady state operation in PROBE_BW. */
+static void bbr2_check_drain(struct sock *sk, const struct rate_sample *rs,
+ struct bbr_context *ctx)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (bbr_check_drain(sk, rs, ctx)) {
+ bbr->mode = BBR_PROBE_BW;
+ bbr2_start_bw_probe_down(sk);
+ }
+}
+
+static void bbr2_update_model(struct sock *sk, const struct rate_sample *rs,
+ struct bbr_context *ctx)
+{
+ bbr2_update_congestion_signals(sk, rs, ctx);
+ bbr_update_ack_aggregation(sk, rs);
+ bbr2_check_loss_too_high_in_startup(sk, rs);
+ bbr_check_full_bw_reached(sk, rs);
+ bbr2_check_drain(sk, rs, ctx);
+ bbr2_update_cycle_phase(sk, rs);
+ bbr_update_min_rtt(sk, rs);
+}
+
+/* Fast path for app-limited case.
+ *
+ * On each ack, we execute bbr state machine, which primarily consists of:
+ * 1) update model based on new rate sample, and
+ * 2) update control based on updated model or state change.
+ *
+ * There are certain workload/scenarios, e.g. app-limited case, where
+ * either we can skip updating model or we can skip update of both model
+ * as well as control. This provides significant softirq cpu savings for
+ * processing incoming acks.
+ *
+ * In case of app-limited, if there is no congestion (loss/ecn) and
+ * if observed bw sample is less than current estimated bw, then we can
+ * skip some of the computation in bbr state processing:
+ *
+ * - if there is no rtt/mode/phase change: In this case, since all the
+ * parameters of the network model are constant, we can skip model
+ * as well as control update.
+ *
+ * - else we can skip rest of the model update. But we still need to
+ * update the control to account for the new rtt/mode/phase.
+ *
+ * Returns whether we can take fast path or not.
+ */
+static bool bbr2_fast_path(struct sock *sk, bool *update_model,
+ const struct rate_sample *rs, struct bbr_context *ctx)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 prev_min_rtt_us, prev_mode;
+
+ if (bbr->params.fast_path && bbr->try_fast_path &&
+ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) &&
+ !bbr->loss_in_round && !bbr->ecn_in_round) {
+ prev_mode = bbr->mode;
+ prev_min_rtt_us = bbr->min_rtt_us;
+ bbr2_check_drain(sk, rs, ctx);
+ bbr2_update_cycle_phase(sk, rs);
+ bbr_update_min_rtt(sk, rs);
+
+ if (bbr->mode == prev_mode &&
+ bbr->min_rtt_us == prev_min_rtt_us &&
+ bbr->try_fast_path)
+ return true;
+
+ /* Skip model update, but control still needs to be updated */
+ *update_model = false;
+ }
+ return false;
+}
+
+static void bbr2_main(struct sock *sk, const struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ struct bbr_context ctx = { 0 };
+ bool update_model = true;
+ u32 bw;
+
+ bbr->debug.event = '.'; /* init to default NOP (no event yet) */
+
+ bbr_update_round_start(sk, rs, &ctx);
+ if (bbr->round_start) {
+ bbr->rounds_since_probe =
+ min_t(s32, bbr->rounds_since_probe + 1, 0xFF);
+ bbr2_update_ecn_alpha(sk);
+ }
+
+ bbr->ecn_in_round |= rs->is_ece;
+ bbr_calculate_bw_sample(sk, rs, &ctx);
+
+ if (bbr2_fast_path(sk, &update_model, rs, &ctx))
+ goto out;
+
+ if (update_model)
+ bbr2_update_model(sk, rs, &ctx);
+
+ bbr_update_gains(sk);
+ bw = bbr_bw(sk);
+ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
+ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain,
+ tp->snd_cwnd, &ctx);
+ bbr2_bound_cwnd_for_inflight_model(sk);
+
+out:
+ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state;
+ bbr->loss_in_cycle |= rs->lost > 0;
+ bbr->ecn_in_cycle |= rs->delivered_ce > 0;
+
+ bbr_debug(sk, rs->acked_sacked, rs, &ctx);
+}
+
+/* Module parameters that are settable by TCP_CONGESTION_PARAMS are declared
+ * down here, so that the algorithm functions that use the parameters must use
+ * the per-socket parameters; if they accidentally use the global version
+ * then there will be a compile error.
+ * TODO(ncardwell): move all per-socket parameters down to this section.
+ */
+
+/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE.
+ * No loss response when 0. Max allowed value is 255.
+ */
+static u32 bbr_beta = BBR_UNIT * 30 / 100;
+
+/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE.
+ * Max allowed value is 255.
+ */
+static u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; /* 1/16 = 6.25% */
+
+/* The initial value for the ecn_alpha state variable. Default and max
+ * BBR_UNIT (256), representing 1.0. This allows a flow to respond quickly
+ * to congestion if the bottleneck is congested when the flow starts up.
+ */
+static u32 bbr_ecn_alpha_init = BBR_UNIT; /* 1.0, to respond quickly */
+
+/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE.
+ * No ECN based bounding when 0. Max allowed value is 255.
+ */
+static u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */
+
+/* Estimate bw probing has gone too far if CE ratio exceeds this threshold.
+ * Scaled by BBR_SCALE. Disabled when 0. Max allowed is 255.
+ */
+static u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */
+
+/* Max RTT (in usec) at which to use sender-side ECN logic.
+ * Disabled when 0 (ECN allowed at any RTT).
+ * Max allowed for the parameter is 524287 (0x7ffff) us, ~524 ms.
+ */
+static u32 bbr_ecn_max_rtt_us = 5000;
+
+/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN
+ * clears then use a multiplicative increase to quickly reprobe bw by
+ * starting inflight probing at the given multiple of inflight_hi.
+ * Default for this experimental knob is 0 (disabled).
+ * Planned value for experiments: BBR_UNIT * 1 / 2 = 128, representing 0.5.
+ */
+static u32 bbr_ecn_reprobe_gain;
+
+/* Estimate bw probing has gone too far if loss rate exceeds this level. */
+static u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */
+
+/* Exit STARTUP if number of loss marking events in a Recovery round is >= N,
+ * and loss rate is higher than bbr_loss_thresh.
+ * Disabled if 0. Max allowed value is 15 (0xF).
+ */
+static u32 bbr_full_loss_cnt = 8;
+
+/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh
+ * meets this count. Max allowed value is 3.
+ */
+static u32 bbr_full_ecn_cnt = 2;
+
+/* Fraction of unutilized headroom to try to leave in path upon high loss. */
+static u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100;
+
+/* Multiplier to get target inflight (as multiple of BDP) for PROBE_UP phase.
+ * Default is 1.25x, as in BBR v1. Max allowed is 511.
+ */
+static u32 bbr_bw_probe_pif_gain = BBR_UNIT * 5 / 4;
+
+/* Multiplier to get Reno-style probe epoch duration as: k * BDP round trips.
+ * If zero, disables this BBR v2 Reno-style BDP-scaled coexistence mechanism.
+ * Max allowed is 511.
+ */
+static u32 bbr_bw_probe_reno_gain = BBR_UNIT;
+
+/* Max number of packet-timed rounds to wait before probing for bandwidth. If
+ * we want to tolerate 1% random loss per round, and not have this cut our
+ * inflight too much, we must probe for bw periodically on roughly this scale.
+ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance.
+ * We aim to be fair with Reno/CUBIC up to a BDP of at least:
+ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets
+ */
+static u32 bbr_bw_probe_max_rounds = 63;
+
+/* Max amount of randomness to inject in round counting for Reno-coexistence.
+ * Max value is 15.
+ */
+static u32 bbr_bw_probe_rand_rounds = 2;
+
+/* Use BBR-native probe time scale starting at this many usec.
+ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least:
+ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs
+ */
+static u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */
+
+/* Use BBR-native probes spread over this many usec: */
+static u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 sec */
|
|
+
|
|
+/* Undo the model changes made in loss recovery if recovery was spurious? */
|
|
+static bool bbr_undo = true;
|
|
+
|
|
+/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */
|
|
+static bool bbr_fast_path = true; /* default: enabled */
|
|
+
|
|
+/* Use fast ack mode ? */
|
|
+static int bbr_fast_ack_mode = 1; /* default: rwnd check off */
|
|
+
|
|
+/* How much to additively increase inflight_hi when entering REFILL? */
|
|
+static u32 bbr_refill_add_inc; /* default: disabled */
|
|
+
|
|
+module_param_named(beta, bbr_beta, uint, 0644);
|
|
+module_param_named(ecn_alpha_gain, bbr_ecn_alpha_gain, uint, 0644);
|
|
+module_param_named(ecn_alpha_init, bbr_ecn_alpha_init, uint, 0644);
|
|
+module_param_named(ecn_factor, bbr_ecn_factor, uint, 0644);
|
|
+module_param_named(ecn_thresh, bbr_ecn_thresh, uint, 0644);
|
|
+module_param_named(ecn_max_rtt_us, bbr_ecn_max_rtt_us, uint, 0644);
|
|
+module_param_named(ecn_reprobe_gain, bbr_ecn_reprobe_gain, uint, 0644);
|
|
+module_param_named(loss_thresh, bbr_loss_thresh, uint, 0664);
|
|
+module_param_named(full_loss_cnt, bbr_full_loss_cnt, uint, 0664);
|
|
+module_param_named(full_ecn_cnt, bbr_full_ecn_cnt, uint, 0664);
|
|
+module_param_named(inflight_headroom, bbr_inflight_headroom, uint, 0664);
|
|
+module_param_named(bw_probe_pif_gain, bbr_bw_probe_pif_gain, uint, 0664);
|
|
+module_param_named(bw_probe_reno_gain, bbr_bw_probe_reno_gain, uint, 0664);
|
|
+module_param_named(bw_probe_max_rounds, bbr_bw_probe_max_rounds, uint, 0664);
|
|
+module_param_named(bw_probe_rand_rounds, bbr_bw_probe_rand_rounds, uint, 0664);
|
|
+module_param_named(bw_probe_base_us, bbr_bw_probe_base_us, uint, 0664);
|
|
+module_param_named(bw_probe_rand_us, bbr_bw_probe_rand_us, uint, 0664);
|
|
+module_param_named(undo, bbr_undo, bool, 0664);
|
|
+module_param_named(fast_path, bbr_fast_path, bool, 0664);
|
|
+module_param_named(fast_ack_mode, bbr_fast_ack_mode, uint, 0664);
|
|
+module_param_named(refill_add_inc, bbr_refill_add_inc, uint, 0664);
|
|
+
|
|
+static void bbr2_init(struct sock *sk)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct bbr *bbr = inet_csk_ca(sk);
|
|
+
|
|
+ bbr_init(sk); /* run shared init code for v1 and v2 */
|
|
+
|
|
+ /* BBR v2 parameters: */
|
|
+ bbr->params.beta = min_t(u32, 0xFFU, bbr_beta);
|
|
+ bbr->params.ecn_alpha_gain = min_t(u32, 0xFFU, bbr_ecn_alpha_gain);
|
|
+ bbr->params.ecn_alpha_init = min_t(u32, BBR_UNIT, bbr_ecn_alpha_init);
|
|
+ bbr->params.ecn_factor = min_t(u32, 0xFFU, bbr_ecn_factor);
|
|
+ bbr->params.ecn_thresh = min_t(u32, 0xFFU, bbr_ecn_thresh);
|
|
+ bbr->params.ecn_max_rtt_us = min_t(u32, 0x7ffffU, bbr_ecn_max_rtt_us);
|
|
+ bbr->params.ecn_reprobe_gain = min_t(u32, 0x1FF, bbr_ecn_reprobe_gain);
|
|
+ bbr->params.loss_thresh = min_t(u32, 0xFFU, bbr_loss_thresh);
|
|
+ bbr->params.full_loss_cnt = min_t(u32, 0xFU, bbr_full_loss_cnt);
|
|
+ bbr->params.full_ecn_cnt = min_t(u32, 0x3U, bbr_full_ecn_cnt);
|
|
+ bbr->params.inflight_headroom =
|
|
+ min_t(u32, 0xFFU, bbr_inflight_headroom);
|
|
+ bbr->params.bw_probe_pif_gain =
|
|
+ min_t(u32, 0x1FFU, bbr_bw_probe_pif_gain);
|
|
+ bbr->params.bw_probe_reno_gain =
|
|
+ min_t(u32, 0x1FFU, bbr_bw_probe_reno_gain);
|
|
+ bbr->params.bw_probe_max_rounds =
|
|
+ min_t(u32, 0xFFU, bbr_bw_probe_max_rounds);
|
|
+ bbr->params.bw_probe_rand_rounds =
|
|
+ min_t(u32, 0xFU, bbr_bw_probe_rand_rounds);
|
|
+ bbr->params.bw_probe_base_us =
|
|
+ min_t(u32, (1 << 26) - 1, bbr_bw_probe_base_us);
|
|
+ bbr->params.bw_probe_rand_us =
|
|
+ min_t(u32, (1 << 26) - 1, bbr_bw_probe_rand_us);
|
|
+ bbr->params.undo = bbr_undo;
|
|
+ bbr->params.fast_path = bbr_fast_path ? 1 : 0;
|
|
+ bbr->params.refill_add_inc = min_t(u32, 0x3U, bbr_refill_add_inc);
|
|
+
|
|
+ /* BBR v2 state: */
|
|
+ bbr->initialized = 1;
|
|
+ /* Start sampling ECN mark rate after first full flight is ACKed: */
|
|
+ bbr->loss_round_delivered = tp->delivered + 1;
|
|
+ bbr->loss_round_start = 0;
|
|
+ bbr->undo_bw_lo = 0;
|
|
+ bbr->undo_inflight_lo = 0;
|
|
+ bbr->undo_inflight_hi = 0;
|
|
+ bbr->loss_events_in_round = 0;
|
|
+ bbr->startup_ecn_rounds = 0;
|
|
+ bbr2_reset_congestion_signals(sk);
|
|
+ bbr->bw_lo = ~0U;
|
|
+ bbr->bw_hi[0] = 0;
|
|
+ bbr->bw_hi[1] = 0;
|
|
+ bbr->inflight_lo = ~0U;
|
|
+ bbr->inflight_hi = ~0U;
|
|
+ bbr->bw_probe_up_cnt = ~0U;
|
|
+ bbr->bw_probe_up_acks = 0;
|
|
+ bbr->bw_probe_up_rounds = 0;
|
|
+ bbr->probe_wait_us = 0;
|
|
+ bbr->stopped_risky_probe = 0;
|
|
+ bbr->ack_phase = BBR_ACKS_INIT;
|
|
+ bbr->rounds_since_probe = 0;
|
|
+ bbr->bw_probe_samples = 0;
|
|
+ bbr->prev_probe_too_high = 0;
|
|
+ bbr->ecn_eligible = 0;
|
|
+ bbr->ecn_alpha = bbr->params.ecn_alpha_init;
|
|
+ bbr->alpha_last_delivered = 0;
|
|
+ bbr->alpha_last_delivered_ce = 0;
|
|
+
|
|
+ tp->fast_ack_mode = min_t(u32, 0x2U, bbr_fast_ack_mode);
|
|
+
|
|
+ if ((tp->ecn_flags & TCP_ECN_OK) && bbr_ecn_enable)
|
|
+ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT;
|
|
+}
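
Each parameter above is clamped with min_t() to the width of the bbr->params bitfield that stores it, so an oversized value written through /sys/module/tcp_bbr2/parameters/ cannot overflow the field. A minimal standalone sketch of the pattern (field widths illustrative, not the patch's actual struct layout):

    /* A 9-bit field holds at most 0x1FF, so clamp before assigning,
     * mirroring min_t(u32, 0x1FFU, bbr_bw_probe_pif_gain) above. */
    struct params_sketch {
        unsigned int bw_probe_pif_gain:9;
        unsigned int loss_thresh:8;
    };

    static void set_pif_gain(struct params_sketch *p, unsigned int v)
    {
        p->bw_probe_pif_gain = v > 0x1FF ? 0x1FF : v;
    }
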
+
+/* Core TCP stack informs us that the given skb was just marked lost. */
+static void bbr2_skb_marked_lost(struct sock *sk, const struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ struct rate_sample rs;
+
+ /* Capture "current" data over the full round trip of loss,
+ * to have a better chance to see the full capacity of the path.
+ */
+ if (!bbr->loss_in_round) /* first loss in this round trip? */
+ bbr->loss_round_delivered = tp->delivered; /* set round trip */
+ bbr->loss_in_round = 1;
+ bbr->loss_in_cycle = 1;
+
+ if (!bbr->bw_probe_samples)
+ return; /* not an skb sent while probing for bandwidth */
+ if (unlikely(!scb->tx.delivered_mstamp))
+ return; /* skb was SACKed, reneged, marked lost; ignore it */
+ /* We are probing for bandwidth. Construct a rate sample that
+ * estimates what happened in the flight leading up to this lost skb,
+ * then see if the loss rate went too high, and if so at which packet.
+ */
+ memset(&rs, 0, sizeof(rs));
+ rs.tx_in_flight = scb->tx.in_flight;
+ rs.lost = tp->lost - scb->tx.lost;
+ rs.is_app_limited = scb->tx.is_app_limited;
+ if (bbr2_is_inflight_too_high(sk, &rs)) {
+ rs.tx_in_flight = bbr2_inflight_hi_from_lost_skb(sk, &rs, skb);
+ bbr2_handle_inflight_too_high(sk, &rs);
+ }
+}
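
The rate sample built here feeds bbr2_is_inflight_too_high(). As a rough sketch of that check, assuming the BBR fixed-point convention (loss_thresh is a fraction scaled by 1 << BBR_SCALE):

    #include <stdint.h>

    #define BBR_SCALE 8 /* fixed-point scaling used by BBR */

    /* Sketch: probing is "too high" when losses exceed loss_thresh of
     * the packets that were in flight when the lost skb was sent. */
    static int inflight_too_high(uint32_t lost, uint32_t tx_in_flight,
                                 uint32_t loss_thresh)
    {
        return lost > 0 &&
               lost > (uint64_t)tx_in_flight * loss_thresh >> BBR_SCALE;
    }
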
+
+/* Revert short-term model if current loss recovery event was spurious. */
+static u32 bbr2_undo_cwnd(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->debug.undo = 1;
+ bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */
+ bbr->full_bw_cnt = 0;
+ bbr->loss_in_round = 0;
+
+ if (!bbr->params.undo)
+ return tp->snd_cwnd;
+
+ /* Revert to cwnd and other state saved before loss episode. */
+ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo);
+ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo);
+ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi);
+ return bbr->prior_cwnd;
+}
+
+/* Entering loss recovery, so save state for when we undo recovery. */
+static u32 bbr2_ssthresh(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr_save_cwnd(sk);
+ /* For undo, save state that adapts based on loss signal. */
+ bbr->undo_bw_lo = bbr->bw_lo;
+ bbr->undo_inflight_lo = bbr->inflight_lo;
+ bbr->undo_inflight_hi = bbr->inflight_hi;
+ return tcp_sk(sk)->snd_ssthresh;
+}
+
+static enum tcp_bbr2_phase bbr2_get_phase(struct bbr *bbr)
+{
+ switch (bbr->mode) {
+ case BBR_STARTUP:
+ return BBR2_PHASE_STARTUP;
+ case BBR_DRAIN:
+ return BBR2_PHASE_DRAIN;
+ case BBR_PROBE_BW:
+ break;
+ case BBR_PROBE_RTT:
+ return BBR2_PHASE_PROBE_RTT;
+ default:
+ return BBR2_PHASE_INVALID;
+ }
+ switch (bbr->cycle_idx) {
+ case BBR_BW_PROBE_UP:
+ return BBR2_PHASE_PROBE_BW_UP;
+ case BBR_BW_PROBE_DOWN:
+ return BBR2_PHASE_PROBE_BW_DOWN;
+ case BBR_BW_PROBE_CRUISE:
+ return BBR2_PHASE_PROBE_BW_CRUISE;
+ case BBR_BW_PROBE_REFILL:
+ return BBR2_PHASE_PROBE_BW_REFILL;
+ default:
+ return BBR2_PHASE_INVALID;
+ }
+}
+
+static size_t bbr2_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ if (ext & (1 << (INET_DIAG_BBRINFO - 1)) ||
+ ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk));
+ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk));
+ u64 bw_lo = bbr->bw_lo == ~0U ?
+ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo);
+
+ memset(&info->bbr2, 0, sizeof(info->bbr2));
+ info->bbr2.bbr_bw_lsb = (u32)bw;
+ info->bbr2.bbr_bw_msb = (u32)(bw >> 32);
+ info->bbr2.bbr_min_rtt = bbr->min_rtt_us;
+ info->bbr2.bbr_pacing_gain = bbr->pacing_gain;
+ info->bbr2.bbr_cwnd_gain = bbr->cwnd_gain;
+ info->bbr2.bbr_bw_hi_lsb = (u32)bw_hi;
+ info->bbr2.bbr_bw_hi_msb = (u32)(bw_hi >> 32);
+ info->bbr2.bbr_bw_lo_lsb = (u32)bw_lo;
+ info->bbr2.bbr_bw_lo_msb = (u32)(bw_lo >> 32);
+ info->bbr2.bbr_mode = bbr->mode;
+ info->bbr2.bbr_phase = (__u8)bbr2_get_phase(bbr);
+ info->bbr2.bbr_version = (__u8)2;
+ info->bbr2.bbr_inflight_lo = bbr->inflight_lo;
+ info->bbr2.bbr_inflight_hi = bbr->inflight_hi;
+ info->bbr2.bbr_extra_acked = bbr_extra_acked(sk);
+ *attr = INET_DIAG_BBRINFO;
+ return sizeof(info->bbr2);
+ }
+ return 0;
+}
+
+static void bbr2_set_state(struct sock *sk, u8 new_state)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (new_state == TCP_CA_Loss) {
+ struct rate_sample rs = { .losses = 1 };
+ struct bbr_context ctx = { 0 };
+
+ bbr->prev_ca_state = TCP_CA_Loss;
+ bbr->full_bw = 0;
+ if (!bbr2_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) {
+ /* bbr_adapt_lower_bounds() needs cwnd before
+ * we suffered an RTO, to update inflight_lo:
+ */
+ bbr->inflight_lo =
+ max(tp->snd_cwnd, bbr->prior_cwnd);
+ }
+ bbr_debug(sk, 0, &rs, &ctx);
+ } else if (bbr->prev_ca_state == TCP_CA_Loss &&
+ new_state != TCP_CA_Loss) {
+ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd);
+ bbr->try_fast_path = 0; /* bound cwnd using latest model */
+ }
+}
+
+static struct tcp_congestion_ops tcp_bbr2_cong_ops __read_mostly = {
+ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS,
+ .name = "bbr2",
+ .owner = THIS_MODULE,
+ .init = bbr2_init,
+ .cong_control = bbr2_main,
+ .sndbuf_expand = bbr_sndbuf_expand,
+ .skb_marked_lost = bbr2_skb_marked_lost,
+ .undo_cwnd = bbr2_undo_cwnd,
+ .cwnd_event = bbr_cwnd_event,
+ .ssthresh = bbr2_ssthresh,
+ .tso_segs = bbr_tso_segs,
+ .get_info = bbr2_get_info,
+ .set_state = bbr2_set_state,
+};
+
+static int __init bbr_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_bbr2_cong_ops);
+}
+
+static void __exit bbr_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_bbr2_cong_ops);
+}
+
+module_init(bbr_register);
+module_exit(bbr_unregister);
+
+MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
+MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
+MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
+MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
+MODULE_AUTHOR("Priyaranjan Jha <priyarjha@google.com>");
+MODULE_AUTHOR("Yousuk Seung <ysseung@google.com>");
+MODULE_AUTHOR("Kevin Yang <yyd@google.com>");
+MODULE_AUTHOR("Arjun Roy <arjunroy@google.com>");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");
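
Once this module is loaded and registered under the name "bbr2", a userspace application can opt an individual socket into it with the standard TCP_CONGESTION socket option; a minimal example, assuming a kernel built with this patch:

    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        const char cc[] = "bbr2";

        /* Select the congestion control registered by this module. */
        if (fd < 0 ||
            setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, cc, strlen(cc)))
            perror("bbr2 not available");
        else
            puts("socket is using bbr2");
        return 0;
    }
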
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index db8b4b488c31..0d6d1a949e11 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -189,6 +189,7 @@ void tcp_init_congestion_control(struct sock *sk)
 struct inet_connection_sock *icsk = inet_csk(sk);

 tcp_sk(sk)->prior_ssthresh = 0;
+ tcp_sk(sk)->fast_ack_mode = 0;
 if (icsk->icsk_ca_ops->init)
 icsk->icsk_ca_ops->init(sk);
 if (tcp_ca_needs_ecn(sk))
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cc072d2cfcd8..754e0212c951 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -349,7 +349,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
 tcp_enter_quickack_mode(sk, 2);
 break;
 case INET_ECN_CE:
- if (tcp_ca_needs_ecn(sk))
+ if (tcp_ca_wants_ce_events(sk))
 tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);

 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
@@ -360,7 +360,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
 tp->ecn_flags |= TCP_ECN_SEEN;
 break;
 default:
- if (tcp_ca_needs_ecn(sk))
+ if (tcp_ca_wants_ce_events(sk))
 tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
 tp->ecn_flags |= TCP_ECN_SEEN;
 break;
@@ -1079,7 +1079,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
 */
static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
{
+ struct sock *sk = (struct sock *)tp;
+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+
 tp->lost += tcp_skb_pcount(skb);
+ if (ca_ops->skb_marked_lost)
+ ca_ops->skb_marked_lost(sk, skb);
}

void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
@@ -1460,6 +1465,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
 WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
 tcp_skb_pcount_add(skb, -pcount);

+ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */
+ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount,
+ "prev in_flight: %u skb in_flight: %u pcount: %u",
+ TCP_SKB_CB(prev)->tx.in_flight,
+ TCP_SKB_CB(skb)->tx.in_flight,
+ pcount))
+ TCP_SKB_CB(skb)->tx.in_flight = 0;
+ else
+ TCP_SKB_CB(skb)->tx.in_flight -= pcount;
+ TCP_SKB_CB(prev)->tx.in_flight += pcount;
+
 /* When we're adding to gso_segs == 1, gso_size will be zero,
 * in theory this shouldn't be necessary but as long as DSACK
 * code can come after this skb later on it's better to keep
@@ -3813,6 +3829,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)

 prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
 rs.prior_in_flight = tcp_packets_in_flight(tp);
+ tcp_rate_check_app_limited(sk);

 /* ts_recent update must be made after we are sure that the packet
 * is in window.
@@ -3911,6 +3928,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 delivered = tcp_newly_delivered(sk, delivered, flag);
 lost = tp->lost - lost; /* freshly marked lost */
 rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
+ rs.is_ece = !!(flag & FLAG_ECE);
 tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
 tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
 tcp_xmit_recovery(sk, rexmit);
@@ -5521,13 +5539,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)

 /* More than one full frame received... */
 if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
+ (tp->fast_ack_mode == 1 ||
 /* ... and right edge of window advances far enough.
 * (tcp_recvmsg() will send ACK otherwise).
 * If application uses SO_RCVLOWAT, we want send ack now if
 * we have not received enough bytes to satisfy the condition.
 */
- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
- __tcp_select_window(sk) >= tp->rcv_wnd)) ||
+ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
+ __tcp_select_window(sk) >= tp->rcv_wnd))) ||
 /* We ACK each frame or... */
 tcp_in_quickack_mode(sk) ||
 /* Protocol state mandates a one-time immediate ACK */
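
The reshaped condition in that last hunk is easier to follow as a predicate: when fast_ack_mode == 1 the receive-window and SO_RCVLOWAT test is bypassed, so every received full frame triggers an ACK. A plain restatement (names invented for illustration):

    /* Sketch of the modified __tcp_ack_snd_check() decision. */
    static int should_send_ack(int got_full_frame, int fast_ack_mode,
                               int rwnd_or_rcvlowat_ok, int in_quickack,
                               int protocol_needs_ack)
    {
        return (got_full_frame &&
                (fast_ack_mode == 1 || rwnd_or_rcvlowat_ok)) ||
               in_quickack || protocol_needs_ack;
    }
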
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ba839e441450..5ffec885e66f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -375,7 +375,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
 th->cwr = 1;
 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
 }
- } else if (!tcp_ca_needs_ecn(sk)) {
+ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) &&
+ !tcp_ca_needs_ecn(sk)) {
 /* ACK or retransmitted segment: clear ECT|CE */
 INET_ECN_dontxmit(sk);
 }
@@ -1530,7 +1531,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
{
 struct tcp_sock *tp = tcp_sk(sk);
 struct sk_buff *buff;
- int nsize, old_factor;
+ int nsize, old_factor, inflight_prev;
 long limit;
 int nlen;
 u8 flags;
@@ -1607,6 +1608,15 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,

 if (diff)
 tcp_adjust_pcount(sk, skb, diff);
+
+ /* Set buff tx.in_flight as if buff were sent by itself. */
+ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor;
+ if (WARN_ONCE(inflight_prev < 0,
+ "inconsistent: tx.in_flight: %u old_factor: %d",
+ TCP_SKB_CB(skb)->tx.in_flight, old_factor))
+ inflight_prev = 0;
+ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev +
+ tcp_skb_pcount(buff);
 }

 /* Link BUFF into the send queue. */
@@ -1990,13 +2000,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
{
 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
- u32 min_tso, tso_segs;
-
- min_tso = ca_ops->min_tso_segs ?
- ca_ops->min_tso_segs(sk) :
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
+ u32 tso_segs;

- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
+ tso_segs = ca_ops->tso_segs ?
+ ca_ops->tso_segs(sk, mss_now) :
+ tcp_tso_autosize(sk, mss_now,
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs));
 return min_t(u32, tso_segs, sk->sk_gso_max_segs);
}

@@ -2632,6 +2641,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
 list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
 tcp_init_tso_segs(skb, mss_now);
+ tcp_set_tx_in_flight(sk, skb);
 goto repair; /* Skip network transmission */
 }
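
The tcp_tso_segs() rewrite above retires the old min_tso_segs() hook: a congestion module that provides the new tso_segs() callback now computes the whole TSO burst size itself, and the generic tcp_tso_autosize() path runs only as the fallback. A compact restatement of the dispatch (types reduced for illustration):

    /* segs = the CA's answer if the hook exists, else the autosized
     * value, capped at gso_max_segs, as in the hunk above. */
    typedef unsigned int u32;

    static u32 pick_tso_segs(u32 (*ca_tso_segs)(void *sk, unsigned int mss),
                             void *sk, unsigned int mss_now,
                             u32 autosized, u32 gso_max_segs)
    {
        u32 segs = ca_tso_segs ? ca_tso_segs(sk, mss_now) : autosized;

        return segs < gso_max_segs ? segs : gso_max_segs;
    }
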

diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index a8f6d9d06f2e..a8b4c9504570 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -34,6 +34,24 @@
 * ready to send in the write queue.
 */

+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 in_flight;
+
+ /* Check, sanitize, and record packets in flight after skb was sent. */
+ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb);
+ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX,
+ "insane in_flight %u cc %s mss %u "
+ "cwnd %u pif %u %u %u %u\n",
+ in_flight, inet_csk(sk)->icsk_ca_ops->name,
+ tp->mss_cache, tp->snd_cwnd,
+ tp->packets_out, tp->retrans_out,
+ tp->sacked_out, tp->lost_out))
+ in_flight = TCPCB_IN_FLIGHT_MAX;
+ TCP_SKB_CB(skb)->tx.in_flight = in_flight;
+}
+
 /* Snapshot the current delivery information in the skb, to generate
 * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
 */
@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
 TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
 TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
 TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce;
+ TCP_SKB_CB(skb)->tx.lost = tp->lost;
 TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
+ tcp_set_tx_in_flight(sk, skb);
}

/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
 if (!rs->prior_delivered ||
 tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
 scb->end_seq, rs->last_end_seq)) {
+ rs->prior_lost = scb->tx.lost;
 rs->prior_delivered_ce = scb->tx.delivered_ce;
 rs->prior_delivered = scb->tx.delivered;
 rs->prior_mstamp = scb->tx.delivered_mstamp;
 rs->is_app_limited = scb->tx.is_app_limited;
 rs->is_retrans = scb->sacked & TCPCB_RETRANS;
 rs->last_end_seq = scb->end_seq;
+ rs->tx_in_flight = scb->tx.in_flight;

 /* Record send time of most recently ACKed packet: */
 tp->first_tx_mstamp = tx_tstamp;
 /* Find the duration of the "send phase" of this window: */
- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
- scb->tx.first_tx_mstamp);
+ rs->interval_us = tcp_stamp32_us_delta(
+ tp->first_tx_mstamp,
+ scb->tx.first_tx_mstamp);

 }
 /* Mark off the skb delivered once it's sacked to avoid being
@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
 return;
 }
 rs->delivered = tp->delivered - rs->prior_delivered;
+ rs->lost = tp->lost - rs->prior_lost;

 rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
 /* delivered_ce occupies less than 32 bits in the skb control block */
@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
 * longer phase.
 */
 snd_us = rs->interval_us; /* send phase */
- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
+ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp,
 rs->prior_mstamp); /* ack phase */
 rs->interval_us = max(snd_us, ack_us);
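
For reference, the pipe estimate that tcp_set_tx_in_flight() snapshots comes from tcp_packets_in_flight() in include/net/tcp.h, which computes packets_out - (sacked_out + lost_out) + retrans_out; the per-skb value is that estimate plus the skb's own segment count. A standalone restatement:

    #include <stdint.h>

    /* pipe = packets_out - (sacked_out + lost_out) + retrans_out */
    static uint32_t tx_in_flight(uint32_t packets_out, uint32_t sacked_out,
                                 uint32_t lost_out, uint32_t retrans_out,
                                 uint32_t skb_pcount)
    {
        return packets_out - (sacked_out + lost_out) + retrans_out
               + skb_pcount;
    }
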

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index cb79127f45c3..70e4de876a7f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -605,6 +605,7 @@ void tcp_write_timer_handler(struct sock *sk)
 return;
 }

+ tcp_rate_check_app_limited(sk);
 tcp_mstamp_refresh(tcp_sk(sk));
 event = icsk->icsk_pending;

--
2.40.1

From 0927bc0b168ee599f356a757df60102be68472dc Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Mon, 17 Apr 2023 18:21:50 +0200
Subject: [PATCH 02/10] bfq

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 block/bfq-iosched.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index d9ed3108c17a..66146bbcd4af 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -649,6 +649,8 @@ static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
 sched_data->service_tree[i].wsum;
 }
 }
+ if (!wsum)
+ continue;
 limit = DIV_ROUND_CLOSEST(limit * entity->weight, wsum);
 if (entity->allocated >= limit) {
 bfq_log_bfqq(bfqq->bfqd, bfqq,
@@ -7617,6 +7619,7 @@ MODULE_ALIAS("bfq-iosched");
static int __init bfq_init(void)
{
 int ret;
+ char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.3";

#ifdef CONFIG_BFQ_GROUP_IOSCHED
 ret = blkcg_policy_register(&blkcg_policy_bfq);
@@ -7648,6 +7651,11 @@ static int __init bfq_init(void)
 if (ret)
 goto slab_kill;

+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ strcat(msg, " (with cgroups support)");
+#endif
+ pr_info("%s", msg);
+
 return 0;

slab_kill:
--
2.40.1
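
The first bfq hunk guards the DIV_ROUND_CLOSEST() call: wsum accumulates the active queue weights at one level of the hierarchy and is then used as a divisor, so a zero sum would be a division by zero. A sketch of the guarded computation:

    /* DIV_ROUND_CLOSEST(a, b) is (a + b / 2) / b for positive values. */
    static int scaled_limit(int limit, int weight, int wsum)
    {
        if (!wsum)      /* no active queues at this level: skip it */
            return -1;
        return (limit * weight + wsum / 2) / wsum;
    }
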

From 978269efc945dfd3e330da87db88188fab9b92c1 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Fri, 28 Apr 2023 19:58:48 +0200
Subject: [PATCH 03/10] cachy

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 .gitignore | 1 +
 .../admin-guide/kernel-parameters.txt | 12 +
 Documentation/dontdiff | 1 +
 Makefile | 8 +-
 arch/arc/configs/axs101_defconfig | 1 +
 arch/arc/configs/axs103_defconfig | 1 +
 arch/arc/configs/axs103_smp_defconfig | 1 +
 arch/arc/configs/haps_hs_defconfig | 1 +
 arch/arc/configs/haps_hs_smp_defconfig | 1 +
 arch/arc/configs/hsdk_defconfig | 1 +
 arch/arc/configs/nsim_700_defconfig | 1 +
 arch/arc/configs/nsimosci_defconfig | 1 +
 arch/arc/configs/nsimosci_hs_defconfig | 1 +
 arch/arc/configs/nsimosci_hs_smp_defconfig | 1 +
 arch/arc/configs/tb10x_defconfig | 1 +
 arch/arc/configs/vdk_hs38_defconfig | 1 +
 arch/arc/configs/vdk_hs38_smp_defconfig | 1 +
 arch/x86/Kconfig.cpu | 416 ++-
 arch/x86/Makefile | 45 +-
 arch/x86/Makefile.postlink | 41 +
 arch/x86/boot/compressed/.gitignore | 1 -
 arch/x86/boot/compressed/Makefile | 10 +-
 arch/x86/include/asm/pci.h | 6 +
 arch/x86/include/asm/vermagic.h | 72 +
 arch/x86/pci/common.c | 7 +-
 drivers/Makefile | 15 +-
 drivers/ata/ahci.c | 23 +-
 drivers/cpufreq/Kconfig.x86 | 2 -
 drivers/cpufreq/intel_pstate.c | 2 +
 drivers/i2c/busses/Kconfig | 9 +
 drivers/i2c/busses/Makefile | 1 +
 drivers/i2c/busses/i2c-nct6775.c | 647 ++++
 drivers/i2c/busses/i2c-piix4.c | 4 +-
 drivers/md/dm-crypt.c | 5 +
 drivers/pci/controller/Makefile | 6 +
 drivers/pci/controller/intel-nvme-remap.c | 462 +++
 drivers/pci/quirks.c | 101 +
 drivers/platform/x86/Kconfig | 24 +
 drivers/platform/x86/Makefile | 4 +
 drivers/platform/x86/legion-laptop.c | 2783 +++++++++++++++++
 drivers/platform/x86/steamdeck.c | 523 ++++
 include/linux/pagemap.h | 2 +-
 include/linux/user_namespace.h | 4 +
 include/net/netns/ipv4.h | 1 +
 include/trace/events/tcp.h | 7 +
 init/Kconfig | 39 +
 kernel/Kconfig.hz | 24 +
 kernel/fork.c | 14 +
 kernel/module/Kconfig | 25 +
 kernel/sched/fair.c | 20 +-
 kernel/sysctl.c | 12 +
 kernel/user_namespace.c | 7 +
 mm/Kconfig | 2 +-
 mm/compaction.c | 4 +
 mm/page-writeback.c | 8 +
 mm/swap.c | 5 +
 mm/vmpressure.c | 4 +
 mm/vmscan.c | 8 +
 net/ipv4/sysctl_net_ipv4.c | 7 +
 net/ipv4/tcp_input.c | 36 +
 net/ipv4/tcp_ipv4.c | 2 +
 scripts/Makefile.lib | 13 +-
 scripts/Makefile.modinst | 7 +-
 63 files changed, 5431 insertions(+), 64 deletions(-)
 create mode 100644 arch/x86/Makefile.postlink
 create mode 100644 drivers/i2c/busses/i2c-nct6775.c
 create mode 100644 drivers/pci/controller/intel-nvme-remap.c
 create mode 100644 drivers/platform/x86/legion-laptop.c
 create mode 100644 drivers/platform/x86/steamdeck.c

diff --git a/.gitignore b/.gitignore
index 70ec6037fa7a..9bafd3c6bb5f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,6 +65,7 @@ modules.order
/vmlinux
/vmlinux.32
/vmlinux.map
+/vmlinux.relocs
/vmlinux.symvers
/vmlinux-gdb.py
/vmlinuz
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 7016cb12dc4e..97303fa40350 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2084,6 +2084,9 @@
 disable
 Do not enable intel_pstate as the default
 scaling driver for the supported processors
+ enable
+ Enable intel_pstate in case "disable" was passed
+ previously in the kernel boot parameters
 passive
 Use intel_pstate as a scaling driver, but configure it
 to work with generic cpufreq governors (instead of
@@ -4190,6 +4193,15 @@
 nomsi [MSI] If the PCI_MSI kernel config parameter is
 enabled, this kernel boot option can be used to
 disable the use of MSI interrupts system-wide.
+ pcie_acs_override =
+ [PCIE] Override missing PCIe ACS support for:
+ downstream
+ All downstream ports - full ACS capabilities
+ multifunction
+ All multifunction devices - multifunction ACS subset
+ id:nnnn:nnnn
+ Specific device - full ACS capabilities
+ Specified as vid:did (vendor/device ID) in hex
 noioapicquirk [APIC] Disable all boot interrupt quirks.
 Safety option to keep boot IRQs enabled. This
 should never be necessary.
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 3c399f132e2d..a62ad01e6d11 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -254,6 +254,7 @@ vmlinux.aout
vmlinux.bin.all
vmlinux.lds
vmlinux.map
+vmlinux.relocs
vmlinux.symvers
vmlinuz
voffset.h
diff --git a/Makefile b/Makefile
index f5543eef4f82..44c927047211 100644
--- a/Makefile
+++ b/Makefile
@@ -818,6 +818,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
KBUILD_CFLAGS += -O2
KBUILD_RUSTFLAGS += -Copt-level=2
+else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3
+KBUILD_CFLAGS += -O3
+KBUILD_RUSTFLAGS += -Copt-level=3
else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
KBUILD_CFLAGS += -Os
KBUILD_RUSTFLAGS += -Copt-level=s
@@ -1060,11 +1063,6 @@ KBUILD_CFLAGS += -fno-strict-overflow
# Make sure -fstack-check isn't enabled (like gentoo apparently did)
KBUILD_CFLAGS += -fno-stack-check

-# conserve stack if available
-ifdef CONFIG_CC_IS_GCC
-KBUILD_CFLAGS += -fconserve-stack
-endif
-
# Prohibit date/time macros, which would make the build non-deterministic
KBUILD_CFLAGS += -Werror=date-time

diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig
index 81764160451f..2c15d3bf747a 100644
--- a/arch/arc/configs/axs101_defconfig
+++ b/arch/arc/configs/axs101_defconfig
@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y
# CONFIG_UTS_NS is not set
# CONFIG_PID_NS is not set
CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
CONFIG_EMBEDDED=y
CONFIG_PERF_EVENTS=y
# CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig
index d5181275490e..7d868e148d9a 100644
--- a/arch/arc/configs/axs103_defconfig
+++ b/arch/arc/configs/axs103_defconfig
@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y
# CONFIG_UTS_NS is not set
# CONFIG_PID_NS is not set
CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
CONFIG_EMBEDDED=y
CONFIG_PERF_EVENTS=y
# CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig
index 2f336d99a8cf..777a9f21eb6b 100644
--- a/arch/arc/configs/axs103_smp_defconfig
+++ b/arch/arc/configs/axs103_smp_defconfig
@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y
# CONFIG_UTS_NS is not set
# CONFIG_PID_NS is not set
CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
CONFIG_EMBEDDED=y
CONFIG_PERF_EVENTS=y
# CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig
index 899b2fd5c71d..bda15a876849 100644
--- a/arch/arc/configs/haps_hs_defconfig
+++ b/arch/arc/configs/haps_hs_defconfig
@@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y
# CONFIG_UTS_NS is not set
# CONFIG_PID_NS is not set
CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
CONFIG_EXPERT=y
CONFIG_PERF_EVENTS=y
# CONFIG_COMPAT_BRK is not set
diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig
index 0d32aac8069f..dbd74fea69aa 100644
--- a/arch/arc/configs/haps_hs_smp_defconfig
+++ b/arch/arc/configs/haps_hs_smp_defconfig
@@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y
# CONFIG_UTS_NS is not set
# CONFIG_PID_NS is not set
CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
CONFIG_EMBEDDED=y
CONFIG_PERF_EVENTS=y
# CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig
index d18378d2c2a6..2396ca417182 100644
--- a/arch/arc/configs/hsdk_defconfig
+++ b/arch/arc/configs/hsdk_defconfig
@@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y
# CONFIG_PID_NS is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_BLK_DEV_RAM=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
CONFIG_EMBEDDED=y
CONFIG_PERF_EVENTS=y
# CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig
index 3e9829775992..5044609540cc 100644
--- a/arch/arc/configs/nsim_700_defconfig
+++ b/arch/arc/configs/nsim_700_defconfig
@@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y
# CONFIG_UTS_NS is not set
# CONFIG_PID_NS is not set
CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
CONFIG_KALLSYMS_ALL=y
CONFIG_EMBEDDED=y
CONFIG_PERF_EVENTS=y
diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig
index 502c87f351c8..748c809d1c4c 100644
--- a/arch/arc/configs/nsimosci_defconfig
+++ b/arch/arc/configs/nsimosci_defconfig
@@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y
# CONFIG_UTS_NS is not set
# CONFIG_PID_NS is not set
CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
CONFIG_KALLSYMS_ALL=y
CONFIG_EMBEDDED=y
CONFIG_PERF_EVENTS=y
diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig
index f721cc3997d0..205c32b0074c 100644
--- a/arch/arc/configs/nsimosci_hs_defconfig
+++ b/arch/arc/configs/nsimosci_hs_defconfig
@@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y
# CONFIG_UTS_NS is not set
# CONFIG_PID_NS is not set
CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
CONFIG_KALLSYMS_ALL=y
CONFIG_EMBEDDED=y
CONFIG_PERF_EVENTS=y
diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig
index 1419fc946a08..2477b7c80977 100644
--- a/arch/arc/configs/nsimosci_hs_smp_defconfig
+++ b/arch/arc/configs/nsimosci_hs_smp_defconfig
@@ -8,6 +8,7 @@ CONFIG_IKCONFIG_PROC=y
# CONFIG_UTS_NS is not set
# CONFIG_PID_NS is not set
CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
CONFIG_PERF_EVENTS=y
# CONFIG_COMPAT_BRK is not set
CONFIG_KPROBES=y
diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig
index 6f0d2be9d926..cf02ad0fc210 100644
--- a/arch/arc/configs/tb10x_defconfig
+++ b/arch/arc/configs/tb10x_defconfig
@@ -14,6 +14,7 @@ CONFIG_INITRAMFS_SOURCE="../tb10x-rootfs.cpio"
CONFIG_INITRAMFS_ROOT_UID=2100
CONFIG_INITRAMFS_ROOT_GID=501
# CONFIG_RD_GZIP is not set
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
CONFIG_KALLSYMS_ALL=y
# CONFIG_AIO is not set
CONFIG_EMBEDDED=y
diff --git a/arch/arc/configs/vdk_hs38_defconfig b/arch/arc/configs/vdk_hs38_defconfig
index d3ef189c75f8..922b1b24f518 100644
--- a/arch/arc/configs/vdk_hs38_defconfig
+++ b/arch/arc/configs/vdk_hs38_defconfig
@@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
CONFIG_EMBEDDED=y
CONFIG_PERF_EVENTS=y
# CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arc/configs/vdk_hs38_smp_defconfig b/arch/arc/configs/vdk_hs38_smp_defconfig
index 944b347025fd..ed64319f7eb2 100644
--- a/arch/arc/configs/vdk_hs38_smp_defconfig
+++ b/arch/arc/configs/vdk_hs38_smp_defconfig
@@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y
CONFIG_EMBEDDED=y
CONFIG_PERF_EVENTS=y
# CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 542377cd419d..08d887d1220d 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -157,7 +157,7 @@ config MPENTIUM4


config MK6
- bool "K6/K6-II/K6-III"
+ bool "AMD K6/K6-II/K6-III"
 depends on X86_32
 help
 Select this for an AMD K6-family processor. Enables use of
@@ -165,7 +165,7 @@ config MK6
 flags to GCC.

config MK7
- bool "Athlon/Duron/K7"
+ bool "AMD Athlon/Duron/K7"
 depends on X86_32
 help
 Select this for an AMD Athlon K7-family processor. Enables use of
@@ -173,12 +173,106 @@ config MK7
 flags to GCC.

config MK8
- bool "Opteron/Athlon64/Hammer/K8"
+ bool "AMD Opteron/Athlon64/Hammer/K8"
 help
 Select this for an AMD Opteron or Athlon64 Hammer-family processor.
 Enables use of some extended instructions, and passes appropriate
 optimization flags to GCC.

+config MK8SSE3
+ bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3"
+ help
+ Select this for improved AMD Opteron or Athlon64 Hammer-family processors.
+ Enables use of some extended instructions, and passes appropriate
+ optimization flags to GCC.
+
+config MK10
+ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10"
+ help
+ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50,
+ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor.
+ Enables use of some extended instructions, and passes appropriate
+ optimization flags to GCC.
+
+config MBARCELONA
+ bool "AMD Barcelona"
+ help
+ Select this for AMD Family 10h Barcelona processors.
+
+ Enables -march=barcelona
+
+config MBOBCAT
+ bool "AMD Bobcat"
+ help
+ Select this for AMD Family 14h Bobcat processors.
+
+ Enables -march=btver1
+
+config MJAGUAR
+ bool "AMD Jaguar"
+ help
+ Select this for AMD Family 16h Jaguar processors.
+
+ Enables -march=btver2
+
+config MBULLDOZER
+ bool "AMD Bulldozer"
+ help
+ Select this for AMD Family 15h Bulldozer processors.
+
+ Enables -march=bdver1
+
+config MPILEDRIVER
+ bool "AMD Piledriver"
+ help
+ Select this for AMD Family 15h Piledriver processors.
+
+ Enables -march=bdver2
+
+config MSTEAMROLLER
+ bool "AMD Steamroller"
+ help
+ Select this for AMD Family 15h Steamroller processors.
+
+ Enables -march=bdver3
+
+config MEXCAVATOR
+ bool "AMD Excavator"
+ help
+ Select this for AMD Family 15h Excavator processors.
+
+ Enables -march=bdver4
+
+config MZEN
+ bool "AMD Zen"
+ help
+ Select this for AMD Family 17h Zen processors.
+
+ Enables -march=znver1
+
+config MZEN2
+ bool "AMD Zen 2"
+ help
+ Select this for AMD Family 17h Zen 2 processors.
+
+ Enables -march=znver2
+
+config MZEN3
+ bool "AMD Zen 3"
+ depends on (CC_IS_GCC && GCC_VERSION >= 100300) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+ help
+ Select this for AMD Family 19h Zen 3 processors.
+
+ Enables -march=znver3
+
+config MZEN4
+ bool "AMD Zen 4"
+ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+ help
+ Select this for AMD Family 19h Zen 4 processors.
+
+ Enables -march=znver4
+
config MCRUSOE
 bool "Crusoe"
 depends on X86_32
@@ -270,7 +364,7 @@ config MPSC
 in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.

config MCORE2
- bool "Core 2/newer Xeon"
+ bool "Intel Core 2"
 help

 Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and
@@ -278,6 +372,8 @@ config MCORE2
 family in /proc/cpuinfo. Newer ones have 6 and older ones 15
 (not a typo)

+ Enables -march=core2
+
config MATOM
 bool "Intel Atom"
 help
@@ -287,6 +383,202 @@ config MATOM
 accordingly optimized code. Use a recent GCC with specific Atom
 support in order to fully benefit from selecting this option.

+config MNEHALEM
+ bool "Intel Nehalem"
+ select X86_P6_NOP
+ help
+
+ Select this for 1st Gen Core processors in the Nehalem family.
+
+ Enables -march=nehalem
+
+config MWESTMERE
+ bool "Intel Westmere"
+ select X86_P6_NOP
+ help
+
+ Select this for the Intel Westmere formerly Nehalem-C family.
+
+ Enables -march=westmere
+
+config MSILVERMONT
+ bool "Intel Silvermont"
+ select X86_P6_NOP
+ help
+
+ Select this for the Intel Silvermont platform.
+
+ Enables -march=silvermont
+
+config MGOLDMONT
+ bool "Intel Goldmont"
+ select X86_P6_NOP
+ help
+
+ Select this for the Intel Goldmont platform including Apollo Lake and Denverton.
+
+ Enables -march=goldmont
+
+config MGOLDMONTPLUS
+ bool "Intel Goldmont Plus"
+ select X86_P6_NOP
+ help
+
+ Select this for the Intel Goldmont Plus platform including Gemini Lake.
+
+ Enables -march=goldmont-plus
+
+config MSANDYBRIDGE
+ bool "Intel Sandy Bridge"
+ select X86_P6_NOP
+ help
+
+ Select this for 2nd Gen Core processors in the Sandy Bridge family.
+
+ Enables -march=sandybridge
+
+config MIVYBRIDGE
+ bool "Intel Ivy Bridge"
+ select X86_P6_NOP
+ help
+
+ Select this for 3rd Gen Core processors in the Ivy Bridge family.
+
+ Enables -march=ivybridge
+
+config MHASWELL
+ bool "Intel Haswell"
+ select X86_P6_NOP
+ help
+
+ Select this for 4th Gen Core processors in the Haswell family.
+
+ Enables -march=haswell
+
+config MBROADWELL
+ bool "Intel Broadwell"
+ select X86_P6_NOP
+ help
+
+ Select this for 5th Gen Core processors in the Broadwell family.
+
+ Enables -march=broadwell
+
+config MSKYLAKE
+ bool "Intel Skylake"
+ select X86_P6_NOP
+ help
+
+ Select this for 6th Gen Core processors in the Skylake family.
+
+ Enables -march=skylake
+
+config MSKYLAKEX
+ bool "Intel Skylake X"
+ select X86_P6_NOP
+ help
+
+ Select this for 6th Gen Core processors in the Skylake X family.
+
+ Enables -march=skylake-avx512
+
+config MCANNONLAKE
+ bool "Intel Cannon Lake"
+ select X86_P6_NOP
+ help
+
+ Select this for 8th Gen Core processors
+
+ Enables -march=cannonlake
+
+config MICELAKE
+ bool "Intel Ice Lake"
+ select X86_P6_NOP
+ help
+
+ Select this for 10th Gen Core processors in the Ice Lake family.
+
+ Enables -march=icelake-client
+
+config MCASCADELAKE
+ bool "Intel Cascade Lake"
+ select X86_P6_NOP
+ help
+
+ Select this for Xeon processors in the Cascade Lake family.
+
+ Enables -march=cascadelake
+
+config MCOOPERLAKE
+ bool "Intel Cooper Lake"
+ depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000)
+ select X86_P6_NOP
+ help
+
+ Select this for Xeon processors in the Cooper Lake family.
+
+ Enables -march=cooperlake
+
+config MTIGERLAKE
+ bool "Intel Tiger Lake"
+ depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000)
+ select X86_P6_NOP
+ help
+
+ Select this for third-generation 10 nm process processors in the Tiger Lake family.
+
+ Enables -march=tigerlake
+
+config MSAPPHIRERAPIDS
+ bool "Intel Sapphire Rapids"
+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+ select X86_P6_NOP
+ help
+
+ Select this for third-generation 10 nm process processors in the Sapphire Rapids family.
+
+ Enables -march=sapphirerapids
+
+config MROCKETLAKE
+ bool "Intel Rocket Lake"
+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+ select X86_P6_NOP
+ help
+
+ Select this for eleventh-generation processors in the Rocket Lake family.
+
+ Enables -march=rocketlake
+
+config MALDERLAKE
+ bool "Intel Alder Lake"
+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+ select X86_P6_NOP
+ help
+
+ Select this for twelfth-generation processors in the Alder Lake family.
+
+ Enables -march=alderlake
+
+config MRAPTORLAKE
+ bool "Intel Raptor Lake"
+ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+ select X86_P6_NOP
+ help
+
+ Select this for thirteenth-generation processors in the Raptor Lake family.
+
+ Enables -march=raptorlake
+
+config MMETEORLAKE
+ bool "Intel Meteor Lake"
+ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+ select X86_P6_NOP
+ help
+
+ Select this for fourteenth-generation processors in the Meteor Lake family.
+
+ Enables -march=meteorlake
+
config GENERIC_CPU
 bool "Generic-x86-64"
 depends on X86_64
@@ -294,6 +586,50 @@ config GENERIC_CPU
 Generic x86-64 CPU.
 Run equally well on all x86-64 CPUs.

+config GENERIC_CPU2
+ bool "Generic-x86-64-v2"
+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+ depends on X86_64
+ help
+ Generic x86-64 CPU.
+ Run equally well on all x86-64 CPUs with min support of x86-64-v2.
+
+config GENERIC_CPU3
+ bool "Generic-x86-64-v3"
+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+ depends on X86_64
+ help
+ Generic x86-64-v3 CPU with v3 instructions.
+ Run equally well on all x86-64 CPUs with min support of x86-64-v3.
+
+config GENERIC_CPU4
+ bool "Generic-x86-64-v4"
+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+ depends on X86_64
+ help
+ Generic x86-64 CPU with v4 instructions.
+ Run equally well on all x86-64 CPUs with min support of x86-64-v4.
+
+config MNATIVE_INTEL
+ bool "Intel-Native optimizations autodetected by the compiler"
+ help
+
+ Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects
+ the optimum settings to use based on your processor. Do NOT use this
+ for AMD CPUs. Intel Only!
+
+ Enables -march=native
+
+config MNATIVE_AMD
+ bool "AMD-Native optimizations autodetected by the compiler"
+ help
+
+ Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects
+ the optimum settings to use based on your processor. Do NOT use this
+ for Intel CPUs. AMD Only!
+
+ Enables -march=native
+
endchoice

config X86_GENERIC
@@ -318,9 +654,17 @@ config X86_INTERNODE_CACHE_SHIFT
config X86_L1_CACHE_SHIFT
 int
 default "7" if MPENTIUM4 || MPSC
- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
+ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || MK8SSE3 || MK10 \
+ || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \
+ || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \
+ || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \
+ || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \
+ || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE \
+ || MNATIVE_INTEL || MNATIVE_AMD || X86_GENERIC || GENERIC_CPU || GENERIC_CPU2 || GENERIC_CPU3 \
+ || GENERIC_CPU4
 default "4" if MELAN || M486SX || M486 || MGEODEGX1
- default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
+ default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII \
+ || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX

config X86_F00F_BUG
 def_bool y
@@ -332,15 +676,27 @@ config X86_INVD_BUG

config X86_ALIGNMENT_16
 def_bool y
- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1
+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC \
+ || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1

config X86_INTEL_USERCOPY
 def_bool y
- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
+ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC \
+ || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \
+ || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \
+ || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \
+ || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL

config X86_USE_PPRO_CHECKSUM
 def_bool y
- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \
+ || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX \
+ || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \
+ || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \
+ || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE \
+ || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE \
+ || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \
+ || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD

#
# P6_NOPs are a relatively minor optimization that require a family >=
@@ -356,32 +712,62 @@ config X86_USE_PPRO_CHECKSUM
config X86_P6_NOP
 def_bool y
 depends on X86_64
- depends on (MCORE2 || MPENTIUM4 || MPSC)
+ depends on (MCORE2 || MPENTIUM4 || MPSC || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \
+ || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE \
+ || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE \
+ || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL)

config X86_TSC
 def_bool y
- depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64
+ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \
+ || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 \
+ || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \
+ || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \
+ || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL \
+ || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \
+ || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL \
+ || MNATIVE_AMD) || X86_64

config X86_CMPXCHG64
 def_bool y
- depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8
+ depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \
+ || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 || MK8SSE3 || MK10 \
+ || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN \
+ || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS \
+ || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE \
+ || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \
+ || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD

# this should be set for all -march=.. options where the compiler
# generates cmov.
config X86_CMOV
 def_bool y
- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX)
+ depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \
+ || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX || MK8SSE3 || MK10 \
+ || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR \
+ || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \
+ || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \
+ || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \
+ || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD)

config X86_MINIMUM_CPU_FAMILY
 int
 default "64" if X86_64
- default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8)
+ default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \
+ || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8 || MK8SSE3 \
+ || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \
+ || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \
+ || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \
+ || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \
+ || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE \
+ || MNATIVE_INTEL || MNATIVE_AMD)
 default "5" if X86_32 && X86_CMPXCHG64
 default "4"

config X86_DEBUGCTLMSR
 def_bool y
- depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486SX || M486) && !UML
+ depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 \
+ || M486SX || M486) && !UML

config IA32_FEAT_CTL
 def_bool y
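
A quick way to verify which -march= a chosen Kconfig option ends up selecting is to compile a probe with the same flags and test the compiler's predefined macros; GCC defines a macro of the form __<arch>__ for each -march value, e.g. __znver3__ for -march=znver3 (a small illustrative check, not part of the patch):

    #include <stdio.h>

    int main(void)
    {
    #if defined(__znver3__)
        puts("built with -march=znver3 (MZEN3)");
    #elif defined(__skylake__)
        puts("built with -march=skylake (MSKYLAKE)");
    #else
        puts("built with some other -march");
    #endif
        return 0;
    }
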
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b39975977c03..00d94852490b 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -67,7 +67,7 @@ export BITS
#
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
#
-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -O3 -fno-tree-vectorize
KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2

ifeq ($(CONFIG_X86_KERNEL_IBT),y)
@@ -151,8 +151,47 @@ else
 # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
 cflags-$(CONFIG_MK8) += -march=k8
 cflags-$(CONFIG_MPSC) += -march=nocona
- cflags-$(CONFIG_MCORE2) += -march=core2
- cflags-$(CONFIG_MATOM) += -march=atom
+ cflags-$(CONFIG_MK8SSE3) += -march=k8-sse3
+ cflags-$(CONFIG_MK10) += -march=amdfam10
+ cflags-$(CONFIG_MBARCELONA) += -march=barcelona
+ cflags-$(CONFIG_MBOBCAT) += -march=btver1
+ cflags-$(CONFIG_MJAGUAR) += -march=btver2
+ cflags-$(CONFIG_MBULLDOZER) += -march=bdver1
+ cflags-$(CONFIG_MPILEDRIVER) += -march=bdver2 -mno-tbm
+ cflags-$(CONFIG_MSTEAMROLLER) += -march=bdver3 -mno-tbm
+ cflags-$(CONFIG_MEXCAVATOR) += -march=bdver4 -mno-tbm
+ cflags-$(CONFIG_MZEN) += -march=znver1
+ cflags-$(CONFIG_MZEN2) += -march=znver2
+ cflags-$(CONFIG_MZEN3) += -march=znver3
+ cflags-$(CONFIG_MZEN4) += -march=znver4
+ cflags-$(CONFIG_MNATIVE_INTEL) += -march=native
+ cflags-$(CONFIG_MNATIVE_AMD) += -march=native
+ cflags-$(CONFIG_MATOM) += -march=bonnell
+ cflags-$(CONFIG_MCORE2) += -march=core2
+ cflags-$(CONFIG_MNEHALEM) += -march=nehalem
+ cflags-$(CONFIG_MWESTMERE) += -march=westmere
+ cflags-$(CONFIG_MSILVERMONT) += -march=silvermont
+ cflags-$(CONFIG_MGOLDMONT) += -march=goldmont
+ cflags-$(CONFIG_MGOLDMONTPLUS) += -march=goldmont-plus
+ cflags-$(CONFIG_MSANDYBRIDGE) += -march=sandybridge
+ cflags-$(CONFIG_MIVYBRIDGE) += -march=ivybridge
+ cflags-$(CONFIG_MHASWELL) += -march=haswell
+ cflags-$(CONFIG_MBROADWELL) += -march=broadwell
+ cflags-$(CONFIG_MSKYLAKE) += -march=skylake
+ cflags-$(CONFIG_MSKYLAKEX) += -march=skylake-avx512
+ cflags-$(CONFIG_MCANNONLAKE) += -march=cannonlake
+ cflags-$(CONFIG_MICELAKE) += -march=icelake-client
+ cflags-$(CONFIG_MCASCADELAKE) += -march=cascadelake
+ cflags-$(CONFIG_MCOOPERLAKE) += -march=cooperlake
+ cflags-$(CONFIG_MTIGERLAKE) += -march=tigerlake
+ cflags-$(CONFIG_MSAPPHIRERAPIDS) += -march=sapphirerapids
+ cflags-$(CONFIG_MROCKETLAKE) += -march=rocketlake
+ cflags-$(CONFIG_MALDERLAKE) += -march=alderlake
+ cflags-$(CONFIG_MRAPTORLAKE) += -march=raptorlake
+ cflags-$(CONFIG_MMETEORLAKE) += -march=meteorlake
+ cflags-$(CONFIG_GENERIC_CPU2) += -march=x86-64-v2
+ cflags-$(CONFIG_GENERIC_CPU3) += -march=x86-64-v3
+ cflags-$(CONFIG_GENERIC_CPU4) += -march=x86-64-v4
 cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic
 KBUILD_CFLAGS += $(cflags-y)

diff --git a/arch/x86/Makefile.postlink b/arch/x86/Makefile.postlink
new file mode 100644
index 000000000000..195af937aa4d
--- /dev/null
+++ b/arch/x86/Makefile.postlink
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: GPL-2.0
+# ===========================================================================
+# Post-link x86 pass
+# ===========================================================================
+#
+# 1. Separate relocations from vmlinux into vmlinux.relocs.
+# 2. Strip relocations from vmlinux.
+
+PHONY := __archpost
+__archpost:
+
+-include include/config/auto.conf
+include $(srctree)/scripts/Kbuild.include
+
+CMD_RELOCS = arch/x86/tools/relocs
+quiet_cmd_relocs = RELOCS $@.relocs
+ cmd_relocs = $(CMD_RELOCS) $@ > $@.relocs;$(CMD_RELOCS) --abs-relocs $@
+
+quiet_cmd_strip_relocs = RSTRIP $@
+ cmd_strip_relocs = $(OBJCOPY) --remove-section='.rel.*' --remove-section='.rel__*' --remove-section='.rela.*' --remove-section='.rela__*' $@
+
+# `@true` prevents complaint when there is nothing to be done
+
+vmlinux: FORCE
+ @true
+ifeq ($(CONFIG_X86_NEED_RELOCS),y)
+ $(call cmd,relocs)
+ $(call cmd,strip_relocs)
+endif
+
+%.ko: FORCE
+ @true
+
+clean:
+ @rm -f vmlinux.relocs
+
+PHONY += FORCE clean
+
+FORCE:
+
+.PHONY: $(PHONY)
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore
index 25805199a506..b2968175fc27 100644
--- a/arch/x86/boot/compressed/.gitignore
+++ b/arch/x86/boot/compressed/.gitignore
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
relocs
vmlinux.bin.all
-vmlinux.relocs
vmlinux.lds
mkpiggy
piggy.S
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 6b6cfe607bdb..19d1fb601796 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -121,14 +121,12 @@ $(obj)/vmlinux.bin: vmlinux FORCE

targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs

-CMD_RELOCS = arch/x86/tools/relocs
-quiet_cmd_relocs = RELOCS $@
- cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $<
-$(obj)/vmlinux.relocs: vmlinux FORCE
- $(call if_changed,relocs)
+# vmlinux.relocs is created by the vmlinux postlink step.
+vmlinux.relocs: vmlinux
+ @true

vmlinux.bin.all-y := $(obj)/vmlinux.bin
-vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
+vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += vmlinux.relocs

$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
 $(call if_changed,gzip)
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index b40c462b4af3..c4e66e60d559 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -27,6 +27,7 @@ struct pci_sysdata {
#if IS_ENABLED(CONFIG_VMD)
 struct pci_dev *vmd_dev; /* VMD Device if in Intel VMD domain */
#endif
+ struct pci_dev *nvme_remap_dev; /* AHCI Device if NVME remapped bus */
};

extern int pci_routeirq;
@@ -70,6 +71,11 @@ static inline bool is_vmd(struct pci_bus *bus)
#define is_vmd(bus) false
#endif /* CONFIG_VMD */

+static inline bool is_nvme_remap(struct pci_bus *bus)
+{
+ return to_pci_sysdata(bus)->nvme_remap_dev != NULL;
+}
+
/* Can be used to override the logic in pci_scan_bus for skipping
 already-configured bus numbers - to be used for buggy BIOSes
 or architectures with incomplete PCI setup by the loader */
diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h
index 75884d2cdec3..18021e8c0c28 100644
--- a/arch/x86/include/asm/vermagic.h
+++ b/arch/x86/include/asm/vermagic.h
@@ -17,6 +17,52 @@
#define MODULE_PROC_FAMILY "586MMX "
#elif defined CONFIG_MCORE2
#define MODULE_PROC_FAMILY "CORE2 "
+#elif defined CONFIG_MNATIVE_INTEL
+#define MODULE_PROC_FAMILY "NATIVE_INTEL "
+#elif defined CONFIG_MNATIVE_AMD
+#define MODULE_PROC_FAMILY "NATIVE_AMD "
+#elif defined CONFIG_MNEHALEM
+#define MODULE_PROC_FAMILY "NEHALEM "
+#elif defined CONFIG_MWESTMERE
+#define MODULE_PROC_FAMILY "WESTMERE "
+#elif defined CONFIG_MSILVERMONT
+#define MODULE_PROC_FAMILY "SILVERMONT "
+#elif defined CONFIG_MGOLDMONT
+#define MODULE_PROC_FAMILY "GOLDMONT "
+#elif defined CONFIG_MGOLDMONTPLUS
+#define MODULE_PROC_FAMILY "GOLDMONTPLUS "
+#elif defined CONFIG_MSANDYBRIDGE
+#define MODULE_PROC_FAMILY "SANDYBRIDGE "
+#elif defined CONFIG_MIVYBRIDGE
+#define MODULE_PROC_FAMILY "IVYBRIDGE "
+#elif defined CONFIG_MHASWELL
+#define MODULE_PROC_FAMILY "HASWELL "
+#elif defined CONFIG_MBROADWELL
+#define MODULE_PROC_FAMILY "BROADWELL "
+#elif defined CONFIG_MSKYLAKE
+#define MODULE_PROC_FAMILY "SKYLAKE "
+#elif defined CONFIG_MSKYLAKEX
|
|
+#define MODULE_PROC_FAMILY "SKYLAKEX "
|
|
+#elif defined CONFIG_MCANNONLAKE
|
|
+#define MODULE_PROC_FAMILY "CANNONLAKE "
|
|
+#elif defined CONFIG_MICELAKE
|
|
+#define MODULE_PROC_FAMILY "ICELAKE "
|
|
+#elif defined CONFIG_MCASCADELAKE
|
|
+#define MODULE_PROC_FAMILY "CASCADELAKE "
|
|
+#elif defined CONFIG_MCOOPERLAKE
|
|
+#define MODULE_PROC_FAMILY "COOPERLAKE "
|
|
+#elif defined CONFIG_MTIGERLAKE
|
|
+#define MODULE_PROC_FAMILY "TIGERLAKE "
|
|
+#elif defined CONFIG_MSAPPHIRERAPIDS
|
|
+#define MODULE_PROC_FAMILY "SAPPHIRERAPIDS "
|
|
+#elif defined CONFIG_MROCKETLAKE
+#define MODULE_PROC_FAMILY "ROCKETLAKE "
+#elif defined CONFIG_MALDERLAKE
+#define MODULE_PROC_FAMILY "ALDERLAKE "
+#elif defined CONFIG_MRAPTORLAKE
+#define MODULE_PROC_FAMILY "RAPTORLAKE "
+#elif defined CONFIG_MMETEORLAKE
+#define MODULE_PROC_FAMILY "METEORLAKE "
#elif defined CONFIG_MATOM
#define MODULE_PROC_FAMILY "ATOM "
#elif defined CONFIG_M686
@@ -35,6 +81,32 @@
#define MODULE_PROC_FAMILY "K7 "
#elif defined CONFIG_MK8
#define MODULE_PROC_FAMILY "K8 "
+#elif defined CONFIG_MK8SSE3
+#define MODULE_PROC_FAMILY "K8SSE3 "
+#elif defined CONFIG_MK10
+#define MODULE_PROC_FAMILY "K10 "
+#elif defined CONFIG_MBARCELONA
+#define MODULE_PROC_FAMILY "BARCELONA "
+#elif defined CONFIG_MBOBCAT
+#define MODULE_PROC_FAMILY "BOBCAT "
+#elif defined CONFIG_MBULLDOZER
+#define MODULE_PROC_FAMILY "BULLDOZER "
+#elif defined CONFIG_MPILEDRIVER
+#define MODULE_PROC_FAMILY "PILEDRIVER "
+#elif defined CONFIG_MSTEAMROLLER
+#define MODULE_PROC_FAMILY "STEAMROLLER "
+#elif defined CONFIG_MJAGUAR
+#define MODULE_PROC_FAMILY "JAGUAR "
+#elif defined CONFIG_MEXCAVATOR
+#define MODULE_PROC_FAMILY "EXCAVATOR "
+#elif defined CONFIG_MZEN
+#define MODULE_PROC_FAMILY "ZEN "
+#elif defined CONFIG_MZEN2
+#define MODULE_PROC_FAMILY "ZEN2 "
+#elif defined CONFIG_MZEN3
+#define MODULE_PROC_FAMILY "ZEN3 "
+#elif defined CONFIG_MZEN4
+#define MODULE_PROC_FAMILY "ZEN4 "
#elif defined CONFIG_MELAN
#define MODULE_PROC_FAMILY "ELAN "
#elif defined CONFIG_MCRUSOE
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index ddb798603201..7c20387d8202 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -723,12 +723,15 @@ int pci_ext_cfg_avail(void)
return 0;
}

-#if IS_ENABLED(CONFIG_VMD)
struct pci_dev *pci_real_dma_dev(struct pci_dev *dev)
{
+#if IS_ENABLED(CONFIG_VMD)
if (is_vmd(dev->bus))
return to_pci_sysdata(dev->bus)->vmd_dev;
+#endif
+
+ if (is_nvme_remap(dev->bus))
+ return to_pci_sysdata(dev->bus)->nvme_remap_dev;

return dev;
}
-#endif
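With the #if/#endif pair dropped, pci_real_dma_dev() is now defined unconditionally, so any caller can ask which PCI function actually masters DMA. A minimal sketch of such a caller follows (illustrative only; the name example_setup_dma is made up and not part of this patch):

#include <linux/dma-mapping.h>
#include <linux/pci.h>

static int example_setup_dma(struct pci_dev *pdev)
{
	/* For a device on the fake remap bus this returns the parent
	 * AHCI function, whose requester ID the IOMMU actually sees. */
	struct pci_dev *dma_dev = pci_real_dma_dev(pdev);

	return dma_set_mask_and_coherent(&dma_dev->dev, DMA_BIT_MASK(64));
}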
diff --git a/drivers/Makefile b/drivers/Makefile
index 20b118dca999..c19dee206e53 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -64,15 +64,8 @@ obj-y += char/
# iommu/ comes before gpu as gpu are using iommu controllers
obj-y += iommu/

-# gpu/ comes after char for AGP vs DRM startup and after iommu
-obj-y += gpu/
-
obj-$(CONFIG_CONNECTOR) += connector/

-# i810fb and intelfb depend on char/agp/
-obj-$(CONFIG_FB_I810) += video/fbdev/i810/
-obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/
-
obj-$(CONFIG_PARPORT) += parport/
obj-y += base/ block/ misc/ mfd/ nfc/
obj-$(CONFIG_LIBNVDIMM) += nvdimm/
@@ -84,6 +77,14 @@ obj-y += macintosh/
obj-y += scsi/
obj-y += nvme/
obj-$(CONFIG_ATA) += ata/
+
+# gpu/ comes after char for AGP vs DRM startup and after iommu
+obj-y += gpu/
+
+# i810fb and intelfb depend on char/agp/
+obj-$(CONFIG_FB_I810) += video/fbdev/i810/
+obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/
+
obj-$(CONFIG_TARGET_CORE) += target/
obj-$(CONFIG_MTD) += mtd/
obj-$(CONFIG_SPI) += spi/
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 14a1c0d14916..7f5a77ddc7d4 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -1522,7 +1522,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance)
}
#endif

-static void ahci_remap_check(struct pci_dev *pdev, int bar,
+static int ahci_remap_check(struct pci_dev *pdev, int bar,
struct ahci_host_priv *hpriv)
{
int i;
@@ -1535,7 +1535,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar,
pci_resource_len(pdev, bar) < SZ_512K ||
bar != AHCI_PCI_BAR_STANDARD ||
!(readl(hpriv->mmio + AHCI_VSCAP) & 1))
- return;
+ return 0;

cap = readq(hpriv->mmio + AHCI_REMAP_CAP);
for (i = 0; i < AHCI_MAX_REMAP; i++) {
@@ -1550,18 +1550,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar,
}

if (!hpriv->remapped_nvme)
- return;
-
- dev_warn(&pdev->dev, "Found %u remapped NVMe devices.\n",
- hpriv->remapped_nvme);
- dev_warn(&pdev->dev,
- "Switch your BIOS from RAID to AHCI mode to use them.\n");
+ return 0;

- /*
- * Don't rely on the msi-x capability in the remap case,
- * share the legacy interrupt across ahci and remapped devices.
- */
- hpriv->flags |= AHCI_HFLAG_NO_MSI;
+ /* Abort probe, allowing intel-nvme-remap to step in when available */
+ dev_info(&pdev->dev, "Device will be handled by intel-nvme-remap.\n");
+ return -ENODEV;
}

static int ahci_get_irq_vector(struct ata_host *host, int port)
@@ -1781,7 +1774,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
hpriv->mmio = pcim_iomap_table(pdev)[ahci_pci_bar];

/* detect remapped nvme devices */
- ahci_remap_check(pdev, ahci_pci_bar, hpriv);
+ rc = ahci_remap_check(pdev, ahci_pci_bar, hpriv);
+ if (rc)
+ return rc;

sysfs_add_file_to_group(&pdev->dev.kobj,
&dev_attr_remapped_nvme.attr,
diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index 00476e94db90..c3a219218fac 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -9,7 +9,6 @@ config X86_INTEL_PSTATE
select ACPI_PROCESSOR if ACPI
select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO
select CPU_FREQ_GOV_PERFORMANCE
- select CPU_FREQ_GOV_SCHEDUTIL if SMP
help
This driver provides a P state for Intel core processors.
The driver implements an internal governor and will become
@@ -39,7 +38,6 @@ config X86_AMD_PSTATE
depends on X86 && ACPI
select ACPI_PROCESSOR
select ACPI_CPPC_LIB if X86_64
- select CPU_FREQ_GOV_SCHEDUTIL if SMP
help
This driver adds a CPUFreq driver which utilizes a fine grain
processor performance frequency control range instead of legacy
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 48a4613cef1e..ad9414c32060 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -3491,6 +3491,8 @@ static int __init intel_pstate_setup(char *str)

if (!strcmp(str, "disable"))
no_load = 1;
+ else if (!strcmp(str, "enable"))
+ no_load = 0;
else if (!strcmp(str, "active"))
default_driver = &intel_pstate;
else if (!strcmp(str, "passive"))
diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index 25eb4e8fd22f..2f95d74ad0b4 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -229,6 +229,15 @@ config I2C_CHT_WC
combined with a FUSB302 Type-C port-controller as such it is advised
to also select CONFIG_TYPEC_FUSB302=m.

+config I2C_NCT6775
+ tristate "Nuvoton NCT6775 and compatible SMBus controller"
+ help
+ If you say yes to this option, support will be included for the
+ Nuvoton NCT6775 and compatible SMBus controllers.
+
+ This driver can also be built as a module. If so, the module
+ will be called i2c-nct6775.
+
config I2C_NFORCE2
tristate "Nvidia nForce2, nForce3 and nForce4"
depends on PCI
diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile
index af56fe2c75c0..76be74584719 100644
--- a/drivers/i2c/busses/Makefile
+++ b/drivers/i2c/busses/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_I2C_CHT_WC) += i2c-cht-wc.o
obj-$(CONFIG_I2C_I801) += i2c-i801.o
obj-$(CONFIG_I2C_ISCH) += i2c-isch.o
obj-$(CONFIG_I2C_ISMT) += i2c-ismt.o
+obj-$(CONFIG_I2C_NCT6775) += i2c-nct6775.o
obj-$(CONFIG_I2C_NFORCE2) += i2c-nforce2.o
obj-$(CONFIG_I2C_NFORCE2_S4985) += i2c-nforce2-s4985.o
obj-$(CONFIG_I2C_NVIDIA_GPU) += i2c-nvidia-gpu.o
diff --git a/drivers/i2c/busses/i2c-nct6775.c b/drivers/i2c/busses/i2c-nct6775.c
new file mode 100644
index 000000000000..0462f0952043
--- /dev/null
+++ b/drivers/i2c/busses/i2c-nct6775.c
@@ -0,0 +1,647 @@
+/*
+ * i2c-nct6775 - Driver for the SMBus master functionality of
+ * Nuvoton NCT677x Super-I/O chips
+ *
+ * Copyright (C) 2019 Adam Honse <calcprogrammer1@gmail.com>
+ *
+ * Derived from nct6775 hwmon driver
+ * Copyright (C) 2012 Guenter Roeck <linux@roeck-us.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <linux/platform_device.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/hwmon-vid.h>
+#include <linux/err.h>
+#include <linux/mutex.h>
+#include <linux/delay.h>
+#include <linux/ioport.h>
+#include <linux/i2c.h>
+#include <linux/acpi.h>
+#include <linux/bitops.h>
+#include <linux/dmi.h>
+#include <linux/io.h>
+#include <linux/nospec.h>
+
+#define DRVNAME "i2c-nct6775"
+
+/* Nuvoton SMBus address offsets */
+#define SMBHSTDAT (0 + nuvoton_nct6793d_smba)
+#define SMBBLKSZ (1 + nuvoton_nct6793d_smba)
+#define SMBHSTCMD (2 + nuvoton_nct6793d_smba)
+#define SMBHSTIDX (3 + nuvoton_nct6793d_smba) //Index field is the Command field on other controllers
+#define SMBHSTCTL (4 + nuvoton_nct6793d_smba)
+#define SMBHSTADD (5 + nuvoton_nct6793d_smba)
+#define SMBHSTERR (9 + nuvoton_nct6793d_smba)
+#define SMBHSTSTS (0xE + nuvoton_nct6793d_smba)
+
+/* Command register */
+#define NCT6793D_READ_BYTE 0
+#define NCT6793D_READ_WORD 1
+#define NCT6793D_READ_BLOCK 2
+#define NCT6793D_BLOCK_WRITE_READ_PROC_CALL 3
+#define NCT6793D_PROC_CALL 4
+#define NCT6793D_WRITE_BYTE 8
+#define NCT6793D_WRITE_WORD 9
+#define NCT6793D_WRITE_BLOCK 10
+
+/* Control register */
+#define NCT6793D_MANUAL_START 128
+#define NCT6793D_SOFT_RESET 64
+
+/* Error register */
+#define NCT6793D_NO_ACK 32
+
+/* Status register */
+#define NCT6793D_FIFO_EMPTY 1
+#define NCT6793D_FIFO_FULL 2
+#define NCT6793D_MANUAL_ACTIVE 4
+
+#define NCT6775_LD_SMBUS 0x0B
+
+/* Other settings */
+#define MAX_RETRIES 400
+
+enum kinds { nct6106, nct6775, nct6776, nct6779, nct6791, nct6792, nct6793,
+ nct6795, nct6796, nct6798 };
+
+struct nct6775_sio_data {
+ int sioreg;
+ enum kinds kind;
+};
+
+/* used to set data->name = nct6775_device_names[data->sio_kind] */
+static const char * const nct6775_device_names[] = {
+ "nct6106",
+ "nct6775",
+ "nct6776",
+ "nct6779",
+ "nct6791",
+ "nct6792",
+ "nct6793",
+ "nct6795",
+ "nct6796",
+ "nct6798",
+};
+
+static const char * const nct6775_sio_names[] __initconst = {
+ "NCT6106D",
+ "NCT6775F",
+ "NCT6776D/F",
+ "NCT6779D",
+ "NCT6791D",
+ "NCT6792D",
+ "NCT6793D",
+ "NCT6795D",
+ "NCT6796D",
+ "NCT6798D",
+};
+
+#define SIO_REG_LDSEL 0x07 /* Logical device select */
+#define SIO_REG_DEVID 0x20 /* Device ID (2 bytes) */
+#define SIO_REG_SMBA 0x62 /* SMBus base address register */
+
+#define SIO_NCT6106_ID 0xc450
+#define SIO_NCT6775_ID 0xb470
+#define SIO_NCT6776_ID 0xc330
+#define SIO_NCT6779_ID 0xc560
+#define SIO_NCT6791_ID 0xc800
+#define SIO_NCT6792_ID 0xc910
+#define SIO_NCT6793_ID 0xd120
+#define SIO_NCT6795_ID 0xd350
+#define SIO_NCT6796_ID 0xd420
+#define SIO_NCT6798_ID 0xd428
+#define SIO_ID_MASK 0xFFF0
+
+static inline void
+superio_outb(int ioreg, int reg, int val)
+{
+ outb(reg, ioreg);
+ outb(val, ioreg + 1);
+}
+
+static inline int
+superio_inb(int ioreg, int reg)
+{
+ outb(reg, ioreg);
+ return inb(ioreg + 1);
+}
+
+static inline void
+superio_select(int ioreg, int ld)
+{
+ outb(SIO_REG_LDSEL, ioreg);
+ outb(ld, ioreg + 1);
+}
+
+
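+/*
+ * Writing 0x87 twice to the Super-I/O address port is the standard
+ * unlock ("extended function mode") sequence on these chips;
+ * superio_exit() re-locks by writing 0xaa (the Nuvoton/Winbond
+ * convention) and by setting configuration register 0x02 (the ITE
+ * convention), covering both.
+ */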
+static inline int
+superio_enter(int ioreg)
+{
+ /*
+ * Try to reserve <ioreg> and <ioreg + 1> for exclusive access.
+ */
+ if (!request_muxed_region(ioreg, 2, DRVNAME))
+ return -EBUSY;
+
+ outb(0x87, ioreg);
+ outb(0x87, ioreg);
+
+ return 0;
+}
+
+static inline void
+superio_exit(int ioreg)
+{
+ outb(0xaa, ioreg);
+ outb(0x02, ioreg);
+ outb(0x02, ioreg + 1);
+ release_region(ioreg, 2);
+}
+
+/*
+ * ISA constants
+ */
+
+#define IOREGION_ALIGNMENT (~7)
+#define IOREGION_LENGTH 2
+#define ADDR_REG_OFFSET 0
+#define DATA_REG_OFFSET 1
+
+#define NCT6775_REG_BANK 0x4E
+#define NCT6775_REG_CONFIG 0x40
+
+static struct i2c_adapter *nct6775_adapter;
+
+struct i2c_nct6775_adapdata {
+ unsigned short smba;
+};
+
+/* Return negative errno on error. */
+static s32 nct6775_access(struct i2c_adapter *adap, u16 addr,
+ unsigned short flags, char read_write,
+ u8 command, int size, union i2c_smbus_data *data)
+{
+ struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap);
+ unsigned short nuvoton_nct6793d_smba = adapdata->smba;
+ int i, len, cnt;
+ union i2c_smbus_data tmp_data;
+ int timeout = 0;
+
+ tmp_data.word = 0;
+ cnt = 0;
+ len = 0;
+
+ outb_p(NCT6793D_SOFT_RESET, SMBHSTCTL);
+
+ switch (size) {
+ case I2C_SMBUS_QUICK:
+ outb_p((addr << 1) | read_write,
+ SMBHSTADD);
+ break;
+ case I2C_SMBUS_BYTE_DATA:
+ tmp_data.byte = data->byte;
+ fallthrough;
+ case I2C_SMBUS_BYTE:
+ outb_p((addr << 1) | read_write,
+ SMBHSTADD);
+ outb_p(command, SMBHSTIDX);
+ if (read_write == I2C_SMBUS_WRITE) {
+ outb_p(tmp_data.byte, SMBHSTDAT);
+ outb_p(NCT6793D_WRITE_BYTE, SMBHSTCMD);
+ }
+ else {
+ outb_p(NCT6793D_READ_BYTE, SMBHSTCMD);
+ }
+ break;
+ case I2C_SMBUS_WORD_DATA:
+ outb_p((addr << 1) | read_write,
+ SMBHSTADD);
+ outb_p(command, SMBHSTIDX);
+ if (read_write == I2C_SMBUS_WRITE) {
+ outb_p(data->word & 0xff, SMBHSTDAT);
+ outb_p((data->word & 0xff00) >> 8, SMBHSTDAT);
+ outb_p(NCT6793D_WRITE_WORD, SMBHSTCMD);
+ }
+ else {
+ outb_p(NCT6793D_READ_WORD, SMBHSTCMD);
+ }
+ break;
+ case I2C_SMBUS_BLOCK_DATA:
+ outb_p((addr << 1) | read_write,
+ SMBHSTADD);
+ outb_p(command, SMBHSTIDX);
+ if (read_write == I2C_SMBUS_WRITE) {
+ len = data->block[0];
+ if (len == 0 || len > I2C_SMBUS_BLOCK_MAX)
+ return -EINVAL;
+ outb_p(len, SMBBLKSZ);
+
+ cnt = 1;
+ if (len >= 4) {
+ for (i = cnt; i <= 4; i++) {
+ outb_p(data->block[i], SMBHSTDAT);
+ }
+
+ len -= 4;
+ cnt += 4;
+ }
+ else {
+ for (i = cnt; i <= len; i++) {
+ outb_p(data->block[i], SMBHSTDAT);
+ }
+
+ len = 0;
+ }
+
+ outb_p(NCT6793D_WRITE_BLOCK, SMBHSTCMD);
+ }
+ else {
+ return -ENOTSUPP;
+ }
+ break;
+ default:
+ dev_warn(&adap->dev, "Unsupported transaction %d\n", size);
+ return -EOPNOTSUPP;
+ }
+
+ outb_p(NCT6793D_MANUAL_START, SMBHSTCTL);
+
+ while ((size == I2C_SMBUS_BLOCK_DATA) && (len > 0)) {
+ if (read_write == I2C_SMBUS_WRITE) {
+ timeout = 0;
+ while ((inb_p(SMBHSTSTS) & NCT6793D_FIFO_EMPTY) == 0)
+ {
+ if (timeout > MAX_RETRIES)
+ {
+ return -ETIMEDOUT;
+ }
+ usleep_range(250, 500);
+ timeout++;
+ }
+
+ // Load more bytes into FIFO
+ if (len >= 4) {
+ /* refill exactly four bytes, matching cnt += 4 below */
+ for (i = cnt; i < cnt + 4; i++) {
+ outb_p(data->block[i], SMBHSTDAT);
+ }
+
+ len -= 4;
+ cnt += 4;
+ }
+ else {
+ for (i = cnt; i < cnt + len; i++) {
+ outb_p(data->block[i], SMBHSTDAT);
+ }
+
+ len = 0;
+ }
+ }
+ else {
+ return -ENOTSUPP;
+ }
+
+ }
+
+ // wait for manual mode to complete
+ timeout = 0;
+ while ((inb_p(SMBHSTSTS) & NCT6793D_MANUAL_ACTIVE) != 0)
+ {
+ if (timeout > MAX_RETRIES)
+ {
+ return -ETIMEDOUT;
+ }
+ usleep_range(250, 500);
+ timeout++;
+ }
+
+ if ((inb_p(SMBHSTERR) & NCT6793D_NO_ACK) != 0) {
+ return -ENXIO;
+ }
+ else if ((read_write == I2C_SMBUS_WRITE) || (size == I2C_SMBUS_QUICK)) {
+ return 0;
+ }
+
+ switch (size) {
+ case I2C_SMBUS_QUICK:
+ case I2C_SMBUS_BYTE_DATA:
+ data->byte = inb_p(SMBHSTDAT);
+ break;
+ case I2C_SMBUS_WORD_DATA:
+ data->word = inb_p(SMBHSTDAT) + (inb_p(SMBHSTDAT) << 8);
+ break;
+ }
+ return 0;
+}
+
+static u32 nct6775_func(struct i2c_adapter *adapter)
+{
+ return I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE |
+ I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA |
+ I2C_FUNC_SMBUS_BLOCK_DATA;
+}
+
+static const struct i2c_algorithm smbus_algorithm = {
+ .smbus_xfer = nct6775_access,
+ .functionality = nct6775_func,
+};
+
+static int nct6775_add_adapter(unsigned short smba, const char *name, struct i2c_adapter **padap)
+{
+ struct i2c_adapter *adap;
+ struct i2c_nct6775_adapdata *adapdata;
+ int retval;
+
+ adap = kzalloc(sizeof(*adap), GFP_KERNEL);
+ if (adap == NULL) {
+ return -ENOMEM;
+ }
+
+ adap->owner = THIS_MODULE;
+ adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD;
+ adap->algo = &smbus_algorithm;
+
+ adapdata = kzalloc(sizeof(*adapdata), GFP_KERNEL);
+ if (adapdata == NULL) {
+ kfree(adap);
+ return -ENOMEM;
+ }
+
+ adapdata->smba = smba;
+
+ snprintf(adap->name, sizeof(adap->name),
+ "SMBus NCT67xx adapter%s at %04x", name, smba);
+
+ i2c_set_adapdata(adap, adapdata);
+
+ retval = i2c_add_adapter(adap);
+ if (retval) {
+ kfree(adapdata);
+ kfree(adap);
+ return retval;
+ }
+
+ *padap = adap;
+ return 0;
+}
+
+static void nct6775_remove_adapter(struct i2c_adapter *adap)
+{
+ struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap);
+
+ if (adapdata->smba) {
+ i2c_del_adapter(adap);
+ kfree(adapdata);
+ kfree(adap);
+ }
+}
+
+//static SIMPLE_DEV_PM_OPS(nct6775_dev_pm_ops, nct6775_suspend, nct6775_resume);
+
+/*
+ * when Super-I/O functions move to a separate file, the Super-I/O
+ * bus will manage the lifetime of the device and this module will only keep
+ * track of the nct6775 driver. But since we use platform_device_alloc(), we
+ * must keep track of the device
+ */
+static struct platform_device *pdev[2];
+
+static int nct6775_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct nct6775_sio_data *sio_data = dev_get_platdata(dev);
+ struct resource *res;
+
+ res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+ if (!devm_request_region(&pdev->dev, res->start, IOREGION_LENGTH,
+ DRVNAME))
+ return -EBUSY;
+
+ switch (sio_data->kind) {
+ case nct6791:
+ case nct6792:
+ case nct6793:
+ case nct6795:
+ case nct6796:
+ case nct6798:
+ nct6775_add_adapter(res->start, "", &nct6775_adapter);
+ break;
+ default:
+ return -ENODEV;
+ }
+
+ return 0;
+}
+/*
+static void nct6791_enable_io_mapping(int sioaddr)
+{
+ int val;
+
+ val = superio_inb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE);
+ if (val & 0x10) {
+ pr_info("Enabling hardware monitor logical device mappings.\n");
+ superio_outb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE,
+ val & ~0x10);
+ }
+}*/
+
+static struct platform_driver i2c_nct6775_driver = {
+ .driver = {
+ .name = DRVNAME,
+// .pm = &nct6775_dev_pm_ops,
+ },
+ .probe = nct6775_probe,
+};
+
+static void __exit i2c_nct6775_exit(void)
+{
+ int i;
+
+ if (nct6775_adapter)
+ nct6775_remove_adapter(nct6775_adapter);
+
+ for (i = 0; i < ARRAY_SIZE(pdev); i++) {
+ if (pdev[i])
+ platform_device_unregister(pdev[i]);
+ }
+ platform_driver_unregister(&i2c_nct6775_driver);
+}
+
+/* nct6775_find() looks for a compatible Nuvoton chip in the Super-I/O config space */
+static int __init nct6775_find(int sioaddr, struct nct6775_sio_data *sio_data)
+{
+ u16 val;
+ int err;
+ int addr;
+
+ err = superio_enter(sioaddr);
+ if (err)
+ return err;
+
+ val = (superio_inb(sioaddr, SIO_REG_DEVID) << 8) |
+ superio_inb(sioaddr, SIO_REG_DEVID + 1);
+
+ switch (val & SIO_ID_MASK) {
+ case SIO_NCT6106_ID:
+ sio_data->kind = nct6106;
+ break;
+ case SIO_NCT6775_ID:
+ sio_data->kind = nct6775;
+ break;
+ case SIO_NCT6776_ID:
+ sio_data->kind = nct6776;
+ break;
+ case SIO_NCT6779_ID:
+ sio_data->kind = nct6779;
+ break;
+ case SIO_NCT6791_ID:
+ sio_data->kind = nct6791;
+ break;
+ case SIO_NCT6792_ID:
+ sio_data->kind = nct6792;
+ break;
+ case SIO_NCT6793_ID:
+ sio_data->kind = nct6793;
+ break;
+ case SIO_NCT6795_ID:
+ sio_data->kind = nct6795;
+ break;
+ case SIO_NCT6796_ID:
+ sio_data->kind = nct6796;
+ break;
+ case SIO_NCT6798_ID:
+ sio_data->kind = nct6798;
+ break;
+ default:
+ if (val != 0xffff)
+ pr_debug("unsupported chip ID: 0x%04x\n", val);
+ superio_exit(sioaddr);
+ return -ENODEV;
+ }
+
+ /* We have a known chip, find the SMBus I/O address */
+ superio_select(sioaddr, NCT6775_LD_SMBUS);
+ val = (superio_inb(sioaddr, SIO_REG_SMBA) << 8)
+ | superio_inb(sioaddr, SIO_REG_SMBA + 1);
+ addr = val & IOREGION_ALIGNMENT;
+ if (addr == 0) {
+ pr_err("Refusing to enable a Super-I/O device with a base I/O port 0\n");
+ superio_exit(sioaddr);
+ return -ENODEV;
+ }
+
+ //if (sio_data->kind == nct6791 || sio_data->kind == nct6792 ||
+ // sio_data->kind == nct6793 || sio_data->kind == nct6795 ||
+ // sio_data->kind == nct6796)
+ // nct6791_enable_io_mapping(sioaddr);
+
+ superio_exit(sioaddr);
+ pr_info("Found %s or compatible chip at %#x:%#x\n",
+ nct6775_sio_names[sio_data->kind], sioaddr, addr);
+ sio_data->sioreg = sioaddr;
+
+ return addr;
+}
+
+static int __init i2c_nct6775_init(void)
+{
+ int i, err;
+ bool found = false;
+ int address;
+ struct resource res;
+ struct nct6775_sio_data sio_data;
+ int sioaddr[2] = { 0x2e, 0x4e };
+
+ err = platform_driver_register(&i2c_nct6775_driver);
+ if (err)
+ return err;
+
+ /*
+ * initialize sio_data->kind and sio_data->sioreg.
+ *
+ * when Super-I/O functions move to a separate file, the Super-I/O
+ * driver will probe 0x2e and 0x4e and auto-detect the presence of a
+ * nct6775 hardware monitor, and call probe()
+ */
+ for (i = 0; i < ARRAY_SIZE(pdev); i++) {
+ address = nct6775_find(sioaddr[i], &sio_data);
+ if (address <= 0)
+ continue;
+
+ found = true;
+
+ pdev[i] = platform_device_alloc(DRVNAME, address);
+ if (!pdev[i]) {
+ err = -ENOMEM;
+ goto exit_device_unregister;
+ }
+
+ err = platform_device_add_data(pdev[i], &sio_data,
+ sizeof(struct nct6775_sio_data));
+ if (err)
+ goto exit_device_put;
+
+ memset(&res, 0, sizeof(res));
+ res.name = DRVNAME;
+ res.start = address;
+ res.end = address + IOREGION_LENGTH - 1;
+ res.flags = IORESOURCE_IO;
+
+ err = acpi_check_resource_conflict(&res);
+ if (err) {
+ platform_device_put(pdev[i]);
+ pdev[i] = NULL;
+ continue;
+ }
+
+ err = platform_device_add_resources(pdev[i], &res, 1);
+ if (err)
+ goto exit_device_put;
+
+ /* platform_device_add calls probe() */
+ err = platform_device_add(pdev[i]);
+ if (err)
+ goto exit_device_put;
+ }
+ if (!found) {
+ err = -ENODEV;
+ goto exit_unregister;
+ }
+
+ return 0;
+
+exit_device_put:
+ platform_device_put(pdev[i]);
+exit_device_unregister:
+ while (--i >= 0) {
+ if (pdev[i])
+ platform_device_unregister(pdev[i]);
+ }
+exit_unregister:
+ platform_driver_unregister(&i2c_nct6775_driver);
+ return err;
+}
+
+MODULE_AUTHOR("Adam Honse <calcprogrammer1@gmail.com>");
+MODULE_DESCRIPTION("SMBus driver for NCT6775F and compatible chips");
+MODULE_LICENSE("GPL");
+
+module_init(i2c_nct6775_init);
+module_exit(i2c_nct6775_exit);
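Once the adapter is registered, devices behind it are reached through the generic kernel SMBus client API, and nct6775_access() above translates each call into the matching NCT6793D_* command. A hedged sketch of a client read (the register 0x20 and the helper name are illustrative, not from this patch):

#include <linux/i2c.h>

static int example_read_byte(struct i2c_client *client)
{
	/* Becomes an NCT6793D_READ_BYTE transaction in nct6775_access(). */
	s32 val = i2c_smbus_read_byte_data(client, 0x20);

	return (val < 0) ? val : (u8)val;
}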
diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c
index 809fbd014cd6..d54b35b147ee 100644
--- a/drivers/i2c/busses/i2c-piix4.c
+++ b/drivers/i2c/busses/i2c-piix4.c
@@ -568,11 +568,11 @@ static int piix4_transaction(struct i2c_adapter *piix4_adapter)
if (srvrworks_csb5_delay) /* Extra delay for SERVERWORKS_CSB5 */
usleep_range(2000, 2100);
else
- usleep_range(250, 500);
+ usleep_range(25, 50);

while ((++timeout < MAX_TIMEOUT) &&
((temp = inb_p(SMBHSTSTS)) & 0x01))
- usleep_range(250, 500);
+ usleep_range(25, 50);

/* If the SMBus is still busy, we give up */
if (timeout == MAX_TIMEOUT) {
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 3ba53dc3cc3f..0fde1b3ced78 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -3213,6 +3213,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}

+#ifdef CONFIG_CACHY
+ set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags);
+ set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
+#endif
+
ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
if (ret < 0)
goto bad;
diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile
index 37c8663de7fe..897d19f92ede 100644
--- a/drivers/pci/controller/Makefile
+++ b/drivers/pci/controller/Makefile
@@ -1,4 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
+ifdef CONFIG_X86_64
+ifdef CONFIG_SATA_AHCI
+obj-y += intel-nvme-remap.o
+endif
+endif
+
obj-$(CONFIG_PCIE_CADENCE) += cadence/
obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o
obj-$(CONFIG_PCI_IXP4XX) += pci-ixp4xx.o
diff --git a/drivers/pci/controller/intel-nvme-remap.c b/drivers/pci/controller/intel-nvme-remap.c
new file mode 100644
index 000000000000..e105e6f5cc91
--- /dev/null
+++ b/drivers/pci/controller/intel-nvme-remap.c
@@ -0,0 +1,462 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel remapped NVMe device support.
+ *
+ * Copyright (c) 2019 Endless Mobile, Inc.
+ * Author: Daniel Drake <drake@endlessm.com>
+ *
+ * Some products ship by default with the SATA controller in "RAID" or
+ * "Intel RST Premium With Intel Optane System Acceleration" mode. Under this
+ * mode, which we refer to as "remapped NVMe" mode, any installed NVMe
+ * devices disappear from the PCI bus, and instead their I/O memory becomes
+ * available within the AHCI device BARs.
+ *
+ * This scheme is understood to be a way of avoiding usage of the standard
+ * Windows NVMe driver under that OS, instead mandating usage of Intel's
+ * driver, which has better power management, and presumably offers
+ * some RAID/disk-caching solutions too.
+ *
+ * Here in this driver, we support the remapped NVMe mode by claiming the
+ * AHCI device and creating a fake PCIe root port. On the new bus, the
+ * original AHCI device is exposed with only minor tweaks. Then, fake PCI
+ * devices corresponding to the remapped NVMe devices are created. The usual
+ * ahci and nvme drivers are then expected to bind to these devices and
+ * operate as normal.
+ *
+ * The PCI configuration space for the NVMe devices is completely
+ * unavailable, so we fake a minimal one and hope for the best.
+ *
+ * Interrupts are shared between the AHCI and NVMe devices. For simplicity,
+ * we only support the legacy interrupt here, although MSI support
+ * could potentially be added later.
+ */
+
+#define MODULE_NAME "intel-nvme-remap"
+
+#include <linux/ahci-remap.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#define AHCI_PCI_BAR_STANDARD 5
+
+struct nvme_remap_dev {
+ struct pci_dev *dev; /* AHCI device */
+ struct pci_bus *bus; /* our fake PCI bus */
+ struct pci_sysdata sysdata;
+ int irq_base; /* our fake interrupts */
+
+ /*
+ * When we detect an all-ones write to a BAR register, this flag
+ * is set, so that we return the BAR size on the next read (a
+ * standard PCI behaviour).
+ * This includes the assumption that an all-ones BAR write is
+ * immediately followed by a read of the same register.
+ */
+ bool bar_sizing;
+
+ /*
+ * Resources copied from the AHCI device, to be regarded as
+ * resources on our fake bus.
+ */
+ struct resource ahci_resources[PCI_NUM_RESOURCES];
+
+ /* Resources corresponding to the NVMe devices. */
+ struct resource remapped_dev_mem[AHCI_MAX_REMAP];
+
+ /* Number of remapped NVMe devices found. */
+ int num_remapped_devices;
+};
+
+static inline struct nvme_remap_dev *nrdev_from_bus(struct pci_bus *bus)
+{
+ return container_of(bus->sysdata, struct nvme_remap_dev, sysdata);
+}
+
+
+/******** PCI configuration space **********/
+
+/*
+ * Helper macros for tweaking returned contents of PCI configuration space.
+ *
+ * value contains len bytes of data read from reg.
+ * If fixup_reg is included in that range, fix up the contents of that
+ * register to fixed_value.
+ */
+#define NR_FIX8(fixup_reg, fixed_value) do { \
+ if (reg <= fixup_reg && fixup_reg < reg + len) \
+ ((u8 *) value)[fixup_reg - reg] = (u8) (fixed_value); \
+ } while (0)
+
+#define NR_FIX16(fixup_reg, fixed_value) do { \
+ NR_FIX8(fixup_reg, fixed_value); \
+ NR_FIX8(fixup_reg + 1, fixed_value >> 8); \
+ } while (0)
+
+#define NR_FIX24(fixup_reg, fixed_value) do { \
+ NR_FIX8(fixup_reg, fixed_value); \
+ NR_FIX8(fixup_reg + 1, fixed_value >> 8); \
+ NR_FIX8(fixup_reg + 2, fixed_value >> 16); \
+ } while (0)
+
+#define NR_FIX32(fixup_reg, fixed_value) do { \
+ NR_FIX16(fixup_reg, (u16) fixed_value); \
+ NR_FIX16(fixup_reg + 2, fixed_value >> 16); \
+ } while (0)
+
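+/*
+ * Example: a 4-byte read at reg 0x08 (PCI_CLASS_REVISION) that passes
+ * through NR_FIX24(PCI_CLASS_PROG, ...) below has bytes 0x09..0x0b of
+ * the returned value patched, while the revision ID in byte 0x08 is
+ * left untouched.
+ */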
+/*
+ * Read PCI config space of the slot 0 (AHCI) device.
+ * We pass through the read request to the underlying device, but
+ * tweak the results in some cases.
+ */
+static int nvme_remap_pci_read_slot0(struct pci_bus *bus, int reg,
+ int len, u32 *value)
+{
+ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+ struct pci_bus *ahci_dev_bus = nrdev->dev->bus;
+ int ret;
+
+ ret = ahci_dev_bus->ops->read(ahci_dev_bus, nrdev->dev->devfn,
+ reg, len, value);
+ if (ret)
+ return ret;
+
+ /*
+ * Adjust the device class, to prevent this driver from attempting to
+ * additionally probe the device we're simulating here.
+ */
+ NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_SATA_AHCI);
+
+ /*
+ * Unset interrupt pin, otherwise ACPI tries to find routing
+ * info for our virtual IRQ, fails, and complains.
+ */
+ NR_FIX8(PCI_INTERRUPT_PIN, 0);
+
+ /*
+ * Truncate the AHCI BAR to not include the region that covers the
+ * hidden devices. This will cause the ahci driver to successfully
+ * probe the new device (instead of handing it over to this driver).
+ */
+ if (nrdev->bar_sizing) {
+ NR_FIX32(PCI_BASE_ADDRESS_5, ~(SZ_16K - 1));
+ nrdev->bar_sizing = false;
+ }
+
+ return PCIBIOS_SUCCESSFUL;
+}
+
+/*
+ * Read PCI config space of a remapped device.
+ * Since the original PCI config space is inaccessible, we provide a minimal,
+ * fake config space instead.
+ */
+static int nvme_remap_pci_read_remapped(struct pci_bus *bus, unsigned int port,
+ int reg, int len, u32 *value)
+{
+ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+ struct resource *remapped_mem;
+
+ if (port > nrdev->num_remapped_devices)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ *value = 0;
+ remapped_mem = &nrdev->remapped_dev_mem[port - 1];
+
+ /* Set a Vendor ID, otherwise Linux assumes no device is present */
+ NR_FIX16(PCI_VENDOR_ID, PCI_VENDOR_ID_INTEL);
+
+ /* Always appear on & bus mastering */
+ NR_FIX16(PCI_COMMAND, PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
+
+ /* Set class so that nvme driver probes us */
+ NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_EXPRESS);
+
+ if (nrdev->bar_sizing) {
+ NR_FIX32(PCI_BASE_ADDRESS_0,
+ ~(resource_size(remapped_mem) - 1));
+ nrdev->bar_sizing = false;
+ } else {
+ resource_size_t mem_start = remapped_mem->start;
+
+ mem_start |= PCI_BASE_ADDRESS_MEM_TYPE_64;
+ NR_FIX32(PCI_BASE_ADDRESS_0, mem_start);
+ mem_start >>= 32;
+ NR_FIX32(PCI_BASE_ADDRESS_1, mem_start);
+ }
+
+ return PCIBIOS_SUCCESSFUL;
+}
+
+/* Read PCI configuration space. */
+static int nvme_remap_pci_read(struct pci_bus *bus, unsigned int devfn,
+ int reg, int len, u32 *value)
+{
+ if (PCI_SLOT(devfn) == 0)
+ return nvme_remap_pci_read_slot0(bus, reg, len, value);
+ else
+ return nvme_remap_pci_read_remapped(bus, PCI_SLOT(devfn),
+ reg, len, value);
+}
+
+/*
+ * Write PCI config space of the slot 0 (AHCI) device.
+ * Apart from the special case of BAR sizing, we disable all writes.
+ * Otherwise, the ahci driver could make changes (e.g. unset PCI bus master)
+ * that would affect the operation of the NVMe devices.
+ */
+static int nvme_remap_pci_write_slot0(struct pci_bus *bus, int reg,
+ int len, u32 value)
+{
+ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+ struct pci_bus *ahci_dev_bus = nrdev->dev->bus;
+
+ if (reg >= PCI_BASE_ADDRESS_0 && reg <= PCI_BASE_ADDRESS_5) {
+ /*
+ * Writing all-ones to a BAR means that the size of the
+ * memory region is being checked. Flag this so that we can
+ * reply with an appropriate size on the next read.
+ */
+ if (value == ~0)
+ nrdev->bar_sizing = true;
+
+ return ahci_dev_bus->ops->write(ahci_dev_bus,
+ nrdev->dev->devfn,
+ reg, len, value);
+ }
+
+ return PCIBIOS_SET_FAILED;
+}
+
+/*
+ * Write PCI config space of a remapped device.
+ * Since the original PCI config space is inaccessible, we reject all
+ * writes, except for the special case of BAR probing.
+ */
+static int nvme_remap_pci_write_remapped(struct pci_bus *bus,
+ unsigned int port,
+ int reg, int len, u32 value)
+{
+ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+
+ if (port > nrdev->num_remapped_devices)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ /*
+ * Writing all-ones to a BAR means that the size of the memory
+ * region is being checked. Flag this so that we can reply with
+ * an appropriate size on the next read.
+ */
+ if (value == ~0 && reg >= PCI_BASE_ADDRESS_0
+ && reg <= PCI_BASE_ADDRESS_5) {
+ nrdev->bar_sizing = true;
+ return PCIBIOS_SUCCESSFUL;
+ }
+
+ return PCIBIOS_SET_FAILED;
+}
+
+/* Write PCI configuration space. */
+static int nvme_remap_pci_write(struct pci_bus *bus, unsigned int devfn,
+ int reg, int len, u32 value)
+{
+ if (PCI_SLOT(devfn) == 0)
+ return nvme_remap_pci_write_slot0(bus, reg, len, value);
+ else
+ return nvme_remap_pci_write_remapped(bus, PCI_SLOT(devfn),
+ reg, len, value);
+}
+
+static struct pci_ops nvme_remap_pci_ops = {
+ .read = nvme_remap_pci_read,
+ .write = nvme_remap_pci_write,
+};
+
+
+/******** Initialization & exit **********/
+
+/*
+ * Find a PCI domain ID to use for our fake bus.
+ * Start at 0x10000 to not clash with ACPI _SEG domains (16 bits).
+ */
+static int find_free_domain(void)
+{
+ int domain = 0xffff;
+ struct pci_bus *bus = NULL;
+
+ while ((bus = pci_find_next_bus(bus)) != NULL)
+ domain = max_t(int, domain, pci_domain_nr(bus));
+
+ return domain + 1;
+}
+
+static int find_remapped_devices(struct nvme_remap_dev *nrdev,
+ struct list_head *resources)
+{
+ void __iomem *mmio;
+ int i, count = 0;
+ u32 cap;
+
+ mmio = pcim_iomap(nrdev->dev, AHCI_PCI_BAR_STANDARD,
+ pci_resource_len(nrdev->dev,
+ AHCI_PCI_BAR_STANDARD));
+ if (!mmio)
+ return -ENODEV;
+
+ /* Check if this device might have remapped nvme devices. */
+ if (pci_resource_len(nrdev->dev, AHCI_PCI_BAR_STANDARD) < SZ_512K ||
+ !(readl(mmio + AHCI_VSCAP) & 1))
+ return -ENODEV;
+
+ cap = readq(mmio + AHCI_REMAP_CAP);
+ for (i = AHCI_MAX_REMAP-1; i >= 0; i--) {
+ struct resource *remapped_mem;
+
+ if ((cap & (1 << i)) == 0)
+ continue;
+ if (readl(mmio + ahci_remap_dcc(i))
+ != PCI_CLASS_STORAGE_EXPRESS)
+ continue;
+
+ /* We've found a remapped device */
+ remapped_mem = &nrdev->remapped_dev_mem[count++];
+ remapped_mem->start =
+ pci_resource_start(nrdev->dev, AHCI_PCI_BAR_STANDARD)
+ + ahci_remap_base(i);
+ remapped_mem->end = remapped_mem->start
+ + AHCI_REMAP_N_SIZE - 1;
+ remapped_mem->flags = IORESOURCE_MEM | IORESOURCE_PCI_FIXED;
+ pci_add_resource(resources, remapped_mem);
+ }
+
+ pcim_iounmap(nrdev->dev, mmio);
+
+ if (count == 0)
+ return -ENODEV;
+
+ nrdev->num_remapped_devices = count;
+ dev_info(&nrdev->dev->dev, "Found %d remapped NVMe devices\n",
+ nrdev->num_remapped_devices);
+ return 0;
+}
+
+static void nvme_remap_remove_root_bus(void *data)
+{
+ struct pci_bus *bus = data;
+
+ pci_stop_root_bus(bus);
+ pci_remove_root_bus(bus);
+}
+
+static int nvme_remap_probe(struct pci_dev *dev,
+ const struct pci_device_id *id)
+{
+ struct nvme_remap_dev *nrdev;
+ LIST_HEAD(resources);
+ int i;
+ int ret;
+ struct pci_dev *child;
+
+ nrdev = devm_kzalloc(&dev->dev, sizeof(*nrdev), GFP_KERNEL);
+ if (!nrdev)
+ return -ENOMEM;
+ nrdev->sysdata.domain = find_free_domain();
+ nrdev->sysdata.nvme_remap_dev = dev;
+ nrdev->dev = dev;
+ pci_set_drvdata(dev, nrdev);
+
+ ret = pcim_enable_device(dev);
+ if (ret < 0)
+ return ret;
+
+ pci_set_master(dev);
+
+ ret = find_remapped_devices(nrdev, &resources);
+ if (ret)
+ return ret;
+
+ /* Add resources from the original AHCI device */
+ for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+ struct resource *res = &dev->resource[i];
+
+ if (res->start) {
+ struct resource *nr_res = &nrdev->ahci_resources[i];
+
+ nr_res->start = res->start;
+ nr_res->end = res->end;
+ nr_res->flags = res->flags;
+ pci_add_resource(&resources, nr_res);
+ }
+ }
+
+ /* Create virtual interrupts */
+ nrdev->irq_base = devm_irq_alloc_descs(&dev->dev, -1, 0,
+ nrdev->num_remapped_devices + 1,
+ 0);
+ if (nrdev->irq_base < 0)
+ return nrdev->irq_base;
+
+ /* Create and populate PCI bus */
+ nrdev->bus = pci_create_root_bus(&dev->dev, 0, &nvme_remap_pci_ops,
+ &nrdev->sysdata, &resources);
+ if (!nrdev->bus)
+ return -ENODEV;
+
+ if (devm_add_action_or_reset(&dev->dev, nvme_remap_remove_root_bus,
+ nrdev->bus))
+ return -ENOMEM;
+
+ /* We don't support sharing MSI interrupts between these devices */
+ nrdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI;
+
+ pci_scan_child_bus(nrdev->bus);
+
+ list_for_each_entry(child, &nrdev->bus->devices, bus_list) {
+ /*
+ * Prevent PCI core from trying to move memory BARs around.
+ * The hidden NVMe devices are at fixed locations.
+ */
+ for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+ struct resource *res = &child->resource[i];
+
+ if (res->flags & IORESOURCE_MEM)
+ res->flags |= IORESOURCE_PCI_FIXED;
+ }
+
+ /* Share the legacy IRQ between all devices */
+ child->irq = dev->irq;
+ }
+
+ pci_assign_unassigned_bus_resources(nrdev->bus);
+ pci_bus_add_devices(nrdev->bus);
+
+ return 0;
+}
+
+static const struct pci_device_id nvme_remap_ids[] = {
+ /*
+ * Match all Intel RAID controllers.
+ *
+ * There's overlap here with the set of devices detected by the ahci
+ * driver, but ahci will only successfully probe when there
+ * *aren't* any remapped NVMe devices, and this driver will only
+ * successfully probe when there *are* remapped NVMe devices that
+ * need handling.
+ */
+ {
+ PCI_VDEVICE(INTEL, PCI_ANY_ID),
+ .class = PCI_CLASS_STORAGE_RAID << 8,
+ .class_mask = 0xffffff00,
+ },
+ {0,}
+};
+MODULE_DEVICE_TABLE(pci, nvme_remap_ids);
+
+static struct pci_driver nvme_remap_drv = {
+ .name = MODULE_NAME,
+ .id_table = nvme_remap_ids,
+ .probe = nvme_remap_probe,
+};
+module_pci_driver(nvme_remap_drv);
+
+MODULE_AUTHOR("Daniel Drake <drake@endlessm.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 44cab813bf95..25edf55de985 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -3612,6 +3612,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev)
dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET;
}

+static bool acs_on_downstream;
+static bool acs_on_multifunction;
+
+#define NUM_ACS_IDS 16
+struct acs_on_id {
+ unsigned short vendor;
+ unsigned short device;
+};
+static struct acs_on_id acs_on_ids[NUM_ACS_IDS];
+static u8 max_acs_id;
+
+static __init int pcie_acs_override_setup(char *p)
+{
+ if (!p)
+ return -EINVAL;
+
+ while (*p) {
+ if (!strncmp(p, "downstream", 10))
+ acs_on_downstream = true;
+ if (!strncmp(p, "multifunction", 13))
+ acs_on_multifunction = true;
+ if (!strncmp(p, "id:", 3)) {
+ char opt[5];
+ int ret;
+ long val;
+
+ if (max_acs_id >= NUM_ACS_IDS - 1) {
+ pr_warn("Out of PCIe ACS override slots (%d)\n",
+ NUM_ACS_IDS);
+ goto next;
+ }
+
+ p += 3;
+ snprintf(opt, 5, "%s", p);
+ ret = kstrtol(opt, 16, &val);
+ if (ret) {
+ pr_warn("PCIe ACS ID parse error %d\n", ret);
+ goto next;
+ }
+ acs_on_ids[max_acs_id].vendor = val;
+
+ p += strcspn(p, ":");
+ if (*p != ':') {
+ pr_warn("PCIe ACS invalid ID\n");
+ goto next;
+ }
+
+ p++;
+ snprintf(opt, 5, "%s", p);
+ ret = kstrtol(opt, 16, &val);
+ if (ret) {
+ pr_warn("PCIe ACS ID parse error %d\n", ret);
+ goto next;
+ }
+ acs_on_ids[max_acs_id].device = val;
+ max_acs_id++;
+ }
+next:
+ p += strcspn(p, ",");
+ if (*p == ',')
+ p++;
+ }
+
+ if (acs_on_downstream || acs_on_multifunction || max_acs_id)
+ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n");
+
+ return 0;
+}
+early_param("pcie_acs_override", pcie_acs_override_setup);
+
+static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags)
+{
+ int i;
+
+ /* Never override ACS for legacy devices or devices with ACS caps */
+ if (!pci_is_pcie(dev) ||
+ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS))
+ return -ENOTTY;
+
+ for (i = 0; i < max_acs_id; i++)
+ if (acs_on_ids[i].vendor == dev->vendor &&
+ acs_on_ids[i].device == dev->device)
+ return 1;
+
+ switch (pci_pcie_type(dev)) {
+ case PCI_EXP_TYPE_DOWNSTREAM:
+ case PCI_EXP_TYPE_ROOT_PORT:
+ if (acs_on_downstream)
+ return 1;
+ break;
+ case PCI_EXP_TYPE_ENDPOINT:
+ case PCI_EXP_TYPE_UPSTREAM:
+ case PCI_EXP_TYPE_LEG_END:
+ case PCI_EXP_TYPE_RC_END:
+ if (acs_on_multifunction && dev->multifunction)
+ return 1;
+ }
+
+ return -ENOTTY;
+}
/*
* Some NVIDIA GPU devices do not work with bus reset, SBR needs to be
* prevented for those affected devices.
@@ -5002,6 +5102,7 @@ static const struct pci_dev_acs_enabled {
{ PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs },
/* Wangxun nics */
{ PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs },
+ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides },
{ 0 }
};
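The override is driven entirely by the kernel command line parsed above: a comma-separated mix of downstream, multifunction, and id:vvvv:dddd entries (hexadecimal vendor/device). A hypothetical example, with a made-up device ID:

    pcie_acs_override=downstream,multifunction,id:8086:1234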
|
|
|
|
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
|
|
index 4a01b315e0a9..e4a6c31a80df 100644
|
|
--- a/drivers/platform/x86/Kconfig
|
|
+++ b/drivers/platform/x86/Kconfig
|
|
@@ -641,6 +641,16 @@ config THINKPAD_LMI
|
|
To compile this driver as a module, choose M here: the module will
|
|
be called think-lmi.
|
|
|
|
+config LEGION_LAPTOP
|
|
+ tristate "Lenovo Legion Laptop Extras"
|
|
+ depends on ACPI
|
|
+ depends on ACPI_WMI || ACPI_WMI = n
|
|
+ depends on HWMON || HWMON = n
|
|
+ select ACPI_PLATFORM_PROFILE
|
|
+ help
|
|
+ This is a driver for Lenovo Legion laptops and contains drivers for
|
|
+ hotkey, fan control, and power mode.
|
|
+
|
|
source "drivers/platform/x86/intel/Kconfig"
|
|
|
|
config MSI_LAPTOP
|
|
@@ -1099,6 +1109,20 @@ config WINMATE_FM07_KEYS
|
|
buttons below the display. This module adds an input device
|
|
that delivers key events when these buttons are pressed.
|
|
|
|
+config STEAMDECK
|
|
+ tristate "Valve Steam Deck platform driver"
|
|
+ depends on X86_64
|
|
+ help
|
|
+ Driver exposing various bits and pieces of functionality
|
|
+ provided by Steam Deck specific VLV0100 device presented by
|
|
+ EC firmware. This includes but not limited to:
|
|
+ - CPU/device's fan control
|
|
+ - Read-only access to DDIC registers
|
|
+ - Battery tempreature measurements
|
|
+ - Various display related control knobs
|
|
+ - USB Type-C connector event notification
|
|
+ Say N unless you are running on a Steam Deck.
|
|
+
|
|
endif # X86_PLATFORM_DEVICES
|
|
|
|
config P2SB
|
|
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
|
|
index 1d3d1b02541b..fde9a683103e 100644
|
|
--- a/drivers/platform/x86/Makefile
|
|
+++ b/drivers/platform/x86/Makefile
|
|
@@ -66,6 +66,7 @@ obj-$(CONFIG_IDEAPAD_LAPTOP) += ideapad-laptop.o
|
|
obj-$(CONFIG_SENSORS_HDAPS) += hdaps.o
|
|
obj-$(CONFIG_THINKPAD_ACPI) += thinkpad_acpi.o
|
|
obj-$(CONFIG_THINKPAD_LMI) += think-lmi.o
|
|
+obj-$(CONFIG_LEGION_LAPTOP) += legion-laptop.o
|
|
|
|
# Intel
|
|
obj-y += intel/
|
|
@@ -134,3 +135,6 @@ obj-$(CONFIG_SIEMENS_SIMATIC_IPC) += simatic-ipc.o
|
|
|
|
# Winmate
|
|
obj-$(CONFIG_WINMATE_FM07_KEYS) += winmate-fm07-keys.o
|
|
+
|
|
+# Steam Deck
|
|
+obj-$(CONFIG_STEAMDECK) += steamdeck.o
|
|
diff --git a/drivers/platform/x86/legion-laptop.c b/drivers/platform/x86/legion-laptop.c
|
|
new file mode 100644
|
|
index 000000000000..d1268d239cc5
|
|
--- /dev/null
|
|
+++ b/drivers/platform/x86/legion-laptop.c
|
|
@@ -0,0 +1,2783 @@
|
|
+// SPDX-License-Identifier: GPL-2.0-or-later
|
|
+/*
|
|
+ * legion-laptop.c - Extra Lenovo Legion laptop support, in
|
|
+ * particular for fan curve control and power mode.
|
|
+ *
|
|
+ * Copyright (C) 2022 johnfan <johnfan (at) example (dot) com>
|
|
+ *
|
|
+ *
|
|
+ * This driver might work on other Lenovo Legion models. If you
|
|
+ * want to try it you can pass force=1 as argument
|
|
+ * to the module which will force it to load even when the DMI
|
|
+ * data doesn't match the model AND FIRMWARE.
|
|
+ *
|
|
+ * Support for other hardware of this model is already partially
|
|
+ * provided by the module ideapd-laptop.
|
|
+ *
|
|
+ * The development page for this driver is located at
|
|
+ * https://github.com/johnfanv2/LenovoLegionLinux
|
|
+ *
|
|
+ * This driver exports the files:
|
|
+ * - /sys/kernel/debug/legion/fancurve (ro)
|
|
+ * The fan curve in the form stored in the firmware in an
|
|
+ * human readable table.
|
|
+ *
|
|
+ * - /sys/module/legion_laptop/drivers/platform\:legion/PNP0C09\:00/powermode (rw)
|
|
+ * 0: balanced mode (white)
|
|
+ * 1: performance mode (red)
|
|
+ * 2: quiet mode (blue)
|
|
+ * ?: custom mode (pink)
|
|
+ *
|
|
+ * NOTE: Writing to this will load the default fan curve from
|
|
+ * the firmware for this mode, so the fan curve might
|
|
+ * have to be reconfigured if needed.
|
|
+ *
|
|
+ * It implements the usual hwmon interface to monitor fan speed and temmperature
|
|
+ * and allows to set the fan curve inside the firware.
|
|
+ *
|
|
+ * - /sys/class/hwmon/X/fan1_input or /sys/class/hwmon/X/fan2_input (ro)
|
|
+ * Current fan speed of fan1/fan2.
|
|
+ * - /sys/class/hwmon/X/temp1_input (ro)
|
|
+ * - /sys/class/hwmon/X/temp2_input (ro)
|
|
+ * - /sys/class/hwmon/X/temp3_input (ro)
|
|
+ * Temperature (Celsius) of CPU, GPU, and IC used for fan control.
|
|
+ * - /sys/class/hwmon/X/pwmY_auto_pointZ_pwm (rw)
|
|
+ * PWM (0-255) of the fan at the Y-level in the fan curve
|
|
+ * - /sys/class/hwmon/X/pwmY_auto_pointZ_temp (rw)
|
|
+ * upper temperature of tempZ (CPU, GPU, or IC) at the Y-level in the fan curve
|
|
+ * - /sys/class/hwmon/X/pwmY_auto_pointZ_temp_hyst (rw)
|
|
+ * hysteris (CPU, GPU, or IC) at the Y-level in the fan curve. The lower
|
|
+ * temperatue of the level is the upper temperature minus the hysteris
|
|
+ *
|
|
+ *
|
|
+ * Credits for reverse engineering the firmware to:
|
|
+ * - David Woodhouse: heavily inspired by lenovo_laptop.c
|
|
+ * - Luke Cama: Windows version "LegionFanControl"
|
|
+ * - SmokelessCPU: reverse engineering of custom registers in EC
|
|
+ * and commincation method with EC via ports
|
|
+ * - 0x1F9F1: additional reverse engineering for complete fan curve
|
|
+ */
|
|
+
+#include <linux/acpi.h>
+#include <asm/io.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/dmi.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/platform_device.h>
+#include <linux/platform_profile.h>
+#include <linux/types.h>
+#include <linux/wmi.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("johnfan");
+MODULE_DESCRIPTION("Lenovo Legion laptop extras");
+
+static bool force;
+module_param(force, bool, 0440);
+MODULE_PARM_DESC(
+	force,
+	"Force loading this module even if model or BIOS does not match.");
+
+static bool ec_readonly;
+module_param(ec_readonly, bool, 0440);
+MODULE_PARM_DESC(
+	ec_readonly,
+	"Only read from embedded controller but do not write or change settings.");
+
+#define LEGIONFEATURES \
+	"fancurve powermode platformprofile platformprofilenotify minifancurve"
+
+// Size of fan curve stored in embedded controller
+#define MAXFANCURVESIZE 10
+
+#define LEGION_DRVR_SHORTNAME "legion"
+#define LEGION_HWMON_NAME LEGION_DRVR_SHORTNAME "_hwmon"
+
+/* =============================== */
+/* Embedded Controller Description */
+/* =============================== */
+
+/* The configuration and registers to access the embedded controller
+ * depend on the version of the software on the
+ * embedded controller and the BIOS/UEFI firmware.
+ *
+ * To control the fan curve in the embedded controller (EC) one has to
+ * write to its "RAM". There are different possibilities:
+ * - EC RAM is memory mapped (write to it with ioremap)
+ * - access EC RAM via port-mapped IO (outb/inb)
+ * - access EC RAM via ACPI methods. It is only possible to write
+ *   to part of it (first 0xFF bytes?)
+ *
+ * In later models the firmware directly exposes ACPI methods to
+ * set the fan curve directly, without writing to EC RAM. This
+ * is done inside the ACPI method.
+ */
+
+/**
+ * Offsets for interesting values inside the EC RAM (0 = start of
+ * EC RAM). These might change depending on the software inside of
+ * the EC, which can be updated by a BIOS update from Lenovo.
+ */
+// TODO: same order as in initialization
+struct ec_register_offsets {
+	// Super I/O Configuration Registers
+	// 7.15 General Control (GCTRL)
+	// General Control (GCTRL)
+	// (see EC Interface Registers and 6.2 Plug and Play Configuration (PNPCFG)) in datasheet
+	// note: these are saved in two places:
+	// in the EC Interface Registers and in the Super I/O configuration registers
+	// Chip ID
+	u16 ECHIPID1;
+	u16 ECHIPID2;
+	// Chip Version
+	u16 ECHIPVER;
+	u16 ECDEBUG;
+
+	// Lenovo Custom OEM extension
+	// The ITE firmware can be extended by a
+	// custom program using its own "variables".
+	// These are the offsets to these "variables".
+	u16 EXT_FAN_CUR_POINT;
+	u16 EXT_FAN_POINTS_SIZE;
+	u16 EXT_FAN1_BASE;
+	u16 EXT_FAN2_BASE;
+	u16 EXT_FAN_ACC_BASE;
+	u16 EXT_FAN_DEC_BASE;
+	u16 EXT_CPU_TEMP;
+	u16 EXT_CPU_TEMP_HYST;
+	u16 EXT_GPU_TEMP;
+	u16 EXT_GPU_TEMP_HYST;
+	u16 EXT_VRM_TEMP;
+	u16 EXT_VRM_TEMP_HYST;
+	u16 EXT_FAN1_RPM_LSB;
+	u16 EXT_FAN1_RPM_MSB;
+	u16 EXT_FAN2_RPM_LSB;
+	u16 EXT_FAN2_RPM_MSB;
+	u16 EXT_FAN1_TARGET_RPM;
+	u16 EXT_FAN2_TARGET_RPM;
+	u16 EXT_POWERMODE;
+	u16 EXT_MINIFANCURVE_ON_COOL;
+	// values:
+	// 0x04: enable mini fan curve if very long on cool level
+	//       - this might be due to potential temp failure
+	//       - or just because the system is really that cool
+	// 0xA0: disable it
+	u16 EXT_LOCKFANCONTROLLER;
+	u16 EXT_MAXIMUMFANSPEED;
+	u16 EXT_WHITE_KEYBOARD_BACKLIGHT;
+	u16 EXT_IC_TEMP_INPUT;
+	u16 EXT_CPU_TEMP_INPUT;
+	u16 EXT_GPU_TEMP_INPUT;
+};
+
+struct model_config {
+	const struct ec_register_offsets *registers;
+	bool check_embedded_controller_id;
+	u16 embedded_controller_id;
+
+	// first addr in EC we access/scan
+	phys_addr_t memoryio_physical_ec_start;
+	size_t memoryio_size;
+
+	// TODO: maybe use bitfield
+	bool has_minifancurve;
+};
+
+/* =================================== */
+/* Configuration for different models  */
+/* =================================== */
+
+// Idea by SmokelessCPU (modified)
+// - all default names and register addresses are from the datasheet
+// - register addresses for custom firmware by SmokelessCPU
+static const struct ec_register_offsets ec_register_offsets_v0 = {
+	.ECHIPID1 = 0x2000,
+	.ECHIPID2 = 0x2001,
+	.ECHIPVER = 0x2002,
+	.ECDEBUG = 0x2003,
+	.EXT_FAN_CUR_POINT = 0xC534,
+	.EXT_FAN_POINTS_SIZE = 0xC535,
+	.EXT_FAN1_BASE = 0xC540,
+	.EXT_FAN2_BASE = 0xC550,
+	.EXT_FAN_ACC_BASE = 0xC560,
+	.EXT_FAN_DEC_BASE = 0xC570,
+	.EXT_CPU_TEMP = 0xC580,
+	.EXT_CPU_TEMP_HYST = 0xC590,
+	.EXT_GPU_TEMP = 0xC5A0,
+	.EXT_GPU_TEMP_HYST = 0xC5B0,
+	.EXT_VRM_TEMP = 0xC5C0,
+	.EXT_VRM_TEMP_HYST = 0xC5D0,
+	.EXT_FAN1_RPM_LSB = 0xC5E0,
+	.EXT_FAN1_RPM_MSB = 0xC5E1,
+	.EXT_FAN2_RPM_LSB = 0xC5E2,
+	.EXT_FAN2_RPM_MSB = 0xC5E3,
+	.EXT_MINIFANCURVE_ON_COOL = 0xC536,
+	.EXT_LOCKFANCONTROLLER = 0xc4AB,
+	.EXT_CPU_TEMP_INPUT = 0xc538,
+	.EXT_GPU_TEMP_INPUT = 0xc539,
+	.EXT_IC_TEMP_INPUT = 0xC5E8,
+	.EXT_POWERMODE = 0xc420,
+	.EXT_FAN1_TARGET_RPM = 0xc600,
+	.EXT_FAN2_TARGET_RPM = 0xc601,
+	.EXT_MAXIMUMFANSPEED = 0xBD,
+	.EXT_WHITE_KEYBOARD_BACKLIGHT = (0x3B + 0xC400)
+};
+
+static const struct model_config model_v0 = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = true
+};
+
+static const struct model_config model_kfcn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = true,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = false
+};
+
+static const struct model_config model_hacn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = false,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400,
+	.memoryio_size = 0x300,
+	.has_minifancurve = false
+};
+
+static const struct model_config model_k9cn = {
+	.registers = &ec_register_offsets_v0,
+	.check_embedded_controller_id = false,
+	.embedded_controller_id = 0x8227,
+	.memoryio_physical_ec_start = 0xC400, // or replace 0xC400 by 0x0400 ?
+	.memoryio_size = 0x300,
+	.has_minifancurve = false
+};
+
+static const struct dmi_system_id denylist[] = { {} };
+
+static const struct dmi_system_id optimistic_allowlist[] = {
+	{
+		// modelyear: 2021
+		// generation: 6
+		// name: Legion 5, Legion 5 pro, Legion 7
+		// Family: Legion 5 15ACH6H, ...
+		.ident = "GKCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "GKCN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2020
+		.ident = "EUCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "EUCN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2020
+		.ident = "EFCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "EFCN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2020
+		.ident = "FSCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "FSCN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2021
+		.ident = "HHCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "HHCN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2022
+		.ident = "H1CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "H1CN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2022
+		.ident = "J2CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "J2CN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2022
+		.ident = "JUCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "JUCN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2022
+		.ident = "KFCN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "KFCN"),
+		},
+		.driver_data = (void *)&model_kfcn
+	},
+	{
+		// modelyear: 2021
+		.ident = "HACN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "HACN"),
+		},
+		.driver_data = (void *)&model_hacn
+	},
+	{
+		// modelyear: 2021
+		.ident = "G9CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "G9CN"),
+		},
+		.driver_data = (void *)&model_v0
+	},
+	{
+		// modelyear: 2022
+		.ident = "K9CN",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BIOS_VERSION, "K9CN"),
+		},
+		.driver_data = (void *)&model_k9cn
+	},
+	{}
+};
+
+/* ================================= */
+/* ACPI access */
+/* ================================= */
+
+// function from ideapad-laptop.c
+static int eval_int(acpi_handle handle, const char *name, unsigned long *res)
+{
+	unsigned long long result;
+	acpi_status status;
+
+	status = acpi_evaluate_integer(handle, (char *)name, NULL, &result);
+	if (ACPI_FAILURE(status))
+		return -EIO;
+
+	*res = result;
+
+	return 0;
+}
+
+// function from ideapad-laptop.c
+static int exec_simple_method(acpi_handle handle, const char *name,
+			      unsigned long arg)
+{
+	acpi_status status =
+		acpi_execute_simple_method(handle, (char *)name, arg);
+
+	return ACPI_FAILURE(status) ? -EIO : 0;
+}
+
+// function from ideapad-laptop.c
+static int exec_sbmc(acpi_handle handle, unsigned long arg)
+{
+	// \_SB.PCI0.LPC0.EC0.VPC0.SBMC
+	return exec_simple_method(handle, "SBMC", arg);
+}
+
+static int eval_qcho(acpi_handle handle, unsigned long *res)
+{
+	// \_SB.PCI0.LPC0.EC0.QCHO
+	return eval_int(handle, "QCHO", res);
+}
+
+/* ================================= */
+/* EC RAM Access with port-mapped IO */
+/* ================================= */
+
+/*
+ * See datasheet of e.g. IT8502E/F/G, e.g.
+ * 6.2 Plug and Play Configuration (PNPCFG)
+ *
+ * Depending on the configured BARDSEL register
+ * the ports
+ * ECRAM_PORTIO_ADDR_PORT and
+ * ECRAM_PORTIO_DATA_PORT
+ * are configured.
+ *
+ * By performing IO on these ports one can
+ * read/write to registers in the EC.
+ *
+ * "To access a register of PNPCFG, write target index to
+ *  address port and access this PNPCFG register via
+ *  data port" [datasheet, 6.2 Plug and Play Configuration]
+ */
+
+// IO ports used to communicate with the embedded controller
+// Start of used ports
+#define ECRAM_PORTIO_START_PORT 0x4E
+// Number of used ports
+#define ECRAM_PORTIO_PORTS_SIZE 2
+// Port used to specify address in EC RAM to read/write
+// 0x4E/0x4F is the usual port pair for the Super I/O controller;
+// 0x2E/0x2F is also common (ITE can also be configured to use these)
+#define ECRAM_PORTIO_ADDR_PORT 0x4E
+// Port to send/receive the value to write/read
+#define ECRAM_PORTIO_DATA_PORT 0x4F
+// Name used to request ports
+#define ECRAM_PORTIO_NAME "legion"
+
+struct ecram_portio {
+	/* protects read/write to EC RAM performed
+	 * as a certain sequence of outb, inb
+	 * commands on the IO ports. There can
+	 * be at most one such sequence at a time.
+	 */
+	struct mutex io_port_mutex;
+};
+
+ssize_t ecram_portio_init(struct ecram_portio *ec_portio)
+{
+	if (!request_region(ECRAM_PORTIO_START_PORT, ECRAM_PORTIO_PORTS_SIZE,
+			    ECRAM_PORTIO_NAME)) {
+		pr_info("Cannot init ecram_portio: failed to request the %x ports starting at %x\n",
+			ECRAM_PORTIO_PORTS_SIZE, ECRAM_PORTIO_START_PORT);
+		return -ENODEV;
+	}
+	//pr_info("Reserved %x ports starting at %x\n", ECRAM_PORTIO_PORTS_SIZE, ECRAM_PORTIO_START_PORT);
+	mutex_init(&ec_portio->io_port_mutex);
+	return 0;
+}
+
+void ecram_portio_exit(struct ecram_portio *ec_portio)
+{
+	release_region(ECRAM_PORTIO_START_PORT, ECRAM_PORTIO_PORTS_SIZE);
+}
+
+/* Read a byte from the EC RAM.
+ *
+ * Returns a status code to keep a common signature for all
+ * methods to access EC RAM.
+ */
+ssize_t ecram_portio_read(struct ecram_portio *ec_portio, u16 offset, u8 *value)
+{
+	mutex_lock(&ec_portio->io_port_mutex);
+
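+	// Indexed Super I/O access: write a register index to the
+	// address port, then read/write that register through the data
+	// port. Register 0x11 holds the high byte and register 0x10 the
+	// low byte of the EC RAM address; register 0x12 is the data
+	// window used for the actual transfer.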
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x11, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	// TODO: an explicit cast between types is sometimes done here
+	// and sometimes not
+	outb((u8)((offset >> 8) & 0xFF), ECRAM_PORTIO_DATA_PORT);
+
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x10, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	outb((u8)(offset & 0xFF), ECRAM_PORTIO_DATA_PORT);
+
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x12, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	*value = inb(ECRAM_PORTIO_DATA_PORT);
+
+	mutex_unlock(&ec_portio->io_port_mutex);
+	return 0;
+}
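+
+/* Usage sketch (illustrative only): read one byte of EC RAM, e.g. the
+ * first chip id byte at the default offset of ec_register_offsets_v0:
+ *
+ *	u8 id1;
+ *	ecram_portio_read(&portio, 0x2000, &id1);
+ */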
+
+/* Write a byte to the EC RAM.
+ *
+ * Returns a status code to keep a common signature for all
+ * methods to access EC RAM.
+ */
+ssize_t ecram_portio_write(struct ecram_portio *ec_portio, u16 offset, u8 value)
+{
+	mutex_lock(&ec_portio->io_port_mutex);
+
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x11, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	// TODO: an explicit cast between types is sometimes done here
+	// and sometimes not
+	outb((u8)((offset >> 8) & 0xFF), ECRAM_PORTIO_DATA_PORT);
+
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x10, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	outb((u8)(offset & 0xFF), ECRAM_PORTIO_DATA_PORT);
+
+	outb(0x2E, ECRAM_PORTIO_ADDR_PORT);
+	outb(0x12, ECRAM_PORTIO_DATA_PORT);
+	outb(0x2F, ECRAM_PORTIO_ADDR_PORT);
+	outb(value, ECRAM_PORTIO_DATA_PORT);
+
+	mutex_unlock(&ec_portio->io_port_mutex);
+	return 0;
+}
+
+/* =================================== */
+/* EC RAM Access */
+/* =================================== */
+
+struct ecram {
+	struct ecram_portio portio;
+};
+
+ssize_t ecram_init(struct ecram *ecram, phys_addr_t memoryio_ec_physical_start,
+		   size_t region_size)
+{
+	ssize_t err;
+
+	err = ecram_portio_init(&ecram->portio);
+	if (err) {
+		pr_info("Failed ecram_portio_init\n");
+		goto err_ecram_portio_init;
+	}
+
+	return 0;
+
+err_ecram_portio_init:
+	return err;
+}
+
+void ecram_exit(struct ecram *ecram)
+{
+	pr_info("Unloading legion ecram\n");
+	ecram_portio_exit(&ecram->portio);
+	pr_info("Unloading legion ecram done\n");
+}
+
+/**
+ * ecram_read() - Read a byte from EC RAM.
+ * @ecram_offset: address in the EC RAM
+ */
+static u8 ecram_read(struct ecram *ecram, u16 ecram_offset)
+{
+	u8 value;
+	int err;
+
+	err = ecram_portio_read(&ecram->portio, ecram_offset, &value);
+	if (err)
+		pr_info("Error reading EC RAM at 0x%x\n", ecram_offset);
+	return value;
+}
+
+static void ecram_write(struct ecram *ecram, u16 ecram_offset, u8 value)
+{
+	int err;
+
+	if (ec_readonly) {
+		pr_info("Skipping write to EC RAM at 0x%x because of ec_readonly mode\n",
+			ecram_offset);
+		return;
+	}
+	err = ecram_portio_write(&ecram->portio, ecram_offset, value);
+	if (err)
+		pr_info("Error writing EC RAM at 0x%x\n", ecram_offset);
+}
+
+/* =============================== */
+/* Reads from EC */
+/* =============================== */
+
+u16 read_ec_id(struct ecram *ecram, const struct model_config *model)
+{
+	u8 id1 = ecram_read(ecram, model->registers->ECHIPID1);
+	u8 id2 = ecram_read(ecram, model->registers->ECHIPID2);
+
+	return (id1 << 8) + id2;
+}
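+
+// Note: for the models above, read_ec_id() is expected to return 0x8227
+// (the embedded_controller_id of model_v0), i.e. ECHIPID1 = 0x82 and
+// ECHIPID2 = 0x27.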
+
+u16 read_ec_version(struct ecram *ecram, const struct model_config *model)
+{
+	u8 vers = ecram_read(ecram, model->registers->ECHIPVER);
+	u8 debug = ecram_read(ecram, model->registers->ECDEBUG);
+
+	return (vers << 8) + debug;
+}
+
+/* ============================= */
+/* Data model for sensor values */
+/* ============================= */
+
+struct sensor_values {
+	u16 fan1_rpm; // current speed in rpm of fan 1
+	u16 fan2_rpm; // current speed in rpm of fan 2
+	u16 fan1_target_rpm; // target speed in rpm of fan 1
+	u16 fan2_target_rpm; // target speed in rpm of fan 2
+	u8 cpu_temp_celsius; // cpu temperature in celsius
+	u8 gpu_temp_celsius; // gpu temperature in celsius
+	u8 ic_temp_celsius; // ic temperature in celsius
+};
+
+enum SENSOR_ATTR {
+	SENSOR_CPU_TEMP_ID = 1,
+	SENSOR_GPU_TEMP_ID = 2,
+	SENSOR_IC_TEMP_ID = 3,
+	SENSOR_FAN1_RPM_ID = 4,
+	SENSOR_FAN2_RPM_ID = 5,
+	SENSOR_FAN1_TARGET_RPM_ID = 6,
+	SENSOR_FAN2_TARGET_RPM_ID = 7
+};
+
+static int read_sensor_values(struct ecram *ecram,
+			      const struct model_config *model,
+			      struct sensor_values *values)
+{
+	values->fan1_target_rpm =
+		100 * ecram_read(ecram, model->registers->EXT_FAN1_TARGET_RPM);
+	values->fan2_target_rpm =
+		100 * ecram_read(ecram, model->registers->EXT_FAN2_TARGET_RPM);
+
+	values->fan1_rpm =
+		ecram_read(ecram, model->registers->EXT_FAN1_RPM_LSB) +
+		(((int)ecram_read(ecram, model->registers->EXT_FAN1_RPM_MSB))
+		 << 8);
+	values->fan2_rpm =
+		ecram_read(ecram, model->registers->EXT_FAN2_RPM_LSB) +
+		(((int)ecram_read(ecram, model->registers->EXT_FAN2_RPM_MSB))
+		 << 8);
+
+	values->cpu_temp_celsius =
+		ecram_read(ecram, model->registers->EXT_CPU_TEMP_INPUT);
+	values->gpu_temp_celsius =
+		ecram_read(ecram, model->registers->EXT_GPU_TEMP_INPUT);
+	values->ic_temp_celsius =
+		ecram_read(ecram, model->registers->EXT_IC_TEMP_INPUT);
+
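+	// Note: the fixed offsets below override the register-table
+	// reads above.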
+	values->cpu_temp_celsius = ecram_read(ecram, 0xC5E6);
+	values->gpu_temp_celsius = ecram_read(ecram, 0xC5E7);
+	values->ic_temp_celsius = ecram_read(ecram, 0xC5E8);
+
+	return 0;
+}
+
+/* =============================== */
+/* Behaviour changing functions */
+/* =============================== */
+
+int read_powermode(struct ecram *ecram, const struct model_config *model)
+{
+	return ecram_read(ecram, model->registers->EXT_POWERMODE);
+}
+
+ssize_t write_powermode(struct ecram *ecram, const struct model_config *model,
+			u8 value)
+{
+	if (value > 2) {
+		pr_info("Unexpected power mode value ignored: %d\n", value);
+		return -EINVAL;
+	}
+	ecram_write(ecram, model->registers->EXT_POWERMODE, value);
+	return 0;
+}
+
+/**
+ * Briefly toggle the power mode to a different mode
+ * and switch back, e.g. to reset the fan curve.
+ */
+void toggle_powermode(struct ecram *ecram, const struct model_config *model)
+{
+	int old_powermode = read_powermode(ecram, model);
+	int next_powermode = old_powermode == 0 ? 1 : 0;
+
+	write_powermode(ecram, model, next_powermode);
+	mdelay(1500);
+	write_powermode(ecram, model, old_powermode);
+}
+
+#define lockfancontroller_ON 8
+#define lockfancontroller_OFF 0
+
+ssize_t write_lockfancontroller(struct ecram *ecram,
+				const struct model_config *model, bool state)
+{
+	u8 val = state ? lockfancontroller_ON : lockfancontroller_OFF;
+
+	ecram_write(ecram, model->registers->EXT_LOCKFANCONTROLLER, val);
+	return 0;
+}
+
+int read_lockfancontroller(struct ecram *ecram,
+			   const struct model_config *model, bool *state)
+{
+	int value = ecram_read(ecram, model->registers->EXT_LOCKFANCONTROLLER);
+
+	switch (value) {
+	case lockfancontroller_ON:
+		*state = true;
+		break;
+	case lockfancontroller_OFF:
+		*state = false;
+		break;
+	default:
+		pr_info("Unexpected value in lockfancontroller register: %d\n",
+			value);
+		return -1;
+	}
+	return 0;
+}
+
+#define MAXIMUMFANSPEED_ON 0x40
+#define MAXIMUMFANSPEED_OFF 0x00
+
+int read_maximumfanspeed(struct ecram *ecram, const struct model_config *model,
+			 bool *state)
+{
+	int value = ecram_read(ecram, model->registers->EXT_MAXIMUMFANSPEED);
+
+	switch (value) {
+	case MAXIMUMFANSPEED_ON:
+		*state = true;
+		break;
+	case MAXIMUMFANSPEED_OFF:
+		*state = false;
+		break;
+	default:
+		pr_info("Unexpected value in maximumfanspeed register: %d\n",
+			value);
+		return -1;
+	}
+	return 0;
+}
+
+ssize_t write_maximumfanspeed(struct ecram *ecram,
+			      const struct model_config *model, bool state)
+{
+	u8 val = state ? MAXIMUMFANSPEED_ON : MAXIMUMFANSPEED_OFF;
+
+	ecram_write(ecram, model->registers->EXT_MAXIMUMFANSPEED, val);
+	return 0;
+}
+
+#define MINIFANCUVE_ON_COOL_ON 0x04
+#define MINIFANCUVE_ON_COOL_OFF 0xA0
+
+int read_minifancurve(struct ecram *ecram, const struct model_config *model,
+		      bool *state)
+{
+	int value =
+		ecram_read(ecram, model->registers->EXT_MINIFANCURVE_ON_COOL);
+
+	switch (value) {
+	case MINIFANCUVE_ON_COOL_ON:
+		*state = true;
+		break;
+	case MINIFANCUVE_ON_COOL_OFF:
+		*state = false;
+		break;
+	default:
+		pr_info("Unexpected value in MINIFANCURVE register: %d\n",
+			value);
+		return -1;
+	}
+	return 0;
+}
+
+ssize_t write_minifancurve(struct ecram *ecram,
+			   const struct model_config *model, bool state)
+{
+	u8 val = state ? MINIFANCUVE_ON_COOL_ON : MINIFANCUVE_ON_COOL_OFF;
+
+	ecram_write(ecram, model->registers->EXT_MINIFANCURVE_ON_COOL, val);
+	return 0;
+}
+
+#define KEYBOARD_BACKLIGHT_OFF 18
+#define KEYBOARD_BACKLIGHT_ON1 21
+#define KEYBOARD_BACKLIGHT_ON2 23
+
+int read_keyboard_backlight(struct ecram *ecram,
+			    const struct model_config *model, int *state)
+{
+	int value = ecram_read(ecram,
+			       model->registers->EXT_WHITE_KEYBOARD_BACKLIGHT);
+
+	*state = value;
+	return 0;
+}
+
+int write_keyboard_backlight(struct ecram *ecram,
+			     const struct model_config *model, int state)
+{
+	u8 val = state > 0 ? KEYBOARD_BACKLIGHT_ON1 : KEYBOARD_BACKLIGHT_OFF;
+
+	ecram_write(ecram, model->registers->EXT_WHITE_KEYBOARD_BACKLIGHT, val);
+	return 0;
+}
+
+#define FCT_RAPID_CHARGE_ON 0x07
+#define FCT_RAPID_CHARGE_OFF 0x08
+#define RAPID_CHARGE_ON 0x0
+#define RAPID_CHARGE_OFF 0x1
+
+int read_rapidcharge(acpi_handle acpihandle, int *state)
+{
+	unsigned long result;
+	int err;
+
+	err = eval_qcho(acpihandle, &result);
+	if (err)
+		return err;
+
+	*state = result;
+	return 0;
+}
+
+int write_rapidcharge(acpi_handle acpihandle, bool state)
+{
+	unsigned long fct_nr = state ? FCT_RAPID_CHARGE_ON :
+				       FCT_RAPID_CHARGE_OFF;
+	return exec_sbmc(acpihandle, fct_nr);
+}
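+
+/* Example: write_rapidcharge(handle, true) executes SBMC with argument
+ * FCT_RAPID_CHARGE_ON (0x07); write_rapidcharge(handle, false) uses
+ * FCT_RAPID_CHARGE_OFF (0x08).
+ */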
+
+/* ============================= */
+/* Data model for fan curve */
+/* ============================= */
+
+struct fancurve_point {
+	// rpm1 divided by 100
+	u8 rpm1_raw;
+	// rpm2 divided by 100
+	u8 rpm2_raw;
+	// >=2, <=5 (lower is faster); must be increasing by level
+	u8 accel;
+	// >=2, <=5 (lower is faster); must be increasing by level
+	u8 decel;
+
+	// min must be lower than or equal to max
+	// last level max must be 127
+	// <=127 cpu max temp for this level; must be increasing by level
+	u8 cpu_max_temp_celsius;
+	// <=127 cpu min temp for this level; must be increasing by level
+	u8 cpu_min_temp_celsius;
+	// <=127 gpu max temp for this level; must be increasing by level
+	u8 gpu_max_temp_celsius;
+	// <=127 gpu min temp for this level; must be increasing by level
+	u8 gpu_min_temp_celsius;
+	// <=127 ic max temp for this level; must be increasing by level
+	u8 ic_max_temp_celsius;
+	// <=127 ic min temp for this level; must be increasing by level
+	u8 ic_min_temp_celsius;
+};
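+
+/* Example point (illustrative values only): both fans at 1800 rpm while
+ * the CPU temperature is between 50 and 60 degrees celsius:
+ *
+ *	{ .rpm1_raw = 18, .rpm2_raw = 18, .accel = 2, .decel = 2,
+ *	  .cpu_min_temp_celsius = 50, .cpu_max_temp_celsius = 60, ... }
+ */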
+
+enum FANCURVE_ATTR {
+	FANCURVE_ATTR_PWM1 = 1,
+	FANCURVE_ATTR_PWM2 = 2,
+	FANCURVE_ATTR_CPU_TEMP = 3,
+	FANCURVE_ATTR_CPU_HYST = 4,
+	FANCURVE_ATTR_GPU_TEMP = 5,
+	FANCURVE_ATTR_GPU_HYST = 6,
+	FANCURVE_ATTR_IC_TEMP = 7,
+	FANCURVE_ATTR_IC_HYST = 8,
+	FANCURVE_ATTR_ACCEL = 9,
+	FANCURVE_ATTR_DECEL = 10,
+	FANCURVE_SIZE = 11,
+	FANCURVE_MINIFANCURVE_ON_COOL = 12
+};
+
+// used for clearing table entries
+static const struct fancurve_point fancurve_point_zero = { 0, 0, 0, 0, 0,
+							   0, 0, 0, 0, 0 };
+
+struct fancurve {
+	struct fancurve_point points[MAXFANCURVESIZE];
+	// number of points used; must be <= MAXFANCURVESIZE
+	size_t size;
+	// the point at which the fans currently run
+	size_t current_point_i;
+};
+
+// calculate derived values
+
+int fancurve_get_cpu_deltahyst(struct fancurve_point *point)
+{
+	return ((int)point->cpu_max_temp_celsius) -
+	       ((int)point->cpu_min_temp_celsius);
+}
+
+int fancurve_get_gpu_deltahyst(struct fancurve_point *point)
+{
+	return ((int)point->gpu_max_temp_celsius) -
+	       ((int)point->gpu_min_temp_celsius);
+}
+
+int fancurve_get_ic_deltahyst(struct fancurve_point *point)
+{
+	return ((int)point->ic_max_temp_celsius) -
+	       ((int)point->ic_min_temp_celsius);
+}
+
+// validation functions
+
+bool fancurve_is_valid_min_temp(int min_temp)
+{
+	return min_temp >= 0 && min_temp <= 127;
+}
+
+bool fancurve_is_valid_max_temp(int max_temp)
+{
+	return max_temp >= 0 && max_temp <= 127;
+}
+
+// setters with validation
+// - make hwmon implementation easier
+// - keep fancurve valid, otherwise EC will not properly control fan
+
+bool fancurve_set_rpm1(struct fancurve *fancurve, int point_id, int rpm)
+{
+	bool valid = point_id == 0 ? rpm == 0 : (rpm >= 0 && rpm <= 4500);
+
+	if (valid)
+		fancurve->points[point_id].rpm1_raw = rpm / 100;
+	return valid;
+}
+
+bool fancurve_set_rpm2(struct fancurve *fancurve, int point_id, int rpm)
+{
+	bool valid = point_id == 0 ? rpm == 0 : (rpm >= 0 && rpm <= 4500);
+
+	if (valid)
+		fancurve->points[point_id].rpm2_raw = rpm / 100;
+	return valid;
+}
+
+// TODO: remove { ... } from single line if body
+
+bool fancurve_set_accel(struct fancurve *fancurve, int point_id, int accel)
+{
+	bool valid = accel >= 2 && accel <= 5;
+
+	if (valid)
+		fancurve->points[point_id].accel = accel;
+	return valid;
+}
+
+bool fancurve_set_decel(struct fancurve *fancurve, int point_id, int decel)
+{
+	bool valid = decel >= 2 && decel <= 5;
+
+	if (valid)
+		fancurve->points[point_id].decel = decel;
+	return valid;
+}
+
+bool fancurve_set_cpu_temp_max(struct fancurve *fancurve, int point_id,
+			       int value)
+{
+	bool valid = fancurve_is_valid_max_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].cpu_max_temp_celsius = value;
+
+	return valid;
+}
+
+bool fancurve_set_gpu_temp_max(struct fancurve *fancurve, int point_id,
+			       int value)
+{
+	bool valid = fancurve_is_valid_max_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].gpu_max_temp_celsius = value;
+	return valid;
+}
+
+bool fancurve_set_ic_temp_max(struct fancurve *fancurve, int point_id,
+			      int value)
+{
+	bool valid = fancurve_is_valid_max_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].ic_max_temp_celsius = value;
+	return valid;
+}
+
+bool fancurve_set_cpu_temp_min(struct fancurve *fancurve, int point_id,
+			       int value)
+{
+	bool valid = fancurve_is_valid_max_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].cpu_min_temp_celsius = value;
+	return valid;
+}
+
+bool fancurve_set_gpu_temp_min(struct fancurve *fancurve, int point_id,
+			       int value)
+{
+	bool valid = fancurve_is_valid_max_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].gpu_min_temp_celsius = value;
+	return valid;
+}
+
+bool fancurve_set_ic_temp_min(struct fancurve *fancurve, int point_id,
+			      int value)
+{
+	bool valid = fancurve_is_valid_max_temp(value);
+
+	if (valid)
+		fancurve->points[point_id].ic_min_temp_celsius = value;
+	return valid;
+}
+
+bool fancurve_set_size(struct fancurve *fancurve, int size, bool init_values)
+{
+	bool valid = size >= 1 && size <= MAXFANCURVESIZE;
+
+	if (!valid)
+		return false;
+	if (init_values && size < fancurve->size) {
+		// fancurve size is decreased, but the last entry always
+		// needs 127 temperatures
+		// Note: size >= 1
+		fancurve->points[size - 1].cpu_max_temp_celsius = 127;
+		fancurve->points[size - 1].ic_max_temp_celsius = 127;
+		fancurve->points[size - 1].gpu_max_temp_celsius = 127;
+	}
+	if (init_values && size > fancurve->size) {
+		// fancurve increased, so new entries need valid values
+		int i;
+		int last = fancurve->size > 0 ? fancurve->size - 1 : 0;
+
+		for (i = fancurve->size; i < size; ++i)
+			fancurve->points[i] = fancurve->points[last];
+	}
+	return true;
+}
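+
+/* Example: with init_values, growing a curve of size 3 to size 5 copies
+ * point 2 into the new points 3 and 4; shrinking it to size 2 forces the
+ * max temperatures of point 1 (the new last point) to 127.
+ */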
+
+/* Read the fan curve from the EC.
+ *
+ * In newer models (>=2022) there is an ACPI/WMI method to read the fan
+ * curve as a whole, so read/write the fan table as a whole to use the
+ * same interface for both cases.
+ *
+ * It reads all points from EC memory, even if the stored fan curve is
+ * smaller, so it can contain 0 entries.
+ */
+static int read_fancurve(struct ecram *ecram, const struct model_config *model,
+			 struct fancurve *fancurve)
+{
+	size_t i = 0;
+
+	for (i = 0; i < MAXFANCURVESIZE; ++i) {
+		struct fancurve_point *point = &fancurve->points[i];
+
+		point->rpm1_raw =
+			ecram_read(ecram, model->registers->EXT_FAN1_BASE + i);
+		point->rpm2_raw =
+			ecram_read(ecram, model->registers->EXT_FAN2_BASE + i);
+
+		point->accel = ecram_read(
+			ecram, model->registers->EXT_FAN_ACC_BASE + i);
+		point->decel = ecram_read(
+			ecram, model->registers->EXT_FAN_DEC_BASE + i);
+		point->cpu_max_temp_celsius =
+			ecram_read(ecram, model->registers->EXT_CPU_TEMP + i);
+		point->cpu_min_temp_celsius = ecram_read(
+			ecram, model->registers->EXT_CPU_TEMP_HYST + i);
+		point->gpu_max_temp_celsius =
+			ecram_read(ecram, model->registers->EXT_GPU_TEMP + i);
+		point->gpu_min_temp_celsius = ecram_read(
+			ecram, model->registers->EXT_GPU_TEMP_HYST + i);
+		point->ic_max_temp_celsius =
+			ecram_read(ecram, model->registers->EXT_VRM_TEMP + i);
+		point->ic_min_temp_celsius = ecram_read(
+			ecram, model->registers->EXT_VRM_TEMP_HYST + i);
+	}
+
+	// Do not trust the hardware; it might suddenly report
+	// a larger size, so clamp it.
+	fancurve->size =
+		ecram_read(ecram, model->registers->EXT_FAN_POINTS_SIZE);
+	fancurve->size =
+		min(fancurve->size, (typeof(fancurve->size))(MAXFANCURVESIZE));
+	fancurve->current_point_i =
+		ecram_read(ecram, model->registers->EXT_FAN_CUR_POINT);
+	fancurve->current_point_i =
+		min(fancurve->current_point_i, fancurve->size);
+	return 0;
+}
+
+static int write_fancurve(struct ecram *ecram, const struct model_config *model,
+			  const struct fancurve *fancurve, bool write_size)
+{
+	size_t i;
+
+	// Reset fan update counters (try to avoid any race conditions)
+	ecram_write(ecram, 0xC5FE, 0);
+	ecram_write(ecram, 0xC5FF, 0);
+	for (i = 0; i < MAXFANCURVESIZE; ++i) {
+		// Entries for points larger than fancurve size should be
+		// cleared to 0
+		const struct fancurve_point *point =
+			i < fancurve->size ? &fancurve->points[i] :
+					     &fancurve_point_zero;
+
+		ecram_write(ecram, model->registers->EXT_FAN1_BASE + i,
+			    point->rpm1_raw);
+		ecram_write(ecram, model->registers->EXT_FAN2_BASE + i,
+			    point->rpm2_raw);
+
+		ecram_write(ecram, model->registers->EXT_FAN_ACC_BASE + i,
+			    point->accel);
+		ecram_write(ecram, model->registers->EXT_FAN_DEC_BASE + i,
+			    point->decel);
+
+		ecram_write(ecram, model->registers->EXT_CPU_TEMP + i,
+			    point->cpu_max_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_CPU_TEMP_HYST + i,
+			    point->cpu_min_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_GPU_TEMP + i,
+			    point->gpu_max_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_GPU_TEMP_HYST + i,
+			    point->gpu_min_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_VRM_TEMP + i,
+			    point->ic_max_temp_celsius);
+		ecram_write(ecram, model->registers->EXT_VRM_TEMP_HYST + i,
+			    point->ic_min_temp_celsius);
+	}
+
+	if (write_size) {
+		ecram_write(ecram, model->registers->EXT_FAN_POINTS_SIZE,
+			    fancurve->size);
+	}
+
+	// Reset the current fan level to 0, so the algorithm in the EC
+	// selects the fan curve point again, resetting hysteresis
+	// effects
+	ecram_write(ecram, model->registers->EXT_FAN_CUR_POINT, 0);
+
+	// Reset internal fan levels
+	ecram_write(ecram, 0xC634, 0); // CPU
+	ecram_write(ecram, 0xC635, 0); // GPU
+	ecram_write(ecram, 0xC636, 0); // SENSOR
+
+	return 0;
+}
+
+static ssize_t fancurve_print_seqfile(const struct fancurve *fancurve,
+				      struct seq_file *s)
+{
+	int i;
+
+	seq_printf(
+		s,
+		"rpm1|rpm2|acceleration|deceleration|cpu_min_temp|cpu_max_temp|gpu_min_temp|gpu_max_temp|ic_min_temp|ic_max_temp\n");
+	for (i = 0; i < fancurve->size; ++i) {
+		const struct fancurve_point *point = &fancurve->points[i];
+
+		seq_printf(
+			s, "%d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\n",
+			point->rpm1_raw * 100, point->rpm2_raw * 100,
+			point->accel, point->decel, point->cpu_min_temp_celsius,
+			point->cpu_max_temp_celsius,
+			point->gpu_min_temp_celsius,
+			point->gpu_max_temp_celsius, point->ic_min_temp_celsius,
+			point->ic_max_temp_celsius);
+	}
+	return 0;
+}
+
+/* ============================= */
+/* Global and shared data between */
+/* all calls to this module */
+/* ============================ */
+// Implemented like ideapad-laptop.c but currently still
+// without dynamic memory allocation (instead a global _priv)
+
+struct legion_private {
+	struct platform_device *platform_device;
+	// TODO: remove or keep? init?
+	// struct acpi_device *adev;
+
+	// Method to access ECRAM
+	struct ecram ecram;
+	// Configuration with registers and ECRAM access method
+	const struct model_config *conf;
+
+	// TODO: maybe refactor and keep only local to each function
+	// last known fan curve
+	struct fancurve fancurve;
+	// configured fan curve from user space
+	struct fancurve fancurve_configured;
+
+	// update lock, when partial values of fancurve are changed
+	struct mutex fancurve_mutex;
+
+	// interfaces
+	struct dentry *debugfs_dir;
+	struct device *hwmon_dev;
+	struct platform_profile_handler platform_profile_handler;
+
+	// TODO: remove?
+	bool loaded;
+};
+
+// shared between different drivers: WMI, platform; protected by mutex
+static struct legion_private *legion_shared;
+static struct legion_private _priv;
+static DEFINE_MUTEX(legion_shared_mutex);
+
+static int legion_shared_init(struct legion_private *priv)
+{
+	int ret;
+
+	mutex_lock(&legion_shared_mutex);
+
+	if (!legion_shared) {
+		legion_shared = priv;
+		mutex_init(&legion_shared->fancurve_mutex);
+		ret = 0;
+	} else {
+		pr_warn("Found multiple platform devices\n");
+		ret = -EINVAL;
+	}
+
+	priv->loaded = true;
+	mutex_unlock(&legion_shared_mutex);
+
+	return ret;
+}
+
+static void legion_shared_exit(struct legion_private *priv)
+{
+	pr_info("Unloading legion shared\n");
+	mutex_lock(&legion_shared_mutex);
+
+	if (legion_shared == priv)
+		legion_shared = NULL;
+
+	mutex_unlock(&legion_shared_mutex);
+	pr_info("Unloading legion shared done\n");
+}
+
+/* ============================= */
+/* debugfs interface */
+/* ============================ */
+
+static int debugfs_ecmemory_show(struct seq_file *s, void *unused)
+{
+	struct legion_private *priv = s->private;
+	size_t offset;
+
+	for (offset = 0; offset < priv->conf->memoryio_size; ++offset) {
+		char value = ecram_read(&priv->ecram,
+					priv->conf->memoryio_physical_ec_start +
+					offset);
+
+		seq_write(s, &value, 1);
+	}
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(debugfs_ecmemory);
+
+static int debugfs_fancurve_show(struct seq_file *s, void *unused)
+{
+	struct legion_private *priv = s->private;
+	bool is_minifancurve;
+	bool is_lockfancontroller;
+	bool is_maximumfanspeed;
+	int err;
+
+	seq_printf(s, "EC Chip ID: %x\n", read_ec_id(&priv->ecram, priv->conf));
+	seq_printf(s, "EC Chip Version: %x\n",
+		   read_ec_version(&priv->ecram, priv->conf));
+	seq_printf(s, "legion_laptop features: %s\n", LEGIONFEATURES);
+	seq_printf(s, "legion_laptop ec_readonly: %d\n", ec_readonly);
+	read_fancurve(&priv->ecram, priv->conf, &priv->fancurve);
+
+	seq_printf(s, "minifancurve feature enabled: %d\n",
+		   priv->conf->has_minifancurve);
+	err = read_minifancurve(&priv->ecram, priv->conf, &is_minifancurve);
+	seq_printf(s, "minifancurve on cool: %s\n",
+		   err ? "error" : (is_minifancurve ? "true" : "false"));
+	err = read_lockfancontroller(&priv->ecram, priv->conf,
+				     &is_lockfancontroller);
+	seq_printf(s, "lock fan controller: %s\n",
+		   err ? "error" : (is_lockfancontroller ? "true" : "false"));
+	err = read_maximumfanspeed(&priv->ecram, priv->conf,
+				   &is_maximumfanspeed);
+	seq_printf(s, "enable maximumfanspeed: %s\n",
+		   err ? "error" : (is_maximumfanspeed ? "true" : "false"));
+	seq_printf(s, "enable maximumfanspeed status: %d\n", err);
+
+	seq_printf(s, "fan curve current point id: %zu\n",
+		   priv->fancurve.current_point_i);
+	seq_printf(s, "fan curve points size: %zu\n", priv->fancurve.size);
+
+	seq_puts(s, "Current fan curve in hardware (embedded controller):\n");
+	fancurve_print_seqfile(&priv->fancurve, s);
+	seq_puts(s, "=====================\n");
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(debugfs_fancurve);
+
+static void legion_debugfs_init(struct legion_private *priv)
+{
+	struct dentry *dir;
+
+	// TODO: remove this note
+	// Note: as in other kernel modules, do not catch errors here
+	// because if the kernel is built without debugfs this
+	// will return an error, but the module still has to
+	// work, just without debugfs
+	// TODO: what permissions; some modules use 400,
+	// others use 444
+	dir = debugfs_create_dir(LEGION_DRVR_SHORTNAME, NULL);
+	debugfs_create_file("fancurve", 0444, dir, priv,
+			    &debugfs_fancurve_fops);
+	debugfs_create_file("ecmemory", 0444, dir, priv,
+			    &debugfs_ecmemory_fops);
+
+	priv->debugfs_dir = dir;
+}
+
+static void legion_debugfs_exit(struct legion_private *priv)
+{
+	pr_info("Unloading legion debugfs\n");
+	// The following does nothing if the pointer is NULL
+	debugfs_remove_recursive(priv->debugfs_dir);
+	priv->debugfs_dir = NULL;
+	pr_info("Unloading legion debugfs done\n");
+}
+
+/* ============================= */
+/* sysfs interface */
+/* ============================ */
+
+static ssize_t powermode_show(struct device *dev, struct device_attribute *attr,
+			      char *buf)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	int power_mode = read_powermode(&priv->ecram, priv->conf);
+
+	return sysfs_emit(buf, "%d\n", power_mode);
+}
+
+static ssize_t powermode_store(struct device *dev,
+			       struct device_attribute *attr, const char *buf,
+			       size_t count)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	int powermode;
+	int err;
+
+	err = kstrtoint(buf, 0, &powermode);
+	if (err)
+		return err;
+
+	err = write_powermode(&priv->ecram, priv->conf, powermode);
+	if (err)
+		return -EINVAL;
+
+	// TODO: better?
+	// We have to wait a bit until the change is done in hardware so
+	// that a readback done after notifying returns the correct value;
+	// otherwise the notified reader will read the old value.
+	msleep(500);
+	platform_profile_notify();
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(powermode);
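+
+/* Usage sketch from user space (the exact path depends on the platform
+ * device instance; see the header comment at the top of this file):
+ *
+ *	cat /sys/module/legion_laptop/drivers/platform:legion/PNP0C09:00/powermode
+ *	echo 2 > /sys/module/legion_laptop/drivers/platform:legion/PNP0C09:00/powermode
+ */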
+
+static ssize_t lockfancontroller_show(struct device *dev,
+				      struct device_attribute *attr, char *buf)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	bool is_lockfancontroller;
+	int err;
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = read_lockfancontroller(&priv->ecram, priv->conf,
+				     &is_lockfancontroller);
+	mutex_unlock(&priv->fancurve_mutex);
+	if (err)
+		return -EINVAL;
+
+	return sysfs_emit(buf, "%d\n", is_lockfancontroller);
+}
+
+static ssize_t lockfancontroller_store(struct device *dev,
+				       struct device_attribute *attr,
+				       const char *buf, size_t count)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	bool is_lockfancontroller;
+	int err;
+
+	err = kstrtobool(buf, &is_lockfancontroller);
+	if (err)
+		return err;
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = write_lockfancontroller(&priv->ecram, priv->conf,
+				      is_lockfancontroller);
+	mutex_unlock(&priv->fancurve_mutex);
+	if (err)
+		return -EINVAL;
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(lockfancontroller);
+
+static ssize_t keyboard_backlight_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	int state;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	read_keyboard_backlight(&priv->ecram, priv->conf, &state);
+	return sysfs_emit(buf, "%d\n", state);
+}
+
+static ssize_t keyboard_backlight_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	int state;
+	int err;
+
+	err = kstrtoint(buf, 0, &state);
+	if (err)
+		return err;
+
+	err = write_keyboard_backlight(&priv->ecram, priv->conf, state);
+	if (err)
+		return -EINVAL;
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(keyboard_backlight);
+
+static struct attribute *legion_sysfs_attributes[] = {
+	&dev_attr_powermode.attr, &dev_attr_lockfancontroller.attr,
+	&dev_attr_keyboard_backlight.attr, NULL
+};
+
+static const struct attribute_group legion_attribute_group = {
+	.attrs = legion_sysfs_attributes
+};
+
+static int legion_sysfs_init(struct legion_private *priv)
+{
+	return device_add_group(&priv->platform_device->dev,
+				&legion_attribute_group);
+}
+
+static void legion_sysfs_exit(struct legion_private *priv)
+{
+	pr_info("Unloading legion sysfs\n");
+	device_remove_group(&priv->platform_device->dev,
+			    &legion_attribute_group);
+	pr_info("Unloading legion sysfs done\n");
+}
+
+/* ============================= */
+/* WMI + ACPI */
+/* ============================ */
+// heavily based on ideapad-laptop.c
+
+// TODO: proper names if meaning of all events is clear
+enum LEGION_WMI_EVENT {
+	LEGION_WMI_EVENT_GAMEZONE = 1,
+	LEGION_EVENT_A,
+	LEGION_EVENT_B,
+	LEGION_EVENT_C,
+	LEGION_EVENT_D,
+	LEGION_EVENT_E,
+	LEGION_EVENT_F,
+	LEGION_EVENT_G
+};
+
+struct legion_wmi_private {
+	enum LEGION_WMI_EVENT event;
+};
+
+//static void legion_wmi_notify2(u32 value, void *context)
+// {
+//	pr_info("WMI notify\n" );
+// }
+
+static void legion_wmi_notify(struct wmi_device *wdev, union acpi_object *data)
+{
+	struct legion_wmi_private *wpriv;
+	struct legion_private *priv;
+
+	mutex_lock(&legion_shared_mutex);
+	priv = legion_shared;
+	if (!priv || !priv->loaded) {
+		pr_info("Received WMI event while not initialized!\n");
+		goto unlock;
+	}
+
+	wpriv = dev_get_drvdata(&wdev->dev);
+	switch (wpriv->event) {
+	case LEGION_EVENT_A:
+		pr_info("Fan event: legion type: %d; acpi type: %d (%d=integer)",
+			wpriv->event, data->type, ACPI_TYPE_INTEGER);
+		// TODO: here it is too early (first unlock the mutex, then wait a bit)
+		//platform_profile_notify();
+		break;
+	default:
+		pr_info("Event: legion type: %d; acpi type: %d (%d=integer)",
+			wpriv->event, data->type, ACPI_TYPE_INTEGER);
+		break;
+	}
+
+unlock:
+	mutex_unlock(&legion_shared_mutex);
+	// TODO: fix this!
+	// Problem: we get an event just before the powermode change (from the key?),
+	// so if we notify too early, it will read the old power mode/platform profile.
+	msleep(500);
+	platform_profile_notify();
+}
+
+static int legion_wmi_probe(struct wmi_device *wdev, const void *context)
+{
+	struct legion_wmi_private *wpriv;
+
+	wpriv = devm_kzalloc(&wdev->dev, sizeof(*wpriv), GFP_KERNEL);
+	if (!wpriv)
+		return -ENOMEM;
+
+	*wpriv = *(const struct legion_wmi_private *)context;
+
+	dev_set_drvdata(&wdev->dev, wpriv);
+	dev_info(&wdev->dev, "Registered after probing for WMI.\n");
+	return 0;
+}
+
+static const struct legion_wmi_private legion_wmi_context_gamezone = {
+	.event = LEGION_WMI_EVENT_GAMEZONE
+};
+static const struct legion_wmi_private legion_wmi_context_a = {
+	.event = LEGION_EVENT_A
+};
+static const struct legion_wmi_private legion_wmi_context_b = {
+	.event = LEGION_EVENT_B
+};
+static const struct legion_wmi_private legion_wmi_context_c = {
+	.event = LEGION_EVENT_C
+};
+static const struct legion_wmi_private legion_wmi_context_d = {
+	.event = LEGION_EVENT_D
+};
+static const struct legion_wmi_private legion_wmi_context_e = {
+	.event = LEGION_EVENT_E
+};
+static const struct legion_wmi_private legion_wmi_context_f = {
+	.event = LEGION_EVENT_F
+};
+
+// TODO: check if this is really a method
+#define LEGION_WMI_GAMEZONE_GUID "887B54E3-DDDC-4B2C-8B88-68A26A8835D0"
+
+#define LEGION_WMI_GUID_FAN_EVENT "D320289E-8FEA-41E0-86F9-611D83151B5F"
+#define LEGION_WMI_GUID_FAN2_EVENT "bc72a435-e8c1-4275-b3e2-d8b8074aba59"
+#define LEGION_WMI_GUID_GAMEZONE_KEY_EVENT \
+	"10afc6d9-ea8b-4590-a2e7-1cd3c84bb4b1"
+#define LEGION_WMI_GUID_GAMEZONE_GPU_EVENT \
+	"bfd42481-aee3-4502-a107-afb68425c5f8"
+#define LEGION_WMI_GUID_GAMEZONE_OC_EVENT "d062906b-12d4-4510-999d-4831ee80e985"
+#define LEGION_WMI_GUID_GAMEZONE_TEMP_EVENT \
+	"bfd42481-aee3-4501-a107-afb68425c5f8"
+//#define LEGION_WMI_GUID_GAMEZONE_DATA_EVENT "887b54e3-dddc-4b2c-8b88-68a26a8835d0"
+
+static const struct wmi_device_id legion_wmi_ids[] = {
+	{ LEGION_WMI_GAMEZONE_GUID, &legion_wmi_context_gamezone },
+	{ LEGION_WMI_GUID_FAN_EVENT, &legion_wmi_context_a },
+	{ LEGION_WMI_GUID_FAN2_EVENT, &legion_wmi_context_b },
+	{ LEGION_WMI_GUID_GAMEZONE_KEY_EVENT, &legion_wmi_context_c },
+	{ LEGION_WMI_GUID_GAMEZONE_GPU_EVENT, &legion_wmi_context_d },
+	{ LEGION_WMI_GUID_GAMEZONE_OC_EVENT, &legion_wmi_context_e },
+	{ LEGION_WMI_GUID_GAMEZONE_TEMP_EVENT, &legion_wmi_context_f },
+	{ "8FC0DE0C-B4E4-43FD-B0F3-8871711C1294",
+	  &legion_wmi_context_gamezone }, /* Legion 5 */
+	{},
+};
+MODULE_DEVICE_TABLE(wmi, legion_wmi_ids);
+
+static struct wmi_driver legion_wmi_driver = {
+	.driver = {
+		.name = "legion_wmi",
+	},
+	.id_table = legion_wmi_ids,
+	.probe = legion_wmi_probe,
+	.notify = legion_wmi_notify,
+};
+
+//acpi_status status = wmi_install_notify_handler(LEGION_WMI_GAMEZONE_GUID,
+//	legion_wmi_notify2, NULL);
+//if (ACPI_FAILURE(status)) {
+//	return -ENODEV;
+//}
+//return 0;
+
+static int legion_wmi_init(void)
+{
+	return wmi_driver_register(&legion_wmi_driver);
+}
+
+static void legion_wmi_exit(void)
+{
+	// TODO: remove this
+	pr_info("Unloading legion WMI\n");
+
+	//wmi_remove_notify_handler(LEGION_WMI_GAMEZONE_GUID);
+	wmi_driver_unregister(&legion_wmi_driver);
+	pr_info("Unloading legion WMI done\n");
+}
+
+/* ============================= */
+/* Platform profile */
+/* ============================ */
+
+enum LEGION_POWERMODE {
+	LEGION_POWERMODE_BALANCED = 0,
+	LEGION_POWERMODE_PERFORMANCE = 1,
+	LEGION_POWERMODE_QUIET = 2,
+};
+
+static int legion_platform_profile_get(struct platform_profile_handler *pprof,
+				       enum platform_profile_option *profile)
+{
+	int powermode;
+	struct legion_private *priv;
+
+	priv = container_of(pprof, struct legion_private,
+			    platform_profile_handler);
+	powermode = read_powermode(&priv->ecram, priv->conf);
+
+	switch (powermode) {
+	case LEGION_POWERMODE_BALANCED:
+		*profile = PLATFORM_PROFILE_BALANCED;
+		break;
+	case LEGION_POWERMODE_PERFORMANCE:
+		*profile = PLATFORM_PROFILE_PERFORMANCE;
+		break;
+	case LEGION_POWERMODE_QUIET:
+		*profile = PLATFORM_PROFILE_QUIET;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int legion_platform_profile_set(struct platform_profile_handler *pprof,
+				       enum platform_profile_option profile)
+{
+	int powermode;
+	struct legion_private *priv;
+
+	priv = container_of(pprof, struct legion_private,
+			    platform_profile_handler);
+
+	switch (profile) {
+	case PLATFORM_PROFILE_BALANCED:
+		powermode = LEGION_POWERMODE_BALANCED;
+		break;
+	case PLATFORM_PROFILE_PERFORMANCE:
+		powermode = LEGION_POWERMODE_PERFORMANCE;
+		break;
+	case PLATFORM_PROFILE_QUIET:
+		powermode = LEGION_POWERMODE_QUIET;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return write_powermode(&priv->ecram, priv->conf, powermode);
+}
+
+static int legion_platform_profile_init(struct legion_private *priv)
+{
+	int err;
+
+	priv->platform_profile_handler.profile_get =
+		legion_platform_profile_get;
+	priv->platform_profile_handler.profile_set =
+		legion_platform_profile_set;
+
+	set_bit(PLATFORM_PROFILE_QUIET, priv->platform_profile_handler.choices);
+	set_bit(PLATFORM_PROFILE_BALANCED,
+		priv->platform_profile_handler.choices);
+	set_bit(PLATFORM_PROFILE_PERFORMANCE,
+		priv->platform_profile_handler.choices);
+
+	err = platform_profile_register(&priv->platform_profile_handler);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static void legion_platform_profile_exit(struct legion_private *priv)
+{
+	pr_info("Unloading legion platform profile\n");
+	platform_profile_remove();
+	pr_info("Unloading legion platform profile done\n");
+}
+
+
+/* ============================= */
+/* hwmon interface */
+/* ============================ */
+
+// TODO: register_group or register_info?
+
+// TODO: use one common function (like here) or one function per attribute?
+static ssize_t sensor_label_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	int sensor_id = (to_sensor_dev_attr(attr))->index;
+	const char *label;
+
+	switch (sensor_id) {
+	case SENSOR_CPU_TEMP_ID:
+		label = "CPU Temperature\n";
+		break;
+	case SENSOR_GPU_TEMP_ID:
+		label = "GPU Temperature\n";
+		break;
+	case SENSOR_IC_TEMP_ID:
+		label = "IC Temperature\n";
+		break;
+	case SENSOR_FAN1_RPM_ID:
+		label = "Fan 1\n";
+		break;
+	case SENSOR_FAN2_RPM_ID:
+		label = "Fan 2\n";
+		break;
+	case SENSOR_FAN1_TARGET_RPM_ID:
+		label = "Fan 1 Target\n";
+		break;
+	case SENSOR_FAN2_TARGET_RPM_ID:
+		label = "Fan 2 Target\n";
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return sysfs_emit(buf, "%s", label);
+}
+
+// TODO: use one common function (like here) or one function per attribute?
+static ssize_t sensor_show(struct device *dev, struct device_attribute *devattr,
+			   char *buf)
+{
+	struct legion_private *priv = dev_get_drvdata(dev);
+	int sensor_id = (to_sensor_dev_attr(devattr))->index;
+	struct sensor_values values;
+	int outval;
+
+	read_sensor_values(&priv->ecram, priv->conf, &values);
+
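+	// hwmon convention: temperatures are reported in millidegrees
+	// celsius, fan speeds in RPM.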
+	switch (sensor_id) {
+	case SENSOR_CPU_TEMP_ID:
+		outval = 1000 * values.cpu_temp_celsius;
+		break;
+	case SENSOR_GPU_TEMP_ID:
+		outval = 1000 * values.gpu_temp_celsius;
+		break;
+	case SENSOR_IC_TEMP_ID:
+		outval = 1000 * values.ic_temp_celsius;
+		break;
+	case SENSOR_FAN1_RPM_ID:
+		outval = values.fan1_rpm;
+		break;
+	case SENSOR_FAN2_RPM_ID:
+		outval = values.fan2_rpm;
+		break;
+	case SENSOR_FAN1_TARGET_RPM_ID:
+		outval = values.fan1_target_rpm;
+		break;
+	case SENSOR_FAN2_TARGET_RPM_ID:
+		outval = values.fan2_target_rpm;
+		break;
+	default:
+		pr_info("Error reading sensor value with id %d\n", sensor_id);
+		return -EOPNOTSUPP;
+	}
+
+	return sysfs_emit(buf, "%d\n", outval);
+}
+
+static SENSOR_DEVICE_ATTR_RO(temp1_input, sensor, SENSOR_CPU_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(temp1_label, sensor_label, SENSOR_CPU_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(temp2_input, sensor, SENSOR_GPU_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(temp2_label, sensor_label, SENSOR_GPU_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(temp3_input, sensor, SENSOR_IC_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(temp3_label, sensor_label, SENSOR_IC_TEMP_ID);
+static SENSOR_DEVICE_ATTR_RO(fan1_input, sensor, SENSOR_FAN1_RPM_ID);
+static SENSOR_DEVICE_ATTR_RO(fan1_label, sensor_label, SENSOR_FAN1_RPM_ID);
+static SENSOR_DEVICE_ATTR_RO(fan2_input, sensor, SENSOR_FAN2_RPM_ID);
+static SENSOR_DEVICE_ATTR_RO(fan2_label, sensor_label, SENSOR_FAN2_RPM_ID);
+static SENSOR_DEVICE_ATTR_RO(fan1_target, sensor, SENSOR_FAN1_TARGET_RPM_ID);
+static SENSOR_DEVICE_ATTR_RO(fan2_target, sensor, SENSOR_FAN2_TARGET_RPM_ID);
+
+static struct attribute *sensor_hwmon_attributes[] = {
+	&sensor_dev_attr_temp1_input.dev_attr.attr,
+	&sensor_dev_attr_temp1_label.dev_attr.attr,
+	&sensor_dev_attr_temp2_input.dev_attr.attr,
+	&sensor_dev_attr_temp2_label.dev_attr.attr,
+	&sensor_dev_attr_temp3_input.dev_attr.attr,
+	&sensor_dev_attr_temp3_label.dev_attr.attr,
+	&sensor_dev_attr_fan1_input.dev_attr.attr,
+	&sensor_dev_attr_fan1_label.dev_attr.attr,
+	&sensor_dev_attr_fan2_input.dev_attr.attr,
+	&sensor_dev_attr_fan2_label.dev_attr.attr,
+	&sensor_dev_attr_fan1_target.dev_attr.attr,
+	&sensor_dev_attr_fan2_target.dev_attr.attr,
+	NULL
+};
+
|
|
+static ssize_t autopoint_show(struct device *dev,
|
|
+ struct device_attribute *devattr, char *buf)
|
|
+{
|
|
+ struct fancurve fancurve;
|
|
+ int err;
|
|
+ int value;
|
|
+ struct legion_private *priv = dev_get_drvdata(dev);
|
|
+ int fancurve_attr_id = to_sensor_dev_attr_2(devattr)->nr;
|
|
+ int point_id = to_sensor_dev_attr_2(devattr)->index;
|
|
+
|
|
+ mutex_lock(&priv->fancurve_mutex);
|
|
+ err = read_fancurve(&priv->ecram, priv->conf, &fancurve);
|
|
+ mutex_unlock(&priv->fancurve_mutex);
|
|
+
|
|
+ if (err) {
|
|
+ pr_info("Reading fancurve failed\n");
|
|
+ return -EOPNOTSUPP;
|
|
+ }
|
|
+ if (!(point_id >= 0 && point_id < MAXFANCURVESIZE)) {
|
|
+ pr_info("Reading fancurve failed due to wrong point id: %d\n",
|
|
+ point_id);
|
|
+ return -EOPNOTSUPP;
|
|
+ }
|
|
+
|
|
+ switch (fancurve_attr_id) {
|
|
+ case FANCURVE_ATTR_PWM1:
|
|
+ value = fancurve.points[point_id].rpm1_raw * 100;
|
|
+ break;
|
|
+ case FANCURVE_ATTR_PWM2:
|
|
+ value = fancurve.points[point_id].rpm2_raw * 100;
|
|
+ break;
|
|
+ case FANCURVE_ATTR_CPU_TEMP:
|
|
+ value = fancurve.points[point_id].cpu_max_temp_celsius;
|
|
+ break;
|
|
+ case FANCURVE_ATTR_CPU_HYST:
|
|
+ value = fancurve.points[point_id].cpu_min_temp_celsius;
|
|
+ break;
|
|
+ case FANCURVE_ATTR_GPU_TEMP:
|
|
+ value = fancurve.points[point_id].gpu_max_temp_celsius;
|
|
+ break;
|
|
+ case FANCURVE_ATTR_GPU_HYST:
|
|
+ value = fancurve.points[point_id].gpu_min_temp_celsius;
|
|
+ break;
|
|
+ case FANCURVE_ATTR_IC_TEMP:
|
|
+ value = fancurve.points[point_id].ic_max_temp_celsius;
|
|
+ break;
|
|
+ case FANCURVE_ATTR_IC_HYST:
|
|
+ value = fancurve.points[point_id].ic_min_temp_celsius;
|
|
+ break;
|
|
+ case FANCURVE_ATTR_ACCEL:
|
|
+ value = fancurve.points[point_id].accel;
|
|
+ break;
|
|
+ case FANCURVE_ATTR_DECEL:
|
|
+ value = fancurve.points[point_id].decel;
|
|
+ break;
|
|
+ case FANCURVE_SIZE:
|
|
+ value = fancurve.size;
|
|
+ break;
|
|
+ default:
|
|
+ pr_info("Reading fancurve failed due to wrong attribute id: %d\n",
|
|
+ fancurve_attr_id);
|
|
+ return -EOPNOTSUPP;
|
|
+ }
|
|
+
|
|
+ return sprintf(buf, "%d\n", value);
|
|
+}
|
|
+
|
|
+static ssize_t autopoint_store(struct device *dev,
|
|
+ struct device_attribute *devattr,
|
|
+ const char *buf, size_t count)
|
|
+{
|
|
+ struct fancurve fancurve;
|
|
+ int err;
|
|
+ int value;
|
|
+ bool valid;
|
|
+ struct legion_private *priv = dev_get_drvdata(dev);
|
|
+ int fancurve_attr_id = to_sensor_dev_attr_2(devattr)->nr;
|
|
+ int point_id = to_sensor_dev_attr_2(devattr)->index;
|
|
+
|
|
+ if (!(point_id >= 0 && point_id < MAXFANCURVESIZE)) {
|
|
+ pr_info("Reading fancurve failed due to wrong point id: %d\n",
|
|
+ point_id);
|
|
+ err = -EOPNOTSUPP;
|
|
+ goto error;
|
|
+ }
|
|
+
|
|
+ err = kstrtoint(buf, 0, &value);
|
|
+ if (err) {
|
|
+ pr_info("Parse for hwmon store is not succesful: error:%d; point_id: %d; fancurve_attr_id: %d\\n",
|
|
+ err, point_id, fancurve_attr_id);
|
|
+ goto error;
|
|
+ }
|
|
+
|
|
+ mutex_lock(&priv->fancurve_mutex);
|
|
+ err = read_fancurve(&priv->ecram, priv->conf, &fancurve);
|
|
+
|
|
+ if (err) {
|
|
+ pr_info("Reading fancurve failed\n");
|
|
+ err = -EOPNOTSUPP;
|
|
+ goto error_mutex;
|
|
+ }
|
|
+
|
|
+ switch (fancurve_attr_id) {
|
|
+ case FANCURVE_ATTR_PWM1:
|
|
+ valid = fancurve_set_rpm1(&fancurve, point_id, value);
|
|
+ break;
|
|
+ case FANCURVE_ATTR_PWM2:
|
|
+ valid = fancurve_set_rpm2(&fancurve, point_id, value);
|
|
+ break;
|
|
+ case FANCURVE_ATTR_CPU_TEMP:
|
|
+ valid = fancurve_set_cpu_temp_max(&fancurve, point_id, value);
|
|
+ break;
|
|
+ case FANCURVE_ATTR_CPU_HYST:
|
|
+ valid = fancurve_set_cpu_temp_min(&fancurve, point_id, value);
|
|
+ break;
|
|
+ case FANCURVE_ATTR_GPU_TEMP:
|
|
+ valid = fancurve_set_gpu_temp_max(&fancurve, point_id, value);
|
|
+ break;
|
|
+ case FANCURVE_ATTR_GPU_HYST:
|
|
+ valid = fancurve_set_gpu_temp_min(&fancurve, point_id, value);
|
|
+ break;
|
|
+ case FANCURVE_ATTR_IC_TEMP:
|
|
+ valid = fancurve_set_ic_temp_max(&fancurve, point_id, value);
|
|
+ break;
|
|
+ case FANCURVE_ATTR_IC_HYST:
|
|
+ valid = fancurve_set_ic_temp_min(&fancurve, point_id, value);
|
|
+ break;
|
|
+ case FANCURVE_ATTR_ACCEL:
|
|
+ valid = fancurve_set_accel(&fancurve, point_id, value);
|
|
+ break;
|
|
+ case FANCURVE_ATTR_DECEL:
|
|
+ valid = fancurve_set_decel(&fancurve, point_id, value);
|
|
+ break;
|
|
+ case FANCURVE_SIZE:
|
|
+ valid = fancurve_set_size(&fancurve, value, true);
|
|
+ break;
|
|
+ default:
|
|
+ pr_info("Writing fancurve failed due to wrong attribute id: %d\n",
|
|
+ fancurve_attr_id);
|
|
+ err = -EOPNOTSUPP;
|
|
+ goto error_mutex;
|
|
+ }
|
|
+
|
|
+ if (!valid) {
|
|
+ pr_info("Ignoring invalid fancurve value %d for attribute %d at point %d\n",
|
|
+ value, fancurve_attr_id, point_id);
|
|
+ err = -EOPNOTSUPP;
|
|
+ goto error_mutex;
|
|
+ }
|
|
+
|
|
+ err = write_fancurve(&priv->ecram, priv->conf, &fancurve, false);
|
|
+ if (err) {
|
|
+ pr_info("Writing fancurve failed for accessing hwmon at point_id: %d\n",
|
|
+ point_id);
|
|
+ err = -EOPNOTSUPP;
|
|
+ goto error_mutex;
|
|
+ }
|
|
+
|
|
+ mutex_unlock(&priv->fancurve_mutex);
|
|
+ return count;
|
|
+
|
|
+error_mutex:
|
|
+ mutex_unlock(&priv->fancurve_mutex);
|
|
+error:
|
|
+ return count;
|
|
+}
+
+// rpm1
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_pwm, autopoint,
+			       FANCURVE_ATTR_PWM1, 9);
+// rpm2
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point1_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point2_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point3_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point4_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point5_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point6_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point7_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point8_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point9_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point10_pwm, autopoint,
+			       FANCURVE_ATTR_PWM2, 9);
+// CPU temp
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_temp, autopoint,
+			       FANCURVE_ATTR_CPU_TEMP, 9);
+// CPU temp hyst
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_temp_hyst, autopoint,
+			       FANCURVE_ATTR_CPU_HYST, 9);
+// GPU temp
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point1_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point2_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point3_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point4_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point5_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point6_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point7_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point8_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point9_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point10_temp, autopoint,
+			       FANCURVE_ATTR_GPU_TEMP, 9);
+// GPU temp hyst
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point1_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point2_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point3_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point4_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point5_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point6_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point7_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point8_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point9_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point10_temp_hyst, autopoint,
+			       FANCURVE_ATTR_GPU_HYST, 9);
+// IC temp
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point1_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point2_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point3_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point4_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point5_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point6_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point7_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point8_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point9_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point10_temp, autopoint,
+			       FANCURVE_ATTR_IC_TEMP, 9);
+// IC temp hyst
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point1_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point2_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point3_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point4_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point5_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point6_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point7_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point8_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point9_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point10_temp_hyst, autopoint,
+			       FANCURVE_ATTR_IC_HYST, 9);
+// accel
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_accel, autopoint,
+			       FANCURVE_ATTR_ACCEL, 9);
+// decel
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 0);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 1);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 2);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 3);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 4);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 5);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 6);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 7);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 8);
+static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_decel, autopoint,
+			       FANCURVE_ATTR_DECEL, 9);
+// size
+static SENSOR_DEVICE_ATTR_2_RW(auto_points_size, autopoint, FANCURVE_SIZE, 0);
+
+static ssize_t minifancurve_show(struct device *dev,
+				 struct device_attribute *devattr, char *buf)
+{
+	bool value;
+	int err;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = read_minifancurve(&priv->ecram, priv->conf, &value);
+	if (err) {
+		err = -1;
+		pr_info("Reading minifancurve not successful\n");
+		goto error_unlock;
+	}
+	mutex_unlock(&priv->fancurve_mutex);
+	return sprintf(buf, "%d\n", value);
+
+error_unlock:
+	mutex_unlock(&priv->fancurve_mutex);
+	return -1;
+}
+
+static ssize_t minifancurve_store(struct device *dev,
+				  struct device_attribute *devattr,
+				  const char *buf, size_t count)
+{
+	int value;
+	int err;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	err = kstrtoint(buf, 0, &value);
+	if (err) {
+		pr_info("Parse for hwmon store is not successful: error:%d\n",
+			err);
+		goto error;
+	}
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = write_minifancurve(&priv->ecram, priv->conf, value);
+	if (err) {
+		err = -1;
+		pr_info("Writing minifancurve not successful\n");
+		goto error_unlock;
+	}
+	mutex_unlock(&priv->fancurve_mutex);
+	return count;
+
+error_unlock:
+	mutex_unlock(&priv->fancurve_mutex);
+error:
+	return err;
+}
+
+static SENSOR_DEVICE_ATTR_RW(minifancurve, minifancurve, 0);
+
+static ssize_t pwm1_mode_show(struct device *dev,
+			      struct device_attribute *devattr, char *buf)
+{
+	bool value;
+	int err;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = read_maximumfanspeed(&priv->ecram, priv->conf, &value);
+	if (err) {
+		err = -1;
+		pr_info("Reading pwm1_mode/maximumfanspeed not successful\n");
+		goto error_unlock;
+	}
+	mutex_unlock(&priv->fancurve_mutex);
+	return sprintf(buf, "%d\n", value ? 0 : 2);
+
+error_unlock:
+	mutex_unlock(&priv->fancurve_mutex);
+	return -1;
+}
+
+static ssize_t pwm1_mode_store(struct device *dev,
+			       struct device_attribute *devattr,
+			       const char *buf, size_t count)
+{
+	int value;
+	int is_maximumfanspeed;
+	int err;
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	err = kstrtoint(buf, 0, &value);
+	if (err) {
+		pr_info("Parse for hwmon store is not successful: error:%d\n",
+			err);
+		goto error;
+	}
+	is_maximumfanspeed = value == 0;
+
+	mutex_lock(&priv->fancurve_mutex);
+	err = write_maximumfanspeed(&priv->ecram, priv->conf,
+				    is_maximumfanspeed);
+	if (err) {
+		err = -1;
+		pr_info("Writing pwm1_mode/maximumfanspeed not successful\n");
+		goto error_unlock;
+	}
+	mutex_unlock(&priv->fancurve_mutex);
+	return count;
+
+error_unlock:
+	mutex_unlock(&priv->fancurve_mutex);
+error:
+	return err;
+}
+
+static SENSOR_DEVICE_ATTR_RW(pwm1_mode, pwm1_mode, 0);
+
+static struct attribute *fancurve_hwmon_attributes[] = {
+	&sensor_dev_attr_pwm1_auto_point1_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point2_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point3_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point4_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point5_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point6_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point7_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point8_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point9_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point10_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point1_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point2_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point3_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point4_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point5_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point6_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point7_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point8_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point9_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point10_pwm.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point1_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point2_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point3_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point4_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point5_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point6_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point7_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point8_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point9_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point10_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point1_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point2_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point3_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point4_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point5_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point6_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point7_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point8_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point9_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point10_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point1_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point2_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point3_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point4_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point5_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point6_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point7_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point8_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point9_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point10_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point1_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point2_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point3_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point4_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point5_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point6_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point7_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point8_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point9_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm2_auto_point10_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point1_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point2_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point3_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point4_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point5_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point6_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point7_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point8_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point9_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point10_temp.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point1_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point2_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point3_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point4_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point5_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point6_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point7_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point8_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point9_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm3_auto_point10_temp_hyst.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point1_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point2_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point3_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point4_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point5_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point6_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point7_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point8_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point9_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point10_accel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point1_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point2_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point3_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point4_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point5_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point6_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point7_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point8_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point9_decel.dev_attr.attr,
+	&sensor_dev_attr_pwm1_auto_point10_decel.dev_attr.attr,
+	//
+	&sensor_dev_attr_auto_points_size.dev_attr.attr,
+	&sensor_dev_attr_minifancurve.dev_attr.attr,
+	&sensor_dev_attr_pwm1_mode.dev_attr.attr,
+	NULL
+};
+
+static umode_t legion_is_visible(struct kobject *kobj, struct attribute *attr,
+				 int idx)
+{
+	bool supported = true;
+	struct device *dev = kobj_to_dev(kobj);
+	struct legion_private *priv = dev_get_drvdata(dev);
+
+	if (attr == &sensor_dev_attr_minifancurve.dev_attr.attr)
+		supported = priv->conf->has_minifancurve;
+
+	return supported ? attr->mode : 0;
+}
+
+static const struct attribute_group legion_hwmon_sensor_group = {
+	.attrs = sensor_hwmon_attributes,
+	.is_visible = NULL
+};
+
+static const struct attribute_group legion_hwmon_fancurve_group = {
+	.attrs = fancurve_hwmon_attributes,
+	.is_visible = legion_is_visible,
+};
+
+static const struct attribute_group *legion_hwmon_groups[] = {
+	&legion_hwmon_sensor_group, &legion_hwmon_fancurve_group, NULL
+};
+
+ssize_t legion_hwmon_init(struct legion_private *priv)
+{
+	//TODO: use hwmon_device_register_with_groups or
+	// hwmon_device_register_with_info (the latter means all hwmon
+	// functions have to be changed)
+	// some laptop drivers do it one way, some the other
+	// TODO: Use devm_hwmon_device_register_with_groups?
+	// some laptop drivers use this, some do not
+	struct device *hwmon_dev = hwmon_device_register_with_groups(
+		&priv->platform_device->dev, "legion_hwmon", priv,
+		legion_hwmon_groups);
+	if (IS_ERR_OR_NULL(hwmon_dev)) {
+		pr_err("hwmon_device_register failed!\n");
+		return PTR_ERR(hwmon_dev);
+	}
+	dev_set_drvdata(hwmon_dev, priv);
+	priv->hwmon_dev = hwmon_dev;
+	return 0;
+}
+
+void legion_hwmon_exit(struct legion_private *priv)
+{
+	pr_info("Unloading legion hwmon\n");
+	if (priv->hwmon_dev) {
+		hwmon_device_unregister(priv->hwmon_dev);
+		priv->hwmon_dev = NULL;
+	}
+	pr_info("Unloading legion hwmon done\n");
+}
+
+/* ============================= */
+/* Platform driver               */
+/* ============================= */
+
+int legion_add(struct platform_device *pdev)
+{
+	struct legion_private *priv;
+	const struct dmi_system_id *dmi_sys;
+	int err;
+	u16 ec_read_id;
+	bool is_denied = true;
+	bool is_allowed = false;
+	bool do_load_by_list = false;
+	bool do_load = false;
+	//struct legion_private *priv = dev_get_drvdata(&pdev->dev);
+	dev_info(&pdev->dev, "legion_laptop platform driver probing\n");
+
+	dev_info(&pdev->dev, "Read identifying information: DMI_SYS_VENDOR: %s; DMI_PRODUCT_NAME: %s; DMI_BIOS_VERSION:%s\n",
+		 dmi_get_system_info(DMI_SYS_VENDOR),
+		 dmi_get_system_info(DMI_PRODUCT_NAME),
+		 dmi_get_system_info(DMI_BIOS_VERSION));
+
+	// TODO: allocate?
+	priv = &_priv;
+	priv->platform_device = pdev;
+	err = legion_shared_init(priv);
+	if (err) {
+		dev_info(&pdev->dev, "legion_shared_init failed\n");
+		goto err_legion_shared_init;
+	}
+	dev_set_drvdata(&pdev->dev, priv);
+
+	// TODO: remove
+	pr_info("Read identifying information: DMI_SYS_VENDOR: %s; DMI_PRODUCT_NAME: %s; DMI_BIOS_VERSION:%s\n",
+		dmi_get_system_info(DMI_SYS_VENDOR),
+		dmi_get_system_info(DMI_PRODUCT_NAME),
+		dmi_get_system_info(DMI_BIOS_VERSION));
+
+	dmi_sys = dmi_first_match(optimistic_allowlist);
+	is_allowed = dmi_sys != NULL;
+	is_denied = dmi_check_system(denylist);
+	do_load_by_list = is_allowed && !is_denied;
+	do_load = do_load_by_list || force;
+
+	dev_info(
+		&pdev->dev,
+		"is_denied: %d; is_allowed: %d; do_load_by_list: %d; do_load: %d\n",
+		is_denied, is_allowed, do_load_by_list, do_load);
+
+	if (!(do_load)) {
+		dev_info(
+			&pdev->dev,
+			"Module not usable for this laptop because it is not in the allowlist. Notify the maintainer if you want your device added, or force loading with the param force.\n");
+		err = -ENODEV;
+		goto err_model_mismatch;
+	}
+
+	if (force)
+		dev_info(&pdev->dev, "legion_laptop is forced to load.\n");
+
+	if (!do_load_by_list && do_load) {
+		dev_info(
+			&pdev->dev,
+			"legion_laptop is forced to load and would otherwise not be loaded\n");
+	}
+
+	// if forced and no model found, use config for first model
+	if (dmi_sys == NULL)
+		dmi_sys = &optimistic_allowlist[0];
+	dev_info(&pdev->dev, "Using configuration for system: %s\n",
+		 dmi_sys->ident);
+
+	priv->conf = dmi_sys->driver_data;
+
+	err = ecram_init(&priv->ecram, priv->conf->memoryio_physical_ec_start,
+			 priv->conf->memoryio_size);
+	if (err) {
+		dev_info(&pdev->dev,
+			 "Could not init access to embedded controller\n");
+		goto err_ecram_init;
+	}
+
+	ec_read_id = read_ec_id(&priv->ecram, priv->conf);
+	dev_info(&pdev->dev, "Read embedded controller ID 0x%x\n", ec_read_id);
+	if (priv->conf->check_embedded_controller_id &&
+	    ec_read_id != priv->conf->embedded_controller_id) {
+		err = -ENODEV;
+		dev_info(&pdev->dev, "Expected EC chip id 0x%x but read 0x%x\n",
+			 priv->conf->embedded_controller_id, ec_read_id);
+		goto err_ecram_id;
+	}
+	if (!priv->conf->check_embedded_controller_id) {
+		dev_info(&pdev->dev,
+			 "Skipped checking embedded controller id\n");
+	}
+
+	dev_info(&pdev->dev, "Creating debugfs interface\n");
+	legion_debugfs_init(priv);
+
+	pr_info("Creating sysfs interface\n");
+	err = legion_sysfs_init(priv);
+	if (err) {
+		dev_info(&pdev->dev, "Creating sysfs interface failed\n");
+		goto err_sysfs_init;
+	}
+
+	pr_info("Creating hwmon interface\n");
+	err = legion_hwmon_init(priv);
+	if (err)
+		goto err_hwmon_init;
+
+	pr_info("Creating platform profile support\n");
+	err = legion_platform_profile_init(priv);
+	if (err) {
+		dev_info(&pdev->dev, "Creating platform profile failed\n");
+		goto err_platform_profile;
+	}
+
+	pr_info("Init WMI driver support\n");
+	err = legion_wmi_init();
+	if (err) {
+		dev_info(&pdev->dev, "Init WMI driver failed\n");
+		goto err_wmi;
+	}
+
+	dev_info(&pdev->dev, "legion_laptop loaded for this device\n");
+	return 0;
+
+	// TODO: remove eventually
+	legion_wmi_exit();
+err_wmi:
+	legion_platform_profile_exit(priv);
+err_platform_profile:
+	legion_hwmon_exit(priv);
+err_hwmon_init:
+	legion_sysfs_exit(priv);
+err_sysfs_init:
+	legion_debugfs_exit(priv);
+err_ecram_id:
+	ecram_exit(&priv->ecram);
+err_ecram_init:
+	legion_shared_exit(priv);
+err_legion_shared_init:
+err_model_mismatch:
+	dev_info(&pdev->dev, "legion_laptop not loaded for this device\n");
+	return err;
+}
+
+int legion_remove(struct platform_device *pdev)
+{
+	struct legion_private *priv = dev_get_drvdata(&pdev->dev);
+
+	mutex_lock(&legion_shared_mutex);
+	priv->loaded = false;
+	mutex_unlock(&legion_shared_mutex);
+
+	// first unregister wmi, so toggling powermode does not
+	// generate events any more, not even delayed ones
+	legion_wmi_exit();
+	legion_platform_profile_exit(priv);
+
+	// toggle power mode to load default setting from embedded controller
+	// again
+	toggle_powermode(&priv->ecram, priv->conf);
+
+	legion_hwmon_exit(priv);
+	legion_sysfs_exit(priv);
+	legion_debugfs_exit(priv);
+	ecram_exit(&priv->ecram);
+	legion_shared_exit(priv);
+
+	pr_info("Legion platform unloaded\n");
+	return 0;
+}
+
+int legion_resume(struct platform_device *pdev)
+{
+	//struct legion_private *priv = dev_get_drvdata(&pdev->dev);
+	dev_info(&pdev->dev, "Resumed in legion-laptop\n");
+
+	return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int legion_pm_resume(struct device *dev)
+{
+	//struct legion_private *priv = dev_get_drvdata(dev);
+	dev_info(dev, "Resumed PM in legion-laptop\n");
+
+	return 0;
+}
+#endif
+static SIMPLE_DEV_PM_OPS(legion_pm, NULL, legion_pm_resume);
+
+// same as ideapad
+static const struct acpi_device_id legion_device_ids[] = {
+	{ "PNP0C09", 0 }, // todo: change to "VPC2004"
+	{ "", 0 },
+};
+MODULE_DEVICE_TABLE(acpi, legion_device_ids);
+
+static struct platform_driver legion_driver = {
+	.probe = legion_add,
+	.remove = legion_remove,
+	.resume = legion_resume,
+	.driver = {
+		.name = "legion",
+		.pm = &legion_pm,
+		.acpi_match_table = ACPI_PTR(legion_device_ids),
+	},
+};
+
+int __init legion_init(void)
+{
+	int err;
+
+	pr_info("legion_laptop starts loading\n");
+	err = platform_driver_register(&legion_driver);
+	if (err) {
+		pr_info("legion_laptop: platform_driver_register failed\n");
+		return err;
+	}
+
+	return 0;
+}
+
+module_init(legion_init);
+
+void __exit legion_exit(void)
+{
+	platform_driver_unregister(&legion_driver);
+	pr_info("legion_laptop exit\n");
+}
+
+module_exit(legion_exit);
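
The hwmon attributes registered above surface under /sys/class/hwmon once the driver binds, with temp*_input in millidegrees Celsius (see sensor_show()) and fan*_input in RPM. A minimal userspace sketch, assuming the chip registers with the name "legion_hwmon" as passed to hwmon_device_register_with_groups() above (the hwmonN index varies per machine, so it is discovered by matching the "name" attribute):

#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char path[288], buf[64];
	struct dirent *d;
	DIR *dir = opendir("/sys/class/hwmon");

	if (!dir)
		return 1;
	while ((d = readdir(dir))) {
		FILE *f;

		snprintf(path, sizeof(path), "/sys/class/hwmon/%s/name", d->d_name);
		f = fopen(path, "r");
		if (!f)
			continue;
		if (fgets(buf, sizeof(buf), f) && !strcmp(buf, "legion_hwmon\n")) {
			fclose(f);
			/* temp1_input is the CPU sensor, in millidegrees C */
			snprintf(path, sizeof(path),
				 "/sys/class/hwmon/%s/temp1_input", d->d_name);
			f = fopen(path, "r");
			if (f && fgets(buf, sizeof(buf), f))
				printf("CPU temp: %d mC\n", atoi(buf));
			if (f)
				fclose(f);
			break;
		}
		fclose(f);
	}
	closedir(dir);
	return 0;
}
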
diff --git a/drivers/platform/x86/steamdeck.c b/drivers/platform/x86/steamdeck.c
new file mode 100644
index 000000000000..77a6677ec19e
--- /dev/null
+++ b/drivers/platform/x86/steamdeck.c
@@ -0,0 +1,523 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * Steam Deck ACPI platform driver
+ *
+ * Copyright (C) 2021-2022 Valve Corporation
+ *
+ */
+#include <linux/acpi.h>
+#include <linux/hwmon.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/extcon-provider.h>
+
+#define ACPI_STEAMDECK_NOTIFY_STATUS	0x80
+
+/* 0 - port connected, 1 - port disconnected */
+#define ACPI_STEAMDECK_PORT_CONNECT	BIT(0)
+/* 0 - Upstream Facing Port, 1 - Downstream Facing Port */
+#define ACPI_STEAMDECK_CUR_DATA_ROLE	BIT(3)
+/*
+ * Debouncing delay to allow negotiation process to settle. 2s value
+ * was arrived at via trial and error.
+ */
+#define STEAMDECK_ROLE_SWITCH_DELAY	(msecs_to_jiffies(2000))
+
+struct steamdeck {
+	struct acpi_device *adev;
+	struct device *hwmon;
+	void *regmap;
+	long fan_target;
+	struct delayed_work role_work;
+	struct extcon_dev *edev;
+	struct device *dev;
+};
+
+static ssize_t
+steamdeck_simple_store(struct device *dev, const char *buf, size_t count,
+		       const char *method,
+		       unsigned long upper_limit)
+{
+	struct steamdeck *fan = dev_get_drvdata(dev);
+	unsigned long value;
+
+	if (kstrtoul(buf, 10, &value) || value >= upper_limit)
+		return -EINVAL;
+
+	if (ACPI_FAILURE(acpi_execute_simple_method(fan->adev->handle,
+						    (char *)method, value)))
+		return -EIO;
+
+	return count;
+}
+
+#define STEAMDECK_ATTR_WO(_name, _method, _upper_limit)		\
+	static ssize_t _name##_store(struct device *dev,		\
+				     struct device_attribute *attr,	\
+				     const char *buf, size_t count)	\
+	{								\
+		return steamdeck_simple_store(dev, buf, count,		\
+					      _method,			\
+					      _upper_limit);		\
+	}								\
+	static DEVICE_ATTR_WO(_name)
+
+STEAMDECK_ATTR_WO(target_cpu_temp, "STCT", U8_MAX / 2);
+STEAMDECK_ATTR_WO(gain, "SGAN", U16_MAX);
+STEAMDECK_ATTR_WO(ramp_rate, "SFRR", U8_MAX);
+STEAMDECK_ATTR_WO(hysteresis, "SHTS", U16_MAX);
+STEAMDECK_ATTR_WO(maximum_battery_charge_rate, "CHGR", U16_MAX);
+STEAMDECK_ATTR_WO(recalculate, "SCHG", U16_MAX);
+
+STEAMDECK_ATTR_WO(led_brightness, "CHBV", U8_MAX);
+STEAMDECK_ATTR_WO(content_adaptive_brightness, "CABC", U8_MAX);
+STEAMDECK_ATTR_WO(gamma_set, "GAMA", U8_MAX);
+STEAMDECK_ATTR_WO(display_brightness, "WDBV", U8_MAX);
+STEAMDECK_ATTR_WO(ctrl_display, "WCDV", U8_MAX);
+STEAMDECK_ATTR_WO(cabc_minimum_brightness, "WCMB", U8_MAX);
+STEAMDECK_ATTR_WO(memory_data_access_control, "MDAC", U8_MAX);
+
+#define STEAMDECK_ATTR_WO_NOARG(_name, _method)				\
+	static ssize_t _name##_store(struct device *dev,		\
+				     struct device_attribute *attr,	\
+				     const char *buf, size_t count)	\
+	{								\
+		struct steamdeck *fan = dev_get_drvdata(dev);		\
+									\
+		if (ACPI_FAILURE(acpi_evaluate_object(fan->adev->handle, \
+						      _method, NULL, NULL))) \
+			return -EIO;					\
+									\
+		return count;						\
+	}								\
+	static DEVICE_ATTR_WO(_name)
+
+STEAMDECK_ATTR_WO_NOARG(power_cycle_display, "DPCY");
+STEAMDECK_ATTR_WO_NOARG(display_normal_mode_on, "NORO");
+STEAMDECK_ATTR_WO_NOARG(display_inversion_off, "INOF");
+STEAMDECK_ATTR_WO_NOARG(display_inversion_on, "INON");
+STEAMDECK_ATTR_WO_NOARG(idle_mode_on, "WRNE");
+
+#define STEAMDECK_ATTR_RO(_name, _method)				\
+	static ssize_t _name##_show(struct device *dev,			\
+				    struct device_attribute *attr,	\
+				    char *buf)				\
+	{								\
+		struct steamdeck *jup = dev_get_drvdata(dev);		\
+		unsigned long long val;					\
+									\
+		if (ACPI_FAILURE(acpi_evaluate_integer(			\
+					 jup->adev->handle,		\
+					 _method, NULL, &val)))		\
+			return -EIO;					\
+									\
+		return sprintf(buf, "%llu\n", val);			\
+	}								\
+	static DEVICE_ATTR_RO(_name)
+
+STEAMDECK_ATTR_RO(firmware_version, "PDFW");
+STEAMDECK_ATTR_RO(board_id, "BOID");
+STEAMDECK_ATTR_RO(pdcs, "PDCS");
+
+static umode_t
+steamdeck_is_visible(struct kobject *kobj, struct attribute *attr, int index)
+{
+	return attr->mode;
+}
+
+static struct attribute *steamdeck_attributes[] = {
+	&dev_attr_target_cpu_temp.attr,
+	&dev_attr_gain.attr,
+	&dev_attr_ramp_rate.attr,
+	&dev_attr_hysteresis.attr,
+	&dev_attr_maximum_battery_charge_rate.attr,
+	&dev_attr_recalculate.attr,
+	&dev_attr_power_cycle_display.attr,
+
+	&dev_attr_led_brightness.attr,
+	&dev_attr_content_adaptive_brightness.attr,
+	&dev_attr_gamma_set.attr,
+	&dev_attr_display_brightness.attr,
+	&dev_attr_ctrl_display.attr,
+	&dev_attr_cabc_minimum_brightness.attr,
+	&dev_attr_memory_data_access_control.attr,
+
+	&dev_attr_display_normal_mode_on.attr,
+	&dev_attr_display_inversion_off.attr,
+	&dev_attr_display_inversion_on.attr,
+	&dev_attr_idle_mode_on.attr,
+
+	&dev_attr_firmware_version.attr,
+	&dev_attr_board_id.attr,
+	&dev_attr_pdcs.attr,
+
+	NULL
+};
+
+static const struct attribute_group steamdeck_group = {
+	.attrs = steamdeck_attributes,
+	.is_visible = steamdeck_is_visible,
+};
+
+static const struct attribute_group *steamdeck_groups[] = {
+	&steamdeck_group,
+	NULL
+};
+
+static int steamdeck_read_fan_speed(struct steamdeck *jup, long *speed)
+{
+	unsigned long long val;
+
+	if (ACPI_FAILURE(acpi_evaluate_integer(jup->adev->handle,
+					       "FANR", NULL, &val)))
+		return -EIO;
+
+	*speed = val;
+	return 0;
+}
+
+static int
+steamdeck_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
+		     u32 attr, int channel, long *out)
+{
+	struct steamdeck *sd = dev_get_drvdata(dev);
+	unsigned long long val;
+
+	switch (type) {
+	case hwmon_temp:
+		if (attr != hwmon_temp_input)
+			return -EOPNOTSUPP;
+
+		if (ACPI_FAILURE(acpi_evaluate_integer(sd->adev->handle,
+						       "BATT", NULL, &val)))
+			return -EIO;
+		/*
+		 * Assuming BATT returns deg C we need to multiply it
+		 * by 1000 to convert to mC
+		 */
+		*out = val * 1000;
+		break;
+	case hwmon_fan:
+		switch (attr) {
+		case hwmon_fan_input:
+			return steamdeck_read_fan_speed(sd, out);
+		case hwmon_fan_target:
+			*out = sd->fan_target;
+			break;
+		case hwmon_fan_fault:
+			if (ACPI_FAILURE(acpi_evaluate_integer(
+						 sd->adev->handle,
+						 "FANC", NULL, &val)))
+				return -EIO;
+			/*
+			 * FANC (Fan check):
+			 * 0: Abnormal
+			 * 1: Normal
+			 */
+			*out = !val;
+			break;
+		default:
+			return -EOPNOTSUPP;
+		}
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int
+steamdeck_hwmon_read_string(struct device *dev, enum hwmon_sensor_types type,
+			    u32 attr, int channel, const char **str)
+{
+	switch (type) {
+	case hwmon_temp:
+		*str = "Battery Temp";
+		break;
+	case hwmon_fan:
+		*str = "System Fan";
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int
+steamdeck_hwmon_write(struct device *dev, enum hwmon_sensor_types type,
+		      u32 attr, int channel, long val)
+{
+	struct steamdeck *sd = dev_get_drvdata(dev);
+
+	if (type != hwmon_fan ||
+	    attr != hwmon_fan_target)
+		return -EOPNOTSUPP;
+
+	if (val > U16_MAX)
+		return -EINVAL;
+
+	sd->fan_target = val;
+
+	if (ACPI_FAILURE(acpi_execute_simple_method(sd->adev->handle,
+						    "FANS", val)))
+		return -EIO;
+
+	return 0;
+}
+
+static umode_t
+steamdeck_hwmon_is_visible(const void *data, enum hwmon_sensor_types type,
+			   u32 attr, int channel)
+{
+	if (type == hwmon_fan &&
+	    attr == hwmon_fan_target)
+		return 0644;
+
+	return 0444;
+}
+
+static const struct hwmon_channel_info *steamdeck_info[] = {
+	HWMON_CHANNEL_INFO(temp,
+			   HWMON_T_INPUT | HWMON_T_LABEL),
+	HWMON_CHANNEL_INFO(fan,
+			   HWMON_F_INPUT | HWMON_F_LABEL |
+			   HWMON_F_TARGET | HWMON_F_FAULT),
+	NULL
+};
+
+static const struct hwmon_ops steamdeck_hwmon_ops = {
+	.is_visible = steamdeck_hwmon_is_visible,
+	.read = steamdeck_hwmon_read,
+	.read_string = steamdeck_hwmon_read_string,
+	.write = steamdeck_hwmon_write,
+};
+
+static const struct hwmon_chip_info steamdeck_chip_info = {
+	.ops = &steamdeck_hwmon_ops,
+	.info = steamdeck_info,
+};
+
+#define STEAMDECK_STA_OK			\
+	(ACPI_STA_DEVICE_ENABLED |		\
+	 ACPI_STA_DEVICE_PRESENT |		\
+	 ACPI_STA_DEVICE_FUNCTIONING)
+
+static int
+steamdeck_ddic_reg_read(void *context, unsigned int reg, unsigned int *val)
+{
+	union acpi_object obj = { .type = ACPI_TYPE_INTEGER };
+	struct acpi_object_list arg_list = { .count = 1, .pointer = &obj, };
+	struct steamdeck *sd = context;
+	unsigned long long _val;
+
+	obj.integer.value = reg;
+
+	if (ACPI_FAILURE(acpi_evaluate_integer(sd->adev->handle,
+					       "RDDI", &arg_list, &_val)))
+		return -EIO;
+
+	*val = _val;
+	return 0;
+}
+
+static int steamdeck_read_pdcs(struct steamdeck *sd, unsigned long long *pdcs)
+{
+	acpi_status status;
+
+	status = acpi_evaluate_integer(sd->adev->handle, "PDCS", NULL, pdcs);
+	if (ACPI_FAILURE(status)) {
+		dev_err(sd->dev, "PDCS evaluation failed: %s\n",
+			acpi_format_exception(status));
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static void steamdeck_usb_role_work(struct work_struct *work)
+{
+	struct steamdeck *sd =
+		container_of(work, struct steamdeck, role_work.work);
+	unsigned long long pdcs;
+	bool usb_host;
+
+	if (steamdeck_read_pdcs(sd, &pdcs))
+		return;
+
+	/*
+	 * We only care about these two
+	 */
+	pdcs &= ACPI_STEAMDECK_PORT_CONNECT | ACPI_STEAMDECK_CUR_DATA_ROLE;
+
+	/*
+	 * For "connect" events our role is determined by a bit in
+	 * PDCS, for "disconnect" we switch to being a gadget
+	 * unconditionally. The thinking for the latter is we don't
+	 * want to start acting as a USB host until we get
+	 * confirmation from the firmware that we are a USB host
+	 */
+	usb_host = (pdcs & ACPI_STEAMDECK_PORT_CONNECT) ?
+		pdcs & ACPI_STEAMDECK_CUR_DATA_ROLE : false;
+
+	WARN_ON(extcon_set_state_sync(sd->edev, EXTCON_USB_HOST,
+				      usb_host));
+	dev_dbg(sd->dev, "USB role is %s\n", usb_host ? "host" : "device");
+}
+
+static void steamdeck_notify(acpi_handle handle, u32 event, void *context)
+{
+	struct device *dev = context;
+	struct steamdeck *sd = dev_get_drvdata(dev);
+	unsigned long long pdcs;
+	unsigned long delay;
+
+	switch (event) {
+	case ACPI_STEAMDECK_NOTIFY_STATUS:
+		if (steamdeck_read_pdcs(sd, &pdcs))
+			return;
+		/*
+		 * We process "disconnect" events immediately and
+		 * "connect" events with a delay to give the HW time
+		 * to settle. For example attaching a USB hub (at least
+		 * for HW used for testing) will generate an intermediary
+		 * event with the "host" bit not set, followed by the one
+		 * that does have it set.
+		 */
+		delay = (pdcs & ACPI_STEAMDECK_PORT_CONNECT) ?
+			STEAMDECK_ROLE_SWITCH_DELAY : 0;
+
+		queue_delayed_work(system_long_wq, &sd->role_work, delay);
+		break;
+	default:
+		dev_err(dev, "Unsupported event [0x%x]\n", event);
+	}
+}
+
+static void steamdeck_remove_notify_handler(void *data)
+{
+	struct steamdeck *sd = data;
+
+	acpi_remove_notify_handler(sd->adev->handle, ACPI_DEVICE_NOTIFY,
+				   steamdeck_notify);
+	cancel_delayed_work_sync(&sd->role_work);
+}
+
+static const unsigned int steamdeck_extcon_cable[] = {
+	EXTCON_USB,
+	EXTCON_USB_HOST,
+	EXTCON_CHG_USB_SDP,
+	EXTCON_CHG_USB_CDP,
+	EXTCON_CHG_USB_DCP,
+	EXTCON_CHG_USB_ACA,
+	EXTCON_NONE,
+};
+
+static int steamdeck_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct steamdeck *sd;
+	acpi_status status;
+	unsigned long long sta;
+	int ret;
+
+	static const struct regmap_config regmap_config = {
+		.reg_bits = 8,
+		.val_bits = 8,
+		.max_register = 255,
+		.cache_type = REGCACHE_NONE,
+		.reg_read = steamdeck_ddic_reg_read,
+	};
+
+	sd = devm_kzalloc(dev, sizeof(*sd), GFP_KERNEL);
+	if (!sd)
+		return -ENOMEM;
+	sd->adev = ACPI_COMPANION(&pdev->dev);
+	sd->dev = dev;
+	platform_set_drvdata(pdev, sd);
+	INIT_DELAYED_WORK(&sd->role_work, steamdeck_usb_role_work);
+
+	status = acpi_evaluate_integer(sd->adev->handle, "_STA",
+				       NULL, &sta);
+	if (ACPI_FAILURE(status)) {
+		dev_err(dev, "Status check failed (0x%x)\n", status);
+		return -EINVAL;
+	}
+
+	if ((sta & STEAMDECK_STA_OK) != STEAMDECK_STA_OK) {
+		dev_err(dev, "Device is not ready\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Our ACPI interface doesn't expose a method to read the current
+	 * fan target, so we use the current fan speed as an
+	 * approximation.
+	 */
+	if (steamdeck_read_fan_speed(sd, &sd->fan_target))
+		dev_warn(dev, "Failed to read fan speed");
+
+	sd->hwmon = devm_hwmon_device_register_with_info(dev,
+							 "steamdeck",
+							 sd,
+							 &steamdeck_chip_info,
+							 steamdeck_groups);
+	if (IS_ERR(sd->hwmon)) {
+		dev_err(dev, "Failed to register HWMON device");
+		return PTR_ERR(sd->hwmon);
+	}
+
+	sd->regmap = devm_regmap_init(dev, NULL, sd, &regmap_config);
+	if (IS_ERR(sd->regmap))
+		dev_err(dev, "Failed to register REGMAP");
+
+	sd->edev = devm_extcon_dev_allocate(dev, steamdeck_extcon_cable);
+	if (IS_ERR(sd->edev))
+		return -ENOMEM;
+
+	ret = devm_extcon_dev_register(dev, sd->edev);
+	if (ret < 0) {
+		dev_err(dev, "Failed to register extcon device: %d\n", ret);
+		return ret;
+	}
+
+	/*
+	 * Set initial role value
+	 */
+	queue_delayed_work(system_long_wq, &sd->role_work, 0);
+	flush_delayed_work(&sd->role_work);
+
+	status = acpi_install_notify_handler(sd->adev->handle,
+					     ACPI_DEVICE_NOTIFY,
+					     steamdeck_notify,
+					     dev);
+	if (ACPI_FAILURE(status)) {
+		dev_err(dev, "Error installing ACPI notify handler\n");
+		return -EIO;
+	}
+
+	ret = devm_add_action_or_reset(dev, steamdeck_remove_notify_handler,
+				       sd);
+	return ret;
+}
+
+static const struct acpi_device_id steamdeck_device_ids[] = {
+	{ "VLV0100", 0 },
+	{ "", 0 },
+};
+MODULE_DEVICE_TABLE(acpi, steamdeck_device_ids);
+
+static struct platform_driver steamdeck_driver = {
+	.probe = steamdeck_probe,
+	.driver = {
+		.name = "steamdeck",
+		.acpi_match_table = steamdeck_device_ids,
+	},
+};
+module_platform_driver(steamdeck_driver);
+
+MODULE_AUTHOR("Andrey Smirnov <andrew.smirnov@gmail.com>");
+MODULE_DESCRIPTION("Steam Deck ACPI platform driver");
+MODULE_LICENSE("GPL");
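
Writes to fan1_target go through steamdeck_hwmon_write(), which caches the value and forwards it to the ACPI "FANS" method; values above U16_MAX are rejected with -EINVAL. A short sketch, assuming the chip came up as hwmon4 on this machine (the index must be discovered at runtime, e.g. by matching the "name" attribute as in the earlier example):

#include <stdio.h>

int main(void)
{
	/* hwmon4 is an assumption for this machine; match name == "steamdeck"
	 * in /sys/class/hwmon to find the real index. */
	FILE *f = fopen("/sys/class/hwmon/hwmon4/fan1_target", "w");

	if (!f)
		return 1;
	/* steamdeck_hwmon_write() rejects values above U16_MAX with -EINVAL */
	fprintf(f, "%d\n", 3000);
	return fclose(f) ? 1 : 0;
}
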
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 0acb8e1fb7af..b0b49c8653b0 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1182,7 +1182,7 @@ struct readahead_control {
 		._index = i,					\
 	}
 
-#define VM_READAHEAD_PAGES	(SZ_128K / PAGE_SIZE)
+#define VM_READAHEAD_PAGES	(SZ_8M / PAGE_SIZE)
 
 void page_cache_ra_unbounded(struct readahead_control *,
 		unsigned long nr_to_read, unsigned long lookahead_count);
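
In page terms this readahead bump is dramatic: with the common 4 KiB page size (an assumption; arm64 configs with 16K or 64K pages scale differently), the default window grows from 32 pages to 2048 pages, as this small sketch works out:

#include <stdio.h>

#define SZ_128K (128 * 1024)
#define SZ_8M (8 * 1024 * 1024)
#define PAGE_SIZE 4096 /* assumption: x86-64 default page size */

int main(void)
{
	/* prints "old: 32 pages, new: 2048 pages" */
	printf("old: %d pages, new: %d pages\n",
	       SZ_128K / PAGE_SIZE, SZ_8M / PAGE_SIZE);
	return 0;
}
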
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 45f09bec02c4..87b20e2ee274 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -148,6 +148,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns,
 
 #ifdef CONFIG_USER_NS
 
+extern int unprivileged_userns_clone;
+
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	if (ns)
@@ -181,6 +183,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns);
 struct ns_common *ns_get_owner(struct ns_common *ns);
 #else
 
+#define unprivileged_userns_clone 0
+
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	return &init_user_ns;
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index db762e35aca9..0336791656eb 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -194,6 +194,7 @@ struct netns_ipv4 {
 	int sysctl_udp_rmem_min;
 
 	u8 sysctl_fib_notify_on_flag_change;
+	unsigned int sysctl_tcp_collapse_max_bytes;
 
 #ifdef CONFIG_NET_L3_MASTER_DEV
 	u8 sysctl_udp_l3mdev_accept;
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 901b440238d5..7026df84a0f6 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -187,6 +187,13 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust,
 	TP_ARGS(sk)
 );
 
+DEFINE_EVENT(tcp_event_sk, tcp_collapse_max_bytes_exceeded,
+
+	TP_PROTO(struct sock *sk),
+
+	TP_ARGS(sk)
+);
+
 TRACE_EVENT(tcp_retransmit_synack,
 
 	TP_PROTO(const struct sock *sk, const struct request_sock *req),
diff --git a/init/Kconfig b/init/Kconfig
index c88bb30a8b0b..908d045dbe10 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -123,6 +123,10 @@ config THREAD_INFO_IN_TASK
 
 menu "General setup"
 
+config CACHY
+	bool "Some kernel tweaks by CachyOS"
+	default y
+
 config BROKEN
 	bool
 
@@ -348,6 +352,19 @@ config KERNEL_UNCOMPRESSED
 
 endchoice
 
+menu "ZSTD compression options"
+	depends on KERNEL_ZSTD
+
+config ZSTD_COMPRESSION_LEVEL
+	int "Compression level (1-22)"
+	range 1 22
+	default "22"
+	help
+	  Choose a compression level for zstd kernel compression.
+	  Default is 22, which is the maximum.
+
+endmenu
+
 config DEFAULT_INIT
 	string "Default init path"
 	default ""
@@ -1249,6 +1266,22 @@ config USER_NS
 
 	  If unsure, say N.
 
+config USER_NS_UNPRIVILEGED
+	bool "Allow unprivileged users to create namespaces"
+	default y
+	depends on USER_NS
+	help
+	  When disabled, unprivileged users will not be able to create
+	  new namespaces. Allowing users to create their own namespaces
+	  has been part of several recent local privilege escalation
+	  exploits, so if you need user namespaces but are
+	  paranoid^Wsecurity-conscious you want to disable this.
+
+	  This setting can be overridden at runtime via the
+	  kernel.unprivileged_userns_clone sysctl.
+
+	  If unsure, say Y.
+
 config PID_NS
 	bool "PID Namespaces"
 	default y
@@ -1429,6 +1462,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE
 	  with the "-O2" compiler flag for best performance and most
 	  helpful compile-time warnings.
 
+config CC_OPTIMIZE_FOR_PERFORMANCE_O3
+	bool "Optimize more for performance (-O3)"
+	help
+	  Choosing this option will pass "-O3" to your compiler to optimize
+	  the kernel yet more for performance.
+
 config CC_OPTIMIZE_FOR_SIZE
 	bool "Optimize for size (-Os)"
 	help
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 38ef6d06888e..0f78364efd4f 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -40,6 +40,27 @@ choice
 	 on SMP and NUMA systems and exactly dividing by both PAL and
 	 NTSC frame rates for video and multimedia work.
 
+	config HZ_500
+		bool "500 HZ"
+	help
+	  500 Hz is a balanced timer frequency. Provides fast interactivity
+	  on desktops with good smoothness without increasing CPU power
+	  consumption and sacrificing the battery life on laptops.
+
+	config HZ_600
+		bool "600 HZ"
+	help
+	  600 Hz is a balanced timer frequency. Provides fast interactivity
+	  on desktops with good smoothness without increasing CPU power
+	  consumption and sacrificing the battery life on laptops.
+
+	config HZ_750
+		bool "750 HZ"
+	help
+	  750 Hz is a balanced timer frequency. Provides fast interactivity
+	  on desktops with good smoothness without increasing CPU power
+	  consumption and sacrificing the battery life on laptops.
+
 	config HZ_1000
 		bool "1000 HZ"
 	help
@@ -53,6 +74,9 @@ config HZ
 	default 100 if HZ_100
 	default 250 if HZ_250
 	default 300 if HZ_300
+	default 500 if HZ_500
+	default 600 if HZ_600
+	default 750 if HZ_750
 	default 1000 if HZ_1000
 
 config SCHED_HRTICK
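
Since the tick period is simply 1000/HZ milliseconds, the new choices slot between the stock 300 Hz and 1000 Hz options; a quick sketch of the resulting tick lengths:

#include <stdio.h>

int main(void)
{
	int hz[] = { 100, 250, 300, 500, 600, 750, 1000 };

	/* e.g. HZ=500 -> 2.000 ms, HZ=750 -> 1.333 ms, HZ=1000 -> 1.000 ms */
	for (int i = 0; i < 7; i++)
		printf("HZ=%4d -> tick %.3f ms\n", hz[i], 1000.0 / hz[i]);
	return 0;
}
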
diff --git a/kernel/fork.c b/kernel/fork.c
index ea332319dffe..349945168239 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -98,6 +98,10 @@
#include <linux/bpf.h>
#include <linux/stackprotector.h>

+#ifdef CONFIG_USER_NS
+#include <linux/user_namespace.h>
+#endif
+
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -2032,6 +2036,10 @@ static __latent_entropy struct task_struct *copy_process(
if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
return ERR_PTR(-EINVAL);

+ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone)
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
@@ -3182,6 +3190,12 @@ int ksys_unshare(unsigned long unshare_flags)
if (unshare_flags & CLONE_NEWNS)
unshare_flags |= CLONE_FS;

+ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) {
+ err = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+ goto bad_unshare_out;
+ }
+
err = check_unshare_flags(unshare_flags);
if (err)
goto bad_unshare_out;
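Both hunks above gate user-namespace creation on unprivileged_userns_clone for the clone() and unshare() paths. A minimal sketch (assuming kernel.unprivileged_userns_clone=0 and a caller without CAP_SYS_ADMIN) showing how the new check surfaces to userspace:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		/* With the sysctl set to 0, the ksys_unshare() hunk above
		 * rejects this with EPERM for unprivileged callers. */
		if (unshare(CLONE_NEWUSER) != 0)
			perror("unshare(CLONE_NEWUSER)");
		else
			puts("user namespace created");
		return 0;
	}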
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 424b3bc58f3f..ecf2798c5ccf 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -219,6 +219,31 @@ config MODULE_COMPRESS_ZSTD

endchoice

+menu "ZSTD module compression options"
+ depends on MODULE_COMPRESS_ZSTD
+
+config MODULE_COMPRESS_ZSTD_LEVEL
+ int "Compression level (1-19)"
+ range 1 19
+ default 9
+ help
+ Compression level used by zstd for compressing modules.
+
+config MODULE_COMPRESS_ZSTD_ULTRA
+ bool "Enable ZSTD ultra compression"
+ help
+ Compress modules with ZSTD using the highest possible compression.
+
+config MODULE_COMPRESS_ZSTD_LEVEL_ULTRA
+ int "Compression level (20-22)"
+ depends on MODULE_COMPRESS_ZSTD_ULTRA
+ range 20 22
+ default 20
+ help
+ Ultra compression level used by zstd for compressing modules.
+
+endmenu
+
config MODULE_DECOMPRESS
bool "Support in-kernel module decompression"
depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5f6587d94c1d..96c66b50ee48 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -69,9 +69,13 @@
*
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*/
+#ifdef CONFIG_CACHY
+unsigned int sysctl_sched_latency = 3000000ULL;
+static unsigned int normalized_sysctl_sched_latency = 3000000ULL;
+#else
unsigned int sysctl_sched_latency = 6000000ULL;
static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
-
+#endif
/*
* The initial- and re-scaling of tunables is configurable
*
@@ -90,8 +94,13 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
*
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
+#ifdef CONFIG_CACHY
+unsigned int sysctl_sched_min_granularity = 400000ULL;
+static unsigned int normalized_sysctl_sched_min_granularity = 400000ULL;
+#else
unsigned int sysctl_sched_min_granularity = 750000ULL;
static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+#endif

/*
* Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
@@ -121,8 +130,13 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
*
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
+#ifdef CONFIG_CACHY
+unsigned int sysctl_sched_wakeup_granularity = 500000UL;
+static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL;
+#else
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+#endif

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

@@ -175,8 +189,12 @@ int __weak arch_asym_cpu_priority(int cpu)
*
* (default: 5 msec, units: microseconds)
*/
+#ifdef CONFIG_CACHY
+static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL;
+#else
static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
+#endif

#ifdef CONFIG_NUMA_BALANCING
/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1c240d2c99bc..98e1a7472fd2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -96,6 +96,9 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals);
#ifdef CONFIG_PERF_EVENTS
static const int six_hundred_forty_kb = 640 * 1024;
#endif
+#ifdef CONFIG_USER_NS
+#include <linux/user_namespace.h>
+#endif


static const int ngroups_max = NGROUPS_MAX;
@@ -1645,6 +1648,15 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_USER_NS
+ {
+ .procname = "unprivileged_userns_clone",
+ .data = &unprivileged_userns_clone,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
#ifdef CONFIG_PROC_SYSCTL
{
.procname = "tainted",
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 1d8e47bed3f1..fec01d016a35 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -22,6 +22,13 @@
#include <linux/bsearch.h>
#include <linux/sort.h>

+/* sysctl */
+#ifdef CONFIG_USER_NS_UNPRIVILEGED
+int unprivileged_userns_clone = 1;
+#else
+int unprivileged_userns_clone;
+#endif
+
static struct kmem_cache *user_ns_cachep __read_mostly;
static DEFINE_MUTEX(userns_state_mutex);

diff --git a/mm/Kconfig b/mm/Kconfig
index 4751031f3f05..cf2e47030fe8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -621,7 +621,7 @@ config COMPACTION
config COMPACT_UNEVICTABLE_DEFAULT
int
depends on COMPACTION
- default 0 if PREEMPT_RT
+ default 0 if PREEMPT_RT || CACHY
default 1

#
diff --git a/mm/compaction.c b/mm/compaction.c
index 5a9501e0ae01..4d8c63b9cdca 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2735,7 +2735,11 @@ static void compact_nodes(void)
* aggressively the kernel should compact memory in the
* background. It takes values in the range [0, 100].
*/
+#ifdef CONFIG_CACHY
+unsigned int __read_mostly sysctl_compaction_proactiveness;
+#else
unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
+#endif

int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 516b1aa247e8..78fb31d27ed7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -71,7 +71,11 @@ static long ratelimit_pages = 32;
/*
* Start background writeback (via writeback threads) at this percentage
*/
+#ifdef CONFIG_CACHY
+static int dirty_background_ratio = 5;
+#else
static int dirty_background_ratio = 10;
+#endif

/*
* dirty_background_bytes starts at 0 (disabled) so that it is a function of
@@ -99,7 +103,11 @@ static unsigned long vm_dirty_bytes;
/*
* The interval between `kupdate'-style writebacks
*/
+#ifdef CONFIG_CACHY
+unsigned int dirty_writeback_interval = 10 * 100; /* centiseconds */
+#else
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
+#endif

EXPORT_SYMBOL_GPL(dirty_writeback_interval);

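dirty_background_ratio is a percentage of dirtyable memory, so dropping it from 10 to 5 halves the threshold at which background writeback starts. A back-of-the-envelope sketch (illustration only; the kernel computes dirtyable memory more precisely, and the 16 GiB figure is an assumed example):

	#include <stdio.h>

	int main(void)
	{
		const unsigned long long dirtyable = 16ULL << 30; /* 16 GiB */

		/* Background writeback starts above ratio% of dirtyable memory. */
		printf("ratio 10: ~%llu MiB\n", dirtyable * 10 / 100 >> 20);
		printf("ratio  5: ~%llu MiB\n", dirtyable * 5 / 100 >> 20);
		return 0;
	}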
diff --git a/mm/swap.c b/mm/swap.c
index 423199ee8478..adef27bd3f8b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1090,6 +1090,10 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
*/
void __init swap_setup(void)
{
+#ifdef CONFIG_CACHY
+ /* Only swap-in pages requested, avoid readahead */
+ page_cluster = 0;
+#else
unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);

/* Use a smaller cluster for small-memory machines */
@@ -1101,4 +1105,5 @@ void __init swap_setup(void)
* Right now other parts of the system means that we
* _really_ don't want to cluster much more
*/
+#endif
}
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index b52644771cc4..11a4b0e3b583 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -43,7 +43,11 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
* essence, they are percents: the higher the value, the more number
* unsuccessful reclaims there were.
*/
+#ifdef CONFIG_CACHY
+static const unsigned int vmpressure_level_med = 65;
+#else
static const unsigned int vmpressure_level_med = 60;
+#endif
static const unsigned int vmpressure_level_critical = 95;

/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9c1c5e8b24b8..71a7f4517e5a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -186,7 +186,11 @@ struct scan_control {
/*
* From 0 .. 200. Higher means more swappy.
*/
+#ifdef CONFIG_CACHY
+int vm_swappiness = 20;
+#else
int vm_swappiness = 60;
+#endif

static void set_task_reclaim_state(struct task_struct *task,
struct reclaim_state *rs)
@@ -4536,7 +4540,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
}

/* to protect the working set of the last N jiffies */
+#ifdef CONFIG_CACHY
+static unsigned long lru_gen_min_ttl __read_mostly = HZ;
+#else
static unsigned long lru_gen_min_ttl __read_mostly;
+#endif

static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 40fe70fc2015..3028e27897d9 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1470,6 +1470,13 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = &tcp_plb_max_cong_thresh,
},
+ {
+ .procname = "tcp_collapse_max_bytes",
+ .data = &init_net.ipv4.sysctl_tcp_collapse_max_bytes,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ },
{ }
};

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 754e0212c951..b6d7faeb737a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5414,6 +5414,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb)
static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);

NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);

@@ -5425,6 +5426,39 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
return 0;

+ /* For context and additional information about this patch, see the
+ * blog post at
+ *
+ * sysctl: net.ipv4.tcp_collapse_max_bytes
+ *
+ * If tcp_collapse_max_bytes is non-zero, attempt to collapse the
+ * queue to free up memory if the current amount of memory allocated
+ * is less than tcp_collapse_max_bytes. Otherwise, the packet is
+ * dropped without attempting to collapse the queue.
+ *
+ * If tcp_collapse_max_bytes is zero, this feature is disabled
+ * and the default Linux behavior is used. The default Linux
+ * behavior is to always perform the attempt to collapse the
+ * queue to free up memory.
+ *
+ * When the receive queue is small, we want to collapse the
+ * queue. There are two reasons for this: (a) the latency of
+ * performing the collapse will be small on a small queue, and
+ * (b) we want to avoid sending a congestion signal (via a
+ * packet drop) to the sender when the receive queue is small.
+ *
+ * The result is that we avoid latency spikes caused by the
+ * time it takes to perform the collapse logic when the receive
+ * queue is large and full, while preserving existing behavior
+ * and performance for all other cases.
+ */
+ if (net->ipv4.sysctl_tcp_collapse_max_bytes &&
+ (atomic_read(&sk->sk_rmem_alloc) > net->ipv4.sysctl_tcp_collapse_max_bytes)) {
+ /* We are dropping the packet */
+ trace_tcp_collapse_max_bytes_exceeded(sk);
+ goto do_not_collapse;
+ }
+
tcp_collapse_ofo_queue(sk);
if (!skb_queue_empty(&sk->sk_receive_queue))
tcp_collapse(sk, &sk->sk_receive_queue, NULL,
@@ -5443,6 +5477,8 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
return 0;

+do_not_collapse:
+
/* If we are really being abused, tell the caller to silently
* drop receive data on the floor. It will get retransmitted
* and hopefully then we'll have sufficient space.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b9d55277cb85..5e577877158b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3275,6 +3275,8 @@ static int __net_init tcp_sk_init(struct net *net)
else
net->ipv4.tcp_congestion_control = &tcp_reno;

+ net->ipv4.sysctl_tcp_collapse_max_bytes = 0;
+
return 0;
}

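Per the tcp_sk_init() hunk above, tcp_collapse_max_bytes defaults to 0, so the collapse short-circuit stays off until the sysctl is set. A minimal sketch of enabling it at runtime by writing the per-namespace procfs file (the 6 MiB threshold is an arbitrary example value, not a recommendation from the patch):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/net/ipv4/tcp_collapse_max_bytes", "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		/* Writing 0 restores the stock always-collapse behavior. */
		fprintf(f, "%u\n", 6 * 1024 * 1024);
		return fclose(f) ? 1 : 0;
	}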
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 100a386fcd71..a3ec7265fb57 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -542,14 +542,21 @@ quiet_cmd_xzmisc = XZMISC $@
# decompression is used, like initramfs decompression, zstd22 should likely not
# be used because it would require zstd to allocate a 128 MB buffer.

+ifdef CONFIG_ZSTD_COMPRESSION_LEVEL
+zstd_comp_val := $(CONFIG_ZSTD_COMPRESSION_LEVEL)
+ifeq ($(shell test $(zstd_comp_val) -gt 19; echo $$?),0)
+zstd_comp_val += --ultra
+endif
+endif
+
quiet_cmd_zstd = ZSTD $@
- cmd_zstd = cat $(real-prereqs) | $(ZSTD) -19 > $@
+ cmd_zstd = cat $(real-prereqs) | $(ZSTD) -T0 -19 > $@

quiet_cmd_zstd22 = ZSTD22 $@
- cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -22 --ultra > $@
+ cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -T0 -22 --ultra > $@

quiet_cmd_zstd22_with_size = ZSTD22 $@
- cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -22 --ultra; $(size_append); } > $@
+ cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -T0 -$(zstd_comp_val); $(size_append); } > $@

# ASM offsets
# ---------------------------------------------------------------------------
diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst
index ab0c5bd1a60f..f4989f706d7f 100644
--- a/scripts/Makefile.modinst
+++ b/scripts/Makefile.modinst
@@ -100,8 +100,13 @@ quiet_cmd_gzip = GZIP $@
cmd_gzip = $(KGZIP) -n -f $<
quiet_cmd_xz = XZ $@
cmd_xz = $(XZ) --lzma2=dict=2MiB -f $<
+ifdef CONFIG_MODULE_COMPRESS_ZSTD_ULTRA
quiet_cmd_zstd = ZSTD $@
- cmd_zstd = $(ZSTD) -T0 --rm -f -q $<
+ cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL_ULTRA) --ultra --zstd=wlog=21 -T0 --rm -f -q $<
+else
+quiet_cmd_zstd = ZSTD $@
+ cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL) --zstd=wlog=21 -T0 --rm -f -q $<
+endif

$(dst)/%.ko.gz: $(dst)/%.ko FORCE
$(call cmd,gzip)
--
2.40.1

From 9e165ac849652399c952c5e1764ca9a7630a28c7 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Tue, 25 Apr 2023 17:17:39 +0200
Subject: [PATCH 04/10] fixes

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
Documentation/ABI/stable/sysfs-block | 10 +
.../testing/sysfs-class-led-trigger-blkdev | 78 ++
Documentation/leds/index.rst | 1 +
Documentation/leds/ledtrig-blkdev.rst | 158 +++
arch/x86/kernel/acpi/boot.c | 11 +-
arch/x86/net/bpf_jit_comp.c | 5 +-
drivers/bluetooth/btusb.c | 2 +-
drivers/leds/trigger/Kconfig | 9 +
drivers/leds/trigger/Makefile | 1 +
drivers/leds/trigger/ledtrig-blkdev.c | 1221 +++++++++++++++++
fs/eventpoll.c | 188 ++-
include/linux/pageblock-flags.h | 2 +-
kernel/kheaders.c | 10 +-
kernel/padata.c | 4 +-
mm/page_alloc.c | 22 +-
scripts/Makefile.vmlinux_o | 2 +-
sound/pci/hda/cs35l41_hda.c | 2 +-
17 files changed, 1636 insertions(+), 90 deletions(-)
create mode 100644 Documentation/ABI/testing/sysfs-class-led-trigger-blkdev
create mode 100644 Documentation/leds/ledtrig-blkdev.rst
create mode 100644 drivers/leds/trigger/ledtrig-blkdev.c

diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 282de3680367..ac1dd2fbd855 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -101,6 +101,16 @@ Description:
devices that support receiving integrity metadata.


+What: /sys/block/<disk>/linked_leds
+Date: January 2023
+Contact: Ian Pilcher <arequipeno@gmail.com>
+Description:
+ Directory that contains symbolic links to all LEDs that
+ are associated with (linked to) this block device by the
+ blkdev LED trigger. Only present when at least one LED
+ is linked. (See Documentation/leds/ledtrig-blkdev.rst.)
+
+
What: /sys/block/<disk>/<partition>/alignment_offset
Date: April 2009
Contact: Martin K. Petersen <martin.petersen@oracle.com>
diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev
new file mode 100644
index 000000000000..28ce8c814fb7
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev
@@ -0,0 +1,78 @@
+What: /sys/class/leds/<led>/blink_time
+Date: January 2023
+Contact: Ian Pilcher <arequipeno@gmail.com>
+Description:
+ Time (in milliseconds) that the LED will be on during a single
+ "blink".
+
+What: /sys/class/leds/<led>/check_interval
+Date: January 2023
+Contact: Ian Pilcher <arequipeno@gmail.com>
+Description:
+ Interval (in milliseconds) between checks of the block devices
+ linked to this LED. The LED will be blinked if the correct type
+ of activity (see blink_on_{read,write,discard,flush} attributes)
+ has occurred on any of the linked devices since the previous
+ check.
+
+What: /sys/class/leds/<led>/blink_on_read
+Date: January 2023
+Contact: Ian Pilcher <arequipeno@gmail.com>
+Description:
+ Boolean that determines whether the LED will blink in response
+ to read activity on any of its linked block devices.
+
+What: /sys/class/leds/<led>/blink_on_write
+Date: January 2023
+Contact: Ian Pilcher <arequipeno@gmail.com>
+Description:
+ Boolean that determines whether the LED will blink in response
+ to write activity on any of its linked block devices.
+
+What: /sys/class/leds/<led>/blink_on_discard
+Date: January 2023
+Contact: Ian Pilcher <arequipeno@gmail.com>
+Description:
+ Boolean that determines whether the LED will blink in response
+ to discard activity on any of its linked block devices.
+
+What: /sys/class/leds/<led>/blink_on_flush
+Date: January 2023
+Contact: Ian Pilcher <arequipeno@gmail.com>
+Description:
+ Boolean that determines whether the LED will blink in response
+ to cache flush activity on any of its linked block devices.
+
+What: /sys/class/leds/<led>/link_dev_by_path
+Date: January 2023
+Contact: Ian Pilcher <arequipeno@gmail.com>
+Description:
+ Associate a block device with this LED by writing the path to
+ the device special file (e.g. /dev/sda) to this attribute.
+ Symbolic links are followed.
+
+What: /sys/class/leds/<led>/unlink_dev_by_path
+Date: January 2023
+Contact: Ian Pilcher <arequipeno@gmail.com>
+Description:
+ Remove the association between this LED and a block device by
+ writing the path to the device special file (e.g. /dev/sda) to
+ this attribute. Symbolic links are followed.
+
+What: /sys/class/leds/<led>/unlink_dev_by_name
+Date: January 2023
+Contact: Ian Pilcher <arequipeno@gmail.com>
+Description:
+ Remove the association between this LED and a block device by
+ writing the kernel name of the device (e.g. sda) to this
+ attribute.
+
+What: /sys/class/leds/<led>/linked_devices
+Date: January 2023
+Contact: Ian Pilcher <arequipeno@gmail.com>
+Description:
+ Directory containing links to all block devices that are
+ associated with this LED. (Note that the names of the
+ symbolic links in this directory are *kernel* names, which
+ may not match the device special file paths written to
+ link_dev_by_path and unlink_dev_by_path.)
diff --git a/Documentation/leds/index.rst b/Documentation/leds/index.rst
index b9ca081fac71..5e37d8e7bd28 100644
--- a/Documentation/leds/index.rst
+++ b/Documentation/leds/index.rst
@@ -10,6 +10,7 @@ LEDs
leds-class
leds-class-flash
leds-class-multicolor
+ ledtrig-blkdev
ledtrig-oneshot
ledtrig-transient
ledtrig-usbport
diff --git a/Documentation/leds/ledtrig-blkdev.rst b/Documentation/leds/ledtrig-blkdev.rst
new file mode 100644
index 000000000000..9ff5b99de451
--- /dev/null
+++ b/Documentation/leds/ledtrig-blkdev.rst
@@ -0,0 +1,158 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================================
+Block Device (blkdev) LED Trigger
+=================================
+
+Available when ``CONFIG_LEDS_TRIGGER_BLKDEV=y`` or
+``CONFIG_LEDS_TRIGGER_BLKDEV=m``.
+
+See also:
+
+* ``Documentation/ABI/testing/sysfs-class-led-trigger-blkdev``
+* ``Documentation/ABI/stable/sysfs-block`` (``/sys/block/<disk>/linked_leds``)
+
+Overview
+========
+
+.. note::
+ The examples below use ``<LED>`` to refer to the name of a
+ system-specific LED. If no suitable LED is available on a test
+ system (in a virtual machine, for example), it is possible to
+ use a userspace LED. (See ``Documentation/leds/uleds.rst``.)
+
+Verify that the ``blkdev`` LED trigger is available::
+
+ # grep blkdev /sys/class/leds/<LED>/trigger
+ ... rfkill-none blkdev
+
+(If the previous command produces no output, you may need to load the trigger
+module - ``modprobe ledtrig_blkdev``. If the module is not available, check
+the value of ``CONFIG_LEDS_TRIGGER_BLKDEV`` in your kernel configuration.)
+
+Associate the LED with the ``blkdev`` LED trigger::
+
+ # echo blkdev > /sys/class/leds/<LED>/trigger
+
+ # cat /sys/class/leds/<LED>/trigger
+ ... rfkill-none [blkdev]
+
+Note that several new device attributes are available in the
+``/sys/class/leds/<LED>`` directory.
+
+* ``link_dev_by_path``, ``unlink_dev_by_path``, and ``unlink_dev_by_name`` are
+ used to manage the set of block devices associated with this LED. The LED
+ will blink when activity occurs on any of its linked devices.
+
+* ``blink_on_read``, ``blink_on_write``, ``blink_on_discard``, and
+ ``blink_on_flush`` are boolean values that determine whether the LED will
+ blink when a particular type of activity is detected on one of its linked
+ block devices.
+
+* ``blink_time`` is the duration (in milliseconds) of each blink of this LED.
+ (The minimum value is 10 milliseconds.)
+
+* ``check_interval`` is the frequency (in milliseconds) with which block devices
+ linked to this LED will be checked for activity and the LED blinked (if the
+ correct type of activity has occurred).
+
+* The ``linked_devices`` directory will contain a symbolic link to every device
+ that is associated with this LED.
+
+Link a block device to the LED::
+
+ # echo /dev/sda > /sys/class/leds/<LED>/link_dev_by_path
+
+ # ls /sys/class/leds/<LED>/linked_devices
+ sda
+
+(The value written to ``link_dev_by_path`` must be the path of the device
+special file, such as ``/dev/sda``, that represents the block device - or the
+path of a symbolic link to such a device special file.)
+
+Activity on the device will now cause the LED to blink. The duration of each
+blink (in milliseconds) can be adjusted by setting
+``/sys/class/leds/<LED>/blink_time``. (But see **check_interval and
+blink_time** below.)
+
+Associate a second device with the LED::
+
+ # echo /dev/sdb > /sys/class/leds/<LED>/link_dev_by_path
+
+ # ls /sys/class/leds/<LED>/linked_devices
+ sda sdb
+
+When a block device is linked to one or more LEDs, the LEDs are linked from
+the device's ``linked_leds`` directory::
+
+ # ls /sys/class/block/sd{a,b}/linked_leds
+ /sys/class/block/sda/linked_leds:
+ <LED>
+
+ /sys/class/block/sdb/linked_leds:
+ <LED>
+
+(The ``linked_leds`` directory only exists when the block device is linked to
+at least one LED.)
+
+``check_interval`` and ``blink_time``
+=====================================
+
+* By default, linked block devices are checked for activity every 100
+ milliseconds. This frequency can be changed for an LED via the
+ ``/sys/class/leds/<led>/check_interval`` attribute. (The minimum value is 25
+ milliseconds.)
+
+* All block devices associated with an LED are checked for activity every
+ ``check_interval`` milliseconds, and a blink is triggered if the correct type
+ of activity (as determined by the LED's ``blink_on_*`` attributes) is
+ detected. The duration of an LED's blink is determined by its ``blink_time``
+ attribute. Thus (when the correct type of activity is detected), the LED will
+ be on for ``blink_time`` milliseconds and off for
+ ``check_interval - blink_time`` milliseconds.
+
+* The LED subsystem ignores new blink requests for an LED that is already
+ in the process of blinking, so setting a ``blink_time`` greater than or equal
+ to ``check_interval`` will cause some blinks to be missed.
+
+* Because of processing times, scheduling latencies, etc., avoiding missed
+ blinks actually requires a difference of at least a few milliseconds between
+ the ``blink_time`` and ``check_interval``. The required difference is likely
+ to vary from system to system. As a reference, a Thecus N5550 NAS requires a
+ difference of 7 milliseconds (e.g. ``check_interval == 100``,
+ ``blink_time == 93``).
+
+* The default values (``check_interval == 100``, ``blink_time == 75``) cause the
+ LED associated with a continuously active device to blink rapidly. For a more
+ "always on" effect, increase the ``blink_time`` (but not too much; see the
+ previous bullet).
+
+Other Notes
+===========
+
+* Many (possibly all) types of block devices work with this trigger, including:
+
+ * SCSI (including SATA and USB) hard disk drives and SSDs
+ * SCSI (including SATA and USB) optical drives
+ * NVMe SSDs
+ * SD cards
+ * loopback block devices (``/dev/loop*``)
+ * device mapper devices, such as LVM logical volumes
+ * MD RAID devices
+ * zRAM compressed RAM-disks
+ * partitions on block devices that support them
+
+* The names of the symbolic links in ``/sys/class/leds/<LED>/linked_devices``
+ are **kernel** names, which may not match the paths used for
+ ``link_dev_by_path`` and ``unlink_dev_by_path``. This is most likely when a
+ symbolic link is used to refer to the device (as is common with logical
+ volumes), but it can be true for any device, because nothing prevents the
+ creation of device special files with arbitrary names (e.g.
+ ``sudo mknod /foo b 8 0``).
+
+ Kernel names can be used to unlink block devices from LEDs by writing them to
+ the LED's ``unlink_dev_by_name`` attribute.
+
+* The ``blkdev`` LED trigger supports many-to-many device/LED associations.
+ A device can be associated with multiple LEDs, and an LED can be associated
+ with multiple devices.
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 0dac4ab5b55b..21b542a6866c 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1858,13 +1858,18 @@ early_param("acpi_sci", setup_acpi_sci);

int __acpi_acquire_global_lock(unsigned int *lock)
{
- unsigned int old, new;
+ unsigned int old, new, val;

old = READ_ONCE(*lock);
do {
- new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1));
+ val = (old >> 1) & 0x1;
+ new = (old & ~0x3) + 2 + val;
} while (!try_cmpxchg(lock, &old, new));
- return ((new & 0x3) < 3) ? -1 : 0;
+
+ if (val)
+ return 0;
+
+ return -1;
}

int __acpi_release_global_lock(unsigned int *lock)
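The rewritten __acpi_acquire_global_lock() keeps the FACS lock-word encoding (bit 0 = pending, bit 1 = owned): the acquire attempt always sets the owned bit and additionally sets pending when the lock was already owned, in which case the caller must wait. A standalone sketch of the arithmetic (illustration only, minus the cmpxchg loop):

	#include <stdio.h>

	/* Mirrors the acquire arithmetic from the hunk above. */
	static int acquire(unsigned int *lock)
	{
		unsigned int old = *lock, val, new;

		val = (old >> 1) & 0x1;        /* lock already owned? */
		new = (old & ~0x3) + 2 + val;  /* set owned; pending if contended */
		*lock = new;
		return val ? 0 : -1;           /* -1: acquired, 0: must wait */
	}

	int main(void)
	{
		unsigned int free_word = 0x0, owned_word = 0x2;

		printf("free:  ret=%d word=0x%x\n", acquire(&free_word), free_word);
		printf("owned: ret=%d word=0x%x\n", acquire(&owned_word), owned_word);
		return 0;
	}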
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 1056bbf55b17..212bfd1517ec 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -343,9 +343,10 @@ static int emit_call(u8 **pprog, void *func, void *ip)

static int emit_rsb_call(u8 **pprog, void *func, void *ip)
{
+ void *adjusted_ip;
OPTIMIZER_HIDE_VAR(func);
- x86_call_depth_emit_accounting(pprog, func);
- return emit_patch(pprog, func, ip, 0xE8);
+ adjusted_ip = (u8 *)ip + x86_call_depth_emit_accounting(pprog, func);
+ return emit_patch(pprog, func, adjusted_ip, 0xE8);
}

static int emit_jump(u8 **pprog, void *func, void *ip)
diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 5c536151ef83..5a80379253a7 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -912,7 +912,7 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev)
}

gpiod_set_value_cansleep(reset_gpio, 0);
- msleep(200);
+ usleep_range(USEC_PER_SEC / 2, USEC_PER_SEC);
gpiod_set_value_cansleep(reset_gpio, 1);

return;
diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
index dc6816d36d06..bda249068182 100644
--- a/drivers/leds/trigger/Kconfig
+++ b/drivers/leds/trigger/Kconfig
@@ -154,4 +154,13 @@ config LEDS_TRIGGER_TTY

When built as a module this driver will be called ledtrig-tty.

+config LEDS_TRIGGER_BLKDEV
+ tristate "LED Trigger for block devices"
+ depends on BLOCK
+ help
+ The blkdev LED trigger allows LEDs to be controlled by block device
+ activity (reads and writes).
+
+ See Documentation/leds/ledtrig-blkdev.rst.
+
endif # LEDS_TRIGGERS
diff --git a/drivers/leds/trigger/Makefile b/drivers/leds/trigger/Makefile
index 25c4db97cdd4..d53bab5d93f1 100644
--- a/drivers/leds/trigger/Makefile
+++ b/drivers/leds/trigger/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_LEDS_TRIGGER_NETDEV) += ledtrig-netdev.o
obj-$(CONFIG_LEDS_TRIGGER_PATTERN) += ledtrig-pattern.o
obj-$(CONFIG_LEDS_TRIGGER_AUDIO) += ledtrig-audio.o
obj-$(CONFIG_LEDS_TRIGGER_TTY) += ledtrig-tty.o
+obj-$(CONFIG_LEDS_TRIGGER_BLKDEV) += ledtrig-blkdev.o
diff --git a/drivers/leds/trigger/ledtrig-blkdev.c b/drivers/leds/trigger/ledtrig-blkdev.c
new file mode 100644
index 000000000000..067eedb003b5
--- /dev/null
+++ b/drivers/leds/trigger/ledtrig-blkdev.c
@@ -0,0 +1,1221 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Block device LED trigger
+ *
+ * Copyright 2021-2022 Ian Pilcher <arequipeno@gmail.com>
+ */
+
+#include <linux/blkdev.h>
+#include <linux/leds.h>
+#include <linux/module.h>
+#include <linux/part_stat.h>
+#include <linux/xarray.h>
+
+/**
+ * DOC: Overview
+ *
+ * The ``blkdev`` LED trigger works by periodically checking the activity
+ * counters of block devices that have been linked to one or more LEDs and
+ * blinking those LED(s) if the correct type of activity has occurred. The
+ * periodic check is scheduled with the Linux kernel's deferred work facility.
+ *
+ * Trigger-specific data about block devices and LEDs is stored in two data
+ * structures --- &struct blkdev_trig_bdev (a "BTB") and &struct blkdev_trig_led
+ * (a "BTL"). Each structure contains a &struct xarray that holds links to any
+ * linked devices of the other type. I.e. &blkdev_trig_bdev.linked_btls
+ * contains links to all BTLs whose LEDs have been linked to the BTB's block
+ * device, and &blkdev_trig_led.linked_btbs contains links to all BTBs whose
+ * block devices have been linked to the BTL's LED. Thus, a block device can
+ * be linked to more than one LED, and an LED can be linked to more than one
+ * block device.
+ */
+
+/* Default, minimum & maximum blink duration (milliseconds) */
+#define BLKDEV_TRIG_BLINK_DEF 75
+#define BLKDEV_TRIG_BLINK_MIN 10
+#define BLKDEV_TRIG_BLINK_MAX 86400000 /* 24 hours */
+
+/* Default, minimum & maximum activity check interval (milliseconds) */
+#define BLKDEV_TRIG_CHECK_DEF 100
+#define BLKDEV_TRIG_CHECK_MIN 25
+#define BLKDEV_TRIG_CHECK_MAX 86400000 /* 24 hours */
+
+/*
+ * If blkdev_trig_check() can't lock the mutex, how long to wait before trying
+ * again (milliseconds)
+ */
+#define BLKDEV_TRIG_CHECK_RETRY 5
+
+/* Mode argument for calls to blkdev_get_by_path() and blkdev_put() */
+#define BLKDEV_TRIG_FMODE 0
+
+/**
+ * struct blkdev_trig_bdev - Trigger-specific data about a block device.
+ * @last_checked: Time (in jiffies) at which the trigger last checked this
+ * block device for activity.
+ * @last_activity: Time (in jiffies) at which the trigger last detected
+ * activity of each type.
+ * @ios: Activity counter values for each type, corresponding to
+ * the timestamps in &last_activity.
+ * @index: &xarray index, so the BTB can be included in one or more
+ * &blkdev_trig_led.linked_btbs.
+ * @bdev: The block device.
+ * @linked_btls: The BTLs that represent the LEDs linked to the BTB's
+ * block device.
+ *
+ * Every block device linked to at least one LED gets a "BTB." A BTB is created
+ * when a block device that is not currently linked to any LEDs is linked to an
+ * LED.
+ *
+ * A BTB is freed when one of the following occurs:
+ *
+ * * The number of LEDs linked to the block device becomes zero, because it has
+ * been unlinked from its last LED using the trigger's &sysfs interface.
+ *
+ * * The number of LEDs linked to the block device becomes zero, because the
+ * last LED to which it was linked has been disassociated from the trigger
+ * (which happens automatically if the LED device is removed from the system).
+ *
+ * * The BTB's block device is removed from the system. To accommodate this
+ * scenario, BTBs are created as device resources, so that the release
+ * function will be called by the driver core when the device is removed.
+ */
+struct blkdev_trig_bdev {
+ unsigned long last_checked;
+ unsigned long last_activity[NR_STAT_GROUPS];
+ unsigned long ios[NR_STAT_GROUPS];
+ unsigned long index;
+ struct block_device *bdev;
+ struct xarray linked_btls;
+};
+
+/**
+ * struct blkdev_trig_led - Trigger-specific data about an LED.
+ * @last_checked: Time (in jiffies) at which the trigger last checked
+ * the block devices linked to this LED for activity.
+ * @index: &xarray index, so the BTL can be included in one or more
+ * &blkdev_trig_bdev.linked_btls.
+ * @mode: Bitmask for types of block device activity that will
+ * cause this LED to blink --- reads, writes, discards,
+ * etc.
+ * @led: The LED device.
+ * @blink_msec: Duration of a blink (milliseconds).
+ * @check_jiffies: Frequency with which block devices linked to this LED
+ * should be checked for activity (jiffies).
+ * @linked_btbs: The BTBs that represent the block devices linked to the
+ * BTL's LED.
+ * @all_btls_node: The BTL's node in the module's list of all BTLs.
+ *
+ * Every LED associated with the block device trigger gets a "BTL." A BTL is
+ * created when the trigger is "activated" on an LED (usually by writing
+ * ``blkdev`` to the LED's &sysfs &trigger attribute). A BTL is freed when its
+ * LED is disassociated from the trigger, either through the trigger's &sysfs
+ * interface or because the LED device is removed from the system.
+ */
+struct blkdev_trig_led {
+ unsigned long last_checked;
+ unsigned long index;
+ unsigned long mode; /* must be ulong for atomic bit ops */
+ struct led_classdev *led;
+ unsigned int blink_msec;
+ unsigned int check_jiffies;
+ struct xarray linked_btbs;
+ struct hlist_node all_btls_node;
+};
+
+/* Protects everything except atomic LED attributes */
+static DEFINE_MUTEX(blkdev_trig_mutex);
+
+/* BTB device resource release function */
+static void blkdev_trig_btb_release(struct device *dev, void *res);
+
+/* Index for next BTB or BTL */
+static unsigned long blkdev_trig_next_index;
+
+/* All LEDs associated with the trigger */
+static HLIST_HEAD(blkdev_trig_all_btls);
+
+/* Delayed work to periodically check for activity & blink LEDs */
+static void blkdev_trig_check(struct work_struct *work);
+static DECLARE_DELAYED_WORK(blkdev_trig_work, blkdev_trig_check);
+
+/* When is the delayed work scheduled to run next (jiffies) */
+static unsigned long blkdev_trig_next_check;
+
+/* Total number of BTB-to-BTL links */
+static unsigned int blkdev_trig_link_count;
+
+/* Empty sysfs attribute list for next 2 declarations */
+static struct attribute *blkdev_trig_attrs_empty[] = { NULL };
+
+/* linked_leds sysfs directory for block devs linked to 1 or more LEDs */
+static const struct attribute_group blkdev_trig_linked_leds = {
+ .name = "linked_leds",
+ .attrs = blkdev_trig_attrs_empty,
+};
+
+/* linked_devices sysfs directory for each LED associated with the trigger */
+static const struct attribute_group blkdev_trig_linked_devs = {
+ .name = "linked_devices",
+ .attrs = blkdev_trig_attrs_empty,
+};
+
+
+/*
+ *
+ * Delayed work to check for activity & blink LEDs
+ *
+ */
+
+/**
+ * blkdev_trig_blink() - Blink an LED, if the correct type of activity has
+ * occurred on the block device.
+ * @btl: The BTL that represents the LED
+ * @btb: The BTB that represents the block device
+ *
+ * Context: Process context. Caller must hold &blkdev_trig_mutex.
+ * Return: &true if the LED is blinked, &false if not.
+ */
+static bool blkdev_trig_blink(const struct blkdev_trig_led *btl,
+ const struct blkdev_trig_bdev *btb)
+{
+ unsigned long mode, mask, delay_on, delay_off;
+ enum stat_group i;
+
+ mode = READ_ONCE(btl->mode);
+
+ for (i = STAT_READ, mask = 1; i <= STAT_FLUSH; ++i, mask <<= 1) {
+
+ if (!(mode & mask))
+ continue;
+
+ if (time_before_eq(btb->last_activity[i], btl->last_checked))
+ continue;
+
+ delay_on = READ_ONCE(btl->blink_msec);
+ delay_off = 1; /* 0 leaves LED turned on */
+
+ led_blink_set_oneshot(btl->led, &delay_on, &delay_off, 0);
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * blkdev_trig_update_btb() - Update a BTB's activity counters and timestamps.
+ * @btb: The BTB
+ * @now: Timestamp (in jiffies)
+ *
+ * Context: Process context. Caller must hold &blkdev_trig_mutex.
+ */
+static void blkdev_trig_update_btb(struct blkdev_trig_bdev *btb,
+ unsigned long now)
+{
+ unsigned long new_ios;
+ enum stat_group i;
+
+ for (i = STAT_READ; i <= STAT_FLUSH; ++i) {
+
+ new_ios = part_stat_read(btb->bdev, ios[i]);
+
+ if (new_ios != btb->ios[i]) {
+ btb->ios[i] = new_ios;
+ btb->last_activity[i] = now;
+ }
+ }
+
+ btb->last_checked = now;
+}
+
+/**
+ * blkdev_trig_check() - Check linked devices for activity and blink LEDs.
+ * @work: Delayed work (&blkdev_trig_work)
+ *
+ * Context: Process context. Takes and releases &blkdev_trig_mutex.
+ */
+static void blkdev_trig_check(struct work_struct *work)
+{
+ struct blkdev_trig_led *btl;
+ struct blkdev_trig_bdev *btb;
+ unsigned long index, delay, now, led_check, led_delay;
+ bool blinked;
+
+ if (!mutex_trylock(&blkdev_trig_mutex)) {
+ delay = msecs_to_jiffies(BLKDEV_TRIG_CHECK_RETRY);
+ goto exit_reschedule;
+ }
+
+ now = jiffies;
+ delay = ULONG_MAX;
+
+ hlist_for_each_entry (btl, &blkdev_trig_all_btls, all_btls_node) {
+
+ led_check = btl->last_checked + btl->check_jiffies;
+
+ if (time_before_eq(led_check, now)) {
+
+ blinked = false;
+
+ xa_for_each (&btl->linked_btbs, index, btb) {
+
+ if (btb->last_checked != now)
+ blkdev_trig_update_btb(btb, now);
+ if (!blinked)
+ blinked = blkdev_trig_blink(btl, btb);
+ }
+
+ btl->last_checked = now;
+ led_delay = btl->check_jiffies;
+
+ } else {
+ led_delay = led_check - now;
+ }
+
+ if (led_delay < delay)
+ delay = led_delay;
+ }
+
+ mutex_unlock(&blkdev_trig_mutex);
+
+exit_reschedule:
+ WARN_ON_ONCE(delay == ULONG_MAX);
+ WARN_ON_ONCE(!schedule_delayed_work(&blkdev_trig_work, delay));
+}
+
+/**
+ * blkdev_trig_sched_led() - Set the schedule of the delayed work when a new
+ * LED is added to the schedule.
+ * @btl: The BTL that represents the LED
+ *
+ * Called when the number of block devices to which an LED is linked becomes
+ * non-zero.
+ *
+ * Context: Process context. Caller must hold &blkdev_trig_mutex.
+ */
+static void blkdev_trig_sched_led(const struct blkdev_trig_led *btl)
+{
+ unsigned long delay = READ_ONCE(btl->check_jiffies);
+ unsigned long check_by = jiffies + delay;
+
+ /*
+ * If no other LED-to-block device links exist, simply schedule the
+ * delayed work according to this LED's check_interval attribute
+ * (check_jiffies).
+ */
+ if (blkdev_trig_link_count == 0) {
+ WARN_ON(!schedule_delayed_work(&blkdev_trig_work, delay));
+ blkdev_trig_next_check = check_by;
+ return;
+ }
+
+ /*
+ * If the next check is already scheduled to occur soon enough to
+ * accommodate this LED's check_interval, the schedule doesn't need
+ * to be changed.
+ */
+ if (time_after_eq(check_by, blkdev_trig_next_check))
+ return;
+
+ /*
+ * Modify the schedule, so that the delayed work runs soon enough for
+ * this LED.
+ */
+ WARN_ON(!mod_delayed_work(system_wq, &blkdev_trig_work, delay));
+ blkdev_trig_next_check = check_by;
+}
+
+
+/*
+ *
+ * Linking and unlinking LEDs and block devices
+ *
+ */
+
+/**
+ * blkdev_trig_link() - Link a block device to an LED.
+ * @btl: The BTL that represents the LED
+ * @btb: The BTB that represents the block device
+ *
+ * Context: Process context. Caller must hold &blkdev_trig_mutex.
+ * Return: &0 on success, negative &errno on error.
+ */
+static int blkdev_trig_link(struct blkdev_trig_led *btl,
+ struct blkdev_trig_bdev *btb)
+{
+ bool led_first_link;
+ int err;
+
+ led_first_link = xa_empty(&btl->linked_btbs);
+
+ err = xa_insert(&btb->linked_btls, btl->index, btl, GFP_KERNEL);
+ if (err)
+ return err;
+
+ err = xa_insert(&btl->linked_btbs, btb->index, btb, GFP_KERNEL);
+ if (err)
+ goto error_erase_btl;
+
+ /* Create /sys/class/block/<bdev>/linked_leds/<led> symlink */
+ err = sysfs_add_link_to_group(bdev_kobj(btb->bdev),
+ blkdev_trig_linked_leds.name,
+ &btl->led->dev->kobj, btl->led->name);
+ if (err)
+ goto error_erase_btb;
+
+ /* Create /sys/class/leds/<led>/linked_devices/<bdev> symlink */
+ err = sysfs_add_link_to_group(&btl->led->dev->kobj,
+ blkdev_trig_linked_devs.name,
+ bdev_kobj(btb->bdev),
+ dev_name(&btb->bdev->bd_device));
+ if (err)
+ goto error_remove_symlink;
+
+ /*
+ * If this is the first block device linked to this LED, the delayed
+ * work schedule may need to be changed.
+ */
+ if (led_first_link)
+ blkdev_trig_sched_led(btl);
+
+ ++blkdev_trig_link_count;
+
+ return 0;
+
+error_remove_symlink:
+ sysfs_remove_link_from_group(bdev_kobj(btb->bdev),
+ blkdev_trig_linked_leds.name,
+ btl->led->name);
+error_erase_btb:
+ xa_erase(&btl->linked_btbs, btb->index);
+error_erase_btl:
+ xa_erase(&btb->linked_btls, btl->index);
+ return err;
+}
+
+/**
+ * blkdev_trig_put_btb() - Remove and free a BTB, if it is no longer needed.
+ * @btb: The BTB
+ *
+ * Does nothing if the BTB (block device) is still linked to at least one LED.
+ *
+ * Context: Process context. Caller must hold &blkdev_trig_mutex.
+ */
+static void blkdev_trig_put_btb(struct blkdev_trig_bdev *btb)
+{
+ struct block_device *bdev = btb->bdev;
+ int err;
+
+ if (xa_empty(&btb->linked_btls)) {
+
+ sysfs_remove_group(bdev_kobj(bdev), &blkdev_trig_linked_leds);
+ err = devres_destroy(&bdev->bd_device, blkdev_trig_btb_release,
+ NULL, NULL);
+ WARN_ON(err);
+ }
+}
+
+/**
+ * _blkdev_trig_unlink_always() - Perform the unconditionally required steps of
+ * unlinking a block device from an LED.
+ * @btl: The BTL that represents the LED
+ * @btb: The BTB that represents the block device
+ *
+ * When a block device is unlinked from an LED, certain steps must be performed
+ * only if the block device is **not** being released. This function performs
+ * those steps that are **always** required, whether or not the block device is
+ * being released.
+ *
+ * Context: Process context. Caller must hold &blkdev_trig_mutex.
+ */
+static void _blkdev_trig_unlink_always(struct blkdev_trig_led *btl,
+ struct blkdev_trig_bdev *btb)
+{
+ --blkdev_trig_link_count;
+
+ if (blkdev_trig_link_count == 0)
+ WARN_ON(!cancel_delayed_work_sync(&blkdev_trig_work));
+
+ xa_erase(&btb->linked_btls, btl->index);
+ xa_erase(&btl->linked_btbs, btb->index);
+
+ /* Remove /sys/class/leds/<led>/linked_devices/<bdev> symlink */
+ sysfs_remove_link_from_group(&btl->led->dev->kobj,
+ blkdev_trig_linked_devs.name,
+ dev_name(&btb->bdev->bd_device));
+}
+
+/**
+ * blkdev_trig_unlink_norelease() - Unlink an LED from a block device that is
+ * **not** being released.
+ * @btl: The BTL that represents the LED.
+ * @btb: The BTB that represents the block device.
+ *
+ * Context: Process context. Caller must hold &blkdev_trig_mutex.
+ */
+static void blkdev_trig_unlink_norelease(struct blkdev_trig_led *btl,
+ struct blkdev_trig_bdev *btb)
+{
+ _blkdev_trig_unlink_always(btl, btb);
+
+ /* Remove /sys/class/block/<bdev>/linked_leds/<led> symlink */
+ sysfs_remove_link_from_group(bdev_kobj(btb->bdev),
+ blkdev_trig_linked_leds.name,
+ btl->led->name);
+
+ blkdev_trig_put_btb(btb);
+}
+
+/**
+ * blkdev_trig_unlink_release() - Unlink an LED from a block device that is
+ * being released.
+ * @btl: The BTL that represents the LED
+ * @btb: The BTB that represents the block device
+ *
+ * Context: Process context. Caller must hold &blkdev_trig_mutex.
+ */
+static void blkdev_trig_unlink_release(struct blkdev_trig_led *btl,
+ struct blkdev_trig_bdev *btb)
+{
+ _blkdev_trig_unlink_always(btl, btb);
+
+ /*
+ * If the BTB is being released, the driver core has already removed the
+ * device's attribute groups, and the BTB will be freed automatically,
+ * so there's nothing else to do.
+ */
+}
+
+
+/*
+ *
+ * BTB creation
+ *
+ */
+
+/**
+ * blkdev_trig_btb_release() - BTB device resource release function.
+ * @dev: The block device
+ * @res: The BTB
+ *
+ * Called by the driver core when a block device with a BTB is removed.
+ *
+ * Context: Process context. Takes and releases &blkdev_trig_mutex.
+ */
+static void blkdev_trig_btb_release(struct device *dev, void *res)
+{
+ struct blkdev_trig_bdev *btb = res;
+ struct blkdev_trig_led *btl;
+ unsigned long index;
+
+ mutex_lock(&blkdev_trig_mutex);
+
+ xa_for_each (&btb->linked_btls, index, btl)
+ blkdev_trig_unlink_release(btl, btb);
+
+ mutex_unlock(&blkdev_trig_mutex);
+}
+
+/**
+ * blkdev_trig_get_bdev() - Get a block device by path.
+ * @path: The value written to an LED's &link_dev_by_path or
+ * &unlink_dev_by_path attribute, which should be the path to a
+ * special file that represents a block device
+ * @len: The number of characters in &path (not including its
+ * terminating null)
+ *
+ * The caller must call blkdev_put() when finished with the device.
+ *
+ * Context: Process context.
+ * Return: The block device, or an error pointer.
+ */
+static struct block_device *blkdev_trig_get_bdev(const char *path, size_t len)
+{
+ struct block_device *bdev;
+ char *buf;
+
+ buf = kmemdup(path, len + 1, GFP_KERNEL); /* +1 to include null */
+ if (buf == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ bdev = blkdev_get_by_path(strim(buf), BLKDEV_TRIG_FMODE, THIS_MODULE);
+ kfree(buf);
+ return bdev;
+}
+
+/**
+ * blkdev_trig_get_btb() - Find or create the BTB for a block device.
+ * @path: The value written to an LED's &link_dev_by_path attribute,
+ * which should be the path to a special file that represents a
+ * block device
+ * @len: The number of characters in &path
+ *
+ * If a new BTB is created, because the block device was not previously linked
+ * to any LEDs, the block device's &linked_leds &sysfs directory is created.
+ *
+ * Context: Process context. Caller must hold &blkdev_trig_mutex.
+ * Return: Pointer to the BTB, error pointer on error.
+ */
+static struct blkdev_trig_bdev *blkdev_trig_get_btb(const char *path,
+ size_t len)
+{
+ struct block_device *bdev;
+ struct blkdev_trig_bdev *btb;
+ int err;
+
+ bdev = blkdev_trig_get_bdev(path, len);
+ if (IS_ERR(bdev))
+ return ERR_CAST(bdev);
+
+ btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release,
+ NULL, NULL);
+ if (btb != NULL) {
+ err = 0;
+ goto exit_put_bdev;
+ }
+
+ if (blkdev_trig_next_index == ULONG_MAX) {
+ err = -EOVERFLOW;
+ goto exit_put_bdev;
+ }
+
+ btb = devres_alloc(blkdev_trig_btb_release, sizeof(*btb), GFP_KERNEL);
+ if (btb == NULL) {
+ err = -ENOMEM;
+ goto exit_put_bdev;
+ }
+
+ err = sysfs_create_group(bdev_kobj(bdev), &blkdev_trig_linked_leds);
+ if (err)
+ goto exit_free_btb;
+
+ btb->index = blkdev_trig_next_index++;
+ btb->bdev = bdev;
+ xa_init(&btb->linked_btls);
+
+ /* Populate BTB activity counters */
+ blkdev_trig_update_btb(btb, jiffies);
+
+ devres_add(&bdev->bd_device, btb);
+
+exit_free_btb:
+ if (err)
+ devres_free(btb);
+exit_put_bdev:
+ blkdev_put(bdev, BLKDEV_TRIG_FMODE);
+ return err ? ERR_PTR(err) : btb;
+}
+
+
+/*
+ *
+ * Activating and deactivating the trigger on an LED
+ *
+ */
+
+/**
+ * blkdev_trig_activate() - Called by the LEDs subsystem when an LED is
+ * associated with the trigger.
+ * @led: The LED
+ *
+ * Context: Process context. Takes and releases &blkdev_trig_mutex.
+ * Return: &0 on success, negative &errno on error.
+ */
+static int blkdev_trig_activate(struct led_classdev *led)
+{
+ struct blkdev_trig_led *btl;
+ int err;
+
+ btl = kzalloc(sizeof(*btl), GFP_KERNEL);
+ if (btl == NULL)
+ return -ENOMEM;
+
+ err = mutex_lock_interruptible(&blkdev_trig_mutex);
+ if (err)
+ goto exit_free;
+
+ if (blkdev_trig_next_index == ULONG_MAX) {
+ err = -EOVERFLOW;
+ goto exit_unlock;
+ }
+
+ btl->index = blkdev_trig_next_index++;
+ btl->last_checked = jiffies;
+ btl->mode = -1; /* set all bits */
+ btl->led = led;
+ btl->blink_msec = BLKDEV_TRIG_BLINK_DEF;
+ btl->check_jiffies = msecs_to_jiffies(BLKDEV_TRIG_CHECK_DEF);
+ xa_init(&btl->linked_btbs);
+
+ hlist_add_head(&btl->all_btls_node, &blkdev_trig_all_btls);
+ led_set_trigger_data(led, btl);
+
+exit_unlock:
+ mutex_unlock(&blkdev_trig_mutex);
+exit_free:
+ if (err)
+ kfree(btl);
+ return err;
+}
+
+/**
+ * blkdev_trig_deactivate() - Called by the LEDs subsystem when an LED is
+ * disassociated from the trigger.
|
|
+ * @led: The LED
|
|
+ *
|
|
+ * The LEDs subsystem also calls this function when an LED associated with the
|
|
+ * trigger is removed or when the trigger is unregistered (if the module is
|
|
+ * unloaded).
|
|
+ *
|
|
+ * Context: Process context. Takes and releases &blkdev_trig_mutex.
|
|
+ */
|
|
+static void blkdev_trig_deactivate(struct led_classdev *led)
|
|
+{
|
|
+ struct blkdev_trig_led *btl = led_get_trigger_data(led);
|
|
+ struct blkdev_trig_bdev *btb;
|
|
+ unsigned long index;
|
|
+
|
|
+ mutex_lock(&blkdev_trig_mutex);
|
|
+
|
|
+ xa_for_each (&btl->linked_btbs, index, btb)
|
|
+ blkdev_trig_unlink_norelease(btl, btb);
|
|
+
|
|
+ hlist_del(&btl->all_btls_node);
|
|
+ kfree(btl);
|
|
+
|
|
+ mutex_unlock(&blkdev_trig_mutex);
|
|
+}
|
|
+
|
|
+
|
|
+/*
|
|
+ *
|
|
+ * Link-related attribute store functions
|
|
+ *
|
|
+ */
|
|
+
|
|
+/**
|
|
+ * link_dev_by_path_store() - &link_dev_by_path device attribute store function.
|
|
+ * @dev: The LED device
|
|
+ * @attr: The &link_dev_by_path attribute (&dev_attr_link_dev_by_path)
|
|
+ * @buf: The value written to the attribute, which should be the path to
|
|
+ * a special file that represents a block device to be linked to
|
|
+ * the LED (e.g. ``/dev/sda``)
|
|
+ * @count: The number of characters in &buf
|
|
+ *
|
|
+ * Context: Process context. Takes and releases &blkdev_trig_mutex.
|
|
+ * Return: &count on success, negative &errno on error.
|
|
+ */
|
|
+static ssize_t link_dev_by_path_store(struct device *dev,
|
|
+ struct device_attribute *attr,
|
|
+ const char *buf, size_t count)
|
|
+{
|
|
+ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
|
|
+ struct blkdev_trig_bdev *btb;
|
|
+ int err;
|
|
+
|
|
+ err = mutex_lock_interruptible(&blkdev_trig_mutex);
|
|
+ if (err)
|
|
+ return err;
|
|
+
|
|
+ btb = blkdev_trig_get_btb(buf, count);
|
|
+ if (IS_ERR(btb)) {
|
|
+ err = PTR_ERR(btb);
|
|
+ goto exit_unlock;
|
|
+ }
|
|
+
|
|
+ if (xa_load(&btb->linked_btls, btl->index) != NULL) {
|
|
+ err = -EEXIST;
|
|
+ goto exit_put_btb;
|
|
+ }
|
|
+
|
|
+ err = blkdev_trig_link(btl, btb);
|
|
+
|
|
+exit_put_btb:
|
|
+ if (err)
|
|
+ blkdev_trig_put_btb(btb);
|
|
+exit_unlock:
|
|
+ mutex_unlock(&blkdev_trig_mutex);
|
|
+ return err ? : count;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * unlink_dev_by_path_store() - &unlink_dev_by_path device attribute store
|
|
+ * function.
|
|
+ * @dev: The LED device
|
|
+ * @attr: The &unlink_dev_by_path attribute (&dev_attr_unlink_dev_by_path)
|
|
+ * @buf: The value written to the attribute, which should be the path to
|
|
+ * a special file that represents a block device to be unlinked
|
|
+ * from the LED (e.g. ``/dev/sda``)
|
|
+ * @count: The number of characters in &buf
|
|
+ *
|
|
+ * Context: Process context. Takes and releases &blkdev_trig_mutex.
|
|
+ * Return: &count on success, negative &errno on error.
|
|
+ */
|
|
+static ssize_t unlink_dev_by_path_store(struct device *dev,
|
|
+ struct device_attribute *attr,
|
|
+ const char *buf, size_t count)
|
|
+{
|
|
+ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
|
|
+ struct block_device *bdev;
|
|
+ struct blkdev_trig_bdev *btb;
|
|
+ int err;
|
|
+
|
|
+ bdev = blkdev_trig_get_bdev(buf, count);
|
|
+ if (IS_ERR(bdev))
|
|
+ return PTR_ERR(bdev);
|
|
+
|
|
+ err = mutex_lock_interruptible(&blkdev_trig_mutex);
|
|
+ if (err)
|
|
+ goto exit_put_bdev;
|
|
+
|
|
+ btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release,
|
|
+ NULL, NULL);
|
|
+ if (btb == NULL) {
|
|
+ err = -EUNATCH; /* bdev isn't linked to any LED */
|
|
+ goto exit_unlock;
|
|
+ }
|
|
+
|
|
+ if (xa_load(&btb->linked_btls, btl->index) == NULL) {
|
|
+ err = -EUNATCH; /* bdev isn't linked to this LED */
|
|
+ goto exit_unlock;
|
|
+ }
|
|
+
|
|
+ blkdev_trig_unlink_norelease(btl, btb);
|
|
+
|
|
+exit_unlock:
|
|
+ mutex_unlock(&blkdev_trig_mutex);
|
|
+exit_put_bdev:
|
|
+ blkdev_put(bdev, BLKDEV_TRIG_FMODE);
|
|
+ return err ? : count;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * unlink_dev_by_name_store() - &unlink_dev_by_name device attribute store
|
|
+ * function.
|
|
+ * @dev: The LED device
|
|
+ * @attr: The &unlink_dev_by_name attribute (&dev_attr_unlink_dev_by_name)
|
|
+ * @buf: The value written to the attribute, which should be the kernel
|
|
+ * name of a block device to be unlinked from the LED (e.g.
|
|
+ * ``sda``)
|
|
+ * @count: The number of characters in &buf
|
|
+ *
|
|
+ * Context: Process context. Takes and releases &blkdev_trig_mutex.
|
|
+ * Return: &count on success, negative &errno on error.
|
|
+ */
|
|
+static ssize_t unlink_dev_by_name_store(struct device *dev,
|
|
+ struct device_attribute *attr,
|
|
+ const char *buf, size_t count)
|
|
+{
|
|
+ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
|
|
+ struct blkdev_trig_bdev *btb;
|
|
+ unsigned long index;
|
|
+ int err;
|
|
+
|
|
+ err = mutex_lock_interruptible(&blkdev_trig_mutex);
|
|
+ if (err)
|
|
+ return err;
|
|
+
|
|
+ err = -EUNATCH;
|
|
+
|
|
+ xa_for_each (&btl->linked_btbs, index, btb) {
|
|
+
|
|
+ if (sysfs_streq(dev_name(&btb->bdev->bd_device), buf)) {
|
|
+ blkdev_trig_unlink_norelease(btl, btb);
|
|
+ err = 0;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ mutex_unlock(&blkdev_trig_mutex);
|
|
+ return err ? : count;
|
|
+}
|
|
+
|
|
+
|
|
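[Editor's note: the link/unlink store functions above are driven entirely
through sysfs writes. The following userspace sketch is not part of the
patch; the LED name "enclosure:green:disk0" and the /dev/sda path are
made-up examples. It shows the intended call sequence end to end using
plain POSIX I/O.]

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write a string to a sysfs attribute, returning 0 on success. */
static int sysfs_write(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	const char *led = "/sys/class/leds/enclosure:green:disk0";
	char path[256];

	/* Attach the blkdev trigger; this calls blkdev_trig_activate(). */
	snprintf(path, sizeof(path), "%s/trigger", led);
	if (sysfs_write(path, "blkdev"))
		perror("trigger");

	/* Link a block device; this lands in link_dev_by_path_store(). */
	snprintf(path, sizeof(path), "%s/link_dev_by_path", led);
	if (sysfs_write(path, "/dev/sda"))
		perror("link_dev_by_path");

	/* Unlink it again by kernel name (unlink_dev_by_name_store()). */
	snprintf(path, sizeof(path), "%s/unlink_dev_by_name", led);
	if (sysfs_write(path, "sda"))
		perror("unlink_dev_by_name");

	return 0;
}
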
+/*
+ *
+ *	Atomic attribute show & store functions
+ *
+ */
+
+/**
+ * blink_time_show() - &blink_time device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &blink_time attribute (&dev_attr_blink_time)
+ * @buf:	Output buffer
+ *
+ * Writes the value of &blkdev_trig_led.blink_msec to &buf.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t blink_time_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	const struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%u\n", READ_ONCE(btl->blink_msec));
+}
+
+/**
+ * blink_time_store() - &blink_time device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &blink_time attribute (&dev_attr_blink_time)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets &blkdev_trig_led.blink_msec to the value in &buf.
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t blink_time_store(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
+	unsigned int value;
+	int err;
+
+	err = kstrtouint(buf, 0, &value);
+	if (err)
+		return err;
+
+	if (value < BLKDEV_TRIG_BLINK_MIN || value > BLKDEV_TRIG_BLINK_MAX)
+		return -ERANGE;
+
+	WRITE_ONCE(btl->blink_msec, value);
+	return count;
+}
+
+/**
+ * check_interval_show() - &check_interval device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &check_interval attribute (&dev_attr_check_interval)
+ * @buf:	Output buffer
+ *
+ * Writes the value of &blkdev_trig_led.check_jiffies (converted to
+ * milliseconds) to &buf.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t check_interval_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%u\n",
+			  jiffies_to_msecs(READ_ONCE(btl->check_jiffies)));
+}
+
+/**
+ * check_interval_store() - &check_interval device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &check_interval attribute (&dev_attr_check_interval)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets &blkdev_trig_led.check_jiffies to the value in &buf (after converting
+ * from milliseconds).
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t check_interval_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
+{
+	struct blkdev_trig_led *led = led_trigger_get_drvdata(dev);
+	unsigned int value;
+	int err;
+
+	err = kstrtouint(buf, 0, &value);
+	if (err)
+		return err;
+
+	if (value < BLKDEV_TRIG_CHECK_MIN || value > BLKDEV_TRIG_CHECK_MAX)
+		return -ERANGE;
+
+	WRITE_ONCE(led->check_jiffies, msecs_to_jiffies(value));
+
+	return count;
+}
+
+/**
+ * blkdev_trig_mode_show() - Helper for boolean attribute show functions.
+ * @led:	The LED
+ * @buf:	Output buffer
+ * @bit:	Which bit to show
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static int blkdev_trig_mode_show(const struct blkdev_trig_led *led, char *buf,
+				 enum stat_group bit)
+{
+	return sysfs_emit(buf,
+			  READ_ONCE(led->mode) & (1 << bit) ? "Y\n" : "N\n");
+}
+
+/**
+ * blkdev_trig_mode_store() - Helper for boolean attribute store functions.
+ * @led:	The LED
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ * @bit:	Which bit to set
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static int blkdev_trig_mode_store(struct blkdev_trig_led *led,
+				  const char *buf, size_t count,
+				  enum stat_group bit)
+{
+	bool set;
+	int err;
+
+	err = kstrtobool(buf, &set);
+	if (err)
+		return err;
+
+	if (set)
+		set_bit(bit, &led->mode);
+	else
+		clear_bit(bit, &led->mode);
+
+	return count;
+}
+
+/**
+ * blink_on_read_show() - &blink_on_read device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_read attribute (&dev_attr_blink_on_read)
+ * @buf:	Output buffer
+ *
+ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_READ bit in
+ * &blkdev_trig_led.mode is set or cleared.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t blink_on_read_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
+				     buf, STAT_READ);
+}
+
+/**
+ * blink_on_read_store() - &blink_on_read device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_read attribute (&dev_attr_blink_on_read)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets the &STAT_READ bit in &blkdev_trig_led.mode to the value in &buf
+ * (interpreted as a boolean).
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t blink_on_read_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t count)
+{
+	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
+				      buf, count, STAT_READ);
+}
+
+/**
+ * blink_on_write_show() - &blink_on_write device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_write attribute (&dev_attr_blink_on_write)
+ * @buf:	Output buffer
+ *
+ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_WRITE bit in
+ * &blkdev_trig_led.mode is set or cleared.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t blink_on_write_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
+				     buf, STAT_WRITE);
+}
+
+/**
+ * blink_on_write_store() - &blink_on_write device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_write attribute (&dev_attr_blink_on_write)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets the &STAT_WRITE bit in &blkdev_trig_led.mode to the value in &buf
+ * (interpreted as a boolean).
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t blink_on_write_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
+{
+	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
+				      buf, count, STAT_WRITE);
+}
+
+/**
+ * blink_on_flush_show() - &blink_on_flush device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_flush attribute (&dev_attr_blink_on_flush)
+ * @buf:	Output buffer
+ *
+ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_FLUSH bit in
+ * &blkdev_trig_led.mode is set or cleared.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t blink_on_flush_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
+				     buf, STAT_FLUSH);
+}
+
+/**
+ * blink_on_flush_store() - &blink_on_flush device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_flush attribute (&dev_attr_blink_on_flush)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets the &STAT_FLUSH bit in &blkdev_trig_led.mode to the value in &buf
+ * (interpreted as a boolean).
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t blink_on_flush_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
+{
+	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
+				      buf, count, STAT_FLUSH);
+}
+
+/**
+ * blink_on_discard_show() - &blink_on_discard device attribute show function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_discard attribute (&dev_attr_blink_on_discard)
+ * @buf:	Output buffer
+ *
+ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_DISCARD bit in
+ * &blkdev_trig_led.mode is set or cleared.
+ *
+ * Context:	Process context.
+ * Return:	The number of characters written to &buf.
+ */
+static ssize_t blink_on_discard_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	return blkdev_trig_mode_show(led_trigger_get_drvdata(dev),
+				     buf, STAT_DISCARD);
+}
+
+/**
+ * blink_on_discard_store() - &blink_on_discard device attribute store function.
+ * @dev:	The LED device
+ * @attr:	The &blink_on_discard attribute (&dev_attr_blink_on_discard)
+ * @buf:	The new value (as written to the &sysfs attribute)
+ * @count:	The number of characters in &buf
+ *
+ * Sets the &STAT_DISCARD bit in &blkdev_trig_led.mode to the value in &buf
+ * (interpreted as a boolean).
+ *
+ * Context:	Process context.
+ * Return:	&count on success, negative &errno on error.
+ */
+static ssize_t blink_on_discard_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	return blkdev_trig_mode_store(led_trigger_get_drvdata(dev),
+				      buf, count, STAT_DISCARD);
+}
+
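[Editor's note: the mode helpers above need no lock: stores flip a single
bit with set_bit()/clear_bit() and shows read the whole word once via
READ_ONCE(). The sketch below is an illustration, not kernel code; it
models the same idea in userspace C11, where atomic fetch_or/fetch_and
play the role of set_bit()/clear_bit().]

#include <stdatomic.h>
#include <stdio.h>

enum stat_group_demo { DEMO_READ, DEMO_WRITE, DEMO_FLUSH, DEMO_DISCARD };

static atomic_ulong mode;	/* stands in for blkdev_trig_led.mode */

static void demo_mode_store(enum stat_group_demo bit, int set)
{
	unsigned long mask = 1UL << bit;

	/* One atomic read-modify-write per store, no lock needed. */
	if (set)
		atomic_fetch_or(&mode, mask);
	else
		atomic_fetch_and(&mode, ~mask);
}

static char demo_mode_show(enum stat_group_demo bit)
{
	/* A single atomic load; concurrent stores stay consistent. */
	return (atomic_load(&mode) >> bit) & 1 ? 'Y' : 'N';
}

int main(void)
{
	demo_mode_store(DEMO_READ, 1);
	demo_mode_store(DEMO_FLUSH, 1);
	demo_mode_store(DEMO_FLUSH, 0);
	printf("read=%c flush=%c\n",
	       demo_mode_show(DEMO_READ), demo_mode_show(DEMO_FLUSH));
	return 0;
}
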
+/* Device attributes */
+static DEVICE_ATTR_WO(link_dev_by_path);
+static DEVICE_ATTR_WO(unlink_dev_by_path);
+static DEVICE_ATTR_WO(unlink_dev_by_name);
+static DEVICE_ATTR_RW(blink_time);
+static DEVICE_ATTR_RW(check_interval);
+static DEVICE_ATTR_RW(blink_on_read);
+static DEVICE_ATTR_RW(blink_on_write);
+static DEVICE_ATTR_RW(blink_on_flush);
+static DEVICE_ATTR_RW(blink_on_discard);
+
+/* Device attributes in LED directory (/sys/class/leds/<led>/...) */
+static struct attribute *blkdev_trig_attrs[] = {
+	&dev_attr_link_dev_by_path.attr,
+	&dev_attr_unlink_dev_by_path.attr,
+	&dev_attr_unlink_dev_by_name.attr,
+	&dev_attr_blink_time.attr,
+	&dev_attr_check_interval.attr,
+	&dev_attr_blink_on_read.attr,
+	&dev_attr_blink_on_write.attr,
+	&dev_attr_blink_on_flush.attr,
+	&dev_attr_blink_on_discard.attr,
+	NULL
+};
+
+/* Unnamed attribute group == no subdirectory */
+static const struct attribute_group blkdev_trig_attr_group = {
+	.attrs	= blkdev_trig_attrs,
+};
+
+/* Attribute groups for the trigger */
+static const struct attribute_group *blkdev_trig_attr_groups[] = {
+	&blkdev_trig_attr_group,	/* /sys/class/leds/<led>/... */
+	&blkdev_trig_linked_devs,	/* /sys/class/leds/<led>/linked_devices/ */
+	NULL
+};
+
+/* Trigger registration data */
+static struct led_trigger blkdev_trig_trigger = {
+	.name		= "blkdev",
+	.activate	= blkdev_trig_activate,
+	.deactivate	= blkdev_trig_deactivate,
+	.groups		= blkdev_trig_attr_groups,
+};
+
+/**
+ * blkdev_trig_init() - Block device LED trigger initialization.
+ *
+ * Registers the ``blkdev`` LED trigger.
+ *
+ * Return:	&0 on success, negative &errno on failure.
+ */
+static int __init blkdev_trig_init(void)
+{
+	return led_trigger_register(&blkdev_trig_trigger);
+}
+module_init(blkdev_trig_init);
+
+/**
+ * blkdev_trig_exit() - Block device LED trigger module exit.
+ *
+ * Unregisters the ``blkdev`` LED trigger.
+ */
+static void __exit blkdev_trig_exit(void)
+{
+	led_trigger_unregister(&blkdev_trig_trigger);
+}
+module_exit(blkdev_trig_exit);
+
+MODULE_DESCRIPTION("Block device LED trigger");
+MODULE_AUTHOR("Ian Pilcher <arequipeno@gmail.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 64659b110973..4cad490028ab 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -57,13 +57,7 @@
  * we need a lock that will allow us to sleep. This lock is a
  * mutex (ep->mtx). It is acquired during the event transfer loop,
  * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
- * Then we also need a global mutex to serialize eventpoll_release_file()
- * and ep_free().
- * This mutex is acquired by ep_free() during the epoll file
- * cleanup path and it is also acquired by eventpoll_release_file()
- * if a file has been pushed inside an epoll set and it is then
- * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
- * It is also acquired when inserting an epoll fd onto another epoll
+ * The epmutex is acquired when inserting an epoll fd onto another epoll
  * fd. We do this so that we walk the epoll tree and ensure that this
  * insertion does not create a cycle of epoll file descriptors, which
  * could lead to deadlock. We need a global mutex to prevent two
@@ -153,6 +147,13 @@ struct epitem {
 	/* The file descriptor information this item refers to */
 	struct epoll_filefd ffd;
 
+	/*
+	 * Protected by file->f_lock, true for to-be-released epitem already
+	 * removed from the "struct file" items list; together with
+	 * eventpoll->refcount orchestrates "struct eventpoll" disposal
+	 */
+	bool dying;
+
 	/* List containing poll wait queues */
 	struct eppoll_entry *pwqlist;
 
@@ -217,6 +218,12 @@ struct eventpoll {
 	u64 gen;
 	struct hlist_head refs;
 
+	/*
+	 * usage count, used together with epitem->dying to
+	 * orchestrate the disposal of this struct
+	 */
+	refcount_t refcount;
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	/* used to track busy poll napi_id */
 	unsigned int napi_id;
@@ -240,9 +247,7 @@ struct ep_pqueue {
 /* Maximum number of epoll watched descriptors, per user */
 static long max_user_watches __read_mostly;
 
-/*
- * This mutex is used to serialize ep_free() and eventpoll_release_file().
- */
+/* Used for cycles detection */
 static DEFINE_MUTEX(epmutex);
 
 static u64 loop_check_gen = 0;
@@ -557,8 +562,7 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq)
 
 /*
  * This function unregisters poll callbacks from the associated file
- * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
- * ep_free).
+ * descriptor.  Must be called with "mtx" held.
 */
 static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
 {
@@ -681,11 +685,40 @@ static void epi_rcu_free(struct rcu_head *head)
 	kmem_cache_free(epi_cache, epi);
 }
 
+static void ep_get(struct eventpoll *ep)
+{
+	refcount_inc(&ep->refcount);
+}
+
+/*
+ * Returns true if the event poll can be disposed
+ */
+static bool ep_refcount_dec_and_test(struct eventpoll *ep)
+{
+	if (!refcount_dec_and_test(&ep->refcount))
+		return false;
+
+	WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root));
+	return true;
+}
+
+static void ep_free(struct eventpoll *ep)
+{
+	mutex_destroy(&ep->mtx);
+	free_uid(ep->user);
+	wakeup_source_unregister(ep->ws);
+	kfree(ep);
+}
+
 /*
  * Removes a "struct epitem" from the eventpoll RB tree and deallocates
  * all the associated resources. Must be called with "mtx" held.
+ * If the dying flag is set, do the removal only if force is true.
+ * This prevents ep_clear_and_put() from dropping all the ep references
+ * while running concurrently with eventpoll_release_file().
+ * Returns true if the eventpoll can be disposed.
 */
-static int ep_remove(struct eventpoll *ep, struct epitem *epi)
+static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
 {
 	struct file *file = epi->ffd.file;
 	struct epitems_head *to_free;
@@ -700,6 +733,11 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 
 	/* Remove the current item from the list of epoll hooks */
 	spin_lock(&file->f_lock);
+	if (epi->dying && !force) {
+		spin_unlock(&file->f_lock);
+		return false;
+	}
+
 	to_free = NULL;
 	head = file->f_ep;
 	if (head->first == &epi->fllink && !epi->fllink.next) {
@@ -733,28 +771,28 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	call_rcu(&epi->rcu, epi_rcu_free);
 
 	percpu_counter_dec(&ep->user->epoll_watches);
+	return ep_refcount_dec_and_test(ep);
+}
 
-	return 0;
+/*
+ * ep_remove variant for callers owning an additional reference to the ep
+ */
+static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
+{
+	WARN_ON_ONCE(__ep_remove(ep, epi, false));
 }
 
-static void ep_free(struct eventpoll *ep)
+static void ep_clear_and_put(struct eventpoll *ep)
 {
 	struct rb_node *rbp;
 	struct epitem *epi;
+	bool dispose;
 
 	/* We need to release all tasks waiting for these file */
 	if (waitqueue_active(&ep->poll_wait))
 		ep_poll_safewake(ep, NULL, 0);
 
-	/*
-	 * We need to lock this because we could be hit by
-	 * eventpoll_release_file() while we're freeing the "struct eventpoll".
-	 * We do not need to hold "ep->mtx" here because the epoll file
-	 * is on the way to be removed and no one has references to it
-	 * anymore. The only hit might come from eventpoll_release_file() but
-	 * holding "epmutex" is sufficient here.
-	 */
-	mutex_lock(&epmutex);
+	mutex_lock(&ep->mtx);
 
 	/*
 	 * Walks through the whole tree by unregistering poll callbacks.
@@ -768,25 +806,21 @@ static void ep_free(struct eventpoll *ep)
 
 	/*
 	 * Walks through the whole tree by freeing each "struct epitem". At this
-	 * point we are sure no poll callbacks will be lingering around, and also by
-	 * holding "epmutex" we can be sure that no file cleanup code will hit
-	 * us during this operation. So we can avoid the lock on "ep->lock".
-	 * We do not need to lock ep->mtx, either, we only do it to prevent
-	 * a lockdep warning.
+	 * point we are sure no poll callbacks will be lingering around.
+	 * Since we still own a reference to the eventpoll struct, the loop can't
+	 * dispose it.
	 */
-	mutex_lock(&ep->mtx);
 	while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
 		epi = rb_entry(rbp, struct epitem, rbn);
-		ep_remove(ep, epi);
+		ep_remove_safe(ep, epi);
 		cond_resched();
 	}
+
+	dispose = ep_refcount_dec_and_test(ep);
 	mutex_unlock(&ep->mtx);
 
-	mutex_unlock(&epmutex);
-	mutex_destroy(&ep->mtx);
-	free_uid(ep->user);
-	wakeup_source_unregister(ep->ws);
-	kfree(ep);
+	if (dispose)
+		ep_free(ep);
 }
 
 static int ep_eventpoll_release(struct inode *inode, struct file *file)
@@ -794,7 +828,7 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
 	struct eventpoll *ep = file->private_data;
 
 	if (ep)
-		ep_free(ep);
+		ep_clear_and_put(ep);
 
 	return 0;
 }
@@ -906,33 +940,34 @@ void eventpoll_release_file(struct file *file)
 {
 	struct eventpoll *ep;
 	struct epitem *epi;
-	struct hlist_node *next;
+	bool dispose;
 
 	/*
-	 * We don't want to get "file->f_lock" because it is not
-	 * necessary. It is not necessary because we're in the "struct file"
-	 * cleanup path, and this means that no one is using this file anymore.
-	 * So, for example, epoll_ctl() cannot hit here since if we reach this
-	 * point, the file counter already went to zero and fget() would fail.
-	 * The only hit might come from ep_free() but by holding the mutex
-	 * will correctly serialize the operation. We do need to acquire
-	 * "ep->mtx" after "epmutex" because ep_remove() requires it when called
-	 * from anywhere but ep_free().
-	 *
-	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
+	 * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from
+	 * touching the epitems list before eventpoll_release_file() can access
+	 * the ep->mtx.
	 */
-	mutex_lock(&epmutex);
-	if (unlikely(!file->f_ep)) {
-		mutex_unlock(&epmutex);
-		return;
-	}
-	hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) {
+again:
+	spin_lock(&file->f_lock);
+	if (file->f_ep && file->f_ep->first) {
+		epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
+		epi->dying = true;
+		spin_unlock(&file->f_lock);
+
+		/*
+		 * ep access is safe as we still own a reference to the ep
+		 * struct
+		 */
 		ep = epi->ep;
-		mutex_lock_nested(&ep->mtx, 0);
-		ep_remove(ep, epi);
+		mutex_lock(&ep->mtx);
+		dispose = __ep_remove(ep, epi, true);
 		mutex_unlock(&ep->mtx);
+
+		if (dispose)
+			ep_free(ep);
+		goto again;
 	}
-	mutex_unlock(&epmutex);
+	spin_unlock(&file->f_lock);
 }
 
 static int ep_alloc(struct eventpoll **pep)
@@ -955,6 +990,7 @@ static int ep_alloc(struct eventpoll **pep)
 	ep->rbr = RB_ROOT_CACHED;
 	ep->ovflist = EP_UNACTIVE_PTR;
 	ep->user = user;
+	refcount_set(&ep->refcount, 1);
 
 	*pep = ep;
 
@@ -1223,10 +1259,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
	 */
 	list_del_init(&wait->entry);
 	/*
-	 * ->whead != NULL protects us from the race with ep_free()
-	 * or ep_remove(), ep_remove_wait_queue() takes whead->lock
-	 * held by the caller. Once we nullify it, nothing protects
-	 * ep/epi or even wait.
+	 * ->whead != NULL protects us from the race with
+	 * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue()
+	 * takes whead->lock held by the caller. Once we nullify it,
+	 * nothing protects ep/epi or even wait.
	 */
 	smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
 }
@@ -1496,16 +1532,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 	if (tep)
 		mutex_unlock(&tep->mtx);
 
+	/*
+	 * ep_remove_safe() calls in the later error paths can't lead to
+	 * ep_free() as the ep file itself still holds an ep reference.
+	 */
+	ep_get(ep);
+
 	/* now check if we've created too many backpaths */
 	if (unlikely(full_check && reverse_path_check())) {
-		ep_remove(ep, epi);
+		ep_remove_safe(ep, epi);
 		return -EINVAL;
 	}
 
 	if (epi->event.events & EPOLLWAKEUP) {
 		error = ep_create_wakeup_source(epi);
 		if (error) {
-			ep_remove(ep, epi);
+			ep_remove_safe(ep, epi);
 			return error;
 		}
 	}
@@ -1529,7 +1571,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
	 * high memory pressure.
	 */
 	if (unlikely(!epq.epi)) {
-		ep_remove(ep, epi);
+		ep_remove_safe(ep, epi);
 		return -ENOMEM;
 	}
 
@@ -1760,7 +1802,7 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
 {
 	int ret = default_wake_function(wq_entry, mode, sync, key);
 
-	list_del_init(&wq_entry->entry);
+	list_del_init_careful(&wq_entry->entry);
 	return ret;
 }
 
@@ -2025,7 +2067,7 @@ static int do_epoll_create(int flags)
 out_free_fd:
 	put_unused_fd(fd);
 out_free_ep:
-	ep_free(ep);
+	ep_clear_and_put(ep);
 	return error;
 }
 
@@ -2167,10 +2209,16 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 		error = -EEXIST;
 		break;
 	case EPOLL_CTL_DEL:
-		if (epi)
-			error = ep_remove(ep, epi);
-		else
+		if (epi) {
+			/*
+			 * The eventpoll itself is still alive: the refcount
+			 * can't go to zero here.
+			 */
+			ep_remove_safe(ep, epi);
+			error = 0;
+		} else {
 			error = -ENOENT;
+		}
 		break;
 	case EPOLL_CTL_MOD:
 		if (epi) {
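[Editor's note: the refcount scheme introduced above is easiest to see in
isolation. Every epitem holds one reference on its eventpoll, the epoll
file itself holds one, and whoever drops the count to zero frees the
structure. The userspace sketch below is a simplified model, not the
kernel code; it deliberately omits the 'dying' flag and f_lock handshake
that the real patch needs because two paths can race to remove the same
epitem.]

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_ep {
	atomic_int refcount;	/* models eventpoll->refcount */
};

static struct demo_ep *demo_ep_alloc(void)
{
	struct demo_ep *ep = malloc(sizeof(*ep));

	atomic_init(&ep->refcount, 1);	/* the "file" reference */
	return ep;
}

static void demo_ep_get(struct demo_ep *ep)
{
	atomic_fetch_add(&ep->refcount, 1);	/* one per epitem */
}

/* Returns 1 when the caller dropped the last reference and must free. */
static int demo_ep_put(struct demo_ep *ep)
{
	return atomic_fetch_sub(&ep->refcount, 1) == 1;
}

int main(void)
{
	struct demo_ep *ep = demo_ep_alloc();

	demo_ep_get(ep);		/* insert an epitem */
	if (demo_ep_put(ep))		/* remove the epitem: not last */
		abort();
	if (demo_ep_put(ep)) {		/* release the file: last ref */
		free(ep);
		puts("disposed by last ref dropper");
	}
	return 0;
}
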
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 5f1ae07d724b..97cda629c9e9 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -48,7 +48,7 @@ extern unsigned int pageblock_order;
 #else /* CONFIG_HUGETLB_PAGE */
 
 /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
-#define pageblock_order		(MAX_ORDER-1)
+#define pageblock_order		PAGE_ALLOC_COSTLY_ORDER
 
 #endif /* CONFIG_HUGETLB_PAGE */
 
diff --git a/kernel/kheaders.c b/kernel/kheaders.c
index 8f69772af77b..42163c9e94e5 100644
--- a/kernel/kheaders.c
+++ b/kernel/kheaders.c
@@ -26,15 +26,15 @@ asm (
 "	.popsection				\n"
 );
 
-extern char kernel_headers_data;
-extern char kernel_headers_data_end;
+extern char kernel_headers_data[];
+extern char kernel_headers_data_end[];
 
 static ssize_t
 ikheaders_read(struct file *file,  struct kobject *kobj,
	       struct bin_attribute *bin_attr,
	       char *buf, loff_t off, size_t len)
 {
-	memcpy(buf, &kernel_headers_data + off, len);
+	memcpy(buf, &kernel_headers_data[off], len);
 	return len;
 }
 
@@ -48,8 +48,8 @@ static struct bin_attribute kheaders_attr __ro_after_init = {
 
 static int __init ikheaders_init(void)
 {
-	kheaders_attr.size = (&kernel_headers_data_end -
-			      &kernel_headers_data);
+	kheaders_attr.size = (kernel_headers_data_end -
+			      kernel_headers_data);
 	return sysfs_create_bin_file(kernel_kobj, &kheaders_attr);
 }
 
diff --git a/kernel/padata.c b/kernel/padata.c
index e007b8a4b738..7c80301ab084 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -45,7 +45,7 @@ struct padata_mt_job_state {
 };
 
 static void padata_free_pd(struct parallel_data *pd);
-static void __init padata_mt_helper(struct work_struct *work);
+static void padata_mt_helper(struct work_struct *work);
 
 static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
 {
@@ -438,7 +438,7 @@ static int padata_setup_cpumasks(struct padata_instance *pinst)
 	return err;
 }
 
-static void __init padata_mt_helper(struct work_struct *w)
+static void padata_mt_helper(struct work_struct *w)
 {
 	struct padata_work *pw = container_of(w, struct padata_work, pw_work);
 	struct padata_mt_job_state *ps = pw->pw_data;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8e39705c7bdc..68d86fd93ef6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3138,6 +3138,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 {
 	unsigned long flags;
 	int i, allocated = 0;
+	struct list_head *prev_tail = list->prev;
+	struct page *pos, *n;
 
 	spin_lock_irqsave(&zone->lock, flags);
 	for (i = 0; i < count; ++i) {
@@ -3146,9 +3148,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		if (unlikely(page == NULL))
 			break;
 
-		if (unlikely(check_pcp_refill(page, order)))
-			continue;
-
 		/*
 		 * Split buddy pages returned by expand() are received here in
 		 * physical page order. The page is added to the tail of
@@ -3160,7 +3159,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 * pages are ordered properly.
 		 */
 		list_add_tail(&page->pcp_list, list);
-		allocated++;
 		if (is_migrate_cma(get_pcppage_migratetype(page)))
 			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
 					      -(1 << order));
@@ -3174,6 +3172,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
	 */
 	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
 	spin_unlock_irqrestore(&zone->lock, flags);
+
+	/*
+	 * Pages are appended to the pcp list without checking to reduce the
+	 * time holding the zone lock. Checking the appended pages happens right
+	 * after the critical section while still holding the pcp lock.
+	 */
+	pos = list_first_entry(prev_tail, struct page, pcp_list);
+	list_for_each_entry_safe_from(pos, n, list, pcp_list) {
+		if (unlikely(check_pcp_refill(pos, order))) {
+			list_del(&pos->pcp_list);
+			continue;
+		}
+
+		allocated++;
+	}
+
 	return allocated;
 }
 
diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o
index 0edfdb40364b..ae52d3b3f063 100644
--- a/scripts/Makefile.vmlinux_o
+++ b/scripts/Makefile.vmlinux_o
@@ -19,7 +19,7 @@ quiet_cmd_gen_initcalls_lds = GEN     $@
 
 .tmp_initcalls.lds: $(srctree)/scripts/generate_initcall_order.pl \
		vmlinux.a $(KBUILD_VMLINUX_LIBS) FORCE
-	$(call if_changed,gen_initcalls_lds)
+	+$(call if_changed,gen_initcalls_lds)
 
 targets := .tmp_initcalls.lds
 
diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c
index 75020edd39e7..e4455220e9fd 100644
--- a/sound/pci/hda/cs35l41_hda.c
+++ b/sound/pci/hda/cs35l41_hda.c
@@ -1239,7 +1239,7 @@ static int cs35l41_no_acpi_dsd(struct cs35l41_hda *cs35l41, struct device *physd
 
 	if (strncmp(hid, "CLSA0100", 8) == 0) {
 		hw_cfg->bst_type = CS35L41_EXT_BOOST_NO_VSPK_SWITCH;
-	} else if (strncmp(hid, "CLSA0101", 8) == 0) {
+	} else if (strncmp(hid, "CLSA0101", 8) == 0 || strncmp(hid, "CSC3551", 7) == 0) {
 		hw_cfg->bst_type = CS35L41_EXT_BOOST;
 		hw_cfg->gpio1.func = CS35l41_VSPK_SWITCH;
 		hw_cfg->gpio1.valid = true;
-- 
2.40.1

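[Editor's note: the rmqueue_bulk() change above moves per-page validation
out of the zone->lock critical section: grab everything first, then walk
the freshly appended tail and drop rejects after unlocking. The toy C
program below models that shape under stated assumptions (a plain
singly-linked list and a made-up item_is_bad() predicate standing in for
check_pcp_refill()); it is an illustration, not kernel code.]

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	int value;
	struct item *next;
};

static pthread_mutex_t hot_lock = PTHREAD_MUTEX_INITIALIZER;

/* models check_pcp_refill(): reject "bad" items */
static int item_is_bad(const struct item *it)
{
	return it->value % 7 == 0;
}

/* Grab @count items under the hot lock without validating them... */
static struct item *grab_bulk(int count)
{
	struct item *head = NULL;

	pthread_mutex_lock(&hot_lock);
	for (int i = 0; i < count; i++) {
		struct item *it = malloc(sizeof(*it));

		it->value = rand();
		it->next = head;
		head = it;
	}
	pthread_mutex_unlock(&hot_lock);
	return head;
}

int main(void)
{
	struct item *list = grab_bulk(32), **link = &list;
	int kept = 0;

	/* ...then validate outside the critical section, like the
	 * post-unlock list_for_each_entry_safe_from() pass above. */
	while (*link) {
		struct item *it = *link;

		if (item_is_bad(it)) {
			*link = it->next;	/* unlink, like list_del() */
			free(it);
			continue;
		}
		link = &it->next;
		kept++;
	}
	printf("kept %d items\n", kept);
	return 0;
}
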
From d3a7d6477e59e6015a1e50ac35a341c4aa4c7324 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Fri, 28 Apr 2023 19:59:05 +0200
Subject: [PATCH 05/10] fs-patches

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 block/Kconfig                     |    3 +
 block/blk-cgroup.c                |   78 +-
 block/blk-cgroup.h                |   15 +-
 block/blk-core.c                  |    3 -
 fs/btrfs/Kconfig                  |    1 +
 fs/btrfs/bio.c                    |  211 +-
 fs/btrfs/bio.h                    |   22 +-
 fs/btrfs/block-group.c            |   40 +-
 fs/btrfs/block-group.h            |   13 +-
 fs/btrfs/block-rsv.c              |   21 +-
 fs/btrfs/block-rsv.h              |    2 +-
 fs/btrfs/btrfs_inode.h            |   35 +-
 fs/btrfs/compression.c            |  299 +--
 fs/btrfs/compression.h            |   20 +-
 fs/btrfs/ctree.c                  |   91 +-
 fs/btrfs/ctree.h                  |   17 +-
 fs/btrfs/delalloc-space.c         |    2 +-
 fs/btrfs/delayed-ref.c            |   49 +-
 fs/btrfs/delayed-ref.h            |   22 +-
 fs/btrfs/disk-io.c                |  147 +-
 fs/btrfs/extent-tree.c            |   37 +-
 fs/btrfs/extent_io.c              |  550 ++--
 fs/btrfs/file-item.c              |   93 +-
 fs/btrfs/file-item.h              |    3 +-
 fs/btrfs/fs.h                     |   53 +-
 fs/btrfs/inode-item.c             |   15 +-
 fs/btrfs/inode.c                  |  375 ++-
 fs/btrfs/ioctl.c                  |    5 +
 fs/btrfs/locking.c                |   25 +-
 fs/btrfs/locking.h                |    5 +-
 fs/btrfs/lru_cache.h              |    5 -
 fs/btrfs/lzo.c                    |   17 +-
 fs/btrfs/messages.c               |    2 +-
 fs/btrfs/messages.h               |    2 +-
 fs/btrfs/ordered-data.c           |  120 +-
 fs/btrfs/ordered-data.h           |   10 +-
 fs/btrfs/raid56.c                 |  162 +-
 fs/btrfs/raid56.h                 |   12 +-
 fs/btrfs/relocation.c             |    6 +-
 fs/btrfs/scrub.c                  | 4142 +++++++++--------------------
 fs/btrfs/send.c                   |    2 +-
 fs/btrfs/space-info.c             |   32 +-
 fs/btrfs/space-info.h             |    1 +
 fs/btrfs/super.c                  |    3 +-
 fs/btrfs/sysfs.c                  |    5 +
 fs/btrfs/tests/extent-map-tests.c |    1 -
 fs/btrfs/transaction.c            |   28 +-
 fs/btrfs/tree-checker.c           |   14 +
 fs/btrfs/tree-log.c               |  171 +-
 fs/btrfs/volumes.c                |  593 ++---
 fs/btrfs/volumes.h                |   85 +-
 fs/btrfs/zlib.c                   |    2 -
 fs/btrfs/zoned.c                  |    4 +-
 fs/btrfs/zstd.c                   |    1 -
 include/linux/bio.h               |    5 +
 include/linux/blk_types.h         |   18 +-
 include/linux/crc32c.h            |    1 -
 include/linux/writeback.h         |    5 -
 include/trace/events/btrfs.h      |    2 +-
 include/uapi/linux/btrfs.h        |    1 +
 lib/libcrc32c.c                   |    6 -
 tools/objtool/check.c             |    1 +
 62 files changed, 2867 insertions(+), 4844 deletions(-)

diff --git a/block/Kconfig b/block/Kconfig
index 941b2dca70db..69ccf7457ae1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -41,6 +41,9 @@ config BLK_RQ_ALLOC_TIME
 config BLK_CGROUP_RWSTAT
	bool
 
+config BLK_CGROUP_PUNT_BIO
+	bool
+
 config BLK_DEV_BSG_COMMON
	tristate
 
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bd50b55bdb61..18c922579719 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -56,7 +56,6 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */
 
 bool blkcg_debug_stats = false;
-static struct workqueue_struct *blkcg_punt_bio_wq;
 
 #define BLKG_DESTROY_BATCH_SIZE  64
 
@@ -166,7 +165,9 @@ static void __blkg_release(struct rcu_head *rcu)
 {
 	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
 
+#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
 	WARN_ON(!bio_list_empty(&blkg->async_bios));
+#endif
 
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
@@ -188,6 +189,9 @@ static void blkg_release(struct percpu_ref *ref)
 	call_rcu(&blkg->rcu_head, __blkg_release);
 }
 
+#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
+static struct workqueue_struct *blkcg_punt_bio_wq;
+
 static void blkg_async_bio_workfn(struct work_struct *work)
 {
 	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
@@ -198,10 +202,10 @@ static void blkg_async_bio_workfn(struct work_struct *work)
 	bool need_plug = false;
 
 	/* as long as there are pending bios, @blkg can't go away */
-	spin_lock_bh(&blkg->async_bio_lock);
+	spin_lock(&blkg->async_bio_lock);
 	bio_list_merge(&bios, &blkg->async_bios);
 	bio_list_init(&blkg->async_bios);
-	spin_unlock_bh(&blkg->async_bio_lock);
+	spin_unlock(&blkg->async_bio_lock);
 
 	/* start plug only when bio_list contains at least 2 bios */
 	if (bios.head && bios.head->bi_next) {
@@ -214,6 +218,40 @@ static void blkg_async_bio_workfn(struct work_struct *work)
 		blk_finish_plug(&plug);
 }
 
+/*
+ * When a shared kthread issues a bio for a cgroup, doing so synchronously can
+ * lead to priority inversions as the kthread can be trapped waiting for that
+ * cgroup.  Use this helper instead of submit_bio to punt the actual issuing to
+ * a dedicated per-blkcg work item to avoid such priority inversions.
+ */
+void blkcg_punt_bio_submit(struct bio *bio)
+{
+	struct blkcg_gq *blkg = bio->bi_blkg;
+
+	if (blkg->parent) {
+		spin_lock(&blkg->async_bio_lock);
+		bio_list_add(&blkg->async_bios, bio);
+		spin_unlock(&blkg->async_bio_lock);
+		queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+	} else {
+		/* never bounce for the root cgroup */
+		submit_bio(bio);
+	}
+}
+EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit);
+
+static int __init blkcg_punt_bio_init(void)
+{
+	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
+					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
+					    WQ_UNBOUND | WQ_SYSFS, 0);
+	if (!blkcg_punt_bio_wq)
+		return -ENOMEM;
+	return 0;
+}
+subsys_initcall(blkcg_punt_bio_init);
+#endif /* CONFIG_BLK_CGROUP_PUNT_BIO */
+
 /**
  * bio_blkcg_css - return the blkcg CSS associated with a bio
  * @bio: target bio
@@ -269,10 +307,12 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 
 	blkg->q = disk->queue;
 	INIT_LIST_HEAD(&blkg->q_node);
+	blkg->blkcg = blkcg;
+#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
 	spin_lock_init(&blkg->async_bio_lock);
 	bio_list_init(&blkg->async_bios);
 	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
-	blkg->blkcg = blkcg;
+#endif
 
 	u64_stats_init(&blkg->iostat.sync);
 	for_each_possible_cpu(cpu) {
@@ -1688,25 +1728,6 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
-bool __blkcg_punt_bio_submit(struct bio *bio)
-{
-	struct blkcg_gq *blkg = bio->bi_blkg;
-
-	/* consume the flag first */
-	bio->bi_opf &= ~REQ_CGROUP_PUNT;
-
-	/* never bounce for the root cgroup */
-	if (!blkg->parent)
-		return false;
-
-	spin_lock_bh(&blkg->async_bio_lock);
-	bio_list_add(&blkg->async_bios, bio);
-	spin_unlock_bh(&blkg->async_bio_lock);
-
-	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
-	return true;
-}
-
 /*
  * Scale the accumulated delay based on how long it has been since we updated
  * the delay.  We only call this when we are adding delay, in case it's been a
@@ -2085,16 +2106,5 @@ bool blk_cgroup_congested(void)
 	return ret;
 }
 
-static int __init blkcg_init(void)
-{
-	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
-					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
-					    WQ_UNBOUND | WQ_SYSFS, 0);
-	if (!blkcg_punt_bio_wq)
-		return -ENOMEM;
-	return 0;
-}
-subsys_initcall(blkcg_init);
-
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
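[Editor's note: the punt pattern above — enqueue under a short lock, kick a
worker, and let the worker detach the whole list before issuing anything —
is the core of blkcg_punt_bio_submit()/blkg_async_bio_workfn(). The
pthreads sketch below is a userspace model under stated assumptions (a
plain condition variable stands in for the workqueue); it is not kernel
code.]

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct work { int id; struct work *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t kick = PTHREAD_COND_INITIALIZER;
static struct work *pending;
static int done;

/* models blkcg_punt_bio_submit(): enqueue and kick the worker */
static void punt(int id)
{
	struct work *w = malloc(sizeof(*w));

	w->id = id;
	pthread_mutex_lock(&lock);
	w->next = pending;
	pending = w;
	pthread_cond_signal(&kick);
	pthread_mutex_unlock(&lock);
}

/* models blkg_async_bio_workfn(): detach the whole list, then issue */
static void *worker(void *arg)
{
	(void)arg;
	for (;;) {
		struct work *batch;

		pthread_mutex_lock(&lock);
		while (!pending && !done)
			pthread_cond_wait(&kick, &lock);
		batch = pending;
		pending = NULL;
		if (!batch && done) {
			pthread_mutex_unlock(&lock);
			return NULL;
		}
		pthread_mutex_unlock(&lock);

		while (batch) {	/* "submit" outside the lock */
			struct work *w = batch;

			batch = w->next;
			printf("issued %d from worker context\n", w->id);
			free(w);
		}
	}
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, worker, NULL);
	for (int i = 0; i < 4; i++)
		punt(i);
	pthread_mutex_lock(&lock);
	done = 1;
	pthread_cond_broadcast(&kick);
	pthread_mutex_unlock(&lock);
	pthread_join(tid, NULL);
	return 0;
}
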
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 9c5078755e5e..e98d2c1be354 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -72,9 +72,10 @@ struct blkcg_gq {
 	struct blkg_iostat_set iostat;
 
 	struct blkg_policy_data *pd[BLKCG_MAX_POLS];
-
+#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
 	spinlock_t async_bio_lock;
 	struct bio_list async_bios;
+#endif
 	union {
 		struct work_struct async_bio_work;
 		struct work_struct free_work;
@@ -375,16 +376,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
 	if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css),	\
 				    (p_blkg)->q)))
 
-bool __blkcg_punt_bio_submit(struct bio *bio);
-
-static inline bool blkcg_punt_bio_submit(struct bio *bio)
-{
-	if (bio->bi_opf & REQ_CGROUP_PUNT)
-		return __blkcg_punt_bio_submit(bio);
-	else
-		return false;
-}
-
 static inline void blkcg_bio_issue_init(struct bio *bio)
 {
 	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
@@ -506,8 +497,6 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return
 static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
-
-static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
 static inline void blkcg_bio_issue_init(struct bio *bio) { }
 static inline void blk_cgroup_bio_start(struct bio *bio) { }
 static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; }
diff --git a/block/blk-core.c b/block/blk-core.c
index 42926e6cb83c..478978dcb2bd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -830,9 +830,6 @@ EXPORT_SYMBOL(submit_bio_noacct);
 */
 void submit_bio(struct bio *bio)
 {
-	if (blkcg_punt_bio_submit(bio))
-		return;
-
 	if (bio_op(bio) == REQ_OP_READ) {
 		task_io_account_read(bio->bi_iter.bi_size);
 		count_vm_events(PGPGIN, bio_sectors(bio));
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 37b6bab90c83..66fa9ab2c046 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -2,6 +2,7 @@
 
 config BTRFS_FS
	tristate "Btrfs filesystem support"
+	select BLK_CGROUP_PUNT_BIO
	select CRYPTO
	select CRYPTO_CRC32C
	select LIBCRC32C
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 726592868e9c..5379c4714905 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -31,11 +31,11 @@ struct btrfs_failed_bio {
  * Initialize a btrfs_bio structure.  This skips the embedded bio itself as it
  * is already initialized by the block layer.
  */
-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
 		    btrfs_bio_end_io_t end_io, void *private)
 {
 	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
-	bbio->inode = inode;
+	bbio->fs_info = fs_info;
 	bbio->end_io = end_io;
 	bbio->private = private;
 	atomic_set(&bbio->pending_ios, 1);
@@ -48,41 +48,58 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
  * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
  * a mempool.
  */
-struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
-			    struct btrfs_inode *inode,
-			    btrfs_bio_end_io_t end_io, void *private)
+struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+				  struct btrfs_fs_info *fs_info,
+				  btrfs_bio_end_io_t end_io, void *private)
 {
+	struct btrfs_bio *bbio;
 	struct bio *bio;
 
 	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
-	btrfs_bio_init(btrfs_bio(bio), inode, end_io, private);
-	return bio;
+	bbio = btrfs_bio(bio);
+	btrfs_bio_init(bbio, fs_info, end_io, private);
+	return bbio;
 }
 
-static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
-				   struct bio *orig, u64 map_length,
-				   bool use_append)
+static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio)
 {
-	struct btrfs_bio *orig_bbio = btrfs_bio(orig);
+	struct btrfs_ordered_extent *ordered;
+	int ret;
+
+	ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
+	if (WARN_ON_ONCE(!ordered))
+		return BLK_STS_IOERR;
+	ret = btrfs_extract_ordered_extent(bbio, ordered);
+	btrfs_put_ordered_extent(ordered);
+
+	return errno_to_blk_status(ret);
+}
+
+static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
+					 struct btrfs_bio *orig_bbio,
+					 u64 map_length, bool use_append)
+{
+	struct btrfs_bio *bbio;
 	struct bio *bio;
 
 	if (use_append) {
 		unsigned int nr_segs;
 
-		bio = bio_split_rw(orig, &fs_info->limits, &nr_segs,
+		bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs,
 				   &btrfs_clone_bioset, map_length);
 	} else {
-		bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS,
-				&btrfs_clone_bioset);
+		bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT,
+				GFP_NOFS, &btrfs_clone_bioset);
 	}
-	btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio);
-
-	btrfs_bio(bio)->file_offset = orig_bbio->file_offset;
-	if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED))
+	bbio = btrfs_bio(bio);
+	btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
+	bbio->inode = orig_bbio->inode;
+	bbio->file_offset = orig_bbio->file_offset;
+	if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED))
 		orig_bbio->file_offset += map_length;
 
 	atomic_inc(&orig_bbio->pending_ios);
-	return bio;
+	return bbio;
 }
 
 static void btrfs_orig_write_end_io(struct bio *bio);
@@ -164,7 +181,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
 			goto done;
 		}
 
-		btrfs_submit_bio(&repair_bbio->bio, mirror);
+		btrfs_submit_bio(repair_bbio, mirror);
 		return;
 	}
 
@@ -224,15 +241,16 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
 	repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
 				      &btrfs_repair_bioset);
 	repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
-	bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
+	__bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
 
 	repair_bbio = btrfs_bio(repair_bio);
-	btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio);
+	btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
+	repair_bbio->inode = failed_bbio->inode;
 	repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;
 
 	mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
 	btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
-	btrfs_submit_bio(repair_bio, mirror);
+	btrfs_submit_bio(repair_bbio, mirror);
 	return fbio;
 }
 
@@ -246,6 +264,9 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
 	struct btrfs_failed_bio *fbio = NULL;
 	u32 offset = 0;
 
+	/* Read-repair requires the inode field to be set by the submitter. */
+	ASSERT(inode);
+
 	/*
 	 * Hand off repair bios to the repair code as there is no upper level
 	 * submitter for them.
@@ -306,17 +327,17 @@ static void btrfs_end_bio_work(struct work_struct *work)
 	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
 
 	/* Metadata reads are checked and repaired by the submitter. */
-	if (bbio->bio.bi_opf & REQ_META)
-		bbio->end_io(bbio);
-	else
+	if (bbio->inode && !(bbio->bio.bi_opf & REQ_META))
 		btrfs_check_read_bio(bbio, bbio->bio.bi_private);
+	else
+		bbio->end_io(bbio);
 }
 
 static void btrfs_simple_end_io(struct bio *bio)
 {
 	struct btrfs_bio *bbio = btrfs_bio(bio);
 	struct btrfs_device *dev = bio->bi_private;
-	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
 
 	btrfs_bio_counter_dec(fs_info);
 
@@ -340,7 +361,8 @@ static void btrfs_raid56_end_io(struct bio *bio)
 
 	btrfs_bio_counter_dec(bioc->fs_info);
 	bbio->mirror_num = bioc->mirror_num;
-	if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META))
+	if (bio_op(bio) == REQ_OP_READ && bbio->inode &&
+	    !(bbio->bio.bi_opf & REQ_META))
 		btrfs_check_read_bio(bbio, NULL);
 	else
 		btrfs_orig_bbio_end_io(bbio);
@@ -418,7 +440,11 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
 		dev->devid, bio->bi_iter.bi_size);
 
 	btrfsic_check_bio(bio);
-	submit_bio(bio);
+
+	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
+		blkcg_punt_bio_submit(bio);
+	else
+		submit_bio(bio);
 }
 
 static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
@@ -534,10 +560,10 @@ static void run_one_async_done(struct btrfs_work *work)
 
 	/*
 	 * All of the bios that pass through here are from async helpers.
-	 * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
-	 * This changes nothing when cgroups aren't in use.
+	 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
+	 * context.  This changes nothing when cgroups aren't in use.
	 */
-	bio->bi_opf |= REQ_CGROUP_PUNT;
+	bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
 	__btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
 }
 
@@ -562,7 +588,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
	 * in order.
	 */
 	if (bbio->bio.bi_opf & REQ_META) {
-		struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+		struct btrfs_fs_info *fs_info = bbio->fs_info;
 
 		if (btrfs_is_zoned(fs_info))
 			return false;
@@ -582,7 +608,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
 				struct btrfs_io_context *bioc,
 				struct btrfs_io_stripe *smap, int mirror_num)
 {
-	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
 	struct async_submit_bio *async;
 
 	async = kmalloc(sizeof(*async), GFP_NOFS);
@@ -603,12 +629,12 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
 	return true;
 }
 
-static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
+static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 {
-	struct btrfs_bio *bbio = btrfs_bio(bio);
 	struct btrfs_inode *inode = bbio->inode;
-	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
 	struct btrfs_bio *orig_bbio = bbio;
+	struct bio *bio = &bbio->bio;
 	u64 logical = bio->bi_iter.bi_sector << 9;
 	u64 length = bio->bi_iter.bi_size;
 	u64 map_length = length;
@@ -631,15 +657,15 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
 		map_length = min(map_length, fs_info->max_zone_append_size);
 
 	if (map_length < length) {
-		bio = btrfs_split_bio(fs_info, bio, map_length, use_append);
-		bbio = btrfs_bio(bio);
+		bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
+		bio = &bbio->bio;
 	}
 
 	/*
	 * Save the iter for the end_io handler and preload the checksums for
	 * data reads.
	 */
-	if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) {
+	if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) {
 		bbio->saved_iter = bio->bi_iter;
 		ret = btrfs_lookup_bio_sums(bbio);
 		if (ret)
@@ -650,7 +676,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
 	if (use_append) {
 		bio->bi_opf &= ~REQ_OP_WRITE;
 		bio->bi_opf |= REQ_OP_ZONE_APPEND;
-		ret = btrfs_extract_ordered_extent(btrfs_bio(bio));
+		ret = btrfs_bio_extract_ordered_extent(bbio);
 		if (ret)
 			goto fail_put_bio;
 	}
@@ -659,7 +685,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
	 * Csum items for reloc roots have already been cloned at this
	 * point, so they are handled as part of the no-checksum case.
	 */
-	if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
+	if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
	    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
	    !btrfs_is_data_reloc_root(inode->root)) {
 		if (should_async_write(bbio) &&
@@ -686,9 +712,12 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
 	return true;
 }
 
-void btrfs_submit_bio(struct bio *bio, int mirror_num)
+void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
 {
-	while (!btrfs_submit_chunk(bio, mirror_num))
+	/* If bbio->inode is not populated, its file_offset must be 0. */
+	ASSERT(bbio->inode || bbio->file_offset == 0);
+
+	while (!btrfs_submit_chunk(bbio, mirror_num))
 		;
 }
 
@@ -706,12 +735,9 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 			    u64 length, u64 logical, struct page *page,
 			    unsigned int pg_offset, int mirror_num)
 {
-	struct btrfs_device *dev;
+	struct btrfs_io_stripe smap = { 0 };
 	struct bio_vec bvec;
 	struct bio bio;
-	u64 map_length = 0;
-	u64 sector;
-	struct btrfs_io_context *bioc = NULL;
 	int ret = 0;
 
 	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
@@ -720,68 +746,38 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 	if (btrfs_repair_one_zone(fs_info, logical))
 		return 0;
 
-	map_length = length;
-
 	/*
	 * Avoid races with device replace and make sure our bioc has devices
	 * associated to its stripes that don't go away while we are doing the
	 * read repair operation.
	 */
 	btrfs_bio_counter_inc_blocked(fs_info);
-	if (btrfs_is_parity_mirror(fs_info, logical, length)) {
-		/*
-		 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
-		 * to update all raid stripes, but here we just want to correct
-		 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
-		 * stripe's dev and sector.
-		 */
-		ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
-				      &map_length, &bioc, 0);
-		if (ret)
-			goto out_counter_dec;
-		ASSERT(bioc->mirror_num == 1);
-	} else {
-		ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
-				      &map_length, &bioc, mirror_num);
-		if (ret)
-			goto out_counter_dec;
-		/*
-		 * This happens when dev-replace is also running, and the
-		 * mirror_num indicates the dev-replace target.
-		 *
-		 * In this case, we don't need to do anything, as the read
-		 * error just means the replace progress hasn't reached our
-		 * read range, and later replace routine would handle it well.
-		 */
-		if (mirror_num != bioc->mirror_num)
-			goto out_counter_dec;
-	}
-
-	sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
-	dev = bioc->stripes[bioc->mirror_num - 1].dev;
-	btrfs_put_bioc(bioc);
+	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
+	if (ret < 0)
+		goto out_counter_dec;
 
-	if (!dev || !dev->bdev ||
-	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
+	if (!smap.dev->bdev ||
+	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
 		ret = -EIO;
 		goto out_counter_dec;
 	}
 
-	bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
-	bio.bi_iter.bi_sector = sector;
+	bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
+	bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
 	__bio_add_page(&bio, page, length, pg_offset);
 
 	btrfsic_check_bio(&bio);
 	ret = submit_bio_wait(&bio);
 	if (ret) {
 		/* try to remap that extent elsewhere? */
-		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+		btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
 		goto out_bio_uninit;
 	}
 
 	btrfs_info_rl_in_rcu(fs_info,
		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
-			     ino, start, btrfs_dev_name(dev), sector);
+			     ino, start, btrfs_dev_name(smap.dev),
+			     smap.physical >> SECTOR_SHIFT);
 	ret = 0;
 
 out_bio_uninit:
@@ -791,6 +787,45 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 	return ret;
 }
 
+/*
+ * Submit a btrfs_bio based repair write.
+ *
+ * If @dev_replace is true, the write would be submitted to dev-replace target.
+ */
+void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
+{
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
+	u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+	u64 length = bbio->bio.bi_iter.bi_size;
+	struct btrfs_io_stripe smap = { 0 };
+	int ret;
+
+	ASSERT(fs_info);
+	ASSERT(mirror_num > 0);
+	ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
+	ASSERT(!bbio->inode);
+
+	btrfs_bio_counter_inc_blocked(fs_info);
+	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
+	if (ret < 0)
+		goto fail;
+
+	if (dev_replace) {
+		if (btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE && btrfs_is_zoned(fs_info)) {
+			bbio->bio.bi_opf &= ~REQ_OP_WRITE;
+			bbio->bio.bi_opf |= REQ_OP_ZONE_APPEND;
+		}
+		ASSERT(smap.dev == fs_info->dev_replace.srcdev);
+		smap.dev = fs_info->dev_replace.tgtdev;
+	}
+	__btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
+	return;
+
+fail:
+	btrfs_bio_counter_dec(fs_info);
+	btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
+}
+
 int __init btrfs_bioset_init(void)
 {
 	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index 873ff85817f0..a8eca3a65673 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -30,7 +30,10 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
  * passed to btrfs_submit_bio for mapping to the physical devices.
  */
 struct btrfs_bio {
-	/* Inode and offset into it that this I/O operates on. */
+	/*
+	 * Inode and offset into it that this I/O operates on.
+	 * Only set for data I/O.
+	 */
 	struct btrfs_inode *inode;
 	u64 file_offset;
 
@@ -58,6 +61,9 @@ struct btrfs_bio {
 	atomic_t pending_ios;
 	struct work_struct end_io_work;
 
+	/* File system that this I/O operates on. */
+	struct btrfs_fs_info *fs_info;
+
 	/*
 	 * This member must come last, bio_alloc_bioset will allocate enough
 	 * bytes for entire btrfs_bio but relies on bio being last.
@@ -73,11 +79,11 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
 int __init btrfs_bioset_init(void);
 void __cold btrfs_bioset_exit(void);
 
-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
 		    btrfs_bio_end_io_t end_io, void *private);
-struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
-			    struct btrfs_inode *inode,
-			    btrfs_bio_end_io_t end_io, void *private);
+struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+				  struct btrfs_fs_info *fs_info,
+				  btrfs_bio_end_io_t end_io, void *private);
 
 static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
 {
@@ -88,7 +94,11 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
 /* Bio only refers to one ordered extent. */
 #define REQ_BTRFS_ONE_ORDERED			REQ_DRV
 
-void btrfs_submit_bio(struct bio *bio, int mirror_num);
+/* Submit using blkcg_punt_bio_submit. */
+#define REQ_BTRFS_CGROUP_PUNT			REQ_FS_PRIVATE
+
+void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num);
+void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
 int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 			    u64 length, u64 logical, struct page *page,
 			    unsigned int pg_offset, int mirror_num);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 5fc670c27f86..957ad1c31c4f 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -160,15 +160,6 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
 			btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
 						  cache);
 
-		/*
-		 * If not empty, someone is still holding mutex of
-		 * full_stripe_lock, which can only be released by caller.
-		 * And it will definitely cause use-after-free when caller
-		 * tries to release full stripe lock.
-		 *
-		 * No better way to resolve, but only to warn.
-		 */
-		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
 		kfree(cache->free_space_ctl);
 		kfree(cache->physical_map);
 		kfree(cache);
@@ -1977,12 +1968,12 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
 
 	map = em->map_lookup;
 	data_stripe_length = em->orig_block_len;
-	io_stripe_size = map->stripe_len;
+	io_stripe_size = BTRFS_STRIPE_LEN;
 	chunk_start = em->start;
 
 	/* For RAID5/6 adjust to a full IO stripe length */
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
-		io_stripe_size = map->stripe_len * nr_data_stripes(map);
+		io_stripe_size = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;
 
 	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
 	if (!buf) {
@@ -1992,28 +1983,28 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
 
 	for (i = 0; i < map->num_stripes; i++) {
 		bool already_inserted = false;
-		u64 stripe_nr;
-		u64 offset;
+		u32 stripe_nr;
+		u32 offset;
 		int j;
 
 		if (!in_range(physical, map->stripes[i].physical,
 			      data_stripe_length))
 			continue;
 
-		stripe_nr = physical - map->stripes[i].physical;
-		stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
+		stripe_nr = (physical - map->stripes[i].physical) >>
+			    BTRFS_STRIPE_LEN_SHIFT;
+		offset = (physical - map->stripes[i].physical) &
+			 BTRFS_STRIPE_LEN_MASK;
 
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
-				 BTRFS_BLOCK_GROUP_RAID10)) {
-			stripe_nr = stripe_nr * map->num_stripes + i;
-			stripe_nr = div_u64(stripe_nr, map->sub_stripes);
-		}
+				 BTRFS_BLOCK_GROUP_RAID10))
+			stripe_nr = div_u64(stripe_nr * map->num_stripes + i,
+					    map->sub_stripes);
 		/*
 		 * The remaining case would be for RAID56, multiply by
 		 * nr_data_stripes().  Alternatively, just use rmap_len below
 		 * instead of map->stripe_len
 		 */
-
 		bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
 
 		/* Ensure we don't add duplicate addresses */
@@ -2124,8 +2115,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
 	btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
 	atomic_set(&cache->frozen, 0);
 	mutex_init(&cache->free_space_lock);
-	cache->full_stripe_locks_root.root = RB_ROOT;
-	mutex_init(&cache->full_stripe_locks_root.lock);
 
 	return cache;
 }
@@ -2672,7 +2661,7 @@ static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
 }
 
 struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
-						 u64 bytes_used, u64 type,
+						 u64 type,
 						 u64 chunk_offset, u64 size)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2687,7 +2676,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
 
 	cache->length = size;
 	set_free_space_tree_thresholds(cache);
-	cache->used = bytes_used;
 	cache->flags = type;
 	cache->cached = BTRFS_CACHE_FINISHED;
 	cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
@@ -2738,9 +2726,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
 
 #ifdef CONFIG_BTRFS_DEBUG
 	if (btrfs_should_fragment_free_space(cache)) {
-		u64 new_bytes_used = size - bytes_used;
-
-		cache->space_info->bytes_used += new_bytes_used >> 1;
+		cache->space_info->bytes_used += size >> 1;
 		fragment_free_space(cache);
 	}
 #endif
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 6e4a0b429ac3..cc0e4b37db2d 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -91,14 +91,6 @@ struct btrfs_caching_control {
 /* Once caching_thread() finds this much free space, it will wake up waiters. */
 #define CACHING_CTL_WAKE_UP	SZ_2M
 
-/*
- * Tree to record all locked full stripes of a RAID5/6 block group
- */
-struct btrfs_full_stripe_locks_tree {
-	struct rb_root root;
-	struct mutex lock;
-};
-
 struct btrfs_block_group {
 	struct btrfs_fs_info *fs_info;
 	struct inode *inode;
@@ -229,9 +221,6 @@ struct btrfs_block_group {
 	 */
 	int swap_extents;
 
-	/* Record locked full stripes for RAID5/6 block group */
-	struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
-
 	/*
 	 * Allocation offset for the block group to implement sequential
 	 * allocation. This is used only on a zoned filesystem.
@@ -302,7 +291,7 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
 int btrfs_read_block_groups(struct btrfs_fs_info *info);
 struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
-						 u64 bytes_used, u64 type,
+						 u64 type,
 						 u64 chunk_offset, u64 size);
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 5367a14d44d2..3ab707e26fa2 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -232,9 +232,6 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent)
 	u64 num_bytes = 0;
 	int ret = -ENOSPC;
 
-	if (!block_rsv)
-		return 0;
-
 	spin_lock(&block_rsv->lock);
 	num_bytes = mult_perc(block_rsv->size, min_percent);
 	if (block_rsv->reserved >= num_bytes)
@@ -245,17 +242,15 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent)
 }
 
 int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
-			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+			   struct btrfs_block_rsv *block_rsv, u64 num_bytes,
 			   enum btrfs_reserve_flush_enum flush)
 {
-	u64 num_bytes = 0;
 	int ret = -ENOSPC;
 
 	if (!block_rsv)
 		return 0;
 
 	spin_lock(&block_rsv->lock);
-	num_bytes = min_reserved;
 	if (block_rsv->reserved >= num_bytes)
 		ret = 0;
 	else
@@ -355,17 +350,19 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
 
 	/*
 	 * But we also want to reserve enough space so we can do the fallback
-	 * global reserve for an unlink, which is an additional 5 items (see the
-	 * comment in __unlink_start_trans for what we're modifying.)
+	 * global reserve for an unlink, which is an additional
+	 * BTRFS_UNLINK_METADATA_UNITS items.
 	 *
 	 * But we also need space for the delayed ref updates from the unlink,
-	 * so its 10, 5 for the actual operation, and 5 for the delayed ref
-	 * updates.
+	 * so add BTRFS_UNLINK_METADATA_UNITS units for delayed refs, one for
+	 * each unlink metadata item.
 	 */
-	min_items += 10;
+	min_items += BTRFS_UNLINK_METADATA_UNITS;
 
 	num_bytes = max_t(u64, num_bytes,
-			  btrfs_calc_insert_metadata_size(fs_info, min_items));
+			  btrfs_calc_insert_metadata_size(fs_info, min_items) +
+			  btrfs_calc_delayed_ref_bytes(fs_info,
+						       BTRFS_UNLINK_METADATA_UNITS));
 
 	spin_lock(&sinfo->lock);
 	spin_lock(&block_rsv->lock);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 4cc41c9aaa82..6dc781709aca 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -65,7 +65,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
 			enum btrfs_reserve_flush_enum flush);
 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent);
 int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
-			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+			   struct btrfs_block_rsv *block_rsv, u64 num_bytes,
 			   enum btrfs_reserve_flush_enum flush);
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9dc21622806e..ec2ae4406c16 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -142,11 +142,22 @@ struct btrfs_inode {
 	/* a local copy of root's last_log_commit */
 	int last_log_commit;
 
-	/*
-	 * Total number of bytes pending delalloc, used by stat to calculate the
-	 * real block usage of the file. This is used only for files.
-	 */
-	u64 delalloc_bytes;
+	union {
+		/*
+		 * Total number of bytes pending delalloc, used by stat to
+		 * calculate the real block usage of the file. This is used
+		 * only for files.
+		 */
+		u64 delalloc_bytes;
+		/*
+		 * The lowest possible index of the next dir index key which
+		 * points to an inode that needs to be logged.
+		 * This is used only for directories.
+		 * Use the helpers btrfs_get_first_dir_index_to_log() and
+		 * btrfs_set_first_dir_index_to_log() to access this field.
+		 */
+		u64 first_dir_index_to_log;
+	};
 
 	union {
 		/*
@@ -247,6 +258,17 @@ struct btrfs_inode {
 	struct inode vfs_inode;
 };
 
+static inline u64 btrfs_get_first_dir_index_to_log(const struct btrfs_inode *inode)
+{
+	return READ_ONCE(inode->first_dir_index_to_log);
+}
+
+static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode,
+						    u64 index)
+{
+	WRITE_ONCE(inode->first_dir_index_to_log, index);
+}
+
 static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
 {
 	return container_of(inode, struct btrfs_inode, vfs_inode);
@@ -407,7 +429,8 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags,
 
 int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
 			    u32 pgoff, u8 *csum, const u8 * const csum_expected);
-blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio);
+int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
+				 struct btrfs_ordered_extent *ordered);
 bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
 			u32 bio_offset, struct bio_vec *bv);
 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index f42f31f22d13..2d0493f0a184 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -37,6 +37,8 @@
 #include "file-item.h"
 #include "super.h"
 
+struct bio_set btrfs_compressed_bioset;
+
 static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
 
 const char* btrfs_compress_type2str(enum btrfs_compression_type type)
@@ -54,6 +56,25 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type)
 	return NULL;
 }
 
+static inline struct compressed_bio *to_compressed_bio(struct btrfs_bio *bbio)
+{
+	return container_of(bbio, struct compressed_bio, bbio);
+}
+
+static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode,
+						   u64 start, blk_opf_t op,
+						   btrfs_bio_end_io_t end_io)
+{
+	struct btrfs_bio *bbio;
+
+	bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op,
+					  GFP_NOFS, &btrfs_compressed_bioset));
+	btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL);
+	bbio->inode = inode;
+	bbio->file_offset = start;
+	return to_compressed_bio(bbio);
+}
+
 bool btrfs_compress_is_valid_type(const char *str, size_t len)
 {
 	int i;
@@ -139,32 +160,25 @@ static int compression_decompress(int type, struct list_head *ws,
 	}
 }
 
+static void btrfs_free_compressed_pages(struct compressed_bio *cb)
+{
+	for (unsigned int i = 0; i < cb->nr_pages; i++)
+		put_page(cb->compressed_pages[i]);
+	kfree(cb->compressed_pages);
+}
+
 static int btrfs_decompress_bio(struct compressed_bio *cb);
 
 static void end_compressed_bio_read(struct btrfs_bio *bbio)
 {
-	struct compressed_bio *cb = bbio->private;
-	unsigned int index;
-	struct page *page;
+	struct compressed_bio *cb = to_compressed_bio(bbio);
+	blk_status_t status = bbio->bio.bi_status;
 
-	if (bbio->bio.bi_status)
-		cb->status = bbio->bio.bi_status;
-	else
-		cb->status = errno_to_blk_status(btrfs_decompress_bio(cb));
-
-	/* Release the compressed pages */
-	for (index = 0; index < cb->nr_pages; index++) {
-		page = cb->compressed_pages[index];
-		page->mapping = NULL;
-		put_page(page);
-	}
-
-	/* Do io completion on the original bio */
-	btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status);
+	if (!status)
+		status = errno_to_blk_status(btrfs_decompress_bio(cb));
 
-	/* Finally free the cb struct */
-	kfree(cb->compressed_pages);
-	kfree(cb);
+	btrfs_free_compressed_pages(cb);
+	btrfs_bio_end_io(cb->orig_bbio, status);
 	bio_put(&bbio->bio);
 }
 
@@ -172,14 +186,14 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio)
 * Clear the writeback bits on all of the file
 * pages for a compressed write
 */
-static noinline void end_compressed_writeback(struct inode *inode,
-					      const struct compressed_bio *cb)
+static noinline void end_compressed_writeback(const struct compressed_bio *cb)
 {
+	struct inode *inode = &cb->bbio.inode->vfs_inode;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	unsigned long index = cb->start >> PAGE_SHIFT;
 	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
 	struct folio_batch fbatch;
-	const int errno = blk_status_to_errno(cb->status);
+	const int errno = blk_status_to_errno(cb->bbio.bio.bi_status);
 	int i;
 	int ret;
 
@@ -207,45 +221,25 @@ static noinline void end_compressed_writeback(struct inode *inode,
 	/* the inode may be gone now */
 }
 
-static void finish_compressed_bio_write(struct compressed_bio *cb)
+static void btrfs_finish_compressed_write_work(struct work_struct *work)
 {
-	struct inode *inode = cb->inode;
-	unsigned int index;
+	struct compressed_bio *cb =
+		container_of(work, struct compressed_bio, write_end_work);
 
 	/*
 	 * Ok, we're the last bio for this extent, step one is to call back
 	 * into the FS and do all the end_io operations.
 	 */
-	btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
+	btrfs_writepage_endio_finish_ordered(cb->bbio.inode, NULL,
 			cb->start, cb->start + cb->len - 1,
-			cb->status == BLK_STS_OK);
+			cb->bbio.bio.bi_status == BLK_STS_OK);
 
 	if (cb->writeback)
-		end_compressed_writeback(inode, cb);
+		end_compressed_writeback(cb);
 	/* Note, our inode could be gone now */
 
-	/*
-	 * Release the compressed pages, these came from alloc_page and
-	 * are not attached to the inode at all
-	 */
-	for (index = 0; index < cb->nr_pages; index++) {
-		struct page *page = cb->compressed_pages[index];
-
-		page->mapping = NULL;
-		put_page(page);
-	}
-
-	/* Finally free the cb struct */
-	kfree(cb->compressed_pages);
-	kfree(cb);
-}
-
-static void btrfs_finish_compressed_write_work(struct work_struct *work)
-{
-	struct compressed_bio *cb =
-		container_of(work, struct compressed_bio, write_end_work);
-
-	finish_compressed_bio_write(cb);
+	btrfs_free_compressed_pages(cb);
+	bio_put(&cb->bbio.bio);
 }
 
 /*
@@ -257,13 +251,25 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
 */
 static void end_compressed_bio_write(struct btrfs_bio *bbio)
 {
-	struct compressed_bio *cb = bbio->private;
-	struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+	struct compressed_bio *cb = to_compressed_bio(bbio);
+	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
 
-	cb->status = bbio->bio.bi_status;
 	queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
+}
 
-	bio_put(&bbio->bio);
+static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb)
+{
+	struct bio *bio = &cb->bbio.bio;
+	u32 offset = 0;
+
+	while (offset < cb->compressed_len) {
+		u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE);
+
+		/* Maximum compressed extent is smaller than bio size limit. */
+		__bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT],
+			       len, 0);
+		offset += len;
+	}
 }
 
 /*
@@ -275,28 +281,24 @@ static void end_compressed_bio_write(struct btrfs_bio *bbio)
 * This also checksums the file bytes and gets things ready for
 * the end io hooks.
 */
-blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
+void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 				   unsigned int len, u64 disk_start,
 				   unsigned int compressed_len,
 				   struct page **compressed_pages,
 				   unsigned int nr_pages,
 				   blk_opf_t write_flags,
-				   struct cgroup_subsys_state *blkcg_css,
 				   bool writeback)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	struct bio *bio = NULL;
 	struct compressed_bio *cb;
-	u64 cur_disk_bytenr = disk_start;
-	blk_status_t ret = BLK_STS_OK;
 
 	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
 	       IS_ALIGNED(len, fs_info->sectorsize));
-	cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
-	if (!cb)
-		return BLK_STS_RESOURCE;
-	cb->status = BLK_STS_OK;
-	cb->inode = &inode->vfs_inode;
+
+	write_flags |= REQ_BTRFS_ONE_ORDERED;
+
+	cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE | write_flags,
+				  end_compressed_bio_write);
 	cb->start = start;
 	cb->len = len;
 	cb->compressed_pages = compressed_pages;
@@ -304,56 +306,10 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 	cb->writeback = writeback;
 	INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
 	cb->nr_pages = nr_pages;
+	cb->bbio.bio.bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
+	btrfs_add_compressed_bio_pages(cb);
 
-	if (blkcg_css) {
-		kthread_associate_blkcg(blkcg_css);
-		write_flags |= REQ_CGROUP_PUNT;
-	}
-
-	write_flags |= REQ_BTRFS_ONE_ORDERED;
-	bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags,
-			      BTRFS_I(cb->inode), end_compressed_bio_write, cb);
-	bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT;
-	btrfs_bio(bio)->file_offset = start;
-
-	while (cur_disk_bytenr < disk_start + compressed_len) {
-		u64 offset = cur_disk_bytenr - disk_start;
-		unsigned int index = offset >> PAGE_SHIFT;
-		unsigned int real_size;
-		unsigned int added;
-		struct page *page = compressed_pages[index];
-
-		/*
-		 * We have various limits on the real read size:
-		 * - page boundary
-		 * - compressed length boundary
-		 */
-		real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset));
-		real_size = min_t(u64, real_size, compressed_len - offset);
-		ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
-
-		added = bio_add_page(bio, page, real_size, offset_in_page(offset));
-		/*
-		 * Maximum compressed extent is smaller than bio size limit,
-		 * thus bio_add_page() should always success.
-		 */
-		ASSERT(added == real_size);
-		cur_disk_bytenr += added;
-	}
-
-	/* Finished the range. */
-	ASSERT(bio->bi_iter.bi_size);
-	btrfs_submit_bio(bio, 0);
-	if (blkcg_css)
-		kthread_associate_blkcg(NULL);
-	return ret;
-}
-
-static u64 bio_end_offset(struct bio *bio)
-{
-	struct bio_vec *last = bio_last_bvec_all(bio);
-
-	return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
+	btrfs_submit_bio(&cb->bbio, 0);
 }
 
 /*
@@ -374,7 +330,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	unsigned long end_index;
-	u64 cur = bio_end_offset(cb->orig_bio);
+	struct bio *orig_bio = &cb->orig_bbio->bio;
+	u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size;
 	u64 isize = i_size_read(inode);
 	int ret;
 	struct page *page;
@@ -464,7 +421,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	 */
 	if (!em || cur < em->start ||
 	    (cur + fs_info->sectorsize > extent_map_end(em)) ||
-	    (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
+	    (em->block_start >> 9) != orig_bio->bi_iter.bi_sector) {
 		free_extent_map(em);
 		unlock_extent(tree, cur, page_end, NULL);
 		unlock_page(page);
@@ -484,7 +441,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	}
 
 	add_size = min(em->start + em->len, page_end + 1) - cur;
-	ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
+	ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur));
 	if (ret != add_size) {
 		unlock_extent(tree, cur, page_end, NULL);
 		unlock_page(page);
@@ -515,17 +472,14 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 * After the compressed pages are read, we copy the bytes into the
 * bio we were passed and then call the bio end_io calls
 */
-void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
-				  int mirror_num)
+void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	struct extent_map_tree *em_tree;
+	struct btrfs_inode *inode = bbio->inode;
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct extent_map_tree *em_tree = &inode->extent_tree;
 	struct compressed_bio *cb;
 	unsigned int compressed_len;
-	struct bio *comp_bio;
-	const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
-	u64 cur_disk_byte = disk_bytenr;
-	u64 file_offset;
+	u64 file_offset = bbio->file_offset;
 	u64 em_len;
 	u64 em_start;
 	struct extent_map *em;
@@ -533,12 +487,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	int memstall = 0;
 	blk_status_t ret;
 	int ret2;
-	int i;
-
-	em_tree = &BTRFS_I(inode)->extent_tree;
-
-	file_offset = bio_first_bvec_all(bio)->bv_offset +
-		      page_offset(bio_first_page_all(bio));
 
 	/* we need the actual starting offset of this extent in the file */
 	read_lock(&em_tree->lock);
@@ -551,102 +499,54 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
 	compressed_len = em->block_len;
-	cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
-	if (!cb) {
-		ret = BLK_STS_RESOURCE;
-		goto out;
-	}
 
-	cb->status = BLK_STS_OK;
-	cb->inode = inode;
+	cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
+				  end_compressed_bio_read);
 
 	cb->start = em->orig_start;
 	em_len = em->len;
 	em_start = em->start;
 
-	cb->len = bio->bi_iter.bi_size;
+	cb->len = bbio->bio.bi_iter.bi_size;
 	cb->compressed_len = compressed_len;
 	cb->compress_type = em->compress_type;
-	cb->orig_bio = bio;
+	cb->orig_bbio = bbio;
 
 	free_extent_map(em);
-	em = NULL;
 
 	cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
 	cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS);
 	if (!cb->compressed_pages) {
 		ret = BLK_STS_RESOURCE;
-		goto fail;
+		goto out_free_bio;
 	}
 
 	ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
 	if (ret2) {
 		ret = BLK_STS_RESOURCE;
-		goto fail;
+		goto out_free_compressed_pages;
 	}
 
-	add_ra_bio_pages(inode, em_start + em_len, cb, &memstall, &pflags);
+	add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall,
+			 &pflags);
 
 	/* include any pages we added in add_ra-bio_pages */
-	cb->len = bio->bi_iter.bi_size;
-
-	comp_bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, BTRFS_I(cb->inode),
-				   end_compressed_bio_read, cb);
-	comp_bio->bi_iter.bi_sector = (cur_disk_byte >> SECTOR_SHIFT);
-
-	while (cur_disk_byte < disk_bytenr + compressed_len) {
-		u64 offset = cur_disk_byte - disk_bytenr;
-		unsigned int index = offset >> PAGE_SHIFT;
-		unsigned int real_size;
-		unsigned int added;
-		struct page *page = cb->compressed_pages[index];
-
-		/*
-		 * We have various limit on the real read size:
-		 * - page boundary
-		 * - compressed length boundary
-		 */
-		real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset));
-		real_size = min_t(u64, real_size, compressed_len - offset);
-		ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
-
-		added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset));
-		/*
-		 * Maximum compressed extent is smaller than bio size limit,
-		 * thus bio_add_page() should always success.
-		 */
-		ASSERT(added == real_size);
-		cur_disk_byte += added;
-	}
+	cb->len = bbio->bio.bi_iter.bi_size;
+	cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector;
+	btrfs_add_compressed_bio_pages(cb);
 
 	if (memstall)
 		psi_memstall_leave(&pflags);
 
-	/*
-	 * Stash the initial offset of this chunk, as there is no direct
-	 * correlation between compressed pages and the original file offset.
-	 * The field is only used for printing error messages anyway.
-	 */
-	btrfs_bio(comp_bio)->file_offset = file_offset;
-
-	ASSERT(comp_bio->bi_iter.bi_size);
-	btrfs_submit_bio(comp_bio, mirror_num);
+	btrfs_submit_bio(&cb->bbio, mirror_num);
 	return;
 
-fail:
-	if (cb->compressed_pages) {
-		for (i = 0; i < cb->nr_pages; i++) {
-			if (cb->compressed_pages[i])
-				__free_page(cb->compressed_pages[i]);
-		}
-	}
-
+out_free_compressed_pages:
 	kfree(cb->compressed_pages);
-	kfree(cb);
+out_free_bio:
+	bio_put(&cb->bbio.bio);
 out:
-	free_extent_map(em);
-	btrfs_bio_end_io(btrfs_bio(bio), ret);
-	return;
+	btrfs_bio_end_io(bbio, ret);
 }
 
 /*
@@ -1038,6 +938,8 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
 	ret = compression_decompress_bio(workspace, cb);
 	put_workspace(type, workspace);
 
+	if (!ret)
+		zero_fill_bio(&cb->orig_bbio->bio);
 	return ret;
 }
 
@@ -1062,6 +964,10 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
 
 int __init btrfs_init_compress(void)
 {
+	if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE,
+			offsetof(struct compressed_bio, bbio.bio),
+			BIOSET_NEED_BVECS))
+		return -ENOMEM;
 	btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
 	btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
 	btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
@@ -1075,6 +981,7 @@ void __cold btrfs_exit_compress(void)
 	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
 	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
 	zstd_cleanup_workspace_manager();
+	bioset_exit(&btrfs_compressed_bioset);
 }
 
 /*
@@ -1110,7 +1017,7 @@ void __cold btrfs_exit_compress(void)
 int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
 			      struct compressed_bio *cb, u32 decompressed)
 {
-	struct bio *orig_bio = cb->orig_bio;
+	struct bio *orig_bio = &cb->orig_bbio->bio;
 	/* Offset inside the full decompressed extent */
 	u32 cur_offset;
 
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index a5e3377db9ad..19ab2abeddc0 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -6,8 +6,8 @@
 #ifndef BTRFS_COMPRESSION_H
 #define BTRFS_COMPRESSION_H
 
-#include <linux/blk_types.h>
 #include <linux/sizes.h>
+#include "bio.h"
 
 struct btrfs_inode;
 
@@ -23,6 +23,7 @@ struct btrfs_inode;
 
 /* Maximum length of compressed data stored on disk */
 #define BTRFS_MAX_COMPRESSED		(SZ_128K)
+#define BTRFS_MAX_COMPRESSED_PAGES	(BTRFS_MAX_COMPRESSED / PAGE_SIZE)
 static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
 
 /* Maximum size of data before compression */
@@ -37,9 +38,6 @@ struct compressed_bio {
 	/* the pages with the compressed data on them */
 	struct page **compressed_pages;
 
-	/* inode that owns this data */
-	struct inode *inode;
-
 	/* starting offset in the inode for our pages */
 	u64 start;
 
@@ -55,14 +53,14 @@ struct compressed_bio {
 	/* Whether this is a write for writeback. */
 	bool writeback;
 
-	/* IO errors */
-	blk_status_t status;
-
 	union {
 		/* For reads, this is the bio we are copying the data into */
-		struct bio *orig_bio;
+		struct btrfs_bio *orig_bbio;
 		struct work_struct write_end_work;
 	};
+
+	/* Must be last. */
+	struct btrfs_bio bbio;
 };
 
 static inline unsigned int btrfs_compress_type(unsigned int type_level)
@@ -88,16 +86,14 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
 int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
 			      struct compressed_bio *cb, u32 decompressed);
 
-blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
+void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 				  unsigned int len, u64 disk_start,
 				  unsigned int compressed_len,
 				  struct page **compressed_pages,
 				  unsigned int nr_pages,
 				  blk_opf_t write_flags,
-				  struct cgroup_subsys_state *blkcg_css,
 				  bool writeback);
-void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
-				  int mirror_num);
+void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num);
 
 unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a5b6bb54545f..3c983c70028a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -854,7 +854,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 * Search for a key in the given extent_buffer.
 *
 * The lower boundary for the search is specified by the slot number @first_slot.
- * Use a value of 0 to search over the whole extent buffer.
+ * Use a value of 0 to search over the whole extent buffer. Works for both
+ * leaves and nodes.
 *
 * The slot in the extent buffer is returned via @slot. If the key exists in the
 * extent buffer, then @slot will point to the slot where the key is, otherwise
@@ -863,8 +864,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 * Slot may point to the total number of items (i.e. one position beyond the last
 * key) if the key is bigger than the last key in the extent buffer.
 */
-int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot,
-			     const struct btrfs_key *key, int *slot)
+int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
+		     const struct btrfs_key *key, int *slot)
 {
 	unsigned long p;
 	int item_size;
@@ -959,7 +960,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
 	if (slot < 0 || slot >= btrfs_header_nritems(parent))
 		return ERR_PTR(-ENOENT);
 
-	BUG_ON(level == 0);
+	ASSERT(level);
 
 	check.level = level - 1;
 	check.transid = btrfs_node_ptr_generation(parent, slot);
@@ -1064,11 +1065,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
 		return 0;
 
-	left = btrfs_read_node_slot(parent, pslot - 1);
-	if (IS_ERR(left))
-		left = NULL;
+	if (pslot) {
+		left = btrfs_read_node_slot(parent, pslot - 1);
+		if (IS_ERR(left)) {
+			ret = PTR_ERR(left);
+			left = NULL;
+			goto enospc;
+		}
 
-	if (left) {
 		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
 		wret = btrfs_cow_block(trans, root, left,
 				       parent, pslot - 1, &left,
@@ -1079,11 +1083,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		}
 	}
 
-	right = btrfs_read_node_slot(parent, pslot + 1);
-	if (IS_ERR(right))
-		right = NULL;
+	if (pslot + 1 < btrfs_header_nritems(parent)) {
+		right = btrfs_read_node_slot(parent, pslot + 1);
+		if (IS_ERR(right)) {
+			ret = PTR_ERR(right);
+			right = NULL;
+			goto enospc;
+		}
 
-	if (right) {
 		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
 		wret = btrfs_cow_block(trans, root, right,
 				       parent, pslot + 1, &right,
@@ -1240,14 +1247,14 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	if (!parent)
 		return 1;
 
-	left = btrfs_read_node_slot(parent, pslot - 1);
-	if (IS_ERR(left))
-		left = NULL;
-
 	/* first, try to make some room in the middle buffer */
-	if (left) {
+	if (pslot) {
 		u32 left_nr;
 
+		left = btrfs_read_node_slot(parent, pslot - 1);
+		if (IS_ERR(left))
+			return PTR_ERR(left);
+
 		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
 
 		left_nr = btrfs_header_nritems(left);
@@ -1292,16 +1299,17 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 		btrfs_tree_unlock(left);
 		free_extent_buffer(left);
 	}
-	right = btrfs_read_node_slot(parent, pslot + 1);
-	if (IS_ERR(right))
-		right = NULL;
 
 	/*
 	 * then try to empty the right most buffer into the middle
 	 */
-	if (right) {
+	if (pslot + 1 < btrfs_header_nritems(parent)) {
 		u32 right_nr;
 
+		right = btrfs_read_node_slot(parent, pslot + 1);
+		if (IS_ERR(right))
+			return PTR_ERR(right);
+
 		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
 
 		right_nr = btrfs_header_nritems(right);
@@ -1864,7 +1872,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb,
 		return 0;
 	}
 
-	return btrfs_generic_bin_search(eb, search_low_slot, key, slot);
+	return btrfs_bin_search(eb, search_low_slot, key, slot);
 }
 
 static int search_leaf(struct btrfs_trans_handle *trans,
@@ -2321,7 +2329,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
 	 */
 	btrfs_unlock_up_safe(p, level + 1);
 
-	ret = btrfs_bin_search(b, key, &slot);
+	ret = btrfs_bin_search(b, 0, key, &slot);
 	if (ret < 0)
 		goto done;
 
@@ -2482,26 +2490,15 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
 int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
 			      struct btrfs_path *path)
 {
-	while (1) {
+	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
 		int ret;
-		const int slot = path->slots[0];
-		const struct extent_buffer *leaf = path->nodes[0];
 
-		/* This is where we start walking the path. */
-		if (slot >= btrfs_header_nritems(leaf)) {
-			/*
-			 * If we've reached the last slot in this leaf we need
-			 * to go to the next leaf and reset the path.
-			 */
-			ret = btrfs_next_leaf(root, path);
-			if (ret)
-				return ret;
-			continue;
-		}
-		/* Store the found, valid item in @key. */
-		btrfs_item_key_to_cpu(leaf, key, slot);
-		break;
+		ret = btrfs_next_leaf(root, path);
+		if (ret)
+			return ret;
 	}
+
+	btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
 	return 0;
 }
 
@@ -3198,12 +3195,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_assert_tree_write_locked(path->nodes[1]);
 
 	right = btrfs_read_node_slot(upper, slot + 1);
-	/*
-	 * slot + 1 is not valid or we fail to read the right node,
-	 * no big deal, just return.
-	 */
 	if (IS_ERR(right))
-		return 1;
+		return PTR_ERR(right);
 
 	__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
 
@@ -3417,12 +3410,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_assert_tree_write_locked(path->nodes[1]);
 
 	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
-	/*
-	 * slot - 1 is not valid or we fail to read the left node,
-	 * no big deal, just return.
-	 */
 	if (IS_ERR(left))
-		return 1;
+		return PTR_ERR(left);
 
 	__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
 
@@ -4576,7 +4565,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 	while (1) {
 		nritems = btrfs_header_nritems(cur);
 		level = btrfs_header_level(cur);
-		sret = btrfs_bin_search(cur, min_key, &slot);
+		sret = btrfs_bin_search(cur, 0, min_key, &slot);
 		if (sret < 0) {
 			ret = sret;
 			goto out;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 97897107fab5..4c1986cd5bed 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -508,22 +508,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
 int __init btrfs_ctree_init(void);
 void __cold btrfs_ctree_exit(void);
 
-int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot,
-			     const struct btrfs_key *key, int *slot);
+int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
+		     const struct btrfs_key *key, int *slot);
 
-/*
- * Simple binary search on an extent buffer. Works for both leaves and nodes, and
- * always searches over the whole range of keys (slot 0 to slot 'nritems - 1').
- */
-static inline int btrfs_bin_search(struct extent_buffer *eb,
-				   const struct btrfs_key *key,
-				   int *slot)
-{
-	return btrfs_generic_bin_search(eb, 0, key, slot);
-}
-
-int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
-		     int *slot);
 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 7ddb1d104e8e..427abaf608b8 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -358,8 +358,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
 	 * racing with an ordered completion or some such that would think it
 	 * needs to free the reservation we just made.
 	 */
-	spin_lock(&inode->lock);
 	nr_extents = count_max_extents(fs_info, num_bytes);
+	spin_lock(&inode->lock);
 	btrfs_mod_outstanding_extents(inode, nr_extents);
 	inode->csum_bytes += disk_num_bytes;
 	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 886ffb232eac..0b32432d7d56 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -53,24 +53,6 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
 	return ret;
 }
 
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
-{
-	u64 num_entries =
-		atomic_read(&trans->transaction->delayed_refs.num_entries);
-	u64 avg_runtime;
-	u64 val;
-
-	smp_mb();
-	avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
-	val = num_entries * avg_runtime;
-	if (val >= NSEC_PER_SEC)
-		return 1;
-	if (val >= NSEC_PER_SEC / 2)
-		return 2;
-
-	return btrfs_check_space_for_delayed_refs(trans->fs_info);
-}
-
 /*
 * Release a ref head's reservation.
 *
@@ -83,20 +65,9 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
 void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
 {
 	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
-	u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
+	const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr);
 	u64 released = 0;
 
-	/*
-	 * We have to check the mount option here because we could be enabling
-	 * the free space tree for the first time and don't have the compat_ro
-	 * option set yet.
-	 *
-	 * We need extra reservations if we have the free space tree because
-	 * we'll have to modify that tree as well.
-	 */
-	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
-		num_bytes *= 2;
-
 	released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
 	if (released)
 		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
@@ -118,18 +89,8 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
 	if (!trans->delayed_ref_updates)
 		return;
 
-	num_bytes = btrfs_calc_insert_metadata_size(fs_info,
-						    trans->delayed_ref_updates);
-	/*
-	 * We have to check the mount option here because we could be enabling
-	 * the free space tree for the first time and don't have the compat_ro
-	 * option set yet.
-	 *
-	 * We need extra reservations if we have the free space tree because
-	 * we'll have to modify that tree as well.
-	 */
-	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
-		num_bytes *= 2;
+	num_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
						 trans->delayed_ref_updates);
 
 	spin_lock(&delayed_rsv->lock);
 	delayed_rsv->size += num_bytes;
@@ -200,7 +161,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
 				  enum btrfs_reserve_flush_enum flush)
 {
 	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
-	u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1);
+	u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1);
 	u64 num_bytes = 0;
 	int ret = -ENOSPC;
 
@@ -217,7 +178,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
 	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
 	if (ret)
 		return ret;
-	btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
+	btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
 	trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
 				      0, num_bytes, 1);
 	return 0;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 2eb34abf700f..b54261fe509b 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -253,6 +253,27 @@ extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
 int __init btrfs_delayed_ref_init(void);
 void __cold btrfs_delayed_ref_exit(void);
 
+static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_info,
+					       int num_delayed_refs)
+{
+	u64 num_bytes;
+
+	num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_delayed_refs);
+
+	/*
+	 * We have to check the mount option here because we could be enabling
+	 * the free space tree for the first time and don't have the compat_ro
+	 * option set yet.
+	 *
+	 * We need extra reservations if we have the free space tree because
+	 * we'll have to modify that tree as well.
+	 */
+	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+		num_bytes *= 2;
+
+	return num_bytes;
+}
+
 static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
 					  int action, u64 bytenr, u64 len, u64 parent)
 {
@@ -385,7 +406,6 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
 void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
 				       struct btrfs_block_rsv *src,
 				       u64 num_bytes);
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9e1596bb208d..59ea049fe7ee 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1341,17 +1341,8 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
 {
 	int ret;
-	unsigned int nofs_flag;
 
-	/*
-	 * We might be called under a transaction (e.g. indirect backref
-	 * resolution) which could deadlock if it triggers memory reclaim
-	 */
-	nofs_flag = memalloc_nofs_save();
-	ret = btrfs_drew_lock_init(&root->snapshot_lock);
-	memalloc_nofs_restore(nofs_flag);
-	if (ret)
-		goto fail;
+	btrfs_drew_lock_init(&root->snapshot_lock);
 
 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
 	    !btrfs_is_data_reloc_root(root)) {
@@ -2065,7 +2056,6 @@ void btrfs_put_root(struct btrfs_root *root)
 		WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
 		if (root->anon_dev)
 			free_anon_bdev(root->anon_dev);
-		btrfs_drew_lock_destroy(&root->snapshot_lock);
 		free_root_extent_buffers(root);
 #ifdef CONFIG_BTRFS_DEBUG
 		spin_lock(&root->fs_info->fs_roots_radix_lock);
@@ -2125,11 +2115,16 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
 	atomic_set(&fs_info->reloc_cancel_req, 0);
 }
 
-static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
+static int btrfs_init_btree_inode(struct super_block *sb)
 {
-	struct inode *inode = fs_info->btree_inode;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
 					      fs_info->tree_root);
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (!inode)
+		return -ENOMEM;
 
 	inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
 	set_nlink(inode, 1);
@@ -2140,6 +2135,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
 	 */
 	inode->i_size = OFFSET_MAX;
 	inode->i_mapping->a_ops = &btree_aops;
+	mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
 
 	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
 	extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
@@ -2152,6 +2148,9 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
 	BTRFS_I(inode)->location.offset = 0;
 	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
 	__insert_inode_hash(inode, hash);
+	fs_info->btree_inode = inode;
+
+	return 0;
 }
 
 static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
@@ -2966,7 +2965,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	atomic64_set(&fs_info->free_chunk_space, 0);
 	fs_info->tree_mod_log = RB_ROOT;
 	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
-	fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
 	btrfs_init_ref_verify(fs_info);
 
 	fs_info->thread_pool_size = min_t(unsigned long,
@@ -3344,14 +3342,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	struct btrfs_root *tree_root;
 	struct btrfs_root *chunk_root;
 	int ret;
-	int err = -EINVAL;
 	int level;
 
 	ret = init_mount_fs_info(fs_info, sb);
-	if (ret) {
-		err = ret;
+	if (ret)
 		goto fail;
-	}
 
 	/* These need to be init'ed before we start creating inodes and such. */
 	tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
@@ -3361,17 +3356,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 				     GFP_KERNEL);
 	fs_info->chunk_root = chunk_root;
 	if (!tree_root || !chunk_root) {
-		err = -ENOMEM;
+		ret = -ENOMEM;
 		goto fail;
 	}
 
-	fs_info->btree_inode = new_inode(sb);
-	if (!fs_info->btree_inode) {
-		err = -ENOMEM;
+	ret = btrfs_init_btree_inode(sb);
+	if (ret)
 		goto fail;
-	}
-	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
-	btrfs_init_btree_inode(fs_info);
 
 	invalidate_bdev(fs_devices->latest_dev->bdev);
 
@@ -3380,7 +3371,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	 */
 	disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
 	if (IS_ERR(disk_super)) {
-		err = PTR_ERR(disk_super);
+		ret = PTR_ERR(disk_super);
 		goto fail_alloc;
 	}
 
@@ -3392,7 +3383,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	if (!btrfs_supported_super_csum(csum_type)) {
 		btrfs_err(fs_info, "unsupported checksum algorithm: %u",
 			  csum_type);
-		err = -EINVAL;
+		ret = -EINVAL;
 		btrfs_release_disk_super(disk_super);
 		goto fail_alloc;
 	}
@@ -3401,7 +3392,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 
 	ret = btrfs_init_csum_hash(fs_info, csum_type);
 	if (ret) {
-		err = ret;
 		btrfs_release_disk_super(disk_super);
 		goto fail_alloc;
 	}
@@ -3412,7 +3402,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	 */
 	if (btrfs_check_super_csum(fs_info, disk_super)) {
 		btrfs_err(fs_info, "superblock checksum mismatch");
-		err = -EINVAL;
+		ret = -EINVAL;
 		btrfs_release_disk_super(disk_super);
 		goto fail_alloc;
 	}
@@ -3442,12 +3432,15 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	ret = btrfs_validate_mount_super(fs_info);
 	if (ret) {
 		btrfs_err(fs_info, "superblock contains fatal errors");
-		err = -EINVAL;
+		ret = -EINVAL;
 		goto fail_alloc;
 	}
 
-	if (!btrfs_super_root(disk_super))
+	if (!btrfs_super_root(disk_super)) {
+		btrfs_err(fs_info, "invalid superblock tree root bytenr");
+		ret = -EINVAL;
 		goto fail_alloc;
+	}
 
 	/* check FS state, whether FS is broken. */
 	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
@@ -3474,16 +3467,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	fs_info->stripesize = stripesize;
 
 	ret = btrfs_parse_options(fs_info, options, sb->s_flags);
-	if (ret) {
-		err = ret;
+	if (ret)
 		goto fail_alloc;
-	}
 
 	ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
-	if (ret < 0) {
-		err = ret;
+	if (ret < 0)
 		goto fail_alloc;
-	}
 
 	if (sectorsize < PAGE_SIZE) {
 		struct btrfs_subpage_info *subpage_info;
@@ -3503,17 +3492,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	"read-write for sector size %u with page size %lu is experimental",
 			   sectorsize, PAGE_SIZE);
 		subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
-		if (!subpage_info)
+		if (!subpage_info) {
+			ret = -ENOMEM;
 			goto fail_alloc;
+		}
 		btrfs_init_subpage_info(subpage_info, sectorsize);
 		fs_info->subpage_info = subpage_info;
 	}
 
 	ret = btrfs_init_workqueues(fs_info);
-	if (ret) {
-		err = ret;
+	if (ret)
 		goto fail_sb_buffer;
-	}
 
 	sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
 	sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
@@ -3559,6 +3548,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	btrfs_free_extra_devids(fs_devices);
 	if (!fs_devices->latest_dev->bdev) {
 		btrfs_err(fs_info, "failed to read devices");
+		ret = -EIO;
 		goto fail_tree_roots;
 	}
 
@@ -3574,8 +3564,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	ret = btrfs_get_dev_zone_info_all_devices(fs_info);
 	if (ret) {
 		btrfs_err(fs_info,
-			  "zoned: failed to read device zone info: %d",
-			  ret);
+			  "zoned: failed to read device zone info: %d", ret);
 		goto fail_block_groups;
 	}
 
@@ -3654,19 +3643,24 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	    !btrfs_check_rw_degradable(fs_info, NULL)) {
 		btrfs_warn(fs_info,
 	"writable mount is not allowed due to too many missing devices");
+		ret = -EINVAL;
 		goto fail_sysfs;
 	}
 
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
 					       "btrfs-cleaner");
-	if (IS_ERR(fs_info->cleaner_kthread))
+	if (IS_ERR(fs_info->cleaner_kthread)) {
+		ret = PTR_ERR(fs_info->cleaner_kthread);
 		goto fail_sysfs;
+	}
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
 						   tree_root,
 						   "btrfs-transaction");
-	if (IS_ERR(fs_info->transaction_kthread))
+	if (IS_ERR(fs_info->transaction_kthread)) {
+		ret = PTR_ERR(fs_info->transaction_kthread);
 		goto fail_cleaner;
+	}
 
 	if (!btrfs_test_opt(fs_info, NOSSD) &&
 	    !fs_info->fs_devices->rotating) {
@@ -3684,7 +3678,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	    fs_info->fs_devices->discardable) {
 		btrfs_set_and_info(fs_info, DISCARD_ASYNC,
 				   "auto enabling async discard");
-		btrfs_clear_opt(fs_info->mount_opt, NODISCARD);
 	}
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -3711,16 +3704,14 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	    !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
 		btrfs_info(fs_info, "start tree-log replay");
 		ret = btrfs_replay_log(fs_info, fs_devices);
-		if (ret) {
-			err = ret;
+		if (ret)
 			goto fail_qgroup;
-		}
 	}
 
 	fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
 	if (IS_ERR(fs_info->fs_root)) {
-		err = PTR_ERR(fs_info->fs_root);
-		btrfs_warn(fs_info, "failed to read fs tree: %d", err);
+		ret = PTR_ERR(fs_info->fs_root);
+		btrfs_warn(fs_info, "failed to read fs tree: %d", ret);
 		fs_info->fs_root = NULL;
 		goto fail_qgroup;
 	}
@@ -3797,7 +3788,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	iput(fs_info->btree_inode);
 fail:
 	btrfs_close_devices(fs_info->fs_devices);
-	return err;
+	ASSERT(ret < 0);
+	return ret;
 }
 ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
 
@@ -4094,6 +4086,8 @@ static void write_dev_flush(struct btrfs_device *device)
 {
 	struct bio *bio = &device->flush_bio;
 
+	device->last_flush_error = BLK_STS_OK;
+
 #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	/*
 	 * When a disk has write caching disabled, we skip submission of a bio
@@ -4122,25 +4116,24 @@ static void write_dev_flush(struct btrfs_device *device)
 
 /*
 * If the flush bio has been submitted by write_dev_flush, wait for it.
+ * Return true for any error, and false otherwise.
 */
-static blk_status_t wait_dev_flush(struct btrfs_device *device)
+static bool wait_dev_flush(struct btrfs_device *device)
 {
 	struct bio *bio = &device->flush_bio;
 
-	if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
-		return BLK_STS_OK;
+	if (!test_and_clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
+		return false;
 
-	clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
 	wait_for_completion_io(&device->flush_wait);
 
-	return bio->bi_status;
-}
+	if (bio->bi_status) {
+		device->last_flush_error = bio->bi_status;
+		btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS);
+		return true;
+	}
 
-static int check_barrier_error(struct btrfs_fs_info *fs_info)
-{
-	if (!btrfs_check_rw_degradable(fs_info, NULL))
-		return -EIO;
-	return 0;
+	return false;
 }
 
 /*
@@ -4152,7 +4145,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 	struct list_head *head;
 	struct btrfs_device *dev;
 	int errors_wait = 0;
-	blk_status_t ret;
 
 	lockdep_assert_held(&info->fs_devices->device_list_mutex);
 	/* send down all the barriers */
@@ -4167,7 +4159,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 			continue;
 
 		write_dev_flush(dev);
-		dev->last_flush_error = BLK_STS_OK;
 	}
 
 	/* wait for all the barriers */
@@ -4182,23 +4173,17 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
 			continue;
 
-		ret = wait_dev_flush(dev);
-		if (ret) {
-			dev->last_flush_error = ret;
|
|
- btrfs_dev_stat_inc_and_print(dev,
|
|
- BTRFS_DEV_STAT_FLUSH_ERRS);
|
|
+ if (wait_dev_flush(dev))
|
|
errors_wait++;
|
|
- }
|
|
}
|
|
|
|
- if (errors_wait) {
|
|
- /*
|
|
- * At some point we need the status of all disks
|
|
- * to arrive at the volume status. So error checking
|
|
- * is being pushed to a separate loop.
|
|
- */
|
|
- return check_barrier_error(info);
|
|
- }
|
|
+ /*
|
|
+ * Checks last_flush_error of disks in order to determine the device
|
|
+ * state.
|
|
+ */
|
|
+ if (errors_wait && !btrfs_check_rw_degradable(info, NULL))
|
|
+ return -EIO;
|
|
+
|
|
return 0;
|
|
}
|
|
|
|
@@ -4404,12 +4389,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
|
|
root_objectid = gang[i]->root_key.objectid;
|
|
err = btrfs_orphan_cleanup(gang[i]);
|
|
if (err)
|
|
- break;
|
|
+ goto out;
|
|
btrfs_put_root(gang[i]);
|
|
}
|
|
root_objectid++;
|
|
}
|
|
-
|
|
+out:
|
|
/* release the uncleaned roots due to error */
|
|
for (; i < ret; i++) {
|
|
if (gang[i])
|
|
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 824c657f59e8..5cd289de4e92 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1894,8 +1894,7 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
}

static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *locked_ref,
- unsigned long *run_refs)
+ struct btrfs_delayed_ref_head *locked_ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -1917,7 +1916,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
return -EAGAIN;
}

- (*run_refs)++;
ref->in_tree = 0;
rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
@@ -1981,10 +1979,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *locked_ref = NULL;
- ktime_t start = ktime_get();
int ret;
unsigned long count = 0;
- unsigned long actual_count = 0;

delayed_refs = &trans->transaction->delayed_refs;
do {
@@ -2014,8 +2010,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
spin_lock(&locked_ref->lock);
btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);

- ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
- &actual_count);
+ ret = btrfs_run_delayed_refs_for_head(trans, locked_ref);
if (ret < 0 && ret != -EAGAIN) {
/*
* Error, btrfs_run_delayed_refs_for_head already
@@ -2046,24 +2041,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
cond_resched();
} while ((nr != -1 && count < nr) || locked_ref);

- /*
- * We don't want to include ref heads since we can have empty ref heads
- * and those will drastically skew our runtime down since we just do
- * accounting, no actual extent tree updates.
- */
- if (actual_count > 0) {
- u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
- u64 avg;
-
- /*
- * We weigh the current average higher than our current runtime
- * to avoid large swings in the average.
- */
- spin_lock(&delayed_refs->lock);
- avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
- fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
- spin_unlock(&delayed_refs->lock);
- }
return 0;
}

@@ -5509,11 +5486,11 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
{
int level = wc->level;
int lookup_info = 1;
- int ret;
+ int ret = 0;

while (level >= 0) {
ret = walk_down_proc(trans, root, path, wc, lookup_info);
- if (ret > 0)
+ if (ret)
break;

if (level == 0)
@@ -5528,10 +5505,10 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
path->slots[level]++;
continue;
} else if (ret < 0)
- return ret;
+ break;
level = wc->level;
}
- return 0;
+ return (ret == 1) ? 0 : ret;
}

static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
@@ -5708,12 +5685,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)

ret = walk_down_tree(trans, root, path, wc);
if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
err = ret;
break;
}

ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
err = ret;
break;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 40300e8e5f99..a1adadd5d25d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -97,11 +97,13 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
* how many bytes are there before stripe/ordered extent boundary.
*/
struct btrfs_bio_ctrl {
- struct bio *bio;
+ struct btrfs_bio *bbio;
int mirror_num;
enum btrfs_compression_type compress_type;
u32 len_to_oe_boundary;
+ blk_opf_t opf;
btrfs_bio_end_io_t end_io_func;
+ struct writeback_control *wbc;

/*
* This is for metadata read, to provide the extra needed verification
@@ -117,51 +119,41 @@ struct btrfs_bio_ctrl {
* does the unlocking.
*/
bool extent_locked;
-
- /* Tell the submit_bio code to use REQ_SYNC */
- bool sync_io;
};

static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
- struct bio *bio;
- struct bio_vec *bv;
- struct inode *inode;
- int mirror_num;
+ struct btrfs_bio *bbio = bio_ctrl->bbio;
+ int mirror_num = bio_ctrl->mirror_num;

- if (!bio_ctrl->bio)
+ if (!bbio)
return;

- bio = bio_ctrl->bio;
- bv = bio_first_bvec_all(bio);
- inode = bv->bv_page->mapping->host;
- mirror_num = bio_ctrl->mirror_num;
-
/* Caller should ensure the bio has at least some range added */
- ASSERT(bio->bi_iter.bi_size);
+ ASSERT(bbio->bio.bi_iter.bi_size);

- if (!is_data_inode(inode)) {
- if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
+ if (!is_data_inode(&bbio->inode->vfs_inode)) {
+ if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) {
/*
* For metadata read, we should have the parent_check,
* and copy it to bbio for metadata verification.
*/
ASSERT(bio_ctrl->parent_check);
- memcpy(&btrfs_bio(bio)->parent_check,
+ memcpy(&bbio->parent_check,
bio_ctrl->parent_check,
sizeof(struct btrfs_tree_parent_check));
}
- bio->bi_opf |= REQ_META;
+ bbio->bio.bi_opf |= REQ_META;
}

- if (btrfs_op(bio) == BTRFS_MAP_READ &&
+ if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ &&
bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
- btrfs_submit_compressed_read(inode, bio, mirror_num);
+ btrfs_submit_compressed_read(bbio, mirror_num);
else
- btrfs_submit_bio(bio, mirror_num);
+ btrfs_submit_bio(bbio, mirror_num);

- /* The bio is owned by the end_io handler now */
- bio_ctrl->bio = NULL;
+ /* The bbio is owned by the end_io handler now */
+ bio_ctrl->bbio = NULL;
}

/*
@@ -169,16 +161,16 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
*/
static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
{
- struct bio *bio = bio_ctrl->bio;
+ struct btrfs_bio *bbio = bio_ctrl->bbio;

- if (!bio)
+ if (!bbio)
return;

if (ret) {
ASSERT(ret < 0);
- btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+ btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
/* The bio is owned by the end_io handler now */
- bio_ctrl->bio = NULL;
+ bio_ctrl->bbio = NULL;
} else {
submit_one_bio(bio_ctrl);
}
@@ -867,89 +859,52 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
return 0;
}

-/*
- * Attempt to add a page to bio.
- *
- * @bio_ctrl: record both the bio, and its bio_flags
- * @page: page to add to the bio
- * @disk_bytenr: offset of the new bio or to check whether we are adding
- * a contiguous page to the previous one
- * @size: portion of page that we want to write
- * @pg_offset: starting offset in the page
- * @compress_type: compression type of the current bio to see if we can merge them
- *
- * Attempt to add a page to bio considering stripe alignment etc.
- *
- * Return >= 0 for the number of bytes added to the bio.
- * Can return 0 if the current bio is already at stripe/zone boundary.
- * Return <0 for error.
- */
-static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
- struct page *page,
- u64 disk_bytenr, unsigned int size,
- unsigned int pg_offset,
- enum btrfs_compression_type compress_type)
+static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
+ struct page *page, u64 disk_bytenr,
+ unsigned int pg_offset)
{
- struct bio *bio = bio_ctrl->bio;
- u32 bio_size = bio->bi_iter.bi_size;
- u32 real_size;
+ struct bio *bio = &bio_ctrl->bbio->bio;
+ struct bio_vec *bvec = bio_last_bvec_all(bio);
const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
- bool contig = false;

- ASSERT(bio);
- /* The limit should be calculated when bio_ctrl->bio is allocated */
- ASSERT(bio_ctrl->len_to_oe_boundary);
- if (bio_ctrl->compress_type != compress_type)
- return 0;
-
-
- if (bio->bi_iter.bi_size == 0) {
- /* We can always add a page into an empty bio. */
- contig = true;
- } else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) {
- struct bio_vec *bvec = bio_last_bvec_all(bio);
-
- /*
- * The contig check requires the following conditions to be met:
- * 1) The pages are belonging to the same inode
- * This is implied by the call chain.
- *
- * 2) The range has adjacent logical bytenr
- *
- * 3) The range has adjacent file offset
- * This is required for the usage of btrfs_bio->file_offset.
- */
- if (bio_end_sector(bio) == sector &&
- page_offset(bvec->bv_page) + bvec->bv_offset +
- bvec->bv_len == page_offset(page) + pg_offset)
- contig = true;
- } else {
+ if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
/*
- * For compression, all IO should have its logical bytenr
- * set to the starting bytenr of the compressed extent.
+ * For compression, all IO should have its logical bytenr set
+ * to the starting bytenr of the compressed extent.
*/
- contig = bio->bi_iter.bi_sector == sector;
+ return bio->bi_iter.bi_sector == sector;
}

- if (!contig)
- return 0;
-
- real_size = min(bio_ctrl->len_to_oe_boundary - bio_size, size);
-
/*
- * If real_size is 0, never call bio_add_*_page(), as even size is 0,
- * bio will still execute its endio function on the page!
+ * The contig check requires the following conditions to be met:
+ *
+ * 1) The pages are belonging to the same inode
+ * This is implied by the call chain.
+ *
+ * 2) The range has adjacent logical bytenr
+ *
+ * 3) The range has adjacent file offset
+ * This is required for the usage of btrfs_bio->file_offset.
*/
- if (real_size == 0)
- return 0;
-
- return bio_add_page(bio, page, real_size, pg_offset);
+ return bio_end_sector(bio) == sector &&
+ page_offset(bvec->bv_page) + bvec->bv_offset + bvec->bv_len ==
+ page_offset(page) + pg_offset;
}

-static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
- struct btrfs_inode *inode, u64 file_offset)
+static void alloc_new_bio(struct btrfs_inode *inode,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ u64 disk_bytenr, u64 file_offset)
{
- struct btrfs_ordered_extent *ordered;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_bio *bbio;
+
+ bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info,
+ bio_ctrl->end_io_func, NULL);
+ bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+ bbio->inode = inode;
+ bbio->file_offset = file_offset;
+ bio_ctrl->bbio = bbio;
+ bio_ctrl->len_to_oe_boundary = U32_MAX;

/*
* Limit the extent to the ordered boundary for Zone Append.
@@ -957,132 +912,89 @@ static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
* them.
*/
if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE &&
- btrfs_use_zone_append(btrfs_bio(bio_ctrl->bio))) {
+ btrfs_use_zone_append(bbio)) {
+ struct btrfs_ordered_extent *ordered;
+
ordered = btrfs_lookup_ordered_extent(inode, file_offset);
if (ordered) {
bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
ordered->file_offset +
ordered->disk_num_bytes - file_offset);
btrfs_put_ordered_extent(ordered);
- return;
}
}

- bio_ctrl->len_to_oe_boundary = U32_MAX;
-}
-
-static void alloc_new_bio(struct btrfs_inode *inode,
- struct btrfs_bio_ctrl *bio_ctrl,
- struct writeback_control *wbc, blk_opf_t opf,
- u64 disk_bytenr, u32 offset, u64 file_offset,
- enum btrfs_compression_type compress_type)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct bio *bio;
-
- bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, inode, bio_ctrl->end_io_func,
- NULL);
- /*
- * For compressed page range, its disk_bytenr is always @disk_bytenr
- * passed in, no matter if we have added any range into previous bio.
- */
- if (compress_type != BTRFS_COMPRESS_NONE)
- bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
- else
- bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
- btrfs_bio(bio)->file_offset = file_offset;
- bio_ctrl->bio = bio;
- bio_ctrl->compress_type = compress_type;
- calc_bio_boundaries(bio_ctrl, inode, file_offset);
-
- if (wbc) {
+ if (bio_ctrl->wbc) {
/*
* Pick the last added device to support cgroup writeback. For
* multi-device file systems this means blk-cgroup policies have
* to always be set on the last added/replaced device.
* This is a bit odd but has been like that for a long time.
*/
- bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
- wbc_init_bio(wbc, bio);
+ bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
+ wbc_init_bio(bio_ctrl->wbc, &bbio->bio);
}
}

/*
- * @opf: bio REQ_OP_* and REQ_* flags as one value
- * @wbc: optional writeback control for io accounting
* @disk_bytenr: logical bytenr where the write will be
* @page: page to add to the bio
* @size: portion of page that we want to write to
* @pg_offset: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
- * @compress_type: compress type for current bio
*
- * The will either add the page into the existing @bio_ctrl->bio, or allocate a
- * new one in @bio_ctrl->bio.
+ * The will either add the page into the existing @bio_ctrl->bbio, or allocate a
+ * new one in @bio_ctrl->bbio.
* The mirror number for this IO should already be initialized in
* @bio_ctrl->mirror_num.
*/
-static int submit_extent_page(blk_opf_t opf,
- struct writeback_control *wbc,
- struct btrfs_bio_ctrl *bio_ctrl,
- u64 disk_bytenr, struct page *page,
- size_t size, unsigned long pg_offset,
- enum btrfs_compression_type compress_type,
- bool force_bio_submit)
+static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
+ u64 disk_bytenr, struct page *page,
+ size_t size, unsigned long pg_offset)
{
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- unsigned int cur = pg_offset;
-
- ASSERT(bio_ctrl);
-
- ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
- pg_offset + size <= PAGE_SIZE);

+ ASSERT(pg_offset + size <= PAGE_SIZE);
ASSERT(bio_ctrl->end_io_func);

- if (force_bio_submit)
+ if (bio_ctrl->bbio &&
+ !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset))
submit_one_bio(bio_ctrl);

- while (cur < pg_offset + size) {
- u32 offset = cur - pg_offset;
- int added;
+ do {
+ u32 len = size;

/* Allocate new bio if needed */
- if (!bio_ctrl->bio) {
- alloc_new_bio(inode, bio_ctrl, wbc, opf, disk_bytenr,
- offset, page_offset(page) + cur,
- compress_type);
+ if (!bio_ctrl->bbio) {
+ alloc_new_bio(inode, bio_ctrl, disk_bytenr,
+ page_offset(page) + pg_offset);
}
- /*
- * We must go through btrfs_bio_add_page() to ensure each
- * page range won't cross various boundaries.
- */
- if (compress_type != BTRFS_COMPRESS_NONE)
- added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
- size - offset, pg_offset + offset,
- compress_type);
- else
- added = btrfs_bio_add_page(bio_ctrl, page,
- disk_bytenr + offset, size - offset,
- pg_offset + offset, compress_type);
-
- /* Metadata page range should never be split */
- if (!is_data_inode(&inode->vfs_inode))
- ASSERT(added == 0 || added == size - offset);
-
- /* At least we added some page, update the account */
- if (wbc && added)
- wbc_account_cgroup_owner(wbc, page, added);
-
- /* We have reached boundary, submit right now */
- if (added < size - offset) {
- /* The bio should contain some page(s) */
- ASSERT(bio_ctrl->bio->bi_iter.bi_size);
+
+ /* Cap to the current ordered extent boundary if there is one. */
+ if (len > bio_ctrl->len_to_oe_boundary) {
+ ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE);
+ ASSERT(is_data_inode(&inode->vfs_inode));
+ len = bio_ctrl->len_to_oe_boundary;
+ }
+
+ if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) {
+ /* bio full: move on to a new one */
submit_one_bio(bio_ctrl);
+ continue;
}
- cur += added;
- }
- return 0;
+
+ if (bio_ctrl->wbc)
+ wbc_account_cgroup_owner(bio_ctrl->wbc, page, len);
+
+ size -= len;
+ pg_offset += len;
+ disk_bytenr += len;
+ bio_ctrl->len_to_oe_boundary -= len;
+
+ /* Ordered extent boundary: move on to a new bio. */
+ if (bio_ctrl->len_to_oe_boundary == 0)
+ submit_one_bio(bio_ctrl);
+ } while (size);
}

static int attach_extent_buffer_page(struct extent_buffer *eb,
@@ -1193,8 +1105,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* return 0 on success, otherwise return error
*/
static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
- struct btrfs_bio_ctrl *bio_ctrl,
- blk_opf_t read_flags, u64 *prev_em_start)
+ struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -1216,7 +1127,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
unlock_extent(tree, start, end, NULL);
btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
unlock_page(page);
- goto out;
+ return ret;
}

if (page->index == last_byte >> PAGE_SHIFT) {
@@ -1230,7 +1141,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
bio_ctrl->end_io_func = end_bio_extent_readpage;
begin_page_read(fs_info, page);
while (cur <= end) {
- unsigned long this_bio_flag = 0;
+ enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
bool force_bio_submit = false;
u64 disk_bytenr;

@@ -1247,19 +1158,18 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
if (IS_ERR(em)) {
unlock_extent(tree, cur, end, NULL);
end_page_read(page, false, cur, end + 1 - cur);
- ret = PTR_ERR(em);
- break;
+ return PTR_ERR(em);
}
extent_offset = cur - em->start;
BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);

if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- this_bio_flag = em->compress_type;
+ compress_type = em->compress_type;

iosize = min(extent_map_end(em) - cur, end - cur + 1);
iosize = ALIGN(iosize, blocksize);
- if (this_bio_flag != BTRFS_COMPRESS_NONE)
+ if (compress_type != BTRFS_COMPRESS_NONE)
disk_bytenr = em->block_start;
else
disk_bytenr = em->block_start + extent_offset;
@@ -1331,24 +1241,20 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
continue;
}

- ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
- bio_ctrl, disk_bytenr, page, iosize,
- pg_offset, this_bio_flag,
- force_bio_submit);
- if (ret) {
- /*
- * We have to unlock the remaining range, or the page
- * will never be unlocked.
- */
- unlock_extent(tree, cur, end, NULL);
- end_page_read(page, false, cur, end + 1 - cur);
- goto out;
+ if (bio_ctrl->compress_type != compress_type) {
+ submit_one_bio(bio_ctrl);
+ bio_ctrl->compress_type = compress_type;
}
+
+ if (force_bio_submit)
+ submit_one_bio(bio_ctrl);
+ submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
+ pg_offset);
cur = cur + iosize;
pg_offset += iosize;
}
-out:
- return ret;
+
+ return 0;
}

int btrfs_read_folio(struct file *file, struct folio *folio)
@@ -1357,12 +1263,12 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
- struct btrfs_bio_ctrl bio_ctrl = { 0 };
+ struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
int ret;

btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);

- ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
+ ret = btrfs_do_readpage(page, NULL, &bio_ctrl, NULL);
/*
* If btrfs_do_readpage() failed we will want to submit the assembled
* bio to do the cleanup.
@@ -1384,7 +1290,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,

for (index = 0; index < nr_pages; index++) {
btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
- REQ_RAHEAD, prev_em_start);
+ prev_em_start);
put_page(pages[index]);
}
}
@@ -1520,7 +1426,6 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
*/
static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
struct page *page,
- struct writeback_control *wbc,
struct btrfs_bio_ctrl *bio_ctrl,
loff_t i_size,
int *nr_ret)
@@ -1531,18 +1436,14 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
u64 extent_offset;
u64 block_start;
struct extent_map *em;
- int saved_ret = 0;
int ret = 0;
int nr = 0;
- enum req_op op = REQ_OP_WRITE;
- const blk_opf_t write_flags = wbc_to_write_flags(wbc);
- bool has_error = false;
bool compressed;

ret = btrfs_writepage_cow_fixup(page);
if (ret) {
/* Fixup worker will requeue */
- redirty_page_for_writepage(wbc, page);
+ redirty_page_for_writepage(bio_ctrl->wbc, page);
unlock_page(page);
return 1;
}
@@ -1551,7 +1452,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* we don't want to touch the inode after unlocking the page,
* so we update the mapping writeback index now
*/
- wbc->nr_to_write--;
+ bio_ctrl->wbc->nr_to_write--;

bio_ctrl->end_io_func = end_bio_extent_writepage;
while (cur <= end) {
@@ -1587,10 +1488,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
if (IS_ERR(em)) {
btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
ret = PTR_ERR_OR_ZERO(em);
- has_error = true;
- if (!saved_ret)
- saved_ret = ret;
- break;
+ goto out_error;
}

extent_offset = cur - em->start;
@@ -1642,33 +1540,21 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
*/
btrfs_page_clear_dirty(fs_info, page, cur, iosize);

- ret = submit_extent_page(op | write_flags, wbc,
- bio_ctrl, disk_bytenr,
- page, iosize,
- cur - page_offset(page),
- 0, false);
- if (ret) {
- has_error = true;
- if (!saved_ret)
- saved_ret = ret;
-
- btrfs_page_set_error(fs_info, page, cur, iosize);
- if (PageWriteback(page))
- btrfs_page_clear_writeback(fs_info, page, cur,
- iosize);
- }
-
+ submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
+ cur - page_offset(page));
cur += iosize;
nr++;
}
+
+ btrfs_page_assert_not_dirty(fs_info, page);
+ *nr_ret = nr;
+ return 0;
+
+out_error:
/*
* If we finish without problem, we should not only clear page dirty,
* but also empty subpage dirty bits
*/
- if (!has_error)
- btrfs_page_assert_not_dirty(fs_info, page);
- else
- ret = saved_ret;
*nr_ret = nr;
return ret;
}
@@ -1682,8 +1568,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* Return 0 if everything goes well.
* Return <0 for error.
*/
-static int __extent_writepage(struct page *page, struct writeback_control *wbc,
- struct btrfs_bio_ctrl *bio_ctrl)
+static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl)
{
struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
@@ -1696,7 +1581,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
loff_t i_size = i_size_read(inode);
unsigned long end_index = i_size >> PAGE_SHIFT;

- trace___extent_writepage(page, inode, wbc);
+ trace___extent_writepage(page, inode, bio_ctrl->wbc);

WARN_ON(!PageLocked(page));

@@ -1721,15 +1606,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}

if (!bio_ctrl->extent_locked) {
- ret = writepage_delalloc(BTRFS_I(inode), page, wbc);
+ ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc);
if (ret == 1)
return 0;
if (ret)
goto done;
}

- ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, bio_ctrl, i_size,
- &nr);
+ ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr);
if (ret == 1)
return 0;

@@ -1773,6 +1657,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
if (PageError(page))
end_extent_writepage(page, ret, page_start, page_end);
if (bio_ctrl->extent_locked) {
+ struct writeback_control *wbc = bio_ctrl->wbc;
+
/*
* If bio_ctrl->extent_locked, it's from extent_write_locked_range(),
* the page can either be locked by lock_page() or
@@ -1828,7 +1714,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb

if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
btrfs_tree_unlock(eb);
- if (!bio_ctrl->sync_io)
+ if (bio_ctrl->wbc->sync_mode != WB_SYNC_ALL)
return 0;
if (!flush) {
submit_write_bio(bio_ctrl, 0);
@@ -2113,15 +1999,12 @@ static void prepare_eb_write(struct extent_buffer *eb)
* Unlike the work in write_one_eb(), we rely completely on extent locking.
* Page locking is only utilized at minimum to keep the VMM code happy.
*/
-static int write_one_subpage_eb(struct extent_buffer *eb,
- struct writeback_control *wbc,
- struct btrfs_bio_ctrl *bio_ctrl)
+static void write_one_subpage_eb(struct extent_buffer *eb,
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page = eb->pages[0];
- blk_opf_t write_flags = wbc_to_write_flags(wbc);
bool no_dirty_ebs = false;
- int ret;

prepare_eb_write(eb);

@@ -2137,36 +2020,22 @@ static int write_one_subpage_eb(struct extent_buffer *eb,

bio_ctrl->end_io_func = end_bio_subpage_eb_writepage;

- ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- bio_ctrl, eb->start, page, eb->len,
- eb->start - page_offset(page), 0, false);
- if (ret) {
- btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
- set_btree_ioerr(page, eb);
- unlock_page(page);
-
- if (atomic_dec_and_test(&eb->io_pages))
- end_extent_buffer_writeback(eb);
- return -EIO;
- }
+ submit_extent_page(bio_ctrl, eb->start, page, eb->len,
+ eb->start - page_offset(page));
unlock_page(page);
/*
* Submission finished without problem, if no range of the page is
* dirty anymore, we have submitted a page. Update nr_written in wbc.
*/
if (no_dirty_ebs)
- wbc->nr_to_write--;
- return ret;
+ bio_ctrl->wbc->nr_to_write--;
}

-static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
- struct writeback_control *wbc,
+static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
struct btrfs_bio_ctrl *bio_ctrl)
{
u64 disk_bytenr = eb->start;
int i, num_pages;
- blk_opf_t write_flags = wbc_to_write_flags(wbc);
- int ret = 0;

prepare_eb_write(eb);

@@ -2178,32 +2047,11 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,

clear_page_dirty_for_io(p);
set_page_writeback(p);
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- bio_ctrl, disk_bytenr, p,
- PAGE_SIZE, 0, 0, false);
- if (ret) {
- set_btree_ioerr(p, eb);
- if (PageWriteback(p))
- end_page_writeback(p);
- if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
- end_extent_buffer_writeback(eb);
- ret = -EIO;
- break;
- }
+ submit_extent_page(bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0);
disk_bytenr += PAGE_SIZE;
- wbc->nr_to_write--;
+ bio_ctrl->wbc->nr_to_write--;
unlock_page(p);
}
-
- if (unlikely(ret)) {
- for (; i < num_pages; i++) {
- struct page *p = eb->pages[i];
- clear_page_dirty_for_io(p);
- unlock_page(p);
- }
- }
-
- return ret;
}

/*
@@ -2220,9 +2068,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
* Return >=0 for the number of submitted extent buffers.
* Return <0 for fatal error.
*/
-static int submit_eb_subpage(struct page *page,
- struct writeback_control *wbc,
- struct btrfs_bio_ctrl *bio_ctrl)
+static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
int submitted = 0;
@@ -2284,10 +2130,8 @@ static int submit_eb_subpage(struct page *page,
free_extent_buffer(eb);
goto cleanup;
}
- ret = write_one_subpage_eb(eb, wbc, bio_ctrl);
+ write_one_subpage_eb(eb, bio_ctrl);
free_extent_buffer(eb);
- if (ret < 0)
- goto cleanup;
submitted++;
}
return submitted;
@@ -2318,8 +2162,7 @@ static int submit_eb_subpage(struct page *page,
* previous call.
* Return <0 for fatal error.
*/
-static int submit_eb_page(struct page *page, struct writeback_control *wbc,
- struct btrfs_bio_ctrl *bio_ctrl,
+static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl,
struct extent_buffer **eb_context)
{
struct address_space *mapping = page->mapping;
@@ -2331,7 +2174,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
return 0;

if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
- return submit_eb_subpage(page, wbc, bio_ctrl);
+ return submit_eb_subpage(page, bio_ctrl);

spin_lock(&mapping->private_lock);
if (!PagePrivate(page)) {
@@ -2364,7 +2207,8 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
* If for_sync, this hole will be filled with
* trasnsaction commit.
*/
- if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
+ if (bio_ctrl->wbc->sync_mode == WB_SYNC_ALL &&
+ !bio_ctrl->wbc->for_sync)
ret = -EAGAIN;
else
ret = 0;
@@ -2389,10 +2233,8 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
btrfs_schedule_zone_finish_bg(cache, eb);
btrfs_put_block_group(cache);
}
- ret = write_one_eb(eb, wbc, bio_ctrl);
+ write_one_eb(eb, bio_ctrl);
free_extent_buffer(eb);
- if (ret < 0)
- return ret;
return 1;
}

@@ -2401,8 +2243,9 @@ int btree_write_cache_pages(struct address_space *mapping,
{
struct extent_buffer *eb_context = NULL;
struct btrfs_bio_ctrl bio_ctrl = {
+ .wbc = wbc,
+ .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),
.extent_locked = 0,
- .sync_io = (wbc->sync_mode == WB_SYNC_ALL),
};
struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
int ret = 0;
@@ -2445,8 +2288,7 @@ int btree_write_cache_pages(struct address_space *mapping,
for (i = 0; i < nr_folios; i++) {
struct folio *folio = fbatch.folios[i];

- ret = submit_eb_page(&folio->page, wbc, &bio_ctrl,
- &eb_context);
+ ret = submit_eb_page(&folio->page, &bio_ctrl, &eb_context);
if (ret == 0)
continue;
if (ret < 0) {
@@ -2529,9 +2371,9 @@ int btree_write_cache_pages(struct address_space *mapping,
* existing IO to complete.
*/
static int extent_write_cache_pages(struct address_space *mapping,
- struct writeback_control *wbc,
struct btrfs_bio_ctrl *bio_ctrl)
{
+ struct writeback_control *wbc = bio_ctrl->wbc;
struct inode *inode = mapping->host;
int ret = 0;
int done = 0;
@@ -2632,7 +2474,7 @@ static int extent_write_cache_pages(struct address_space *mapping,
continue;
}

- ret = __extent_writepage(&folio->page, wbc, bio_ctrl);
+ ret = __extent_writepage(&folio->page, bio_ctrl);
if (ret < 0) {
done = 1;
break;
@@ -2688,18 +2530,19 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
u64 cur = start;
unsigned long nr_pages;
const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
- struct btrfs_bio_ctrl bio_ctrl = {
- .extent_locked = 1,
- .sync_io = 1,
- };
struct writeback_control wbc_writepages = {
.sync_mode = WB_SYNC_ALL,
.range_start = start,
.range_end = end + 1,
- /* We're called from an async helper function */
- .punt_to_cgroup = 1,
.no_cgroup_owner = 1,
};
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .wbc = &wbc_writepages,
+ /* We're called from an async helper function */
+ .opf = REQ_OP_WRITE | REQ_BTRFS_CGROUP_PUNT |
+ wbc_to_write_flags(&wbc_writepages),
+ .extent_locked = 1,
+ };

ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
@@ -2719,7 +2562,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
ASSERT(PageLocked(page));
ASSERT(PageDirty(page));
clear_page_dirty_for_io(page);
- ret = __extent_writepage(page, &wbc_writepages, &bio_ctrl);
+ ret = __extent_writepage(page, &bio_ctrl);
ASSERT(ret <= 0);
if (ret < 0) {
found_error = true;
@@ -2743,8 +2586,9 @@ int extent_writepages(struct address_space *mapping,
struct inode *inode = mapping->host;
int ret = 0;
struct btrfs_bio_ctrl bio_ctrl = {
+ .wbc = wbc,
+ .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),
.extent_locked = 0,
- .sync_io = (wbc->sync_mode == WB_SYNC_ALL),
};

/*
@@ -2752,7 +2596,7 @@ int extent_writepages(struct address_space *mapping,
* protect the write pointer updates.
*/
btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
- ret = extent_write_cache_pages(mapping, wbc, &bio_ctrl);
+ ret = extent_write_cache_pages(mapping, &bio_ctrl);
submit_write_bio(&bio_ctrl, ret);
btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
return ret;
@@ -2760,7 +2604,7 @@ int extent_writepages(struct address_space *mapping,

void extent_readahead(struct readahead_control *rac)
{
- struct btrfs_bio_ctrl bio_ctrl = { 0 };
+ struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
struct page *pagepool[16];
struct extent_map *em_cached = NULL;
u64 prev_em_start = (u64)-1;
@@ -4407,10 +4251,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
struct page *page = eb->pages[0];
struct extent_state *cached_state = NULL;
struct btrfs_bio_ctrl bio_ctrl = {
+ .opf = REQ_OP_READ,
.mirror_num = mirror_num,
.parent_check = check,
};
- int ret = 0;
+ int ret;

ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
ASSERT(PagePrivate(page));
@@ -4428,14 +4273,13 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
return ret;
}

- ret = 0;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
PageUptodate(page) ||
btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state);
- return ret;
+ return 0;
}

clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
@@ -4447,28 +4291,19 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);

btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
- ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
- eb->start, page, eb->len,
- eb->start - page_offset(page), 0, true);
- if (ret) {
- /*
- * In the endio function, if we hit something wrong we will
- * increase the io_pages, so here we need to decrease it for
- * error path.
- */
- atomic_dec(&eb->io_pages);
- }
+ submit_extent_page(&bio_ctrl, eb->start, page, eb->len,
+ eb->start - page_offset(page));
submit_one_bio(&bio_ctrl);
- if (ret || wait != WAIT_COMPLETE) {
+ if (wait != WAIT_COMPLETE) {
free_extent_state(cached_state);
- return ret;
+ return 0;
}

wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1,
EXTENT_LOCKED, &cached_state);
if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
- ret = -EIO;
- return ret;
+ return -EIO;
+ return 0;
}

int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
@@ -4476,13 +4311,12 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
{
int i;
struct page *page;
- int err;
- int ret = 0;
int locked_pages = 0;
int all_uptodate = 1;
int num_pages;
unsigned long num_reads = 0;
struct btrfs_bio_ctrl bio_ctrl = {
+ .opf = REQ_OP_READ,
.mirror_num = mirror_num,
.parent_check = check,
};
@@ -4550,27 +4384,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
page = eb->pages[i];

if (!PageUptodate(page)) {
- if (ret) {
- atomic_dec(&eb->io_pages);
- unlock_page(page);
- continue;
- }
-
ClearPageError(page);
- err = submit_extent_page(REQ_OP_READ, NULL,
- &bio_ctrl, page_offset(page), page,
- PAGE_SIZE, 0, 0, false);
- if (err) {
- /*
- * We failed to submit the bio so it's the
- * caller's responsibility to perform cleanup
- * i.e unlock page/set error bit.
- */
- ret = err;
- SetPageError(page);
- unlock_page(page);
- atomic_dec(&eb->io_pages);
- }
+ submit_extent_page(&bio_ctrl, page_offset(page), page,
+ PAGE_SIZE, 0);
} else {
unlock_page(page);
}
@@ -4578,17 +4394,17 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,

submit_one_bio(&bio_ctrl);

- if (ret || wait != WAIT_COMPLETE)
- return ret;
+ if (wait != WAIT_COMPLETE)
+ return 0;

for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
wait_on_page_locked(page);
if (!PageUptodate(page))
- ret = -EIO;
+ return -EIO;
}

- return ret;
+ return 0;

unlock_exit:
while (locked_pages > 0) {
@@ -4596,7 +4412,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
page = eb->pages[locked_pages];
unlock_page(page);
}
- return ret;
+ return 0;
}

static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 41c77a100853..018c711a0bc8 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -335,48 +335,6 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info,
return ret;
}

-/*
- * Locate the file_offset of @cur_disk_bytenr of a @bio.
- *
- * Bio of btrfs represents read range of
- * [bi_sector << 9, bi_sector << 9 + bi_size).
- * Knowing this, we can iterate through each bvec to locate the page belong to
- * @cur_disk_bytenr and get the file offset.
- *
- * @inode is used to determine if the bvec page really belongs to @inode.
- *
- * Return 0 if we can't find the file offset
- * Return >0 if we find the file offset and restore it to @file_offset_ret
- */
-static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
- u64 disk_bytenr, u64 *file_offset_ret)
-{
- struct bvec_iter iter;
- struct bio_vec bvec;
- u64 cur = bio->bi_iter.bi_sector << SECTOR_SHIFT;
- int ret = 0;
-
- bio_for_each_segment(bvec, bio, iter) {
- struct page *page = bvec.bv_page;
-
- if (cur > disk_bytenr)
- break;
- if (cur + bvec.bv_len <= disk_bytenr) {
- cur += bvec.bv_len;
- continue;
- }
- ASSERT(in_range(disk_bytenr, cur, bvec.bv_len));
- if (page->mapping && page->mapping->host &&
- page->mapping->host == inode) {
- ret = 1;
- *file_offset_ret = page_offset(page) + bvec.bv_offset +
- disk_bytenr - cur;
- break;
- }
- }
- return ret;
-}
-
/*
* Lookup the checksum for the read bio in csum tree.
*
@@ -386,17 +344,15 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
{
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct extent_io_tree *io_tree = &inode->io_tree;
struct bio *bio = &bbio->bio;
struct btrfs_path *path;
const u32 sectorsize = fs_info->sectorsize;
const u32 csum_size = fs_info->csum_size;
u32 orig_len = bio->bi_iter.bi_size;
u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
- u64 cur_disk_bytenr;
const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
- int count = 0;
blk_status_t ret = BLK_STS_OK;
+ u32 bio_offset = 0;

if ((inode->flags & BTRFS_INODE_NODATASUM) ||
test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
@@ -447,28 +403,14 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
path->skip_locking = 1;
}

- for (cur_disk_bytenr = orig_disk_bytenr;
- cur_disk_bytenr < orig_disk_bytenr + orig_len;
- cur_disk_bytenr += (count * sectorsize)) {
- u64 search_len = orig_disk_bytenr + orig_len - cur_disk_bytenr;
- unsigned int sector_offset;
- u8 *csum_dst;
-
- /*
- * Although both cur_disk_bytenr and orig_disk_bytenr is u64,
- * we're calculating the offset to the bio start.
- *
- * Bio size is limited to UINT_MAX, thus unsigned int is large
- * enough to contain the raw result, not to mention the right
- * shifted result.
- */
- ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX);
- sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >>
- fs_info->sectorsize_bits;
- csum_dst = bbio->csum + sector_offset * csum_size;
+ while (bio_offset < orig_len) {
+ int count;
+ u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset;
+ u8 *csum_dst = bbio->csum +
+ (bio_offset >> fs_info->sectorsize_bits) * csum_size;

count = search_csum_tree(fs_info, path, cur_disk_bytenr,
- search_len, csum_dst);
+ orig_len - bio_offset, csum_dst);
if (count < 0) {
ret = errno_to_blk_status(count);
if (bbio->csum != bbio->csum_inline)
@@ -493,14 +435,9 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)

if (inode->root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
- u64 file_offset;
- int ret;
-
- ret = search_file_offset_in_bio(bio,
- &inode->vfs_inode,
- cur_disk_bytenr, &file_offset);
- if (ret)
- set_extent_bits(io_tree, file_offset,
+ u64 file_offset = bbio->file_offset + bio_offset;
+
+ set_extent_bits(&inode->io_tree, file_offset,
file_offset + sectorsize - 1,
EXTENT_NODATASUM);
} else {
@@ -509,6 +446,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
cur_disk_bytenr, cur_disk_bytenr + sectorsize);
}
}
+ bio_offset += count * sectorsize;
}

btrfs_free_path(path);
@@ -659,7 +597,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
* in is large enough to contain all csums.
*/
int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
- u8 *csum_buf, unsigned long *csum_bitmap)
+ u8 *csum_buf, unsigned long *csum_bitmap,
+ bool search_commit)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
@@ -676,6 +615,12 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
if (!path)
return -ENOMEM;

+ if (search_commit) {
+ path->skip_locking = 1;
+ path->reada = READA_FORWARD;
+ path->search_commit_root = 1;
+ }
+
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = start;
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
index cd7f2ae515c0..6be8725cd574 100644
--- a/fs/btrfs/file-item.h
+++ b/fs/btrfs/file-item.h
@@ -57,7 +57,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit,
bool nowait);
int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
- u8 *csum_buf, unsigned long *csum_bitmap);
+ u8 *csum_buf, unsigned long *csum_bitmap,
+ bool search_commit);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
struct btrfs_file_extent_item *fi,
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 24cd49229408..0d98fc5f6f44 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -24,6 +24,18 @@
#define BTRFS_SUPER_INFO_SIZE 4096
static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);

+/*
+ * Number of metadata items necessary for an unlink operation:
+ *
+ * 1 for the possible orphan item
+ * 1 for the dir item
+ * 1 for the dir index
+ * 1 for the inode ref
+ * 1 for the inode
+ * 1 for the parent inode
+ */
+#define BTRFS_UNLINK_METADATA_UNITS 6
+
/*
* The reserved space at the beginning of each device. It covers the primary
* super block and leaves space for potential use by other tools like
@@ -193,11 +205,7 @@ enum {
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL

-#ifdef CONFIG_BTRFS_DEBUG
-/*
- * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG
- */
-#define BTRFS_FEATURE_INCOMPAT_SUPP \
+#define BTRFS_FEATURE_INCOMPAT_SUPP_STABLE \
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
@@ -210,23 +218,22 @@ enum {
BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
- BTRFS_FEATURE_INCOMPAT_ZONED | \
+ BTRFS_FEATURE_INCOMPAT_ZONED)
+
+#ifdef CONFIG_BTRFS_DEBUG
+ /*
+ * Features under development like Extent tree v2 support are enabled
+ * only under CONFIG_BTRFS_DEBUG.
+ */
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \
BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
+
#else
-#define BTRFS_FEATURE_INCOMPAT_SUPP \
- (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
- BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
- BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
- BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
- BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
- BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
- BTRFS_FEATURE_INCOMPAT_RAID56 | \
- BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
- BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
- BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
- BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
- BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
- BTRFS_FEATURE_INCOMPAT_ZONED)
+
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE)
+
#endif

#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
@@ -412,7 +419,6 @@ struct btrfs_fs_info {
* Must be written and read while holding btrfs_fs_info::commit_root_sem.
*/
u64 last_reloc_trans;
- u64 avg_delayed_ref_runtime;

/*
* This is updated to the current trans every time a full commit is
@@ -638,7 +644,6 @@ struct btrfs_fs_info {
refcount_t scrub_workers_refcnt;
struct workqueue_struct *scrub_workers;
struct workqueue_struct *scrub_wr_completion_workers;
- struct workqueue_struct *scrub_parity_workers;
struct btrfs_subpage_info *subpage_info;

struct btrfs_discard_ctl discard_ctl;
@@ -828,7 +833,7 @@ static inline u64 btrfs_csum_bytes_to_leaves(
* Use this if we would be adding new items, as we could split nodes as we cow
* down the tree.
*/
-static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
+static inline u64 btrfs_calc_insert_metadata_size(const struct btrfs_fs_info *fs_info,
unsigned num_items)
{
return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
@@ -838,7 +843,7 @@ static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
* Doing a truncate or a modification won't result in new nodes or leaves, just
* what we need for COW.
*/
-static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
+static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info,
unsigned num_items)
{
return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index b65c45b5d681..4c322b720a80 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -527,7 +527,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,

while (1) {
u64 clear_start = 0, clear_len = 0, extent_start = 0;
- bool should_throttle = false;
+ bool refill_delayed_refs_rsv = false;

fi = NULL;
leaf = path->nodes[0];
@@ -660,8 +660,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
/* No pending yet, add ourselves */
pending_del_slot = path->slots[0];
pending_del_nr = 1;
- } else if (pending_del_nr &&
- path->slots[0] + 1 == pending_del_slot) {
+ } else if (path->slots[0] + 1 == pending_del_slot) {
/* Hop on the pending chunk */
pending_del_nr++;
pending_del_slot = path->slots[0];
@@ -686,10 +685,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, ret);
break;
}
- if (be_nice) {
- if (btrfs_should_throttle_delayed_refs(trans))
- should_throttle = true;
- }
+ if (be_nice && btrfs_check_space_for_delayed_refs(fs_info))
+ refill_delayed_refs_rsv = true;
}

if (found_type == BTRFS_INODE_ITEM_KEY)
@@ -697,7 +694,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,

if (path->slots[0] == 0 ||
path->slots[0] != pending_del_slot ||
- should_throttle) {
+ refill_delayed_refs_rsv) {
if (pending_del_nr) {
ret = btrfs_del_items(trans, root, path,
pending_del_slot,
@@ -720,7 +717,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
* actually allocate, so just bail if we're short and
* let the normal reservation dance happen higher up.
*/
- if (should_throttle) {
+ if (refill_delayed_refs_rsv) {
ret = btrfs_delayed_refs_rsv_refill(fs_info,
BTRFS_RESERVE_NO_FLUSH);
if (ret) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 957e4d76a7b6..57d070025c7a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -79,6 +79,7 @@ struct btrfs_iget_args {
struct btrfs_dio_data {
ssize_t submitted;
struct extent_changeset *data_reserved;
+ struct btrfs_ordered_extent *ordered;
bool data_space_reserved;
bool nocow_done;
};
@@ -669,8 +670,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
- nr_pages = min_t(unsigned long, nr_pages,
- BTRFS_MAX_COMPRESSED / PAGE_SIZE);
+ nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES);

/*
* we don't want to send crud past the end of i_size through
@@ -945,10 +945,9 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,
ret = cow_file_range(inode, locked_page, start, end, &page_started,
&nr_written, 0, NULL);
/* Inline extent inserted, page gets unlocked and everything is done */
- if (page_started) {
- ret = 0;
- goto out;
- }
+ if (page_started)
+ return 0;
+
if (ret < 0) {
btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
if (locked_page) {
@@ -962,14 +961,11 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,
end_extent_writepage(locked_page, ret, page_start, page_end);
unlock_page(locked_page);
}
- goto out;
+ return ret;
}

- ret = extent_write_locked_range(&inode->vfs_inode, start, end);
/* All pages will be unlocked, including @locked_page */
-out:
- kfree(async_extent);
- return ret;
+ return extent_write_locked_range(&inode->vfs_inode, start, end);
}

static int submit_one_async_extent(struct btrfs_inode *inode,
@@ -987,6 +983,9 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
u64 start = async_extent->start;
u64 end = async_extent->start + async_extent->ram_size - 1;

+ if (async_chunk->blkcg_css)
+ kthread_associate_blkcg(async_chunk->blkcg_css);
+
/*
* If async_chunk->locked_page is in the async_extent range, we need to
* handle it.
@@ -1001,8 +1000,10 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
lock_extent(io_tree, start, end, NULL);

/* We have fall back to uncompressed write */
- if (!async_extent->pages)
- return submit_uncompressed_range(inode, async_extent, locked_page);
+ if (!async_extent->pages) {
+ ret = submit_uncompressed_range(inode, async_extent, locked_page);
+ goto done;
+ }

ret = btrfs_reserve_extent(root, async_extent->ram_size,
async_extent->compressed_size,
@@ -1054,24 +1055,18 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
extent_clear_unlock_delalloc(inode, start, end,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
PAGE_UNLOCK | PAGE_START_WRITEBACK);
- if (btrfs_submit_compressed_write(inode, start, /* file_offset */
+
+ btrfs_submit_compressed_write(inode, start, /* file_offset */
async_extent->ram_size, /* num_bytes */
ins.objectid, /* disk_bytenr */
ins.offset, /* compressed_len */
async_extent->pages, /* compressed_pages */
async_extent->nr_pages,
- async_chunk->write_flags,
- async_chunk->blkcg_css, true)) {
- const u64 start = async_extent->start;
- const u64 end = start + async_extent->ram_size - 1;
-
- btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);
-
- extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
- PAGE_END_WRITEBACK | PAGE_SET_ERROR);
- free_async_extent_pages(async_extent);
- }
+ async_chunk->write_flags, true);
*alloc_hint = ins.objectid + ins.offset;
+done:
+ if (async_chunk->blkcg_css)
+ kthread_associate_blkcg(NULL);
kfree(async_extent);
return ret;

@@ -1086,8 +1081,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
PAGE_UNLOCK | PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK | PAGE_SET_ERROR);
free_async_extent_pages(async_extent);
- kfree(async_extent);
- return ret;
+ goto done;
}

/*
@@ -1622,6 +1616,7 @@ static int cow_file_range_async(struct btrfs_inode *inode,
if (blkcg_css != blkcg_root_css) {
css_get(blkcg_css);
async_chunk[i].blkcg_css = blkcg_css;
+ async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
} else {
async_chunk[i].blkcg_css = NULL;
}
@@ -2521,37 +2516,31 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
}

/*
- * Split an extent_map at [start, start + len]
+ * Split off the first pre bytes from the extent_map at [start, start + len]
*
* This function is intended to be used only for extract_ordered_extent().
*/
-static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
- u64 pre, u64 post)
+static int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre)
{
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
struct extent_map *split_pre = NULL;
struct extent_map *split_mid = NULL;
- struct extent_map *split_post = NULL;
int ret = 0;
unsigned long flags;

- /* Sanity check */
- if (pre == 0 && post == 0)
- return 0;
+ ASSERT(pre != 0);
+ ASSERT(pre < len);

split_pre = alloc_extent_map();
- if (pre)
- split_mid = alloc_extent_map();
- if (post)
- split_post = alloc_extent_map();
- if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
+ if (!split_pre)
+ return -ENOMEM;
+ split_mid = alloc_extent_map();
+ if (!split_mid) {
ret = -ENOMEM;
- goto out;
+ goto out_free_pre;
}

- ASSERT(pre + post < len);
-
lock_extent(&inode->io_tree, start, start + len - 1, NULL);
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
@@ -2572,7 +2561,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,

/* First, replace the em with a new extent_map starting from * em->start */
split_pre->start = em->start;
- split_pre->len = (pre ? pre : em->len - post);
+ split_pre->len = pre;
split_pre->orig_start = split_pre->start;
split_pre->block_start = em->block_start;
split_pre->block_len = split_pre->len;
@@ -2586,38 +2575,21 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,

/*
* Now we only have an extent_map at:
- * [em->start, em->start + pre] if pre != 0
- * [em->start, em->start + em->len - post] if pre == 0
- */
-
- if (pre) {
- /* Insert the middle extent_map */
- split_mid->start = em->start + pre;
- split_mid->len = em->len - pre - post;
- split_mid->orig_start = split_mid->start;
- split_mid->block_start = em->block_start + pre;
- split_mid->block_len = split_mid->len;
- split_mid->orig_block_len = split_mid->block_len;
- split_mid->ram_bytes = split_mid->len;
- split_mid->flags = flags;
- split_mid->compress_type = em->compress_type;
- split_mid->generation = em->generation;
- add_extent_mapping(em_tree, split_mid, 1);
- }
-
- if (post) {
- split_post->start = em->start + em->len - post;
- split_post->len = post;
- split_post->orig_start = split_post->start;
- split_post->block_start = em->block_start + em->len - post;
- split_post->block_len = split_post->len;
- split_post->orig_block_len = split_post->block_len;
- split_post->ram_bytes = split_post->len;
- split_post->flags = flags;
- split_post->compress_type = em->compress_type;
- split_post->generation = em->generation;
- add_extent_mapping(em_tree, split_post, 1);
- }
+ * [em->start, em->start + pre]
+ */
+
+ /* Insert the middle extent_map. */
+ split_mid->start = em->start + pre;
+ split_mid->len = em->len - pre;
+ split_mid->orig_start = split_mid->start;
+ split_mid->block_start = em->block_start + pre;
+ split_mid->block_len = split_mid->len;
+ split_mid->orig_block_len = split_mid->block_len;
+ split_mid->ram_bytes = split_mid->len;
+ split_mid->flags = flags;
+ split_mid->compress_type = em->compress_type;
+ split_mid->generation = em->generation;
+ add_extent_mapping(em_tree, split_mid, 1);

/* Once for us */
free_extent_map(em);
@@ -2627,72 +2599,41 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
out_unlock:
write_unlock(&em_tree->lock);
unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
-out:
- free_extent_map(split_pre);
free_extent_map(split_mid);
- free_extent_map(split_post);
-
+out_free_pre:
+ free_extent_map(split_pre);
return ret;
}

-blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio)
+int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
+ struct btrfs_ordered_extent *ordered)
{
u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
u64 len = bbio->bio.bi_iter.bi_size;
struct btrfs_inode *inode = bbio->inode;
- struct btrfs_ordered_extent *ordered;
- u64 file_len;
- u64 end = start + len;
- u64 ordered_end;
- u64 pre, post;
+ u64 ordered_len = ordered->num_bytes;
int ret = 0;

- ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset);
- if (WARN_ON_ONCE(!ordered))
- return BLK_STS_IOERR;
+ /* Must always be called for the beginning of an ordered extent. */
+ if (WARN_ON_ONCE(start != ordered->disk_bytenr))
+ return -EINVAL;

- /* No need to split */
+ /* No need to split if the ordered extent covers the entire bio. */
if (ordered->disk_num_bytes == len)
- goto out;
-
- /* We cannot split once end_bio'd ordered extent */
- if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) {
- ret = -EINVAL;
- goto out;
- }
-
- /* We cannot split a compressed ordered extent */
- if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) {
- ret = -EINVAL;
- goto out;
- }
-
- ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes;
- /* bio must be in one ordered extent */
- if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) {
- ret = -EINVAL;
- goto out;
- }
-
- /* Checksum list should be empty */
- if (WARN_ON_ONCE(!list_empty(&ordered->list))) {
- ret = -EINVAL;
- goto out;
- }
-
- file_len = ordered->num_bytes;
- pre = start - ordered->disk_bytenr;
- post = ordered_end - end;
+ return 0;

- ret = btrfs_split_ordered_extent(ordered, pre, post);
+ ret = btrfs_split_ordered_extent(ordered, len);
if (ret)
- goto out;
- ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post);
+ return ret;

-out:
- btrfs_put_ordered_extent(ordered);
+ /*
+ * Don't split the extent_map for NOCOW extents, as we're writing into
+ * a pre-existing one.
+ */
+ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+ return 0;

- return errno_to_blk_status(ret);
+ return split_extent_map(inode, bbio->file_offset, ordered_len, len);
}

/*
@@ -3367,13 +3308,6 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
return 0;
}

-static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 offset)
-{
- u64 offset_in_sectors = offset >> fs_info->sectorsize_bits;
-
- return csums + offset_in_sectors * fs_info->csum_size;
-}
-
/*
* Verify the checksum of a single data sector.
*
@@ -3411,7 +3345,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
return true;
}

- csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset);
+ csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
+ fs_info->csum_size;
if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
csum_expected))
goto zeroit;
@@ -3691,6 +3626,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ iput(inode);
goto out;
}
btrfs_debug(fs_info, "auto deleting %Lu",
@@ -3698,8 +3634,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
ret = btrfs_del_orphan_item(trans, root,
found_key.objectid);
btrfs_end_transaction(trans);
- if (ret)
+ if (ret) {
+ iput(inode);
goto out;
+ }
continue;
}

@@ -4261,15 +4199,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
{
struct btrfs_root *root = dir->root;

- /*
- * 1 for the possible orphan item
- * 1 for the dir item
- * 1 for the dir index
- * 1 for the inode ref
- * 1 for the inode
- * 1 for the parent inode
- */
- return btrfs_start_transaction_fallback_global_rsv(root, 6);
+ return btrfs_start_transaction_fallback_global_rsv(root,
+ BTRFS_UNLINK_METADATA_UNITS);
}

static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -5243,7 +5174,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
- u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
+ u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
int ret;

/*
@@ -5281,7 +5212,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
trans->block_rsv = &fs_info->trans_block_rsv;
trans->bytes_reserved = delayed_refs_extra;
btrfs_block_rsv_migrate(rsv, trans->block_rsv,
- delayed_refs_extra, 1);
+ delayed_refs_extra, true);
}
return trans;
}
@@ -5291,7 +5222,7 @@ void btrfs_evict_inode(struct inode *inode)
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_block_rsv *rsv;
+ struct btrfs_block_rsv *rsv = NULL;
int ret;

trace_btrfs_inode_evict(inode);
@@ -5308,18 +5239,18 @@ void btrfs_evict_inode(struct inode *inode)
((btrfs_root_refs(&root->root_item) != 0 &&
root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
btrfs_is_free_space_inode(BTRFS_I(inode))))
- goto no_delete;
+ goto out;

if (is_bad_inode(inode))
- goto no_delete;
+ goto out;

if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
- goto no_delete;
+ goto out;

if (inode->i_nlink > 0) {
BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
- goto no_delete;
+ goto out;
}

/*
@@ -5328,7 +5259,7 @@ void btrfs_evict_inode(struct inode *inode)
*/
ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
if (ret)
- goto no_delete;
+ goto out;

/*
* This drops any pending insert or delete operations we have for this
@@ -5340,7 +5271,7 @@ void btrfs_evict_inode(struct inode *inode)

rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
if (!rsv)
- goto no_delete;
+ goto out;
rsv->size = btrfs_calc_metadata_size(fs_info, 1);
rsv->failfast = true;

@@ -5356,16 +5287,21 @@ void btrfs_evict_inode(struct inode *inode)

trans = evict_refill_and_join(root, rsv);
if (IS_ERR(trans))
- goto free_rsv;
+ goto out;

trans->block_rsv = rsv;

ret = btrfs_truncate_inode_items(trans, root, &control);
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(fs_info);
+ /*
+ * We have not added new delayed items for our inode after we
+ * have flushed its delayed items, so no need to throttle on
+ * delayed items. However we have modified extent buffers.
+ */
+ btrfs_btree_balance_dirty_nodelay(fs_info);
if (ret && ret != -ENOSPC && ret != -EAGAIN)
- goto free_rsv;
+ goto out;
else if (!ret)
break;
}
@@ -5387,9 +5323,8 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_end_transaction(trans);
}

-free_rsv:
+out:
btrfs_free_block_rsv(fs_info, rsv);
-no_delete:
/*
* If we didn't successfully delete, the orphan item will still be in
* the tree and we'll retry on the next mount. Again, we might also want
@@ -6981,6 +6916,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
}

static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
+ struct btrfs_dio_data *dio_data,
const u64 start,
const u64 len,
const u64 orig_start,
@@ -6991,7 +6927,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
const int type)
{
struct extent_map *em = NULL;
- int ret;
+ struct btrfs_ordered_extent *ordered;

if (type != BTRFS_ORDERED_NOCOW) {
em = create_io_em(inode, start, len, orig_start, block_start,
@@ -7001,18 +6937,21 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
if (IS_ERR(em))
goto out;
}
- ret = btrfs_add_ordered_extent(inode, start, len, len, block_start,
- block_len, 0,
- (1 << type) |
- (1 << BTRFS_ORDERED_DIRECT),
- BTRFS_COMPRESS_NONE);
- if (ret) {
+ ordered = btrfs_alloc_ordered_extent(inode, start, len, len,
+ block_start, block_len, 0,
+ (1 << type) |
+ (1 << BTRFS_ORDERED_DIRECT),
+ BTRFS_COMPRESS_NONE);
+ if (IS_ERR(ordered)) {
if (em) {
free_extent_map(em);
btrfs_drop_extent_map_range(inode, start,
start + len - 1, false);
}
- em = ERR_PTR(ret);
+ em = ERR_CAST(ordered);
+ } else {
+ ASSERT(!dio_data->ordered);
+ dio_data->ordered = ordered;
}
out:

@@ -7020,6 +6959,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
}

static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
+ struct btrfs_dio_data *dio_data,
u64 start, u64 len)
{
struct btrfs_root *root = inode->root;
@@ -7035,7 +6975,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
if (ret)
return ERR_PTR(ret);

- em = btrfs_create_dio_extent(inode, start, ins.offset, start,
+ em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start,
ins.objectid, ins.offset, ins.offset,
ins.offset, BTRFS_ORDERED_REGULAR);
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -7380,7 +7320,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
}
space_reserved = true;

- em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
+ em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len,
orig_start, block_start,
len, orig_block_len,
ram_bytes, type);
@@ -7422,7 +7362,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
goto out;
space_reserved = true;

- em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
+ em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -7728,6 +7668,10 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
pos + length - 1, NULL);
ret = -ENOTBLK;
}
+ if (write) {
+ btrfs_put_ordered_extent(dio_data->ordered);
+ dio_data->ordered = NULL;
+ }

if (write)
extent_changeset_free(dio_data->data_reserved);
@@ -7767,14 +7711,34 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
container_of(bbio, struct btrfs_dio_private, bbio);
struct btrfs_dio_data *dio_data = iter->private;

- btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private);
+ btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
+ btrfs_dio_end_io, bio->bi_private);
+ bbio->inode = BTRFS_I(iter->inode);
bbio->file_offset = file_offset;

dip->file_offset = file_offset;
dip->bytes = bio->bi_iter.bi_size;

dio_data->submitted += bio->bi_iter.bi_size;
- btrfs_submit_bio(bio, 0);
+
+ /*
+ * Check if we are doing a partial write. If we are, we need to split
+ * the ordered extent to match the submitted bio. Hang on to the
+ * remaining unfinishable ordered_extent in dio_data so that it can be
+ * cancelled in iomap_end to avoid a deadlock wherein faulting the
+ * remaining pages is blocked on the outstanding ordered extent.
+ */
+ if (iter->flags & IOMAP_WRITE) {
+ int ret;
+
+ ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
+ if (ret) {
+ btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
+ return;
+ }
+ }
+
+ btrfs_submit_bio(bbio, 0);
}

static const struct iomap_ops btrfs_dio_iomap_ops = {
@@ -7789,7 +7753,7 @@ static const struct iomap_dio_ops btrfs_dio_ops = {

ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
{
- struct btrfs_dio_data data;
+ struct btrfs_dio_data data = { 0 };

return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
IOMAP_DIO_PARTIAL, &data, done_before);
@@ -7798,7 +7762,7 @@ ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_be
struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
size_t done_before)
{
- struct btrfs_dio_data data;
+ struct btrfs_dio_data data = { 0 };

return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
IOMAP_DIO_PARTIAL, &data, done_before);
@@ -9908,8 +9872,6 @@ static ssize_t btrfs_encoded_read_inline(
}

struct btrfs_encoded_read_private {
- struct btrfs_inode *inode;
- u64 file_offset;
wait_queue_head_t wait;
atomic_t pending;
blk_status_t status;
@@ -9939,45 +9901,41 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
u64 file_offset, u64 disk_bytenr,
u64 disk_io_size, struct page **pages)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_encoded_read_private priv = {
- .inode = inode,
- .file_offset = file_offset,
.pending = ATOMIC_INIT(1),
};
unsigned long i = 0;
- u64 cur = 0;
+ struct btrfs_bio *bbio;

init_waitqueue_head(&priv.wait);
- /* Submit bios for the extent, splitting due to bio limits as necessary. */
- while (cur < disk_io_size) {
- struct bio *bio = NULL;
- u64 remaining = disk_io_size - cur;
-
- while (bio || remaining) {
- size_t bytes = min_t(u64, remaining, PAGE_SIZE);
-
- if (!bio) {
- bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ,
- inode,
- btrfs_encoded_read_endio,
- &priv);
- bio->bi_iter.bi_sector =
- (disk_bytenr + cur) >> SECTOR_SHIFT;
- }

- if (!bytes ||
- bio_add_page(bio, pages[i], bytes, 0) < bytes) {
- atomic_inc(&priv.pending);
- btrfs_submit_bio(bio, 0);
- bio = NULL;
- continue;
- }
+ bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
+ btrfs_encoded_read_endio, &priv);
+ bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+ bbio->inode = inode;

- i++;
- cur += bytes;
- remaining -= bytes;
+ do {
+ size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
+
+ if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
+ atomic_inc(&priv.pending);
+ btrfs_submit_bio(bbio, 0);
+
+ bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
+ btrfs_encoded_read_endio, &priv);
+ bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+ bbio->inode = inode;
+ continue;
}
- }
+
+ i++;
+ disk_bytenr += bytes;
+ disk_io_size -= bytes;
+ } while (disk_io_size);
+
+ atomic_inc(&priv.pending);
+ btrfs_submit_bio(bbio, 0);

if (atomic_dec_return(&priv.pending))
io_wait_event(priv.wait, !atomic_read(&priv.pending));
@@ -10398,13 -10356,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,

btrfs_delalloc_release_extents(inode, num_bytes);

- if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
- ins.offset, pages, nr_pages, 0, NULL,
- false)) {
- btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0);
- ret = -EIO;
- goto out_pages;
- }
+ btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
+ ins.offset, pages, nr_pages, 0, false);
ret = orig_count;
goto out;

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ba769a1eb87a..25833b4eeaf5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3161,6 +3161,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
if (IS_ERR(sa))
return PTR_ERR(sa);

+ if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
ret = mnt_want_write_file(file);
if (ret)
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 870528d87526..3a496b0d3d2b 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -325,24 +325,12 @@ struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root)
* acquire the lock.
*/

-int btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
+void btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
{
- int ret;
-
- ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL);
- if (ret)
- return ret;
-
atomic_set(&lock->readers, 0);
+ atomic_set(&lock->writers, 0);
init_waitqueue_head(&lock->pending_readers);
init_waitqueue_head(&lock->pending_writers);
-
- return 0;
-}
-
-void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock)
-{
- percpu_counter_destroy(&lock->writers);
}

/* Return true if acquisition is successful, false otherwise */
@@ -351,10 +339,10 @@ bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
if (atomic_read(&lock->readers))
return false;

- percpu_counter_inc(&lock->writers);
+ atomic_inc(&lock->writers);

/* Ensure writers count is updated before we check for pending readers */
- smp_mb();
+ smp_mb__after_atomic();
if (atomic_read(&lock->readers)) {
btrfs_drew_write_unlock(lock);
return false;
@@ -374,7 +362,7 @@ void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)

void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
{
- percpu_counter_dec(&lock->writers);
+ atomic_dec(&lock->writers);
cond_wake_up(&lock->pending_readers);
}

@@ -390,8 +378,7 @@ void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
*/
smp_mb__after_atomic();

- wait_event(lock->pending_readers,
- percpu_counter_sum(&lock->writers) == 0);
+ wait_event(lock->pending_readers, atomic_read(&lock->writers) == 0);
}

void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 11c2269b4b6f..edb9b4a0dba1 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -195,13 +195,12 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)

struct btrfs_drew_lock {
atomic_t readers;
- struct percpu_counter writers;
+ atomic_t writers;
wait_queue_head_t pending_writers;
wait_queue_head_t pending_readers;
};

-int btrfs_drew_lock_init(struct btrfs_drew_lock *lock);
-void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock);
+void btrfs_drew_lock_init(struct btrfs_drew_lock *lock);
void btrfs_drew_write_lock(struct btrfs_drew_lock *lock);
bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock);
void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock);
diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h
index de3e18bce24a..00328c856be6 100644
--- a/fs/btrfs/lru_cache.h
+++ b/fs/btrfs/lru_cache.h
@@ -55,11 +55,6 @@ static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *ca
return cache->size;
}

-static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache)
-{
- return cache->size >= cache->max_size;
-}
-
static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry(
struct btrfs_lru_cache *cache)
{
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 71f6d8302d50..3a095b9c6373 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -17,6 +17,7 @@
#include "compression.h"
#include "ctree.h"
#include "super.h"
+#include "btrfs_inode.h"

#define LZO_LEN 4

@@ -329,7 +330,7 @@ static void copy_compressed_segment(struct compressed_bio *cb,
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
char *kaddr;
int ret;
@@ -388,8 +389,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
*/
btrfs_err(fs_info, "unexpectedly large lzo segment len %u",
seg_len);
- ret = -EIO;
- goto out;
+ return -EIO;
}

/* Copy the compressed segment payload into workspace */
@@ -400,8 +400,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
workspace->buf, &out_len);
if (ret != LZO_E_OK) {
btrfs_err(fs_info, "failed to decompress");
- ret = -EIO;
- goto out;
+ return -EIO;
}

/* Copy the data into inode pages */
@@ -410,7 +409,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)

/* All data read, exit */
if (ret == 0)
- goto out;
+ return 0;
ret = 0;

/* Check if the sector has enough space for a segment header */
@@ -421,10 +420,8 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* Skip the padding zeros */
cur_in += sector_bytes_left;
}
-out:
- if (!ret)
- zero_fill_bio(cb->orig_bio);
- return ret;
+
+ return 0;
}

int lzo_decompress(struct list_head *ws, const u8 *data_in,
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index fde5aaa6e7c9..310a05cf95ef 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -253,7 +253,7 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt,
#endif

#ifdef CONFIG_BTRFS_ASSERT
-void __cold btrfs_assertfail(const char *expr, const char *file, int line)
+void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line)
{
pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
BUG();
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 8c516ee58ff9..ac2d1982ba3d 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -160,7 +160,7 @@ do { \
} while (0)

#ifdef CONFIG_BTRFS_ASSERT
-void __cold btrfs_assertfail(const char *expr, const char *file, int line);
+void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line);

#define ASSERT(expr) \
(likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__))
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6c24b69e2d0a..a9778a91511e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -160,14 +160,16 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
* @compress_type: Compression algorithm used for data.
*
* Most of these parameters correspond to &struct btrfs_file_extent_item. The
- * tree is given a single reference on the ordered extent that was inserted.
+ * tree is given a single reference on the ordered extent that was inserted, and
+ * the returned pointer is given a second reference.
*
- * Return: 0 or -ENOMEM.
+ * Return: the new ordered extent or error pointer.
*/
-int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
- u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
- u64 disk_num_bytes, u64 offset, unsigned flags,
- int compress_type)
+struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
+ struct btrfs_inode *inode, u64 file_offset,
+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+ u64 disk_num_bytes, u64 offset, unsigned long flags,
+ int compress_type)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -181,7 +183,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
/* For nocow write, we can release the qgroup rsv right now */
ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
if (ret < 0)
- return ret;
+ return ERR_PTR(ret);
ret = 0;
} else {
/*
@@ -190,11 +192,11 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
*/
ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
if (ret < 0)
- return ret;
+ return ERR_PTR(ret);
}
entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
if (!entry)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);

entry->file_offset = file_offset;
entry->num_bytes = num_bytes;
@@ -256,6 +258,32 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
btrfs_mod_outstanding_extents(inode, 1);
spin_unlock(&inode->lock);

+ /* One ref for the returned entry to match semantics of lookup. */
+ refcount_inc(&entry->refs);
+
+ return entry;
+}
+
+/*
+ * Add a new btrfs_ordered_extent for the range, but drop the reference instead
+ * of returning it to the caller.
+ */
+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+ u64 disk_num_bytes, u64 offset, unsigned long flags,
+ int compress_type)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes,
+ ram_bytes, disk_bytenr,
+ disk_num_bytes, offset, flags,
+ compress_type);
+
+ if (IS_ERR(ordered))
+ return PTR_ERR(ordered);
+ btrfs_put_ordered_extent(ordered);
+
return 0;
}

@@ -1088,39 +1116,37 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
return false;
}

-
-static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
- u64 len)
-{
- struct inode *inode = ordered->inode;
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- u64 file_offset = ordered->file_offset + pos;
- u64 disk_bytenr = ordered->disk_bytenr + pos;
- unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;
-
- /*
- * The splitting extent is already counted and will be added again in
- * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting.
- */
- percpu_counter_add_batch(&fs_info->ordered_bytes, -len,
- fs_info->delalloc_batch);
- WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED));
- return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len,
- disk_bytenr, len, 0, flags,
- ordered->compress_type);
-}
-
-int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
- u64 post)
+/* Split out a new ordered extent for this first @len bytes of @ordered. */
+int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len)
{
struct inode *inode = ordered->inode;
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
- struct rb_node *node;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int ret = 0;
+ u64 file_offset = ordered->file_offset;
+ u64 disk_bytenr = ordered->disk_bytenr;
+ unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;
+ struct rb_node *node;

trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered);

+ ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)));
+
+ /*
+ * The entire bio must be covered by the ordered extent, but we can't
+ * reduce the original extent to a zero length either.
+ */
+ if (WARN_ON_ONCE(len >= ordered->num_bytes))
+ return -EINVAL;
+ /* We cannot split once ordered extent is past end_bio. */
+ if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
+ return -EINVAL;
+ /* We cannot split a compressed ordered extent. */
+ if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes))
+ return -EINVAL;
+ /* Checksum list should be empty. */
+ if (WARN_ON_ONCE(!list_empty(&ordered->list)))
+ return -EINVAL;
+
spin_lock_irq(&tree->lock);
/* Remove from tree once */
node = &ordered->rb_node;
@@ -1129,11 +1155,11 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
if (tree->last == node)
tree->last = NULL;

- ordered->file_offset += pre;
- ordered->disk_bytenr += pre;
- ordered->num_bytes -= (pre + post);
- ordered->disk_num_bytes -= (pre + post);
- ordered->bytes_left -= (pre + post);
+ ordered->file_offset += len;
+ ordered->disk_bytenr += len;
+ ordered->num_bytes -= len;
+ ordered->disk_num_bytes -= len;
+ ordered->bytes_left -= len;

/* Re-insert the node */
node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
@@ -1144,13 +1170,15 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,

spin_unlock_irq(&tree->lock);

- if (pre)
- ret = clone_ordered_extent(ordered, 0, pre);
- if (ret == 0 && post)
- ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes,
- post);
+ /*
+ * The splitting extent is already counted and will be added again in
+ * btrfs_add_ordered_extent(). Subtract len to avoid double counting.
+ */
+ percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch);

- return ret;
+ return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len,
+ disk_bytenr, len, 0, flags,
+ ordered->compress_type);
}

int __init ordered_data_init(void)
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index eb40cb39f842..f0f1138d23c3 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -178,9 +178,14 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
+struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
+ struct btrfs_inode *inode, u64 file_offset,
+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+ u64 disk_num_bytes, u64 offset, unsigned long flags,
+ int compress_type);
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
- u64 disk_num_bytes, u64 offset, unsigned flags,
+ u64 disk_num_bytes, u64 offset, unsigned long flags,
int compress_type);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
@@ -207,8 +212,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
struct extent_state **cached_state);
bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
-int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
- u64 post);
+int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len);
int __init ordered_data_init(void);
void __cold ordered_data_exit(void);

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
|
|
index 642828c1b299..2fab37f062de 100644
|
|
--- a/fs/btrfs/raid56.c
|
|
+++ b/fs/btrfs/raid56.c
|
|
@@ -202,7 +202,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
|
|
*/
|
|
static int rbio_bucket(struct btrfs_raid_bio *rbio)
|
|
{
|
|
- u64 num = rbio->bioc->raid_map[0];
|
|
+ u64 num = rbio->bioc->full_stripe_logical;
|
|
|
|
/*
|
|
* we shift down quite a bit. We're using byte
|
|
@@ -407,16 +407,15 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
|
|
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
|
|
{
|
|
struct btrfs_stripe_hash_table *table;
|
|
- unsigned long flags;
|
|
|
|
if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
|
|
return;
|
|
|
|
table = rbio->bioc->fs_info->stripe_hash_table;
|
|
|
|
- spin_lock_irqsave(&table->cache_lock, flags);
|
|
+ spin_lock(&table->cache_lock);
|
|
__remove_rbio_from_cache(rbio);
|
|
- spin_unlock_irqrestore(&table->cache_lock, flags);
|
|
+ spin_unlock(&table->cache_lock);
|
|
}
|
|
|
|
/*
|
|
@@ -425,19 +424,18 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
|
|
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
|
|
{
|
|
struct btrfs_stripe_hash_table *table;
|
|
- unsigned long flags;
|
|
struct btrfs_raid_bio *rbio;
|
|
|
|
table = info->stripe_hash_table;
|
|
|
|
- spin_lock_irqsave(&table->cache_lock, flags);
|
|
+ spin_lock(&table->cache_lock);
|
|
while (!list_empty(&table->stripe_cache)) {
|
|
rbio = list_entry(table->stripe_cache.next,
|
|
struct btrfs_raid_bio,
|
|
stripe_cache);
|
|
__remove_rbio_from_cache(rbio);
|
|
}
|
|
- spin_unlock_irqrestore(&table->cache_lock, flags);
|
|
+ spin_unlock(&table->cache_lock);
|
|
}
|
|
|
|
/*
|
|
@@ -467,14 +465,13 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
|
|
static void cache_rbio(struct btrfs_raid_bio *rbio)
|
|
{
|
|
struct btrfs_stripe_hash_table *table;
|
|
- unsigned long flags;
|
|
|
|
if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
|
|
return;
|
|
|
|
table = rbio->bioc->fs_info->stripe_hash_table;
|
|
|
|
- spin_lock_irqsave(&table->cache_lock, flags);
|
|
+ spin_lock(&table->cache_lock);
|
|
spin_lock(&rbio->bio_list_lock);
|
|
|
|
/* bump our ref if we were not in the list before */
|
|
@@ -501,7 +498,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
|
|
__remove_rbio_from_cache(found);
|
|
}
|
|
|
|
- spin_unlock_irqrestore(&table->cache_lock, flags);
|
|
+ spin_unlock(&table->cache_lock);
|
|
}
|
|
|
|
/*
|
|
@@ -530,15 +527,14 @@ static void run_xor(void **pages, int src_cnt, ssize_t len)
|
|
*/
|
|
static int rbio_is_full(struct btrfs_raid_bio *rbio)
|
|
{
|
|
- unsigned long flags;
|
|
unsigned long size = rbio->bio_list_bytes;
|
|
int ret = 1;
|
|
|
|
- spin_lock_irqsave(&rbio->bio_list_lock, flags);
|
|
+ spin_lock(&rbio->bio_list_lock);
|
|
if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
|
|
ret = 0;
|
|
BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
|
|
- spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
|
|
+ spin_unlock(&rbio->bio_list_lock);
|
|
|
|
return ret;
|
|
}
|
|
@@ -571,7 +567,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
|
|
test_bit(RBIO_CACHE_BIT, &cur->flags))
|
|
return 0;
|
|
|
|
- if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
|
|
+ if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
|
|
return 0;
|
|
|
|
/* we can't merge with different operations */
|
|
@@ -657,16 +653,15 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
|
|
struct btrfs_stripe_hash *h;
|
|
struct btrfs_raid_bio *cur;
|
|
struct btrfs_raid_bio *pending;
|
|
- unsigned long flags;
|
|
struct btrfs_raid_bio *freeit = NULL;
|
|
struct btrfs_raid_bio *cache_drop = NULL;
|
|
int ret = 0;
|
|
|
|
h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
|
|
|
|
- spin_lock_irqsave(&h->lock, flags);
|
|
+ spin_lock(&h->lock);
|
|
list_for_each_entry(cur, &h->hash_list, hash_list) {
|
|
- if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
|
|
+ if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
|
|
continue;
|
|
|
|
spin_lock(&cur->bio_list_lock);
|
|
@@ -724,7 +719,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
|
|
refcount_inc(&rbio->refs);
|
|
list_add(&rbio->hash_list, &h->hash_list);
|
|
out:
|
|
- spin_unlock_irqrestore(&h->lock, flags);
|
|
+ spin_unlock(&h->lock);
|
|
if (cache_drop)
|
|
remove_rbio_from_cache(cache_drop);
|
|
if (freeit)
|
|
@@ -742,7 +737,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
|
|
{
|
|
int bucket;
|
|
struct btrfs_stripe_hash *h;
|
|
- unsigned long flags;
|
|
int keep_cache = 0;
|
|
|
|
bucket = rbio_bucket(rbio);
|
|
@@ -751,7 +745,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
|
|
if (list_empty(&rbio->plug_list))
|
|
cache_rbio(rbio);
|
|
|
|
- spin_lock_irqsave(&h->lock, flags);
|
|
+ spin_lock(&h->lock);
|
|
spin_lock(&rbio->bio_list_lock);
|
|
|
|
if (!list_empty(&rbio->hash_list)) {
|
|
@@ -788,7 +782,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
|
|
list_add(&next->hash_list, &h->hash_list);
|
|
refcount_inc(&next->refs);
|
|
spin_unlock(&rbio->bio_list_lock);
|
|
- spin_unlock_irqrestore(&h->lock, flags);
|
|
+ spin_unlock(&h->lock);
|
|
|
|
if (next->operation == BTRFS_RBIO_READ_REBUILD)
|
|
start_async_work(next, recover_rbio_work_locked);
|
|
@@ -808,7 +802,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
|
|
}
|
|
done:
|
|
spin_unlock(&rbio->bio_list_lock);
|
|
- spin_unlock_irqrestore(&h->lock, flags);
|
|
+ spin_unlock(&h->lock);
|
|
|
|
done_nolock:
|
|
if (!keep_cache)
|
|
@@ -891,16 +885,16 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
|
|
index = stripe_nr * rbio->stripe_nsectors + sector_nr;
|
|
ASSERT(index >= 0 && index < rbio->nr_sectors);
|
|
|
|
- spin_lock_irq(&rbio->bio_list_lock);
|
|
+ spin_lock(&rbio->bio_list_lock);
|
|
sector = &rbio->bio_sectors[index];
|
|
if (sector->page || bio_list_only) {
|
|
/* Don't return sector without a valid page pointer */
|
|
if (!sector->page)
|
|
sector = NULL;
|
|
- spin_unlock_irq(&rbio->bio_list_lock);
|
|
+ spin_unlock(&rbio->bio_list_lock);
|
|
return sector;
|
|
}
|
|
- spin_unlock_irq(&rbio->bio_list_lock);
|
|
+ spin_unlock(&rbio->bio_list_lock);
|
|
|
|
return &rbio->stripe_sectors[index];
|
|
}
|
|
@@ -912,7 +906,7 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
|
|
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
|
|
struct btrfs_io_context *bioc)
|
|
{
|
|
- const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
|
|
+ const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
|
|
const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
|
|
const unsigned int num_pages = stripe_npages * real_stripes;
|
|
const unsigned int stripe_nsectors =
|
|
@@ -1108,7 +1102,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
|
|
bio->bi_iter.bi_sector = disk_start >> 9;
|
|
bio->bi_private = rbio;
|
|
|
|
- bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
|
|
+ __bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
|
|
bio_list_add(bio_list, bio);
|
|
return 0;
|
|
}
|
|
@@ -1119,7 +1113,7 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
|
|
struct bio_vec bvec;
|
|
struct bvec_iter iter;
|
|
u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
|
|
- rbio->bioc->raid_map[0];
|
|
+ rbio->bioc->full_stripe_logical;
|
|
|
|
bio_for_each_segment(bvec, bio, iter) {
|
|
u32 bvec_offset;
|
|
@@ -1148,11 +1142,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
|
|
{
|
|
struct bio *bio;
|
|
|
|
- spin_lock_irq(&rbio->bio_list_lock);
|
|
+ spin_lock(&rbio->bio_list_lock);
|
|
bio_list_for_each(bio, &rbio->bio_list)
|
|
index_one_bio(rbio, bio);
|
|
|
|
- spin_unlock_irq(&rbio->bio_list_lock);
|
|
+ spin_unlock(&rbio->bio_list_lock);
|
|
}
|
|
|
|
static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
|
|
@@ -1282,10 +1276,16 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
|
|
goto error;
|
|
}
|
|
|
|
- if (likely(!rbio->bioc->num_tgtdevs))
|
|
+ if (likely(!rbio->bioc->replace_nr_stripes))
|
|
return 0;
|
|
|
|
- /* Make a copy for the replace target device. */
|
|
+ /*
|
|
+ * Make a copy for the replace target device.
|
|
+ *
|
|
+ * Thus the source stripe number (in replace_stripe_src) should be valid.
|
|
+ */
|
|
+ ASSERT(rbio->bioc->replace_stripe_src >= 0);
|
|
+
|
|
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
|
|
total_sector_nr++) {
|
|
struct sector_ptr *sector;
|
|
@@ -1293,7 +1293,12 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
|
|
stripe = total_sector_nr / rbio->stripe_nsectors;
|
|
sectornr = total_sector_nr % rbio->stripe_nsectors;
|
|
|
|
- if (!rbio->bioc->tgtdev_map[stripe]) {
|
|
+ /*
|
|
+ * For RAID56, there is only one device that can be replaced,
|
|
+ * and replace_stripe_src[0] indicates the stripe number we
|
|
+ * need to copy from.
|
|
+ */
|
|
+ if (stripe != rbio->bioc->replace_stripe_src) {
|
|
/*
|
|
* We can skip the whole stripe completely, note
|
|
* total_sector_nr will be increased by one anyway.
|
|
@@ -1316,7 +1321,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
|
|
}
|
|
|
|
ret = rbio_add_io_sector(rbio, bio_list, sector,
|
|
- rbio->bioc->tgtdev_map[stripe],
|
|
+ rbio->real_stripes,
|
|
sectornr, REQ_OP_WRITE);
|
|
if (ret)
|
|
goto error;
|
|
@@ -1332,7 +1337,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
|
|
{
|
|
struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
|
|
u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
|
|
- rbio->bioc->raid_map[0];
|
|
+ rbio->bioc->full_stripe_logical;
|
|
int total_nr_sector = offset >> fs_info->sectorsize_bits;
|
|
|
|
ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
|
|
@@ -1609,7 +1614,7 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
|
|
{
|
|
const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
|
|
const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
|
|
- const u64 full_stripe_start = rbio->bioc->raid_map[0];
|
|
+ const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
|
|
const u32 orig_len = orig_bio->bi_iter.bi_size;
|
|
const u32 sectorsize = fs_info->sectorsize;
|
|
u64 cur_logical;
|
|
@@ -1796,9 +1801,8 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
|
|
* here due to a crc mismatch and we can't give them the
|
|
* data they want.
|
|
*/
|
|
- if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
|
|
- if (rbio->bioc->raid_map[faila] ==
|
|
- RAID5_P_STRIPE)
|
|
+ if (failb == rbio->real_stripes - 1) {
|
|
+ if (faila == rbio->real_stripes - 2)
|
|
/*
|
|
* Only P and Q are corrupted.
|
|
* We only care about data stripes recovery,
|
|
@@ -1812,7 +1816,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
|
|
goto pstripe;
|
|
}
|
|
|
|
- if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
|
|
+ if (failb == rbio->real_stripes - 2) {
|
|
raid6_datap_recov(rbio->real_stripes, sectorsize,
|
|
faila, pointers);
|
|
} else {
|
|
@@ -1895,9 +1899,9 @@ static int recover_sectors(struct btrfs_raid_bio *rbio)
|
|
|
|
if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
|
|
rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
|
|
- spin_lock_irq(&rbio->bio_list_lock);
|
|
+ spin_lock(&rbio->bio_list_lock);
|
|
set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
|
|
- spin_unlock_irq(&rbio->bio_list_lock);
|
|
+ spin_unlock(&rbio->bio_list_lock);
|
|
}
|
|
|
|
index_rbio_pages(rbio);
|
|
@@ -2075,8 +2079,8 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio)
|
|
{
|
|
struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
|
|
struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
|
|
- rbio->bioc->raid_map[0]);
- const u64 start = rbio->bioc->raid_map[0];
+ rbio->bioc->full_stripe_logical);
+ const u64 start = rbio->bioc->full_stripe_logical;
const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
fs_info->sectorsize_bits;
int ret;
@@ -2109,7 +2113,7 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio)
}

ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1,
- rbio->csum_buf, rbio->csum_bitmap);
+ rbio->csum_buf, rbio->csum_bitmap, false);
if (ret < 0)
goto error;
if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
@@ -2124,7 +2128,7 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio)
*/
btrfs_warn_rl(fs_info,
"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
- rbio->bioc->raid_map[0], ret);
+ rbio->bioc->full_stripe_logical, ret);
no_csum:
kfree(rbio->csum_buf);
bitmap_free(rbio->csum_bitmap);
@@ -2265,9 +2269,9 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio)
* bio list any more, anyone else that wants to change this stripe
* needs to do their own rmw.
*/
- spin_lock_irq(&rbio->bio_list_lock);
+ spin_lock(&rbio->bio_list_lock);
set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
- spin_unlock_irq(&rbio->bio_list_lock);
+ spin_unlock(&rbio->bio_list_lock);

bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

@@ -2372,23 +2376,6 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
return rbio;
}

-/* Used for both parity scrub and missing. */
-void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
- unsigned int pgoff, u64 logical)
-{
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- int stripe_offset;
- int index;
-
- ASSERT(logical >= rbio->bioc->raid_map[0]);
- ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
- BTRFS_STRIPE_LEN * rbio->nr_data);
- stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
- index = stripe_offset / sectorsize;
- rbio->bio_sectors[index].page = page;
- rbio->bio_sectors[index].pgoff = pgoff;
-}
-
/*
* We just scrub the parity that we have correct data on the same horizontal,
* so we needn't allocate all pages for all the stripes.
@@ -2442,7 +2429,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
else
BUG();

- if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
+ /*
+ * Replace is running and our P/Q stripe is being replaced, then we
+ * need to duplicate the final write to replace target.
+ */
+ if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
is_replace = 1;
bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
}
@@ -2544,13 +2535,18 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
if (!is_replace)
goto submit_write;

+ /*
+ * Replace is running and our parity stripe needs to be duplicated to
+ * the target device. Check we have a valid source stripe number.
+ */
+ ASSERT(rbio->bioc->replace_stripe_src >= 0);
for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
struct sector_ptr *sector;

sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
ret = rbio_add_io_sector(rbio, &bio_list, sector,
- bioc->tgtdev_map[rbio->scrubp],
- sectornr, REQ_OP_WRITE);
+ rbio->real_stripes,
+ sectornr, REQ_OP_WRITE);
if (ret)
goto cleanup;
}
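
/*
 * Aside: a minimal user-space sketch of the addressing convention the two
 * hunks above rely on. With tgtdev_map gone, the lone dev-replace target
 * stripe is assumed to be appended after all real stripes, so the
 * duplicated P/Q write simply targets stripe number rbio->real_stripes.
 * The struct below is a simplified stand-in, not the kernel's
 * btrfs_io_context.
 */
#include <assert.h>

struct bioc_sketch {
	int num_stripes;	/* real stripes plus any replace target */
	int replace_nr_stripes;	/* 0, or 1 target appended at the end */
	int replace_stripe_src;	/* which stripe is being replaced */
};

/* Stripe number the duplicated write goes to, or -1 if none is needed. */
static int replace_write_target(const struct bioc_sketch *bioc, int scrubp)
{
	if (!bioc->replace_nr_stripes || bioc->replace_stripe_src != scrubp)
		return -1;
	/* The replace target sits right after the real stripes. */
	return bioc->num_stripes - bioc->replace_nr_stripes;
}

int main(void)
{
	struct bioc_sketch bioc = { .num_stripes = 4, .replace_nr_stripes = 1,
				    .replace_stripe_src = 2 };

	assert(replace_write_target(&bioc, 2) == 3);	/* the appended target */
	assert(replace_write_target(&bioc, 1) == -1);	/* not being replaced */
	return 0;
}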
@@ -2751,33 +2747,3 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
if (!lock_stripe_add(rbio))
start_async_work(rbio, scrub_rbio_work_locked);
}
-
-/* The following code is used for dev replace of a missing RAID 5/6 device. */
-
-struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
-{
- struct btrfs_fs_info *fs_info = bioc->fs_info;
- struct btrfs_raid_bio *rbio;
-
- rbio = alloc_rbio(fs_info, bioc);
- if (IS_ERR(rbio))
- return NULL;
-
- rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
- bio_list_add(&rbio->bio_list, bio);
- /*
- * This is a special bio which is used to hold the completion handler
- * and make the scrub rbio is similar to the other types
- */
- ASSERT(!bio->bi_iter.bi_size);
-
- set_rbio_range_error(rbio, bio);
-
- return rbio;
-}
-
-void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
-{
- start_async_work(rbio, recover_rbio_work);
-}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index df0e0abdeb1f..0f7f31c8cb98 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -170,6 +170,11 @@ static inline int nr_data_stripes(const struct map_lookup *map)
return map->num_stripes - btrfs_nr_parity_stripes(map->type);
}

+static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc)
+{
+ return bioc->num_stripes - btrfs_nr_parity_stripes(bioc->map_type);
+}
+
#define RAID5_P_STRIPE ((u64)-2)
#define RAID6_Q_STRIPE ((u64)-1)

@@ -182,19 +187,12 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
int mirror_num);
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);

-void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
- unsigned int pgoff, u64 logical);
-
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
struct btrfs_io_context *bioc,
struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);

-struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc);
-void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
-
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
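
/*
 * Aside: the new nr_bioc_data_stripes() above mirrors nr_data_stripes()
 * but works on a btrfs_io_context. A standalone sketch of the arithmetic,
 * assuming the usual parity counts (one parity stripe for RAID5, two for
 * RAID6, as btrfs_nr_parity_stripes() returns):
 */
#include <assert.h>

static int data_stripes_sketch(int num_stripes, int nr_parity)
{
	return num_stripes - nr_parity;
}

int main(void)
{
	assert(data_stripes_sketch(4, 1) == 3);	/* 4-device RAID5 */
	assert(data_stripes_sketch(4, 2) == 2);	/* 4-device RAID6 */
	return 0;
}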

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ef13a9d4e370..09b1988d1791 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1266,7 +1266,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
level = btrfs_header_level(parent);
ASSERT(level >= lowest_level);

- ret = btrfs_bin_search(parent, &key, &slot);
+ ret = btrfs_bin_search(parent, 0, &key, &slot);
if (ret < 0)
break;
if (ret && slot > 0)
@@ -2407,7 +2407,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,

if (upper->eb && !upper->locked) {
if (!lowest) {
- ret = btrfs_bin_search(upper->eb, key, &slot);
+ ret = btrfs_bin_search(upper->eb, 0, key, &slot);
if (ret < 0)
goto next;
BUG_ON(ret);
@@ -2441,7 +2441,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
slot = path->slots[upper->level];
btrfs_release_path(path);
} else {
- ret = btrfs_bin_search(upper->eb, key, &slot);
+ ret = btrfs_bin_search(upper->eb, 0, key, &slot);
if (ret < 0)
goto next;
BUG_ON(ret);
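
/*
 * Aside: every call site above gains a new second argument because
 * btrfs_bin_search() now takes the first slot to consider; passing 0
 * keeps the old whole-node behaviour. The toy below only illustrates
 * that lower-bound parameter and is not the kernel implementation.
 */
#include <assert.h>

static int bin_search_from(const int *keys, int nr, int first_slot,
			   int target, int *slot)
{
	int lo = first_slot, hi = nr;

	while (lo < hi) {
		int mid = lo + (hi - lo) / 2;

		if (keys[mid] == target) {
			*slot = mid;
			return 0;	/* exact match */
		}
		if (keys[mid] < target)
			lo = mid + 1;
		else
			hi = mid;
	}
	*slot = lo;	/* insertion point */
	return 1;	/* not found, like the kernel's > 0 return */
}

int main(void)
{
	const int keys[] = { 10, 20, 30, 40 };
	int slot;

	assert(bin_search_from(keys, 4, 0, 30, &slot) == 0 && slot == 2);
	assert(bin_search_from(keys, 4, 3, 30, &slot) == 1 && slot == 3);
	return 0;
}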
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 69c93ae333f6..836725a19661 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -38,18 +38,14 @@
* - add a mode to also read unallocated space
*/

-struct scrub_block;
struct scrub_ctx;

/*
- * The following three values only influence the performance.
+ * The following value only influences the performance.
*
- * The last one configures the number of parallel and outstanding I/O
- * operations. The first one configures an upper limit for the number
- * of (dynamically allocated) pages that are added to a bio.
+ * This determines the batch size for stripe submitted in one go.
*/
-#define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */
-#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */
+#define SCRUB_STRIPES_PER_SCTX 8 /* That would be 8 64K stripe per-device. */

/*
* The following value times PAGE_SIZE needs to be large enough to match the
@@ -57,128 +53,124 @@ struct scrub_ctx;
*/
#define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)

-#define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE))
+/* Represent one sector and its needed info to verify the content. */
+struct scrub_sector_verification {
+ bool is_metadata;

-/*
- * Maximum number of mirrors that can be available for all profiles counting
- * the target device of dev-replace as one. During an active device replace
- * procedure, the target device of the copy operation is a mirror for the
- * filesystem data as well that can be used to read data in order to repair
- * read errors on other disks.
- *
- * Current value is derived from RAID1C4 with 4 copies.
- */
-#define BTRFS_MAX_MIRRORS (4 + 1)
+ union {
+ /*
+ * Csum pointer for data csum verification. Should point to a
+ * sector csum inside scrub_stripe::csums.
+ *
+ * NULL if this data sector has no csum.
+ */
+ u8 *csum;

-struct scrub_recover {
- refcount_t refs;
- struct btrfs_io_context *bioc;
- u64 map_length;
+ /*
+ * Extra info for metadata verification. All sectors inside a
+ * tree block share the same generation.
+ */
+ u64 generation;
+ };
};

-struct scrub_sector {
- struct scrub_block *sblock;
- struct list_head list;
- u64 flags; /* extent flags */
- u64 generation;
- /* Offset in bytes to @sblock. */
- u32 offset;
- atomic_t refs;
- unsigned int have_csum:1;
- unsigned int io_error:1;
- u8 csum[BTRFS_CSUM_SIZE];
-
- struct scrub_recover *recover;
-};
+enum scrub_stripe_flags {
+ /* Set when @mirror_num, @dev, @physical and @logical are set. */
+ SCRUB_STRIPE_FLAG_INITIALIZED,

-struct scrub_bio {
- int index;
- struct scrub_ctx *sctx;
- struct btrfs_device *dev;
- struct bio *bio;
- blk_status_t status;
- u64 logical;
- u64 physical;
- struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO];
- int sector_count;
- int next_free;
- struct work_struct work;
-};
+ /* Set when the read-repair is finished. */
+ SCRUB_STRIPE_FLAG_REPAIR_DONE,

-struct scrub_block {
/*
- * Each page will have its page::private used to record the logical
- * bytenr.
+ * Set for data stripes if it's triggered from P/Q stripe.
+ * During such scrub, we should not report errors in data stripes, nor
+ * update the accounting.
*/
- struct page *pages[SCRUB_MAX_PAGES];
- struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
- struct btrfs_device *dev;
- /* Logical bytenr of the sblock */
- u64 logical;
- u64 physical;
- u64 physical_for_dev_replace;
- /* Length of sblock in bytes */
- u32 len;
- int sector_count;
- int mirror_num;
-
- atomic_t outstanding_sectors;
- refcount_t refs; /* free mem on transition to zero */
- struct scrub_ctx *sctx;
- struct scrub_parity *sparity;
- struct {
- unsigned int header_error:1;
- unsigned int checksum_error:1;
- unsigned int no_io_error_seen:1;
- unsigned int generation_error:1; /* also sets header_error */
-
- /* The following is for the data used to check parity */
- /* It is for the data with checksum */
- unsigned int data_corrected:1;
- };
- struct work_struct work;
+ SCRUB_STRIPE_FLAG_NO_REPORT,
};

-/* Used for the chunks with parity stripe such RAID5/6 */
-struct scrub_parity {
- struct scrub_ctx *sctx;
+#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE)
+
+/*
+ * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
+ */
+struct scrub_stripe {
+ struct scrub_ctx *sctx;
+ struct btrfs_block_group *bg;
+
+ struct page *pages[SCRUB_STRIPE_PAGES];
+ struct scrub_sector_verification *sectors;
+
+ struct btrfs_device *dev;
+ u64 logical;
+ u64 physical;

- struct btrfs_device *scrub_dev;
+ u16 mirror_num;

- u64 logic_start;
+ /* Should be BTRFS_STRIPE_LEN / sectorsize. */
+ u16 nr_sectors;
+
+ /*
+ * How many data/meta extents are in this stripe. Only for scrub status
+ * reporting purposes.
+ */
+ u16 nr_data_extents;
+ u16 nr_meta_extents;

- u64 logic_end;
+ atomic_t pending_io;
+ wait_queue_head_t io_wait;
+ wait_queue_head_t repair_wait;

- int nsectors;
+ /*
+ * Indicate the states of the stripe. Bits are defined in
+ * scrub_stripe_flags enum.
+ */
+ unsigned long state;

- u32 stripe_len;
+ /* Indicate which sectors are covered by extent items. */
+ unsigned long extent_sector_bitmap;

- refcount_t refs;
+ /*
+ * The errors hit during the initial read of the stripe.
+ *
+ * Would be utilized for error reporting and repair.
+ */
+ unsigned long init_error_bitmap;

- struct list_head sectors_list;
+ /*
+ * The following error bitmaps are all for the current status.
+ * Every time we submit a new read, these bitmaps may be updated.
+ *
+ * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
+ *
+ * IO and csum errors can happen for both metadata and data.
+ */
+ unsigned long error_bitmap;
+ unsigned long io_error_bitmap;
+ unsigned long csum_error_bitmap;
+ unsigned long meta_error_bitmap;

- /* Work of parity check and repair */
- struct work_struct work;
+ /* For writeback (repair or replace) error reporting. */
+ unsigned long write_error_bitmap;

- /* Mark the parity blocks which have data */
- unsigned long dbitmap;
+ /* Writeback can be concurrent, thus we need to protect the bitmap. */
+ spinlock_t write_error_lock;

/*
- * Mark the parity blocks which have data, but errors happen when
- * read data or check data
+ * Checksum for the whole stripe if this stripe is inside a data block
+ * group.
*/
- unsigned long ebitmap;
+ u8 *csums;
+
+ struct work_struct work;
};
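
/*
 * Aside: a trivial sketch of the bitmap invariant documented inside the
 * struct above; bit numbers are sector numbers within the stripe, and the
 * summary bitmap is always the union of the per-class bitmaps.
 */
#include <assert.h>

static unsigned long combine_errors(unsigned long io, unsigned long csum,
				    unsigned long meta)
{
	return io | csum | meta;
}

int main(void)
{
	/* Sector 0 hit an I/O error, sector 3 a csum mismatch. */
	assert(combine_errors(1UL << 0, 1UL << 3, 0) == 0x9);
	return 0;
}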

struct scrub_ctx {
- struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
+ struct scrub_stripe stripes[SCRUB_STRIPES_PER_SCTX];
+ struct scrub_stripe *raid56_data_stripes;
struct btrfs_fs_info *fs_info;
int first_free;
- int curr;
- atomic_t bios_in_flight;
- atomic_t workers_pending;
- spinlock_t list_lock;
- wait_queue_head_t list_wait;
+ int cur_stripe;
struct list_head csum_list;
atomic_t cancel_req;
int readonly;
@@ -191,10 +183,8 @@ struct scrub_ctx {
int is_dev_replace;
u64 write_pointer;

- struct scrub_bio *wr_curr_bio;
struct mutex wr_lock;
struct btrfs_device *wr_tgtdev;
- bool flush_all_writes;

/*
* statistics
@@ -221,239 +211,66 @@ struct scrub_warning {
struct btrfs_device *dev;
};

-struct full_stripe_lock {
- struct rb_node node;
- u64 logical;
- u64 refs;
- struct mutex mutex;
-};
-
-#ifndef CONFIG_64BIT
-/* This structure is for architectures whose (void *) is smaller than u64 */
-struct scrub_page_private {
- u64 logical;
-};
-#endif
-
-static int attach_scrub_page_private(struct page *page, u64 logical)
-{
-#ifdef CONFIG_64BIT
- attach_page_private(page, (void *)logical);
- return 0;
-#else
- struct scrub_page_private *spp;
-
- spp = kmalloc(sizeof(*spp), GFP_KERNEL);
- if (!spp)
- return -ENOMEM;
- spp->logical = logical;
- attach_page_private(page, (void *)spp);
- return 0;
-#endif
-}
-
-static void detach_scrub_page_private(struct page *page)
-{
-#ifdef CONFIG_64BIT
- detach_page_private(page);
- return;
-#else
- struct scrub_page_private *spp;
-
- spp = detach_page_private(page);
- kfree(spp);
- return;
-#endif
-}
-
-static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
- struct btrfs_device *dev,
- u64 logical, u64 physical,
- u64 physical_for_dev_replace,
- int mirror_num)
-{
- struct scrub_block *sblock;
-
- sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
- if (!sblock)
- return NULL;
- refcount_set(&sblock->refs, 1);
- sblock->sctx = sctx;
- sblock->logical = logical;
- sblock->physical = physical;
- sblock->physical_for_dev_replace = physical_for_dev_replace;
- sblock->dev = dev;
- sblock->mirror_num = mirror_num;
- sblock->no_io_error_seen = 1;
- /*
- * Scrub_block::pages will be allocated at alloc_scrub_sector() when
- * the corresponding page is not allocated.
- */
- return sblock;
-}
-
-/*
- * Allocate a new scrub sector and attach it to @sblock.
- *
- * Will also allocate new pages for @sblock if needed.
- */
-static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
- u64 logical)
+static void release_scrub_stripe(struct scrub_stripe *stripe)
{
- const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT;
- struct scrub_sector *ssector;
-
- /* We must never have scrub_block exceed U32_MAX in size. */
- ASSERT(logical - sblock->logical < U32_MAX);
-
- ssector = kzalloc(sizeof(*ssector), GFP_KERNEL);
- if (!ssector)
- return NULL;
-
- /* Allocate a new page if the slot is not allocated */
- if (!sblock->pages[page_index]) {
- int ret;
+ if (!stripe)
+ return;

- sblock->pages[page_index] = alloc_page(GFP_KERNEL);
- if (!sblock->pages[page_index]) {
- kfree(ssector);
- return NULL;
- }
- ret = attach_scrub_page_private(sblock->pages[page_index],
- sblock->logical + (page_index << PAGE_SHIFT));
- if (ret < 0) {
- kfree(ssector);
- __free_page(sblock->pages[page_index]);
- sblock->pages[page_index] = NULL;
- return NULL;
- }
+ for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) {
+ if (stripe->pages[i])
+ __free_page(stripe->pages[i]);
+ stripe->pages[i] = NULL;
}
-
- atomic_set(&ssector->refs, 1);
- ssector->sblock = sblock;
- /* The sector to be added should not be used */
- ASSERT(sblock->sectors[sblock->sector_count] == NULL);
- ssector->offset = logical - sblock->logical;
-
- /* The sector count must be smaller than the limit */
- ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK);
-
- sblock->sectors[sblock->sector_count] = ssector;
- sblock->sector_count++;
- sblock->len += sblock->sctx->fs_info->sectorsize;
-
- return ssector;
-}
-
-static struct page *scrub_sector_get_page(struct scrub_sector *ssector)
-{
- struct scrub_block *sblock = ssector->sblock;
- pgoff_t index;
- /*
- * When calling this function, ssector must be alreaday attached to the
- * parent sblock.
- */
- ASSERT(sblock);
-
- /* The range should be inside the sblock range */
- ASSERT(ssector->offset < sblock->len);
-
- index = ssector->offset >> PAGE_SHIFT;
- ASSERT(index < SCRUB_MAX_PAGES);
- ASSERT(sblock->pages[index]);
- ASSERT(PagePrivate(sblock->pages[index]));
- return sblock->pages[index];
+ kfree(stripe->sectors);
+ kfree(stripe->csums);
+ stripe->sectors = NULL;
+ stripe->csums = NULL;
+ stripe->sctx = NULL;
+ stripe->state = 0;
}

-static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector)
+static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
+ struct scrub_stripe *stripe)
{
- struct scrub_block *sblock = ssector->sblock;
+ int ret;

- /*
- * When calling this function, ssector must be already attached to the
- * parent sblock.
- */
- ASSERT(sblock);
+ memset(stripe, 0, sizeof(*stripe));

- /* The range should be inside the sblock range */
- ASSERT(ssector->offset < sblock->len);
+ stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
+ stripe->state = 0;

- return offset_in_page(ssector->offset);
-}
+ init_waitqueue_head(&stripe->io_wait);
+ init_waitqueue_head(&stripe->repair_wait);
+ atomic_set(&stripe->pending_io, 0);
+ spin_lock_init(&stripe->write_error_lock);

-static char *scrub_sector_get_kaddr(struct scrub_sector *ssector)
-{
- return page_address(scrub_sector_get_page(ssector)) +
- scrub_sector_get_page_offset(ssector);
+ ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages);
+ if (ret < 0)
+ goto error;
+
+ stripe->sectors = kcalloc(stripe->nr_sectors,
+ sizeof(struct scrub_sector_verification),
+ GFP_KERNEL);
+ if (!stripe->sectors)
+ goto error;
+
+ stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits,
+ fs_info->csum_size, GFP_KERNEL);
+ if (!stripe->csums)
+ goto error;
+ return 0;
+error:
+ release_scrub_stripe(stripe);
+ return -ENOMEM;
}
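
/*
 * Aside: an illustrative caller pattern (not part of the patch). Because
 * init_scrub_stripe() funnels every failure through release_scrub_stripe(),
 * and release tolerates half-initialized stripes (missing pages, NULL
 * arrays), a caller only has to unwind the slots that did succeed.
 */
static int setup_stripes_sketch(struct btrfs_fs_info *fs_info,
				struct scrub_stripe *stripes, int nr)
{
	for (int i = 0; i < nr; i++) {
		if (init_scrub_stripe(fs_info, &stripes[i]) < 0) {
			while (i--)
				release_scrub_stripe(&stripes[i]);
			return -ENOMEM;
		}
	}
	return 0;
}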

-static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector,
- unsigned int len)
+static void wait_scrub_stripe_io(struct scrub_stripe *stripe)
{
- return bio_add_page(bio, scrub_sector_get_page(ssector), len,
- scrub_sector_get_page_offset(ssector));
+ wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
}
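
/*
 * Aside: wait_scrub_stripe_io() pairs with a simple counting protocol.
 * The two helpers below are invented for illustration only: each submitted
 * bio bumps pending_io, and the completion side drops the count and wakes
 * io_wait once the last bio finishes.
 */
static void stripe_submit_one_bio_sketch(struct scrub_stripe *stripe,
					 struct bio *bio)
{
	atomic_inc(&stripe->pending_io);	/* one count per bio in flight */
	submit_bio(bio);
}

static void stripe_one_bio_done_sketch(struct scrub_stripe *stripe)
{
	/* The last completion unblocks wait_scrub_stripe_io(). */
	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}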

-static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
- struct scrub_block *sblocks_for_recheck[]);
-static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock,
- int retry_failed_mirror);
-static void scrub_recheck_block_checksum(struct scrub_block *sblock);
-static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
- struct scrub_block *sblock_good);
-static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
- struct scrub_block *sblock_good,
- int sector_num, int force_write);
-static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
-static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
- int sector_num);
-static int scrub_checksum_data(struct scrub_block *sblock);
-static int scrub_checksum_tree_block(struct scrub_block *sblock);
-static int scrub_checksum_super(struct scrub_block *sblock);
-static void scrub_block_put(struct scrub_block *sblock);
-static void scrub_sector_get(struct scrub_sector *sector);
-static void scrub_sector_put(struct scrub_sector *sector);
-static void scrub_parity_get(struct scrub_parity *sparity);
-static void scrub_parity_put(struct scrub_parity *sparity);
-static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
- u64 physical, struct btrfs_device *dev, u64 flags,
- u64 gen, int mirror_num, u8 *csum,
- u64 physical_for_dev_replace);
-static void scrub_bio_end_io(struct bio *bio);
-static void scrub_bio_end_io_worker(struct work_struct *work);
-static void scrub_block_complete(struct scrub_block *sblock);
-static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u32 extent_len,
- u64 *extent_physical,
- struct btrfs_device **extent_dev,
- int *extent_mirror_num);
-static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
- struct scrub_sector *sector);
-static void scrub_wr_submit(struct scrub_ctx *sctx);
-static void scrub_wr_bio_end_io(struct bio *bio);
-static void scrub_wr_bio_end_io_worker(struct work_struct *work);
static void scrub_put_ctx(struct scrub_ctx *sctx);

-static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
-{
- return sector->recover &&
- (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
-}
-
-static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
-{
- refcount_inc(&sctx->refs);
- atomic_inc(&sctx->bios_in_flight);
-}
-
-static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
-{
- atomic_dec(&sctx->bios_in_flight);
- wake_up(&sctx->list_wait);
- scrub_put_ctx(sctx);
-}
-
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
while (atomic_read(&fs_info->scrub_pause_req)) {
@@ -486,223 +303,6 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
scrub_pause_off(fs_info);
}

-/*
- * Insert new full stripe lock into full stripe locks tree
- *
- * Return pointer to existing or newly inserted full_stripe_lock structure if
- * everything works well.
- * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
- *
- * NOTE: caller must hold full_stripe_locks_root->lock before calling this
- * function
- */
-static struct full_stripe_lock *insert_full_stripe_lock(
- struct btrfs_full_stripe_locks_tree *locks_root,
- u64 fstripe_logical)
-{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct full_stripe_lock *entry;
- struct full_stripe_lock *ret;
-
- lockdep_assert_held(&locks_root->lock);
-
- p = &locks_root->root.rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct full_stripe_lock, node);
- if (fstripe_logical < entry->logical) {
- p = &(*p)->rb_left;
- } else if (fstripe_logical > entry->logical) {
- p = &(*p)->rb_right;
- } else {
- entry->refs++;
- return entry;
- }
- }
-
- /*
- * Insert new lock.
- */
- ret = kmalloc(sizeof(*ret), GFP_KERNEL);
- if (!ret)
- return ERR_PTR(-ENOMEM);
- ret->logical = fstripe_logical;
- ret->refs = 1;
- mutex_init(&ret->mutex);
-
- rb_link_node(&ret->node, parent, p);
- rb_insert_color(&ret->node, &locks_root->root);
- return ret;
-}
-
-/*
- * Search for a full stripe lock of a block group
- *
- * Return pointer to existing full stripe lock if found
- * Return NULL if not found
- */
-static struct full_stripe_lock *search_full_stripe_lock(
- struct btrfs_full_stripe_locks_tree *locks_root,
- u64 fstripe_logical)
-{
- struct rb_node *node;
- struct full_stripe_lock *entry;
-
- lockdep_assert_held(&locks_root->lock);
-
- node = locks_root->root.rb_node;
- while (node) {
- entry = rb_entry(node, struct full_stripe_lock, node);
- if (fstripe_logical < entry->logical)
- node = node->rb_left;
- else if (fstripe_logical > entry->logical)
- node = node->rb_right;
- else
- return entry;
- }
- return NULL;
-}
-
-/*
- * Helper to get full stripe logical from a normal bytenr.
- *
- * Caller must ensure @cache is a RAID56 block group.
- */
-static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
-{
- u64 ret;
-
- /*
- * Due to chunk item size limit, full stripe length should not be
- * larger than U32_MAX. Just a sanity check here.
- */
- WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
-
- /*
- * round_down() can only handle power of 2, while RAID56 full
- * stripe length can be 64KiB * n, so we need to manually round down.
- */
- ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
- cache->full_stripe_len + cache->start;
- return ret;
-}
-
-/*
- * Lock a full stripe to avoid concurrency of recovery and read
- *
- * It's only used for profiles with parities (RAID5/6), for other profiles it
- * does nothing.
- *
- * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
- * So caller must call unlock_full_stripe() at the same context.
- *
- * Return <0 if encounters error.
- */
-static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
- bool *locked_ret)
-{
- struct btrfs_block_group *bg_cache;
- struct btrfs_full_stripe_locks_tree *locks_root;
- struct full_stripe_lock *existing;
- u64 fstripe_start;
- int ret = 0;
-
- *locked_ret = false;
- bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
- if (!bg_cache) {
- ASSERT(0);
- return -ENOENT;
- }
-
- /* Profiles not based on parity don't need full stripe lock */
- if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
- goto out;
- locks_root = &bg_cache->full_stripe_locks_root;
-
- fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
-
- /* Now insert the full stripe lock */
- mutex_lock(&locks_root->lock);
- existing = insert_full_stripe_lock(locks_root, fstripe_start);
- mutex_unlock(&locks_root->lock);
- if (IS_ERR(existing)) {
- ret = PTR_ERR(existing);
- goto out;
- }
- mutex_lock(&existing->mutex);
- *locked_ret = true;
-out:
- btrfs_put_block_group(bg_cache);
- return ret;
-}
-
-/*
- * Unlock a full stripe.
- *
- * NOTE: Caller must ensure it's the same context calling corresponding
- * lock_full_stripe().
- *
- * Return 0 if we unlock full stripe without problem.
- * Return <0 for error
- */
-static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
- bool locked)
-{
- struct btrfs_block_group *bg_cache;
- struct btrfs_full_stripe_locks_tree *locks_root;
- struct full_stripe_lock *fstripe_lock;
- u64 fstripe_start;
- bool freeit = false;
- int ret = 0;
-
- /* If we didn't acquire full stripe lock, no need to continue */
- if (!locked)
- return 0;
-
- bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
- if (!bg_cache) {
- ASSERT(0);
- return -ENOENT;
- }
- if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
- goto out;
-
- locks_root = &bg_cache->full_stripe_locks_root;
- fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
-
- mutex_lock(&locks_root->lock);
- fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
- /* Unpaired unlock_full_stripe() detected */
- if (!fstripe_lock) {
- WARN_ON(1);
- ret = -ENOENT;
- mutex_unlock(&locks_root->lock);
- goto out;
- }
-
- if (fstripe_lock->refs == 0) {
- WARN_ON(1);
- btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
- fstripe_lock->logical);
- } else {
- fstripe_lock->refs--;
- }
-
- if (fstripe_lock->refs == 0) {
- rb_erase(&fstripe_lock->node, &locks_root->root);
- freeit = true;
- }
- mutex_unlock(&locks_root->lock);
-
- mutex_unlock(&fstripe_lock->mutex);
- if (freeit)
- kfree(fstripe_lock);
-out:
- btrfs_put_block_group(bg_cache);
- return ret;
-}
-
static void scrub_free_csums(struct scrub_ctx *sctx)
{
while (!list_empty(&sctx->csum_list)) {
@@ -721,24 +321,9 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
if (!sctx)
return;

- /* this can happen when scrub is cancelled */
- if (sctx->curr != -1) {
- struct scrub_bio *sbio = sctx->bios[sctx->curr];
-
- for (i = 0; i < sbio->sector_count; i++)
- scrub_block_put(sbio->sectors[i]->sblock);
- bio_put(sbio->bio);
- }
-
- for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
- struct scrub_bio *sbio = sctx->bios[i];
-
- if (!sbio)
- break;
- kfree(sbio);
- }
+ for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++)
+ release_scrub_stripe(&sctx->stripes[i]);

- kfree(sctx->wr_curr_bio);
scrub_free_csums(sctx);
kfree(sctx);
}
@@ -760,45 +345,26 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
goto nomem;
refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
- sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
- sctx->curr = -1;
sctx->fs_info = fs_info;
INIT_LIST_HEAD(&sctx->csum_list);
- for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
- struct scrub_bio *sbio;
+ for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) {
+ int ret;

- sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
- if (!sbio)
+ ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
+ if (ret < 0)
goto nomem;
- sctx->bios[i] = sbio;
-
- sbio->index = i;
- sbio->sctx = sctx;
- sbio->sector_count = 0;
- INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
-
- if (i != SCRUB_BIOS_PER_SCTX - 1)
- sctx->bios[i]->next_free = i + 1;
- else
- sctx->bios[i]->next_free = -1;
+ sctx->stripes[i].sctx = sctx;
}
sctx->first_free = 0;
- atomic_set(&sctx->bios_in_flight, 0);
- atomic_set(&sctx->workers_pending, 0);
atomic_set(&sctx->cancel_req, 0);

- spin_lock_init(&sctx->list_lock);
spin_lock_init(&sctx->stat_lock);
- init_waitqueue_head(&sctx->list_wait);
sctx->throttle_deadline = 0;

- WARN_ON(sctx->wr_curr_bio != NULL);
mutex_init(&sctx->wr_lock);
- sctx->wr_curr_bio = NULL;
if (is_dev_replace) {
WARN_ON(!fs_info->dev_replace.tgtdev);
sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
- sctx->flush_all_writes = false;
}

return sctx;
@@ -898,10 +464,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
return 0;
}

-static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
+static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev,
+ bool is_super, u64 logical, u64 physical)
{
- struct btrfs_device *dev;
- struct btrfs_fs_info *fs_info;
+ struct btrfs_fs_info *fs_info = dev->fs_info;
struct btrfs_path *path;
struct btrfs_key found_key;
struct extent_buffer *eb;
@@ -914,22 +480,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
u8 ref_level = 0;
int ret;

- WARN_ON(sblock->sector_count < 1);
- dev = sblock->dev;
- fs_info = sblock->sctx->fs_info;
-
/* Super block error, no need to search extent tree. */
- if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+ if (is_super) {
btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
- errstr, btrfs_dev_name(dev), sblock->physical);
+ errstr, btrfs_dev_name(dev), physical);
return;
}
path = btrfs_alloc_path();
if (!path)
return;

- swarn.physical = sblock->physical;
- swarn.logical = sblock->logical;
+ swarn.physical = physical;
+ swarn.logical = logical;
swarn.errstr = errstr;
swarn.dev = NULL;

@@ -978,1921 +540,717 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
btrfs_free_path(path);
}

-static inline void scrub_get_recover(struct scrub_recover *recover)
+static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
{
- refcount_inc(&recover->refs);
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ return 2;
+ else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ return 3;
+ else
+ return (int)bioc->num_stripes;
}

-static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
- struct scrub_recover *recover)
+static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
+ u64 full_stripe_logical,
+ int nstripes, int mirror,
+ int *stripe_index,
+ u64 *stripe_offset)
{
- if (refcount_dec_and_test(&recover->refs)) {
- btrfs_bio_counter_dec(fs_info);
- btrfs_put_bioc(recover->bioc);
- kfree(recover);
+ int i;
+
+ if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ?
+ nstripes - 1 : nstripes - 2;
+
+ /* RAID5/6 */
+ for (i = 0; i < nr_data_stripes; i++) {
+ const u64 data_stripe_start = full_stripe_logical +
+ (i * BTRFS_STRIPE_LEN);
+
+ if (logical >= data_stripe_start &&
+ logical < data_stripe_start + BTRFS_STRIPE_LEN)
+ break;
+ }
+
+ *stripe_index = i;
+ *stripe_offset = (logical - full_stripe_logical) &
+ BTRFS_STRIPE_LEN_MASK;
+ } else {
+ /* The other RAID type */
+ *stripe_index = mirror;
+ *stripe_offset = 0;
}
}
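
/*
 * Aside: a worked example of the RAID56 branch above, assuming the usual
 * 64KiB BTRFS_STRIPE_LEN (and its power-of-two mask). A logical address
 * 80KiB past full_stripe_logical lands 16KiB into data stripe #1.
 */
#include <assert.h>
#include <stdint.h>

#define STRIPE_LEN	(64 * 1024)		/* stands in for BTRFS_STRIPE_LEN */
#define STRIPE_LEN_MASK	(STRIPE_LEN - 1)	/* stands in for BTRFS_STRIPE_LEN_MASK */

int main(void)
{
	uint64_t full = 1024 * 1024;		/* full stripe logical */
	uint64_t logical = full + 80 * 1024;
	int index = (int)((logical - full) / STRIPE_LEN);
	uint64_t offset = (logical - full) & STRIPE_LEN_MASK;

	assert(index == 1 && offset == 16 * 1024);
	return 0;
}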

-/*
- * scrub_handle_errored_block gets called when either verification of the
- * sectors failed or the bio failed to read, e.g. with EIO. In the latter
- * case, this function handles all sectors in the bio, even though only one
- * may be bad.
- * The goal of this function is to repair the errored block by using the
- * contents of one of the mirrors.
- */
-static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
+static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
{
- struct scrub_ctx *sctx = sblock_to_check->sctx;
- struct btrfs_device *dev = sblock_to_check->dev;
- struct btrfs_fs_info *fs_info;
- u64 logical;
- unsigned int failed_mirror_index;
- unsigned int is_metadata;
- unsigned int have_csum;
- /* One scrub_block for each mirror */
- struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 };
- struct scrub_block *sblock_bad;
- int ret;
- int mirror_index;
- int sector_num;
- int success;
- bool full_stripe_locked;
- unsigned int nofs_flag;
- static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
+ int ret = 0;
+ u64 length;

- BUG_ON(sblock_to_check->sector_count < 1);
- fs_info = sctx->fs_info;
- if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
- /*
- * If we find an error in a super block, we just report it.
- * They will get written with the next transaction commit
- * anyway
- */
- scrub_print_warning("super block error", sblock_to_check);
- spin_lock(&sctx->stat_lock);
- ++sctx->stat.super_errors;
- spin_unlock(&sctx->stat_lock);
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ if (!btrfs_is_zoned(sctx->fs_info))
return 0;
- }
- logical = sblock_to_check->logical;
- ASSERT(sblock_to_check->mirror_num);
- failed_mirror_index = sblock_to_check->mirror_num - 1;
- is_metadata = !(sblock_to_check->sectors[0]->flags &
- BTRFS_EXTENT_FLAG_DATA);
- have_csum = sblock_to_check->sectors[0]->have_csum;
-
- if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
+
+ if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
return 0;

- /*
- * We must use GFP_NOFS because the scrub task might be waiting for a
- * worker task executing this function and in turn a transaction commit
- * might be waiting the scrub task to pause (which needs to wait for all
- * the worker tasks to complete before pausing).
- * We do allocations in the workers through insert_full_stripe_lock()
- * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
- * this function.
- */
- nofs_flag = memalloc_nofs_save();
- /*
- * For RAID5/6, race can happen for a different device scrub thread.
- * For data corruption, Parity and Data threads will both try
- * to recovery the data.
- * Race can lead to doubly added csum error, or even unrecoverable
- * error.
- */
- ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
- if (ret < 0) {
- memalloc_nofs_restore(nofs_flag);
- spin_lock(&sctx->stat_lock);
- if (ret == -ENOMEM)
- sctx->stat.malloc_errors++;
- sctx->stat.read_errors++;
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- return ret;
+ if (sctx->write_pointer < physical) {
+ length = physical - sctx->write_pointer;
+
+ ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
+ sctx->write_pointer, length);
+ if (!ret)
+ sctx->write_pointer = physical;
}
+ return ret;
+}
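
/*
 * Aside: the gap arithmetic above, as a plain sketch. On zoned targets,
 * sequential zones must be written in order, so when the next write lands
 * past the current write pointer the hole in between is zeroed out first.
 */
#include <assert.h>
#include <stdint.h>

/* Bytes that must be zeroed before writing at @physical; 0 means none. */
static uint64_t zone_gap(uint64_t write_pointer, uint64_t physical)
{
	return (write_pointer < physical) ? physical - write_pointer : 0;
}

int main(void)
{
	assert(zone_gap(1024, 4096) == 3072);	/* pointer trails the write */
	assert(zone_gap(4096, 4096) == 0);	/* already caught up */
	return 0;
}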

- /*
- * read all mirrors one after the other. This includes to
- * re-read the extent or metadata block that failed (that was
- * the cause that this fixup code is called) another time,
- * sector by sector this time in order to know which sectors
- * caused I/O errors and which ones are good (for all mirrors).
- * It is the goal to handle the situation when more than one
- * mirror contains I/O errors, but the errors do not
- * overlap, i.e. the data can be repaired by selecting the
- * sectors from those mirrors without I/O error on the
- * particular sectors. One example (with blocks >= 2 * sectorsize)
- * would be that mirror #1 has an I/O error on the first sector,
- * the second sector is good, and mirror #2 has an I/O error on
- * the second sector, but the first sector is good.
- * Then the first sector of the first mirror can be repaired by
- * taking the first sector of the second mirror, and the
- * second sector of the second mirror can be repaired by
- * copying the contents of the 2nd sector of the 1st mirror.
- * One more note: if the sectors of one mirror contain I/O
- * errors, the checksum cannot be verified. In order to get
- * the best data for repairing, the first attempt is to find
- * a mirror without I/O errors and with a validated checksum.
- * Only if this is not possible, the sectors are picked from
- * mirrors with I/O errors without considering the checksum.
- * If the latter is the case, at the end, the checksum of the
- * repaired area is verified in order to correctly maintain
- * the statistics.
- */
- for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
- /*
- * Note: the two members refs and outstanding_sectors are not
- * used in the blocks that are used for the recheck procedure.
- *
- * But alloc_scrub_block() will initialize sblock::ref anyway,
- * so we can use scrub_block_put() to clean them up.
- *
- * And here we don't setup the physical/dev for the sblock yet,
- * they will be correctly initialized in scrub_setup_recheck_block().
- */
- sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL,
- logical, 0, 0, mirror_index);
- if (!sblocks_for_recheck[mirror_index]) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- sctx->stat.read_errors++;
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
- goto out;
- }
- }
-
- /* Setup the context, map the logical blocks and alloc the sectors */
- ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
- if (ret) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.read_errors++;
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
- goto out;
- }
- BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
- sblock_bad = sblocks_for_recheck[failed_mirror_index];
-
- /* build and submit the bios for the failed mirror, check checksums */
- scrub_recheck_block(fs_info, sblock_bad, 1);
-
- if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
- sblock_bad->no_io_error_seen) {
- /*
- * The error disappeared after reading sector by sector, or
- * the area was part of a huge bio and other parts of the
- * bio caused I/O errors, or the block layer merged several
- * read requests into one and the error is caused by a
- * different bio (usually one of the two latter cases is
- * the cause)
- */
- spin_lock(&sctx->stat_lock);
- sctx->stat.unverified_errors++;
- sblock_to_check->data_corrected = 1;
- spin_unlock(&sctx->stat_lock);
-
- if (sctx->is_dev_replace)
- scrub_write_block_to_dev_replace(sblock_bad);
- goto out;
- }
-
- if (!sblock_bad->no_io_error_seen) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.read_errors++;
- spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&rs))
- scrub_print_warning("i/o error", sblock_to_check);
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
- } else if (sblock_bad->checksum_error) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.csum_errors++;
- spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&rs))
- scrub_print_warning("checksum error", sblock_to_check);
- btrfs_dev_stat_inc_and_print(dev,
- BTRFS_DEV_STAT_CORRUPTION_ERRS);
- } else if (sblock_bad->header_error) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.verify_errors++;
- spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&rs))
- scrub_print_warning("checksum/header error",
- sblock_to_check);
- if (sblock_bad->generation_error)
- btrfs_dev_stat_inc_and_print(dev,
- BTRFS_DEV_STAT_GENERATION_ERRS);
- else
- btrfs_dev_stat_inc_and_print(dev,
- BTRFS_DEV_STAT_CORRUPTION_ERRS);
- }
-
- if (sctx->readonly) {
- ASSERT(!sctx->is_dev_replace);
- goto out;
- }
-
- /*
- * now build and submit the bios for the other mirrors, check
- * checksums.
- * First try to pick the mirror which is completely without I/O
- * errors and also does not have a checksum error.
- * If one is found, and if a checksum is present, the full block
- * that is known to contain an error is rewritten. Afterwards
- * the block is known to be corrected.
- * If a mirror is found which is completely correct, and no
- * checksum is present, only those sectors are rewritten that had
- * an I/O error in the block to be repaired, since it cannot be
- * determined, which copy of the other sectors is better (and it
- * could happen otherwise that a correct sector would be
- * overwritten by a bad one).
- */
- for (mirror_index = 0; ;mirror_index++) {
- struct scrub_block *sblock_other;
-
- if (mirror_index == failed_mirror_index)
- continue;
-
- /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
- if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
- if (mirror_index >= BTRFS_MAX_MIRRORS)
- break;
- if (!sblocks_for_recheck[mirror_index]->sector_count)
- break;
-
- sblock_other = sblocks_for_recheck[mirror_index];
- } else {
- struct scrub_recover *r = sblock_bad->sectors[0]->recover;
- int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
-
- if (mirror_index >= max_allowed)
- break;
- if (!sblocks_for_recheck[1]->sector_count)
- break;
-
- ASSERT(failed_mirror_index == 0);
- sblock_other = sblocks_for_recheck[1];
- sblock_other->mirror_num = 1 + mirror_index;
- }
-
- /* build and submit the bios, check checksums */
- scrub_recheck_block(fs_info, sblock_other, 0);
-
- if (!sblock_other->header_error &&
- !sblock_other->checksum_error &&
- sblock_other->no_io_error_seen) {
- if (sctx->is_dev_replace) {
- scrub_write_block_to_dev_replace(sblock_other);
- goto corrected_error;
- } else {
- ret = scrub_repair_block_from_good_copy(
- sblock_bad, sblock_other);
- if (!ret)
- goto corrected_error;
- }
- }
- }
-
- if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
- goto did_not_correct_error;
-
- /*
- * In case of I/O errors in the area that is supposed to be
- * repaired, continue by picking good copies of those sectors.
- * Select the good sectors from mirrors to rewrite bad sectors from
- * the area to fix. Afterwards verify the checksum of the block
- * that is supposed to be repaired. This verification step is
- * only done for the purpose of statistic counting and for the
- * final scrub report, whether errors remain.
- * A perfect algorithm could make use of the checksum and try
- * all possible combinations of sectors from the different mirrors
- * until the checksum verification succeeds. For example, when
- * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
- * of mirror #2 is readable but the final checksum test fails,
- * then the 2nd sector of mirror #3 could be tried, whether now
- * the final checksum succeeds. But this would be a rare
- * exception and is therefore not implemented. At least it is
- * avoided that the good copy is overwritten.
- * A more useful improvement would be to pick the sectors
- * without I/O error based on sector sizes (512 bytes on legacy
- * disks) instead of on sectorsize. Then maybe 512 byte of one
- * mirror could be repaired by taking 512 byte of a different
- * mirror, even if other 512 byte sectors in the same sectorsize
- * area are unreadable.
- */
- success = 1;
- for (sector_num = 0; sector_num < sblock_bad->sector_count;
- sector_num++) {
- struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
- struct scrub_block *sblock_other = NULL;
-
- /* Skip no-io-error sectors in scrub */
- if (!sector_bad->io_error && !sctx->is_dev_replace)
- continue;
-
- if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
- /*
- * In case of dev replace, if raid56 rebuild process
- * didn't work out correct data, then copy the content
- * in sblock_bad to make sure target device is identical
- * to source device, instead of writing garbage data in
- * sblock_for_recheck array to target device.
- */
- sblock_other = NULL;
- } else if (sector_bad->io_error) {
- /* Try to find no-io-error sector in mirrors */
- for (mirror_index = 0;
- mirror_index < BTRFS_MAX_MIRRORS &&
- sblocks_for_recheck[mirror_index]->sector_count > 0;
- mirror_index++) {
- if (!sblocks_for_recheck[mirror_index]->
- sectors[sector_num]->io_error) {
- sblock_other = sblocks_for_recheck[mirror_index];
- break;
- }
- }
- if (!sblock_other)
- success = 0;
- }
-
- if (sctx->is_dev_replace) {
- /*
- * Did not find a mirror to fetch the sector from.
- * scrub_write_sector_to_dev_replace() handles this
- * case (sector->io_error), by filling the block with
- * zeros before submitting the write request
- */
- if (!sblock_other)
- sblock_other = sblock_bad;
-
- if (scrub_write_sector_to_dev_replace(sblock_other,
- sector_num) != 0) {
- atomic64_inc(
- &fs_info->dev_replace.num_write_errors);
- success = 0;
- }
- } else if (sblock_other) {
- ret = scrub_repair_sector_from_good_copy(sblock_bad,
- sblock_other,
- sector_num, 0);
- if (0 == ret)
- sector_bad->io_error = 0;
- else
- success = 0;
- }
- }
-
- if (success && !sctx->is_dev_replace) {
- if (is_metadata || have_csum) {
- /*
- * need to verify the checksum now that all
- * sectors on disk are repaired (the write
- * request for data to be repaired is on its way).
- * Just be lazy and use scrub_recheck_block()
- * which re-reads the data before the checksum
- * is verified, but most likely the data comes out
- * of the page cache.
- */
- scrub_recheck_block(fs_info, sblock_bad, 1);
- if (!sblock_bad->header_error &&
- !sblock_bad->checksum_error &&
- sblock_bad->no_io_error_seen)
- goto corrected_error;
- else
- goto did_not_correct_error;
- } else {
-corrected_error:
- spin_lock(&sctx->stat_lock);
- sctx->stat.corrected_errors++;
- sblock_to_check->data_corrected = 1;
- spin_unlock(&sctx->stat_lock);
- btrfs_err_rl_in_rcu(fs_info,
- "fixed up error at logical %llu on dev %s",
- logical, btrfs_dev_name(dev));
- }
- } else {
-did_not_correct_error:
- spin_lock(&sctx->stat_lock);
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- btrfs_err_rl_in_rcu(fs_info,
- "unable to fixup (regular) error at logical %llu on dev %s",
- logical, btrfs_dev_name(dev));
- }
-
-out:
- for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
- struct scrub_block *sblock = sblocks_for_recheck[mirror_index];
- struct scrub_recover *recover;
- int sector_index;
-
- /* Not allocated, continue checking the next mirror */
- if (!sblock)
- continue;
-
- for (sector_index = 0; sector_index < sblock->sector_count;
- sector_index++) {
- /*
- * Here we just cleanup the recover, each sector will be
- * properly cleaned up by later scrub_block_put()
- */
- recover = sblock->sectors[sector_index]->recover;
- if (recover) {
- scrub_put_recover(fs_info, recover);
- sblock->sectors[sector_index]->recover = NULL;
- }
- }
- scrub_block_put(sblock);
- }
-
- ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
- memalloc_nofs_restore(nofs_flag);
- if (ret < 0)
- return ret;
- return 0;
-}
-
-static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
+static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr)
{
- if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
- return 2;
- else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
- return 3;
- else
- return (int)bioc->num_stripes;
-}
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT;

-static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
- u64 *raid_map,
- int nstripes, int mirror,
- int *stripe_index,
- u64 *stripe_offset)
-{
- int i;
-
- if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- /* RAID5/6 */
- for (i = 0; i < nstripes; i++) {
- if (raid_map[i] == RAID6_Q_STRIPE ||
- raid_map[i] == RAID5_P_STRIPE)
- continue;
-
- if (logical >= raid_map[i] &&
- logical < raid_map[i] + BTRFS_STRIPE_LEN)
- break;
- }
-
- *stripe_index = i;
- *stripe_offset = logical - raid_map[i];
- } else {
- /* The other RAID type */
- *stripe_index = mirror;
- *stripe_offset = 0;
- }
+ return stripe->pages[page_index];
}

-static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
- struct scrub_block *sblocks_for_recheck[])
+static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe,
+ int sector_nr)
{
- struct scrub_ctx *sctx = original_sblock->sctx;
- struct btrfs_fs_info *fs_info = sctx->fs_info;
- u64 logical = original_sblock->logical;
- u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
- u64 generation = original_sblock->sectors[0]->generation;
- u64 flags = original_sblock->sectors[0]->flags;
- u64 have_csum = original_sblock->sectors[0]->have_csum;
- struct scrub_recover *recover;
- struct btrfs_io_context *bioc;
- u64 sublen;
- u64 mapped_length;
- u64 stripe_offset;
- int stripe_index;
- int sector_index = 0;
- int mirror_index;
- int nmirrors;
- int ret;
-
- while (length > 0) {
- sublen = min_t(u64, length, fs_info->sectorsize);
- mapped_length = sublen;
- bioc = NULL;
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;

- /*
- * With a length of sectorsize, each returned stripe represents
- * one mirror
- */
- btrfs_bio_counter_inc_blocked(fs_info);
- ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &mapped_length, &bioc);
- if (ret || !bioc || mapped_length < sublen) {
- btrfs_put_bioc(bioc);
- btrfs_bio_counter_dec(fs_info);
- return -EIO;
- }
-
- recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL);
- if (!recover) {
- btrfs_put_bioc(bioc);
- btrfs_bio_counter_dec(fs_info);
- return -ENOMEM;
- }
-
- refcount_set(&recover->refs, 1);
- recover->bioc = bioc;
- recover->map_length = mapped_length;
-
- ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
-
- nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
-
- for (mirror_index = 0; mirror_index < nmirrors;
- mirror_index++) {
- struct scrub_block *sblock;
- struct scrub_sector *sector;
-
- sblock = sblocks_for_recheck[mirror_index];
- sblock->sctx = sctx;
-
- sector = alloc_scrub_sector(sblock, logical);
- if (!sector) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- spin_unlock(&sctx->stat_lock);
- scrub_put_recover(fs_info, recover);
- return -ENOMEM;
- }
- sector->flags = flags;
- sector->generation = generation;
- sector->have_csum = have_csum;
- if (have_csum)
- memcpy(sector->csum,
- original_sblock->sectors[0]->csum,
- sctx->fs_info->csum_size);
-
- scrub_stripe_index_and_offset(logical,
- bioc->map_type,
- bioc->raid_map,
- bioc->num_stripes -
- bioc->num_tgtdevs,
- mirror_index,
- &stripe_index,
- &stripe_offset);
- /*
- * We're at the first sector, also populate @sblock
- * physical and dev.
- */
- if (sector_index == 0) {
- sblock->physical =
- bioc->stripes[stripe_index].physical +
- stripe_offset;
- sblock->dev = bioc->stripes[stripe_index].dev;
- sblock->physical_for_dev_replace =
- original_sblock->physical_for_dev_replace;
- }
-
- BUG_ON(sector_index >= original_sblock->sector_count);
- scrub_get_recover(recover);
- sector->recover = recover;
- }
- scrub_put_recover(fs_info, recover);
- length -= sublen;
- logical += sublen;
- sector_index++;
- }
-
- return 0;
-}
-
-static void scrub_bio_wait_endio(struct bio *bio)
-{
- complete(bio->bi_private);
+ return offset_in_page(sector_nr << fs_info->sectorsize_bits);
}
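
/*
 * Aside: a worked example of the two helpers above. With 4KiB sectors and
 * 4KiB pages, sector 17 of a 64KiB stripe lives in pages[17] at offset 0;
 * with 16KiB pages it would be pages[4] at offset 4KiB.
 */
#include <assert.h>

int main(void)
{
	unsigned int sectorsize_bits = 12;	/* 4KiB sectors */
	unsigned int page_shift = 12;		/* 4KiB pages */
	int sector_nr = 17;
	int page_index = (sector_nr << sectorsize_bits) >> page_shift;
	unsigned int offset = (sector_nr << sectorsize_bits) &
			      ((1U << page_shift) - 1);

	assert(page_index == 17 && offset == 0);

	page_shift = 14;			/* 16KiB pages */
	page_index = (sector_nr << sectorsize_bits) >> page_shift;
	offset = (sector_nr << sectorsize_bits) & ((1U << page_shift) - 1);
	assert(page_index == 4 && offset == 4096);
	return 0;
}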
|
|
|
|
-static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
-                                        struct bio *bio,
-                                        struct scrub_sector *sector)
+static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
 {
-        DECLARE_COMPLETION_ONSTACK(done);
-
-        bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >>
-                                 SECTOR_SHIFT;
-        bio->bi_private = &done;
-        bio->bi_end_io = scrub_bio_wait_endio;
-        raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num);
-
-        wait_for_completion_io(&done);
-        return blk_status_to_errno(bio->bi_status);
-}
-
-static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
-                                          struct scrub_block *sblock)
-{
-        struct scrub_sector *first_sector = sblock->sectors[0];
-        struct bio *bio;
-        int i;
-
-        /* All sectors in sblock belong to the same stripe on the same device. */
-        ASSERT(sblock->dev);
-        if (!sblock->dev->bdev)
-                goto out;
-
-        bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
-
-        for (i = 0; i < sblock->sector_count; i++) {
-                struct scrub_sector *sector = sblock->sectors[i];
-
-                bio_add_scrub_sector(bio, sector, fs_info->sectorsize);
-        }
-
-        if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
-                bio_put(bio);
-                goto out;
-        }
-
-        bio_put(bio);
-
-        scrub_recheck_block_checksum(sblock);
-
-        return;
-out:
-        for (i = 0; i < sblock->sector_count; i++)
-                sblock->sectors[i]->io_error = 1;
-
-        sblock->no_io_error_seen = 0;
-}
-
-/*
- * This function will check the on disk data for checksum errors, header errors
- * and read I/O errors. If any I/O errors happen, the exact sectors which are
- * errored are marked as being bad. The goal is to enable scrub to take those
- * sectors that are not errored from all the mirrors so that the sectors that
- * are errored in the just handled mirror can be repaired.
- */
-static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
-                                struct scrub_block *sblock,
-                                int retry_failed_mirror)
-{
-        int i;
-
-        sblock->no_io_error_seen = 1;
-
-        /* short cut for raid56 */
-        if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
-                return scrub_recheck_block_on_raid56(fs_info, sblock);
-
-        for (i = 0; i < sblock->sector_count; i++) {
-                struct scrub_sector *sector = sblock->sectors[i];
-                struct bio bio;
-                struct bio_vec bvec;
-
-                if (sblock->dev->bdev == NULL) {
-                        sector->io_error = 1;
-                        sblock->no_io_error_seen = 0;
-                        continue;
-                }
-
-                bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ);
-                bio_add_scrub_sector(&bio, sector, fs_info->sectorsize);
-                bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >>
-                                        SECTOR_SHIFT;
-
-                btrfsic_check_bio(&bio);
-                if (submit_bio_wait(&bio)) {
-                        sector->io_error = 1;
-                        sblock->no_io_error_seen = 0;
-                }
-
-                bio_uninit(&bio);
-        }
-
-        if (sblock->no_io_error_seen)
-                scrub_recheck_block_checksum(sblock);
-}
-
-static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
-{
-        struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices;
-        int ret;
-
-        ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
-        return !ret;
-}
-
-static void scrub_recheck_block_checksum(struct scrub_block *sblock)
-{
-        sblock->header_error = 0;
-        sblock->checksum_error = 0;
-        sblock->generation_error = 0;
-
-        if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
-                scrub_checksum_data(sblock);
-        else
-                scrub_checksum_tree_block(sblock);
-}
-
-static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
-                                             struct scrub_block *sblock_good)
-{
-        int i;
-        int ret = 0;
-
-        for (i = 0; i < sblock_bad->sector_count; i++) {
-                int ret_sub;
-
-                ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
-                                                             sblock_good, i, 1);
-                if (ret_sub)
-                        ret = ret_sub;
-        }
-
-        return ret;
-}
-
-static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
-                                              struct scrub_block *sblock_good,
-                                              int sector_num, int force_write)
-{
-        struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
-        struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
-        struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
-        const u32 sectorsize = fs_info->sectorsize;
-
-        if (force_write || sblock_bad->header_error ||
-            sblock_bad->checksum_error || sector_bad->io_error) {
-                struct bio bio;
-                struct bio_vec bvec;
-                int ret;
-
-                if (!sblock_bad->dev->bdev) {
-                        btrfs_warn_rl(fs_info,
-                                "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
-                        return -EIO;
-                }
-
-                bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
-                bio.bi_iter.bi_sector = (sblock_bad->physical +
-                                         sector_bad->offset) >> SECTOR_SHIFT;
-                ret = bio_add_scrub_sector(&bio, sector_good, sectorsize);
-
-                btrfsic_check_bio(&bio);
-                ret = submit_bio_wait(&bio);
-                bio_uninit(&bio);
-
-                if (ret) {
-                        btrfs_dev_stat_inc_and_print(sblock_bad->dev,
-                                BTRFS_DEV_STAT_WRITE_ERRS);
-                        atomic64_inc(&fs_info->dev_replace.num_write_errors);
-                        return -EIO;
-                }
-        }
-
-        return 0;
-}
-
-static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
-{
-        struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
-        int i;
-
-        /*
-         * This block is used for the check of the parity on the source device,
-         * so the data needn't be written into the destination device.
-         */
-        if (sblock->sparity)
-                return;
-
-        for (i = 0; i < sblock->sector_count; i++) {
-                int ret;
-
-                ret = scrub_write_sector_to_dev_replace(sblock, i);
-                if (ret)
-                        atomic64_inc(&fs_info->dev_replace.num_write_errors);
-        }
-}
-
-static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
-{
-        const u32 sectorsize = sblock->sctx->fs_info->sectorsize;
-        struct scrub_sector *sector = sblock->sectors[sector_num];
-
-        if (sector->io_error)
-                memset(scrub_sector_get_kaddr(sector), 0, sectorsize);
-
-        return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
-}
-
-static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
-{
-        int ret = 0;
-        u64 length;
-
-        if (!btrfs_is_zoned(sctx->fs_info))
-                return 0;
-
-        if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
-                return 0;
-
-        if (sctx->write_pointer < physical) {
-                length = physical - sctx->write_pointer;
-
-                ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
-                                                sctx->write_pointer, length);
-                if (!ret)
-                        sctx->write_pointer = physical;
-        }
-        return ret;
-}
-
-static void scrub_block_get(struct scrub_block *sblock)
-{
-        refcount_inc(&sblock->refs);
-}
-
-static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
-                                      struct scrub_sector *sector)
-{
-        struct scrub_block *sblock = sector->sblock;
-        struct scrub_bio *sbio;
-        int ret;
-        const u32 sectorsize = sctx->fs_info->sectorsize;
-
-        mutex_lock(&sctx->wr_lock);
-again:
-        if (!sctx->wr_curr_bio) {
-                sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
-                                            GFP_KERNEL);
-                if (!sctx->wr_curr_bio) {
-                        mutex_unlock(&sctx->wr_lock);
-                        return -ENOMEM;
-                }
-                sctx->wr_curr_bio->sctx = sctx;
-                sctx->wr_curr_bio->sector_count = 0;
-        }
-        sbio = sctx->wr_curr_bio;
-        if (sbio->sector_count == 0) {
-                ret = fill_writer_pointer_gap(sctx, sector->offset +
-                                              sblock->physical_for_dev_replace);
-                if (ret) {
-                        mutex_unlock(&sctx->wr_lock);
-                        return ret;
-                }
-
-                sbio->physical = sblock->physical_for_dev_replace + sector->offset;
-                sbio->logical = sblock->logical + sector->offset;
-                sbio->dev = sctx->wr_tgtdev;
-                if (!sbio->bio) {
-                        sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
-                                              REQ_OP_WRITE, GFP_NOFS);
-                }
-                sbio->bio->bi_private = sbio;
-                sbio->bio->bi_end_io = scrub_wr_bio_end_io;
-                sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
-                sbio->status = 0;
-        } else if (sbio->physical + sbio->sector_count * sectorsize !=
-                   sblock->physical_for_dev_replace + sector->offset ||
-                   sbio->logical + sbio->sector_count * sectorsize !=
-                   sblock->logical + sector->offset) {
-                scrub_wr_submit(sctx);
-                goto again;
-        }
-
-        ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
-        if (ret != sectorsize) {
-                if (sbio->sector_count < 1) {
-                        bio_put(sbio->bio);
-                        sbio->bio = NULL;
-                        mutex_unlock(&sctx->wr_lock);
-                        return -EIO;
-                }
-                scrub_wr_submit(sctx);
-                goto again;
-        }
-
-        sbio->sectors[sbio->sector_count] = sector;
-        scrub_sector_get(sector);
-        /*
-         * Since ssector no longer holds a page, but uses sblock::pages, we
-         * have to ensure the sblock had not been freed before our write bio
-         * finished.
-         */
-        scrub_block_get(sector->sblock);
-
-        sbio->sector_count++;
-        if (sbio->sector_count == sctx->sectors_per_bio)
-                scrub_wr_submit(sctx);
-        mutex_unlock(&sctx->wr_lock);
-
-        return 0;
-}
-
-static void scrub_wr_submit(struct scrub_ctx *sctx)
-{
-        struct scrub_bio *sbio;
-
-        if (!sctx->wr_curr_bio)
-                return;
-
-        sbio = sctx->wr_curr_bio;
-        sctx->wr_curr_bio = NULL;
-        scrub_pending_bio_inc(sctx);
-        /* process all writes in a single worker thread. Then the block layer
-         * orders the requests before sending them to the driver which
-         * doubled the write performance on spinning disks when measured
-         * with Linux 3.5 */
-        btrfsic_check_bio(sbio->bio);
-        submit_bio(sbio->bio);
-
-        if (btrfs_is_zoned(sctx->fs_info))
-                sctx->write_pointer = sbio->physical + sbio->sector_count *
-                        sctx->fs_info->sectorsize;
-}
-
-static void scrub_wr_bio_end_io(struct bio *bio)
-{
-        struct scrub_bio *sbio = bio->bi_private;
-        struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
-
-        sbio->status = bio->bi_status;
-        sbio->bio = bio;
-
-        INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
-        queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
-}
-
-static void scrub_wr_bio_end_io_worker(struct work_struct *work)
-{
-        struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
-        struct scrub_ctx *sctx = sbio->sctx;
-        int i;
-
-        ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
-        if (sbio->status) {
-                struct btrfs_dev_replace *dev_replace =
-                        &sbio->sctx->fs_info->dev_replace;
-
-                for (i = 0; i < sbio->sector_count; i++) {
-                        struct scrub_sector *sector = sbio->sectors[i];
-
-                        sector->io_error = 1;
-                        atomic64_inc(&dev_replace->num_write_errors);
-                }
-        }
-
-        /*
-         * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in
-         * endio we should put the sblock.
-         */
-        for (i = 0; i < sbio->sector_count; i++) {
-                scrub_block_put(sbio->sectors[i]->sblock);
-                scrub_sector_put(sbio->sectors[i]);
-        }
-
-        bio_put(sbio->bio);
-        kfree(sbio);
-        scrub_pending_bio_dec(sctx);
-}
-
-static int scrub_checksum(struct scrub_block *sblock)
-{
-        u64 flags;
-        int ret;
-
-        /*
-         * No need to initialize these stats currently,
-         * because this function only use return value
-         * instead of these stats value.
-         *
-         * Todo:
-         * always use stats
-         */
-        sblock->header_error = 0;
-        sblock->generation_error = 0;
-        sblock->checksum_error = 0;
-
-        WARN_ON(sblock->sector_count < 1);
-        flags = sblock->sectors[0]->flags;
-        ret = 0;
-        if (flags & BTRFS_EXTENT_FLAG_DATA)
-                ret = scrub_checksum_data(sblock);
-        else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
-                ret = scrub_checksum_tree_block(sblock);
-        else if (flags & BTRFS_EXTENT_FLAG_SUPER)
-                ret = scrub_checksum_super(sblock);
-        else
-                WARN_ON(1);
-        if (ret)
-                scrub_handle_errored_block(sblock);
-
-        return ret;
-}
-
-static int scrub_checksum_data(struct scrub_block *sblock)
-{
-        struct scrub_ctx *sctx = sblock->sctx;
-        struct btrfs_fs_info *fs_info = sctx->fs_info;
+        struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+        const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
+        const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
+        const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr);
+        const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr);
         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
-        u8 csum[BTRFS_CSUM_SIZE];
-        struct scrub_sector *sector;
-        char *kaddr;
-
-        BUG_ON(sblock->sector_count < 1);
-        sector = sblock->sectors[0];
-        if (!sector->have_csum)
-                return 0;
-
-        kaddr = scrub_sector_get_kaddr(sector);
-
-        shash->tfm = fs_info->csum_shash;
-        crypto_shash_init(shash);
-
-        crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
-
-        if (memcmp(csum, sector->csum, fs_info->csum_size))
-                sblock->checksum_error = 1;
-        return sblock->checksum_error;
-}
-
-static int scrub_checksum_tree_block(struct scrub_block *sblock)
-{
-        struct scrub_ctx *sctx = sblock->sctx;
-        struct btrfs_header *h;
-        struct btrfs_fs_info *fs_info = sctx->fs_info;
-        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
-        u8 calculated_csum[BTRFS_CSUM_SIZE];
         u8 on_disk_csum[BTRFS_CSUM_SIZE];
-        /*
-         * This is done in sectorsize steps even for metadata as there's a
-         * constraint for nodesize to be aligned to sectorsize. This will need
-         * to change so we don't misuse data and metadata units like that.
-         */
-        const u32 sectorsize = sctx->fs_info->sectorsize;
-        const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
-        int i;
-        struct scrub_sector *sector;
-        char *kaddr;
-
-        BUG_ON(sblock->sector_count < 1);
-
-        /* Each member in sectors is just one sector */
-        ASSERT(sblock->sector_count == num_sectors);
-
-        sector = sblock->sectors[0];
-        kaddr = scrub_sector_get_kaddr(sector);
-        h = (struct btrfs_header *)kaddr;
-        memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
+        u8 calculated_csum[BTRFS_CSUM_SIZE];
+        struct btrfs_header *header;

         /*
-         * we don't use the getter functions here, as we
-         * a) don't have an extent buffer and
-         * b) the page is already kmapped
+         * Here we don't have a good way to attach the pages (and subpages)
+         * to a dummy extent buffer, thus we have to directly grab the members
+         * from pages.
          */
-        if (sblock->logical != btrfs_stack_header_bytenr(h)) {
-                sblock->header_error = 1;
+        header = (struct btrfs_header *)(page_address(first_page) + first_off);
+        memcpy(on_disk_csum, header->csum, fs_info->csum_size);
+
+        if (logical != btrfs_stack_header_bytenr(header)) {
+                bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
+                bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
                 btrfs_warn_rl(fs_info,
                 "tree block %llu mirror %u has bad bytenr, has %llu want %llu",
-                              sblock->logical, sblock->mirror_num,
-                              btrfs_stack_header_bytenr(h),
-                              sblock->logical);
-                goto out;
+                              logical, stripe->mirror_num,
+                              btrfs_stack_header_bytenr(header), logical);
+                return;
         }
-
-        if (!scrub_check_fsid(h->fsid, sector)) {
-                sblock->header_error = 1;
+        if (memcmp(header->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) != 0) {
+                bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+                bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
                 btrfs_warn_rl(fs_info,
                 "tree block %llu mirror %u has bad fsid, has %pU want %pU",
-                              sblock->logical, sblock->mirror_num,
-                              h->fsid, sblock->dev->fs_devices->fsid);
-                goto out;
+                              logical, stripe->mirror_num,
+                              header->fsid, fs_info->fs_devices->fsid);
+                return;
         }
-
-        if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) {
-                sblock->header_error = 1;
+        if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+                   BTRFS_UUID_SIZE) != 0) {
+                bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+                bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
                 btrfs_warn_rl(fs_info,
                 "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
-                              sblock->logical, sblock->mirror_num,
-                              h->chunk_tree_uuid, fs_info->chunk_tree_uuid);
-                goto out;
+                              logical, stripe->mirror_num,
+                              header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
+                return;
         }

+        /* Now check tree block csum. */
         shash->tfm = fs_info->csum_shash;
         crypto_shash_init(shash);
-        crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
-                            sectorsize - BTRFS_CSUM_SIZE);
+        crypto_shash_update(shash, page_address(first_page) + first_off +
+                            BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE);
+
+        for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
+                struct page *page = scrub_stripe_get_page(stripe, i);
+                unsigned int page_off = scrub_stripe_get_page_offset(stripe, i);

-        for (i = 1; i < num_sectors; i++) {
-                kaddr = scrub_sector_get_kaddr(sblock->sectors[i]);
-                crypto_shash_update(shash, kaddr, sectorsize);
+                crypto_shash_update(shash, page_address(page) + page_off,
+                                    fs_info->sectorsize);
         }

         crypto_shash_final(shash, calculated_csum);
-        if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) {
-                sblock->checksum_error = 1;
+        if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
+                bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+                bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
                 btrfs_warn_rl(fs_info,
                 "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
-                              sblock->logical, sblock->mirror_num,
+                              logical, stripe->mirror_num,
                               CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
                               CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
-                goto out;
+                return;
         }
-
-        if (sector->generation != btrfs_stack_header_generation(h)) {
-                sblock->header_error = 1;
-                sblock->generation_error = 1;
+        if (stripe->sectors[sector_nr].generation !=
+            btrfs_stack_header_generation(header)) {
+                bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+                bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
                 btrfs_warn_rl(fs_info,
                 "tree block %llu mirror %u has bad generation, has %llu want %llu",
-                              sblock->logical, sblock->mirror_num,
-                              btrfs_stack_header_generation(h),
-                              sector->generation);
-        }
-
-out:
-        return sblock->header_error || sblock->checksum_error;
-}
-
-static int scrub_checksum_super(struct scrub_block *sblock)
-{
-        struct btrfs_super_block *s;
-        struct scrub_ctx *sctx = sblock->sctx;
-        struct btrfs_fs_info *fs_info = sctx->fs_info;
-        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
-        u8 calculated_csum[BTRFS_CSUM_SIZE];
-        struct scrub_sector *sector;
-        char *kaddr;
-        int fail_gen = 0;
-        int fail_cor = 0;
-
-        BUG_ON(sblock->sector_count < 1);
-        sector = sblock->sectors[0];
-        kaddr = scrub_sector_get_kaddr(sector);
-        s = (struct btrfs_super_block *)kaddr;
-
-        if (sblock->logical != btrfs_super_bytenr(s))
-                ++fail_cor;
-
-        if (sector->generation != btrfs_super_generation(s))
-                ++fail_gen;
-
-        if (!scrub_check_fsid(s->fsid, sector))
-                ++fail_cor;
-
-        shash->tfm = fs_info->csum_shash;
-        crypto_shash_init(shash);
-        crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
-                        BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
-
-        if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
-                ++fail_cor;
-
-        return fail_cor + fail_gen;
-}
-
-static void scrub_block_put(struct scrub_block *sblock)
-{
-        if (refcount_dec_and_test(&sblock->refs)) {
-                int i;
-
-                if (sblock->sparity)
-                        scrub_parity_put(sblock->sparity);
-
-                for (i = 0; i < sblock->sector_count; i++)
-                        scrub_sector_put(sblock->sectors[i]);
-                for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) {
-                        if (sblock->pages[i]) {
-                                detach_scrub_page_private(sblock->pages[i]);
-                                __free_page(sblock->pages[i]);
-                        }
-                }
-                kfree(sblock);
+                              logical, stripe->mirror_num,
+                              btrfs_stack_header_generation(header),
+                              stripe->sectors[sector_nr].generation);
+                return;
         }
+        bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+        bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
+        bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
 }

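scrub_verify_one_metadata() above validates a tree block in a fixed order (bytenr, fsid, chunk tree uuid, whole-node checksum, generation) and, on the first failure, marks the whole run of sectors_per_tree sectors before returning. A toy userspace model of that early-return order and range marking; the struct, helpers, and sample values are invented for illustration:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy model of the per-stripe error bitmaps (one bit per sector). */
struct stripe_bits {
        uint64_t error;
        uint64_t meta_error;
        uint64_t csum_error;
};

static void set_range(uint64_t *bm, int start, int nbits)
{
        for (int i = start; i < start + nbits; i++)
                *bm |= 1ULL << i;
}

/* Each failed check marks the whole tree block and stops, like the patch. */
static void verify_metadata(struct stripe_bits *b, int sector_nr,
                            int sectors_per_tree,
                            int bad_bytenr, int bad_fsid, int bad_csum)
{
        if (bad_bytenr) {       /* bad bytenr -> csum_error in the patch */
                set_range(&b->csum_error, sector_nr, sectors_per_tree);
                set_range(&b->error, sector_nr, sectors_per_tree);
                return;
        }
        if (bad_fsid) {         /* fsid/uuid mismatch -> meta_error */
                set_range(&b->meta_error, sector_nr, sectors_per_tree);
                set_range(&b->error, sector_nr, sectors_per_tree);
                return;
        }
        if (bad_csum) {         /* checksum over the whole node failed */
                set_range(&b->meta_error, sector_nr, sectors_per_tree);
                set_range(&b->error, sector_nr, sectors_per_tree);
                return;
        }
        /* All checks passed: the patch clears all three ranges here. */
}

int main(void)
{
        struct stripe_bits b;

        memset(&b, 0, sizeof(b));
        verify_metadata(&b, 4, 4, 0, 1, 0);     /* 16K node, bad fsid */
        printf("error=%#llx meta=%#llx\n",
               (unsigned long long)b.error, (unsigned long long)b.meta_error);
        return 0;
}
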
-static void scrub_sector_get(struct scrub_sector *sector)
-{
-        atomic_inc(&sector->refs);
-}
-
-static void scrub_sector_put(struct scrub_sector *sector)
+static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
 {
-        if (atomic_dec_and_test(&sector->refs))
-                kfree(sector);
-}
+        struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+        struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
+        const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
+        struct page *page = scrub_stripe_get_page(stripe, sector_nr);
+        unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
+        u8 csum_buf[BTRFS_CSUM_SIZE];
+        int ret;

-/*
- * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
- * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
- */
-static void scrub_throttle(struct scrub_ctx *sctx)
-{
-        const int time_slice = 1000;
-        struct scrub_bio *sbio;
-        struct btrfs_device *device;
-        s64 delta;
-        ktime_t now;
-        u32 div;
-        u64 bwlimit;
+        ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);

-        sbio = sctx->bios[sctx->curr];
-        device = sbio->dev;
-        bwlimit = READ_ONCE(device->scrub_speed_max);
-        if (bwlimit == 0)
+        /* Sector not utilized, skip it. */
+        if (!test_bit(sector_nr, &stripe->extent_sector_bitmap))
                 return;

-        /*
-         * Slice is divided into intervals when the IO is submitted, adjust by
-         * bwlimit and maximum of 64 intervals.
-         */
-        div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
-        div = min_t(u32, 64, div);
-
-        /* Start new epoch, set deadline */
-        now = ktime_get();
-        if (sctx->throttle_deadline == 0) {
-                sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
-                sctx->throttle_sent = 0;
-        }
+        /* IO error, no need to check. */
+        if (test_bit(sector_nr, &stripe->io_error_bitmap))
+                return;

-        /* Still in the time to send? */
-        if (ktime_before(now, sctx->throttle_deadline)) {
-                /* If current bio is within the limit, send it */
-                sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
-                if (sctx->throttle_sent <= div_u64(bwlimit, div))
+        /* Metadata, verify the full tree block. */
+        if (sector->is_metadata) {
+                /*
+                 * Check if the tree block crosses the stripe boundary. If it
+                 * crossed the boundary, we cannot verify it but only give a
+                 * warning.
+                 *
+                 * This can only happen on a very old filesystem where chunks
+                 * are not ensured to be stripe aligned.
+                 */
+                if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
+                        btrfs_warn_rl(fs_info,
+                        "tree block at %llu crosses stripe boundary %llu",
+                                      stripe->logical +
+                                      (sector_nr << fs_info->sectorsize_bits),
+                                      stripe->logical);
                         return;
-
-                /* We're over the limit, sleep until the rest of the slice */
-                delta = ktime_ms_delta(sctx->throttle_deadline, now);
-        } else {
-                /* New request after deadline, start new epoch */
-                delta = 0;
-        }
-
-        if (delta) {
-                long timeout;
-
-                timeout = div_u64(delta * HZ, 1000);
-                schedule_timeout_interruptible(timeout);
-        }
-
-        /* Next call will start the deadline period */
-        sctx->throttle_deadline = 0;
-}
-
-static void scrub_submit(struct scrub_ctx *sctx)
-{
-        struct scrub_bio *sbio;
-
-        if (sctx->curr == -1)
+                }
+                scrub_verify_one_metadata(stripe, sector_nr);
                 return;
+        }

-        scrub_throttle(sctx);
-
-        sbio = sctx->bios[sctx->curr];
-        sctx->curr = -1;
-        scrub_pending_bio_inc(sctx);
-        btrfsic_check_bio(sbio->bio);
-        submit_bio(sbio->bio);
-}
-
-static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
-                                      struct scrub_sector *sector)
-{
-        struct scrub_block *sblock = sector->sblock;
-        struct scrub_bio *sbio;
-        const u32 sectorsize = sctx->fs_info->sectorsize;
-        int ret;
-
-again:
         /*
-         * grab a fresh bio or wait for one to become available
+         * Data is easier, we just verify the data csum (if we have it). For
+         * cases without csum, we have no other choice but to trust it.
          */
-        while (sctx->curr == -1) {
-                spin_lock(&sctx->list_lock);
-                sctx->curr = sctx->first_free;
-                if (sctx->curr != -1) {
-                        sctx->first_free = sctx->bios[sctx->curr]->next_free;
-                        sctx->bios[sctx->curr]->next_free = -1;
-                        sctx->bios[sctx->curr]->sector_count = 0;
-                        spin_unlock(&sctx->list_lock);
-                } else {
-                        spin_unlock(&sctx->list_lock);
-                        wait_event(sctx->list_wait, sctx->first_free != -1);
-                }
-        }
-        sbio = sctx->bios[sctx->curr];
-        if (sbio->sector_count == 0) {
-                sbio->physical = sblock->physical + sector->offset;
-                sbio->logical = sblock->logical + sector->offset;
-                sbio->dev = sblock->dev;
-                if (!sbio->bio) {
-                        sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
-                                              REQ_OP_READ, GFP_NOFS);
-                }
-                sbio->bio->bi_private = sbio;
-                sbio->bio->bi_end_io = scrub_bio_end_io;
-                sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
-                sbio->status = 0;
-        } else if (sbio->physical + sbio->sector_count * sectorsize !=
-                   sblock->physical + sector->offset ||
-                   sbio->logical + sbio->sector_count * sectorsize !=
-                   sblock->logical + sector->offset ||
-                   sbio->dev != sblock->dev) {
-                scrub_submit(sctx);
-                goto again;
+        if (!sector->csum) {
+                clear_bit(sector_nr, &stripe->error_bitmap);
+                return;
         }

-        sbio->sectors[sbio->sector_count] = sector;
-        ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
-        if (ret != sectorsize) {
-                if (sbio->sector_count < 1) {
-                        bio_put(sbio->bio);
-                        sbio->bio = NULL;
-                        return -EIO;
-                }
-                scrub_submit(sctx);
-                goto again;
+        ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum);
+        if (ret < 0) {
+                set_bit(sector_nr, &stripe->csum_error_bitmap);
+                set_bit(sector_nr, &stripe->error_bitmap);
+        } else {
+                clear_bit(sector_nr, &stripe->csum_error_bitmap);
+                clear_bit(sector_nr, &stripe->error_bitmap);
         }
-
-        scrub_block_get(sblock); /* one for the page added to the bio */
-        atomic_inc(&sblock->outstanding_sectors);
-        sbio->sector_count++;
-        if (sbio->sector_count == sctx->sectors_per_bio)
-                scrub_submit(sctx);
-
-        return 0;
-}
-
-static void scrub_missing_raid56_end_io(struct bio *bio)
-{
-        struct scrub_block *sblock = bio->bi_private;
-        struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
-
-        btrfs_bio_counter_dec(fs_info);
-        if (bio->bi_status)
-                sblock->no_io_error_seen = 0;
-
-        bio_put(bio);
-
-        queue_work(fs_info->scrub_workers, &sblock->work);
 }

-static void scrub_missing_raid56_worker(struct work_struct *work)
+/* Verify specified sectors of a stripe. */
+static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
 {
-        struct scrub_block *sblock = container_of(work, struct scrub_block, work);
-        struct scrub_ctx *sctx = sblock->sctx;
-        struct btrfs_fs_info *fs_info = sctx->fs_info;
-        u64 logical;
-        struct btrfs_device *dev;
-
-        logical = sblock->logical;
-        dev = sblock->dev;
-
-        if (sblock->no_io_error_seen)
-                scrub_recheck_block_checksum(sblock);
-
-        if (!sblock->no_io_error_seen) {
-                spin_lock(&sctx->stat_lock);
-                sctx->stat.read_errors++;
-                spin_unlock(&sctx->stat_lock);
-                btrfs_err_rl_in_rcu(fs_info,
-                        "IO error rebuilding logical %llu for dev %s",
-                        logical, btrfs_dev_name(dev));
-        } else if (sblock->header_error || sblock->checksum_error) {
-                spin_lock(&sctx->stat_lock);
-                sctx->stat.uncorrectable_errors++;
-                spin_unlock(&sctx->stat_lock);
-                btrfs_err_rl_in_rcu(fs_info,
-                        "failed to rebuild valid logical %llu for dev %s",
-                        logical, btrfs_dev_name(dev));
-        } else {
-                scrub_write_block_to_dev_replace(sblock);
-        }
+        struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+        const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
+        int sector_nr;

-        if (sctx->is_dev_replace && sctx->flush_all_writes) {
-                mutex_lock(&sctx->wr_lock);
-                scrub_wr_submit(sctx);
-                mutex_unlock(&sctx->wr_lock);
+        for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
+                scrub_verify_one_sector(stripe, sector_nr);
+                if (stripe->sectors[sector_nr].is_metadata)
+                        sector_nr += sectors_per_tree - 1;
         }
-
-        scrub_block_put(sblock);
-        scrub_pending_bio_dec(sctx);
 }

-static void scrub_missing_raid56_pages(struct scrub_block *sblock)
+static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
 {
-        struct scrub_ctx *sctx = sblock->sctx;
-        struct btrfs_fs_info *fs_info = sctx->fs_info;
-        u64 length = sblock->sector_count << fs_info->sectorsize_bits;
-        u64 logical = sblock->logical;
-        struct btrfs_io_context *bioc = NULL;
-        struct bio *bio;
-        struct btrfs_raid_bio *rbio;
-        int ret;
         int i;

-        btrfs_bio_counter_inc_blocked(fs_info);
-        ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
-                               &length, &bioc);
-        if (ret || !bioc || !bioc->raid_map)
-                goto bioc_out;
-
-        if (WARN_ON(!sctx->is_dev_replace ||
-                    !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
-                /*
-                 * We shouldn't be scrubbing a missing device. Even for dev
-                 * replace, we should only get here for RAID 5/6. We either
-                 * managed to mount something with no mirrors remaining or
-                 * there's a bug in scrub_find_good_copy()/btrfs_map_block().
-                 */
-                goto bioc_out;
-        }
-
-        bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
-        bio->bi_iter.bi_sector = logical >> 9;
-        bio->bi_private = sblock;
-        bio->bi_end_io = scrub_missing_raid56_end_io;
-
-        rbio = raid56_alloc_missing_rbio(bio, bioc);
-        if (!rbio)
-                goto rbio_out;
-
-        for (i = 0; i < sblock->sector_count; i++) {
-                struct scrub_sector *sector = sblock->sectors[i];
-
-                raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector),
-                                       scrub_sector_get_page_offset(sector),
-                                       sector->offset + sector->sblock->logical);
+        for (i = 0; i < stripe->nr_sectors; i++) {
+                if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page &&
+                    scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset)
+                        break;
         }
-
-        INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
-        scrub_block_get(sblock);
-        scrub_pending_bio_inc(sctx);
-        raid56_submit_missing_rbio(rbio);
-        btrfs_put_bioc(bioc);
-        return;
-
-rbio_out:
-        bio_put(bio);
-bioc_out:
-        btrfs_bio_counter_dec(fs_info);
-        btrfs_put_bioc(bioc);
-        spin_lock(&sctx->stat_lock);
-        sctx->stat.malloc_errors++;
-        spin_unlock(&sctx->stat_lock);
+        ASSERT(i < stripe->nr_sectors);
+        return i;
 }

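calc_sector_number() above recovers the stripe-relative sector index of a completed bio by matching the bio's first page/offset pair against the stripe's own sector-to-page mapping. A compact userspace model of that linear search, with all names and sample values hypothetical:

#include <assert.h>
#include <stdio.h>

#define NR_SECTORS 16

struct sector_loc { int page; int off; };

/* Model: find which sector a (page, off) pair belongs to. */
static int calc_sector_number(const struct sector_loc *map, int page, int off)
{
        int i;

        for (i = 0; i < NR_SECTORS; i++)
                if (map[i].page == page && map[i].off == off)
                        break;
        assert(i < NR_SECTORS);  /* the bio must come from this stripe */
        return i;
}

int main(void)
{
        struct sector_loc map[NR_SECTORS];

        for (int i = 0; i < NR_SECTORS; i++) {  /* 4K sectors in 4K pages */
                map[i].page = i;
                map[i].off = 0;
        }
        printf("%d\n", calc_sector_number(map, 5, 0));  /* prints 5 */
        return 0;
}
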
-static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
-                         u64 physical, struct btrfs_device *dev, u64 flags,
-                         u64 gen, int mirror_num, u8 *csum,
-                         u64 physical_for_dev_replace)
+/*
+ * Repair read is different to the regular read:
+ *
+ * - Only reads the failed sectors
+ * - May have extra blocksize limits
+ */
+static void scrub_repair_read_endio(struct btrfs_bio *bbio)
 {
-        struct scrub_block *sblock;
-        const u32 sectorsize = sctx->fs_info->sectorsize;
-        int index;
-
-        sblock = alloc_scrub_block(sctx, dev, logical, physical,
-                                   physical_for_dev_replace, mirror_num);
-        if (!sblock) {
-                spin_lock(&sctx->stat_lock);
-                sctx->stat.malloc_errors++;
-                spin_unlock(&sctx->stat_lock);
-                return -ENOMEM;
-        }
+        struct scrub_stripe *stripe = bbio->private;
+        struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+        struct bio_vec *bvec;
+        int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
+        u32 bio_size = 0;
+        int i;

-        for (index = 0; len > 0; index++) {
-                struct scrub_sector *sector;
-                /*
-                 * Here we will allocate one page for one sector to scrub.
-                 * This is fine if PAGE_SIZE == sectorsize, but will cost
-                 * more memory for PAGE_SIZE > sectorsize case.
-                 */
-                u32 l = min(sectorsize, len);
+        ASSERT(sector_nr < stripe->nr_sectors);

-                sector = alloc_scrub_sector(sblock, logical);
-                if (!sector) {
-                        spin_lock(&sctx->stat_lock);
-                        sctx->stat.malloc_errors++;
-                        spin_unlock(&sctx->stat_lock);
-                        scrub_block_put(sblock);
-                        return -ENOMEM;
-                }
-                sector->flags = flags;
-                sector->generation = gen;
-                if (csum) {
-                        sector->have_csum = 1;
-                        memcpy(sector->csum, csum, sctx->fs_info->csum_size);
-                } else {
-                        sector->have_csum = 0;
-                }
-                len -= l;
-                logical += l;
-                physical += l;
-                physical_for_dev_replace += l;
-        }
+        bio_for_each_bvec_all(bvec, &bbio->bio, i)
+                bio_size += bvec->bv_len;

-        WARN_ON(sblock->sector_count == 0);
-        if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
-                /*
-                 * This case should only be hit for RAID 5/6 device replace. See
-                 * the comment in scrub_missing_raid56_pages() for details.
-                 */
-                scrub_missing_raid56_pages(sblock);
+        if (bbio->bio.bi_status) {
+                bitmap_set(&stripe->io_error_bitmap, sector_nr,
+                           bio_size >> fs_info->sectorsize_bits);
+                bitmap_set(&stripe->error_bitmap, sector_nr,
+                           bio_size >> fs_info->sectorsize_bits);
         } else {
-                for (index = 0; index < sblock->sector_count; index++) {
-                        struct scrub_sector *sector = sblock->sectors[index];
-                        int ret;
-
-                        ret = scrub_add_sector_to_rd_bio(sctx, sector);
-                        if (ret) {
-                                scrub_block_put(sblock);
-                                return ret;
-                        }
-                }
-
-                if (flags & BTRFS_EXTENT_FLAG_SUPER)
-                        scrub_submit(sctx);
+                bitmap_clear(&stripe->io_error_bitmap, sector_nr,
+                             bio_size >> fs_info->sectorsize_bits);
         }
-
-        /* last one frees, either here or in bio completion for last page */
-        scrub_block_put(sblock);
-        return 0;
+        bio_put(&bbio->bio);
+        if (atomic_dec_and_test(&stripe->pending_io))
+                wake_up(&stripe->io_wait);
 }

-static void scrub_bio_end_io(struct bio *bio)
+static int calc_next_mirror(int mirror, int num_copies)
 {
-        struct scrub_bio *sbio = bio->bi_private;
-        struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
-
-        sbio->status = bio->bi_status;
-        sbio->bio = bio;
-
-        queue_work(fs_info->scrub_workers, &sbio->work);
+        ASSERT(mirror <= num_copies);
+        return (mirror + 1 > num_copies) ? 1 : mirror + 1;
 }

-static void scrub_bio_end_io_worker(struct work_struct *work)
+static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
+                                            int mirror, int blocksize, bool wait)
 {
-        struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
-        struct scrub_ctx *sctx = sbio->sctx;
+        struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+        struct btrfs_bio *bbio = NULL;
+        const unsigned long old_error_bitmap = stripe->error_bitmap;
         int i;

-        ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
-        if (sbio->status) {
-                for (i = 0; i < sbio->sector_count; i++) {
-                        struct scrub_sector *sector = sbio->sectors[i];
+        ASSERT(stripe->mirror_num >= 1);
+        ASSERT(atomic_read(&stripe->pending_io) == 0);
+
+        for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
+                struct page *page;
+                int pgoff;
+                int ret;

-                        sector->io_error = 1;
-                        sector->sblock->no_io_error_seen = 0;
+                page = scrub_stripe_get_page(stripe, i);
+                pgoff = scrub_stripe_get_page_offset(stripe, i);
+
+                /* The current sector cannot be merged, submit the bio. */
+                if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) ||
+                             bbio->bio.bi_iter.bi_size >= blocksize)) {
+                        ASSERT(bbio->bio.bi_iter.bi_size);
+                        atomic_inc(&stripe->pending_io);
+                        btrfs_submit_bio(bbio, mirror);
+                        if (wait)
+                                wait_scrub_stripe_io(stripe);
+                        bbio = NULL;
                 }
-        }

-        /* Now complete the scrub_block items that have all pages completed */
-        for (i = 0; i < sbio->sector_count; i++) {
-                struct scrub_sector *sector = sbio->sectors[i];
-                struct scrub_block *sblock = sector->sblock;
+                if (!bbio) {
+                        bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
+                                               fs_info, scrub_repair_read_endio, stripe);
+                        bbio->bio.bi_iter.bi_sector = (stripe->logical +
+                                (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT;
+                }

-                if (atomic_dec_and_test(&sblock->outstanding_sectors))
-                        scrub_block_complete(sblock);
-                scrub_block_put(sblock);
+                ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+                ASSERT(ret == fs_info->sectorsize);
         }
-
-        bio_put(sbio->bio);
-        sbio->bio = NULL;
-        spin_lock(&sctx->list_lock);
-        sbio->next_free = sctx->first_free;
-        sctx->first_free = sbio->index;
-        spin_unlock(&sctx->list_lock);
-
-        if (sctx->is_dev_replace && sctx->flush_all_writes) {
-                mutex_lock(&sctx->wr_lock);
-                scrub_wr_submit(sctx);
-                mutex_unlock(&sctx->wr_lock);
+        if (bbio) {
+                ASSERT(bbio->bio.bi_iter.bi_size);
+                atomic_inc(&stripe->pending_io);
+                btrfs_submit_bio(bbio, mirror);
+                if (wait)
+                        wait_scrub_stripe_io(stripe);
         }
-
-        scrub_pending_bio_dec(sctx);
 }

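calc_next_mirror() above rotates through mirrors numbered 1..num_copies with wrap-around, so the repair loop below visits every other mirror exactly once before coming back to the failed one. A small sketch of that rotation (the sample mirror numbers are arbitrary):

#include <stdio.h>

/* Same rotation as calc_next_mirror(): mirrors are numbered 1..num_copies. */
static int calc_next_mirror(int mirror, int num_copies)
{
        return (mirror + 1 > num_copies) ? 1 : mirror + 1;
}

int main(void)
{
        int num_copies = 3, start = 2, mirror;

        /* Starting from failed mirror 2 this visits 3 and then 1. */
        for (mirror = calc_next_mirror(start, num_copies);
             mirror != start;
             mirror = calc_next_mirror(mirror, num_copies))
                printf("try mirror %d\n", mirror);
        return 0;
}
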
-static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
-                                       unsigned long *bitmap,
-                                       u64 start, u32 len)
+static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
+                                       struct scrub_stripe *stripe)
 {
-        u64 offset;
-        u32 nsectors;
-        u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
-
-        if (len >= sparity->stripe_len) {
-                bitmap_set(bitmap, 0, sparity->nsectors);
+        static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+                                      DEFAULT_RATELIMIT_BURST);
+        struct btrfs_fs_info *fs_info = sctx->fs_info;
+        struct btrfs_device *dev = NULL;
+        u64 physical = 0;
+        int nr_data_sectors = 0;
+        int nr_meta_sectors = 0;
+        int nr_nodatacsum_sectors = 0;
+        int nr_repaired_sectors = 0;
+        int sector_nr;
+
+        if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state))
                 return;
-        }

-        start -= sparity->logic_start;
-        start = div64_u64_rem(start, sparity->stripe_len, &offset);
-        offset = offset >> sectorsize_bits;
-        nsectors = len >> sectorsize_bits;
+        /*
+         * Init needed infos for error reporting.
+         *
+         * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
+         * thus no need for dev/physical, error reporting still needs dev and physical.
+         */
+        if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
+                u64 mapped_len = fs_info->sectorsize;
+                struct btrfs_io_context *bioc = NULL;
+                int stripe_index = stripe->mirror_num - 1;
+                int ret;

-        if (offset + nsectors <= sparity->nsectors) {
-                bitmap_set(bitmap, offset, nsectors);
-                return;
+                /* For scrub, our mirror_num should always start at 1. */
+                ASSERT(stripe->mirror_num >= 1);
+                ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
+                                       stripe->logical, &mapped_len, &bioc);
+                /*
+                 * If we failed, dev will be NULL, and later detailed reports
+                 * will just be skipped.
+                 */
+                if (ret < 0)
+                        goto skip;
+                physical = bioc->stripes[stripe_index].physical;
+                dev = bioc->stripes[stripe_index].dev;
+                btrfs_put_bioc(bioc);
         }

-        bitmap_set(bitmap, offset, sparity->nsectors - offset);
-        bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
-}
+skip:
+        for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
+                bool repaired = false;

-static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
-                                                   u64 start, u32 len)
-{
-        __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
-}
+                if (stripe->sectors[sector_nr].is_metadata) {
+                        nr_meta_sectors++;
+                } else {
+                        nr_data_sectors++;
+                        if (!stripe->sectors[sector_nr].csum)
+                                nr_nodatacsum_sectors++;
+                }

-static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
-                                                  u64 start, u32 len)
-{
-        __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
-}
+                if (test_bit(sector_nr, &stripe->init_error_bitmap) &&
+                    !test_bit(sector_nr, &stripe->error_bitmap)) {
+                        nr_repaired_sectors++;
+                        repaired = true;
+                }

-static void scrub_block_complete(struct scrub_block *sblock)
-{
-        int corrupted = 0;
+                /* Good sector from the beginning, nothing needs to be done. */
+                if (!test_bit(sector_nr, &stripe->init_error_bitmap))
+                        continue;

-        if (!sblock->no_io_error_seen) {
-                corrupted = 1;
-                scrub_handle_errored_block(sblock);
-        } else {
                 /*
-                 * if has checksum error, write via repair mechanism in
-                 * dev replace case, otherwise write here in dev replace
-                 * case.
+                 * Report error for the corrupted sectors. If repaired, just
+                 * output a message saying the sector has been repaired.
                  */
-                corrupted = scrub_checksum(sblock);
-                if (!corrupted && sblock->sctx->is_dev_replace)
-                        scrub_write_block_to_dev_replace(sblock);
-        }
+                if (repaired) {
+                        if (dev) {
+                                btrfs_err_rl_in_rcu(fs_info,
+                                        "fixed up error at logical %llu on dev %s physical %llu",
+                                        stripe->logical, btrfs_dev_name(dev),
+                                        physical);
+                        } else {
+                                btrfs_err_rl_in_rcu(fs_info,
+                                        "fixed up error at logical %llu on mirror %u",
+                                        stripe->logical, stripe->mirror_num);
+                        }
+                        continue;
+                }

-        if (sblock->sparity && corrupted && !sblock->data_corrected) {
-                u64 start = sblock->logical;
-                u64 end = sblock->logical +
-                          sblock->sectors[sblock->sector_count - 1]->offset +
-                          sblock->sctx->fs_info->sectorsize;
+                /* The remaining cases are all unrepaired sectors. */
+                if (dev) {
+                        btrfs_err_rl_in_rcu(fs_info,
+                                "unable to fixup (regular) error at logical %llu on dev %s physical %llu",
+                                stripe->logical, btrfs_dev_name(dev),
+                                physical);
+                } else {
+                        btrfs_err_rl_in_rcu(fs_info,
+                                "unable to fixup (regular) error at logical %llu on mirror %u",
+                                stripe->logical, stripe->mirror_num);
+                }

-                ASSERT(end - start <= U32_MAX);
-                scrub_parity_mark_sectors_error(sblock->sparity,
-                                                start, end - start);
+                if (test_bit(sector_nr, &stripe->io_error_bitmap))
+                        if (__ratelimit(&rs) && dev)
+                                scrub_print_common_warning("i/o error", dev, false,
+                                        stripe->logical, physical);
+                if (test_bit(sector_nr, &stripe->csum_error_bitmap))
+                        if (__ratelimit(&rs) && dev)
+                                scrub_print_common_warning("checksum error", dev, false,
+                                        stripe->logical, physical);
+                if (test_bit(sector_nr, &stripe->meta_error_bitmap))
+                        if (__ratelimit(&rs) && dev)
+                                scrub_print_common_warning("header error", dev, false,
+                                        stripe->logical, physical);
         }
-}

-static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
-{
-        sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
-        list_del(&sum->list);
-        kfree(sum);
+        spin_lock(&sctx->stat_lock);
+        sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
+        sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
+        sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
+        sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
+        sctx->stat.no_csum += nr_nodatacsum_sectors;
+        sctx->stat.read_errors +=
+                bitmap_weight(&stripe->io_error_bitmap, stripe->nr_sectors);
+        sctx->stat.csum_errors +=
+                bitmap_weight(&stripe->csum_error_bitmap, stripe->nr_sectors);
+        sctx->stat.verify_errors +=
+                bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors);
+        sctx->stat.uncorrectable_errors +=
+                bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
+        sctx->stat.corrected_errors += nr_repaired_sectors;
+        spin_unlock(&sctx->stat_lock);
 }

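The accounting above hinges on two bitmaps: a sector counts as repaired when its bit is set in init_error_bitmap but clear in the final error_bitmap, and the per-category error counters are bitmap weights. A minimal model of that arithmetic, assuming at most 64 sectors per stripe (the sample bit patterns are invented):

#include <stdint.h>
#include <stdio.h>

/* popcount over a 64-bit bitmap, like bitmap_weight() for <= 64 sectors */
static int weight(uint64_t bm)
{
        int n = 0;

        for (; bm; bm &= bm - 1)
                n++;
        return n;
}

int main(void)
{
        uint64_t init_error = 0x3c;     /* sectors 2-5 bad after first read */
        uint64_t error      = 0x20;     /* sector 5 still bad after repair */

        /* Repaired = bad initially but not any more. */
        printf("repaired=%d uncorrectable=%d\n",
               weight(init_error & ~error), weight(error));
        return 0;
}
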
 /*
- * Find the desired csum for range [logical, logical + sectorsize), and store
- * the csum into @csum.
+ * The main entry point for all read related scrub work, including:
  *
- * The search source is sctx->csum_list, which is a pre-populated list
- * storing bytenr ordered csum ranges. We're responsible to cleanup any range
- * that is before @logical.
+ * - Wait for the initial read to finish
+ * - Verify and locate any bad sectors
+ * - Go through the remaining mirrors and try to read as large a blocksize as
+ *   possible
+ * - Go through all mirrors (including the failed mirror) sector-by-sector
  *
- * Return 0 if there is no csum for the range.
- * Return 1 if there is csum for the range and copied to @csum.
+ * Writeback does not happen here, it needs extra synchronization.
  */
-static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
+static void scrub_stripe_read_repair_worker(struct work_struct *work)
 {
-        bool found = false;
-
-        while (!list_empty(&sctx->csum_list)) {
-                struct btrfs_ordered_sum *sum = NULL;
-                unsigned long index;
-                unsigned long num_sectors;
-
-                sum = list_first_entry(&sctx->csum_list,
-                                       struct btrfs_ordered_sum, list);
-                /* The current csum range is beyond our range, no csum found */
-                if (sum->bytenr > logical)
-                        break;
-
-                /*
-                 * The current sum is before our bytenr, since scrub is always
-                 * done in bytenr order, the csum will never be used anymore,
-                 * clean it up so that later calls won't bother with the range,
-                 * and continue search the next range.
-                 */
-                if (sum->bytenr + sum->len <= logical) {
-                        drop_csum_range(sctx, sum);
-                        continue;
-                }
-
-                /* Now the csum range covers our bytenr, copy the csum */
-                found = true;
-                index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
-                num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
+        struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
+        struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+        int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
+                                          stripe->bg->length);
+        int mirror;
+        int i;

-                memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
-                       sctx->fs_info->csum_size);
+        ASSERT(stripe->mirror_num > 0);

-                /* Cleanup the range if we're at the end of the csum range */
-                if (index == num_sectors - 1)
-                        drop_csum_range(sctx, sum);
-                break;
-        }
-        if (!found)
-                return 0;
-        return 1;
-}
+        wait_scrub_stripe_io(stripe);
+        scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap);
+        /* Save the initial failed bitmap for later repair and report usage. */
+        stripe->init_error_bitmap = stripe->error_bitmap;

-/* scrub extent tries to collect up to 64 kB for each bio */
-static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
-                        u64 logical, u32 len,
-                        u64 physical, struct btrfs_device *dev, u64 flags,
-                        u64 gen, int mirror_num)
-{
-        struct btrfs_device *src_dev = dev;
-        u64 src_physical = physical;
-        int src_mirror = mirror_num;
-        int ret;
-        u8 csum[BTRFS_CSUM_SIZE];
-        u32 blocksize;
+        if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
+                goto out;

-        if (flags & BTRFS_EXTENT_FLAG_DATA) {
-                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
-                        blocksize = map->stripe_len;
-                else
-                        blocksize = sctx->fs_info->sectorsize;
-                spin_lock(&sctx->stat_lock);
-                sctx->stat.data_extents_scrubbed++;
-                sctx->stat.data_bytes_scrubbed += len;
-                spin_unlock(&sctx->stat_lock);
-        } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
-                        blocksize = map->stripe_len;
-                else
-                        blocksize = sctx->fs_info->nodesize;
-                spin_lock(&sctx->stat_lock);
-                sctx->stat.tree_extents_scrubbed++;
-                sctx->stat.tree_bytes_scrubbed += len;
-                spin_unlock(&sctx->stat_lock);
-        } else {
-                blocksize = sctx->fs_info->sectorsize;
-                WARN_ON(1);
+        /*
+         * Try all remaining mirrors.
+         *
+         * Here we still try to read as large a block as possible, as this is
+         * faster and we have extra safety nets to rely on.
+         */
+        for (mirror = calc_next_mirror(stripe->mirror_num, num_copies);
+             mirror != stripe->mirror_num;
+             mirror = calc_next_mirror(mirror, num_copies)) {
+                const unsigned long old_error_bitmap = stripe->error_bitmap;
+
+                scrub_stripe_submit_repair_read(stripe, mirror,
+                                                BTRFS_STRIPE_LEN, false);
+                wait_scrub_stripe_io(stripe);
+                scrub_verify_one_stripe(stripe, old_error_bitmap);
+                if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+                        goto out;
         }

         /*
-         * For dev-replace case, we can have @dev being a missing device.
-         * Regular scrub will avoid its execution on missing device at all,
-         * as that would trigger tons of read error.
+         * Last safety net, try re-checking all mirrors, including the failed
+         * one, sector-by-sector.
          *
-         * Reading from missing device will cause read error counts to
-         * increase unnecessarily.
-         * So here we change the read source to a good mirror.
+         * If even one sector fails the drive's internal csum, the whole read
+         * containing the offending sector would be marked as an error.
+         * Thus here we read sector-by-sector.
+         *
+         * This can be slow, thus we only try it as the last resort.
          */
-        if (sctx->is_dev_replace && !dev->bdev)
-                scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
-                                     &src_dev, &src_mirror);
-        while (len) {
-                u32 l = min(len, blocksize);
-                int have_csum = 0;
-
-                if (flags & BTRFS_EXTENT_FLAG_DATA) {
-                        /* push csums to sbio */
-                        have_csum = scrub_find_csum(sctx, logical, csum);
-                        if (have_csum == 0)
-                                ++sctx->stat.no_csum;
-                }
-                ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
-                                    flags, gen, src_mirror,
-                                    have_csum ? csum : NULL, physical);
-                if (ret)
-                        return ret;
-                len -= l;
-                logical += l;
-                physical += l;
-                src_physical += l;
+
+        for (i = 0, mirror = stripe->mirror_num;
+             i < num_copies;
+             i++, mirror = calc_next_mirror(mirror, num_copies)) {
+                const unsigned long old_error_bitmap = stripe->error_bitmap;
+
+                scrub_stripe_submit_repair_read(stripe, mirror,
+                                                fs_info->sectorsize, true);
+                wait_scrub_stripe_io(stripe);
+                scrub_verify_one_stripe(stripe, old_error_bitmap);
+                if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+                        goto out;
         }
-        return 0;
+out:
+        scrub_stripe_report_errors(stripe->sctx, stripe);
+        set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
+        wake_up(&stripe->repair_wait);
 }

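The worker above escalates in two phases: first each remaining mirror is tried with large (stripe-sized) reads, then every mirror, including the original, is retried one sector at a time. A toy model of that control flow; repair_pass() is a stand-in for submit-and-reverify and its behavior here is fabricated:

#include <stdbool.h>
#include <stdio.h>

static bool repair_pass(unsigned long *error_bitmap, int mirror, int blocksize)
{
        (void)blocksize;
        /* Pretend mirror 2 can fix everything regardless of blocksize. */
        if (mirror == 2)
                *error_bitmap = 0;
        return *error_bitmap == 0;
}

static int next_mirror(int mirror, int num_copies)
{
        return (mirror + 1 > num_copies) ? 1 : mirror + 1;
}

int main(void)
{
        unsigned long errors = 0xf0;    /* initial bad sectors */
        int num_copies = 3, failed = 1, mirror, i;

        /* Phase 1: other mirrors, large (stripe-sized) reads. */
        for (mirror = next_mirror(failed, num_copies); mirror != failed;
             mirror = next_mirror(mirror, num_copies))
                if (repair_pass(&errors, mirror, 64 * 1024))
                        goto done;

        /* Phase 2: every mirror, one sector at a time. */
        for (i = 0, mirror = failed; i < num_copies;
             i++, mirror = next_mirror(mirror, num_copies))
                if (repair_pass(&errors, mirror, 4096))
                        goto done;
done:
        printf("remaining errors: %#lx\n", errors);
        return 0;
}
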
-static int scrub_sectors_for_parity(struct scrub_parity *sparity,
|
|
- u64 logical, u32 len,
|
|
- u64 physical, struct btrfs_device *dev,
|
|
- u64 flags, u64 gen, int mirror_num, u8 *csum)
|
|
+static void scrub_read_endio(struct btrfs_bio *bbio)
|
|
{
|
|
- struct scrub_ctx *sctx = sparity->sctx;
|
|
- struct scrub_block *sblock;
|
|
- const u32 sectorsize = sctx->fs_info->sectorsize;
|
|
- int index;
|
|
-
|
|
- ASSERT(IS_ALIGNED(len, sectorsize));
|
|
+ struct scrub_stripe *stripe = bbio->private;
|
|
|
|
- sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num);
|
|
- if (!sblock) {
|
|
- spin_lock(&sctx->stat_lock);
|
|
- sctx->stat.malloc_errors++;
|
|
- spin_unlock(&sctx->stat_lock);
|
|
- return -ENOMEM;
|
|
+ if (bbio->bio.bi_status) {
|
|
+ bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
|
|
+ bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
|
|
+ } else {
|
|
+ bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
|
|
+ }
|
|
+ bio_put(&bbio->bio);
|
|
+ if (atomic_dec_and_test(&stripe->pending_io)) {
+ wake_up(&stripe->io_wait);
+ INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
+ queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
}
+}

- sblock->sparity = sparity;
- scrub_parity_get(sparity);
+static void scrub_write_endio(struct btrfs_bio *bbio)
+{
+ struct scrub_stripe *stripe = bbio->private;
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ struct bio_vec *bvec;
+ int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
+ u32 bio_size = 0;
+ int i;

- for (index = 0; len > 0; index++) {
- struct scrub_sector *sector;
+ bio_for_each_bvec_all(bvec, &bbio->bio, i)
+ bio_size += bvec->bv_len;

- sector = alloc_scrub_sector(sblock, logical);
- if (!sector) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- spin_unlock(&sctx->stat_lock);
- scrub_block_put(sblock);
- return -ENOMEM;
- }
- sblock->sectors[index] = sector;
- /* For scrub parity */
- scrub_sector_get(sector);
- list_add_tail(&sector->list, &sparity->sectors_list);
- sector->flags = flags;
- sector->generation = gen;
- if (csum) {
- sector->have_csum = 1;
- memcpy(sector->csum, csum, sctx->fs_info->csum_size);
- } else {
- sector->have_csum = 0;
- }
+ if (bbio->bio.bi_status) {
+ unsigned long flags;

- /* Iterate over the stripe range in sectorsize steps */
- len -= sectorsize;
- logical += sectorsize;
- physical += sectorsize;
+ spin_lock_irqsave(&stripe->write_error_lock, flags);
+ bitmap_set(&stripe->write_error_bitmap, sector_nr,
+ bio_size >> fs_info->sectorsize_bits);
+ spin_unlock_irqrestore(&stripe->write_error_lock, flags);
}
+ bio_put(&bbio->bio);
+
+ if (atomic_dec_and_test(&stripe->pending_io))
+ wake_up(&stripe->io_wait);
+}
+
+/*
+ * Submit the write bio(s) for the sectors specified by @write_bitmap.
+ *
+ * Here we utilize btrfs_submit_repair_write(), which has some extra benefits:
+ *
+ * - Only needs logical bytenr and mirror_num
+ * Just like the scrub read path
+ *
+ * - Would only result in writes to the specified mirror
+ * Unlike the regular writeback path, which would write back to all stripes
+ *
+ * - Handle dev-replace and read-repair writeback differently
+ */
+static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
+ unsigned long write_bitmap, bool dev_replace)
+{
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ struct btrfs_bio *bbio = NULL;
+ const bool zoned = btrfs_is_zoned(fs_info);
+ int sector_nr;

- WARN_ON(sblock->sector_count == 0);
- for (index = 0; index < sblock->sector_count; index++) {
- struct scrub_sector *sector = sblock->sectors[index];
+ for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
+ struct page *page = scrub_stripe_get_page(stripe, sector_nr);
+ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
int ret;

- ret = scrub_add_sector_to_rd_bio(sctx, sector);
- if (ret) {
- scrub_block_put(sblock);
- return ret;
+ /* We should only writeback sectors covered by an extent. */
+ ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap));
+
+ /* Cannot merge with previous sector, submit the current one. */
+ if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
+ fill_writer_pointer_gap(sctx, stripe->physical +
+ (sector_nr << fs_info->sectorsize_bits));
+ atomic_inc(&stripe->pending_io);
+ btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
+ /* For zoned writeback, queue depth must be 1. */
+ if (zoned)
+ wait_scrub_stripe_io(stripe);
+ bbio = NULL;
}
+ if (!bbio) {
+ bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE,
+ fs_info, scrub_write_endio, stripe);
+ bbio->bio.bi_iter.bi_sector = (stripe->logical +
+ (sector_nr << fs_info->sectorsize_bits)) >>
+ SECTOR_SHIFT;
+ }
+ ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+ ASSERT(ret == fs_info->sectorsize);
+ }
+ if (bbio) {
+ fill_writer_pointer_gap(sctx, bbio->bio.bi_iter.bi_sector <<
+ SECTOR_SHIFT);
+ atomic_inc(&stripe->pending_io);
+ btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
+ if (zoned)
+ wait_scrub_stripe_io(stripe);
}
-
- /* Last one frees, either here or in bio completion for last sector */
- scrub_block_put(sblock);
- return 0;
}
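
[Editor's note: scrub_write_sectors() above walks @write_bitmap and merges runs of consecutive dirty sectors into a single write bio, submitting whenever it hits a gap. A minimal userspace model of that merge loop, with all names hypothetical and not kernel code:]

	/* Model of the bitmap-merge loop: consecutive set bits become one
	 * "bio"; a cleared bit in between forces a submit of the pending
	 * range, mirroring the "cannot merge" branch above. */
	#include <stdio.h>

	static void submit(int first, int nr)
	{
		printf("submit sectors [%d, %d)\n", first, first + nr);
	}

	int main(void)
	{
		unsigned long write_bitmap = 0xDD;	/* sectors 0,2,3,4,6,7 */
		const int nr_sectors = 8;
		int pending_first = -1, pending_nr = 0;

		for (int i = 0; i < nr_sectors; i++) {
			if (!(write_bitmap & (1UL << i)))
				continue;
			/* Previous sector not set: flush the pending run. */
			if (pending_nr && !(write_bitmap & (1UL << (i - 1)))) {
				submit(pending_first, pending_nr);
				pending_nr = 0;
			}
			if (!pending_nr)
				pending_first = i;
			pending_nr++;
		}
		if (pending_nr)
			submit(pending_first, pending_nr);
		return 0;	/* prints [0,1) [2,5) [6,8) */
	}
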

-static int scrub_extent_for_parity(struct scrub_parity *sparity,
- u64 logical, u32 len,
- u64 physical, struct btrfs_device *dev,
- u64 flags, u64 gen, int mirror_num)
+/*
+ * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
+ * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
+ */
+static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device,
+ unsigned int bio_size)
{
- struct scrub_ctx *sctx = sparity->sctx;
- int ret;
- u8 csum[BTRFS_CSUM_SIZE];
- u32 blocksize;
+ const int time_slice = 1000;
+ s64 delta;
+ ktime_t now;
+ u32 div;
+ u64 bwlimit;

- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
- scrub_parity_mark_sectors_error(sparity, logical, len);
- return 0;
+ bwlimit = READ_ONCE(device->scrub_speed_max);
+ if (bwlimit == 0)
+ return;
+
+ /*
+ * Slice is divided into intervals when the IO is submitted, adjust by
+ * bwlimit and maximum of 64 intervals.
+ */
+ div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
+ div = min_t(u32, 64, div);
+
+ /* Start new epoch, set deadline */
+ now = ktime_get();
+ if (sctx->throttle_deadline == 0) {
+ sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
+ sctx->throttle_sent = 0;
}

- if (flags & BTRFS_EXTENT_FLAG_DATA) {
- blocksize = sparity->stripe_len;
- } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- blocksize = sparity->stripe_len;
+ /* Still in the time to send? */
+ if (ktime_before(now, sctx->throttle_deadline)) {
+ /* If current bio is within the limit, send it */
+ sctx->throttle_sent += bio_size;
+ if (sctx->throttle_sent <= div_u64(bwlimit, div))
+ return;
+
+ /* We're over the limit, sleep until the rest of the slice */
+ delta = ktime_ms_delta(sctx->throttle_deadline, now);
} else {
- blocksize = sctx->fs_info->sectorsize;
- WARN_ON(1);
+ /* New request after deadline, start new epoch */
+ delta = 0;
}

- while (len) {
- u32 l = min(len, blocksize);
- int have_csum = 0;
+ if (delta) {
+ long timeout;

- if (flags & BTRFS_EXTENT_FLAG_DATA) {
- /* push csums to sbio */
- have_csum = scrub_find_csum(sctx, logical, csum);
- if (have_csum == 0)
- goto skip;
- }
- ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
- flags, gen, mirror_num,
- have_csum ? csum : NULL);
- if (ret)
- return ret;
-skip:
- len -= l;
- logical += l;
- physical += l;
+ timeout = div_u64(delta * HZ, 1000);
+ schedule_timeout_interruptible(timeout);
}
- return 0;
+
+ /* Next call will start the deadline period */
+ sctx->throttle_deadline = 0;
}
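
[Editor's note: the throttle above slices each second into up to 64 intervals sized by the scrub_speed_max limit and sleeps once the per-interval byte budget is spent. A hypothetical userspace restatement of that budget arithmetic, using an assumed 100 MiB/s limit:]

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		const uint64_t bwlimit = 100ULL << 20;	/* assumed 100 MiB/s limit */
		const int time_slice = 1000;		/* one second, in ms */
		uint32_t div;

		/* One interval per 16 MiB/s of allowed bandwidth, capped at 64. */
		div = (uint32_t)(bwlimit / (16 * 1024 * 1024));
		if (div < 1)
			div = 1;
		if (div > 64)
			div = 64;

		/* 100 MiB/s gives 6 intervals of ~166 ms with ~16.6 MiB each. */
		printf("intervals: %u, interval: %u ms, budget: %llu bytes\n",
		       div, time_slice / div,
		       (unsigned long long)(bwlimit / div));
		return 0;
	}
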

/*
@@ -2908,10 +1266,7 @@ static int get_raid56_logic_offset(u64 physical, int num,
{
int i;
int j = 0;
- u64 stripe_nr;
u64 last_offset;
- u32 stripe_index;
- u32 rot;
const int data_stripes = nr_data_stripes(map);

last_offset = (physical - map->stripes[num].physical) * data_stripes;
@@ -2920,13 +1275,17 @@ static int get_raid56_logic_offset(u64 physical, int num,

*offset = last_offset;
for (i = 0; i < data_stripes; i++) {
- *offset = last_offset + i * map->stripe_len;
+ u32 stripe_nr;
+ u32 stripe_index;
+ u32 rot;

- stripe_nr = div64_u64(*offset, map->stripe_len);
- stripe_nr = div_u64(stripe_nr, data_stripes);
+ *offset = last_offset + (i << BTRFS_STRIPE_LEN_SHIFT);
+
+ stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;

/* Work out the disk rotation on this stripe-set */
- stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
+ rot = stripe_nr % map->num_stripes;
+ stripe_nr /= map->num_stripes;
/* calculate which stripe this data locates */
rot += i;
stripe_index = rot % map->num_stripes;
@@ -2935,123 +1294,10 @@ static int get_raid56_logic_offset(u64 physical, int num,
if (stripe_index < num)
j++;
}
- *offset = last_offset + j * map->stripe_len;
+ *offset = last_offset + (j << BTRFS_STRIPE_LEN_SHIFT);
return 1;
}
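
[Editor's note: the rewritten rotation math above replaces 64-bit divisions with shifts by BTRFS_STRIPE_LEN_SHIFT. A small hypothetical model of the same arithmetic for a 3-device RAID5 (two data stripes), showing which device slot each data-stripe position of a stripe-set lands on:]

	#include <stdio.h>

	#define STRIPE_SHIFT 16	/* BTRFS_STRIPE_LEN == 64K */

	int main(void)
	{
		const int num_stripes = 3, data_stripes = 2;
		const int num = 1;	/* device being scrubbed (assumed) */
		unsigned long long last_offset = 2ULL << STRIPE_SHIFT;

		for (int i = 0; i < data_stripes; i++) {
			unsigned long long offset = last_offset +
				((unsigned long long)i << STRIPE_SHIFT);
			unsigned int stripe_nr =
				(unsigned int)(offset >> STRIPE_SHIFT) / data_stripes;
			unsigned int rot = stripe_nr % num_stripes;
			unsigned int stripe_index = (rot + i) % num_stripes;

			printf("slot %d -> device %u%s\n", i, stripe_index,
			       stripe_index == num ? " (hit)" : "");
		}
		return 0;
	}
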

-static void scrub_free_parity(struct scrub_parity *sparity)
-{
- struct scrub_ctx *sctx = sparity->sctx;
- struct scrub_sector *curr, *next;
- int nbits;
-
- nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
- if (nbits) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.read_errors += nbits;
- sctx->stat.uncorrectable_errors += nbits;
- spin_unlock(&sctx->stat_lock);
- }
-
- list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
- list_del_init(&curr->list);
- scrub_sector_put(curr);
- }
-
- kfree(sparity);
-}
-
-static void scrub_parity_bio_endio_worker(struct work_struct *work)
-{
- struct scrub_parity *sparity = container_of(work, struct scrub_parity,
- work);
- struct scrub_ctx *sctx = sparity->sctx;
-
- btrfs_bio_counter_dec(sctx->fs_info);
- scrub_free_parity(sparity);
- scrub_pending_bio_dec(sctx);
-}
-
-static void scrub_parity_bio_endio(struct bio *bio)
-{
- struct scrub_parity *sparity = bio->bi_private;
- struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
-
- if (bio->bi_status)
- bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
- &sparity->dbitmap, sparity->nsectors);
-
- bio_put(bio);
-
- INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
- queue_work(fs_info->scrub_parity_workers, &sparity->work);
-}
-
-static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
-{
- struct scrub_ctx *sctx = sparity->sctx;
- struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct bio *bio;
- struct btrfs_raid_bio *rbio;
- struct btrfs_io_context *bioc = NULL;
- u64 length;
- int ret;
-
- if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
- &sparity->ebitmap, sparity->nsectors))
- goto out;
-
- length = sparity->logic_end - sparity->logic_start;
-
- btrfs_bio_counter_inc_blocked(fs_info);
- ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
- &length, &bioc);
- if (ret || !bioc || !bioc->raid_map)
- goto bioc_out;
-
- bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
- bio->bi_iter.bi_sector = sparity->logic_start >> 9;
- bio->bi_private = sparity;
- bio->bi_end_io = scrub_parity_bio_endio;
-
- rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
- sparity->scrub_dev,
- &sparity->dbitmap,
- sparity->nsectors);
- btrfs_put_bioc(bioc);
- if (!rbio)
- goto rbio_out;
-
- scrub_pending_bio_inc(sctx);
- raid56_parity_submit_scrub_rbio(rbio);
- return;
-
-rbio_out:
- bio_put(bio);
-bioc_out:
- btrfs_bio_counter_dec(fs_info);
- bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
- sparity->nsectors);
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- spin_unlock(&sctx->stat_lock);
-out:
- scrub_free_parity(sparity);
-}
-
-static void scrub_parity_get(struct scrub_parity *sparity)
-{
- refcount_inc(&sparity->refs);
-}
-
-static void scrub_parity_put(struct scrub_parity *sparity)
-{
- if (!refcount_dec_and_test(&sparity->refs))
- return;
-
- scrub_parity_check_and_repair(sparity);
-}
-
/*
* Return 0 if the extent item range covers any byte of the range.
* Return <0 if the extent item is before @search_start.
@@ -3178,226 +1424,533 @@ static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
*generation_ret = btrfs_extent_generation(path->nodes[0], ei);
}

-static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
- u64 boundary_start, u64 boudary_len)
+static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
+ u64 physical, u64 physical_end)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ int ret = 0;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ mutex_lock(&sctx->wr_lock);
+ if (sctx->write_pointer < physical_end) {
+ ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
+ physical,
+ sctx->write_pointer);
+ if (ret)
+ btrfs_err(fs_info,
+ "zoned: failed to recover write pointer");
+ }
+ mutex_unlock(&sctx->wr_lock);
+ btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
+
+ return ret;
+}
+
+static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
+ struct scrub_stripe *stripe,
+ u64 extent_start, u64 extent_len,
+ u64 extent_flags, u64 extent_gen)
+{
+ for (u64 cur_logical = max(stripe->logical, extent_start);
+ cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN,
+ extent_start + extent_len);
+ cur_logical += fs_info->sectorsize) {
+ const int nr_sector = (cur_logical - stripe->logical) >>
+ fs_info->sectorsize_bits;
+ struct scrub_sector_verification *sector =
+ &stripe->sectors[nr_sector];
+
+ set_bit(nr_sector, &stripe->extent_sector_bitmap);
+ if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ sector->is_metadata = true;
+ sector->generation = extent_gen;
+ }
+ }
+}
+
+static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
+{
+ stripe->extent_sector_bitmap = 0;
+ stripe->init_error_bitmap = 0;
+ stripe->error_bitmap = 0;
+ stripe->io_error_bitmap = 0;
+ stripe->csum_error_bitmap = 0;
+ stripe->meta_error_bitmap = 0;
+}
+
+/*
+ * Locate one stripe which has at least one extent in its range.
+ *
+ * Return 0 if found such stripe, and store its info into @stripe.
+ * Return >0 if there is no such stripe in the specified range.
+ * Return <0 for error.
+ */
+static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
+ struct btrfs_device *dev, u64 physical,
+ int mirror_num, u64 logical_start,
+ u32 logical_len,
+ struct scrub_stripe *stripe)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
+ const u64 logical_end = logical_start + logical_len;
+ struct btrfs_path path = { 0 };
+ u64 cur_logical = logical_start;
+ u64 stripe_end;
+ u64 extent_start;
+ u64 extent_len;
+ u64 extent_flags;
+ u64 extent_gen;
+ int ret;
+
+ memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
+ stripe->nr_sectors);
+ scrub_stripe_reset_bitmaps(stripe);
+
+ /* The range must be inside the bg. */
+ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
+
+ path.search_commit_root = 1;
+ path.skip_locking = 1;
+
+ ret = find_first_extent_item(extent_root, &path, logical_start, logical_len);
+ /* Either error or not found. */
+ if (ret)
+ goto out;
+ get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen);
+ if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ stripe->nr_meta_extents++;
+ if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
+ stripe->nr_data_extents++;
+ cur_logical = max(extent_start, cur_logical);
+
+ /*
+ * Round down to stripe boundary.
+ *
+ * The extra calculation against bg->start is to handle block groups
+ * whose logical bytenr is not BTRFS_STRIPE_LEN aligned.
+ */
+ stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) +
+ bg->start;
+ stripe->physical = physical + stripe->logical - logical_start;
+ stripe->dev = dev;
+ stripe->bg = bg;
+ stripe->mirror_num = mirror_num;
+ stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1;
+
+ /* Fill the first extent info into stripe->sectors[] array. */
+ fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
+ extent_flags, extent_gen);
+ cur_logical = extent_start + extent_len;
+
+ /* Fill the extent info for the remaining sectors. */
+ while (cur_logical <= stripe_end) {
+ ret = find_first_extent_item(extent_root, &path, cur_logical,
+ stripe_end - cur_logical + 1);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ get_extent_info(&path, &extent_start, &extent_len,
+ &extent_flags, &extent_gen);
+ if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ stripe->nr_meta_extents++;
+ if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
+ stripe->nr_data_extents++;
+ fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
+ extent_flags, extent_gen);
+ cur_logical = extent_start + extent_len;
+ }
+
+ /* Now fill the data csum. */
+ if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
+ int sector_nr;
+ unsigned long csum_bitmap = 0;
+
+ /* Csum space should have already been allocated. */
+ ASSERT(stripe->csums);
+
+ /*
+ * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN
+ * should contain at most 16 sectors.
+ */
+ ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
+
+ ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical,
+ stripe_end, stripe->csums,
+ &csum_bitmap, true);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ ret = 0;
+
+ for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) {
+ stripe->sectors[sector_nr].csum = stripe->csums +
+ sector_nr * fs_info->csum_size;
+ }
+ }
+ set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
+out:
+ btrfs_release_path(&path);
+ return ret;
+}
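
[Editor's note: the csum-bitmap ASSERT above encodes simple arithmetic: a 64K BTRFS_STRIPE_LEN with 4K sectors yields 64K / 4K = 16 sectors, so one unsigned long of at least 64 bits holds one bit per sector. A hypothetical userspace restatement of that check:]

	#include <assert.h>
	#include <limits.h>

	int main(void)
	{
		const unsigned int stripe_len = 64 * 1024;	/* BTRFS_STRIPE_LEN */
		const unsigned int sectorsize_bits = 12;	/* 4K sectors */

		/* 65536 >> 12 == 16 sectors, well within 64 bits. */
		assert(sizeof(unsigned long) * CHAR_BIT >=
		       (stripe_len >> sectorsize_bits));
		return 0;
	}
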
+
+static void scrub_reset_stripe(struct scrub_stripe *stripe)
+{
+ scrub_stripe_reset_bitmaps(stripe);
+
+ stripe->nr_meta_extents = 0;
+ stripe->nr_data_extents = 0;
+ stripe->state = 0;
+
+ for (int i = 0; i < stripe->nr_sectors; i++) {
+ stripe->sectors[i].is_metadata = false;
+ stripe->sectors[i].csum = NULL;
+ stripe->sectors[i].generation = 0;
+ }
+}
+
+static void scrub_submit_initial_read(struct scrub_ctx *sctx,
+ struct scrub_stripe *stripe)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_bio *bbio;
+ int mirror = stripe->mirror_num;
+
+ ASSERT(stripe->bg);
+ ASSERT(stripe->mirror_num > 0);
+ ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
+
+ bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
+ scrub_read_endio, stripe);
+
+ /* Read the whole stripe. */
+ bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
+ for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
+ int ret;
+
+ ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
+ /* We should have allocated enough bio vectors. */
+ ASSERT(ret == PAGE_SIZE);
+ }
+ atomic_inc(&stripe->pending_io);
+
+ /*
+ * For dev-replace, either user asks to avoid the source dev, or
+ * the device is missing, we try the next mirror instead.
+ */
+ if (sctx->is_dev_replace &&
+ (fs_info->dev_replace.cont_reading_from_srcdev_mode ==
+ BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID ||
+ !stripe->dev->bdev)) {
+ int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
+ stripe->bg->length);
+
+ mirror = calc_next_mirror(mirror, num_copies);
+ }
+ btrfs_submit_bio(bbio, mirror);
+}
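
[Editor's note: calc_next_mirror(), used above for the dev-replace fallback read, is defined elsewhere in this patch series and is not visible in this hunk. A sketch of the presumed wraparound behaviour, assuming 1-based mirror numbering; treat this as an illustration, not the actual helper:]

	#include <stdio.h>

	static int calc_next_mirror(int mirror, int num_copies)
	{
		/* Advance to the next copy, wrapping back to mirror 1. */
		return (mirror == num_copies) ? 1 : mirror + 1;
	}

	int main(void)
	{
		/* With two copies, the fallback read goes 1 -> 2 -> 1. */
		printf("%d %d\n", calc_next_mirror(1, 2), calc_next_mirror(2, 2));
		return 0;
	}
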
+
+static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
{
- return (extent_start < boundary_start &&
- extent_start + extent_len > boundary_start) ||
- (extent_start < boundary_start + boudary_len &&
- extent_start + extent_len > boundary_start + boudary_len);
+ int i;
+
+ for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) {
+ if (stripe->sectors[i].is_metadata) {
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+
+ btrfs_err(fs_info,
+ "stripe %llu has unrepaired metadata sector at %llu",
+ stripe->logical,
+ stripe->logical + (i << fs_info->sectorsize_bits));
+ return true;
+ }
+ }
+ return false;
}

-static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
- struct scrub_parity *sparity,
- struct map_lookup *map,
- struct btrfs_device *sdev,
- struct btrfs_path *path,
- u64 logical)
+static int flush_scrub_stripes(struct scrub_ctx *sctx)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
- struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
- u64 cur_logical = logical;
- int ret;
+ struct scrub_stripe *stripe;
+ const int nr_stripes = sctx->cur_stripe;
+ int ret = 0;

- ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ if (!nr_stripes)
+ return 0;

- /* Path must not be populated */
- ASSERT(!path->nodes[0]);
+ ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));

- while (cur_logical < logical + map->stripe_len) {
- struct btrfs_io_context *bioc = NULL;
- struct btrfs_device *extent_dev;
- u64 extent_start;
- u64 extent_size;
- u64 mapped_length;
- u64 extent_flags;
- u64 extent_gen;
- u64 extent_physical;
- u64 extent_mirror_num;
-
- ret = find_first_extent_item(extent_root, path, cur_logical,
- logical + map->stripe_len - cur_logical);
- /* No more extent item in this data stripe */
- if (ret > 0) {
- ret = 0;
- break;
- }
- if (ret < 0)
- break;
- get_extent_info(path, &extent_start, &extent_size, &extent_flags,
- &extent_gen);
+ scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
+ nr_stripes << BTRFS_STRIPE_LEN_SHIFT);
+ for (int i = 0; i < nr_stripes; i++) {
+ stripe = &sctx->stripes[i];
+ scrub_submit_initial_read(sctx, stripe);
+ }

- /* Metadata should not cross stripe boundaries */
- if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
- does_range_cross_boundary(extent_start, extent_size,
- logical, map->stripe_len)) {
- btrfs_err(fs_info,
- "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
- extent_start, logical);
- spin_lock(&sctx->stat_lock);
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- cur_logical += extent_size;
- continue;
- }
+ for (int i = 0; i < nr_stripes; i++) {
+ stripe = &sctx->stripes[i];
+
+ wait_event(stripe->repair_wait,
+ test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
+ }

- /* Skip hole range which doesn't have any extent */
- cur_logical = max(extent_start, cur_logical);
+ /*
+ * Submit the repaired sectors. For zoned case, we cannot do repair
+ * in-place, but queue the bg to be relocated.
+ */
+ if (btrfs_is_zoned(fs_info)) {
+ for (int i = 0; i < nr_stripes; i++) {
+ stripe = &sctx->stripes[i];

- /* Truncate the range inside this data stripe */
- extent_size = min(extent_start + extent_size,
- logical + map->stripe_len) - cur_logical;
- extent_start = cur_logical;
- ASSERT(extent_size <= U32_MAX);
+ if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) {
+ btrfs_repair_one_zone(fs_info,
+ sctx->stripes[0].bg->start);
+ break;
+ }
+ }
+ } else {
+ for (int i = 0; i < nr_stripes; i++) {
+ unsigned long repaired;

- scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
+ stripe = &sctx->stripes[i];

- mapped_length = extent_size;
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
- &mapped_length, &bioc, 0);
- if (!ret && (!bioc || mapped_length < extent_size))
- ret = -EIO;
- if (ret) {
- btrfs_put_bioc(bioc);
- scrub_parity_mark_sectors_error(sparity, extent_start,
- extent_size);
- break;
+ bitmap_andnot(&repaired, &stripe->init_error_bitmap,
+ &stripe->error_bitmap, stripe->nr_sectors);
+ scrub_write_sectors(sctx, stripe, repaired, false);
}
- extent_physical = bioc->stripes[0].physical;
- extent_mirror_num = bioc->mirror_num;
- extent_dev = bioc->stripes[0].dev;
- btrfs_put_bioc(bioc);
+ }

- ret = btrfs_lookup_csums_list(csum_root, extent_start,
- extent_start + extent_size - 1,
- &sctx->csum_list, 1, false);
- if (ret) {
- scrub_parity_mark_sectors_error(sparity, extent_start,
- extent_size);
- break;
+ /* Submit for dev-replace. */
+ if (sctx->is_dev_replace) {
+ /*
+ * For dev-replace, if we know there is something wrong with
+ * metadata, we should immediately abort.
+ */
+ for (int i = 0; i < nr_stripes; i++) {
+ if (stripe_has_metadata_error(&sctx->stripes[i])) {
+ ret = -EIO;
+ goto out;
+ }
}
+ for (int i = 0; i < nr_stripes; i++) {
+ unsigned long good;

- ret = scrub_extent_for_parity(sparity, extent_start,
- extent_size, extent_physical,
- extent_dev, extent_flags,
- extent_gen, extent_mirror_num);
- scrub_free_csums(sctx);
+ stripe = &sctx->stripes[i];

- if (ret) {
- scrub_parity_mark_sectors_error(sparity, extent_start,
- extent_size);
- break;
+ ASSERT(stripe->dev == fs_info->dev_replace.srcdev);
+
+ bitmap_andnot(&good, &stripe->extent_sector_bitmap,
+ &stripe->error_bitmap, stripe->nr_sectors);
+ scrub_write_sectors(sctx, stripe, good, true);
}
+ }

- cond_resched();
- cur_logical += extent_size;
+ /* Wait for the above writebacks to finish. */
+ for (int i = 0; i < nr_stripes; i++) {
+ stripe = &sctx->stripes[i];
+
+ wait_scrub_stripe_io(stripe);
+ scrub_reset_stripe(stripe);
}
- btrfs_release_path(path);
+out:
+ sctx->cur_stripe = 0;
return ret;
}
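
[Editor's note: the "repaired" mask computed in flush_scrub_stripes() is init_error_bitmap AND NOT error_bitmap, i.e. sectors that failed the initial read but verify after repair. A tiny hypothetical demo of the same bit algebra:]

	#include <stdio.h>

	int main(void)
	{
		unsigned long init_error = 0xE;	/* sectors 1-3 bad at first read */
		unsigned long still_bad  = 0x4;	/* sector 2 never repaired */
		unsigned long repaired   = init_error & ~still_bad;

		/* Only sectors 1 and 3 get written back: 0xa. */
		printf("write back sectors: 0x%lx\n", repaired);
		return 0;
	}
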

-static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
- struct map_lookup *map,
- struct btrfs_device *sdev,
- u64 logic_start,
- u64 logic_end)
+static void raid56_scrub_wait_endio(struct bio *bio)
{
- struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_path *path;
- u64 cur_logical;
+ complete(bio->bi_private);
+}
+
+static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg,
+ struct btrfs_device *dev, int mirror_num,
+ u64 logical, u32 length, u64 physical)
+{
+ struct scrub_stripe *stripe;
int ret;
- struct scrub_parity *sparity;
- int nsectors;

- path = btrfs_alloc_path();
- if (!path) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- spin_unlock(&sctx->stat_lock);
- return -ENOMEM;
+ /* No available slot, submit all stripes and wait for them. */
+ if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) {
+ ret = flush_scrub_stripes(sctx);
+ if (ret < 0)
+ return ret;
}
- path->search_commit_root = 1;
- path->skip_locking = 1;

- ASSERT(map->stripe_len <= U32_MAX);
- nsectors = map->stripe_len >> fs_info->sectorsize_bits;
- ASSERT(nsectors <= BITS_PER_LONG);
- sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
- if (!sparity) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- spin_unlock(&sctx->stat_lock);
- btrfs_free_path(path);
- return -ENOMEM;
- }
+ stripe = &sctx->stripes[sctx->cur_stripe];
+
+ /* We can queue one stripe using the remaining slot. */
+ scrub_reset_stripe(stripe);
+ ret = scrub_find_fill_first_stripe(bg, dev, physical, mirror_num,
+ logical, length, stripe);
+ /* Either >0 as no more extents or <0 for error. */
+ if (ret)
+ return ret;
+ sctx->cur_stripe++;
+ return 0;
+}
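
[Editor's note: callers of queue_scrub_stripe() never flush explicitly while iterating; the helper flushes on their behalf once all slots are used, and one final flush drains the partial batch. A hypothetical userspace model of that slot protocol, using 4 slots in place of SCRUB_STRIPES_PER_SCTX:]

	#include <stdio.h>

	#define SLOTS 4

	static int cur_stripe;

	static void flush(void)
	{
		printf("flush %d stripes\n", cur_stripe);
		cur_stripe = 0;
	}

	static void queue_stripe(int nr)
	{
		if (cur_stripe >= SLOTS)	/* no free slot: submit and wait */
			flush();
		printf("queued stripe %d in slot %d\n", nr, cur_stripe);
		cur_stripe++;
	}

	int main(void)
	{
		for (int i = 0; i < 10; i++)
			queue_stripe(i);
		flush();			/* drain the final partial batch */
		return 0;
	}
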

- ASSERT(map->stripe_len <= U32_MAX);
- sparity->stripe_len = map->stripe_len;
- sparity->nsectors = nsectors;
- sparity->sctx = sctx;
- sparity->scrub_dev = sdev;
- sparity->logic_start = logic_start;
- sparity->logic_end = logic_end;
- refcount_set(&sparity->refs, 1);
- INIT_LIST_HEAD(&sparity->sectors_list);
+static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
+ struct btrfs_device *scrub_dev,
+ struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ u64 full_stripe_start)
+{
+ DECLARE_COMPLETION_ONSTACK(io_done);
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_raid_bio *rbio;
+ struct btrfs_io_context *bioc = NULL;
+ struct bio *bio;
+ struct scrub_stripe *stripe;
+ bool all_empty = true;
+ const int data_stripes = nr_data_stripes(map);
+ unsigned long extent_bitmap = 0;
+ u64 length = data_stripes << BTRFS_STRIPE_LEN_SHIFT;
+ int ret;

- ret = 0;
- for (cur_logical = logic_start; cur_logical < logic_end;
- cur_logical += map->stripe_len) {
- ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
- sdev, path, cur_logical);
+ ASSERT(sctx->raid56_data_stripes);
+
+ for (int i = 0; i < data_stripes; i++) {
+ int stripe_index;
+ int rot;
+ u64 physical;
+
+ stripe = &sctx->raid56_data_stripes[i];
+ rot = div_u64(full_stripe_start - bg->start,
+ data_stripes) >> BTRFS_STRIPE_LEN_SHIFT;
+ stripe_index = (i + rot) % map->num_stripes;
+ physical = map->stripes[stripe_index].physical +
+ (rot << BTRFS_STRIPE_LEN_SHIFT);
+
+ scrub_reset_stripe(stripe);
+ set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state);
+ ret = scrub_find_fill_first_stripe(bg,
+ map->stripes[stripe_index].dev, physical, 1,
+ full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT),
+ BTRFS_STRIPE_LEN, stripe);
if (ret < 0)
+ goto out;
+ /*
+ * No extent in this data stripe, need to manually mark them
+ * initialized to make later read submission happy.
+ */
+ if (ret > 0) {
+ stripe->logical = full_stripe_start +
+ (i << BTRFS_STRIPE_LEN_SHIFT);
+ stripe->dev = map->stripes[stripe_index].dev;
+ stripe->mirror_num = 1;
+ set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
+ }
+ }
+
+ /* Check if all data stripes are empty. */
+ for (int i = 0; i < data_stripes; i++) {
+ stripe = &sctx->raid56_data_stripes[i];
+ if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) {
+ all_empty = false;
break;
+ }
+ }
+ if (all_empty) {
+ ret = 0;
+ goto out;
}

- scrub_parity_put(sparity);
- scrub_submit(sctx);
- mutex_lock(&sctx->wr_lock);
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
+ for (int i = 0; i < data_stripes; i++) {
+ stripe = &sctx->raid56_data_stripes[i];
+ scrub_submit_initial_read(sctx, stripe);
+ }
+ for (int i = 0; i < data_stripes; i++) {
+ stripe = &sctx->raid56_data_stripes[i];

- btrfs_free_path(path);
- return ret < 0 ? ret : 0;
-}
+ wait_event(stripe->repair_wait,
+ test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
+ }
+ /* For now, no zoned support for RAID56. */
+ ASSERT(!btrfs_is_zoned(sctx->fs_info));

-static void sync_replace_for_zoned(struct scrub_ctx *sctx)
-{
- if (!btrfs_is_zoned(sctx->fs_info))
- return;
+ /* Writeback for the repaired sectors. */
+ for (int i = 0; i < data_stripes; i++) {
+ unsigned long repaired;

- sctx->flush_all_writes = true;
- scrub_submit(sctx);
- mutex_lock(&sctx->wr_lock);
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
+ stripe = &sctx->raid56_data_stripes[i];

- wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
-}
+ bitmap_andnot(&repaired, &stripe->init_error_bitmap,
+ &stripe->error_bitmap, stripe->nr_sectors);
+ scrub_write_sectors(sctx, stripe, repaired, false);
+ }

-static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
- u64 physical, u64 physical_end)
-{
- struct btrfs_fs_info *fs_info = sctx->fs_info;
- int ret = 0;
+ /* Wait for the above writebacks to finish. */
+ for (int i = 0; i < data_stripes; i++) {
+ stripe = &sctx->raid56_data_stripes[i];

- if (!btrfs_is_zoned(fs_info))
- return 0;
+ wait_scrub_stripe_io(stripe);
+ }

- wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+ /*
+ * Now all data stripes are properly verified. Check if we have any
+ * unrepaired, if so abort immediately or we could further corrupt the
+ * P/Q stripes.
+ *
+ * During the loop, also populate extent_bitmap.
+ */
+ for (int i = 0; i < data_stripes; i++) {
+ unsigned long error;

- mutex_lock(&sctx->wr_lock);
- if (sctx->write_pointer < physical_end) {
- ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
- physical,
- sctx->write_pointer);
- if (ret)
+ stripe = &sctx->raid56_data_stripes[i];
+
+ /*
+ * We should only check the errors where there is an extent.
+ * As we may hit an empty data stripe while it's missing.
+ */
+ bitmap_and(&error, &stripe->error_bitmap,
+ &stripe->extent_sector_bitmap, stripe->nr_sectors);
+ if (!bitmap_empty(&error, stripe->nr_sectors)) {
btrfs_err(fs_info,
- "zoned: failed to recover write pointer");
+"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
+ full_stripe_start, i, stripe->nr_sectors,
+ &error);
+ ret = -EIO;
+ goto out;
+ }
+ bitmap_or(&extent_bitmap, &extent_bitmap,
+ &stripe->extent_sector_bitmap, stripe->nr_sectors);
}
- mutex_unlock(&sctx->wr_lock);
- btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);

+ /* Now we can check and regenerate the P/Q stripe. */
+ bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS);
+ bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT;
+ bio->bi_private = &io_done;
+ bio->bi_end_io = raid56_scrub_wait_endio;
+
+ btrfs_bio_counter_inc_blocked(fs_info);
+ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
+ &length, &bioc);
+ if (ret < 0) {
+ btrfs_put_bioc(bioc);
+ btrfs_bio_counter_dec(fs_info);
+ goto out;
+ }
+ rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap,
+ BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
+ btrfs_put_bioc(bioc);
+ if (!rbio) {
+ ret = -ENOMEM;
+ btrfs_bio_counter_dec(fs_info);
+ goto out;
+ }
+ raid56_parity_submit_scrub_rbio(rbio);
+ wait_for_completion_io(&io_done);
+ ret = blk_status_to_errno(bio->bi_status);
+ bio_put(bio);
+ btrfs_bio_counter_dec(fs_info);
+
+out:
return ret;
}
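
[Editor's note: the placement loop at the top of scrub_raid56_parity_stripe() computes, for full stripe N of the chunk, that data slot i lives on device (i + N) % num_stripes. A hypothetical model printing that layout for a 3-device RAID5:]

	#include <stdio.h>

	int main(void)
	{
		const int num_stripes = 3, data_stripes = 2;

		/* rot above is the full-stripe index inside the chunk. */
		for (int full_stripe = 0; full_stripe < 3; full_stripe++) {
			for (int i = 0; i < data_stripes; i++)
				printf("full stripe %d: data %d -> device %d\n",
				       full_stripe, i,
				       (i + full_stripe) % num_stripes);
		}
		return 0;
	}
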

@@ -3410,8 +1963,6 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
* and @logical_length parameter.
*/
static int scrub_simple_mirror(struct scrub_ctx *sctx,
- struct btrfs_root *extent_root,
- struct btrfs_root *csum_root,
struct btrfs_block_group *bg,
struct map_lookup *map,
u64 logical_start, u64 logical_length,
@@ -3421,7 +1972,6 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
struct btrfs_fs_info *fs_info = sctx->fs_info;
const u64 logical_end = logical_start + logical_length;
/* An artificial limit, inherit from old scrub behavior */
- const u32 max_length = SZ_64K;
struct btrfs_path path = { 0 };
u64 cur_logical = logical_start;
int ret;
@@ -3433,11 +1983,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
path.skip_locking = 1;
/* Go through each extent items inside the logical range */
while (cur_logical < logical_end) {
- u64 extent_start;
- u64 extent_len;
- u64 extent_flags;
- u64 extent_gen;
- u64 scrub_len;
+ u64 cur_physical = physical + cur_logical - logical_start;

/* Canceled? */
if (atomic_read(&fs_info->scrub_cancel_req) ||
@@ -3448,14 +1994,6 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
/* Paused? */
if (atomic_read(&fs_info->scrub_pause_req)) {
/* Push queued extents */
- sctx->flush_all_writes = true;
- scrub_submit(sctx);
- mutex_lock(&sctx->wr_lock);
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
- wait_event(sctx->list_wait,
- atomic_read(&sctx->bios_in_flight) == 0);
- sctx->flush_all_writes = false;
scrub_blocked_if_needed(fs_info);
}
/* Block group removed? */
@@ -3467,8 +2005,9 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
}
spin_unlock(&bg->lock);

- ret = find_first_extent_item(extent_root, &path, cur_logical,
- logical_end - cur_logical);
+ ret = queue_scrub_stripe(sctx, bg, device, mirror_num,
+ cur_logical, logical_end - cur_logical,
+ cur_physical);
if (ret > 0) {
/* No more extent, just update the accounting */
sctx->stat.last_physical = physical + logical_length;
@@ -3477,52 +2016,11 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
}
if (ret < 0)
break;
- get_extent_info(&path, &extent_start, &extent_len,
- &extent_flags, &extent_gen);
- /* Skip hole range which doesn't have any extent */
- cur_logical = max(extent_start, cur_logical);

- /*
- * Scrub len has three limits:
- * - Extent size limit
- * - Scrub range limit
- * This is especially imporatant for RAID0/RAID10 to reuse
- * this function
- * - Max scrub size limit
- */
- scrub_len = min(min(extent_start + extent_len,
- logical_end), cur_logical + max_length) -
- cur_logical;
-
- if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
- ret = btrfs_lookup_csums_list(csum_root, cur_logical,
- cur_logical + scrub_len - 1,
- &sctx->csum_list, 1, false);
- if (ret)
- break;
- }
- if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
- does_range_cross_boundary(extent_start, extent_len,
- logical_start, logical_length)) {
- btrfs_err(fs_info,
-"scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
- extent_start, logical_start, logical_end);
- spin_lock(&sctx->stat_lock);
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- cur_logical += scrub_len;
- continue;
- }
- ret = scrub_extent(sctx, map, cur_logical, scrub_len,
- cur_logical - logical_start + physical,
- device, extent_flags, extent_gen,
- mirror_num);
- scrub_free_csums(sctx);
- if (ret)
- break;
- if (sctx->is_dev_replace)
- sync_replace_for_zoned(sctx);
- cur_logical += scrub_len;
+ ASSERT(sctx->cur_stripe > 0);
+ cur_logical = sctx->stripes[sctx->cur_stripe - 1].logical
+ + BTRFS_STRIPE_LEN;
+
/* Don't hold CPU for too long time */
cond_resched();
}
@@ -3536,7 +2034,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10));

- return map->num_stripes / map->sub_stripes * map->stripe_len;
+ return (map->num_stripes / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
}

/* Get the logical bytenr for the stripe */
@@ -3552,7 +2050,8 @@ static u64 simple_stripe_get_logical(struct map_lookup *map,
* (stripe_index / sub_stripes) gives how many data stripes we need to
* skip.
*/
- return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
+ return ((stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT) +
+ bg->start;
}

/* Get the mirror number for the stripe */
@@ -3567,8 +2066,6 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
}

static int scrub_simple_stripe(struct scrub_ctx *sctx,
- struct btrfs_root *extent_root,
- struct btrfs_root *csum_root,
struct btrfs_block_group *bg,
struct map_lookup *map,
struct btrfs_device *device,
@@ -3588,15 +2085,15 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
* just RAID1, so we can reuse scrub_simple_mirror() to scrub
* this stripe.
*/
- ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
- cur_logical, map->stripe_len, device,
- cur_physical, mirror_num);
+ ret = scrub_simple_mirror(sctx, bg, map, cur_logical,
+ BTRFS_STRIPE_LEN, device, cur_physical,
+ mirror_num);
if (ret)
return ret;
/* Skip to next stripe which belongs to the target device */
cur_logical += logical_increment;
/* For physical offset, we just go to next stripe */
- cur_physical += map->stripe_len;
+ cur_physical += BTRFS_STRIPE_LEN;
}
return ret;
}
@@ -3607,15 +2104,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
int stripe_index)
{
- struct btrfs_path *path;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_root *root;
- struct btrfs_root *csum_root;
- struct blk_plug plug;
struct map_lookup *map = em->map_lookup;
const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
const u64 chunk_logical = bg->start;
int ret;
+ int ret2;
u64 physical = map->stripes[stripe_index].physical;
const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
const u64 physical_end = physical + dev_stripe_len;
@@ -3626,43 +2120,37 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
/* Offset inside the chunk */
u64 offset;
u64 stripe_logical;
- u64 stripe_end;
int stop_loop = 0;

- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- /*
- * work on commit root. The related disk blocks are static as
- * long as COW is applied. This means, it is save to rewrite
- * them to repair disk errors without any race conditions
- */
- path->search_commit_root = 1;
- path->skip_locking = 1;
- path->reada = READA_FORWARD;
-
- wait_event(sctx->list_wait,
- atomic_read(&sctx->bios_in_flight) == 0);
scrub_blocked_if_needed(fs_info);

- root = btrfs_extent_root(fs_info, bg->start);
- csum_root = btrfs_csum_root(fs_info, bg->start);
-
- /*
- * collect all data csums for the stripe to avoid seeking during
- * the scrub. This might currently (crc32) end up to be about 1MB
- */
- blk_start_plug(&plug);
-
if (sctx->is_dev_replace &&
btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
mutex_lock(&sctx->wr_lock);
sctx->write_pointer = physical;
mutex_unlock(&sctx->wr_lock);
- sctx->flush_all_writes = true;
}

+ /* Prepare the extra data stripes used by RAID56. */
+ if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ASSERT(sctx->raid56_data_stripes == NULL);
+
+ sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map),
+ sizeof(struct scrub_stripe),
+ GFP_KERNEL);
+ if (!sctx->raid56_data_stripes) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (int i = 0; i < nr_data_stripes(map); i++) {
+ ret = init_scrub_stripe(fs_info,
+ &sctx->raid56_data_stripes[i]);
+ if (ret < 0)
+ goto out;
+ sctx->raid56_data_stripes[i].bg = bg;
+ sctx->raid56_data_stripes[i].sctx = sctx;
+ }
+ }
/*
* There used to be a big double loop to handle all profiles using the
* same routine, which grows larger and more gross over time.
@@ -3680,17 +2168,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
* Only @physical and @mirror_num needs to calculated using
* @stripe_index.
*/
- ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
- bg->start, bg->length, scrub_dev,
- map->stripes[stripe_index].physical,
+ ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length,
+ scrub_dev, map->stripes[stripe_index].physical,
stripe_index + 1);
offset = 0;
goto out;
}
if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
- ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
- scrub_dev, stripe_index);
- offset = map->stripe_len * (stripe_index / map->sub_stripes);
+ ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
+ offset = (stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
goto out;
}

@@ -3705,7 +2191,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,

/* Initialize @offset in case we need to go to out: label */
get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
- increment = map->stripe_len * nr_data_stripes(map);
+ increment = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;

/*
* Due to the rotation, for RAID56 it's better to iterate each stripe
@@ -3718,10 +2204,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
if (ret) {
/* it is parity strip */
stripe_logical += chunk_logical;
- stripe_end = stripe_logical + increment;
- ret = scrub_raid56_parity(sctx, map, scrub_dev,
- stripe_logical,
- stripe_end);
+ ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg,
+ map, stripe_logical);
if (ret)
goto out;
goto next;
@@ -3735,14 +2219,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
* We can reuse scrub_simple_mirror() here, as the repair part
* is still based on @mirror_num.
*/
- ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
- logical, map->stripe_len,
+ ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN,
scrub_dev, physical, 1);
if (ret < 0)
goto out;
next:
logical += increment;
- physical += map->stripe_len;
+ physical += BTRFS_STRIPE_LEN;
spin_lock(&sctx->stat_lock);
if (stop_loop)
sctx->stat.last_physical =
@@ -3754,14 +2237,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
break;
}
out:
- /* push queued extents */
- scrub_submit(sctx);
- mutex_lock(&sctx->wr_lock);
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
-
- blk_finish_plug(&plug);
- btrfs_free_path(path);
+ ret2 = flush_scrub_stripes(sctx);
+ if (!ret)
+ ret = ret2;
+ if (sctx->raid56_data_stripes) {
+ for (int i = 0; i < nr_data_stripes(map); i++)
+ release_scrub_stripe(&sctx->raid56_data_stripes[i]);
+ kfree(sctx->raid56_data_stripes);
+ sctx->raid56_data_stripes = NULL;
+ }

if (sctx->is_dev_replace && ret >= 0) {
int ret2;
@@ -4079,39 +2563,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,

ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
dev_extent_len);
-
- /*
- * flush, submit all pending read and write bios, afterwards
- * wait for them.
- * Note that in the dev replace case, a read request causes
- * write requests that are submitted in the read completion
- * worker. Therefore in the current situation, it is required
- * that all write requests are flushed, so that all read and
- * write requests are really completed when bios_in_flight
- * changes to 0.
- */
- sctx->flush_all_writes = true;
- scrub_submit(sctx);
- mutex_lock(&sctx->wr_lock);
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
-
- wait_event(sctx->list_wait,
- atomic_read(&sctx->bios_in_flight) == 0);
-
- scrub_pause_on(fs_info);
-
- /*
- * must be called before we decrease @scrub_paused.
- * make sure we don't block transaction commit while
- * we are waiting pending workers finished.
- */
- wait_event(sctx->list_wait,
- atomic_read(&sctx->workers_pending) == 0);
- sctx->flush_all_writes = false;
-
- scrub_pause_off(fs_info);
-
if (sctx->is_dev_replace &&
!btrfs_finish_block_group_to_copy(dev_replace->srcdev,
cache, found_key.offset))
@@ -4168,18 +2619,62 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
return ret;
}

+static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
+ struct page *page, u64 physical, u64 generation)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct bio_vec bvec;
+ struct bio bio;
+ struct btrfs_super_block *sb = page_address(page);
+ int ret;
+
+ bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ);
+ bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT;
+ __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
+ ret = submit_bio_wait(&bio);
+ bio_uninit(&bio);
+
+ if (ret < 0)
+ return ret;
+ ret = btrfs_check_super_csum(fs_info, sb);
+ if (ret != 0) {
+ btrfs_err_rl(fs_info,
+ "super block at physical %llu devid %llu has bad csum",
+ physical, dev->devid);
+ return -EIO;
+ }
+ if (btrfs_super_generation(sb) != generation) {
+ btrfs_err_rl(fs_info,
+"super block at physical %llu devid %llu has bad generation %llu expect %llu",
+ physical, dev->devid,
+ btrfs_super_generation(sb), generation);
+ return -EUCLEAN;
+ }
+
+ return btrfs_validate_super(fs_info, sb, -1);
+}
+
static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev)
{
int i;
u64 bytenr;
u64 gen;
- int ret;
+ int ret = 0;
+ struct page *page;
struct btrfs_fs_info *fs_info = sctx->fs_info;

if (BTRFS_FS_ERROR(fs_info))
return -EROFS;

+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+
/* Seed devices of a new filesystem has their own generation. */
if (scrub_dev->fs_devices != fs_info->fs_devices)
gen = scrub_dev->generation;
@@ -4194,14 +2689,14 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
if (!btrfs_check_super_location(scrub_dev, bytenr))
continue;

- ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
- scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
- NULL, bytenr);
- if (ret)
- return ret;
+ ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen);
+ if (ret) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.super_errors++;
+ spin_unlock(&sctx->stat_lock);
+ }
}
- wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
-
+ __free_page(page);
return 0;
}
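
[Editor's note: the scrub_supers() loop above visits each superblock copy accepted by btrfs_check_super_location(). For reference, a hypothetical userspace restatement of the standard btrfs superblock offsets (64 KiB, 64 MiB, 256 GiB), modeled after btrfs_sb_offset():]

	#include <stdio.h>

	#define BTRFS_SUPER_MIRROR_SHIFT 12

	static unsigned long long sb_offset(int mirror)
	{
		if (mirror)
			return 0x4000ULL << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
		return 0x10000;		/* primary copy at 64 KiB */
	}

	int main(void)
	{
		for (int i = 0; i < 3; i++)
			printf("super copy %d at %llu\n", i, sb_offset(i));
		return 0;
	}
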

@@ -4212,20 +2707,15 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info)
struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
struct workqueue_struct *scrub_wr_comp =
fs_info->scrub_wr_completion_workers;
- struct workqueue_struct *scrub_parity =
- fs_info->scrub_parity_workers;

fs_info->scrub_workers = NULL;
fs_info->scrub_wr_completion_workers = NULL;
- fs_info->scrub_parity_workers = NULL;
mutex_unlock(&fs_info->scrub_lock);

if (scrub_workers)
destroy_workqueue(scrub_workers);
if (scrub_wr_comp)
destroy_workqueue(scrub_wr_comp);
- if (scrub_parity)
- destroy_workqueue(scrub_parity);
}
}

@@ -4237,7 +2727,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
{
struct workqueue_struct *scrub_workers = NULL;
struct workqueue_struct *scrub_wr_comp = NULL;
- struct workqueue_struct *scrub_parity = NULL;
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
int ret = -ENOMEM;
@@ -4254,18 +2743,12 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
if (!scrub_wr_comp)
goto fail_scrub_wr_completion_workers;

- scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
- if (!scrub_parity)
- goto fail_scrub_parity_workers;
-
mutex_lock(&fs_info->scrub_lock);
if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
ASSERT(fs_info->scrub_workers == NULL &&
- fs_info->scrub_wr_completion_workers == NULL &&
- fs_info->scrub_parity_workers == NULL);
+ fs_info->scrub_wr_completion_workers == NULL);
fs_info->scrub_workers = scrub_workers;
fs_info->scrub_wr_completion_workers = scrub_wr_comp;
- fs_info->scrub_parity_workers = scrub_parity;
refcount_set(&fs_info->scrub_workers_refcnt, 1);
mutex_unlock(&fs_info->scrub_lock);
return 0;
@@ -4275,8 +2758,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
mutex_unlock(&fs_info->scrub_lock);

ret = 0;
- destroy_workqueue(scrub_parity);
-fail_scrub_parity_workers:
+
destroy_workqueue(scrub_wr_comp);
fail_scrub_wr_completion_workers:
destroy_workqueue(scrub_workers);
@@ -4411,12 +2893,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
ret = scrub_enumerate_chunks(sctx, dev, start, end);
memalloc_nofs_restore(nofs_flag);

- wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
atomic_dec(&fs_info->scrubs_running);
wake_up(&fs_info->scrub_pause_wait);

- wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
-
if (progress)
memcpy(progress, &sctx->stat, sizeof(*progress));

@@ -4541,28 +3020,3 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,

return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}
-
-static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u32 extent_len,
- u64 *extent_physical,
- struct btrfs_device **extent_dev,
- int *extent_mirror_num)
-{
- u64 mapped_length;
- struct btrfs_io_context *bioc = NULL;
- int ret;
-
- mapped_length = extent_len;
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
- &mapped_length, &bioc, 0);
- if (ret || !bioc || mapped_length < extent_len ||
- !bioc->stripes[0].dev->bdev) {
- btrfs_put_bioc(bioc);
- return;
- }
-
- *extent_physical = bioc->stripes[0].physical;
- *extent_mirror_num = bioc->mirror_num;
- *extent_dev = bioc->stripes[0].dev;
- btrfs_put_bioc(bioc);
-}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e5c963bb873d..af2e153543a5 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1875,7 +1875,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
int left_ret;
int right_ret;
u64 left_gen;
- u64 right_gen;
+ u64 right_gen = 0;
struct btrfs_inode_info info;

ret = get_inode_info(sctx->send_root, ino, &info);
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 3eecce86f63f..75e7fa337e66 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -537,7 +537,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
up_read(&info->groups_sem);
}

-static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
+static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
u64 to_reclaim)
{
u64 bytes;
@@ -550,6 +550,18 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
return nr;
}

+static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info,
+ u64 to_reclaim)
+{
+ const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1);
+ u64 nr;
+
+ nr = div64_u64(to_reclaim, bytes);
+ if (!nr)
+ nr = 1;
+ return nr;
+}
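
[Editor's note: calc_delayed_refs_nr() above converts a byte target into a count of delayed ref heads to run, clamped to at least one. A worked hypothetical example; the real per-ref byte cost comes from btrfs_calc_delayed_ref_bytes() and depends on the filesystem:]

	#include <stdio.h>

	int main(void)
	{
		/* Assume one ref head reserves 3 tree blocks of 16 KiB. */
		const unsigned long long bytes_per_ref = 3ULL * 16 * 1024;
		unsigned long long nr = (1024ULL * 1024) / bytes_per_ref;

		if (!nr)
			nr = 1;
		/* Reclaiming 1 MiB means running 21 delayed refs. */
		printf("run %llu delayed refs\n", nr);
		return 0;
	}
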
+
#define EXTENT_SIZE_PER_ITEM SZ_256K

/*
@@ -727,7 +739,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
}
if (state == FLUSH_DELAYED_REFS_NR)
- nr = calc_reclaim_items_nr(fs_info, num_bytes);
+ nr = calc_delayed_refs_nr(fs_info, num_bytes);
else
nr = 0;
btrfs_run_delayed_refs(trans, nr);
@@ -1599,11 +1611,22 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
struct reserve_ticket ticket;
u64 start_ns = 0;
u64 used;
- int ret = 0;
+ int ret = -ENOSPC;
bool pending_tickets;

ASSERT(orig_bytes);
- ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
+ /*
+ * If we have a transaction handle (current->journal_info != NULL), then
+ * the flush method can be neither BTRFS_RESERVE_FLUSH_ALL* nor
+ * BTRFS_RESERVE_FLUSH_EVICT, as we could deadlock because those
+ * flushing methods can trigger transaction commits.
+ */
+ if (current->journal_info) {
+ /* One assert per line for easier debugging. */
+ ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL);
+ ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL);
+ ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT);
+ }

if (flush == BTRFS_RESERVE_FLUSH_DATA)
async_work = &fs_info->async_data_reclaim_work;
@@ -1611,7 +1634,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
async_work = &fs_info->async_reclaim_work;

spin_lock(&space_info->lock);
- ret = -ENOSPC;
used = btrfs_space_info_used(space_info, true);

/*
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 2033b71b18ce..0bb9d14e60a8 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -27,6 +27,7 @@ enum btrfs_reserve_flush_enum {
* - Running delayed refs
* - Running delalloc and waiting for ordered extents
* - Allocating a new chunk
+ * - Committing transaction
*/
BTRFS_RESERVE_FLUSH_EVICT,

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 366fb4cde145..6cb97efee976 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1158,6 +1158,7 @@ static int btrfs_fill_super(struct super_block *sb,
inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
+ btrfs_handle_fs_error(fs_info, err, NULL);
goto fail_close;
}

@@ -2412,7 +2413,7 @@ static int __init btrfs_print_mod_info(void)
", fsverity=no"
#endif
;
- pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);
+ pr_info("Btrfs loaded%s\n", options);
return 0;
}

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 37fc58a7f27e..25294e624851 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1262,8 +1262,13 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
if (ret)
return ret;

+#ifdef CONFIG_BTRFS_DEBUG
+ if (thresh != 0 && (thresh > 100))
+ return -EINVAL;
+#else
if (thresh != 0 && (thresh <= 50 || thresh > 100))
return -EINVAL;
+#endif

WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh);
|
|
|
|
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
|
|
index f2f2e11dac4c..ed0f36ae5346 100644
|
|
--- a/fs/btrfs/tests/extent-map-tests.c
|
|
+++ b/fs/btrfs/tests/extent-map-tests.c
|
|
@@ -486,7 +486,6 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
|
|
em->map_lookup = map;
|
|
|
|
map->num_stripes = test->num_stripes;
|
|
- map->stripe_len = BTRFS_STRIPE_LEN;
|
|
map->type = test->raid_type;
|
|
|
|
for (i = 0; i < map->num_stripes; i++) {
|
|
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b8d5b1fa9a03..8b6a99b8d7f6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -601,15 +601,16 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
 		/*
 		 * We want to reserve all the bytes we may need all at once, so
 		 * we only do 1 enospc flushing cycle per transaction start. We
-		 * accomplish this by simply assuming we'll do 2 x num_items
-		 * worth of delayed refs updates in this trans handle, and
-		 * refill that amount for whatever is missing in the reserve.
+		 * accomplish this by simply assuming we'll do num_items worth
+		 * of delayed refs updates in this trans handle, and refill that
+		 * amount for whatever is missing in the reserve.
 		 */
 		num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
 		if (flush == BTRFS_RESERVE_FLUSH_ALL &&
-		    btrfs_block_rsv_full(delayed_refs_rsv) == 0) {
-			delayed_refs_bytes = num_bytes;
-			num_bytes <<= 1;
+		    !btrfs_block_rsv_full(delayed_refs_rsv)) {
+			delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
+									  num_items);
+			num_bytes += delayed_refs_bytes;
 		}

 		/*
@@ -942,16 +943,6 @@ void btrfs_throttle(struct btrfs_fs_info *fs_info)
 	wait_current_trans(fs_info);
 }

-static bool should_end_transaction(struct btrfs_trans_handle *trans)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-
-	if (btrfs_check_space_for_delayed_refs(fs_info))
-		return true;
-
-	return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 50);
-}
-
 bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
@@ -960,7 +951,10 @@ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
 	    test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags))
 		return true;

-	return should_end_transaction(trans);
+	if (btrfs_check_space_for_delayed_refs(trans->fs_info))
+		return true;
+
+	return !!btrfs_block_rsv_check(&trans->fs_info->global_block_rsv, 50);
 }

 static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index baad1ed7e111..e2b54793bf0c 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -849,6 +849,20 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
 			  stripe_len);
 		return -EUCLEAN;
 	}
+	/*
+	 * We artificially limit the chunk size, so that the number of stripes
+	 * inside a chunk can fit into a u32. The current limit (256G) is
+	 * way too large for real world usage anyway, and it's also much larger
+	 * than our existing limit (10G).
+	 *
+	 * Thus it should be a good way to catch obvious bitflips.
+	 */
+	if (unlikely(length >= ((u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT))) {
+		chunk_err(leaf, chunk, logical,
+			  "chunk length too large: have %llu limit %llu",
+			  length, (u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT);
+		return -EUCLEAN;
+	}
 	if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
 			      BTRFS_BLOCK_GROUP_PROFILE_MASK))) {
 		chunk_err(leaf, chunk, logical,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 200cea6e49e5..9b212e8c70cc 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2563,6 +2563,28 @@ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
 	btrfs_put_block_group(cache);
 }

+static int clean_log_buffer(struct btrfs_trans_handle *trans,
+			    struct extent_buffer *eb)
+{
+	int ret;
+
+	btrfs_tree_lock(eb);
+	btrfs_clear_buffer_dirty(trans, eb);
+	wait_on_extent_buffer_writeback(eb);
+	btrfs_tree_unlock(eb);
+
+	if (trans) {
+		ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len);
+		if (ret)
+			return ret;
+		btrfs_redirty_list_add(trans->transaction, eb);
+	} else {
+		unaccount_log_buffer(eb->fs_info, eb->start);
+	}
+
+	return 0;
+}
+
 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *root,
 				       struct btrfs_path *path, int *level,
@@ -2573,7 +2595,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 	u64 ptr_gen;
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
-	u32 blocksize;
 	int ret = 0;

 	while (*level > 0) {
@@ -2593,7 +2614,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 		check.level = *level - 1;
 		check.has_first_key = true;
 		btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
-		blocksize = fs_info->nodesize;

 		next = btrfs_find_create_tree_block(fs_info, bytenr,
 						    btrfs_header_owner(cur),
@@ -2617,22 +2637,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 				return ret;
 			}

-			btrfs_tree_lock(next);
-			btrfs_clear_buffer_dirty(trans, next);
-			wait_on_extent_buffer_writeback(next);
-			btrfs_tree_unlock(next);
-
-			if (trans) {
-				ret = btrfs_pin_reserved_extent(trans,
-								bytenr, blocksize);
-				if (ret) {
-					free_extent_buffer(next);
-					return ret;
-				}
-				btrfs_redirty_list_add(
-					trans->transaction, next);
-			} else {
-				unaccount_log_buffer(fs_info, bytenr);
+			ret = clean_log_buffer(trans, next);
+			if (ret) {
+				free_extent_buffer(next);
+				return ret;
 			}
 		}
 		free_extent_buffer(next);
@@ -2662,7 +2670,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				 struct btrfs_path *path, int *level,
 				 struct walk_control *wc)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	int i;
 	int slot;
 	int ret;
@@ -2682,27 +2689,9 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				return ret;

 			if (wc->free) {
-				struct extent_buffer *next;
-
-				next = path->nodes[*level];
-
-				btrfs_tree_lock(next);
-				btrfs_clear_buffer_dirty(trans, next);
-				wait_on_extent_buffer_writeback(next);
-				btrfs_tree_unlock(next);
-
-				if (trans) {
-					ret = btrfs_pin_reserved_extent(trans,
-						path->nodes[*level]->start,
-						path->nodes[*level]->len);
-					if (ret)
-						return ret;
-					btrfs_redirty_list_add(trans->transaction,
-							       next);
-				} else {
-					unaccount_log_buffer(fs_info,
-						path->nodes[*level]->start);
-				}
+				ret = clean_log_buffer(trans, path->nodes[*level]);
+				if (ret)
+					return ret;
 			}
 			free_extent_buffer(path->nodes[*level]);
 			path->nodes[*level] = NULL;
@@ -2720,7 +2709,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 static int walk_log_tree(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *log, struct walk_control *wc)
 {
-	struct btrfs_fs_info *fs_info = log->fs_info;
 	int ret = 0;
 	int wret;
 	int level;
@@ -2762,26 +2750,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 						   orig_level);
 		if (ret)
 			goto out;
-		if (wc->free) {
-			struct extent_buffer *next;
-
-			next = path->nodes[orig_level];
-
-			btrfs_tree_lock(next);
-			btrfs_clear_buffer_dirty(trans, next);
-			wait_on_extent_buffer_writeback(next);
-			btrfs_tree_unlock(next);
-
-			if (trans) {
-				ret = btrfs_pin_reserved_extent(trans,
-						next->start, next->len);
-				if (ret)
-					goto out;
-				btrfs_redirty_list_add(trans->transaction, next);
-			} else {
-				unaccount_log_buffer(fs_info, next->start);
-			}
-		}
+		if (wc->free)
+			ret = clean_log_buffer(trans, path->nodes[orig_level]);
 	}

out:
@@ -3648,6 +3618,9 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
 		ret = BTRFS_LOG_FORCE_COMMIT;
 	else
 		inode->last_dir_index_offset = last_index;
+
+	if (btrfs_get_first_dir_index_to_log(inode) == 0)
+		btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
out:
 	kfree(ins_data);

@@ -4099,7 +4072,7 @@ static int drop_inode_items(struct btrfs_trans_handle *trans,

 		found_key.offset = 0;
 		found_key.type = 0;
-		ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
+		ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot);
 		if (ret < 0)
 			break;

@@ -5406,6 +5379,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
 	LIST_HEAD(dir_list);
 	struct btrfs_dir_list *dir_elem;
 	u64 ino = btrfs_ino(start_inode);
+	struct btrfs_inode *curr_inode = start_inode;
 	int ret = 0;

 	/*
@@ -5420,43 +5394,39 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;

+	/* Pairs with btrfs_add_delayed_iput below. */
+	ihold(&curr_inode->vfs_inode);
+
 	while (true) {
-		struct extent_buffer *leaf;
-		struct btrfs_key min_key;
+		struct inode *vfs_inode;
+		struct btrfs_key key;
+		struct btrfs_key found_key;
+		u64 next_index;
 		bool continue_curr_inode = true;
-		int nritems;
-		int i;
+		int iter_ret;

-		min_key.objectid = ino;
-		min_key.type = BTRFS_DIR_INDEX_KEY;
-		min_key.offset = 0;
+		key.objectid = ino;
+		key.type = BTRFS_DIR_INDEX_KEY;
+		key.offset = btrfs_get_first_dir_index_to_log(curr_inode);
+		next_index = key.offset;
again:
-		btrfs_release_path(path);
-		ret = btrfs_search_forward(root, &min_key, path, trans->transid);
-		if (ret < 0) {
-			break;
-		} else if (ret > 0) {
-			ret = 0;
-			goto next;
-		}
-
-		leaf = path->nodes[0];
-		nritems = btrfs_header_nritems(leaf);
-		for (i = path->slots[0]; i < nritems; i++) {
+		btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) {
+			struct extent_buffer *leaf = path->nodes[0];
 			struct btrfs_dir_item *di;
 			struct btrfs_key di_key;
 			struct inode *di_inode;
 			int log_mode = LOG_INODE_EXISTS;
 			int type;

-			btrfs_item_key_to_cpu(leaf, &min_key, i);
-			if (min_key.objectid != ino ||
-			    min_key.type != BTRFS_DIR_INDEX_KEY) {
+			if (found_key.objectid != ino ||
+			    found_key.type != BTRFS_DIR_INDEX_KEY) {
 				continue_curr_inode = false;
 				break;
 			}

-			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+			next_index = found_key.offset + 1;
+
+			di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
 			type = btrfs_dir_ftype(leaf, di);
 			if (btrfs_dir_transid(leaf, di) < trans->transid)
 				continue;
@@ -5496,12 +5466,24 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
 			break;
 		}

-		if (continue_curr_inode && min_key.offset < (u64)-1) {
-			min_key.offset++;
+		btrfs_release_path(path);
+
+		if (iter_ret < 0) {
+			ret = iter_ret;
+			goto out;
+		} else if (iter_ret > 0) {
+			continue_curr_inode = false;
+		} else {
+			key = found_key;
+		}
+
+		if (continue_curr_inode && key.offset < (u64)-1) {
+			key.offset++;
 			goto again;
 		}

-next:
+		btrfs_set_first_dir_index_to_log(curr_inode, next_index);
+
 		if (list_empty(&dir_list))
 			break;

@@ -5509,9 +5491,22 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
 		ino = dir_elem->ino;
 		list_del(&dir_elem->list);
 		kfree(dir_elem);
+
+		btrfs_add_delayed_iput(curr_inode);
+		curr_inode = NULL;
+
+		vfs_inode = btrfs_iget(fs_info->sb, ino, root);
+		if (IS_ERR(vfs_inode)) {
+			ret = PTR_ERR(vfs_inode);
+			break;
+		}
+		curr_inode = BTRFS_I(vfs_inode);
 	}
out:
 	btrfs_free_path(path);
+	if (curr_inode)
+		btrfs_add_delayed_iput(curr_inode);
+
 	if (ret) {
 		struct btrfs_dir_list *next;

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c6d592870400..03f52e4a20aa 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -395,7 +395,6 @@ void btrfs_free_device(struct btrfs_device *device)
 {
 	WARN_ON(!list_empty(&device->post_commit_list));
 	rcu_string_free(device->name);
-	extent_io_tree_release(&device->alloc_state);
 	btrfs_destroy_dev_zone_info(device);
 	kfree(device);
 }
@@ -1150,10 +1149,10 @@ static void btrfs_close_one_device(struct btrfs_device *device)
 	device->last_flush_error = 0;

 	/* Verify the device is back in a pristine state */
-	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
-	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
-	ASSERT(list_empty(&device->dev_alloc_list));
-	ASSERT(list_empty(&device->post_commit_list));
+	WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
+	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
+	WARN_ON(!list_empty(&device->dev_alloc_list));
+	WARN_ON(!list_empty(&device->post_commit_list));
 }

 static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
@@ -2618,7 +2617,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	struct block_device *bdev;
 	struct super_block *sb = fs_info->sb;
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
-	struct btrfs_fs_devices *seed_devices;
+	struct btrfs_fs_devices *seed_devices = NULL;
 	u64 orig_super_total_bytes;
 	u64 orig_super_num_devices;
 	int ret = 0;
@@ -5125,7 +5124,7 @@ static void init_alloc_chunk_ctl_policy_regular(
 	/* We don't want a chunk larger than 10% of writable space */
 	ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10),
 				  ctl->max_chunk_size);
-	ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
+	ctl->dev_extent_min = ctl->dev_stripes << BTRFS_STRIPE_LEN_SHIFT;
 }

 static void init_alloc_chunk_ctl_policy_zoned(
@@ -5407,7 +5406,6 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
 						j * ctl->stripe_size;
 		}
 	}
-	map->stripe_len = BTRFS_STRIPE_LEN;
 	map->io_align = BTRFS_STRIPE_LEN;
 	map->io_width = BTRFS_STRIPE_LEN;
 	map->type = type;
@@ -5438,7 +5436,7 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
 	}
 	write_unlock(&em_tree->lock);

-	block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
+	block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
 	if (IS_ERR(block_group))
 		goto error_del_extent;

@@ -5615,11 +5613,11 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,

 	btrfs_set_stack_chunk_length(chunk, bg->length);
 	btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
-	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
+	btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN);
 	btrfs_set_stack_chunk_type(chunk, map->type);
 	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
-	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
-	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
+	btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN);
+	btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN);
 	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
 	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

@@ -5784,13 +5782,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		 */
 		ret = map->num_stripes;
 	free_extent_map(em);
-
-	down_read(&fs_info->dev_replace.rwsem);
-	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
-	    fs_info->dev_replace.tgtdev)
-		ret++;
-	up_read(&fs_info->dev_replace.rwsem);
-
 	return ret;
 }

@@ -5809,7 +5800,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
 	if (!WARN_ON(IS_ERR(em))) {
 		map = em->map_lookup;
 		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
-			len = map->stripe_len * nr_data_stripes(map);
+			len = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;
 		free_extent_map(em);
 	}
 	return len;
@@ -5895,41 +5886,16 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	return preferred_mirror;
 }

-/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
-{
-	int i;
-	int again = 1;
-
-	while (again) {
-		again = 0;
-		for (i = 0; i < num_stripes - 1; i++) {
-			/* Swap if parity is on a smaller index */
-			if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
-				swap(bioc->stripes[i], bioc->stripes[i + 1]);
-				swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
-				again = 1;
-			}
-		}
-	}
-}
-
 static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
-						       int total_stripes,
-						       int real_stripes)
+						       u16 total_stripes)
 {
-	struct btrfs_io_context *bioc = kzalloc(
+	struct btrfs_io_context *bioc;
+
+	bioc = kzalloc(
 		 /* The size of btrfs_io_context */
 		sizeof(struct btrfs_io_context) +
 		/* Plus the variable array for the stripes */
-		sizeof(struct btrfs_io_stripe) * (total_stripes) +
-		/* Plus the variable array for the tgt dev */
-		sizeof(int) * (real_stripes) +
-		/*
-		 * Plus the raid_map, which includes both the tgt dev
-		 * and the stripes.
-		 */
-		sizeof(u64) * (total_stripes),
+		sizeof(struct btrfs_io_stripe) * (total_stripes),
 		GFP_NOFS);

 	if (!bioc)
@@ -5938,8 +5904,8 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
 	refcount_set(&bioc->refs, 1);

 	bioc->fs_info = fs_info;
-	bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
-	bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
+	bioc->replace_stripe_src = -1;
+	bioc->full_stripe_logical = (u64)-1;

 	return bioc;
 }
@@ -5971,16 +5937,15 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 	struct btrfs_discard_stripe *stripes;
 	u64 length = *length_ret;
 	u64 offset;
-	u64 stripe_nr;
-	u64 stripe_nr_end;
+	u32 stripe_nr;
+	u32 stripe_nr_end;
+	u32 stripe_cnt;
 	u64 stripe_end_offset;
-	u64 stripe_cnt;
-	u64 stripe_len;
 	u64 stripe_offset;
 	u32 stripe_index;
 	u32 factor = 0;
 	u32 sub_stripes = 0;
-	u64 stripes_per_dev = 0;
+	u32 stripes_per_dev = 0;
 	u32 remaining_stripes = 0;
 	u32 last_stripe = 0;
 	int ret;
@@ -5996,26 +5961,25 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
 		ret = -EOPNOTSUPP;
 		goto out_free_map;
-}
+	}

 	offset = logical - em->start;
 	length = min_t(u64, em->start + em->len - logical, length);
 	*length_ret = length;

-	stripe_len = map->stripe_len;
 	/*
 	 * stripe_nr counts the total number of stripes we have to stride
 	 * to get to this block
 	 */
-	stripe_nr = div64_u64(offset, stripe_len);
+	stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;

 	/* stripe_offset is the offset of this block in its stripe */
-	stripe_offset = offset - stripe_nr * stripe_len;
+	stripe_offset = offset - (stripe_nr << BTRFS_STRIPE_LEN_SHIFT);

-	stripe_nr_end = round_up(offset + length, map->stripe_len);
-	stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
+	stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >>
+			BTRFS_STRIPE_LEN_SHIFT;
 	stripe_cnt = stripe_nr_end - stripe_nr;
-	stripe_end_offset = stripe_nr_end * map->stripe_len -
+	stripe_end_offset = (stripe_nr_end << BTRFS_STRIPE_LEN_SHIFT) -
 			    (offset + length);
 	/*
 	 * after this, stripe_nr is the number of stripes on this
@@ -6034,18 +5998,19 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 		factor = map->num_stripes / sub_stripes;
 		*num_stripes = min_t(u64, map->num_stripes,
 				    sub_stripes * stripe_cnt);
-		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+		stripe_index = stripe_nr % factor;
+		stripe_nr /= factor;
 		stripe_index *= sub_stripes;
-		stripes_per_dev = div_u64_rem(stripe_cnt, factor,
-					      &remaining_stripes);
-		div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
-		last_stripe *= sub_stripes;
+
+		remaining_stripes = stripe_cnt % factor;
+		stripes_per_dev = stripe_cnt / factor;
+		last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes;
 	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
 				BTRFS_BLOCK_GROUP_DUP)) {
 		*num_stripes = map->num_stripes;
 	} else {
-		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
-					&stripe_index);
+		stripe_index = stripe_nr % map->num_stripes;
+		stripe_nr /= map->num_stripes;
 	}

 	stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
@@ -6057,15 +6022,15 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 	for (i = 0; i < *num_stripes; i++) {
 		stripes[i].physical =
 			map->stripes[stripe_index].physical +
-			stripe_offset + stripe_nr * map->stripe_len;
+			stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT);
 		stripes[i].dev = map->stripes[stripe_index].dev;

 		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
 				 BTRFS_BLOCK_GROUP_RAID10)) {
-			stripes[i].length = stripes_per_dev * map->stripe_len;
+			stripes[i].length = stripes_per_dev << BTRFS_STRIPE_LEN_SHIFT;

 			if (i / sub_stripes < remaining_stripes)
-				stripes[i].length += map->stripe_len;
+				stripes[i].length += BTRFS_STRIPE_LEN;

 			/*
 			 * Special for the first stripe and
@@ -6103,83 +6068,6 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 	return ERR_PTR(ret);
 }

-/*
- * In dev-replace case, for repair case (that's the only case where the mirror
- * is selected explicitly when calling btrfs_map_block), blocks left of the
- * left cursor can also be read from the target drive.
- *
- * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
- * array of stripes.
- * For READ, it also needs to be supported using the same mirror number.
- *
- * If the requested block is not left of the left cursor, EIO is returned. This
- * can happen because btrfs_num_copies() returns one more in the dev-replace
- * case.
- */
-static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
-					 u64 logical, u64 length,
-					 u64 srcdev_devid, int *mirror_num,
-					 u64 *physical)
-{
-	struct btrfs_io_context *bioc = NULL;
-	int num_stripes;
-	int index_srcdev = 0;
-	int found = 0;
-	u64 physical_of_found = 0;
-	int i;
-	int ret = 0;
-
-	ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
-				logical, &length, &bioc, NULL, NULL, 0);
-	if (ret) {
-		ASSERT(bioc == NULL);
-		return ret;
-	}
-
-	num_stripes = bioc->num_stripes;
-	if (*mirror_num > num_stripes) {
-		/*
-		 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
-		 * that means that the requested area is not left of the left
-		 * cursor
-		 */
-		btrfs_put_bioc(bioc);
-		return -EIO;
-	}
-
-	/*
-	 * process the rest of the function using the mirror_num of the source
-	 * drive. Therefore look it up first. At the end, patch the device
-	 * pointer to the one of the target drive.
-	 */
-	for (i = 0; i < num_stripes; i++) {
-		if (bioc->stripes[i].dev->devid != srcdev_devid)
-			continue;
-
-		/*
-		 * In case of DUP, in order to keep it simple, only add the
-		 * mirror with the lowest physical address
-		 */
-		if (found &&
-		    physical_of_found <= bioc->stripes[i].physical)
-			continue;
-
-		index_srcdev = i;
-		found = 1;
-		physical_of_found = bioc->stripes[i].physical;
-	}
-
-	btrfs_put_bioc(bioc);
-
-	ASSERT(found);
-	if (!found)
-		return -EIO;
-
-	*mirror_num = index_srcdev + 1;
-	*physical = physical_of_found;
-	return ret;
-}
-
 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
 {
 	struct btrfs_block_group *cache;
@@ -6198,101 +6086,80 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
 }

 static void handle_ops_on_dev_replace(enum btrfs_map_op op,
-				      struct btrfs_io_context **bioc_ret,
+				      struct btrfs_io_context *bioc,
 				      struct btrfs_dev_replace *dev_replace,
 				      u64 logical,
 				      int *num_stripes_ret, int *max_errors_ret)
 {
-	struct btrfs_io_context *bioc = *bioc_ret;
 	u64 srcdev_devid = dev_replace->srcdev->devid;
-	int tgtdev_indexes = 0;
+	/*
+	 * At this stage, num_stripes is still the real number of stripes,
+	 * excluding the duplicated stripes.
+	 */
 	int num_stripes = *num_stripes_ret;
+	int nr_extra_stripes = 0;
 	int max_errors = *max_errors_ret;
 	int i;

-	if (op == BTRFS_MAP_WRITE) {
-		int index_where_to_add;
+	/*
+	 * A block group which has "to_copy" set will eventually be copied by
+	 * the dev-replace process. We can avoid cloning IO here.
+	 */
+	if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
+		return;

-		/*
-		 * A block group which have "to_copy" set will eventually
-		 * copied by dev-replace process. We can avoid cloning IO here.
-		 */
-		if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
-			return;
+	/*
+	 * Duplicate the write operations while the dev-replace procedure is
+	 * running. Since the copying of the old disk to the new disk takes
+	 * place at run time while the filesystem is mounted writable, the
+	 * regular write operations to the old disk have to be duplicated to go
+	 * to the new disk as well.
+	 *
+	 * Note that device->missing is handled by the caller, and that the
+	 * write to the old disk is already set up in the stripes array.
+	 */
+	for (i = 0; i < num_stripes; i++) {
+		struct btrfs_io_stripe *old = &bioc->stripes[i];
+		struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes];

-		/*
-		 * duplicate the write operations while the dev replace
-		 * procedure is running. Since the copying of the old disk to
-		 * the new disk takes place at run time while the filesystem is
-		 * mounted writable, the regular write operations to the old
-		 * disk have to be duplicated to go to the new disk as well.
-		 *
-		 * Note that device->missing is handled by the caller, and that
-		 * the write to the old disk is already set up in the stripes
-		 * array.
-		 */
-		index_where_to_add = num_stripes;
-		for (i = 0; i < num_stripes; i++) {
-			if (bioc->stripes[i].dev->devid == srcdev_devid) {
-				/* write to new disk, too */
-				struct btrfs_io_stripe *new =
-					bioc->stripes + index_where_to_add;
-				struct btrfs_io_stripe *old =
-					bioc->stripes + i;
-
-				new->physical = old->physical;
-				new->dev = dev_replace->tgtdev;
-				bioc->tgtdev_map[i] = index_where_to_add;
-				index_where_to_add++;
-				max_errors++;
-				tgtdev_indexes++;
-			}
-		}
-		num_stripes = index_where_to_add;
-	} else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
-		int index_srcdev = 0;
-		int found = 0;
-		u64 physical_of_found = 0;
+		if (old->dev->devid != srcdev_devid)
+			continue;

-		/*
-		 * During the dev-replace procedure, the target drive can also
-		 * be used to read data in case it is needed to repair a corrupt
-		 * block elsewhere. This is possible if the requested area is
-		 * left of the left cursor. In this area, the target drive is a
-		 * full copy of the source drive.
-		 */
-		for (i = 0; i < num_stripes; i++) {
-			if (bioc->stripes[i].dev->devid == srcdev_devid) {
-				/*
-				 * In case of DUP, in order to keep it simple,
-				 * only add the mirror with the lowest physical
-				 * address
-				 */
-				if (found &&
-				    physical_of_found <= bioc->stripes[i].physical)
-					continue;
-				index_srcdev = i;
-				found = 1;
-				physical_of_found = bioc->stripes[i].physical;
-			}
-		}
-		if (found) {
-			struct btrfs_io_stripe *tgtdev_stripe =
-				bioc->stripes + num_stripes;
+		new->physical = old->physical;
+		new->dev = dev_replace->tgtdev;
+		if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+			bioc->replace_stripe_src = i;
+		nr_extra_stripes++;
+	}
+
+	/* We can only have at most 2 extra nr_stripes (for DUP). */
+	ASSERT(nr_extra_stripes <= 2);
+	/*
+	 * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for
+	 * replace.
+	 * If we have 2 extra stripes, only choose the one with smaller physical.
+	 */
+	if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
+		struct btrfs_io_stripe *first = &bioc->stripes[num_stripes];
+		struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];

-			tgtdev_stripe->physical = physical_of_found;
-			tgtdev_stripe->dev = dev_replace->tgtdev;
-			bioc->tgtdev_map[index_srcdev] = num_stripes;
+		/* Only DUP can have two extra stripes. */
+		ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP);

-			tgtdev_indexes++;
-			num_stripes++;
+		/*
+		 * Swap the two extra stripes and reduce @nr_extra_stripes.
+		 * The extra stripe would still be there, but won't be accessed.
+		 */
+		if (first->physical > second->physical) {
+			swap(second->physical, first->physical);
+			swap(second->dev, first->dev);
+			nr_extra_stripes--;
 		}
 	}

-	*num_stripes_ret = num_stripes;
-	*max_errors_ret = max_errors;
-	bioc->num_tgtdevs = tgtdev_indexes;
-	*bioc_ret = bioc;
+	*num_stripes_ret = num_stripes + nr_extra_stripes;
+	*max_errors_ret = max_errors + nr_extra_stripes;
+	bioc->replace_nr_stripes = nr_extra_stripes;
 }

 static bool need_full_stripe(enum btrfs_map_op op)
@@ -6301,25 +6168,35 @@ static bool need_full_stripe(enum btrfs_map_op op)
 }

 static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
-			    u64 offset, u64 *stripe_nr, u64 *stripe_offset,
+			    u64 offset, u32 *stripe_nr, u64 *stripe_offset,
 			    u64 *full_stripe_start)
 {
-	u32 stripe_len = map->stripe_len;
-
 	ASSERT(op != BTRFS_MAP_DISCARD);

 	/*
 	 * Stripe_nr is the stripe where this block falls. stripe_offset is
 	 * the offset of this block in its stripe.
 	 */
-	*stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset);
+	*stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
+	*stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
 	ASSERT(*stripe_offset < U32_MAX);

 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
+		unsigned long full_stripe_len = nr_data_stripes(map) <<
+						BTRFS_STRIPE_LEN_SHIFT;

+		/*
+		 * For full stripe start, we use previously calculated
+		 * @stripe_nr. Align it to nr_data_stripes, then multiply with
+		 * STRIPE_LEN.
+		 *
+		 * By this we can avoid u64 division completely. And we have
+		 * to go rounddown(), not round_down(), as nr_data_stripes is
+		 * not ensured to be power of 2.
+		 */
 		*full_stripe_start =
-			div64_u64(offset, full_stripe_len) * full_stripe_len;
+			rounddown(*stripe_nr, nr_data_stripes(map)) <<
+			BTRFS_STRIPE_LEN_SHIFT;

 		/*
 		 * For writes to RAID56, allow to write a full stripe set, but
@@ -6334,16 +6211,16 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
 	 * a single disk).
 	 */
 	if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
-		return stripe_len - *stripe_offset;
+		return BTRFS_STRIPE_LEN - *stripe_offset;
 	return U64_MAX;
 }

 static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
-			  u32 stripe_index, u64 stripe_offset, u64 stripe_nr)
+			  u32 stripe_index, u64 stripe_offset, u32 stripe_nr)
 {
 	dst->dev = map->stripes[stripe_index].dev;
 	dst->physical = map->stripes[stripe_index].physical +
-			stripe_offset + stripe_nr * map->stripe_len;
+			stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT);
 }

 int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
@@ -6356,35 +6233,35 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	struct map_lookup *map;
 	u64 map_offset;
 	u64 stripe_offset;
-	u64 stripe_nr;
-	u64 stripe_len;
+	u32 stripe_nr;
 	u32 stripe_index;
 	int data_stripes;
 	int i;
 	int ret = 0;
 	int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
 	int num_stripes;
+	int num_copies;
 	int max_errors = 0;
-	int tgtdev_indexes = 0;
 	struct btrfs_io_context *bioc = NULL;
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 	int dev_replace_is_ongoing = 0;
-	int num_alloc_stripes;
-	int patch_the_first_stripe_for_dev_replace = 0;
-	u64 physical_to_patch_in_first_stripe = 0;
+	u16 num_alloc_stripes;
 	u64 raid56_full_stripe_start = (u64)-1;
 	u64 max_len;

 	ASSERT(bioc_ret);
 	ASSERT(op != BTRFS_MAP_DISCARD);

+	num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
+	if (mirror_num > num_copies)
+		return -EINVAL;
+
 	em = btrfs_get_chunk_map(fs_info, logical, *length);
 	if (IS_ERR(em))
 		return PTR_ERR(em);

 	map = em->map_lookup;
 	data_stripes = nr_data_stripes(map);
-	stripe_len = map->stripe_len;

 	map_offset = logical - em->start;
 	max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr,
@@ -6400,25 +6277,11 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	if (!dev_replace_is_ongoing)
 		up_read(&dev_replace->rwsem);

-	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
-	    !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
-		ret = get_extra_mirror_from_replace(fs_info, logical, *length,
-						    dev_replace->srcdev->devid,
-						    &mirror_num,
-					    &physical_to_patch_in_first_stripe);
-		if (ret)
-			goto out;
-		else
-			patch_the_first_stripe_for_dev_replace = 1;
-	} else if (mirror_num > map->num_stripes) {
-		mirror_num = 0;
-	}
-
 	num_stripes = 1;
 	stripe_index = 0;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
-		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
-					&stripe_index);
+		stripe_index = stripe_nr % map->num_stripes;
+		stripe_nr /= map->num_stripes;
 		if (!need_full_stripe(op))
 			mirror_num = 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
@@ -6444,8 +6307,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		u32 factor = map->num_stripes / map->sub_stripes;

-		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
-		stripe_index *= map->sub_stripes;
+		stripe_index = (stripe_nr % factor) * map->sub_stripes;
+		stripe_nr /= factor;

 		if (need_full_stripe(op))
 			num_stripes = map->sub_stripes;
@@ -6460,11 +6323,17 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		}

 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		ASSERT(map->stripe_len == BTRFS_STRIPE_LEN);
 		if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
-			/* push stripe_nr back to the start of the full stripe */
-			stripe_nr = div64_u64(raid56_full_stripe_start,
-					stripe_len * data_stripes);
+			/*
+			 * Push stripe_nr back to the start of the full stripe
+			 * For those cases needing a full stripe, @stripe_nr
+			 * is the full stripe number.
+			 *
+			 * Originally we go raid56_full_stripe_start / full_stripe_len,
+			 * but that can be expensive. Here we just divide
+			 * @stripe_nr with @data_stripes.
+			 */
+			stripe_nr /= data_stripes;

 			/* RAID[56] write or recovery. Return all stripes */
 			num_stripes = map->num_stripes;
@@ -6473,7 +6342,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 			/* Return the length to the full stripe end */
 			*length = min(logical + *length,
 				      raid56_full_stripe_start + em->start +
-				      data_stripes * stripe_len) - logical;
+				      (data_stripes << BTRFS_STRIPE_LEN_SHIFT)) - logical;
 			stripe_index = 0;
 			stripe_offset = 0;
 		} else {
@@ -6482,25 +6351,24 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 			 * Mirror #2 is RAID5 parity block.
 			 * Mirror #3 is RAID6 Q block.
 			 */
-			stripe_nr = div_u64_rem(stripe_nr,
-					data_stripes, &stripe_index);
+			stripe_index = stripe_nr % data_stripes;
+			stripe_nr /= data_stripes;
 			if (mirror_num > 1)
 				stripe_index = data_stripes + mirror_num - 2;

 			/* We distribute the parity blocks across stripes */
-			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
-					&stripe_index);
+			stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
 			if (!need_full_stripe(op) && mirror_num <= 1)
 				mirror_num = 1;
 		}
 	} else {
 		/*
-		 * after this, stripe_nr is the number of stripes on this
+		 * After this, stripe_nr is the number of stripes on this
 		 * device we have to walk to find the data, and stripe_index is
 		 * the number of our device in the stripe array
 		 */
-		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
-				&stripe_index);
+		stripe_index = stripe_nr % map->num_stripes;
+		stripe_nr /= map->num_stripes;
 		mirror_num = stripe_index + 1;
 	}
 	if (stripe_index >= map->num_stripes) {
@@ -6512,13 +6380,16 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	}

 	num_alloc_stripes = num_stripes;
-	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
-		if (op == BTRFS_MAP_WRITE)
-			num_alloc_stripes <<= 1;
-		if (op == BTRFS_MAP_GET_READ_MIRRORS)
-			num_alloc_stripes++;
-		tgtdev_indexes = num_stripes;
-	}
+	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
+	    op != BTRFS_MAP_READ)
+		/*
+		 * For replace case, we need to add extra stripes for extra
+		 * duplicated stripes.
+		 *
+		 * For both WRITE and GET_READ_MIRRORS, we may have at most
+		 * 2 more stripes (DUP types, otherwise 1).
+		 */
+		num_alloc_stripes += 2;

 	/*
	 * If this I/O maps to a single device, try to return the device and
@@ -6529,53 +6400,53 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	    !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) &&
 	    (!need_full_stripe(op) || !dev_replace_is_ongoing ||
 	     !dev_replace->tgtdev)) {
-		if (patch_the_first_stripe_for_dev_replace) {
-			smap->dev = dev_replace->tgtdev;
-			smap->physical = physical_to_patch_in_first_stripe;
-			*mirror_num_ret = map->num_stripes + 1;
-		} else {
-			set_io_stripe(smap, map, stripe_index, stripe_offset,
-				      stripe_nr);
-			*mirror_num_ret = mirror_num;
-		}
+		set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
+		*mirror_num_ret = mirror_num;
 		*bioc_ret = NULL;
 		ret = 0;
 		goto out;
 	}

-	bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
+	bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
 	if (!bioc) {
 		ret = -ENOMEM;
 		goto out;
 	}
+	bioc->map_type = map->type;

-	for (i = 0; i < num_stripes; i++) {
-		set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset,
-			      stripe_nr);
-		stripe_index++;
-	}
-
-	/* Build raid_map */
+	/*
+	 * For RAID56 full map, we need to make sure the stripes[] follows the
+	 * rule that data stripes are all ordered, then followed with P and Q
+	 * (if we have).
+	 *
+	 * It's still mostly the same as other profiles, just with extra rotation.
+	 */
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
 	    (need_full_stripe(op) || mirror_num > 1)) {
-		u64 tmp;
-		unsigned rot;
-
-		/* Work out the disk rotation on this stripe-set */
-		div_u64_rem(stripe_nr, num_stripes, &rot);
-
-		/* Fill in the logical address of each stripe */
-		tmp = stripe_nr * data_stripes;
-		for (i = 0; i < data_stripes; i++)
-			bioc->raid_map[(i + rot) % num_stripes] =
-				em->start + (tmp + i) * map->stripe_len;
-
-		bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
-		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
-			bioc->raid_map[(i + rot + 1) % num_stripes] =
-				RAID6_Q_STRIPE;
-
-		sort_parity_stripes(bioc, num_stripes);
+		/*
+		 * For RAID56 @stripe_nr is already the number of full stripes
+		 * before us, which is also the rotation value (needs to modulo
+		 * with num_stripes).
+		 *
+		 * In this case, we just add @stripe_nr with @i, then do the
+		 * modulo, to reduce one modulo call.
+		 */
+		bioc->full_stripe_logical = em->start +
+			((stripe_nr * data_stripes) << BTRFS_STRIPE_LEN_SHIFT);
+		for (i = 0; i < num_stripes; i++)
+			set_io_stripe(&bioc->stripes[i], map,
+				      (i + stripe_nr) % num_stripes,
+				      stripe_offset, stripe_nr);
+	} else {
+		/*
+		 * For all other non-RAID56 profiles, just copy the target
+		 * stripe into the bioc.
+		 */
+		for (i = 0; i < num_stripes; i++) {
+			set_io_stripe(&bioc->stripes[i], map, stripe_index,
+				      stripe_offset, stripe_nr);
+			stripe_index++;
+		}
 	}

 	if (need_full_stripe(op))
@@ -6583,27 +6454,15 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,

 	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
 	    need_full_stripe(op)) {
-		handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
+		handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
 					  &num_stripes, &max_errors);
 	}

 	*bioc_ret = bioc;
-	bioc->map_type = map->type;
 	bioc->num_stripes = num_stripes;
 	bioc->max_errors = max_errors;
 	bioc->mirror_num = mirror_num;

-	/*
-	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
-	 * mirror_num == num_stripes + 1 && dev_replace target drive is
-	 * available as a mirror
-	 */
-	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
-		WARN_ON(num_stripes > 1);
-		bioc->stripes[0].dev = dev_replace->tgtdev;
-		bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
-		bioc->mirror_num = map->num_stripes + 1;
-	}
 out:
 	if (dev_replace_is_ongoing) {
 		lockdep_assert_held(&dev_replace->rwsem);
@@ -6941,7 +6800,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
 	map->num_stripes = num_stripes;
 	map->io_width = btrfs_chunk_io_width(leaf, chunk);
 	map->io_align = btrfs_chunk_io_align(leaf, chunk);
-	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
 	map->type = type;
 	/*
 	 * We can't use the sub_stripes value, as for profiles other than
@@ -8161,3 +8019,76 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)

 	return true;
 }
+
+static void map_raid56_repair_block(struct btrfs_io_context *bioc,
+				    struct btrfs_io_stripe *smap,
+				    u64 logical)
+{
+	int data_stripes = nr_bioc_data_stripes(bioc);
+	int i;
+
+	for (i = 0; i < data_stripes; i++) {
+		u64 stripe_start = bioc->full_stripe_logical +
+				   (i << BTRFS_STRIPE_LEN_SHIFT);
+
+		if (logical >= stripe_start &&
+		    logical < stripe_start + BTRFS_STRIPE_LEN)
+			break;
+	}
+	ASSERT(i < data_stripes);
+	smap->dev = bioc->stripes[i].dev;
+	smap->physical = bioc->stripes[i].physical +
+			((logical - bioc->full_stripe_logical) &
+			 BTRFS_STRIPE_LEN_MASK);
+}
+
+/*
+ * Map a repair write into a single device.
+ *
+ * A repair write is triggered by read time repair or scrub, which would only
+ * update the contents of a single device.
+ * It does not update any other mirrors nor go through the RMW path.
+ *
+ * Callers should ensure:
+ *
+ * - Call btrfs_bio_counter_inc_blocked() first
+ * - The range does not cross stripe boundary
+ * - Has a valid @mirror_num passed in.
+ */
+int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
+			   struct btrfs_io_stripe *smap, u64 logical,
+			   u32 length, int mirror_num)
+{
+	struct btrfs_io_context *bioc = NULL;
+	u64 map_length = length;
+	int mirror_ret = mirror_num;
+	int ret;
+
+	ASSERT(mirror_num > 0);
+
+	ret = __btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
+				&bioc, smap, &mirror_ret, true);
+	if (ret < 0)
+		return ret;
+
+	/* The map range should not cross stripe boundary. */
+	ASSERT(map_length >= length);
+
+	/* Already mapped to single stripe. */
+	if (!bioc)
+		goto out;
+
+	/* Map the RAID56 multi-stripe writes to a single one. */
+	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		map_raid56_repair_block(bioc, smap, logical);
+		goto out;
+	}
+
+	ASSERT(mirror_num <= bioc->num_stripes);
+	smap->dev = bioc->stripes[mirror_num - 1].dev;
+	smap->physical = bioc->stripes[mirror_num - 1].physical;
+out:
+	btrfs_put_bioc(bioc);
+	ASSERT(smap->dev);
+	return 0;
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7e51f2238f72..bf47a1a70813 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -17,7 +17,11 @@

 extern struct mutex uuid_mutex;

-#define BTRFS_STRIPE_LEN	SZ_64K
+#define BTRFS_STRIPE_LEN		SZ_64K
+#define BTRFS_STRIPE_LEN_SHIFT		(16)
+#define BTRFS_STRIPE_LEN_MASK		(BTRFS_STRIPE_LEN - 1)
+
+static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT);

 /* Used by sanity check for btrfs_raid_types. */
 #define const_ffs(n) (__builtin_ctzll(n) + 1)
@@ -404,17 +408,74 @@ struct btrfs_io_context {
 	u64 map_type; /* get from map_lookup->type */
 	struct bio *orig_bio;
 	atomic_t error;
-	int max_errors;
-	int num_stripes;
-	int mirror_num;
-	int num_tgtdevs;
-	int *tgtdev_map;
+	u16 max_errors;
+
+	/*
+	 * The total number of stripes, including the extra duplicated
+	 * stripe for replace.
+	 */
+	u16 num_stripes;
+
+	/*
+	 * The mirror_num of this bioc.
+	 *
+	 * This is for reads which use 0 as mirror_num, thus we should return a
+	 * valid mirror_num (>0) for the reader.
+	 */
+	u16 mirror_num;
+
+	/*
+	 * The following two members are for dev-replace case only.
+	 *
+	 * @replace_nr_stripes:	Number of duplicated stripes which need to be
+	 *			written to replace target.
+	 *			Should be <= 2 (2 for DUP, otherwise <= 1).
+	 * @replace_stripe_src:	The array indicates where the duplicated stripes
+	 *			are from.
+	 *
+	 * The @replace_stripe_src[] array is mostly for RAID56 cases.
+	 * As non-RAID56 stripes share the same contents of the mapped range,
+	 * there is no need to bother where the duplicated ones are from.
+	 *
+	 * But for RAID56 case, all stripes contain different contents, thus
+	 * we need a way to know the mapping.
+	 *
+	 * Here is an example for the two members, using a RAID5 write:
+	 *
+	 *   num_stripes:	4 (3 + 1 duplicated write)
+	 *   stripes[0]:	dev = devid 1, physical = X
+	 *   stripes[1]:	dev = devid 2, physical = Y
+	 *   stripes[2]:	dev = devid 3, physical = Z
+	 *   stripes[3]:	dev = devid 0, physical = Y
+	 *
+	 * replace_nr_stripes = 1
+	 * replace_stripe_src = 1	<- Means stripes[1] is involved in replace.
+	 *				   The duplicated stripe index would be
+	 *				   (@num_stripes - 1).
+	 *
+	 * Note that we can still have cases replace_nr_stripes = 2 for DUP.
+	 * In that case, all stripes share the same content, thus we don't
+	 * need to bother @replace_stripe_src value at all.
+	 */
+	u16 replace_nr_stripes;
+	s16 replace_stripe_src;
 	/*
-	 * logical block numbers for the start of each stripe
-	 * The last one or two are p/q. These are sorted,
-	 * so raid_map[0] is the start of our full stripe
+	 * Logical bytenr of the full stripe start, only for RAID56 cases.
+	 *
+	 * When this value is set to other than (u64)-1, the stripes[] should
+	 * follow this pattern:
+	 *
+	 * (real_stripes = num_stripes - replace_nr_stripes)
+	 * (data_stripes = (is_raid6) ? (real_stripes - 2) : (real_stripes - 1))
+	 *
+	 * stripes[0]:			The first data stripe
+	 * stripes[1]:			The second data stripe
+	 * ...
+	 * stripes[data_stripes - 1]:	The last data stripe
+	 * stripes[data_stripes]:	The P stripe
+	 * stripes[data_stripes + 1]:	The Q stripe (only for RAID6).
 	 */
-	u64 *raid_map;
+	u64 full_stripe_logical;
 	struct btrfs_io_stripe stripes[];
 };

@@ -446,7 +507,6 @@ struct map_lookup {
 	u64 type;
 	int io_align;
 	int io_width;
-	u32 stripe_len;
 	int num_stripes;
 	int sub_stripes;
 	int verified_stripes;	/* For mount time dev extent verification */
@@ -527,6 +587,9 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		      struct btrfs_io_context **bioc_ret,
 		      struct btrfs_io_stripe *smap, int *mirror_num_ret,
 		      int need_raid_map);
+int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
+			   struct btrfs_io_stripe *smap, u64 logical,
+			   u32 length, int mirror_num);
 struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 					       u64 logical, u64 *length_ret,
 					       u32 *num_stripes);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index da7bb9187b68..8acb05e176c5 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -350,8 +350,6 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 	zlib_inflateEnd(&workspace->strm);
 	if (data_in)
 		kunmap_local(data_in);
-	if (!ret)
-		zero_fill_bio(cb->orig_bio);
 	return ret;
 }

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 45d04092f2f8..a9b32ba6b2ce 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1640,14 +1640,14 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
 {
 	u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
 	struct btrfs_inode *inode = bbio->inode;
-	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
 	struct btrfs_block_group *cache;
 	bool ret = false;

 	if (!btrfs_is_zoned(fs_info))
 		return false;

-	if (!is_data_inode(&inode->vfs_inode))
+	if (!inode || !is_data_inode(&inode->vfs_inode))
 		return false;

 	if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index e34f1ab99d56..f798da267590 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -609,7 +609,6 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 		}
 	}
 	ret = 0;
-	zero_fill_bio(cb->orig_bio);
done:
 	if (workspace->in_buf.src)
 		kunmap_local(workspace->in_buf.src);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d766be7152e1..b3e7529ff55e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -500,6 +500,7 @@ void bio_associate_blkg(struct bio *bio);
 void bio_associate_blkg_from_css(struct bio *bio,
 				 struct cgroup_subsys_state *css);
 void bio_clone_blkg_association(struct bio *dst, struct bio *src);
+void blkcg_punt_bio_submit(struct bio *bio);
 #else	/* CONFIG_BLK_CGROUP */
 static inline void bio_associate_blkg(struct bio *bio) { }
 static inline void bio_associate_blkg_from_css(struct bio *bio,
@@ -507,6 +508,10 @@ static inline void bio_associate_blkg_from_css(struct bio *bio,
 { }
 static inline void bio_clone_blkg_association(struct bio *dst,
 					      struct bio *src) { }
+static inline void blkcg_punt_bio_submit(struct bio *bio)
+{
+	submit_bio(bio);
+}
 #endif	/* CONFIG_BLK_CGROUP */

 static inline void bio_set_dev(struct bio *bio, struct block_device *bdev)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 99be590f952f..fb8843990d28 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -404,18 +404,11 @@ enum req_flag_bits {
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
 	__REQ_BACKGROUND,	/* background IO */
 	__REQ_NOWAIT,           /* Don't wait if request will block */
-	/*
-	 * When a shared kthread needs to issue a bio for a cgroup, doing
-	 * so synchronously can lead to priority inversions as the kthread
-	 * can be trapped waiting for that cgroup. CGROUP_PUNT flag makes
-	 * submit_bio() punt the actual issuing to a dedicated per-blkcg
-	 * work item to avoid such priority inversions.
-	 */
-	__REQ_CGROUP_PUNT,
 	__REQ_POLLED,		/* caller polls for completion using bio_poll */
 	__REQ_ALLOC_CACHE,	/* allocate IO from cache if available */
 	__REQ_SWAP,		/* swap I/O */
 	__REQ_DRV,		/* for driver use */
+	__REQ_FS_PRIVATE,	/* for file system (submitter) use */

 	/*
 	 * Command specific flags, keep last:
@@ -443,14 +436,13 @@ enum req_flag_bits {
 #define REQ_RAHEAD	(__force blk_opf_t)(1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND	(__force blk_opf_t)(1ULL << __REQ_BACKGROUND)
 #define REQ_NOWAIT	(__force blk_opf_t)(1ULL << __REQ_NOWAIT)
-#define REQ_CGROUP_PUNT	(__force blk_opf_t)(1ULL << __REQ_CGROUP_PUNT)
-
-#define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
 #define REQ_POLLED	(__force blk_opf_t)(1ULL << __REQ_POLLED)
 #define REQ_ALLOC_CACHE	(__force blk_opf_t)(1ULL << __REQ_ALLOC_CACHE)
-
-#define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
 #define REQ_SWAP	(__force blk_opf_t)(1ULL << __REQ_SWAP)
+#define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
+#define REQ_FS_PRIVATE	(__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
+
+#define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)

 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h
index bd21af828ff6..357ae4611a45 100644
--- a/include/linux/crc32c.h
+++ b/include/linux/crc32c.h
@@ -5,7 +5,6 @@
 #include <linux/types.h>

 extern u32 crc32c(u32 crc, const void *address, unsigned int length);
-extern const char *crc32c_impl(void);

 /* This macro exists for backwards-compatibility. */
 #define crc32c_le crc32c
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 46020373e155..fba937999fbf 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -70,8 +70,6 @@ struct writeback_control {
 	 */
 	unsigned no_cgroup_owner:1;

-	unsigned punt_to_cgroup:1;	/* cgrp punting, see __REQ_CGROUP_PUNT */
-
 	/* To enable batching of swap writes to non-block-device backends,
 	 * "plug" can be set point to a 'struct swap_iocb *'.  When all swap
 	 * writes have been submitted, if with swap_iocb is not NULL,
@@ -97,9 +95,6 @@ static inline blk_opf_t wbc_to_write_flags(struct writeback_control *wbc)
 {
 	blk_opf_t flags = 0;

-	if (wbc->punt_to_cgroup)
-		flags = REQ_CGROUP_PUNT;
-
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		flags |= REQ_SYNC;
 	else if (wbc->for_kupdate || wbc->for_background)
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 75d7d22c3a27..8ea9cea9bfeb 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -2422,7 +2422,7 @@ DECLARE_EVENT_CLASS(btrfs_raid56_bio,
 	),

 	TP_fast_assign_btrfs(rbio->bioc->fs_info,
-		__entry->full_stripe = rbio->bioc->raid_map[0];
+		__entry->full_stripe = rbio->bioc->full_stripe_logical;
 		__entry->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
 		__entry->len = bio->bi_iter.bi_size;
 		__entry->opf = bio_op(bio);
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index ada0a489bf2b..dbb8b96da50d 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -187,6 +187,7 @@ struct btrfs_scrub_progress {
 };

 #define BTRFS_SCRUB_READONLY	1
+#define BTRFS_SCRUB_SUPPORTED_FLAGS	(BTRFS_SCRUB_READONLY)
 struct btrfs_ioctl_scrub_args {
 	__u64 devid;				/* in */
 	__u64 start;				/* in */
diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c
index 5ca0d815a95d..649e687413a0 100644
--- a/lib/libcrc32c.c
+++ b/lib/libcrc32c.c
@@ -65,12 +65,6 @@ static void __exit libcrc32c_mod_fini(void)
 	crypto_free_shash(tfm);
 }

-const char *crc32c_impl(void)
-{
-	return crypto_shash_driver_name(tfm);
-}
-EXPORT_SYMBOL(crc32c_impl);
-
 module_init(libcrc32c_mod_init);
 module_exit(libcrc32c_mod_fini);

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
|
|
index f937be1afe65..060032cfb046 100644
|
|
--- a/tools/objtool/check.c
|
|
+++ b/tools/objtool/check.c
|
|
@@ -202,6 +202,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
|
|
"__reiserfs_panic",
|
|
"__stack_chk_fail",
|
|
"__ubsan_handle_builtin_unreachable",
|
|
+ "btrfs_assertfail",
|
|
"cpu_bringup_and_idle",
|
|
"cpu_startup_entry",
|
|
"do_exit",
|
|
--
|
|
2.40.1
|
|
|
|
From 0ad50219edceae27eb649c5fb76f2b8aebe27e3f Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Mon, 17 Apr 2023 18:32:06 +0200
Subject: [PATCH 06/10] Implement amd-pstate guided driver

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 .../admin-guide/kernel-parameters.txt       |  40 ++--
 Documentation/admin-guide/pm/amd-pstate.rst |  31 ++-
 drivers/acpi/cppc_acpi.c                    | 121 ++++++++++-
 drivers/cpufreq/amd-pstate.c                | 199 ++++++++++++------
 include/acpi/cppc_acpi.h                    |  11 +
 include/linux/amd-pstate.h                  |   2 +
 6 files changed, 312 insertions(+), 92 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 97303fa40350..dddaba21a9a7 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -339,6 +339,29 @@
 			This mode requires kvm-amd.avic=1.
 			(Default when IOMMU HW support is present.)

+	amd_pstate=	[X86]
+			disable
+			  Do not enable amd_pstate as the default
+			  scaling driver for the supported processors
+			passive
+			  Use amd_pstate with passive mode as a scaling driver.
+			  In this mode autonomous selection is disabled.
+			  Driver requests a desired performance level and platform
+			  tries to match the same performance level if it is
+			  satisfied by guaranteed performance level.
+			active
+			  Use amd_pstate_epp driver instance as the scaling driver,
+			  driver provides a hint to the hardware if software wants
+			  to bias toward performance (0x0) or energy efficiency (0xff)
+			  to the CPPC firmware. Then the CPPC power algorithm will
+			  calculate the runtime workload and adjust the realtime cores
+			  frequency.
+			guided
+			  Activate guided autonomous mode. Driver requests minimum and
+			  maximum performance level and the platform autonomously
+			  selects a performance level in this range and appropriate
+			  to the current workload.
+
 	amijoy.map=	[HW,JOY] Amiga joystick support
 			Map of devices attached to JOY0DAT and JOY1DAT
 			Format: <a>,<b>
@@ -7077,20 +7100,3 @@
 			xmon commands.
 			off	xmon is disabled.

-	amd_pstate=	[X86]
-			disable
-			  Do not enable amd_pstate as the default
-			  scaling driver for the supported processors
-			passive
-			  Use amd_pstate as a scaling driver, driver requests a
-			  desired performance on this abstract scale and the power
-			  management firmware translates the requests into actual
-			  hardware states (core frequency, data fabric and memory
-			  clocks etc.)
-			active
-			  Use amd_pstate_epp driver instance as the scaling driver,
-			  driver provides a hint to the hardware if software wants
-			  to bias toward performance (0x0) or energy efficiency (0xff)
-			  to the CPPC firmware. then CPPC power algorithm will
-			  calculate the runtime workload and adjust the realtime cores
-			  frequency.
diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst
index 6e5298b521b1..1cf40f69278c 100644
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -303,13 +303,18 @@ efficiency frequency management method on AMD processors.
 AMD Pstate Driver Operation Modes
 =================================

-``amd_pstate`` CPPC has two operation modes: CPPC Autonomous(active) mode and
-CPPC non-autonomous(passive) mode.
-active mode and passive mode can be chosen by different kernel parameters.
-When in Autonomous mode, CPPC ignores requests done in the Desired Performance
-Target register and takes into account only the values set to the Minimum requested
-performance, Maximum requested performance, and Energy Performance Preference
-registers. When Autonomous is disabled, it only considers the Desired Performance Target.
+``amd_pstate`` CPPC has 3 operation modes: autonomous (active) mode,
+non-autonomous (passive) mode and guided autonomous (guided) mode.
+Active/passive/guided mode can be chosen by different kernel parameters.
+
+- In autonomous mode, platform ignores the desired performance level request
+  and takes into account only the values set to the minimum, maximum and energy
+  performance preference registers.
+- In non-autonomous mode, platform gets desired performance level
+  from OS directly through Desired Performance Register.
+- In guided-autonomous mode, platform sets operating performance level
+  autonomously according to the current workload and within the limits set by
+  OS through min and max performance registers.

 Active Mode
 ------------
@@ -338,6 +343,15 @@ to the Performance Reduction Tolerance register. Above the nominal performance l
 processor must provide at least nominal performance requested and go higher if current
 operating conditions allow.

+Guided Mode
+-----------
+
+``amd_pstate=guided``
+
+If ``amd_pstate=guided`` is passed as a kernel command line option then this mode
+is activated. In this mode, driver requests minimum and maximum performance
+level and the platform autonomously selects a performance level in this range
+and appropriate to the current workload.

 User Space Interface in ``sysfs`` - General
 ===========================================
@@ -358,6 +372,9 @@ control its functionality at the system level. They are located in the
         "passive"
                 The driver is functional and in the ``passive mode``

+        "guided"
+                The driver is functional and in the ``guided mode``
+
         "disable"
                 The driver is unregistered and not functional now.

diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
index c51d3ccb4cca..02a4bfb54967 100644
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -1433,6 +1433,103 @@ int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable)
 }
 EXPORT_SYMBOL_GPL(cppc_set_epp_perf);

+/*
+ * cppc_get_auto_sel_caps - Read autonomous selection register.
+ * @cpunum : CPU from which to read register.
+ * @perf_caps : struct where autonomous selection register value is updated.
+ */
+int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps)
+{
+	struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpunum);
+	struct cpc_register_resource *auto_sel_reg;
+	u64 auto_sel;
+
+	if (!cpc_desc) {
+		pr_debug("No CPC descriptor for CPU:%d\n", cpunum);
+		return -ENODEV;
+	}
+
+	auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE];
+
+	if (!CPC_SUPPORTED(auto_sel_reg))
+		pr_warn_once("Autonomous mode is not supported!\n");
+
+	if (CPC_IN_PCC(auto_sel_reg)) {
+		int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpunum);
+		struct cppc_pcc_data *pcc_ss_data = NULL;
+		int ret = 0;
+
+		if (pcc_ss_id < 0)
+			return -ENODEV;
+
+		pcc_ss_data = pcc_data[pcc_ss_id];
+
+		down_write(&pcc_ss_data->pcc_lock);
+
+		if (send_pcc_cmd(pcc_ss_id, CMD_READ) >= 0) {
+			cpc_read(cpunum, auto_sel_reg, &auto_sel);
+			perf_caps->auto_sel = (bool)auto_sel;
+		} else {
+			ret = -EIO;
+		}
+
+		up_write(&pcc_ss_data->pcc_lock);
+
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cppc_get_auto_sel_caps);
+
+/*
+ * cppc_set_auto_sel - Write autonomous selection register.
+ * @cpu    : CPU to which to write register.
+ * @enable : the desired value of autonomous selection register to be updated.
+ */
+int cppc_set_auto_sel(int cpu, bool enable)
+{
+	int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu);
+	struct cpc_register_resource *auto_sel_reg;
+	struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu);
+	struct cppc_pcc_data *pcc_ss_data = NULL;
+	int ret = -EINVAL;
+
+	if (!cpc_desc) {
+		pr_debug("No CPC descriptor for CPU:%d\n", cpu);
+		return -ENODEV;
+	}
+
+	auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE];
+
+	if (CPC_IN_PCC(auto_sel_reg)) {
+		if (pcc_ss_id < 0) {
+			pr_debug("Invalid pcc_ss_id\n");
+			return -ENODEV;
+		}
+
+		if (CPC_SUPPORTED(auto_sel_reg)) {
+			ret = cpc_write(cpu, auto_sel_reg, enable);
+			if (ret)
+				return ret;
+		}
+
+		pcc_ss_data = pcc_data[pcc_ss_id];
+
+		down_write(&pcc_ss_data->pcc_lock);
+		/* after writing CPC, transfer the ownership of PCC to platform */
+		ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE);
+		up_write(&pcc_ss_data->pcc_lock);
+	} else {
+		ret = -ENOTSUPP;
+		pr_debug("_CPC in PCC is not supported\n");
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cppc_set_auto_sel);
+
+
 /**
  * cppc_set_enable - Set to enable CPPC on the processor by writing the
  * Continuous Performance Control package EnableRegister field.
@@ -1488,7 +1585,7 @@ EXPORT_SYMBOL_GPL(cppc_set_enable);
 int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
 {
 	struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu);
-	struct cpc_register_resource *desired_reg;
+	struct cpc_register_resource *desired_reg, *min_perf_reg, *max_perf_reg;
 	int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu);
 	struct cppc_pcc_data *pcc_ss_data = NULL;
 	int ret = 0;
@@ -1499,6 +1596,8 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
 	}

 	desired_reg = &cpc_desc->cpc_regs[DESIRED_PERF];
+	min_perf_reg = &cpc_desc->cpc_regs[MIN_PERF];
+	max_perf_reg = &cpc_desc->cpc_regs[MAX_PERF];

 	/*
	 * This is Phase-I where we want to write to CPC registers
@@ -1507,7 +1606,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
	 * Since read_lock can be acquired by multiple CPUs simultaneously we
	 * achieve that goal here
	 */
-	if (CPC_IN_PCC(desired_reg)) {
+	if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg)) {
 		if (pcc_ss_id < 0) {
 			pr_debug("Invalid pcc_ss_id\n");
 			return -ENODEV;
@@ -1530,13 +1629,19 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
 		cpc_desc->write_cmd_status = 0;
 	}

-	/*
-	 * Skip writing MIN/MAX until Linux knows how to come up with
-	 * useful values.
-	 */
 	cpc_write(cpu, desired_reg, perf_ctrls->desired_perf);

-	if (CPC_IN_PCC(desired_reg))
+	/**
+	 * Only write if min_perf and max_perf not zero. Some drivers pass zero
+	 * value to min and max perf, but they don't mean to set the zero value,
+	 * they just don't want to write to those registers.
+	 */
+	if (perf_ctrls->min_perf)
+		cpc_write(cpu, min_perf_reg, perf_ctrls->min_perf);
+	if (perf_ctrls->max_perf)
+		cpc_write(cpu, max_perf_reg, perf_ctrls->max_perf);
+
+	if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg))
 		up_read(&pcc_ss_data->pcc_lock);	/* END Phase-I */
 	/*
	 * This is Phase-II where we transfer the ownership of PCC to Platform
@@ -1584,7 +1689,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
	 * case during a CMD_READ and if there are pending writes it delivers
	 * the write command before servicing the read command
	 */
-	if (CPC_IN_PCC(desired_reg)) {
+	if (CPC_IN_PCC(desired_reg) || CPC_IN_PCC(min_perf_reg) || CPC_IN_PCC(max_perf_reg)) {
 		if (down_write_trylock(&pcc_ss_data->pcc_lock)) {/* BEGIN Phase-II */
 			/* Update only if there are pending write commands */
 			if (pcc_ss_data->pending_pcc_write_cmd)
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 8dd46fad151e..5a3d4aa0f45a 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -63,7 +63,6 @@ static struct cpufreq_driver *current_pstate_driver;
 static struct cpufreq_driver amd_pstate_driver;
 static struct cpufreq_driver amd_pstate_epp_driver;
 static int cppc_state = AMD_PSTATE_DISABLE;
-struct kobject *amd_pstate_kobj;

 /*
  * AMD Energy Preference Performance (EPP)
@@ -106,6 +105,8 @@ static unsigned int epp_values[] = {
 	[EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE,
 };

+typedef int (*cppc_mode_transition_fn)(int);
+
 static inline int get_mode_idx_from_str(const char *str, size_t size)
 {
 	int i;
@@ -308,7 +309,22 @@ static int cppc_init_perf(struct amd_cpudata *cpudata)
 		   cppc_perf.lowest_nonlinear_perf);
 	WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf);

-	return 0;
+	if (cppc_state == AMD_PSTATE_ACTIVE)
+		return 0;
+
+	ret = cppc_get_auto_sel_caps(cpudata->cpu, &cppc_perf);
+	if (ret) {
+		pr_warn("failed to get auto_sel, ret: %d\n", ret);
+		return 0;
+	}
+
+	ret = cppc_set_auto_sel(cpudata->cpu,
+			(cppc_state == AMD_PSTATE_PASSIVE) ? 0 : 1);
+
+	if (ret)
+		pr_warn("failed to set auto_sel, ret: %d\n", ret);
+
+	return ret;
 }

 DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf);
@@ -385,12 +401,18 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata)
 }

 static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf,
-			      u32 des_perf, u32 max_perf, bool fast_switch)
+			      u32 des_perf, u32 max_perf, bool fast_switch, int gov_flags)
 {
 	u64 prev = READ_ONCE(cpudata->cppc_req_cached);
 	u64 value = prev;

 	des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf);
+
+	if ((cppc_state == AMD_PSTATE_GUIDED) && (gov_flags & CPUFREQ_GOV_DYNAMIC_SWITCHING)) {
+		min_perf = des_perf;
+		des_perf = 0;
+	}
+
 	value &= ~AMD_CPPC_MIN_PERF(~0L);
 	value |= AMD_CPPC_MIN_PERF(min_perf);

@@ -445,7 +467,7 @@ static int amd_pstate_target(struct cpufreq_policy *policy,

 	cpufreq_freq_transition_begin(policy, &freqs);
 	amd_pstate_update(cpudata, min_perf, des_perf,
-			  max_perf, false);
+			  max_perf, false, policy->governor->flags);
 	cpufreq_freq_transition_end(policy, &freqs, false);

 	return 0;
@@ -479,7 +501,8 @@ static void amd_pstate_adjust_perf(unsigned int cpu,
 	if (max_perf < min_perf)
 		max_perf = min_perf;

-	amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true);
+	amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true,
+			  policy->governor->flags);
 	cpufreq_cpu_put(policy);
 }

@@ -816,6 +839,98 @@ static ssize_t show_energy_performance_preference(
 	return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]);
 }

+static void amd_pstate_driver_cleanup(void)
+{
+	amd_pstate_enable(false);
+	cppc_state = AMD_PSTATE_DISABLE;
+	current_pstate_driver = NULL;
+}
+
+static int amd_pstate_register_driver(int mode)
+{
+	int ret;
+
+	if (mode == AMD_PSTATE_PASSIVE || mode == AMD_PSTATE_GUIDED)
+		current_pstate_driver = &amd_pstate_driver;
+	else if (mode == AMD_PSTATE_ACTIVE)
+		current_pstate_driver = &amd_pstate_epp_driver;
+	else
+		return -EINVAL;
+
+	cppc_state = mode;
+	ret = cpufreq_register_driver(current_pstate_driver);
+	if (ret) {
+		amd_pstate_driver_cleanup();
+		return ret;
+	}
+	return 0;
+}
+
+static int amd_pstate_unregister_driver(int dummy)
+{
+	cpufreq_unregister_driver(current_pstate_driver);
+	amd_pstate_driver_cleanup();
+	return 0;
+}
+
+static int amd_pstate_change_mode_without_dvr_change(int mode)
+{
+	int cpu = 0;
+
+	cppc_state = mode;
+
+	if (boot_cpu_has(X86_FEATURE_CPPC) || cppc_state == AMD_PSTATE_ACTIVE)
+		return 0;
+
+	for_each_present_cpu(cpu) {
+		cppc_set_auto_sel(cpu, (cppc_state == AMD_PSTATE_PASSIVE) ? 0 : 1);
+	}
+
+	return 0;
+}
+
+static int amd_pstate_change_driver_mode(int mode)
+{
+	int ret;
+
+	ret = amd_pstate_unregister_driver(0);
+	if (ret)
+		return ret;
+
+	ret = amd_pstate_register_driver(mode);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static cppc_mode_transition_fn mode_state_machine[AMD_PSTATE_MAX][AMD_PSTATE_MAX] = {
+	[AMD_PSTATE_DISABLE] = {
+		[AMD_PSTATE_DISABLE] = NULL,
+		[AMD_PSTATE_PASSIVE] = amd_pstate_register_driver,
+		[AMD_PSTATE_ACTIVE] = amd_pstate_register_driver,
+		[AMD_PSTATE_GUIDED] = amd_pstate_register_driver,
+	},
+	[AMD_PSTATE_PASSIVE] = {
+		[AMD_PSTATE_DISABLE] = amd_pstate_unregister_driver,
+		[AMD_PSTATE_PASSIVE] = NULL,
+		[AMD_PSTATE_ACTIVE] = amd_pstate_change_driver_mode,
+		[AMD_PSTATE_GUIDED] = amd_pstate_change_mode_without_dvr_change,
+	},
+	[AMD_PSTATE_ACTIVE] = {
+		[AMD_PSTATE_DISABLE] = amd_pstate_unregister_driver,
+		[AMD_PSTATE_PASSIVE] = amd_pstate_change_driver_mode,
+		[AMD_PSTATE_ACTIVE] = NULL,
+		[AMD_PSTATE_GUIDED] = amd_pstate_change_driver_mode,
+	},
+	[AMD_PSTATE_GUIDED] = {
+		[AMD_PSTATE_DISABLE] = amd_pstate_unregister_driver,
+		[AMD_PSTATE_PASSIVE] = amd_pstate_change_mode_without_dvr_change,
+		[AMD_PSTATE_ACTIVE] = amd_pstate_change_driver_mode,
+		[AMD_PSTATE_GUIDED] = NULL,
+	},
+};
+
 static ssize_t amd_pstate_show_status(char *buf)
 {
 	if (!current_pstate_driver)
@@ -824,55 +939,22 @@ static ssize_t amd_pstate_show_status(char *buf)
 	return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]);
 }

-static void amd_pstate_driver_cleanup(void)
-{
-	current_pstate_driver = NULL;
-}
-
 static int amd_pstate_update_status(const char *buf, size_t size)
 {
-	int ret = 0;
 	int mode_idx;

-	if (size > 7 || size < 6)
+	if (size > strlen("passive") || size < strlen("active"))
 		return -EINVAL;
+
 	mode_idx = get_mode_idx_from_str(buf, size);

-	switch(mode_idx) {
-	case AMD_PSTATE_DISABLE:
-		if (current_pstate_driver) {
-			cpufreq_unregister_driver(current_pstate_driver);
-			amd_pstate_driver_cleanup();
-		}
-		break;
-	case AMD_PSTATE_PASSIVE:
-		if (current_pstate_driver) {
-			if (current_pstate_driver == &amd_pstate_driver)
-				return 0;
-			cpufreq_unregister_driver(current_pstate_driver);
-		}
+	if (mode_idx < 0 || mode_idx >= AMD_PSTATE_MAX)
+		return -EINVAL;

-		current_pstate_driver = &amd_pstate_driver;
-		cppc_state = AMD_PSTATE_PASSIVE;
-		ret = cpufreq_register_driver(current_pstate_driver);
-		break;
-	case AMD_PSTATE_ACTIVE:
-		if (current_pstate_driver) {
-			if (current_pstate_driver == &amd_pstate_epp_driver)
-				return 0;
-			cpufreq_unregister_driver(current_pstate_driver);
-		}
+	if (mode_state_machine[cppc_state][mode_idx])
+		return mode_state_machine[cppc_state][mode_idx](mode_idx);

-		current_pstate_driver = &amd_pstate_epp_driver;
-		cppc_state = AMD_PSTATE_ACTIVE;
-		ret = cpufreq_register_driver(current_pstate_driver);
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-
-	return ret;
+	return 0;
 }

 static ssize_t show_status(struct kobject *kobj,
@@ -930,6 +1012,7 @@ static struct attribute *pstate_global_attributes[] = {
 };

 static const struct attribute_group amd_pstate_global_attr_group = {
+	.name = "amd_pstate",
 	.attrs = pstate_global_attributes,
 };

@@ -1251,6 +1334,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = {

 static int __init amd_pstate_init(void)
 {
+	struct device *dev_root;
 	int ret;

 	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
@@ -1277,7 +1361,7 @@ static int __init amd_pstate_init(void)
 	/* capability check */
 	if (boot_cpu_has(X86_FEATURE_CPPC)) {
 		pr_debug("AMD CPPC MSR based functionality is supported\n");
-		if (cppc_state == AMD_PSTATE_PASSIVE)
+		if (cppc_state != AMD_PSTATE_ACTIVE)
 			current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
 	} else {
 		pr_debug("AMD CPPC shared memory based functionality is supported\n");
@@ -1297,24 +1381,19 @@ static int __init amd_pstate_init(void)
 	if (ret)
 		pr_err("failed to register with return %d\n", ret);

-	amd_pstate_kobj = kobject_create_and_add("amd_pstate", &cpu_subsys.dev_root->kobj);
-	if (!amd_pstate_kobj) {
-		ret = -EINVAL;
-		pr_err("global sysfs registration failed.\n");
-		goto kobject_free;
-	}
-
-	ret = sysfs_create_group(amd_pstate_kobj, &amd_pstate_global_attr_group);
-	if (ret) {
-		pr_err("sysfs attribute export failed with error %d.\n", ret);
-		goto global_attr_free;
+	dev_root = bus_get_dev_root(&cpu_subsys);
+	if (dev_root) {
+		ret = sysfs_create_group(&dev_root->kobj, &amd_pstate_global_attr_group);
+		put_device(dev_root);
+		if (ret) {
+			pr_err("sysfs attribute export failed with error %d.\n", ret);
+			goto global_attr_free;
+		}
 	}

 	return ret;

 global_attr_free:
-	kobject_put(amd_pstate_kobj);
-kobject_free:
 	cpufreq_unregister_driver(current_pstate_driver);
 	return ret;
 }
@@ -1339,7 +1418,7 @@ static int __init amd_pstate_param(char *str)
 	if (cppc_state == AMD_PSTATE_ACTIVE)
 		current_pstate_driver = &amd_pstate_epp_driver;

-	if (cppc_state == AMD_PSTATE_PASSIVE)
+	if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED)
 		current_pstate_driver = &amd_pstate_driver;

 	return 0;
diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h
index 6b487a5bd638..6126c977ece0 100644
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -109,6 +109,7 @@ struct cppc_perf_caps {
 	u32 lowest_freq;
 	u32 nominal_freq;
 	u32 energy_perf;
+	bool auto_sel;
 };

 struct cppc_perf_ctrls {
@@ -153,6 +154,8 @@ extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val);
 extern int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val);
 extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf);
 extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable);
+extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps);
+extern int cppc_set_auto_sel(int cpu, bool enable);
 #else /* !CONFIG_ACPI_CPPC_LIB */
 static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
 {
@@ -214,6 +217,14 @@ static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf)
 {
 	return -ENOTSUPP;
 }
+static inline int cppc_set_auto_sel(int cpu, bool enable)
+{
+	return -ENOTSUPP;
+}
+static inline int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps)
+{
+	return -ENOTSUPP;
+}
 #endif /* !CONFIG_ACPI_CPPC_LIB */

 #endif /* _CPPC_ACPI_H*/
diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
index f5f22418e64b..c10ebf8c42e6 100644
--- a/include/linux/amd-pstate.h
+++ b/include/linux/amd-pstate.h
@@ -97,6 +97,7 @@ enum amd_pstate_mode {
 	AMD_PSTATE_DISABLE = 0,
 	AMD_PSTATE_PASSIVE,
 	AMD_PSTATE_ACTIVE,
+	AMD_PSTATE_GUIDED,
 	AMD_PSTATE_MAX,
 };

@@ -104,6 +105,7 @@ static const char * const amd_pstate_mode_string[] = {
 	[AMD_PSTATE_DISABLE] = "disable",
 	[AMD_PSTATE_PASSIVE] = "passive",
 	[AMD_PSTATE_ACTIVE] = "active",
+	[AMD_PSTATE_GUIDED] = "guided",
 	NULL,
 };
 #endif /* _LINUX_AMD_PSTATE_H */
--
2.40.1

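Since the patch above registers the global attribute group with .name = "amd_pstate" on the CPU subsystem root device, the operation mode should be selectable at runtime through sysfs. A minimal user-space sketch follows, assuming the resulting path is /sys/devices/system/cpu/amd_pstate/status (inferred from the attribute group registration, not verified here) and that the process runs as root:

/* Editorial sketch, not part of the patch: switch amd-pstate to guided mode. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/devices/system/cpu/amd_pstate/status";
	char cur[16] = { 0 };
	int fd = open(path, O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (read(fd, cur, sizeof(cur) - 1) > 0)
		printf("current mode: %s", cur);
	/* Request guided autonomous mode; the kernel-side
	 * mode_state_machine table decides how to get there. */
	if (pwrite(fd, "guided", strlen("guided"), 0) < 0)
		perror("pwrite");
	close(fd);
	return 0;
}

The write is routed through amd_pstate_update_status(), so invalid strings are rejected with -EINVAL, a same-mode request is a no-op (NULL table entry), and the table picks between re-registering the driver and merely toggling auto_sel.
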
From 3162c47812c5d8dac222403897b3c8f424648c6e Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Mon, 17 Apr 2023 18:28:52 +0200
Subject: [PATCH 07/10] ksm

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 arch/alpha/kernel/syscalls/syscall.tbl      |   1 +
 arch/arm/tools/syscall.tbl                  |   1 +
 arch/arm64/include/asm/unistd.h             |   2 +-
 arch/arm64/include/asm/unistd32.h           |   2 +
 arch/ia64/kernel/syscalls/syscall.tbl       |   1 +
 arch/m68k/kernel/syscalls/syscall.tbl       |   1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |   1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |   1 +
 arch/parisc/kernel/syscalls/syscall.tbl     |   1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |   1 +
 arch/s390/kernel/syscalls/syscall.tbl       |   1 +
 arch/sh/kernel/syscalls/syscall.tbl         |   1 +
 arch/sparc/kernel/syscalls/syscall.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_32.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl      |   1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |   1 +
 include/linux/ksm.h                         |   4 +
 include/linux/syscalls.h                    |   1 +
 include/uapi/asm-generic/unistd.h           |   5 +-
 kernel/sys_ni.c                             |   1 +
 mm/ksm.c                                    |  82 +++++++-----
 mm/madvise.c                                | 117 ++++++++++++++++++
 24 files changed, 199 insertions(+), 31 deletions(-)

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 8ebacf37a8cf..c9d25f85d86d 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -490,3 +490,4 @@
 558	common	process_mrelease		sys_process_mrelease
 559	common	futex_waitv			sys_futex_waitv
 560	common	set_mempolicy_home_node		sys_ni_syscall
+561	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index ac964612d8b0..90933eabe115 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -464,3 +464,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 037feba03a51..64a514f90131 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -39,7 +39,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)

-#define __NR_compat_syscalls		451
+#define __NR_compat_syscalls		452
 #endif

 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 604a2053d006..91f2bb7199af 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -907,6 +907,8 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease)
 __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
 #define __NR_set_mempolicy_home_node 450
 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
+#define __NR_pmadv_ksm 451
+__SYSCALL(__NR_pmadv_ksm, sys_pmadv_ksm)

 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 72c929d9902b..0d5b1d14b2b5 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -371,3 +371,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index b1f3940bc298..5ccf925567da 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -450,3 +450,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 820145e47350..6b76208597f3 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -456,3 +456,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 253ff994ed2e..e4aeedb17c38 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -389,3 +389,4 @@
 448	n32	process_mrelease		sys_process_mrelease
 449	n32	futex_waitv			sys_futex_waitv
 450	n32	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	n32	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 3f1886ad9d80..fe88db51efa0 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -365,3 +365,4 @@
 448	n64	process_mrelease		sys_process_mrelease
 449	n64	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	n64	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 8f243e35a7b2..674cb940bd15 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -438,3 +438,4 @@
 448	o32	process_mrelease		sys_process_mrelease
 449	o32	futex_waitv			sys_futex_waitv
 450	o32	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	o32	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 0e42fceb2d5e..5914aa460255 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -448,3 +448,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index a0be127475b1..347894da4eb6 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -537,3 +537,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common	futex_waitv			sys_futex_waitv
 450	nospu	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 799147658dee..1cd523748bd2 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -453,3 +453,4 @@
 448	common	process_mrelease	sys_process_mrelease		sys_process_mrelease
 449	common	futex_waitv		sys_futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node	sys_set_mempolicy_home_node	sys_set_mempolicy_home_node
+451	common	pmadv_ksm		sys_pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 2de85c977f54..cfc75fa43eae 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -453,3 +453,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 4398cc6fb68d..d2c0a6426f6b 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -496,3 +496,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 320480a8db4f..331aaf1a782f 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -455,3 +455,4 @@
 448	i386	process_mrelease	sys_process_mrelease
 449	i386	futex_waitv		sys_futex_waitv
 450	i386	set_mempolicy_home_node	sys_set_mempolicy_home_node
+451	i386	pmadv_ksm		sys_pmadv_ksm
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index c84d12608cd2..14902db4c01f 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -372,6 +372,7 @@
 448	common	process_mrelease	sys_process_mrelease
 449	common	futex_waitv		sys_futex_waitv
 450	common	set_mempolicy_home_node	sys_set_mempolicy_home_node
+451	common	pmadv_ksm		sys_pmadv_ksm

 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 52c94ab5c205..1518e261d882 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -421,3 +421,4 @@
 448	common	process_mrelease		sys_process_mrelease
 449	common	futex_waitv			sys_futex_waitv
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
+451	common	pmadv_ksm			sys_pmadv_ksm
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 7e232ba59b86..57ed92987717 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -16,6 +16,10 @@
 #include <linux/sched/coredump.h>

 #ifdef CONFIG_KSM
+int ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma,
+		const vm_flags_t *vm_flags);
+int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, const vm_flags_t *vm_flags);
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags);
 int __ksm_enter(struct mm_struct *mm);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 33a0ee3bcb2e..62f14e800839 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -919,6 +919,7 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
 asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec,
 			size_t vlen, int behavior, unsigned int flags);
 asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags);
+asmlinkage long sys_pmadv_ksm(int pidfd, int behavior, unsigned int flags);
 asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 			unsigned long prot, unsigned long pgoff,
 			unsigned long flags);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 45fa180cc56a..40f7e6d04af0 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -886,8 +886,11 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
 #define __NR_set_mempolicy_home_node 450
 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)

+#define __NR_pmadv_ksm 451
+__SYSCALL(__NR_pmadv_ksm, sys_pmadv_ksm)
+
 #undef __NR_syscalls
-#define __NR_syscalls 451
+#define __NR_syscalls 452

 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 860b2dcf3ac4..810e1fcaff94 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -292,6 +292,7 @@ COND_SYSCALL(mincore);
 COND_SYSCALL(madvise);
 COND_SYSCALL(process_madvise);
 COND_SYSCALL(process_mrelease);
+COND_SYSCALL(pmadv_ksm);
 COND_SYSCALL(remap_file_pages);
 COND_SYSCALL(mbind);
 COND_SYSCALL(get_mempolicy);
diff --git a/mm/ksm.c b/mm/ksm.c
index 2b8d30068cbb..ab9a157873f4 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2471,52 +2471,76 @@ static int ksm_scan_thread(void *nothing)
 	return 0;
 }

-int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
-		unsigned long end, int advice, unsigned long *vm_flags)
+int ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma,
+		const vm_flags_t *vm_flags)
 {
-	struct mm_struct *mm = vma->vm_mm;
 	int err;

-	switch (advice) {
-	case MADV_MERGEABLE:
-		/*
-		 * Be somewhat over-protective for now!
-		 */
-		if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
-				 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
-				 VM_HUGETLB | VM_MIXEDMAP))
-			return 0;		/* just ignore the advice */
+	/*
+	 * Be somewhat over-protective for now!
+	 */
+	if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
+			 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
+			 VM_HUGETLB | VM_MIXEDMAP))
+		return 0;		/* just ignore the advice */

-		if (vma_is_dax(vma))
-			return 0;
+	if (vma_is_dax(vma))
+		return 0;

 #ifdef VM_SAO
 		if (*vm_flags & VM_SAO)
 			return 0;
 #endif
 #ifdef VM_SPARC_ADI
-		if (*vm_flags & VM_SPARC_ADI)
-			return 0;
+	if (*vm_flags & VM_SPARC_ADI)
+		return 0;
 #endif

-		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
-			err = __ksm_enter(mm);
-			if (err)
-				return err;
-		}
+	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+		err = __ksm_enter(mm);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, const vm_flags_t *vm_flags)
+{
+	int err;
+
+	if (!(*vm_flags & VM_MERGEABLE))
+		return 0;		/* just ignore the advice */
+
+	if (vma->anon_vma) {
+		err = unmerge_ksm_pages(vma, start, end);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, int advice, unsigned long *vm_flags)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	int err;
+
+	switch (advice) {
+	case MADV_MERGEABLE:
+		err = ksm_madvise_merge(mm, vma, vm_flags);
+		if (err)
+			return err;

 		*vm_flags |= VM_MERGEABLE;
 		break;

 	case MADV_UNMERGEABLE:
-		if (!(*vm_flags & VM_MERGEABLE))
-			return 0;		/* just ignore the advice */
-
-		if (vma->anon_vma) {
-			err = unmerge_ksm_pages(vma, start, end);
-			if (err)
-				return err;
-		}
+		err = ksm_madvise_unmerge(vma, start, end, vm_flags);
+		if (err)
+			return err;

 		*vm_flags &= ~VM_MERGEABLE;
 		break;
diff --git a/mm/madvise.c b/mm/madvise.c
index 340125d08c03..36e756355f04 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1522,3 +1522,120 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 out:
 	return ret;
 }
+
+SYSCALL_DEFINE3(pmadv_ksm, int, pidfd, int, behaviour, unsigned int, flags)
+{
+#ifdef CONFIG_KSM
+	ssize_t ret;
+	struct pid *pid;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	unsigned int f_flags;
+	struct vm_area_struct *vma;
+	struct vma_iterator vmi;
+
+	if (flags != 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	switch (behaviour) {
+	case MADV_MERGEABLE:
+	case MADV_UNMERGEABLE:
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+		break;
+	}
+
+	pid = pidfd_get_pid(pidfd, &f_flags);
+	if (IS_ERR(pid)) {
+		ret = PTR_ERR(pid);
+		goto out;
+	}
+
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task) {
+		ret = -ESRCH;
+		goto put_pid;
+	}
+
+	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
+	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
+	if (IS_ERR_OR_NULL(mm)) {
+		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+		goto release_task;
+	}
+
+	/* Require CAP_SYS_NICE for influencing process performance. */
+	if (!capable(CAP_SYS_NICE)) {
+		ret = -EPERM;
+		goto release_mm;
+	}
+
+	if (mmap_write_lock_killable(mm)) {
+		ret = -EINTR;
+		goto release_mm;
+	}
+
+	vma_iter_init(&vmi, mm, 0);
+	for_each_vma(vmi, vma) {
+		switch (behaviour) {
+		case MADV_MERGEABLE:
+			ret = ksm_madvise_merge(vma->vm_mm, vma, &vma->vm_flags);
+			if (!ret)
+				vm_flags_set(vma, VM_MERGEABLE);
+			break;
+		case MADV_UNMERGEABLE:
+			ret = ksm_madvise_unmerge(vma, vma->vm_start, vma->vm_end, &vma->vm_flags);
+			if (!ret)
+				vm_flags_clear(vma, VM_MERGEABLE);
+			break;
+		default:
+			/* look, ma, no brain */
+			break;
+		}
+		if (ret)
+			break;
+	}
+
+	mmap_write_unlock(mm);
+
+release_mm:
+	mmput(mm);
+release_task:
+	put_task_struct(task);
+put_pid:
+	put_pid(pid);
+out:
+	return ret;
+#else /* CONFIG_KSM */
+	return -ENOSYS;
+#endif /* CONFIG_KSM */
+}
+
+#ifdef CONFIG_KSM
+static ssize_t ksm_show(struct kobject *kobj, struct kobj_attribute *attr,
+			char *buf)
+{
+	return sprintf(buf, "%u\n", __NR_pmadv_ksm);
+}
+static struct kobj_attribute pmadv_ksm_attr = __ATTR_RO(ksm);
+
+static struct attribute *pmadv_sysfs_attrs[] = {
+	&pmadv_ksm_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group pmadv_sysfs_attr_group = {
+	.attrs = pmadv_sysfs_attrs,
+	.name = "pmadv",
+};
+
+static int __init pmadv_sysfs_init(void)
+{
+	return sysfs_create_group(kernel_kobj, &pmadv_sysfs_attr_group);
+}
+subsys_initcall(pmadv_sysfs_init);
+#endif /* CONFIG_KSM */
--
2.40.1

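For reference, the new syscall operates on a target process through a pidfd rather than on the caller's own mm. A minimal user-space sketch follows, assuming the x86-64 syscall number 451 from the table above and a libc that defines SYS_pidfd_open; the kernel side additionally requires CAP_SYS_NICE and PTRACE_MODE_READ access to the target:

/* Editorial sketch, not part of the patch: mark all of a process's
 * eligible VMAs as KSM-mergeable via pmadv_ksm(). */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_pmadv_ksm
#define __NR_pmadv_ksm 451	/* x86-64 number added by this patch */
#endif

int main(int argc, char **argv)
{
	/* Target PID from argv[1], or ourselves by default. */
	pid_t target = argc > 1 ? (pid_t)atoi(argv[1]) : getpid();
	int pidfd = syscall(SYS_pidfd_open, target, 0);

	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}
	/* flags must be 0; MADV_UNMERGEABLE undoes the advice. */
	if (syscall(__NR_pmadv_ksm, pidfd, MADV_MERGEABLE, 0) < 0) {
		perror("pmadv_ksm");
		close(pidfd);
		return 1;
	}
	close(pidfd);
	return 0;
}

Unlike plain madvise(MADV_MERGEABLE), which the target would have to call on each of its own ranges, this walks every VMA of the target under its mmap write lock, skipping the VMA types that ksm_madvise_merge() refuses.
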
From 26780b606ac659096b0e1a9a2bba12aa747cbf66 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Fri, 28 Apr 2023 20:00:54 +0200
Subject: [PATCH 08/10] Per-VMA locks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous versions:
v3: https://lore.kernel.org/all/20230216051750.3125598-1-surenb@google.com/
v2: https://lore.kernel.org/lkml/20230127194110.533103-1-surenb@google.com/
v1: https://lore.kernel.org/all/20230109205336.3665937-1-surenb@google.com/
RFC: https://lore.kernel.org/all/20220901173516.702122-1-surenb@google.com/

LWN article describing the feature:
https://lwn.net/Articles/906852/

The per-VMA locks idea was discussed during the SPF [1] discussion at LSF/MM
last year [2], which concluded with the suggestion that “a reader/writer
semaphore could be put into the VMA itself; that would have the effect of
using the VMA as a sort of range lock. There would still be contention at
the VMA level, but it would be an improvement.” This patchset implements
this suggested approach.

When handling page faults we look up the VMA that contains the faulting
page under RCU protection and try to acquire its lock. If that fails we
fall back to using mmap_lock, similar to how SPF handled this situation.

One notable way the implementation deviates from the proposal is the way
VMAs are read-locked. During some mm updates, multiple VMAs need to be
locked until the end of the update (e.g. vma_merge, split_vma, etc).
Tracking all the locked VMAs, avoiding recursive locks, and figuring out when
it's safe to unlock previously locked VMAs would make the code more
complex. So, instead of the usual lock/unlock pattern, the proposed
solution marks a VMA as locked and provides an efficient way to:
1. Identify locked VMAs.
2. Unlock all locked VMAs in bulk.
We also postpone unlocking the locked VMAs until the end of the update,
when we do mmap_write_unlock. Potentially this keeps a VMA locked for
longer than is absolutely necessary but it results in a big reduction of
code complexity.
Read-locking a VMA is done using two sequence numbers - one in the
vm_area_struct and one in the mm_struct. A VMA is considered read-locked
when these sequence numbers are equal. To read-lock a VMA we set the
sequence number in vm_area_struct to be equal to the sequence number in
mm_struct. To unlock all VMAs we increment mm_struct's seq number. This
allows for an efficient way to track locked VMAs and to drop the locks on
all VMAs at the end of the update.
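As a rough standalone model of the scheme just described (an illustrative sketch only; the field and helper names below are invented, and the real patches pair these sequence numbers with a per-VMA lock and the necessary memory barriers):

#include <stdbool.h>

/* Illustrative model only - not the patchset's actual API. */
struct mm_model {
	int mm_lock_seq;		/* bumped at mmap_write_unlock() */
};

struct vma_model {
	int vm_lock_seq;		/* copy of mm_lock_seq while marked */
	struct mm_model *mm;
};

/* Mark one VMA locked during an mm update: copy the mm sequence number. */
static void vma_mark_locked(struct vma_model *vma)
{
	vma->vm_lock_seq = vma->mm->mm_lock_seq;
}

/* A VMA counts as locked while the two sequence numbers match. */
static bool vma_is_locked(const struct vma_model *vma)
{
	return vma->vm_lock_seq == vma->mm->mm_lock_seq;
}

/* Drop the locks on all marked VMAs in bulk: one increment releases
 * every VMA marked during the update. */
static void mm_unlock_all_vmas(struct mm_model *mm)
{
	mm->mm_lock_seq++;
}

One increment of the mm-side number releases every VMA marked during the update, which is what makes the bulk unlock at mmap_write_unlock cheap.
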
The patchset implements per-VMA locking only for anonymous pages which
|
|
are not in swap and avoids userfaultfs as their implementation is more
|
|
complex. Additional support for file-back page faults, swapped and user
|
|
pages can be added incrementally.
|
|
|
|
Performance benchmarks show similar although slightly smaller benefits as
|
|
with SPF patchset (~75% of SPF benefits). Still, with lower complexity
|
|
this approach might be more desirable.
|
|
|
|
Since RFC was posted in September 2022, two separate Google teams outside
|
|
of Android evaluated the patchset and confirmed positive results. Here are
|
|
the known usecases when per-VMA locks show benefits:
|
|
|
|
Android:
|
|
Apps with high number of threads (~100) launch times improve by up to 20%.
|
|
Each thread mmaps several areas upon startup (Stack and Thread-local
|
|
storage (TLS), thread signal stack, indirect ref table), which requires
|
|
taking mmap_lock in write mode. Page faults take mmap_lock in read mode.
|
|
During app launch, both thread creation and page faults establishing the
|
|
active workinget are happening in parallel and that causes lock contention
|
|
between mm writers and readers even if updates and page faults are
|
|
happening in different VMAs. Per-vma locks prevent this contention by
|
|
providing more granular lock.
|
|
|
|
Google Fibers:
|
|
We have several dynamically sized thread pools that spawn new threads
|
|
under increased load and reduce their number when idling. For example,
|
|
Google's in-process scheduling/threading framework, UMCG/Fibers, is backed
|
|
by such a thread pool. When idling, only a small number of idle worker
|
|
threads are available; when a spike of incoming requests arrive, each
|
|
request is handled in its own "fiber", which is a work item posted onto a
|
|
UMCG worker thread; quite often these spikes lead to a number of new
|
|
threads spawning. Each new thread needs to allocate and register an RSEQ
|
|
section on its TLS, then register itself with the kernel as a UMCG worker
|
|
thread, and only after that it can be considered by the in-process
|
|
UMCG/Fiber scheduler as available to do useful work. In short, during an
|
|
incoming workload spike new threads have to be spawned, and they perform
|
|
several syscalls (RSEQ registration, UMCG worker registration, memory
|
|
allocations) before they can actually start doing useful work. Removing
|
|
any bottlenecks on this thread startup path will greatly improve our
|
|
services' latencies when faced with request/workload spikes.
|
|
At high scale, mmap_lock contention during thread creation and stack page
|
|
faults leads to user-visible multi-second serving latencies in a similar
|
|
pattern to Android app startup. Per-VMA locking patchset has been run
|
|
successfully in limited experiments with user-facing production workloads.
|
|
In these experiments, we observed that the peak thread creation rate was
|
|
high enough that thread creation is no longer a bottleneck.
|
|
|
|
TCP zerocopy receive:
|
|
From the point of view of TCP zerocopy receive, the per-vma lock patch is
|
|
massively beneficial.
|
|
In today's implementation, a process with N threads where N - 1 are
|
|
performing zerocopy receive and 1 thread is performing madvise() with the
|
|
write lock taken (e.g. needs to change vm_flags) will result in all N -1
|
|
receive threads blocking until the madvise is done. Conversely, on a busy
|
|
process receiving a lot of data, an madvise operation that does need to
|
|
take the mmap lock in write mode will need to wait for all of the receives
|
|
to be done - a lose:lose proposition. Per-VMA locking _removes_ by
|
|
definition this source of contention entirely.
|
|
There are other benefits for receive as well, chiefly a reduction in
|
|
cacheline bouncing across receiving threads for locking/unlocking the
|
|
single mmap lock. On an RPC style synthetic workload with 4KB RPCs:
|
|
1a) The find+lock+unlock VMA path in the base case, without the per-vma
|
|
lock patchset, is about 0.7% of cycles as measured by perf.
|
|
1b) mmap_read_lock + mmap_read_unlock in the base case is about 0.5%
|
|
cycles overall - most of this is within the TCP read hotpath (a small
|
|
fraction is 'other' usage in the system).
|
|
2a) The find+lock+unlock VMA path, with the per-vma patchset and a trivial
|
|
patch written to take advantage of it in TCP, is about 0.4% of cycles
|
|
(down from 0.7% above)
|
|
2b) mmap_read_lock + mmap_read_unlock in the per-vma patchset is < 0.1%
|
|
cycles and is out of the TCP read hotpath entirely (down from 0.5% before,
|
|
the remaining usage is the 'other' usage in the system).
|
|
So, in addition to entirely removing an onerous source of contention, it
|
|
also reduces the CPU cycles of TCP receive zerocopy by about 0.5%+
|
|
(compared to overall cycles in perf) for the 'small' RPC scenario.
|
|
|
|
The patchset structure is:
|
|
0001-0008: Enable maple-tree RCU mode
|
|
0009-0031: Main per-vma locks patchset
|
|
0032-0033: Performance optimizations
|
|
|
|
Changes since v3:
|
|
- Changed patch [3] to move vma_prepare before vma_adjust_trans_huge
|
|
- Dropped patch [4] from the set as unnecessary, per Hyeonggon Yoo
|
|
- Changed patch [5] to do VMA locking inside vma_prepare, per Liam Howlett
|
|
- Dropped patch [6] from the set as unnecessary, per Liam Howlett
|
|
|
|
[1] https://lore.kernel.org/all/20220128131006.67712-1-michel@lespinasse.org/
|
|
[2] https://lwn.net/Articles/893906/
|
|
[3] https://lore.kernel.org/all/20230216051750.3125598-15-surenb@google.com/
|
|
[4] https://lore.kernel.org/all/20230216051750.3125598-17-surenb@google.com/
|
|
[5] https://lore.kernel.org/all/20230216051750.3125598-18-surenb@google.com/
|
|
[6] https://lore.kernel.org/all/20230216051750.3125598-22-surenb@google.com/
|
|
|
|
The patchset applies cleanly over mm-unstable branch.
|
|
|
|
Laurent Dufour (1):
|
|
powerc/mm: try VMA lock-based page fault handling first
|
|
|
|
Liam Howlett (4):
|
|
maple_tree: Be more cautious about dead nodes
|
|
maple_tree: Detect dead nodes in mas_start()
|
|
maple_tree: Fix freeing of nodes in rcu mode
|
|
maple_tree: remove extra smp_wmb() from mas_dead_leaves()
|
|
|
|
Liam R. Howlett (4):
|
|
maple_tree: Fix write memory barrier of nodes once dead for RCU mode
|
|
maple_tree: Add smp_rmb() to dead node detection
|
|
maple_tree: Add RCU lock checking to rcu callback functions
|
|
mm: Enable maple tree RCU mode by default.
|
|
|
|
Michel Lespinasse (1):
|
|
mm: rcu safe VMA freeing
|
|
|
|
Suren Baghdasaryan (23):
|
|
mm: introduce CONFIG_PER_VMA_LOCK
|
|
mm: move mmap_lock assert function definitions
|
|
mm: add per-VMA lock and helper functions to control it
|
|
mm: mark VMA as being written when changing vm_flags
|
|
mm/mmap: move vma_prepare before vma_adjust_trans_huge
|
|
mm/khugepaged: write-lock VMA while collapsing a huge page
|
|
mm/mmap: write-lock VMAs in vma_prepare before modifying them
|
|
mm/mremap: write-lock VMA while remapping it to a new address range
|
|
mm: write-lock VMAs before removing them from VMA tree
|
|
mm: conditionally write-lock VMA in free_pgtables
|
|
kernel/fork: assert no VMA readers during its destruction
|
|
mm/mmap: prevent pagefault handler from racing with mmu_notifier
|
|
registration
|
|
mm: introduce vma detached flag
|
|
mm: introduce lock_vma_under_rcu to be used from arch-specific code
|
|
mm: fall back to mmap_lock if vma->anon_vma is not yet set
|
|
mm: add FAULT_FLAG_VMA_LOCK flag
|
|
mm: prevent do_swap_page from handling page faults under VMA lock
|
|
mm: prevent userfaults to be handled under per-vma lock
|
|
mm: introduce per-VMA lock statistics
|
|
x86/mm: try VMA lock-based page fault handling first
|
|
arm64/mm: try VMA lock-based page fault handling first
|
|
mm/mmap: free vm_area_struct without call_rcu in exit_mmap
|
|
mm: separate vma->lock from vm_area_struct
|
|
|
|
Signed-off-by: Peter Jung <admin@ptr1337.dev>
|
|
---
|
|
arch/arm64/Kconfig | 1 +
|
|
arch/arm64/mm/fault.c | 36 +++++++
|
|
arch/powerpc/mm/fault.c | 37 +++++++
|
|
arch/powerpc/platforms/powernv/Kconfig | 1 +
|
|
arch/powerpc/platforms/pseries/Kconfig | 1 +
|
|
arch/s390/Kconfig | 1 +
|
|
arch/s390/mm/fault.c | 24 +++++
|
|
arch/x86/Kconfig | 1 +
|
|
arch/x86/mm/fault.c | 36 +++++++
|
|
include/linux/mm.h | 127 +++++++++++++++++++++++--
|
|
include/linux/mm_types.h | 30 +++++-
|
|
include/linux/mmap_lock.h | 37 ++++---
|
|
include/linux/vm_event_item.h | 6 ++
|
|
include/linux/vmstat.h | 6 ++
|
|
kernel/fork.c | 96 ++++++++++++++++---
|
|
mm/Kconfig | 12 +++
|
|
mm/Kconfig.debug | 6 ++
|
|
mm/init-mm.c | 3 +
|
|
mm/internal.h | 2 +-
|
|
mm/khugepaged.c | 8 ++
|
|
mm/memory.c | 72 +++++++++++++-
|
|
mm/mmap.c | 48 +++++++---
|
|
mm/mremap.c | 1 +
|
|
mm/rmap.c | 31 +++---
|
|
mm/vmstat.c | 6 ++
|
|
25 files changed, 567 insertions(+), 62 deletions(-)
|
|
|
|
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1023e896d46b..6f104c829731 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -95,6 +95,7 @@ config ARM64
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
 	select ARCH_SUPPORTS_NUMA_BALANCING
 	select ARCH_SUPPORTS_PAGE_TABLE_CHECK
+	select ARCH_SUPPORTS_PER_VMA_LOCK
 	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
 	select ARCH_WANT_DEFAULT_BPF_JIT
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index f4cb0f85ccf4..9e0db5c387e3 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -535,6 +535,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 	unsigned long vm_flags;
 	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
 	unsigned long addr = untagged_addr(far);
+#ifdef CONFIG_PER_VMA_LOCK
+	struct vm_area_struct *vma;
+#endif

 	if (kprobe_page_fault(regs, esr))
 		return 0;
@@ -585,6 +588,36 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,

 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);

+#ifdef CONFIG_PER_VMA_LOCK
+	if (!(mm_flags & FAULT_FLAG_USER))
+		goto lock_mmap;
+
+	vma = lock_vma_under_rcu(mm, addr);
+	if (!vma)
+		goto lock_mmap;
+
+	if (!(vma->vm_flags & vm_flags)) {
+		vma_end_read(vma);
+		goto lock_mmap;
+	}
+	fault = handle_mm_fault(vma, addr & PAGE_MASK,
+				mm_flags | FAULT_FLAG_VMA_LOCK, regs);
+	vma_end_read(vma);
+
+	if (!(fault & VM_FAULT_RETRY)) {
+		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+		goto done;
+	}
+	count_vm_vma_lock_event(VMA_LOCK_RETRY);
+
+	/* Quick path to respond to signals */
+	if (fault_signal_pending(fault, regs)) {
+		if (!user_mode(regs))
+			goto no_context;
+		return 0;
+	}
+lock_mmap:
+#endif /* CONFIG_PER_VMA_LOCK */
 	/*
 	 * As per x86, we may deadlock here. However, since the kernel only
 	 * validly references user space from well defined areas of the code,
@@ -628,6 +661,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 	}
 	mmap_read_unlock(mm);

+#ifdef CONFIG_PER_VMA_LOCK
+done:
+#endif
 	/*
 	 * Handle the "normal" (no error) case first.
 	 */
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index af46aa88422b..531177a4ee08 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -474,6 +474,40 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
 	if (is_exec)
 		flags |= FAULT_FLAG_INSTRUCTION;

+#ifdef CONFIG_PER_VMA_LOCK
+	if (!(flags & FAULT_FLAG_USER))
+		goto lock_mmap;
+
+	vma = lock_vma_under_rcu(mm, address);
+	if (!vma)
+		goto lock_mmap;
+
+	if (unlikely(access_pkey_error(is_write, is_exec,
+				       (error_code & DSISR_KEYFAULT), vma))) {
+		vma_end_read(vma);
+		goto lock_mmap;
+	}
+
+	if (unlikely(access_error(is_write, is_exec, vma))) {
+		vma_end_read(vma);
+		goto lock_mmap;
+	}
+
+	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
+	vma_end_read(vma);
+
+	if (!(fault & VM_FAULT_RETRY)) {
+		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+		goto done;
+	}
+	count_vm_vma_lock_event(VMA_LOCK_RETRY);
+
+	if (fault_signal_pending(fault, regs))
+		return user_mode(regs) ? 0 : SIGBUS;
+
+lock_mmap:
+#endif /* CONFIG_PER_VMA_LOCK */
+
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -550,6 +584,9 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,

 	mmap_read_unlock(current->mm);

+#ifdef CONFIG_PER_VMA_LOCK
+done:
+#endif
 	if (unlikely(fault & VM_FAULT_ERROR))
 		return mm_fault_error(regs, address, fault);

diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index ae248a161b43..70a46acc70d6 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -16,6 +16,7 @@ config PPC_POWERNV
 	select PPC_DOORBELL
 	select MMU_NOTIFIER
 	select FORCE_SMP
+	select ARCH_SUPPORTS_PER_VMA_LOCK
 	default y

 config OPAL_PRD
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 21b22bf16ce6..4ebf2ef2845d 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -22,6 +22,7 @@ config PPC_PSERIES
 	select HOTPLUG_CPU
 	select FORCE_SMP
 	select SWIOTLB
+	select ARCH_SUPPORTS_PER_VMA_LOCK
 	default y

 config PARAVIRT
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 9809c74e1240..548b5b587003 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -120,6 +120,7 @@ config S390
 	select ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	select ARCH_SUPPORTS_HUGETLBFS
 	select ARCH_SUPPORTS_NUMA_BALANCING
+	select ARCH_SUPPORTS_PER_VMA_LOCK
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index a2632fd97d00..b65144c392b0 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -407,6 +407,30 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
 		access = VM_WRITE;
 	if (access == VM_WRITE)
 		flags |= FAULT_FLAG_WRITE;
+#ifdef CONFIG_PER_VMA_LOCK
+	if (!(flags & FAULT_FLAG_USER))
+		goto lock_mmap;
+	vma = lock_vma_under_rcu(mm, address);
+	if (!vma)
+		goto lock_mmap;
+	if (!(vma->vm_flags & access)) {
+		vma_end_read(vma);
+		goto lock_mmap;
+	}
+	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
+	vma_end_read(vma);
+	if (!(fault & VM_FAULT_RETRY)) {
+		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+		goto out;
+	}
+	count_vm_vma_lock_event(VMA_LOCK_RETRY);
+	/* Quick path to respond to signals */
+	if (fault_signal_pending(fault, regs)) {
+		fault = VM_FAULT_SIGNAL;
+		goto out;
+	}
+lock_mmap:
+#endif /* CONFIG_PER_VMA_LOCK */
 	mmap_read_lock(mm);

 	gmap = NULL;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a825bf031f49..df21fba77db1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,6 +27,7 @@ config X86_64
 	# Options that are inherently 64-bit kernel only:
 	select ARCH_HAS_GIGANTIC_PAGE
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
+	select ARCH_SUPPORTS_PER_VMA_LOCK
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select HAVE_ARCH_SOFT_DIRTY
 	select MODULES_USE_ELF_RELA
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a498ae1fbe66..e4399983c50c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -19,6 +19,7 @@
 #include <linux/uaccess.h>		/* faulthandler_disabled()	*/
 #include <linux/efi.h>			/* efi_crash_gracefully_on_page_fault()*/
 #include <linux/mm_types.h>
+#include <linux/mm.h>			/* find_and_lock_vma() */

 #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
@@ -1333,6 +1334,38 @@ void do_user_addr_fault(struct pt_regs *regs,
 	}
 #endif

+#ifdef CONFIG_PER_VMA_LOCK
+	if (!(flags & FAULT_FLAG_USER))
+		goto lock_mmap;
+
+	vma = lock_vma_under_rcu(mm, address);
+	if (!vma)
+		goto lock_mmap;
+
+	if (unlikely(access_error(error_code, vma))) {
+		vma_end_read(vma);
+		goto lock_mmap;
+	}
+	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
+	vma_end_read(vma);
+
+	if (!(fault & VM_FAULT_RETRY)) {
+		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+		goto done;
+	}
+	count_vm_vma_lock_event(VMA_LOCK_RETRY);
+
+	/* Quick path to respond to signals */
+	if (fault_signal_pending(fault, regs)) {
+		if (!user_mode(regs))
+			kernelmode_fixup_or_oops(regs, error_code, address,
+						 SIGBUS, BUS_ADRERR,
+						 ARCH_DEFAULT_PKEY);
+		return;
+	}
+lock_mmap:
+#endif /* CONFIG_PER_VMA_LOCK */
+
 	/*
 	 * Kernel-mode access to the user address space should only occur
 	 * on well-defined single instructions listed in the exception
@@ -1433,6 +1466,9 @@ void do_user_addr_fault(struct pt_regs *regs,
 	}

 	mmap_read_unlock(mm);
+#ifdef CONFIG_PER_VMA_LOCK
+done:
+#endif
 	if (likely(!(fault & VM_FAULT_ERROR)))
 		return;

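Each of the four architectures above wires up the same fast path around lock_vma_under_rcu(). Distilled into one place, the shape of that pattern looks roughly as follows; my_fault_handler(), my_access_ok() and handle_with_mmap_lock() are hypothetical stand-ins for the arch-specific pieces, so treat this as a sketch rather than kernel code:

/*
 * Hedged sketch of the per-VMA-lock fast path repeated per-arch above.
 * Only lock_vma_under_rcu(), vma_end_read(), handle_mm_fault() and
 * count_vm_vma_lock_event() are real calls from this patch; the rest
 * are illustrative placeholders.
 */
static vm_fault_t my_fault_handler(struct mm_struct *mm, struct pt_regs *regs,
				   unsigned long addr, unsigned int flags)
{
	struct vm_area_struct *vma;
	vm_fault_t fault;

	vma = lock_vma_under_rcu(mm, addr);	/* RCU walk + read-trylock */
	if (!vma)
		goto slow_path;			/* fall back to mmap_lock */

	if (!my_access_ok(vma, flags)) {	/* arch permission check */
		vma_end_read(vma);
		goto slow_path;
	}

	fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs);
	vma_end_read(vma);

	if (!(fault & VM_FAULT_RETRY)) {	/* handled without mmap_lock */
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		return fault;
	}
	count_vm_vma_lock_event(VMA_LOCK_RETRY);
slow_path:
	return handle_with_mmap_lock(mm, regs, addr, flags); /* hypothetical */
}

The slow path preserves the old mmap_lock behavior, so a spurious failure of the VMA lock only costs performance, never correctness.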
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1f79667824eb..c4c9de7d1916 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -256,6 +256,8 @@ void setup_initial_init_mm(void *start_code, void *end_code,
 struct vm_area_struct *vm_area_alloc(struct mm_struct *);
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
 void vm_area_free(struct vm_area_struct *);
+/* Use only if VMA has no other users */
+void __vm_area_free(struct vm_area_struct *vma);

 #ifndef CONFIG_MMU
 extern struct rb_root nommu_region_tree;
@@ -478,7 +480,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
 	{ FAULT_FLAG_USER,		"USER" }, \
 	{ FAULT_FLAG_REMOTE,		"REMOTE" }, \
 	{ FAULT_FLAG_INSTRUCTION,	"INSTRUCTION" }, \
-	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }
+	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }, \
+	{ FAULT_FLAG_VMA_LOCK,		"VMA_LOCK" }

 /*
  * vm_fault is filled by the pagefault handler and passed to the vma's
@@ -623,6 +626,117 @@ struct vm_operations_struct {
 					  unsigned long addr);
 };

+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield false
+ * locked result to avoid performance overhead, in which case we fall back to
+ * using mmap_lock. The function should never yield false unlocked result.
+ */
+static inline bool vma_start_read(struct vm_area_struct *vma)
+{
+	/* Check before locking. A race might cause false locked result. */
+	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+		return false;
+
+	if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
+		return false;
+
+	/*
+	 * Overflow might produce false locked result.
+	 * False unlocked result is impossible because we modify and check
+	 * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
+	 * modification invalidates all existing locks.
+	 */
+	if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+		up_read(&vma->vm_lock->lock);
+		return false;
+	}
+	return true;
+}
+
+static inline void vma_end_read(struct vm_area_struct *vma)
+{
+	rcu_read_lock(); /* keeps vma alive till the end of up_read */
+	up_read(&vma->vm_lock->lock);
+	rcu_read_unlock();
+}
+
+static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
+{
+	mmap_assert_write_locked(vma->vm_mm);
+
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	*mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+	return (vma->vm_lock_seq == *mm_lock_seq);
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	if (__is_vma_write_locked(vma, &mm_lock_seq))
+		return;
+
+	down_write(&vma->vm_lock->lock);
+	vma->vm_lock_seq = mm_lock_seq;
+	up_write(&vma->vm_lock->lock);
+}
+
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	if (__is_vma_write_locked(vma, &mm_lock_seq))
+		return true;
+
+	if (!down_write_trylock(&vma->vm_lock->lock))
+		return false;
+
+	vma->vm_lock_seq = mm_lock_seq;
+	up_write(&vma->vm_lock->lock);
+	return true;
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
+}
+
+static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
+{
+	/* When detaching vma should be write-locked */
+	if (detached)
+		vma_assert_write_locked(vma);
+	vma->detached = detached;
+}
+
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+					  unsigned long address);
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline bool vma_start_read(struct vm_area_struct *vma)
+		{ return false; }
+static inline void vma_end_read(struct vm_area_struct *vma) {}
+static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+		{ return true; }
+static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+static inline void vma_mark_detached(struct vm_area_struct *vma,
+				     bool detached) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
+/*
+ * WARNING: vma_init does not initialize vma->vm_lock.
+ * Use vm_area_alloc()/vm_area_free() if vma needs locking.
+ */
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
@@ -631,6 +745,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	vma_mark_detached(vma, false);
 }

 /* Use when VMA is not part of the VMA tree and needs no locking */
@@ -644,28 +759,28 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
 static inline void vm_flags_reset(struct vm_area_struct *vma,
 				  vm_flags_t flags)
 {
-	mmap_assert_write_locked(vma->vm_mm);
+	vma_start_write(vma);
 	vm_flags_init(vma, flags);
 }

 static inline void vm_flags_reset_once(struct vm_area_struct *vma,
 				       vm_flags_t flags)
 {
-	mmap_assert_write_locked(vma->vm_mm);
+	vma_start_write(vma);
 	WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
 }

 static inline void vm_flags_set(struct vm_area_struct *vma,
 				vm_flags_t flags)
 {
-	mmap_assert_write_locked(vma->vm_mm);
+	vma_start_write(vma);
 	ACCESS_PRIVATE(vma, __vm_flags) |= flags;
 }

 static inline void vm_flags_clear(struct vm_area_struct *vma,
 				  vm_flags_t flags)
 {
-	mmap_assert_write_locked(vma->vm_mm);
+	vma_start_write(vma);
 	ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
 }

@@ -686,7 +801,7 @@ static inline void __vm_flags_mod(struct vm_area_struct *vma,
 static inline void vm_flags_mod(struct vm_area_struct *vma,
 				vm_flags_t set, vm_flags_t clear)
 {
-	mmap_assert_write_locked(vma->vm_mm);
+	vma_start_write(vma);
 	__vm_flags_mod(vma, set, clear);
 }

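The comments above describe a deliberately one-sided guarantee: vma_start_read() may spuriously fail (the caller then falls back to mmap_lock), but it must never succeed while a writer holds the VMA. A minimal user-space model of that handshake, assuming simplified types and a pthreads rwlock in place of the kernel rw_semaphore (this is an illustration, not the kernel code):

#include <pthread.h>
#include <stdbool.h>

struct model_vma {
	int vm_lock_seq;		/* last write-lock epoch recorded */
	pthread_rwlock_t lock;		/* stands in for vma->vm_lock->lock */
};

/* Stands in for mm->mm_lock_seq; bumped when mmap_write_lock is dropped. */
static int model_mm_lock_seq;

static bool model_vma_start_read(struct model_vma *vma)
{
	/* Cheap pre-check: equal seqs mean a writer owns this epoch. */
	if (vma->vm_lock_seq == model_mm_lock_seq)
		return false;
	/* Contended lock: spurious failure is allowed, so just bail. */
	if (pthread_rwlock_tryrdlock(&vma->lock) != 0)
		return false;
	/* Re-check under the lock: a writer may have raced in between. */
	if (vma->vm_lock_seq == model_mm_lock_seq) {
		pthread_rwlock_unlock(&vma->lock);
		return false;
	}
	return true;
}

A writer takes the rwlock exclusively, sets vm_lock_seq to the current epoch, and can then drop the rwlock immediately: every later reader fails the seq comparison until mmap_write_unlock() advances the epoch, which is what makes write-locking all VMAs of an mm an O(1) release.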
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a57e6ae78e65..ac4b5df9ba56 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -471,6 +471,10 @@ struct anon_vma_name {
 	char name[];
 };

+struct vma_lock {
+	struct rw_semaphore lock;
+};
+
 /*
  * This struct describes a virtual memory area. There is one of these
  * per VM-area/task. A VM area is any part of the process virtual memory
@@ -480,9 +484,16 @@ struct anon_vma_name {
 struct vm_area_struct {
 	/* The first cache line has the info for VMA tree walking. */

-	unsigned long vm_start;		/* Our start address within vm_mm. */
-	unsigned long vm_end;		/* The first byte after our end address
-					   within vm_mm. */
+	union {
+		struct {
+			/* VMA covers [vm_start; vm_end) addresses within mm */
+			unsigned long vm_start;
+			unsigned long vm_end;
+		};
+#ifdef CONFIG_PER_VMA_LOCK
+		struct rcu_head vm_rcu;	/* Used for deferred freeing. */
+#endif
+	};

 	struct mm_struct *vm_mm;	/* The address space we belong to. */

@@ -501,6 +512,14 @@ struct vm_area_struct {
 		vm_flags_t __private __vm_flags;
 	};

+#ifdef CONFIG_PER_VMA_LOCK
+	int vm_lock_seq;
+	struct vma_lock *vm_lock;
+
+	/* Flag to indicate areas detached from the mm->mm_mt tree */
+	bool detached;
+#endif
+
 	/*
 	 * For areas with an address space and backing store,
 	 * linkage into the address_space->i_mmap interval tree.
@@ -637,6 +656,9 @@ struct mm_struct {
 					  * init_mm.mmlist, and are protected
 					  * by mmlist_lock
 					  */
+#ifdef CONFIG_PER_VMA_LOCK
+		int mm_lock_seq;
+#endif


 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
@@ -1037,6 +1059,7 @@ typedef struct {
  *                      mapped after the fault.
  * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached.
  *                        We should only access orig_pte if this flag set.
+ * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock.
  *
  * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
  * whether we would allow page faults to retry by specifying these two
@@ -1074,6 +1097,7 @@ enum fault_flag {
 	FAULT_FLAG_INTERRUPTIBLE =	1 << 9,
 	FAULT_FLAG_UNSHARE =		1 << 10,
 	FAULT_FLAG_ORIG_PTE_VALID =	1 << 11,
+	FAULT_FLAG_VMA_LOCK =		1 << 12,
 };

 typedef unsigned int __bitwise zap_flags_t;
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 96e113e23d04..aab8f1b28d26 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -60,6 +60,29 @@ static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)

 #endif /* CONFIG_TRACING */

+static inline void mmap_assert_locked(struct mm_struct *mm)
+{
+	lockdep_assert_held(&mm->mmap_lock);
+	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+}
+
+static inline void mmap_assert_write_locked(struct mm_struct *mm)
+{
+	lockdep_assert_held_write(&mm->mmap_lock);
+	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+}
+
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_end_write_all(struct mm_struct *mm)
+{
+	mmap_assert_write_locked(mm);
+	/* No races during update due to exclusive mmap_lock being held */
+	WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+}
+#else
+static inline void vma_end_write_all(struct mm_struct *mm) {}
+#endif
+
 static inline void mmap_init_lock(struct mm_struct *mm)
 {
 	init_rwsem(&mm->mmap_lock);
@@ -102,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
 static inline void mmap_write_unlock(struct mm_struct *mm)
 {
 	__mmap_lock_trace_released(mm, true);
+	vma_end_write_all(mm);
 	up_write(&mm->mmap_lock);
 }

 static inline void mmap_write_downgrade(struct mm_struct *mm)
 {
 	__mmap_lock_trace_acquire_returned(mm, false, true);
+	vma_end_write_all(mm);
 	downgrade_write(&mm->mmap_lock);
 }

@@ -150,18 +175,6 @@ static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
 	up_read_non_owner(&mm->mmap_lock);
 }

-static inline void mmap_assert_locked(struct mm_struct *mm)
-{
-	lockdep_assert_held(&mm->mmap_lock);
-	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
-}
-
-static inline void mmap_assert_write_locked(struct mm_struct *mm)
-{
-	lockdep_assert_held_write(&mm->mmap_lock);
-	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
-}
-
 static inline int mmap_lock_is_contended(struct mm_struct *mm)
 {
 	return rwsem_is_contended(&mm->mmap_lock);
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 7f5d1caf5890..8abfa1240040 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -149,6 +149,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_X86
 		DIRECT_MAP_LEVEL2_SPLIT,
 		DIRECT_MAP_LEVEL3_SPLIT,
+#endif
+#ifdef CONFIG_PER_VMA_LOCK_STATS
+		VMA_LOCK_SUCCESS,
+		VMA_LOCK_ABORT,
+		VMA_LOCK_RETRY,
+		VMA_LOCK_MISS,
 #endif
 		NR_VM_EVENT_ITEMS
 };
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 19cf5b6892ce..fed855bae6d8 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -125,6 +125,12 @@ static inline void vm_events_fold_cpu(int cpu)
 #define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
 #endif

+#ifdef CONFIG_PER_VMA_LOCK_STATS
+#define count_vm_vma_lock_event(x) count_vm_event(x)
+#else
+#define count_vm_vma_lock_event(x) do {} while (0)
+#endif
+
 #define __count_zid_vm_events(item, zid, delta) \
 	__count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)

diff --git a/kernel/fork.c b/kernel/fork.c
index 349945168239..ebd353730887 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -455,13 +455,49 @@ static struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;

+#ifdef CONFIG_PER_VMA_LOCK
+
+/* SLAB cache for vm_area_struct.lock */
+static struct kmem_cache *vma_lock_cachep;
+
+static bool vma_lock_alloc(struct vm_area_struct *vma)
+{
+	vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+	if (!vma->vm_lock)
+		return false;
+
+	init_rwsem(&vma->vm_lock->lock);
+	vma->vm_lock_seq = -1;
+
+	return true;
+}
+
+static inline void vma_lock_free(struct vm_area_struct *vma)
+{
+	kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+static inline void vma_lock_free(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;

 	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-	if (vma)
-		vma_init(vma, mm);
+	if (!vma)
+		return NULL;
+
+	vma_init(vma, mm);
+	if (!vma_lock_alloc(vma)) {
+		kmem_cache_free(vm_area_cachep, vma);
+		return NULL;
+	}
+
 	return vma;
 }

@@ -469,26 +505,54 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 {
 	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);

-	if (new) {
-		ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
-		ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
-		/*
-		 * orig->shared.rb may be modified concurrently, but the clone
-		 * will be reinitialized.
-		 */
-		data_race(memcpy(new, orig, sizeof(*new)));
-		INIT_LIST_HEAD(&new->anon_vma_chain);
-		dup_anon_vma_name(orig, new);
+	if (!new)
+		return NULL;
+
+	ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+	ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+	/*
+	 * orig->shared.rb may be modified concurrently, but the clone
+	 * will be reinitialized.
+	 */
+	data_race(memcpy(new, orig, sizeof(*new)));
+	if (!vma_lock_alloc(new)) {
+		kmem_cache_free(vm_area_cachep, new);
+		return NULL;
 	}
+	INIT_LIST_HEAD(&new->anon_vma_chain);
+	dup_anon_vma_name(orig, new);
+
 	return new;
 }

-void vm_area_free(struct vm_area_struct *vma)
+void __vm_area_free(struct vm_area_struct *vma)
 {
 	free_anon_vma_name(vma);
+	vma_lock_free(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 }

+#ifdef CONFIG_PER_VMA_LOCK
+static void vm_area_free_rcu_cb(struct rcu_head *head)
+{
+	struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
+						  vm_rcu);
+
+	/* The vma should not be locked while being destroyed. */
+	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
+	__vm_area_free(vma);
+}
+#endif
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+	call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
+#else
+	__vm_area_free(vma);
+#endif
+}
+
 static void account_kernel_stack(struct task_struct *tsk, int account)
 {
 	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@ -1132,6 +1196,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	seqcount_init(&mm->write_protect_seq);
 	mmap_init_lock(mm);
 	INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+	mm->mm_lock_seq = 0;
+#endif
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
@@ -3074,6 +3141,9 @@ void __init proc_caches_init(void)
 			NULL);

 	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+#ifdef CONFIG_PER_VMA_LOCK
+	vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+#endif
 	mmap_init();
 	nsproxy_cache_init();
 }
diff --git a/mm/Kconfig b/mm/Kconfig
index cf2e47030fe8..459af2123189 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1202,6 +1202,18 @@ config LRU_GEN_STATS
 	  This option has a per-memcg and per-node memory overhead.
 # }

+config ARCH_SUPPORTS_PER_VMA_LOCK
+	def_bool n
+
+config PER_VMA_LOCK
+	def_bool y
+	depends on ARCH_SUPPORTS_PER_VMA_LOCK && MMU && SMP
+	help
+	  Allow per-vma locking during page fault handling.
+
+	  This feature allows locking each virtual memory area separately when
+	  handling page faults instead of taking mmap_lock.
+
 source "mm/damon/Kconfig"

 endmenu
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index c3547a373c9c..4965a7333a3f 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -279,3 +279,9 @@ config DEBUG_KMEMLEAK_AUTO_SCAN

 	  If unsure, say Y.

+config PER_VMA_LOCK_STATS
+	bool "Statistics for per-vma locks"
+	depends on PER_VMA_LOCK
+	default y
+	help
+	  Statistics for per-vma locks.
diff --git a/mm/init-mm.c b/mm/init-mm.c
index c9327abb771c..33269314e060 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -37,6 +37,9 @@ struct mm_struct init_mm = {
 	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
+#ifdef CONFIG_PER_VMA_LOCK
+	.mm_lock_seq	= 0,
+#endif
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
 #ifdef CONFIG_IOMMU_SVA
diff --git a/mm/internal.h b/mm/internal.h
index 7920a8b7982e..0c455d6e4e3e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -105,7 +105,7 @@ void folio_activate(struct folio *folio);

 void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
 		   struct vm_area_struct *start_vma, unsigned long floor,
-		   unsigned long ceiling);
+		   unsigned long ceiling, bool mm_wr_locked);
 void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);

 struct zap_details;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0ec69b96b497..37a52a0ec9da 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1053,6 +1053,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	if (result != SCAN_SUCCEED)
 		goto out_up_write;

+	vma_start_write(vma);
 	anon_vma_lock_write(vma->anon_vma);

 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
@@ -1516,6 +1517,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 		goto drop_hpage;
 	}

+	/* Lock the vma before taking i_mmap and page table locks */
+	vma_start_write(vma);
+
 	/*
 	 * We need to lock the mapping so that from here on, only GUP-fast and
 	 * hardware page walks can access the parts of the page tables that
@@ -1693,6 +1697,10 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
 		result = SCAN_PTE_MAPPED_HUGEPAGE;
 		if ((cc->is_khugepaged || is_target) &&
 		    mmap_write_trylock(mm)) {
+			/* trylock for the same lock inversion as above */
+			if (!vma_try_start_write(vma))
+				goto unlock_next;
+
 			/*
 			 * Re-check whether we have an ->anon_vma, because
 			 * collapse_and_free_pmd() requires that either no
diff --git a/mm/memory.c b/mm/memory.c
index 01a23ad48a04..c76183ced67a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -348,7 +348,7 @@ void free_pgd_range(struct mmu_gather *tlb,

 void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
 		   struct vm_area_struct *vma, unsigned long floor,
-		   unsigned long ceiling)
+		   unsigned long ceiling, bool mm_wr_locked)
 {
 	MA_STATE(mas, mt, vma->vm_end, vma->vm_end);

@@ -366,6 +366,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
 		 * Hide vma from rmap and truncate_pagecache before freeing
 		 * pgtables
 		 */
+		if (mm_wr_locked)
+			vma_start_write(vma);
 		unlink_anon_vmas(vma);
 		unlink_file_vma(vma);

@@ -380,6 +382,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
 			       && !is_vm_hugetlb_page(next)) {
 				vma = next;
 				next = mas_find(&mas, ceiling - 1);
+				if (mm_wr_locked)
+					vma_start_write(vma);
 				unlink_anon_vmas(vma);
 				unlink_file_vma(vma);
 			}
@@ -3698,6 +3702,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (!pte_unmap_same(vmf))
 		goto out;

+	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+		ret = VM_FAULT_RETRY;
+		goto out;
+	}
+
 	entry = pte_to_swp_entry(vmf->orig_pte);
 	if (unlikely(non_swap_entry(entry))) {
 		if (is_migration_entry(entry)) {
@@ -5230,6 +5239,67 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL_GPL(handle_mm_fault);

+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
+ * stable and not isolated. If the VMA is not found or is being modified the
+ * function returns NULL.
+ */
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+					  unsigned long address)
+{
+	MA_STATE(mas, &mm->mm_mt, address, address);
+	struct vm_area_struct *vma;
+
+	rcu_read_lock();
+retry:
+	vma = mas_walk(&mas);
+	if (!vma)
+		goto inval;
+
+	/* Only anonymous vmas are supported for now */
+	if (!vma_is_anonymous(vma))
+		goto inval;
+
+	/* find_mergeable_anon_vma uses adjacent vmas which are not locked */
+	if (!vma->anon_vma)
+		goto inval;
+
+	if (!vma_start_read(vma))
+		goto inval;
+
+	/*
+	 * Due to the possibility of userfault handler dropping mmap_lock, avoid
+	 * it for now and fall back to page fault handling under mmap_lock.
+	 */
+	if (userfaultfd_armed(vma)) {
+		vma_end_read(vma);
+		goto inval;
+	}
+
+	/* Check since vm_start/vm_end might change before we lock the VMA */
+	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
+		vma_end_read(vma);
+		goto inval;
+	}
+
+	/* Check if the VMA got isolated after we found it */
+	if (vma->detached) {
+		vma_end_read(vma);
+		count_vm_vma_lock_event(VMA_LOCK_MISS);
+		/* The area was replaced with another one */
+		goto retry;
+	}
+
+	rcu_read_unlock();
+	return vma;
+inval:
+	rcu_read_unlock();
+	count_vm_vma_lock_event(VMA_LOCK_ABORT);
+	return NULL;
+}
+#endif /* CONFIG_PER_VMA_LOCK */
+
 #ifndef __PAGETABLE_P4D_FOLDED
 /*
  * Allocate p4d page table.
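The do_swap_page() change above illustrates the opt-out contract for paths that cannot yet run under the per-VMA lock: the handler bails out with VM_FAULT_RETRY, which the arch fast path counts as VMA_LOCK_RETRY before falling back to mmap_lock. In general terms, and with my_handle_step() as a hypothetical fault stage rather than kernel code, the contract looks like this:

/*
 * Sketch of the fallback contract, assuming the FAULT_FLAG_VMA_LOCK
 * flag from this patch. Any stage that needs mmap_lock (or may drop
 * it, like the userfaultfd path) declines and forces a retry.
 */
static vm_fault_t my_handle_step(struct vm_fault *vmf)
{
	if (vmf->flags & FAULT_FLAG_VMA_LOCK)
		return VM_FAULT_RETRY;	/* redo the fault under mmap_lock */

	/* ... slow work that may sleep on I/O or take mmap_lock ... */
	return 0;
}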
diff --git a/mm/mmap.c b/mm/mmap.c
index d5475fbf5729..cbac45aa39ae 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -133,7 +133,7 @@ void unlink_file_vma(struct vm_area_struct *vma)
 /*
  * Close a vm structure and free it.
  */
-static void remove_vma(struct vm_area_struct *vma)
+static void remove_vma(struct vm_area_struct *vma, bool unreachable)
 {
 	might_sleep();
 	if (vma->vm_ops && vma->vm_ops->close)
@@ -141,7 +141,10 @@ static void remove_vma(struct vm_area_struct *vma)
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	mpol_put(vma_policy(vma));
-	vm_area_free(vma);
+	if (unreachable)
+		__vm_area_free(vma);
+	else
+		vm_area_free(vma);
 }

 static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
@@ -502,6 +505,15 @@ static inline void init_vma_prep(struct vma_prepare *vp,
  */
 static inline void vma_prepare(struct vma_prepare *vp)
 {
+	vma_start_write(vp->vma);
+	if (vp->adj_next)
+		vma_start_write(vp->adj_next);
+	/* vp->insert is always a newly created VMA, no need for locking */
+	if (vp->remove)
+		vma_start_write(vp->remove);
+	if (vp->remove2)
+		vma_start_write(vp->remove2);
+
 	if (vp->file) {
 		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);

@@ -590,6 +602,7 @@ static inline void vma_complete(struct vma_prepare *vp,

 	if (vp->remove) {
 again:
+		vma_mark_detached(vp->remove, true);
 		if (vp->file) {
 			uprobe_munmap(vp->remove, vp->remove->vm_start,
 				      vp->remove->vm_end);
@@ -683,12 +696,12 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	if (vma_iter_prealloc(vmi))
 		goto nomem;

+	vma_prepare(&vp);
 	vma_adjust_trans_huge(vma, start, end, 0);
 	/* VMA iterator points to previous, so set to start if necessary */
 	if (vma_iter_addr(vmi) != start)
 		vma_iter_set(vmi, start);

-	vma_prepare(&vp);
 	vma->vm_start = start;
 	vma->vm_end = end;
 	vma->vm_pgoff = pgoff;
@@ -723,8 +736,8 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		return -ENOMEM;

 	init_vma_prep(&vp, vma);
-	vma_adjust_trans_huge(vma, start, end, 0);
 	vma_prepare(&vp);
+	vma_adjust_trans_huge(vma, start, end, 0);

 	if (vma->vm_start < start)
 		vma_iter_clear(vmi, vma->vm_start, start);
@@ -994,12 +1007,12 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
 	if (vma_iter_prealloc(vmi))
 		return NULL;

-	vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next);
 	init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
 	VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
 		   vp.anon_vma != adjust->anon_vma);

 	vma_prepare(&vp);
+	vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next);
 	if (vma_start < vma->vm_start || vma_end > vma->vm_end)
 		vma_expanded = true;

@@ -2157,7 +2170,7 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
 		if (vma->vm_flags & VM_ACCOUNT)
 			nr_accounted += nrpages;
 		vm_stat_account(mm, vma->vm_flags, -nrpages);
-		remove_vma(vma);
+		remove_vma(vma, false);
 	}
 	vm_unacct_memory(nr_accounted);
 	validate_mm(mm);
@@ -2180,7 +2193,8 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
 	update_hiwater_rss(mm);
 	unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked);
 	free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
-		      next ? next->vm_start : USER_PGTABLES_CEILING);
+		      next ? next->vm_start : USER_PGTABLES_CEILING,
+		      mm_wr_locked);
 	tlb_finish_mmu(&tlb);
 }

@@ -2236,10 +2250,10 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	if (new->vm_ops && new->vm_ops->open)
 		new->vm_ops->open(new);

-	vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
 	init_vma_prep(&vp, vma);
 	vp.insert = new;
 	vma_prepare(&vp);
+	vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);

 	if (new_below) {
 		vma->vm_start = addr;
@@ -2283,10 +2297,12 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
 static inline int munmap_sidetree(struct vm_area_struct *vma,
 				   struct ma_state *mas_detach)
 {
+	vma_start_write(vma);
 	mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1);
 	if (mas_store_gfp(mas_detach, vma, GFP_KERNEL))
 		return -ENOMEM;

+	vma_mark_detached(vma, true);
 	if (vma->vm_flags & VM_LOCKED)
 		vma->vm_mm->locked_vm -= vma_pages(vma);

@@ -2942,9 +2958,9 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		if (vma_iter_prealloc(vmi))
 			goto unacct_fail;

-		vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
 		init_vma_prep(&vp, vma);
 		vma_prepare(&vp);
+		vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
 		vma->vm_end = addr + len;
 		vm_flags_set(vma, VM_SOFTDIRTY);
 		vma_iter_store(vmi, vma);
@@ -3077,7 +3093,7 @@ void exit_mmap(struct mm_struct *mm)
 	mmap_write_lock(mm);
 	mt_clear_in_rcu(&mm->mm_mt);
 	free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
-		      USER_PGTABLES_CEILING);
+		      USER_PGTABLES_CEILING, true);
 	tlb_finish_mmu(&tlb);

 	/*
@@ -3088,7 +3104,7 @@ void exit_mmap(struct mm_struct *mm)
 	do {
 		if (vma->vm_flags & VM_ACCOUNT)
 			nr_accounted += vma_pages(vma);
-		remove_vma(vma);
+		remove_vma(vma, true);
 		count++;
 		cond_resched();
 	} while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
@@ -3211,6 +3227,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			get_file(new_vma->vm_file);
 		if (new_vma->vm_ops && new_vma->vm_ops->open)
 			new_vma->vm_ops->open(new_vma);
+		vma_start_write(new_vma);
 		if (vma_link(mm, new_vma))
 			goto out_vma_link;
 		*need_rmap_locks = false;
@@ -3505,6 +3522,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
  * of mm/rmap.c:
  *   - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
  *     hugetlb mapping);
+ *   - all vmas marked locked
  *   - all i_mmap_rwsem locks;
  *   - all anon_vma->rwseml
 *
@@ -3527,6 +3545,13 @@ int mm_take_all_locks(struct mm_struct *mm)

 	mutex_lock(&mm_all_locks_mutex);

+	mas_for_each(&mas, vma, ULONG_MAX) {
+		if (signal_pending(current))
+			goto out_unlock;
+		vma_start_write(vma);
+	}
+
+	mas_set(&mas, 0);
 	mas_for_each(&mas, vma, ULONG_MAX) {
 		if (signal_pending(current))
 			goto out_unlock;
@@ -3616,6 +3641,7 @@ void mm_drop_all_locks(struct mm_struct *mm)
 		if (vma->vm_file && vma->vm_file->f_mapping)
 			vm_unlock_mapping(vma->vm_file->f_mapping);
 	}
+	vma_end_write_all(mm);

 	mutex_unlock(&mm_all_locks_mutex);
 }
diff --git a/mm/mremap.c b/mm/mremap.c
index 411a85682b58..dd541e59edda 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -623,6 +623,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 			return -ENOMEM;
 	}

+	vma_start_write(vma);
 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
 	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
 			   &need_rmap_locks);
diff --git a/mm/rmap.c b/mm/rmap.c
index 8632e02661ac..cfdaa56cad3e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -25,21 +25,22 @@
  *     mapping->invalidate_lock (in filemap_fault)
  *       page->flags PG_locked (lock_page)
  *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
- *           mapping->i_mmap_rwsem
- *             anon_vma->rwsem
- *               mm->page_table_lock or pte_lock
- *                 swap_lock (in swap_duplicate, swap_info_get)
- *                   mmlist_lock (in mmput, drain_mmlist and others)
- *                   mapping->private_lock (in block_dirty_folio)
- *                     folio_lock_memcg move_lock (in block_dirty_folio)
- *                       i_pages lock (widely used)
- *                         lruvec->lru_lock (in folio_lruvec_lock_irq)
- *                   inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- *                   bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
- *                     sb_lock (within inode_lock in fs/fs-writeback.c)
- *                     i_pages lock (widely used, in set_page_dirty,
- *                               in arch-dependent flush_dcache_mmap_lock,
- *                               within bdi.wb->list_lock in __sync_single_inode)
+ *           vma_start_write
+ *             mapping->i_mmap_rwsem
+ *               anon_vma->rwsem
+ *                 mm->page_table_lock or pte_lock
+ *                   swap_lock (in swap_duplicate, swap_info_get)
+ *                     mmlist_lock (in mmput, drain_mmlist and others)
+ *                     mapping->private_lock (in block_dirty_folio)
+ *                       folio_lock_memcg move_lock (in block_dirty_folio)
+ *                         i_pages lock (widely used)
+ *                           lruvec->lru_lock (in folio_lruvec_lock_irq)
+ *                     inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ *                     bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
+ *                       sb_lock (within inode_lock in fs/fs-writeback.c)
+ *                       i_pages lock (widely used, in set_page_dirty,
+ *                                 in arch-dependent flush_dcache_mmap_lock,
+ *                                 within bdi.wb->list_lock in __sync_single_inode)
  *
  * anon_vma->rwsem,mapping->i_mmap_rwsem   (memory_failure, collect_procs_anon)
  *   ->tasklist_lock
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1ea6a5ce1c41..4f1089a1860e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1399,6 +1399,12 @@ const char * const vmstat_text[] = {
 	"direct_map_level2_splits",
 	"direct_map_level3_splits",
 #endif
+#ifdef CONFIG_PER_VMA_LOCK_STATS
+	"vma_lock_success",
+	"vma_lock_abort",
+	"vma_lock_retry",
+	"vma_lock_miss",
+#endif
 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
--
2.40.1

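The freeing discipline this patch settles on (vm_area_free() defers through call_rcu() unless the caller knows the VMA is already unreachable, as in exit_mmap()) is an instance of the usual RCU deferred-free pattern. A generic sketch with illustrative names, not the patch's own code:

#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Illustrative object embedding its own rcu_head, like vm_area_struct. */
struct demo_obj {
	int payload;
	struct rcu_head rcu;	/* storage reused for deferred freeing */
};

static void demo_free_rcu_cb(struct rcu_head *head)
{
	/* Recover the enclosing object from its rcu_head member. */
	struct demo_obj *obj = container_of(head, struct demo_obj, rcu);

	kfree(obj);
}

/*
 * Readers that found obj under rcu_read_lock() may still be running;
 * defer the free until a grace period has elapsed, unless the caller
 * can prove no reader can reach the object anymore.
 */
static void demo_free(struct demo_obj *obj, bool unreachable)
{
	if (unreachable)
		kfree(obj);	/* e.g. the whole mm is going away */
	else
		call_rcu(&obj->rcu, demo_free_rcu_cb);
}

This is also why vm_area_struct overlays vm_rcu on vm_start/vm_end in a union above: once the VMA is detached, those fields are dead and their storage can host the rcu_head.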
From 56fd0f1397471be0786d1f696598173b9ebb9a35 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Tue, 25 Apr 2023 17:19:06 +0200
Subject: [PATCH 09/10] sched

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 arch/x86/kernel/itmt.c         |  23 +--
 arch/x86/kernel/smpboot.c      |   4 +-
 include/linux/sched.h          |   3 +
 include/linux/sched/sd_flags.h |   5 +-
 kernel/sched/core.c            |   4 +-
 kernel/sched/debug.c           |   1 +
 kernel/sched/fair.c            | 265 ++++++++++++++++++++-------------
 kernel/sched/features.h        |   1 +
 kernel/sched/pelt.c            |  60 ++++++++
 kernel/sched/pelt.h            |  42 +++++-
 kernel/sched/sched.h           |  23 ++-
 11 files changed, 294 insertions(+), 137 deletions(-)

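The fair.c changes below hinge on one small piece of arithmetic: dur_avg is an exponentially weighted average of how long a task runs between voluntary sleeps, and a task counts as "short" for the SIS_SHORT feature when eight times that average is still below sysctl_sched_min_granularity. In isolation, and assuming the scheduler's usual 1/8-gain update_avg() helper (its body here is reproduced from memory, so treat it as a sketch), the bookkeeping looks like this:

/* EWMA helper as found in kernel/sched/fair.c; 1/8 gain assumed. */
static inline void update_avg(u64 *avg, u64 sample)
{
	s64 diff = sample - *avg;

	*avg += diff / 8;
}

/* Called at dequeue time, only when the task sleeps voluntarily. */
static void dur_avg_update(struct task_struct *p, bool task_sleep)
{
	u64 dur;

	if (!task_sleep)
		return;

	/* Runtime accumulated since the previous voluntary sleep. */
	dur = p->se.sum_exec_runtime - p->se.prev_sleep_sum_runtime;
	p->se.prev_sleep_sum_runtime = p->se.sum_exec_runtime;
	update_avg(&p->se.dur_avg, dur);
}

With a default minimum granularity of 0.75 ms, a task therefore qualifies as short only when its average run between sleeps stays under roughly 94 us, which keeps the heuristic limited to genuinely flip-flopping wakees.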
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 9ff480e94511..6510883c5e81 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -174,32 +174,19 @@ int arch_asym_cpu_priority(int cpu)

 /**
  * sched_set_itmt_core_prio() - Set CPU priority based on ITMT
- * @prio:	Priority of cpu core
- * @core_cpu:	The cpu number associated with the core
+ * @prio:	Priority of @cpu
+ * @cpu:	The CPU number
  *
  * The pstate driver will find out the max boost frequency
  * and call this function to set a priority proportional
- * to the max boost frequency. CPU with higher boost
+ * to the max boost frequency. CPUs with higher boost
  * frequency will receive higher priority.
  *
  * No need to rebuild sched domain after updating
  * the CPU priorities. The sched domains have no
  * dependency on CPU priorities.
  */
-void sched_set_itmt_core_prio(int prio, int core_cpu)
+void sched_set_itmt_core_prio(int prio, int cpu)
 {
-	int cpu, i = 1;
-
-	for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
-		int smt_prio;
-
-		/*
-		 * Ensure that the siblings are moved to the end
-		 * of the priority chain and only used when
-		 * all other high priority cpus are out of capacity.
-		 */
-		smt_prio = prio * smp_num_siblings / (i * i);
-		per_cpu(sched_core_priority, cpu) = smt_prio;
-		i++;
-	}
+	per_cpu(sched_core_priority, cpu) = prio;
 }
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9013bb28255a..cea297d97034 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -547,7 +547,7 @@ static int x86_core_flags(void)
 #ifdef CONFIG_SCHED_SMT
 static int x86_smt_flags(void)
 {
-	return cpu_smt_flags() | x86_sched_itmt_flags();
+	return cpu_smt_flags();
 }
 #endif
 #ifdef CONFIG_SCHED_CLUSTER
@@ -578,7 +578,7 @@ static struct sched_domain_topology_level x86_hybrid_topology[] = {
 #ifdef CONFIG_SCHED_MC
 	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
 #endif
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(DIE) },
 	{ NULL, },
 };

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..6d398b337b0d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -557,6 +557,9 @@ struct sched_entity {
 	u64				prev_sum_exec_runtime;

 	u64				nr_migrations;
+	u64				prev_sleep_sum_runtime;
+	/* average duration of a task */
+	u64				dur_avg;

 #ifdef CONFIG_FAIR_GROUP_SCHED
 	int				depth;
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 57bde66d95f7..fad77b5172e2 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -132,12 +132,9 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
 /*
  * Place busy tasks earlier in the domain
  *
- * SHARED_CHILD: Usually set on the SMT level. Technically could be set further
- *               up, but currently assumed to be set from the base domain
- *               upwards (see update_top_cache_domain()).
  * NEEDS_GROUPS: Load balancing flag.
  */
-SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS)

 /*
  * Prefer to place tasks in a sibling domain
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d18c3969f90..17bb9637f314 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -724,7 +724,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
 		update_irq_load_avg(rq, irq_delta + steal);
 #endif
-	update_rq_clock_pelt(rq, delta);
+	update_rq_clock_task_mult(rq, delta);
 }

 void update_rq_clock(struct rq *rq)
@@ -4434,6 +4434,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
+	p->se.dur_avg			= 0;
+	p->se.prev_sleep_sum_runtime	= 0;
 	INIT_LIST_HEAD(&p->se.group_node);

 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1637b65ba07a..8d64fba16cfe 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1024,6 +1024,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 	__PS("nr_involuntary_switches", p->nivcsw);

 	P(se.load.weight);
+	P(se.dur_avg);
 #ifdef CONFIG_SMP
 	P(se.avg.load_sum);
 	P(se.avg.runnable_sum);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 96c66b50ee48..0f92281fbed9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1082,6 +1082,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */

+static inline bool is_core_idle(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+	int sibling;
+
+	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+		if (cpu == sibling)
+			continue;
+
+		if (!idle_cpu(sibling))
+			return false;
+	}
+#endif
+
+	return true;
+}
+
 #ifdef CONFIG_NUMA
 #define NUMA_IMBALANCE_MIN 2

@@ -1718,23 +1735,6 @@ struct numa_stats {
 	int idle_cpu;
 };

-static inline bool is_core_idle(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
-	int sibling;
-
-	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
-		if (cpu == sibling)
-			continue;
-
-		if (!idle_cpu(sibling))
-			return false;
-	}
-#endif
-
-	return true;
-}
-
 struct task_numa_env {
 	struct task_struct *p;

@@ -6333,6 +6333,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)

 static void set_next_buddy(struct sched_entity *se);

+static inline void dur_avg_update(struct task_struct *p, bool task_sleep)
+{
+	u64 dur;
+
+	if (!task_sleep)
+		return;
+
+	dur = p->se.sum_exec_runtime - p->se.prev_sleep_sum_runtime;
+	p->se.prev_sleep_sum_runtime = p->se.sum_exec_runtime;
+	update_avg(&p->se.dur_avg, dur);
+}
+
 /*
  * The dequeue_task method is called before nr_running is
  * decreased. We remove the task from the rbtree and
@@ -6405,6 +6417,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)

 dequeue_throttle:
 	util_est_update(&rq->cfs, p, task_sleep);
+	dur_avg_update(p, task_sleep);
 	hrtick_update(rq);
 }

@@ -6538,6 +6551,23 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }

+/*
+ * If a task switches in and then voluntarily relinquishes the
+ * CPU quickly, it is regarded as a short duration task.
+ *
+ * SIS_SHORT tries to wake up the short wakee on current CPU. This
+ * aims to avoid race condition among CPUs due to frequent context
+ * switch. Besides, the candidate short task should not be the one
+ * that wakes up more than one tasks, otherwise SIS_SHORT might
+ * stack too many tasks on current CPU.
+ */
+static inline int is_short_task(struct task_struct *p)
+{
+	return sched_feat(SIS_SHORT) && !p->wakee_flips &&
+	       p->se.dur_avg &&
+	       ((p->se.dur_avg * 8) < sysctl_sched_min_granularity);
+}
+
 /*
  * The purpose of wake_affine() is to quickly determine on which CPU we can run
  * soonest. For the purpose of speed we only consider the waking and previous
@@ -6574,6 +6604,11 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
 	if (available_idle_cpu(prev_cpu))
 		return prev_cpu;

+	/* The only running task is a short duration one. */
+	if (cpu_rq(this_cpu)->nr_running == 1 &&
+	    is_short_task(rcu_dereference(cpu_curr(this_cpu))))
+		return this_cpu;
+
 	return nr_cpumask_bits;
 }

@@ -6948,6 +6983,20 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 			/* overloaded LLC is unlikely to have idle cpu/core */
 			if (nr == 1)
 				return -1;
+
+			/*
+			 * If the scan number suggested by SIS_UTIL is smaller
+			 * than 60% of llc_weight, it indicates a util_avg% higher
+			 * than 50%. System busier than this could lower its bar to
+			 * choose a compromised "idle" CPU. This co-exists with
+			 * !has_idle_core to not stack too many tasks on one CPU.
+			 */
+			if (!has_idle_core && this == target &&
+			    (5 * nr < 3 * sd->span_weight) &&
+			    cpu_rq(target)->nr_running <= 1 &&
+			    is_short_task(p) &&
+			    is_short_task(rcu_dereference(cpu_curr(target))))
+				return target;
 		}
 	}

@@ -9288,96 +9337,65 @@ group_type group_classify(unsigned int imbalance_pct,
 }

 /**
- * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks
- * @dst_cpu:	Destination CPU of the load balancing
- * @sds:	Load-balancing data with statistics of the local group
- * @sgs:	Load-balancing statistics of the candidate busiest group
- * @sg:		The candidate busiest group
- *
- * Check the state of the SMT siblings of both @sds::local and @sg and decide
- * if @dst_cpu can pull tasks.
+ * sched_use_asym_prio - Check whether asym_packing priority must be used
+ * @sd:		The scheduling domain of the load balancing
+ * @cpu:	A CPU
  *
- * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of
- * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
- * only if @dst_cpu has higher priority.
+ * Always use CPU priority when balancing load between SMT siblings. When
+ * balancing load between cores, it is not sufficient that @cpu is idle. Only
+ * use CPU priority if the whole core is idle.
  *
- * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more
- * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority.
- * Bigger imbalances in the number of busy CPUs will be dealt with in
- * update_sd_pick_busiest().
- *
- * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
- * of @dst_cpu are idle and @sg has lower priority.
- *
- * Return: true if @dst_cpu can pull tasks, false otherwise.
+ * Returns: True if the priority of @cpu must be followed. False otherwise.
  */
-static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
-				    struct sg_lb_stats *sgs,
-				    struct sched_group *sg)
+static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
 {
 #ifdef CONFIG_SCHED_SMT
-	bool local_is_smt, sg_is_smt;
-	int sg_busy_cpus;
-
-	local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
-	sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
-
-	sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
-
-	if (!local_is_smt) {
-		/*
-		 * If we are here, @dst_cpu is idle and does not have SMT
-		 * siblings. Pull tasks if candidate group has two or more
-		 * busy CPUs.
-		 */
-		if (sg_busy_cpus >= 2) /* implies sg_is_smt */
-			return true;
-
-		/*
-		 * @dst_cpu does not have SMT siblings. @sg may have SMT
-		 * siblings and only one is busy. In such case, @dst_cpu
-		 * can help if it has higher priority and is idle (i.e.,
-		 * it has no running tasks).
-		 */
-		return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
-	}
-
-	/* @dst_cpu has SMT siblings. */
-
-	if (sg_is_smt) {
-		int local_busy_cpus = sds->local->group_weight -
-				      sds->local_stat.idle_cpus;
-		int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
-
-		if (busy_cpus_delta == 1)
-			return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
-
-		return false;
-	}
-
-	/*
-	 * @sg does not have SMT siblings. Ensure that @sds::local does not end
-	 * up with more than one busy SMT sibling and only pull tasks if there
-	 * are not busy CPUs (i.e., no CPU has running tasks).
-	 */
-	if (!sds->local_stat.sum_nr_running)
-		return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
+	if (!sched_smt_active())
+		return true;

-	return false;
+	return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
 #else
-	/* Always return false so that callers deal with non-SMT cases. */
-	return false;
+	return true;
 #endif
 }

+/**
+ * sched_asym - Check if the destination CPU can do asym_packing load balance
+ * @env:	The load balancing environment
+ * @sds:	Load-balancing data with statistics of the local group
+ * @sgs:	Load-balancing statistics of the candidate busiest group
+ * @group:	The candidate busiest group
+ *
+ * @env::dst_cpu can do asym_packing if it has higher priority than the
+ * preferred CPU of @group.
+ *
+ * SMT is a special case. If we are balancing load between cores, @env::dst_cpu
+ * can do asym_packing balance only if all its SMT siblings are idle. Also, it
+ * can only do it if @group is an SMT group and has exactly one busy CPU. Larger
+ * imbalances in the number of CPUs are dealt with in find_busiest_group().
+ *
+ * If we are balancing load within an SMT core, or at DIE domain level, always
+ * proceed.
+ *
+ * Return: true if @env::dst_cpu can do asym_packing load balance. False
+ * otherwise.
+ */
 static inline bool
 sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
 	   struct sched_group *group)
 {
-	/* Only do SMT checks if either local or candidate have SMT siblings */
-	if ((sds->local->flags & SD_SHARE_CPUCAPACITY) ||
-	    (group->flags & SD_SHARE_CPUCAPACITY))
-		return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group);
+	/* Ensure that the whole local core is idle, if applicable. */
+	if (!sched_use_asym_prio(env->sd, env->dst_cpu))
+		return false;
+
+	/*
+	 * CPU priorities do not make sense for SMT cores with more than one
+	 * busy sibling.
+	 */
+	if (group->flags & SD_SHARE_CPUCAPACITY) {
+		if (sgs->group_weight - sgs->idle_cpus != 1)
+			return false;
+	}

 	return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
 }

@@ -9567,10 +9585,22 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* contention when accessing shared HW resources.
*
* XXX for now avg_load is not computed and always 0 so we
- * select the 1st one.
+ * select the 1st one, except if @sg is composed of SMT
+ * siblings.
*/
- if (sgs->avg_load <= busiest->avg_load)
+
+ if (sgs->avg_load < busiest->avg_load)
return false;
+
+ if (sgs->avg_load == busiest->avg_load) {
+ /*
+ * SMT sched groups need more help than non-SMT groups.
+ * If @sg happens to also be SMT, either choice is good.
+ */
+ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
+ return false;
+ }
+
break;

case group_has_spare:
@@ -10045,7 +10075,6 @@ static void update_idle_cpu_scan(struct lb_env *env,

static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
- struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
@@ -10086,8 +10115,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
sg = sg->next;
} while (sg != env->sd->groups);

- /* Tag domain that child domain prefers tasks go to siblings first */
- sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+ /*
+ * Indicate that the child domain of the busiest group prefers tasks
+ * go to a child's sibling domains first. NB the flags of a sched group
+ * are those of the child domain.
+ */
+ if (sds->busiest)
+ sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);


if (env->sd->flags & SD_NUMA)
@@ -10397,7 +10431,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto out_balanced;
}

- /* Try to move all excess tasks to child's sibling domain */
+ /*
+ * Try to move all excess tasks to a sibling domain of the busiest
+ * group's child domain.
+ */
if (sds.prefer_sibling && local->group_type == group_has_spare &&
busiest->sum_nr_running > local->sum_nr_running + 1)
goto force_balance;
@@ -10499,8 +10536,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
nr_running == 1)
continue;

- /* Make sure we only pull tasks from a CPU of lower priority */
+ /*
+ * Make sure we only pull tasks from a CPU of lower priority
+ * when balancing between SMT siblings.
+ *
+ * If balancing between cores, let lower priority CPUs help
+ * SMT cores with more than one busy sibling.
+ */
if ((env->sd->flags & SD_ASYM_PACKING) &&
+ sched_use_asym_prio(env->sd, i) &&
sched_asym_prefer(i, env->dst_cpu) &&
nr_running == 1)
continue;
@@ -10589,12 +10633,19 @@ static inline bool
asym_active_balance(struct lb_env *env)
{
/*
- * ASYM_PACKING needs to force migrate tasks from busy but
- * lower priority CPUs in order to pack all tasks in the
- * highest priority CPUs.
+ * ASYM_PACKING needs to force migrate tasks from busy but lower
+ * priority CPUs in order to pack all tasks in the highest priority
+ * CPUs. When done between cores, do it only if the whole core
+ * is idle.
+ *
+ * If @env::src_cpu is an SMT core with busy siblings, let
+ * the lower priority @env::dst_cpu help it. Do not follow
+ * CPU priority.
*/
return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
- sched_asym_prefer(env->dst_cpu, env->src_cpu);
+ sched_use_asym_prio(env->sd, env->dst_cpu) &&
+ (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
+ !sched_use_asym_prio(env->sd, env->src_cpu));
}

static inline bool
@@ -11328,9 +11379,13 @@ static void nohz_balancer_kick(struct rq *rq)
* When ASYM_PACKING; see if there's a more preferred CPU
* currently idle; in which case, kick the ILB to move tasks
* around.
+ *
+ * When balancing between cores, all the SMT siblings of the
+ * preferred CPU must be idle.
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
- if (sched_asym_prefer(i, cpu)) {
+ if (sched_use_asym_prio(sd, i) &&
+ sched_asym_prefer(i, cpu)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
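
Taken together, the hunks above reduce asym_packing to one rule: CPU priority is
only consulted when the destination core is fully idle, and an SMT busiest group
is only drained when exactly one of its siblings is busy. A minimal sketch of
that combined predicate (an illustrative condensation of the code above, not a
separate function added by the patch):

    static bool asym_packing_applies(struct lb_env *env, struct sg_lb_stats *sgs,
                                     struct sched_group *group)
    {
            /* Between cores, the whole destination core must be idle. */
            if (!sched_use_asym_prio(env->sd, env->dst_cpu))
                    return false;

            /* An SMT busiest group qualifies only with one busy sibling. */
            if ((group->flags & SD_SHARE_CPUCAPACITY) &&
                sgs->group_weight - sgs->idle_cpus != 1)
                    return false;

            return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
    }
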
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index ee7f23c76bd3..efdc29c42161 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -62,6 +62,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
*/
SCHED_FEAT(SIS_PROP, false)
SCHED_FEAT(SIS_UTIL, true)
+SCHED_FEAT(SIS_SHORT, true)

/*
* Issue a WARN when we do multiple update_rq_clock() calls
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 0f310768260c..036b0e2cd2b4 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -467,3 +467,63 @@ int update_irq_load_avg(struct rq *rq, u64 running)
return ret;
}
#endif
+
+__read_mostly unsigned int sched_pelt_lshift;
+
+#ifdef CONFIG_SYSCTL
+static unsigned int sysctl_sched_pelt_multiplier = 1;
+
+int sched_pelt_multiplier(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ static DEFINE_MUTEX(mutex);
+ unsigned int old;
+ int ret;
+
+ mutex_lock(&mutex);
+ old = sysctl_sched_pelt_multiplier;
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ret)
+ goto undo;
+ if (!write)
+ goto done;
+
+ switch (sysctl_sched_pelt_multiplier) {
+ case 1:
+ fallthrough;
+ case 2:
+ fallthrough;
+ case 4:
+ WRITE_ONCE(sched_pelt_lshift,
+ sysctl_sched_pelt_multiplier >> 1);
+ goto done;
+ default:
+ ret = -EINVAL;
+ }
+
+undo:
+ sysctl_sched_pelt_multiplier = old;
+done:
+ mutex_unlock(&mutex);
+
+ return ret;
+}
+
+static struct ctl_table sched_pelt_sysctls[] = {
+ {
+ .procname = "sched_pelt_multiplier",
+ .data = &sysctl_sched_pelt_multiplier,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_pelt_multiplier,
+ },
+ {}
+};
+
+static int __init sched_pelt_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_pelt_sysctls);
+ return 0;
+}
+late_initcall(sched_pelt_sysctl_init);
+#endif
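
The proc handler above accepts only multipliers of 1, 2 or 4 and stores the
matching left-shift (0, 1 or 2) in sched_pelt_lshift; invalid writes are rolled
back. A small user-space sketch of driving the new knob (a hypothetical test
program, not part of the patch):

    #include <stdio.h>

    int main(void)
    {
            /* Only 1, 2 and 4 are accepted; anything else returns -EINVAL
             * and the handler restores the previous value. */
            FILE *f = fopen("/proc/sys/kernel/sched_pelt_multiplier", "w");

            if (!f)
                    return 1;
            fprintf(f, "4\n");      /* PELT time now advances 4x faster */
            return fclose(f) ? 1 : 0;
    }
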
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 3a0e0dc28721..9b35b5072bae 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -61,6 +61,14 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
WRITE_ONCE(avg->util_est.enqueued, enqueued);
}

+static inline u64 rq_clock_task_mult(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ assert_clock_updated(rq);
+
+ return rq->clock_task_mult;
+}
+
static inline u64 rq_clock_pelt(struct rq *rq)
{
lockdep_assert_rq_held(rq);
@@ -72,7 +80,7 @@ static inline u64 rq_clock_pelt(struct rq *rq)
/* The rq is idle, we can sync to clock_task */
static inline void _update_idle_rq_clock_pelt(struct rq *rq)
{
- rq->clock_pelt = rq_clock_task(rq);
+ rq->clock_pelt = rq_clock_task_mult(rq);

u64_u32_store(rq->clock_idle, rq_clock(rq));
/* Paired with smp_rmb in migrate_se_pelt_lag() */
@@ -121,6 +129,27 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
rq->clock_pelt += delta;
}

+extern unsigned int sched_pelt_lshift;
+
+/*
+ * absolute time |1 |2 |3 |4 |5 |6 |
+ * @ mult = 1 --------****************--------****************-
+ * @ mult = 2 --------********----------------********---------
+ * @ mult = 4 --------****--------------------****-------------
+ * clock task mult
+ * @ mult = 2 | | |2 |3 | | | | |5 |6 | | |
+ * @ mult = 4 | | | | |2|3| | | | | | | | | | |5|6| | | | | | |
+ *
+ */
+static inline void update_rq_clock_task_mult(struct rq *rq, s64 delta)
+{
+ delta <<= READ_ONCE(sched_pelt_lshift);
+
+ rq->clock_task_mult += delta;
+
+ update_rq_clock_pelt(rq, delta);
+}
+
/*
* When rq becomes idle, we have to check if it has lost idle time
* because it was fully busy. A rq is fully used when the /Sum util_sum
@@ -147,7 +176,7 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
* rq's clock_task.
*/
if (util_sum >= divider)
- rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
+ rq->lost_idle_time += rq_clock_task_mult(rq) - rq->clock_pelt;

_update_idle_rq_clock_pelt(rq);
}
@@ -218,13 +247,18 @@ update_irq_load_avg(struct rq *rq, u64 running)
return 0;
}

-static inline u64 rq_clock_pelt(struct rq *rq)
+static inline u64 rq_clock_task_mult(struct rq *rq)
{
return rq_clock_task(rq);
}

+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+ return rq_clock_task_mult(rq);
+}
+
static inline void
-update_rq_clock_pelt(struct rq *rq, s64 delta) { }
+update_rq_clock_task_mult(struct rq *rq, s64 delta) { }

static inline void
update_idle_rq_clock_pelt(struct rq *rq) { }
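
The effect of update_rq_clock_task_mult() is that every task-clock delta is
scaled before it feeds the PELT clock, so utilization signals ramp up and decay
faster while wall-clock accounting is untouched. A worked example of the
arithmetic (a standalone illustration with assumed values):

    #include <stdio.h>

    int main(void)
    {
            long long delta = 1000000;   /* 1 ms of task clock, in ns */
            unsigned int lshift = 2;     /* sched_pelt_lshift for multiplier 4 */

            delta <<= lshift;
            printf("PELT clock advances by %lld ns\n", delta);  /* 4 ms */
            return 0;
    }
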
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3e8df6d31c1e..7331d436ebc4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1018,6 +1018,7 @@ struct rq {
u64 clock;
/* Ensure that all clocks are in the same cache line */
u64 clock_task ____cacheline_aligned;
+ u64 clock_task_mult;
u64 clock_pelt;
unsigned long lost_idle_time;
u64 clock_pelt_idle;
@@ -1772,6 +1773,13 @@ queue_balance_callback(struct rq *rq,
for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
__sd; __sd = __sd->parent)

+/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) |
+static const unsigned int SD_SHARED_CHILD_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
/**
* highest_flag_domain - Return highest sched_domain containing flag.
* @cpu: The CPU whose highest level of sched domain is to
@@ -1779,16 +1787,25 @@ queue_balance_callback(struct rq *rq,
* @flag: The flag to check for the highest sched_domain
* for the given CPU.
*
- * Returns the highest sched_domain of a CPU which contains the given flag.
+ * Returns the highest sched_domain of a CPU which contains @flag. If @flag has
+ * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag.
*/
static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
{
struct sched_domain *sd, *hsd = NULL;

for_each_domain(cpu, sd) {
- if (!(sd->flags & flag))
+ if (sd->flags & flag) {
+ hsd = sd;
+ continue;
+ }
+
+ /*
+ * Stop the search if @flag is known to be shared at lower
+ * levels. It will not be found further up.
+ */
+ if (flag & SD_SHARED_CHILD_MASK)
break;
- hsd = sd;
}

return hsd;
--
2.40.1

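The SD_FLAG trick in the sched.h hunk above builds SD_SHARED_CHILD_MASK at
compile time by re-including <linux/sched/sd_flags.h> with a temporary SD_FLAG()
definition: each entry contributes its flag value when the SDF_SHARED_CHILD
metaflag is present and 0 otherwise. A hand-expanded sketch of what the
preprocessor produces (illustrative only; the real flag list and metaflags come
from sd_flags.h):

    static const unsigned int SD_SHARED_CHILD_MASK =
            (SD_BALANCE_NEWIDLE * 1) |      /* metaflag set: contributes flag */
            (SD_SERIALIZE       * 0) |      /* metaflag clear: contributes 0 */
            /* ... one term per SD_FLAG() entry in sd_flags.h ... */
            0;
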
From fed8faa97161f725528a30330a22a3ba5b8e9965 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Sat, 22 Apr 2023 11:46:46 +0200
Subject: [PATCH 10/10] zstd: import 1.5.5

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
include/linux/zstd.h | 2 +-
include/linux/zstd_errors.h | 23 +-
include/linux/zstd_lib.h | 703 +++++--
kernel/module/decompress.c | 2 +-
lib/zstd/Makefile | 2 +-
lib/zstd/common/allocations.h | 56 +
lib/zstd/common/bits.h | 149 ++
lib/zstd/common/bitstream.h | 53 +-
lib/zstd/common/compiler.h | 14 +-
lib/zstd/common/cpu.h | 3 +-
lib/zstd/common/debug.c | 3 +-
lib/zstd/common/debug.h | 3 +-
lib/zstd/common/entropy_common.c | 42 +-
lib/zstd/common/error_private.c | 12 +-
lib/zstd/common/error_private.h | 3 +-
lib/zstd/common/fse.h | 89 +-
lib/zstd/common/fse_decompress.c | 94 +-
lib/zstd/common/huf.h | 222 +--
lib/zstd/common/mem.h | 2 +-
lib/zstd/common/portability_macros.h | 26 +-
lib/zstd/common/zstd_common.c | 38 +-
lib/zstd/common/zstd_deps.h | 2 +-
lib/zstd/common/zstd_internal.h | 99 +-
lib/zstd/compress/clevels.h | 3 +-
lib/zstd/compress/fse_compress.c | 59 +-
lib/zstd/compress/hist.c | 3 +-
lib/zstd/compress/hist.h | 3 +-
lib/zstd/compress/huf_compress.c | 372 ++--
lib/zstd/compress/zstd_compress.c | 1762 ++++++++++++-----
lib/zstd/compress/zstd_compress_internal.h | 333 +++-
lib/zstd/compress/zstd_compress_literals.c | 155 +-
lib/zstd/compress/zstd_compress_literals.h | 25 +-
lib/zstd/compress/zstd_compress_sequences.c | 7 +-
lib/zstd/compress/zstd_compress_sequences.h | 3 +-
lib/zstd/compress/zstd_compress_superblock.c | 47 +-
lib/zstd/compress/zstd_compress_superblock.h | 3 +-
lib/zstd/compress/zstd_cwksp.h | 149 +-
lib/zstd/compress/zstd_double_fast.c | 129 +-
lib/zstd/compress/zstd_double_fast.h | 6 +-
lib/zstd/compress/zstd_fast.c | 582 ++++--
lib/zstd/compress/zstd_fast.h | 6 +-
lib/zstd/compress/zstd_lazy.c | 518 ++---
lib/zstd/compress/zstd_lazy.h | 7 +-
lib/zstd/compress/zstd_ldm.c | 11 +-
lib/zstd/compress/zstd_ldm.h | 3 +-
lib/zstd/compress/zstd_ldm_geartab.h | 3 +-
lib/zstd/compress/zstd_opt.c | 187 +-
lib/zstd/compress/zstd_opt.h | 3 +-
lib/zstd/decompress/huf_decompress.c | 731 ++++---
lib/zstd/decompress/zstd_ddict.c | 9 +-
lib/zstd/decompress/zstd_ddict.h | 3 +-
lib/zstd/decompress/zstd_decompress.c | 269 ++-
lib/zstd/decompress/zstd_decompress_block.c | 283 ++-
lib/zstd/decompress/zstd_decompress_block.h | 8 +-
.../decompress/zstd_decompress_internal.h | 7 +-
lib/zstd/decompress_sources.h | 2 +-
lib/zstd/zstd_common_module.c | 5 +-
lib/zstd/zstd_compress_module.c | 2 +-
lib/zstd/zstd_decompress_module.c | 4 +-
59 files changed, 4732 insertions(+), 2612 deletions(-)
create mode 100644 lib/zstd/common/allocations.h
create mode 100644 lib/zstd/common/bits.h

diff --git a/include/linux/zstd.h b/include/linux/zstd.h
index 113408eef6ec..f109d49f43f8 100644
--- a/include/linux/zstd.h
+++ b/include/linux/zstd.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h
index 58b6dd45a969..6d5cf55f0bf3 100644
--- a/include/linux/zstd_errors.h
+++ b/include/linux/zstd_errors.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -17,8 +18,17 @@


/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */
-#define ZSTDERRORLIB_VISIBILITY
-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
+#define ZSTDERRORLIB_VISIBLE
+
+#ifndef ZSTDERRORLIB_HIDDEN
+# if (__GNUC__ >= 4) && !defined(__MINGW32__)
+# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden")))
+# else
+# define ZSTDERRORLIB_HIDDEN
+# endif
+#endif
+
+#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE

/*-*********************************************
* Error codes list
@@ -43,14 +53,17 @@ typedef enum {
ZSTD_error_frameParameter_windowTooLarge = 16,
ZSTD_error_corruption_detected = 20,
ZSTD_error_checksum_wrong = 22,
+ ZSTD_error_literals_headerWrong = 24,
ZSTD_error_dictionary_corrupted = 30,
ZSTD_error_dictionary_wrong = 32,
ZSTD_error_dictionaryCreation_failed = 34,
ZSTD_error_parameter_unsupported = 40,
+ ZSTD_error_parameter_combination_unsupported = 41,
ZSTD_error_parameter_outOfBound = 42,
ZSTD_error_tableLog_tooLarge = 44,
ZSTD_error_maxSymbolValue_tooLarge = 46,
ZSTD_error_maxSymbolValue_tooSmall = 48,
+ ZSTD_error_stabilityCondition_notRespected = 50,
ZSTD_error_stage_wrong = 60,
ZSTD_error_init_missing = 62,
ZSTD_error_memory_allocation = 64,
@@ -58,11 +71,15 @@ typedef enum {
ZSTD_error_dstSize_tooSmall = 70,
ZSTD_error_srcSize_wrong = 72,
ZSTD_error_dstBuffer_null = 74,
+ ZSTD_error_noForwardProgress_destFull = 80,
+ ZSTD_error_noForwardProgress_inputEmpty = 82,
/* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
ZSTD_error_frameIndex_tooLarge = 100,
ZSTD_error_seekableIO = 102,
ZSTD_error_dstBuffer_wrong = 104,
ZSTD_error_srcBuffer_wrong = 105,
+ ZSTD_error_sequenceProducer_failed = 106,
+ ZSTD_error_externalSequences_invalid = 107,
ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
} ZSTD_ErrorCode;

diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h
index 79d55465d5c1..738fe8ea4ead 100644
--- a/include/linux/zstd_lib.h
+++ b/include/linux/zstd_lib.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -11,23 +12,42 @@
#ifndef ZSTD_H_235446
#define ZSTD_H_235446

-/* ====== Dependency ======*/
+/* ====== Dependencies ======*/
#include <linux/limits.h> /* INT_MAX */
#include <linux/types.h> /* size_t */


/* ===== ZSTDLIB_API : control library symbols visibility ===== */
-#ifndef ZSTDLIB_VISIBLE
+#define ZSTDLIB_VISIBLE
+
+#ifndef ZSTDLIB_HIDDEN
# if (__GNUC__ >= 4) && !defined(__MINGW32__)
-# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default")))
# define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden")))
# else
-# define ZSTDLIB_VISIBLE
# define ZSTDLIB_HIDDEN
# endif
#endif
+
#define ZSTDLIB_API ZSTDLIB_VISIBLE

+/* Deprecation warnings :
+ * Should these warnings be a problem, it is generally possible to disable them,
+ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual.
+ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS.
+ */
+#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS
+# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */
+#else
+# if (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) || defined(__clang__)
+# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message)))
+# elif (__GNUC__ >= 3)
+# define ZSTD_DEPRECATED(message) __attribute__((deprecated))
+# else
+# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler")
+# define ZSTD_DEPRECATED(message)
+# endif
+#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */
+

/* *****************************************************************************
Introduction
@@ -65,7 +85,7 @@
/*------ Version ------*/
#define ZSTD_VERSION_MAJOR 1
#define ZSTD_VERSION_MINOR 5
-#define ZSTD_VERSION_RELEASE 2
+#define ZSTD_VERSION_RELEASE 5
#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)

/*! ZSTD_versionNumber() :
@@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionString(void);
***************************************/
/*! ZSTD_compress() :
* Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`.
+ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have
+ * enough space to successfully compress the data.
* @return : compressed size written into `dst` (<= `dstCapacity),
* or an error code if it fails (which can be tested using ZSTD_isError()). */
ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
@@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t
* "empty", "unknown" and "error" results to the same return value (0),
* while ZSTD_getFrameContentSize() gives them separate return values.
* @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */
-ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
+ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize")
+ZSTDLIB_API
+unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);

/*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+
* `src` should point to the start of a ZSTD frame or skippable frame.
@@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize)


/*====== Helper functions ======*/
-#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
-ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
+/* ZSTD_compressBound() :
+ * maximum compressed size in worst case single-pass scenario.
+ * When invoking `ZSTD_compress()` or any other one-pass compression function,
+ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize)
+ * as it eliminates one potential failure scenario,
+ * aka not enough room in dst buffer to write the compressed frame.
+ * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE .
+ * In which case, ZSTD_compressBound() will return an error code
+ * which can be tested using ZSTD_isError().
+ *
+ * ZSTD_COMPRESSBOUND() :
+ * same as ZSTD_compressBound(), but as a macro.
+ * It can be used to produce constants, which can be useful for static allocation,
+ * for example to size a static array on stack.
+ * Will produce constant value 0 if srcSize too large.
+ */
+#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00LLU : 0xFF00FF00U)
+#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
+ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
+/* ZSTD_isError() :
+ * Most ZSTD_* functions returning a size_t value can be tested for error,
+ * using ZSTD_isError().
+ * @return 1 if error, 0 otherwise
+ */
ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */
ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */
ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */
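
New in this version, ZSTD_compressBound() can itself report an error for inputs
above ZSTD_MAX_INPUT_SIZE, so the bound should be error-checked before it sizes
a destination buffer. A minimal sketch of the recommended single-pass pattern
(user-space usage of the API above, not part of the patch):

    #include <stdlib.h>
    #include <zstd.h>

    static void *compress_buf(const void *src, size_t srcSize, size_t *dstSize)
    {
            size_t const bound = ZSTD_compressBound(srcSize);
            void *dst;

            if (ZSTD_isError(bound))        /* srcSize > ZSTD_MAX_INPUT_SIZE */
                    return NULL;
            dst = malloc(bound);
            if (!dst)
                    return NULL;
            *dstSize = ZSTD_compress(dst, bound, src, srcSize, 3);
            if (ZSTD_isError(*dstSize)) {   /* cannot be a lack of dst space */
                    free(dst);
                    return NULL;
            }
            return dst;
    }
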
@@ -412,6 +457,9 @@ typedef enum {
* ZSTD_c_validateSequences
* ZSTD_c_useBlockSplitter
* ZSTD_c_useRowMatchFinder
+ * ZSTD_c_prefetchCDictTables
+ * ZSTD_c_enableSeqProducerFallback
+ * ZSTD_c_maxBlockSize
* Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
* note : never ever use experimentalParam? names directly;
* also, the enums values themselves are unstable and can still change.
@@ -430,7 +478,11 @@ typedef enum {
ZSTD_c_experimentalParam12=1009,
ZSTD_c_experimentalParam13=1010,
ZSTD_c_experimentalParam14=1011,
- ZSTD_c_experimentalParam15=1012
+ ZSTD_c_experimentalParam15=1012,
+ ZSTD_c_experimentalParam16=1013,
+ ZSTD_c_experimentalParam17=1014,
+ ZSTD_c_experimentalParam18=1015,
+ ZSTD_c_experimentalParam19=1016
} ZSTD_cParameter;

typedef struct {
@@ -493,7 +545,7 @@ typedef enum {
* They will be used to compress next frame.
* Resetting session never fails.
* - The parameters : changes all parameters back to "default".
- * This removes any reference to any dictionary too.
+ * This also removes any reference to any dictionary or external sequence producer.
* Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing)
* otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError())
* - Both : similar to resetting the session, followed by resetting parameters.
@@ -506,7 +558,8 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset);
* Should cctx hold data from a previously unfinished frame, everything about it is forgotten.
* - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
* - The function is always blocking, returns when compression is completed.
- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`.
+ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have
+ * enough space to successfully compress the data, though it is possible it fails for other reasons.
* @return : compressed size written into `dst` (<= `dstCapacity),
* or an error code if it fails (which can be tested using ZSTD_isError()).
*/
@@ -543,13 +596,15 @@ typedef enum {
* ZSTD_d_stableOutBuffer
* ZSTD_d_forceIgnoreChecksum
* ZSTD_d_refMultipleDDicts
+ * ZSTD_d_disableHuffmanAssembly
* Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
* note : never ever use experimentalParam? names directly
*/
ZSTD_d_experimentalParam1=1000,
ZSTD_d_experimentalParam2=1001,
ZSTD_d_experimentalParam3=1002,
- ZSTD_d_experimentalParam4=1003
+ ZSTD_d_experimentalParam4=1003,
+ ZSTD_d_experimentalParam5=1004

} ZSTD_dParameter;

@@ -728,8 +783,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output
* This following is a legacy streaming API, available since v1.0+ .
* It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2().
* It is redundant, but remains fully supported.
- * Streaming in combination with advanced parameters and dictionary compression
- * can only be used through the new API.
******************************************************************************/

/*!
@@ -738,6 +791,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output
* ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
* ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
* ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ *
+ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API
+ * to compress with a dictionary.
*/
ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
/*!
@@ -788,13 +844,31 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer

/*===== Streaming decompression functions =====*/

-/* This function is redundant with the advanced API and equivalent to:
+/*! ZSTD_initDStream() :
+ * Initialize/reset DStream state for new decompression operation.
+ * Call before new decompression operation using same DStream.
*
+ * Note : This function is redundant with the advanced API and equivalent to:
* ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
* ZSTD_DCtx_refDDict(zds, NULL);
*/
ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);

+/*! ZSTD_decompressStream() :
+ * Streaming decompression function.
+ * Call repetitively to consume full input updating it as necessary.
+ * Function will update both input and output `pos` fields exposing current state via these fields:
+ * - `input.pos < input.size`, some input remaining and caller should provide remaining input
+ * on the next call.
+ * - `output.pos < output.size`, decoder finished and flushed all remaining buffers.
+ * - `output.pos == output.size`, potentially unflushed data present in the internal buffers,
+ * call ZSTD_decompressStream() again to flush remaining data to output.
+ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX.
+ *
+ * @return : 0 when a frame is completely decoded and fully flushed,
+ * or an error code, which can be tested using ZSTD_isError(),
+ * or any other value > 0, which means there is some decoding or flushing to do to complete current frame.
+ */
ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);

ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */
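
The three outcomes documented for ZSTD_decompressStream() map directly onto the
canonical streaming loop: keep feeding input while input.pos < input.size, and
keep draining output chunks as they are produced. A sketch (user-space usage,
minimal error handling, not part of the patch):

    #include <stdio.h>
    #include <stdlib.h>
    #include <zstd.h>

    static int decompress_stream(FILE *in, FILE *out)
    {
            size_t const inCap = ZSTD_DStreamInSize();
            size_t const outCap = ZSTD_DStreamOutSize();
            void *inBuf = malloc(inCap), *outBuf = malloc(outCap);
            ZSTD_DStream *zds = ZSTD_createDStream();
            size_t ret = 0, n;

            while ((n = fread(inBuf, 1, inCap, in)) > 0) {
                    ZSTD_inBuffer input = { inBuf, n, 0 };

                    while (input.pos < input.size) {     /* consume all input */
                            ZSTD_outBuffer output = { outBuf, outCap, 0 };

                            ret = ZSTD_decompressStream(zds, &output, &input);
                            if (ZSTD_isError(ret))
                                    goto done;
                            fwrite(outBuf, 1, output.pos, out);
                    }
            }
    done:
            ZSTD_freeDStream(zds);
            free(inBuf);
            free(outBuf);
            /* ret == 0 means the last frame was fully decoded and flushed. */
            return (ZSTD_isError(ret) || ret != 0) ? -1 : 0;
    }
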
@@ -913,7 +987,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
* If @return == 0, the dictID could not be decoded.
* This could for one of the following reasons :
* - The frame does not require a dictionary to be decoded (most common case).
- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information.
+ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information.
* Note : this use case also happens when using a non-conformant dictionary.
* - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
* - This is not a Zstandard frame.
@@ -925,9 +999,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
* Advanced dictionary and prefix API (Requires v1.4.0+)
*
* This API allows dictionaries to be used with ZSTD_compress2(),
- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and
- * only reset with the context is reset with ZSTD_reset_parameters or
- * ZSTD_reset_session_and_parameters. Prefixes are single-use.
+ * ZSTD_compressStream2(), and ZSTD_decompressDCtx().
+ * Dictionaries are sticky, they remain valid when same context is re-used,
+ * they only reset when the context is reset
+ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters.
+ * In contrast, Prefixes are single-use.
******************************************************************************/


@@ -937,8 +1013,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
* @result : 0, or an error code (which can be tested with ZSTD_isError()).
* Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary,
* meaning "return to no-dictionary mode".
- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames.
- * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters).
+ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames,
+ * until parameters are reset, a new dictionary is loaded, or the dictionary
+ * is explicitly invalidated by loading a NULL dictionary.
* Note 2 : Loading a dictionary involves building tables.
* It's also a CPU consuming operation, with non-negligible impact on latency.
* Tables are dependent on compression parameters, and for this reason,
@@ -947,11 +1024,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
* Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead.
* In such a case, dictionary buffer must outlive its users.
* Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
- * to precisely select how dictionary content must be interpreted. */
+ * to precisely select how dictionary content must be interpreted.
+ * Note 5 : This method does not benefit from LDM (long distance mode).
+ * If you want to employ LDM on some large dictionary content,
+ * prefer employing ZSTD_CCtx_refPrefix() described below.
+ */
ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);

/*! ZSTD_CCtx_refCDict() : Requires v1.4.0+
- * Reference a prepared dictionary, to be used for all next compressed frames.
+ * Reference a prepared dictionary, to be used for all future compressed frames.
* Note that compression parameters are enforced from within CDict,
* and supersede any compression parameter previously set within CCtx.
* The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs.
@@ -970,6 +1051,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
* Decompression will need same prefix to properly regenerate data.
* Compressing with a prefix is similar in outcome as performing a diff and compressing it,
* but performs much faster, especially during decompression (compression speed is tunable with compression level).
+ * This method is compatible with LDM (long distance mode).
* @result : 0, or an error code (which can be tested with ZSTD_isError()).
* Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
* Note 1 : Prefix buffer is referenced. It **must** outlive compression.
@@ -986,9 +1068,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
const void* prefix, size_t prefixSize);

/*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+
- * Create an internal DDict from dict buffer,
- * to be used to decompress next frames.
- * The dictionary remains valid for all future frames, until explicitly invalidated.
+ * Create an internal DDict from dict buffer, to be used to decompress all future frames.
+ * The dictionary remains valid for all future frames, until explicitly invalidated, or
+ * a new dictionary is loaded.
* @result : 0, or an error code (which can be tested with ZSTD_isError()).
* Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
* meaning "return to no-dictionary mode".
@@ -1012,9 +1094,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s
* The memory for the table is allocated on the first call to refDDict, and can be
* freed with ZSTD_freeDCtx().
*
+ * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary
+ * will be managed, and referencing a dictionary effectively "discards" any previous one.
+ *
* @result : 0, or an error code (which can be tested with ZSTD_isError()).
- * Note 1 : Currently, only one dictionary can be managed.
- * Referencing a new dictionary effectively "discards" any previous one.
* Special: referencing a NULL DDict means "return to no-dictionary mode".
* Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx.
*/
@@ -1071,24 +1154,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
#define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE
#endif

-/* Deprecation warnings :
- * Should these warnings be a problem, it is generally possible to disable them,
- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual.
- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS.
- */
-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS
-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */
-#else
-# if (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) || defined(__clang__)
-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message)))
-# elif (__GNUC__ >= 3)
-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated))
-# else
-# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler")
-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API
-# endif
-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */
-
/* **************************************************************************************
* experimental API (static linking only)
****************************************************************************************
@@ -1123,6 +1188,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
#define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */
#define ZSTD_STRATEGY_MIN ZSTD_fast
#define ZSTD_STRATEGY_MAX ZSTD_btultra2
+#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */


#define ZSTD_OVERLAPLOG_MIN 0
@@ -1303,7 +1369,7 @@ typedef enum {
} ZSTD_paramSwitch_e;

/* *************************************
-* Frame size functions
+* Frame header and size functions
***************************************/

/*! ZSTD_findDecompressedSize() :
@@ -1350,29 +1416,109 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size
* or an error code (if srcSize is too small) */
ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);

+typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
+typedef struct {
+ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
+ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */
+ unsigned blockSizeMax;
+ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
+ unsigned headerSize;
+ unsigned dictID;
+ unsigned checksumFlag;
+ unsigned _reserved1;
+ unsigned _reserved2;
+} ZSTD_frameHeader;
+
+/*! ZSTD_getFrameHeader() :
+ * decode Frame Header, or requires larger `srcSize`.
+ * @return : 0, `zfhPtr` is correctly filled,
+ * >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ * or an error code, which can be tested using ZSTD_isError() */
+ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */
+/*! ZSTD_getFrameHeader_advanced() :
+ * same as ZSTD_getFrameHeader(),
+ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */
+ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
+
+/*! ZSTD_decompressionMargin() :
+ * Zstd supports in-place decompression, where the input and output buffers overlap.
+ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large,
+ * and the input buffer must be at the end of the output buffer.
+ *
+ * _______________________ Output Buffer ________________________
+ * | |
+ * | ____ Input Buffer ____|
+ * | | |
+ * v v v
+ * |---------------------------------------|-----------|----------|
+ * ^ ^ ^
+ * |___________________ Output_Size ___________________|_ Margin _|
+ *
+ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN().
+ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or
+ * ZSTD_decompressDCtx().
+ * NOTE: This function supports multi-frame input.
+ *
+ * @param src The compressed frame(s)
+ * @param srcSize The size of the compressed frame(s)
+ * @returns The decompression margin or an error that can be checked with ZSTD_isError().
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize);
+
+/*! ZSTD_DECOMPRESS_MARGIN() :
+ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from
+ * the compressed frame, compute it from the original size and the blockSizeLog.
+ * See ZSTD_decompressionMargin() for details.
+ *
+ * WARNING: This macro does not support multi-frame input, the input must be a single
+ * zstd frame. If you need that support use the function, or implement it yourself.
+ *
+ * @param originalSize The original uncompressed size of the data.
+ * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX).
+ * Unless you explicitly set the windowLog smaller than
+ * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX.
+ */
+#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \
+ ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \
+ 4 /* checksum */ + \
+ ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \
+ (blockSize) /* One block of margin */ \
+ ))
+
typedef enum {
ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */
ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */
} ZSTD_sequenceFormat_e;

+/*! ZSTD_sequenceBound() :
+ * `srcSize` : size of the input buffer
+ * @return : upper-bound for the number of sequences that can be generated
+ * from a buffer of srcSize bytes
+ *
+ * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize);
+
/*! ZSTD_generateSequences() :
- * Generate sequences using ZSTD_compress2, given a source buffer.
+ * Generate sequences using ZSTD_compress2(), given a source buffer.
*
* Each block will end with a dummy sequence
* with offset == 0, matchLength == 0, and litLength == length of last literals.
* litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0)
* simply acts as a block delimiter.
*
- * zc can be used to insert custom compression params.
- * This function invokes ZSTD_compress2
+ * @zc can be used to insert custom compression params.
+ * This function invokes ZSTD_compress2().
*
* The output of this function can be fed into ZSTD_compressSequences() with CCtx
* setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters
* @return : number of sequences generated
*/

-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
- size_t outSeqsSize, const void* src, size_t srcSize);
+ZSTDLIB_STATIC_API size_t
+ZSTD_generateSequences( ZSTD_CCtx* zc,
+ ZSTD_Sequence* outSeqs, size_t outSeqsSize,
+ const void* src, size_t srcSize);

/*! ZSTD_mergeBlockDelimiters() :
* Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals
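
ZSTD_decompressionMargin() enables the in-place layout sketched in the diagram
above: one allocation holds both buffers, with the compressed input parked at
the tail. A sketch of the pattern (user-space usage, assumes a single frame and
a known original size; not part of the patch). The caller computes margin with
ZSTD_decompressionMargin(src, srcSize), or with the ZSTD_DECOMPRESSION_MARGIN()
macro, and allocates dstSize + margin bytes:

    #include <string.h>
    #include <zstd.h>

    /* buf layout: [output: dstSize][margin], with the input copied to the
     * very end. Returns decompressed size, or a zstd error code testable
     * with ZSTD_isError(). */
    static size_t decompress_in_place(void *buf, size_t dstSize, size_t margin,
                                      const void *src, size_t srcSize)
    {
            char *tail = (char *)buf + dstSize + margin - srcSize;

            memcpy(tail, src, srcSize);   /* input at end of output buffer */
            return ZSTD_decompress(buf, dstSize, tail, srcSize);
    }
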
@@ -1388,7 +1534,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o
|
|
ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize);
|
|
|
|
/*! ZSTD_compressSequences() :
|
|
- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst.
|
|
+ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst.
|
|
+ * @src contains the entire input (not just the literals).
|
|
+ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals
|
|
* If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.)
|
|
* The entire source is compressed into a single frame.
|
|
*
|
|
@@ -1413,11 +1561,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si
|
|
* Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused.
|
|
* Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly,
|
|
* and cannot emit an RLE block that disagrees with the repcode history
|
|
- * @return : final compressed size or a ZSTD error.
|
|
+ * @return : final compressed size, or a ZSTD error code.
|
|
*/
|
|
-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize,
|
|
- const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
|
|
- const void* src, size_t srcSize);
|
|
+ZSTDLIB_STATIC_API size_t
|
|
+ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize,
|
|
+ const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
|
|
+ const void* src, size_t srcSize);
|
|
|
|
|
|
/*! ZSTD_writeSkippableFrame() :
|
|
@@ -1481,8 +1630,11 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size);
|
|
* and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter().
|
|
* Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits.
|
|
*
|
|
- * Note 2 : only single-threaded compression is supported.
|
|
+ * Note : only single-threaded compression is supported.
|
|
* ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
|
|
+ *
|
|
+ * Note 2 : ZSTD_estimateCCtxSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
|
|
+ * Size estimates assume that no external sequence producer is registered.
|
|
*/
|
|
ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
|
|
ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
|
|
@@ -1501,7 +1653,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void);
|
|
* or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
|
|
* Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
|
|
* an internal ?Dict will be created, which additional size is not estimated here.
|
|
- * In this case, get total size by adding ZSTD_estimate?DictSize */
|
|
+ * In this case, get total size by adding ZSTD_estimate?DictSize
|
|
+ * Note 2 : only single-threaded compression is supported.
|
|
+ * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
|
|
+ * Note 3 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
|
|
+ * Size estimates assume that no external sequence producer is registered.
|
|
+ */
|
|
ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
|
|
ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
|
|
ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
|
|
@@ -1649,22 +1806,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
|
|
* This function never fails (wide contract) */
|
|
ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
|
|
|
|
+/*! ZSTD_CCtx_setCParams() :
|
|
+ * Set all parameters provided within @p cparams into the working @p cctx.
|
|
+ * Note : if modifying parameters during compression (MT mode only),
|
|
+ * note that changes to the .windowLog parameter will be ignored.
|
|
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
|
|
+ * On failure, no parameters are updated.
|
|
+ */
|
|
+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams);
|
|
+
|
|
+/*! ZSTD_CCtx_setFParams() :
|
|
+ * Set all parameters provided within @p fparams into the working @p cctx.
|
|
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
|
|
+ */
|
|
+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams);
|
|
+
|
|
+/*! ZSTD_CCtx_setParams() :
|
|
+ * Set all parameters provided within @p params into the working @p cctx.
|
|
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
|
|
+ */
|
|
+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params);
|
|
+
|
|
/*! ZSTD_compress_advanced() :
|
|
* Note : this function is now DEPRECATED.
|
|
* It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters.
|
|
* This prototype will generate compilation warnings. */
|
|
ZSTD_DEPRECATED("use ZSTD_compress2")
|
|
+ZSTDLIB_STATIC_API
|
|
size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx,
|
|
- void* dst, size_t dstCapacity,
|
|
- const void* src, size_t srcSize,
|
|
- const void* dict,size_t dictSize,
|
|
- ZSTD_parameters params);
|
|
+ void* dst, size_t dstCapacity,
|
|
+ const void* src, size_t srcSize,
|
|
+ const void* dict,size_t dictSize,
|
|
+ ZSTD_parameters params);
|
|
|
|
/*! ZSTD_compress_usingCDict_advanced() :
|
|
* Note : this function is now DEPRECATED.
|
|
* It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters.
|
|
* This prototype will generate compilation warnings. */
|
|
ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary")
|
|
+ZSTDLIB_STATIC_API
|
|
size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
|
|
void* dst, size_t dstCapacity,
|
|
const void* src, size_t srcSize,
|
|
@@ -1808,13 +1988,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
|
|
* Experimental parameter.
|
|
* Default is 0 == disabled. Set to 1 to enable.
|
|
*
|
|
- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same
|
|
- * between calls, except for the modifications that zstd makes to pos (the
|
|
- * caller must not modify pos). This is checked by the compressor, and
|
|
- * compression will fail if it ever changes. This means the only flush
|
|
- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end
|
|
- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos)
|
|
- * MUST not be modified during compression or you will get data corruption.
|
|
+ * Tells the compressor that input data presented with ZSTD_inBuffer
|
|
+ * will ALWAYS be the same between calls.
|
|
+ * Technically, the @src pointer must never be changed,
|
|
+ * and the @pos field can only be updated by zstd.
|
|
+ * However, it's possible to increase the @size field,
|
|
+ * allowing scenarios where more data can be appended after compressions starts.
|
|
+ * These conditions are checked by the compressor,
|
|
+ * and compression will fail if they are not respected.
|
|
+ * Also, data in the ZSTD_inBuffer within the range [src, src + pos)
|
|
+ * MUST not be modified during compression or it will result in data corruption.
|
|
*
|
|
* When this flag is enabled zstd won't allocate an input window buffer,
|
|
* because the user guarantees it can reference the ZSTD_inBuffer until
|
|
@@ -1822,18 +2005,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
|
|
* large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also
|
|
* avoid the memcpy() from the input buffer to the input window buffer.
|
|
*
|
|
- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used.
|
|
- * That means this flag cannot be used with ZSTD_compressStream().
|
|
- *
|
|
* NOTE: So long as the ZSTD_inBuffer always points to valid memory, using
|
|
* this flag is ALWAYS memory safe, and will never access out-of-bounds
|
|
- * memory. However, compression WILL fail if you violate the preconditions.
|
|
+ * memory. However, compression WILL fail if conditions are not respected.
|
|
*
|
|
- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST
|
|
- * not be modified during compression or you will get data corruption. This
|
|
- * is because zstd needs to reference data in the ZSTD_inBuffer to find
|
|
+ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST
|
|
+ * not be modified during compression or it will result in data corruption.
|
|
+ * This is because zstd needs to reference data in the ZSTD_inBuffer to find
|
|
* matches. Normally zstd maintains its own window buffer for this purpose,
|
|
- * but passing this flag tells zstd to use the user provided buffer.
|
|
+ * but passing this flag tells zstd to rely on user provided buffer instead.
|
|
*/
|
|
#define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9
|
|
|
|
@@ -1878,7 +2058,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
 * Without validation, providing a sequence that does not conform to the zstd spec will cause
 * undefined behavior, and may produce a corrupted block.
 *
- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for
+ * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for
 * specifics regarding offset/matchlength requirements) then the function will bail out and
 * return an error.
 *
@@ -1928,6 +2108,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
 */
 #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15

+/* ZSTD_c_prefetchCDictTables
+ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto.
+ *
+ * In some situations, zstd uses CDict tables in-place rather than copying them
+ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details).
+ * In such situations, compression speed is seriously impacted when CDict tables are
+ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables
+ * when they are used in-place.
+ *
+ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit.
+ * For sufficiently large inputs, zstd will by default memcpy() CDict tables
+ * into the working context, so there is no need to prefetch. This parameter is
+ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be
+ * useful but memcpy() is too expensive. The exact range of input sizes where this
+ * makes sense is best determined by careful experimentation.
+ *
+ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable,
+ * but in the future zstd may conditionally enable this feature via an auto-detection
+ * heuristic for cold CDicts.
+ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances.
+ */
+#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16
+
+/* ZSTD_c_enableSeqProducerFallback
+ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0.
+ *
+ * Controls whether zstd will fall back to an internal sequence producer if an
+ * external sequence producer is registered and returns an error code. This fallback
+ * is block-by-block: the internal sequence producer will only be called for blocks
+ * where the external sequence producer returns an error code. Fallback parsing will
+ * follow any other cParam settings, such as compression level, the same as in a
+ * normal (fully-internal) compression operation.
+ *
+ * The user is strongly encouraged to read the full Block-Level Sequence Producer API
+ * documentation (below) before setting this parameter. */
+#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17
+
+/* ZSTD_c_maxBlockSize
+ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB).
+ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default.
+ *
+ * This parameter can be used to set an upper bound on the blocksize
+ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper
+ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make
+ * compressBound() inaccurate). Only currently meant to be used for testing.
+ *
+ */
+#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18
+
+/* ZSTD_c_searchForExternalRepcodes
+ * This parameter affects how zstd parses external sequences, such as sequences
+ * provided through the compressSequences() API or from an external block-level
+ * sequence producer.
+ *
+ * If set to ZSTD_ps_enable, the library will check for repeated offsets in
+ * external sequences, even if those repcodes are not explicitly indicated in
+ * the "rep" field. Note that this is the only way to exploit repcode matches
+ * while using compressSequences() or an external sequence producer, since zstd
+ * currently ignores the "rep" field of external sequences.
+ *
+ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in
+ * external sequences, regardless of whether the "rep" field has been set. This
+ * reduces sequence compression overhead by about 25% while sacrificing some
+ * compression ratio.
+ *
+ * The default value is ZSTD_ps_auto, for which the library will enable/disable
+ * based on compression level.
+ *
+ * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is
+ * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future.
+ */
+#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19
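
All four of the new knobs above go through the usual advanced-API entry point. As a minimal sketch, not part of the patch itself (it assumes a userspace libzstd built with ZSTD_STATIC_LINKING_ONLY, since experimental parameters are not in the stable API; the function name is hypothetical):

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

/* Sketch: opt in to the new experimental cParams before compressing.
 * ZSTD_c_maxBlockSize is capped here purely for testing, per the docs above. */
static size_t compress_with_new_params(void* dst, size_t dstCapacity,
                                       const void* src, size_t srcSize)
{
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    size_t ret;
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 3);
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_prefetchCDictTables, ZSTD_ps_enable);
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_maxBlockSize, 32 * 1024);
    ret = ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
    ZSTD_freeCCtx(cctx);
    return ret;   /* check with ZSTD_isError() */
}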
+
 /*! ZSTD_CCtx_getParameter() :
 * Get the requested compression parameter value, selected by enum ZSTD_cParameter,
 * and store it into int* value.
@@ -2084,7 +2337,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
 * in the range [dst, dst + pos) MUST not be modified during decompression
 * or you will get data corruption.
 *
- * When this flags is enabled zstd won't allocate an output buffer, because
+ * When this flag is enabled zstd won't allocate an output buffer, because
 * it can write directly to the ZSTD_outBuffer, but it will still allocate
 * an input buffer large enough to fit any compressed block. This will also
 * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer.
@@ -2137,6 +2390,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
 */
 #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4

+/* ZSTD_d_disableHuffmanAssembly
+ * Set to 1 to disable the Huffman assembly implementation.
+ * The default value is 0, which allows zstd to use the Huffman assembly
+ * implementation if available.
+ *
+ * This parameter can be used to disable Huffman assembly at runtime.
+ * If you want to disable it at compile time you can define the macro
+ * ZSTD_DISABLE_ASM.
+ */
+#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5
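
A matching decompression-side sketch (again not part of the patch; the function name is hypothetical and error handling is elided to a ZSTD_isError() check):

static size_t decompress_no_asm(void* dst, size_t dstCapacity,
                                const void* src, size_t srcSize)
{
    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
    size_t ret;
    /* Force the portable C Huffman decoder, e.g. to benchmark it against
     * the assembly path on the same input. */
    ZSTD_DCtx_setParameter(dctx, ZSTD_d_disableHuffmanAssembly, 1);
    ret = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize);
    ZSTD_freeDCtx(dctx);
    return ret;   /* check with ZSTD_isError() */
}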
+

 /*! ZSTD_DCtx_setFormat() :
 * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter().
@@ -2145,6 +2409,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
 * such ZSTD_f_zstd1_magicless for example.
 * @return : 0, or an error code (which can be tested using ZSTD_isError()). */
 ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead")
+ZSTDLIB_STATIC_API
 size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);

 /*! ZSTD_decompressStream_simpleArgs() :
@@ -2181,6 +2446,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs (
 * This prototype will generate compilation warnings.
 */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
                                 int compressionLevel,
                                 unsigned long long pledgedSrcSize);
@@ -2198,17 +2464,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
 * This prototype will generate compilation warnings.
 */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
                                   const void* dict, size_t dictSize,
                                   int compressionLevel);

 /*! ZSTD_initCStream_advanced() :
- * This function is DEPRECATED, and is approximately equivalent to:
+ * This function is DEPRECATED, and is equivalent to:
 *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
- *     // Pseudocode: Set each zstd parameter and leave the rest as-is.
- *     for ((param, value) : params) {
- *         ZSTD_CCtx_setParameter(zcs, param, value);
- *     }
+ *     ZSTD_CCtx_setParams(zcs, params);
 *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
 *     ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
 *
@@ -2218,6 +2482,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
 * This prototype will generate compilation warnings.
 */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
                                  const void* dict, size_t dictSize,
                                  ZSTD_parameters params,
@@ -2232,15 +2497,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
 * This prototype will generate compilation warnings.
 */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);

 /*! ZSTD_initCStream_usingCDict_advanced() :
- * This function is DEPRECATED, and is approximately equivalent to:
+ * This function is DEPRECATED, and is equivalent to:
 *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
- *     // Pseudocode: Set each zstd frame parameter and leave the rest as-is.
- *     for ((fParam, value) : fParams) {
- *         ZSTD_CCtx_setParameter(zcs, fParam, value);
- *     }
+ *     ZSTD_CCtx_setFParams(zcs, fParams);
 *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
 *     ZSTD_CCtx_refCDict(zcs, cdict);
 *
@@ -2250,6 +2513,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
 * This prototype will generate compilation warnings.
 */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
                                             const ZSTD_CDict* cdict,
                                             ZSTD_frameParameters fParams,
@@ -2274,6 +2538,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
 * This prototype will generate compilation warnings.
 */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);


@@ -2319,8 +2584,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
 *     ZSTD_DCtx_loadDictionary(zds, dict, dictSize);
 *
 * note: no dictionary will be used if dict == NULL or dictSize < 8
- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
 */
+ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions")
 ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize);

 /*!
@@ -2330,27 +2595,185 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo
 *     ZSTD_DCtx_refDDict(zds, ddict);
 *
 * note : ddict is referenced, it must outlive decompression session
- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
 */
+ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions")
 ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);

-/*!
- * This function is deprecated, and is equivalent to:
- *
- *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
- *
- * re-use decompression parameters from previous init; saves dictionary loading
- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
- */
-ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
+
+/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API *********************
+ *
+ * *** OVERVIEW ***
+ * The Block-Level Sequence Producer API allows users to provide their own custom
+ * sequence producer which libzstd invokes to process each block. The produced list
+ * of sequences (literals and matches) is then post-processed by libzstd to produce
+ * valid compressed blocks.
+ *
+ * This block-level offload API is a more granular complement of the existing
+ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers
+ * an easier migration story for applications already integrated with libzstd: the
+ * user application continues to invoke the same compression functions
+ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits
+ * from the specific advantages of the external sequence producer. For example,
+ * the sequence producer could be tuned to take advantage of known characteristics
+ * of the input, to offer better speed / ratio, or could leverage hardware
+ * acceleration not available within libzstd itself.
+ *
+ * See contrib/externalSequenceProducer for an example program employing the
+ * Block-Level Sequence Producer API.
+ *
+ * *** USAGE ***
+ * The user is responsible for implementing a function of type
+ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following
+ * arguments to the user-provided function:
+ *
+ *   - sequenceProducerState: a pointer to a user-managed state for the sequence
+ *     producer.
+ *
+ *   - outSeqs, outSeqsCapacity: an output buffer for the sequence producer.
+ *     outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory
+ *     backing outSeqs is managed by the CCtx.
+ *
+ *   - src, srcSize: an input buffer for the sequence producer to parse.
+ *     srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX.
+ *
+ *   - dict, dictSize: a history buffer, which may be empty, which the sequence
+ *     producer may reference as it parses the src buffer. Currently, zstd will
+ *     always pass dictSize == 0 into external sequence producers, but this will
+ *     change in the future.
+ *
+ *   - compressionLevel: a signed integer representing the zstd compression level
+ *     set by the user for the current operation. The sequence producer may choose
+ *     to use this information to change its compression strategy and speed/ratio
+ *     tradeoff. Note: the compression level does not reflect zstd parameters set
+ *     through the advanced API.
+ *
+ *   - windowSize: a size_t representing the maximum allowed offset for external
+ *     sequences. Note that sequence offsets are sometimes allowed to exceed the
+ *     windowSize if a dictionary is present, see doc/zstd_compression_format.md
+ *     for details.
+ *
+ * The user-provided function shall return a size_t representing the number of
+ * sequences written to outSeqs. This return value will be treated as an error
+ * code if it is greater than outSeqsCapacity. The return value must be non-zero
+ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided
+ * for convenience, but any value greater than outSeqsCapacity will be treated as
+ * an error code.
+ *
+ * If the user-provided function does not return an error code, the sequences
+ * written to outSeqs must be a valid parse of the src buffer. Data corruption may
+ * occur if the parse is not valid. A parse is defined to be valid if the
+ * following conditions hold:
+ *   - The sum of matchLengths and literalLengths must equal srcSize.
+ *   - All sequences in the parse, except for the final sequence, must have
+ *     matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have
+ *     matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0.
+ *   - All offsets must respect the windowSize parameter as specified in
+ *     doc/zstd_compression_format.md.
+ *   - If the final sequence has matchLength == 0, it must also have offset == 0.
+ *
+ * zstd will only validate these conditions (and fail compression if they do not
+ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence
+ * validation has a performance cost.
+ *
+ * If the user-provided function returns an error, zstd will either fall back
+ * to an internal sequence producer or fail the compression operation. The user can
+ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback
+ * cParam. Fallback compression will follow any other cParam settings, such as
+ * compression level, the same as in a normal compression operation.
+ *
+ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F
+ * function by calling
+ *         ZSTD_registerSequenceProducer(cctx,
+ *                                       sequenceProducerState,
+ *                                       sequenceProducer)
+ * This setting will persist until the next parameter reset of the CCtx.
+ *
+ * The sequenceProducerState must be initialized by the user before calling
+ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the
+ * sequenceProducerState.
+ *
+ * *** LIMITATIONS ***
+ * This API is compatible with all zstd compression APIs which respect advanced parameters.
+ * However, there are three limitations:
+ *
+ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported.
+ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level
+ * external sequence producer.
+ *   - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some
+ *     cases (see its documentation for details). Users must explicitly set
+ *     ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external
+ *     sequence producer is registered.
+ *   - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default
+ *     whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should
+ *     check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence
+ *     Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog).
+ *
+ * Second, history buffers are not currently supported. Concretely, zstd will always pass
+ * dictSize == 0 to the external sequence producer (for now). This has two implications:
+ *   - Dictionaries are not currently supported. Compression will *not* fail if the user
+ *     references a dictionary, but the dictionary won't have any effect.
+ *   - Stream history is not currently supported. All advanced compression APIs, including
+ *     streaming APIs, work with external sequence producers, but each block is treated as
+ *     an independent chunk without history from previous blocks.
+ *
+ * Third, multi-threading within a single compression is not currently supported. In other words,
+ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered.
+ * Multi-threading across compressions is fine: simply create one CCtx per thread.
+ *
+ * Long-term, we plan to overcome all three limitations. There is no technical blocker to
+ * overcoming them. It is purely a question of engineering effort.
+ */
+
+#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1))
+
+typedef size_t ZSTD_sequenceProducer_F (
+  void* sequenceProducerState,
+  ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
+  const void* src, size_t srcSize,
+  const void* dict, size_t dictSize,
+  int compressionLevel,
+  size_t windowSize
+);
+
+/*! ZSTD_registerSequenceProducer() :
+ * Instruct zstd to use a block-level external sequence producer function.
+ *
+ * The sequenceProducerState must be initialized by the caller, and the caller is
+ * responsible for managing its lifetime. This parameter is sticky across
+ * compressions. It will remain set until the user explicitly resets compression
+ * parameters.
+ *
+ * Sequence producer registration is considered to be an "advanced parameter",
+ * part of the "advanced API". This means it will only have an effect on compression
+ * APIs which respect advanced parameters, such as compress2() and compressStream2().
+ * Older compression APIs such as compressCCtx(), which predate the introduction of
+ * "advanced parameters", will ignore any external sequence producer setting.
+ *
+ * The sequence producer can be "cleared" by registering a NULL function pointer. This
+ * removes all limitations described above in the "LIMITATIONS" section of the API docs.
+ *
+ * The user is strongly encouraged to read the full API documentation (above) before
+ * calling this function. */
+ZSTDLIB_STATIC_API void
+ZSTD_registerSequenceProducer(
+  ZSTD_CCtx* cctx,
+  void* sequenceProducerState,
+  ZSTD_sequenceProducer_F* sequenceProducer
+);
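
To make the contract above concrete, here is a minimal sketch of a conforming producer, not part of the patch itself (the function name is hypothetical): it emits one literals-only sequence, which the validity rules above always accept (litLength == srcSize, matchLength == 0, offset == 0), so it exercises the plumbing without finding any matches.

static size_t literalsOnlyProducer(void* sequenceProducerState,
                                   ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
                                   const void* src, size_t srcSize,
                                   const void* dict, size_t dictSize,
                                   int compressionLevel, size_t windowSize)
{
    (void)sequenceProducerState; (void)src; (void)dict; (void)dictSize;
    (void)compressionLevel; (void)windowSize;
    if (outSeqsCapacity < 1) return ZSTD_SEQUENCE_PRODUCER_ERROR;
    outSeqs[0].offset = 0;                    /* final sequence: offset must be 0 */
    outSeqs[0].litLength = (unsigned)srcSize; /* whole block emitted as literals */
    outSeqs[0].matchLength = 0;
    outSeqs[0].rep = 0;
    return 1;                                 /* one sequence written */
}

/* registration and use: */
ZSTD_registerSequenceProducer(cctx, NULL, literalsOnlyProducer);
ZSTD_CCtx_setParameter(cctx, ZSTD_c_validateSequences, 1); /* optional sanity check */
/* then compress with ZSTD_compress2() / ZSTD_compressStream2() as usual */

Since this producer never emits matches, output degenerates to entropy-coded literals; a real producer would emit matches to recover compression ratio.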


 /* *******************************************************************
-*  Buffer-less and synchronous inner streaming functions
+*  Buffer-less and synchronous inner streaming functions (DEPRECATED)
 *
-*  This is an advanced API, giving full control over buffer management, for users which need direct control over memory.
-*  But it's also a complex one, with several restrictions, documented below.
-*  Prefer normal streaming API for an easier experience.
+*  This API is deprecated, and will be removed in a future version.
+*  It allows streaming (de)compression with user allocated buffers.
+*  However, it is hard to use, and not as well tested as the rest of
+*  our API.
+*
+*  Please use the normal streaming API instead: ZSTD_compressStream2,
+*  and ZSTD_decompressStream.
+*  If there is functionality that you need, but it doesn't provide,
+*  please open an issue on our GitHub.
+*
 ********************************************************************* */

 /*
@@ -2362,7 +2785,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);

   Start by initializing a context.
   Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression.
-  It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx()

   Then, consume your input using ZSTD_compressContinue().
   There are some important considerations to keep in mind when using this advanced function :
@@ -2384,18 +2806,28 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
 */

 /*=====   Buffer-less streaming compression functions  =====*/
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */
-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */

+ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.")
+ZSTDLIB_STATIC_API
+size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);

 /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */
 ZSTD_DEPRECATED("use advanced API to access custom parameters")
+ZSTDLIB_STATIC_API
 size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */
 ZSTD_DEPRECATED("use advanced API to access custom parameters")
+ZSTDLIB_STATIC_API
 size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */
 /*
   Buffer-less streaming decompression (synchronous mode)
@@ -2408,8 +2840,8 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
   Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
   Data fragment must be large enough to ensure successful decoding.
  `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough.
- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
-           >0 : `srcSize` is too small, please provide at least @result bytes on next attempt.
+  result  : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
+           >0 : `srcSize` is too small, please provide at least result bytes on next attempt.
            errorCode, which can be tested using ZSTD_isError().

  It fills a ZSTD_frameHeader structure with important information to correctly decode the frame,
@@ -2428,7 +2860,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_

  The most memory efficient way is to use a round buffer of sufficient size.
  Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(),
- which can @return an error code if required value is too large for current system (in 32-bits mode).
+ which can return an error code if required value is too large for current system (in 32-bits mode).
  In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one,
  up to the moment there is not enough room left in the buffer to guarantee decoding another full block,
  which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`.
@@ -2448,7 +2880,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
  ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
  ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail.

- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
+ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
  It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item.
  It can also be an error code, which can be tested with ZSTD_isError().

@@ -2471,27 +2903,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
 */

 /*=====   Buffer-less streaming decompression functions  =====*/
-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
-typedef struct {
-    unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
-    unsigned long long windowSize;       /* can be very large, up to <= frameContentSize */
-    unsigned blockSizeMax;
-    ZSTD_frameType_e frameType;          /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
-    unsigned headerSize;
-    unsigned dictID;
-    unsigned checksumFlag;
-} ZSTD_frameHeader;

-/*! ZSTD_getFrameHeader() :
- *  decode Frame Header, or requires larger `srcSize`.
- * @return : 0, `zfhPtr` is correctly filled,
- *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
- *           or an error code, which can be tested using ZSTD_isError() */
-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */
-/*! ZSTD_getFrameHeader_advanced() :
- *  same as ZSTD_getFrameHeader(),
- *  with added capability to select a format (like ZSTD_f_zstd1_magicless) */
-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
 ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */

 ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
@@ -2502,6 +2914,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
 ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);

 /* misc */
+ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.")
 ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
 typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
 ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
@@ -2509,11 +2922,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);



-/* ============================ */
-/*       Block level API       */
-/* ============================ */
+/* ========================================= */
+/*       Block level API (DEPRECATED)        */
+/* ========================================= */

 /*!
+
+    This API is deprecated in favor of the regular compression API.
+    You can get the frame header down to 2 bytes by setting:
+      - ZSTD_c_format = ZSTD_f_zstd1_magicless
+      - ZSTD_c_contentSizeFlag = 0
+      - ZSTD_c_checksumFlag = 0
+      - ZSTD_c_dictIDFlag = 0
+
+    This API is not as well tested as our normal API, so we recommend not using it.
+    We will be removing it in a future version. If the normal API doesn't provide
+    the functionality you need, please open a GitHub issue.
+
     Block functions produce and decode raw zstd blocks, without frame metadata.
     Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes).
     But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes.
@@ -2524,7 +2949,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
     - It is necessary to init context before starting
       + compression : any ZSTD_compressBegin*() variant, including with dictionary
       + decompression : any ZSTD_decompressBegin*() variant, including with dictionary
-      + copyCCtx() and copyDCtx() can be used too
     - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB
       + If input is larger than a block size, it's necessary to split input data into multiple blocks
       + For inputs larger than a single block, consider using regular ZSTD_compress() instead.
@@ -2541,11 +2965,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
 */

 /*=====   Raw zstd block functions  =====*/
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize   (const ZSTD_CCtx* cctx);
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBlock  (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_insertBlock    (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */

-
 #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */

diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
index bb79ac1a6d8f..7ddc87bee274 100644
--- a/kernel/module/decompress.c
+++ b/kernel/module/decompress.c
@@ -267,7 +267,7 @@ static ssize_t module_zstd_decompress(struct load_info *info,
 		zstd_dec.size = PAGE_SIZE;

 		ret = zstd_decompress_stream(dstream, &zstd_dec, &zstd_buf);
-		kunmap(page);
+		kunmap_local(zstd_dec.dst);
 		retval = zstd_get_error_code(ret);
 		if (retval)
 			break;
diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile
index 20f08c644b71..464c410b2768 100644
--- a/lib/zstd/Makefile
+++ b/lib/zstd/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 # ################################################################
-# Copyright (c) Facebook, Inc.
+# Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h
new file mode 100644
index 000000000000..05adbbeccaa9
--- /dev/null
+++ b/lib/zstd/common/allocations.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* This file provides custom allocation primitives
+ */
+
+#define ZSTD_DEPS_NEED_MALLOC
+#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
+
+#include "mem.h" /* MEM_STATIC */
+#define ZSTD_STATIC_LINKING_ONLY
+#include <linux/zstd.h> /* ZSTD_customMem */
+
+#ifndef ZSTD_ALLOCATIONS_H
+#define ZSTD_ALLOCATIONS_H
+
+/* custom memory allocation functions */
+
+MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc)
+        return customMem.customAlloc(customMem.opaque, size);
+    return ZSTD_malloc(size);
+}
+
+MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc) {
+        /* calloc implemented as malloc+memset;
+         * not as efficient as calloc, but next best guess for custom malloc */
+        void* const ptr = customMem.customAlloc(customMem.opaque, size);
+        ZSTD_memset(ptr, 0, size);
+        return ptr;
+    }
+    return ZSTD_calloc(1, size);
+}
+
+MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
+{
+    if (ptr!=NULL) {
+        if (customMem.customFree)
+            customMem.customFree(customMem.opaque, ptr);
+        else
+            ZSTD_free(ptr);
+    }
+}
+
+#endif /* ZSTD_ALLOCATIONS_H */
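
For illustration only, not part of the patch: a hypothetical userspace-style counting allocator shows how a ZSTD_customMem routes through these helpers (in-kernel callers would not use malloc()/free(); names are invented for the example).

#include <stdlib.h>

static void* countingAlloc(void* opaque, size_t size)
{
    *(size_t*)opaque += size;   /* tally bytes requested */
    return malloc(size);
}

static void countingFree(void* opaque, void* address)
{
    (void)opaque;
    free(address);
}

static size_t measure_zstd_allocations(void)
{
    size_t total = 0;
    ZSTD_customMem const cmem = { countingAlloc, countingFree, &total };
    void* const p = ZSTD_customCalloc(64, cmem); /* takes the alloc+memset path */
    ZSTD_customFree(p, cmem);
    return total; /* 64 */
}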
diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h
new file mode 100644
index 000000000000..aa3487ec4b6a
--- /dev/null
+++ b/lib/zstd/common/bits.h
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_BITS_H
+#define ZSTD_BITS_H
+
+#include "mem.h"
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val)
+{
+    assert(val != 0);
+    {
+        static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3,
+                                                30, 22, 20, 15, 25, 17, 4, 8,
+                                                31, 27, 13, 23, 21, 19, 16, 7,
+                                                26, 12, 18, 6, 11, 5, 10, 9};
+        return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27];
+    }
+}
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val)
+{
+    assert(val != 0);
+#   if (__GNUC__ >= 4)
+        return (unsigned)__builtin_ctz(val);
+#   else
+        return ZSTD_countTrailingZeros32_fallback(val);
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) {
+    assert(val != 0);
+    {
+        static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29,
+                                            11, 14, 16, 18, 22, 25, 3, 30,
+                                            8, 12, 20, 28, 15, 17, 24, 7,
+                                            19, 27, 23, 6, 26, 5, 4, 31};
+        val |= val >> 1;
+        val |= val >> 2;
+        val |= val >> 4;
+        val |= val >> 8;
+        val |= val >> 16;
+        return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27];
+    }
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val)
+{
+    assert(val != 0);
+#   if (__GNUC__ >= 4)
+        return (unsigned)__builtin_clz(val);
+#   else
+        return ZSTD_countLeadingZeros32_fallback(val);
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val)
+{
+    assert(val != 0);
+#   if (__GNUC__ >= 4) && defined(__LP64__)
+        return (unsigned)__builtin_ctzll(val);
+#   else
+    {
+        U32 mostSignificantWord = (U32)(val >> 32);
+        U32 leastSignificantWord = (U32)val;
+        if (leastSignificantWord == 0) {
+            return 32 + ZSTD_countTrailingZeros32(mostSignificantWord);
+        } else {
+            return ZSTD_countTrailingZeros32(leastSignificantWord);
+        }
+    }
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val)
+{
+    assert(val != 0);
+#   if (__GNUC__ >= 4)
+        return (unsigned)(__builtin_clzll(val));
+#   else
+    {
+        U32 mostSignificantWord = (U32)(val >> 32);
+        U32 leastSignificantWord = (U32)val;
+        if (mostSignificantWord == 0) {
+            return 32 + ZSTD_countLeadingZeros32(leastSignificantWord);
+        } else {
+            return ZSTD_countLeadingZeros32(mostSignificantWord);
+        }
+    }
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val)
+{
+    if (MEM_isLittleEndian()) {
+        if (MEM_64bits()) {
+            return ZSTD_countTrailingZeros64((U64)val) >> 3;
+        } else {
+            return ZSTD_countTrailingZeros32((U32)val) >> 3;
+        }
+    } else { /* Big Endian CPU */
+        if (MEM_64bits()) {
+            return ZSTD_countLeadingZeros64((U64)val) >> 3;
+        } else {
+            return ZSTD_countLeadingZeros32((U32)val) >> 3;
+        }
+    }
+}
+
+MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */
+{
+    assert(val != 0);
+    return 31 - ZSTD_countLeadingZeros32(val);
+}
+
+/* ZSTD_rotateRight_*():
+ * Rotates a bitfield to the right by "count" bits.
+ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
+ */
+MEM_STATIC
+U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
+    assert(count < 64);
+    count &= 0x3F; /* for fickle pattern recognition */
+    return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
+}
+
+MEM_STATIC
+U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
+    assert(count < 32);
+    count &= 0x1F; /* for fickle pattern recognition */
+    return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
+}
+
+MEM_STATIC
+U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
+    assert(count < 16);
+    count &= 0x0F; /* for fickle pattern recognition */
+    return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
+}
+
+#endif /* ZSTD_BITS_H */
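
The trailing-zero fallback above relies on the classic De Bruijn trick: val & -val isolates the lowest set bit (a power of two), and multiplying by the constant 0x077CB531 places a distinct 5-bit pattern in the top bits for each of the 32 possible powers, which the >> 27 then uses as a table index. For example, with val = 6: val & -val = 2, 2 * 0x077CB531 = 0x0EF96A62, 0x0EF96A62 >> 27 = 1, and DeBruijnBytePos[1] = 1 = ctz(6). A quick self-check against the compiler builtin (hypothetical test code, not part of the patch; assumes a GCC/Clang host):

#include <assert.h>

static void check_ctz32(U32 v)   /* U32 as defined in mem.h */
{
    assert(v != 0);
    assert(ZSTD_countTrailingZeros32_fallback(v) == (unsigned)__builtin_ctz(v));
}

/* e.g. check_ctz32(6) and check_ctz32(0x80000000u) both pass:
 * the fallback yields 1 and 31 respectively, matching the builtin. */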
|
|
diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h
|
|
index feef3a1b1d60..444dc4f85c64 100644
|
|
--- a/lib/zstd/common/bitstream.h
|
|
+++ b/lib/zstd/common/bitstream.h
|
|
@@ -1,7 +1,8 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
|
|
/* ******************************************************************
|
|
* bitstream
|
|
* Part of FSE library
|
|
- * Copyright (c) Yann Collet, Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
*
|
|
* You can contact the author at :
|
|
* - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
|
|
@@ -27,6 +28,7 @@
|
|
#include "compiler.h" /* UNLIKELY() */
|
|
#include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */
|
|
#include "error_private.h" /* error codes and messages */
|
|
+#include "bits.h" /* ZSTD_highbit32 */
|
|
|
|
|
|
/*=========================================
|
|
@@ -122,33 +124,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC);
|
|
MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
|
|
/* faster, but works only if nbBits >= 1 */
|
|
|
|
-
|
|
-
|
|
-/*-**************************************************************
|
|
-* Internal functions
|
|
-****************************************************************/
|
|
-MEM_STATIC unsigned BIT_highbit32 (U32 val)
|
|
-{
|
|
- assert(val != 0);
|
|
- {
|
|
-# if (__GNUC__ >= 3) /* Use GCC Intrinsic */
|
|
- return __builtin_clz (val) ^ 31;
|
|
-# else /* Software version */
|
|
- static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29,
|
|
- 11, 14, 16, 18, 22, 25, 3, 30,
|
|
- 8, 12, 20, 28, 15, 17, 24, 7,
|
|
- 19, 27, 23, 6, 26, 5, 4, 31 };
|
|
- U32 v = val;
|
|
- v |= v >> 1;
|
|
- v |= v >> 2;
|
|
- v |= v >> 4;
|
|
- v |= v >> 8;
|
|
- v |= v >> 16;
|
|
- return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
|
|
-# endif
|
|
- }
|
|
-}
|
|
-
|
|
/*===== Local Constants =====*/
|
|
static const unsigned BIT_mask[] = {
|
|
0, 1, 3, 7, 0xF, 0x1F,
|
|
@@ -178,6 +153,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
|
|
return 0;
|
|
}
|
|
|
|
+MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
|
|
+{
|
|
+ assert(nbBits < BIT_MASK_SIZE);
|
|
+ return bitContainer & BIT_mask[nbBits];
|
|
+}
|
|
+
|
|
/*! BIT_addBits() :
|
|
* can add up to 31 bits into `bitC`.
|
|
* Note : does not check for register overflow ! */
|
|
@@ -187,7 +168,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC,
|
|
DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32);
|
|
assert(nbBits < BIT_MASK_SIZE);
|
|
assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
|
|
- bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos;
|
|
+ bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos;
|
|
bitC->bitPos += nbBits;
|
|
}
|
|
|
|
@@ -266,7 +247,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
|
|
bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
|
|
bitD->bitContainer = MEM_readLEST(bitD->ptr);
|
|
{ BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
|
|
- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */
|
|
+ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */
|
|
if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
|
|
} else {
|
|
bitD->ptr = bitD->start;
|
|
@@ -294,7 +275,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
|
|
default: break;
|
|
}
|
|
{ BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
|
|
- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
|
|
+ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
|
|
if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */
|
|
}
|
|
bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;
|
|
@@ -325,12 +306,6 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c
|
|
#endif
|
|
}
|
|
|
|
-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
|
|
-{
|
|
- assert(nbBits < BIT_MASK_SIZE);
|
|
- return bitContainer & BIT_mask[nbBits];
|
|
-}
|
|
-
|
|
/*! BIT_lookBits() :
|
|
* Provides next n bits from local register.
|
|
* local register is not modified.
|
|
@@ -377,7 +352,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned n
|
|
}
|
|
|
|
/*! BIT_readBitsFast() :
|
|
- * unsafe version; only works only if nbBits >= 1 */
|
|
+ * unsafe version; only works if nbBits >= 1 */
|
|
MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits)
|
|
{
|
|
size_t const value = BIT_lookBitsFast(bitD, nbBits);
|
|
@@ -408,7 +383,7 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD)
|
|
* This function is safe, it guarantees it will not read beyond src buffer.
|
|
* @return : status of `BIT_DStream_t` internal register.
|
|
* when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
|
|
-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
|
|
+MEM_STATIC FORCE_INLINE_ATTR BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
|
|
{
|
|
if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */
|
|
return BIT_DStream_overflow;
|
|
diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h
|
|
index c42d39faf9bd..c437e0975575 100644
|
|
--- a/lib/zstd/common/compiler.h
|
|
+++ b/lib/zstd/common/compiler.h
|
|
@@ -1,5 +1,6 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
|
|
/*
|
|
- * Copyright (c) Yann Collet, Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
* All rights reserved.
|
|
*
|
|
* This source code is licensed under both the BSD-style license (found in the
|
|
@@ -179,6 +180,17 @@
|
|
* Sanitizer
|
|
*****************************************************************/
|
|
|
|
+/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an
|
|
+ * abundance of caution, disable our custom poisoning on mingw. */
|
|
+#ifdef __MINGW32__
|
|
+#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE
|
|
+#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1
|
|
+#endif
|
|
+#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE
|
|
+#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
|
|
|
|
#endif /* ZSTD_COMPILER_H */
|
|
diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h
|
|
index 0db7b42407ee..d8319a2bef4c 100644
|
|
--- a/lib/zstd/common/cpu.h
|
|
+++ b/lib/zstd/common/cpu.h
|
|
@@ -1,5 +1,6 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
|
|
/*
|
|
- * Copyright (c) Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
* All rights reserved.
|
|
*
|
|
* This source code is licensed under both the BSD-style license (found in the
|
|
diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c
|
|
index bb863c9ea616..e56ff6464e91 100644
|
|
--- a/lib/zstd/common/debug.c
|
|
+++ b/lib/zstd/common/debug.c
|
|
@@ -1,7 +1,8 @@
|
|
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
|
|
/* ******************************************************************
|
|
* debug
|
|
* Part of FSE library
|
|
- * Copyright (c) Yann Collet, Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
*
|
|
* You can contact the author at :
|
|
* - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
|
|
diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h
|
|
index 6dd88d1fbd02..da0dbfc614b8 100644
|
|
--- a/lib/zstd/common/debug.h
|
|
+++ b/lib/zstd/common/debug.h
|
|
@@ -1,7 +1,8 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
|
|
/* ******************************************************************
|
|
* debug
|
|
* Part of FSE library
|
|
- * Copyright (c) Yann Collet, Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
*
|
|
* You can contact the author at :
|
|
* - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
|
|
diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c
|
|
index fef67056f052..6cdd82233fb5 100644
|
|
--- a/lib/zstd/common/entropy_common.c
|
|
+++ b/lib/zstd/common/entropy_common.c
|
|
@@ -1,6 +1,7 @@
|
|
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
|
|
/* ******************************************************************
|
|
* Common functions of New Generation Entropy library
|
|
- * Copyright (c) Yann Collet, Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
*
|
|
* You can contact the author at :
|
|
* - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
|
|
@@ -19,8 +20,8 @@
|
|
#include "error_private.h" /* ERR_*, ERROR */
|
|
#define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */
|
|
#include "fse.h"
|
|
-#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */
|
|
#include "huf.h"
|
|
+#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */
|
|
|
|
|
|
/*=== Version ===*/
|
|
@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
|
|
/*-**************************************************************
|
|
* FSE NCount encoding-decoding
|
|
****************************************************************/
|
|
-static U32 FSE_ctz(U32 val)
|
|
-{
|
|
- assert(val != 0);
|
|
- {
|
|
-# if (__GNUC__ >= 3) /* GCC Intrinsic */
|
|
- return __builtin_ctz(val);
|
|
-# else /* Software version */
|
|
- U32 count = 0;
|
|
- while ((val & 1) == 0) {
|
|
- val >>= 1;
|
|
- ++count;
|
|
- }
|
|
- return count;
|
|
-# endif
|
|
- }
|
|
-}
|
|
-
|
|
FORCE_INLINE_TEMPLATE
|
|
size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
|
|
const void* headerBuffer, size_t hbSize)
|
|
@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne
|
|
* repeat.
|
|
* Avoid UB by setting the high bit to 1.
|
|
*/
|
|
- int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1;
|
|
+ int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
|
|
while (repeats >= 12) {
|
|
charnum += 3 * 12;
|
|
if (LIKELY(ip <= iend-7)) {
|
|
@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne
|
|
ip = iend - 4;
|
|
}
|
|
bitStream = MEM_readLE32(ip) >> bitCount;
|
|
- repeats = FSE_ctz(~bitStream | 0x80000000) >> 1;
|
|
+ repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
|
|
}
|
|
charnum += 3 * repeats;
|
|
bitStream >>= 2 * repeats;
|
|
@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne
|
|
* know that threshold > 1.
|
|
*/
|
|
if (remaining <= 1) break;
|
|
- nbBits = BIT_highbit32(remaining) + 1;
|
|
+ nbBits = ZSTD_highbit32(remaining) + 1;
|
|
threshold = 1 << (nbBits - 1);
|
|
}
|
|
if (charnum >= maxSV1) break;
|
|
@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
|
|
const void* src, size_t srcSize)
|
|
{
|
|
U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
|
|
- return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0);
|
|
+ return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0);
|
|
}
|
|
|
|
FORCE_INLINE_TEMPLATE size_t
|
|
@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats,
|
|
if (weightTotal == 0) return ERROR(corruption_detected);
|
|
|
|
/* get last non-null symbol weight (implied, total must be 2^n) */
|
|
- { U32 const tableLog = BIT_highbit32(weightTotal) + 1;
|
|
+ { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1;
|
|
if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
|
|
*tableLogPtr = tableLog;
|
|
/* determine last weight */
|
|
{ U32 const total = 1 << tableLog;
|
|
U32 const rest = total - weightTotal;
|
|
- U32 const verif = 1 << BIT_highbit32(rest);
|
|
- U32 const lastWeight = BIT_highbit32(rest) + 1;
|
|
+ U32 const verif = 1 << ZSTD_highbit32(rest);
|
|
+ U32 const lastWeight = ZSTD_highbit32(rest) + 1;
|
|
if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */
|
|
huffWeight[oSize] = (BYTE)lastWeight;
|
|
rankStats[lastWeight]++;
|
|
@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats,
|
|
U32* nbSymbolsPtr, U32* tableLogPtr,
|
|
const void* src, size_t srcSize,
|
|
void* workSpace, size_t wkspSize,
|
|
- int bmi2)
|
|
+ int flags)
|
|
{
|
|
#if DYNAMIC_BMI2
|
|
- if (bmi2) {
|
|
+ if (flags & HUF_flags_bmi2) {
|
|
return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
|
|
}
|
|
#endif
|
|
- (void)bmi2;
|
|
+ (void)flags;
|
|
return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
|
|
}
|
|
diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c
|
|
index 6d1135f8c373..a4062d30d170 100644
|
|
--- a/lib/zstd/common/error_private.c
|
|
+++ b/lib/zstd/common/error_private.c
|
|
@@ -1,5 +1,6 @@
|
|
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
|
|
/*
|
|
- * Copyright (c) Yann Collet, Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
* All rights reserved.
|
|
*
|
|
* This source code is licensed under both the BSD-style license (found in the
|
|
@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code)
|
|
case PREFIX(version_unsupported): return "Version not supported";
|
|
case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter";
|
|
case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding";
|
|
- case PREFIX(corruption_detected): return "Corrupted block detected";
|
|
+ case PREFIX(corruption_detected): return "Data corruption detected";
|
|
case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
|
|
+ case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification";
|
|
case PREFIX(parameter_unsupported): return "Unsupported parameter";
|
|
+ case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters";
|
|
case PREFIX(parameter_outOfBound): return "Parameter is out of bound";
|
|
case PREFIX(init_missing): return "Context should be init first";
|
|
case PREFIX(memory_allocation): return "Allocation error : not enough memory";
|
|
@@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum code)
|
|
case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported";
|
|
case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large";
|
|
case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
|
|
+ case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected";
|
|
case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
|
|
case PREFIX(dictionary_wrong): return "Dictionary mismatch";
|
|
case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples";
|
|
case PREFIX(dstSize_tooSmall): return "Destination buffer is too small";
|
|
case PREFIX(srcSize_wrong): return "Src size is incorrect";
|
|
case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer";
|
|
+ case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full";
|
|
+ case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty";
|
|
/* following error codes are not stable and may be removed or changed in a future version */
|
|
case PREFIX(frameIndex_tooLarge): return "Frame index is too large";
|
|
case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking";
|
|
case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong";
|
|
case PREFIX(srcBuffer_wrong): return "Source buffer is wrong";
|
|
+ case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code";
|
|
+ case PREFIX(externalSequences_invalid): return "External sequences are not valid";
|
|
case PREFIX(maxCode):
|
|
default: return notErrorCode;
|
|
}
|
|
diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h
|
|
index ca5101e542fa..9a4699a38a88 100644
|
|
--- a/lib/zstd/common/error_private.h
|
|
+++ b/lib/zstd/common/error_private.h
|
|
@@ -1,5 +1,6 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
|
|
/*
|
|
- * Copyright (c) Yann Collet, Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
* All rights reserved.
|
|
*
|
|
* This source code is licensed under both the BSD-style license (found in the
|
|
diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h
index 4507043b2287..c4e25a219142 100644
--- a/lib/zstd/common/fse.h
+++ b/lib/zstd/common/fse.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
/* ******************************************************************
* FSE : Finite State Entropy codec
* Public Prototypes declaration
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
*
* You can contact the author at :
* - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -50,34 +51,6 @@
FSE_PUBLIC_API unsigned FSE_versionNumber(void);   /*< library version number; to be used when checking dll version */


-/*-****************************************
-* FSE simple functions
-******************************************/
-/*! FSE_compress() :
-    Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'.
-    'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize).
-    @return : size of compressed data (<= dstCapacity).
-    Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
-                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead.
-                     if FSE_isError(return), compression failed (more details using FSE_getErrorName())
-*/
-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity,
-                             const void* src, size_t srcSize);
-
-/*! FSE_decompress():
-    Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
-    into already allocated destination buffer 'dst', of size 'dstCapacity'.
-    @return : size of regenerated data (<= maxDstSize),
-              or an error code, which can be tested using FSE_isError() .
-
-    ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!!
-    Why ? : making this distinction requires a header.
-    Header management is intentionally delegated to the user layer, which can better manage special cases.
-*/
-FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity,
-                               const void* cSrc, size_t cSrcSize);
-
-
/*-*****************************************
* Tool functions
******************************************/
@@ -88,20 +61,6 @@ FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return
FSE_PUBLIC_API const char* FSE_getErrorName(size_t code);   /* provides error code string (useful for debugging) */


-/*-*****************************************
-* FSE advanced functions
-******************************************/
-/*! FSE_compress2() :
-    Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog'
-    Both parameters can be defined as '0' to mean : use default value
-    @return : size of compressed data
-    Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!!
-                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression.
-                     if FSE_isError(return), it's an error code.
-*/
-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
-
-
/*-*****************************************
* FSE detailed API
******************************************/
@@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize,
/*! Constructor and Destructor of FSE_CTable.
    Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */
typedef unsigned FSE_CTable;   /* don't allocate that. It's only meant to be more restrictive than void* */
-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog);
-FSE_PUBLIC_API void        FSE_freeCTable (FSE_CTable* ct);

/*! FSE_buildCTable():
    Builds `ct`, which must be already allocated, using FSE_createCTable().
@@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter,
                           unsigned* maxSymbolValuePtr, unsigned* tableLogPtr,
                           const void* rBuffer, size_t rBuffSize, int bmi2);

-/*! Constructor and Destructor of FSE_DTable.
-    Note that its size depends on 'tableLog' */
typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog);
-FSE_PUBLIC_API void        FSE_freeDTable(FSE_DTable* dt);
-
-/*! FSE_buildDTable():
-    Builds 'dt', which must be already allocated, using FSE_createDTable().
-    return : 0, or an errorCode, which can be tested using FSE_isError() */
-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
-
-/*! FSE_decompress_usingDTable():
-    Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
-    into `dst` which must be already allocated.
-    @return : size of regenerated data (necessarily <= `dstCapacity`),
-              or an errorCode, which can be tested using FSE_isError() */
-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt);

/*!
Tutorial :
@@ -317,16 +258,6 @@ If there is an error, the function will return an error code, which can be teste
unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
/*< same as FSE_optimalTableLog(), which used `minus==2` */

-/* FSE_compress_wksp() :
- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable.
- */
-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue)   ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) )
-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
-
-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits);
-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */
-
size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
/*< build a fake FSE_CTable, designed to compress always the same symbolValue */

@@ -344,19 +275,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi
FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
/*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */

-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */
-
-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */
-
-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1)
+#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1)
#define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned))
-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize);
-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */
-
size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2);
-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */
+/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`.
+ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */

typedef enum {
   FSE_repeat_none,  /*< Cannot use the previous table */
@@ -552,7 +475,7 @@ MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePt

/* FSE_getMaxNbBits() :
* Approximate maximum cost of a symbol, in bits.
- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2)
+ * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2)
* note 1 : assume symbolValue is valid (<= maxSymbolValue)
* note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue)
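The reworked FSE_DECOMPRESS_WKSP_SIZE_U32() above sizes a caller-provided scratch buffer for the remaining workspace-based decoder. A minimal sketch of its intended use, assuming the static-linking bounds FSE_MAX_TABLELOG and FSE_MAX_SYMBOL_VALUE as illustrative maxima (the bmi2 argument is the capability bit described in the new comment):

/* Sketch: workspace sized by the macro above; the table-log and symbol
 * bounds are illustrative maximal choices, not mandated by the patch. */
static size_t fse_decode_with_wksp(void* dst, size_t dstCapacity,
                                   const void* cSrc, size_t cSrcSize,
                                   int cpu_has_bmi2)
{
    U32 workSpace[FSE_DECOMPRESS_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];
    return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize,
                                    FSE_MAX_TABLELOG, workSpace,
                                    sizeof(workSpace), cpu_has_bmi2);
}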
diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c
index a0d06095be83..45cf457f31ef 100644
--- a/lib/zstd/common/fse_decompress.c
+++ b/lib/zstd/common/fse_decompress.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/* ******************************************************************
* FSE : Finite State Entropy decoder
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
*
* You can contact the author at :
* - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -24,6 +25,7 @@
#include "error_private.h"
#define ZSTD_DEPS_NEED_MALLOC
#include "zstd_deps.h"
+#include "bits.h" /* ZSTD_highbit32 */


/* **************************************************************
@@ -55,19 +57,6 @@
#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
#define FSE_TYPE_NAME(X,Y)  FSE_CAT(X,Y)

-
-/* Function templates */
-FSE_DTable* FSE_createDTable (unsigned tableLog)
-{
-    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
-    return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) );
-}
-
-void FSE_freeDTable (FSE_DTable* dt)
-{
-    ZSTD_free(dt);
-}
-
static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
{
    void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
@@ -127,10 +116,10 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
            }
        }
        /* Now we spread those positions across the table.
-         * The benefit of doing it in two stages is that we avoid the the
+         * The benefit of doing it in two stages is that we avoid the
         * variable size inner loop, which caused lots of branch misses.
         * Now we can run through all the positions without any branch misses.
-         * We unroll the loop twice, since that is what emperically worked best.
+         * We unroll the loop twice, since that is what empirically worked best.
         */
        {
            size_t position = 0;
@@ -166,7 +155,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
        for (u=0; u<tableSize; u++) {
            FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
            U32 const nextState = symbolNext[symbol]++;
-            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+            tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
            tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
    }   }

@@ -184,49 +173,6 @@ size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsi
/*-*******************************************************
*  Decompression (Byte symbols)
*********************************************************/
-size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
-{
-    void* ptr = dt;
-    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
-    void* dPtr = dt + 1;
-    FSE_decode_t* const cell = (FSE_decode_t*)dPtr;
-
-    DTableH->tableLog = 0;
-    DTableH->fastMode = 0;
-
-    cell->newState = 0;
-    cell->symbol = symbolValue;
-    cell->nbBits = 0;
-
-    return 0;
-}
-
-
-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
-{
-    void* ptr = dt;
-    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
-    void* dPtr = dt + 1;
-    FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr;
-    const unsigned tableSize = 1 << nbBits;
-    const unsigned tableMask = tableSize - 1;
-    const unsigned maxSV1 = tableMask+1;
-    unsigned s;
-
-    /* Sanity checks */
-    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
-
-    /* Build Decoding Table */
-    DTableH->tableLog = (U16)nbBits;
-    DTableH->fastMode = 1;
-    for (s=0; s<maxSV1; s++) {
-        dinfo[s].newState = 0;
-        dinfo[s].symbol = (BYTE)s;
-        dinfo[s].nbBits = (BYTE)nbBits;
-    }
-
-    return 0;
-}

FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
          void* dst, size_t maxDstSize,
@@ -290,26 +236,6 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
    return op-ostart;
}

-
-size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
-                            const void* cSrc, size_t cSrcSize,
-                            const FSE_DTable* dt)
-{
-    const void* ptr = dt;
-    const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
-    const U32 fastMode = DTableH->fastMode;
-
-    /* select fast mode (static) */
-    if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
-    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
-}
-
-
-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
-{
-    return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
typedef struct {
    short ncount[FSE_MAX_SYMBOL_VALUE + 1];
    FSE_DTable dtable[1]; /* Dynamically sized */
@@ -342,7 +268,8 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body(
    }

    if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge);
-    workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog);
+    assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize);
+    workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
    wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);

    CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) );
@@ -382,9 +309,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc,
    return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize);
}

-
-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
-
-
-
#endif /* FSE_COMMONDEFS_ONLY */
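The nbBits/newState arithmetic in the hunk above can be checked by hand: once the table is built, a state's bit cost is tableLog minus the position of its highest set bit. A standalone sketch with illustrative numbers, not code from the patch:

/* Sketch: for tableLog == 6 (tableSize 64) and nextState == 17,
 * highbit32(17) == 4, so the decoder reads 6 - 4 = 2 bits, and the base of
 * the next state is (17 << 2) - 64 = 4; adding the 2 freshly read bits
 * lands the decoder back inside [0, 64). */
#include <assert.h>

static unsigned highbit32(unsigned v) /* position of the highest set bit */
{
    return 31 - (unsigned)__builtin_clz(v);
}

static void check_decode_cell(void)
{
    unsigned const tableLog  = 6;
    unsigned const tableSize = 1u << tableLog;
    unsigned const nextState = 17;
    unsigned const nbBits    = tableLog - highbit32(nextState);   /* 2 */
    unsigned const newState  = (nextState << nbBits) - tableSize; /* 4 */
    assert(nbBits == 2 && newState == 4);
}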
diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h
index 5042ff870308..8e7943092ed1 100644
--- a/lib/zstd/common/huf.h
+++ b/lib/zstd/common/huf.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
/* ******************************************************************
* huff0 huffman codec,
* part of Finite State Entropy library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
*
* You can contact the author at :
* - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -18,99 +19,22 @@

/* *** Dependencies *** */
#include "zstd_deps.h" /* size_t */
-
-
-/* *** library symbols visibility *** */
-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual,
- * HUF symbols remain "private" (internal symbols for library only).
- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */
-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
-#  define HUF_PUBLIC_API __attribute__ ((visibility ("default")))
-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1)   /* Visual expected */
-#  define HUF_PUBLIC_API __declspec(dllexport)
-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
-#  define HUF_PUBLIC_API __declspec(dllimport)  /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */
-#else
-#  define HUF_PUBLIC_API
-#endif
-
-
-/* ========================== */
-/* ***  simple functions  *** */
-/* ========================== */
-
-/* HUF_compress() :
- *  Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'.
- *  'dst' buffer must be already allocated.
- *  Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize).
- *  `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB.
- * @return : size of compressed data (<= `dstCapacity`).
- *  Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
- *                   if HUF_isError(return), compression failed (more details using HUF_getErrorName())
- */
-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity,
-                             const void* src, size_t srcSize);
-
-/* HUF_decompress() :
- *  Decompress HUF data from buffer 'cSrc', of size 'cSrcSize',
- *  into already allocated buffer 'dst', of minimum size 'dstSize'.
- *  `originalSize` : **must** be the ***exact*** size of original (uncompressed) data.
- *  Note : in contrast with FSE, HUF_decompress can regenerate
- *         RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
- *         because it knows size to regenerate (originalSize).
- * @return : size of regenerated data (== originalSize),
- *           or an error code, which can be tested using HUF_isError()
- */
-HUF_PUBLIC_API size_t HUF_decompress(void* dst,  size_t originalSize,
-                               const void* cSrc, size_t cSrcSize);
+#include "mem.h"   /* U32 */
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"


/* *** Tool functions *** */
-#define HUF_BLOCKSIZE_MAX (128 * 1024)                  /*< maximum input size for a single block compressed with HUF_compress */
-HUF_PUBLIC_API size_t HUF_compressBound(size_t size);   /*< maximum compressed size (worst case) */
+#define HUF_BLOCKSIZE_MAX (128 * 1024)   /*< maximum input size for a single block compressed with HUF_compress */
+size_t HUF_compressBound(size_t size);   /*< maximum compressed size (worst case) */

/* Error Management */
-HUF_PUBLIC_API unsigned    HUF_isError(size_t code);       /*< tells if a return value is an error code */
-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code);  /*< provides error code string (useful for debugging) */
+unsigned    HUF_isError(size_t code);       /*< tells if a return value is an error code */
+const char* HUF_getErrorName(size_t code);  /*< provides error code string (useful for debugging) */


-/* ***   Advanced function   *** */
-
-/* HUF_compress2() :
- *  Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`.
- *  `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX .
- *  `tableLog` must be `<= HUF_TABLELOG_MAX` . */
-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity,
-                               const void* src, size_t srcSize,
-                               unsigned maxSymbolValue, unsigned tableLog);
-
-/* HUF_compress4X_wksp() :
- *  Same as HUF_compress2(), but uses externally allocated `workSpace`.
- *  `workspace` must be at least as large as HUF_WORKSPACE_SIZE */
#define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */)
#define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64))
-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
-                                     const void* src, size_t srcSize,
-                                     unsigned maxSymbolValue, unsigned tableLog,
-                                     void* workSpace, size_t wkspSize);
-
-#endif   /* HUF_H_298734234 */
-
-/* ******************************************************************
- *  WARNING !!
- *  The following section contains advanced and experimental definitions
- *  which shall never be used in the context of a dynamic library,
- *  because they are not guaranteed to remain stable in the future.
- *  Only consider them in association with static linking.
- * *****************************************************************/
-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY)
-#define HUF_H_HUF_STATIC_LINKING_ONLY
-
-/* *** Dependencies *** */
-#include "mem.h"   /* U32 */
-#define FSE_STATIC_LINKING_ONLY
-#include "fse.h"
-

/* *** Constants *** */
#define HUF_TABLELOG_MAX      12      /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */
@@ -151,25 +75,49 @@ typedef U32 HUF_DTable;
/* ****************************************
*  Advanced decompression functions
******************************************/
-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< single-symbol decoder */
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< double-symbols decoder */
-#endif

-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< decodes RLE and uncompressed */
-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */
-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */
-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< single-symbol decoder */
-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< single-symbol decoder */
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< double-symbols decoder */
-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< double-symbols decoder */
-#endif
+/*
+ * Huffman flags bitset.
+ * For all flags, 0 is the default value.
+ */
+typedef enum {
+    /*
+     * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime.
+     * Otherwise: Ignored.
+     */
+    HUF_flags_bmi2 = (1 << 0),
+    /*
+     * If set: Test possible table depths to find the one that produces the smallest header + encoded size.
+     * If unset: Use heuristic to find the table depth.
+     */
+    HUF_flags_optimalDepth = (1 << 1),
+    /*
+     * If set: If the previous table can encode the input, always reuse the previous table.
+     * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output.
+     */
+    HUF_flags_preferRepeat = (1 << 2),
+    /*
+     * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress.
+     * If unset: Always histogram the entire input.
+     */
+    HUF_flags_suspectUncompressible = (1 << 3),
+    /*
+     * If set: Don't use assembly implementations
+     * If unset: Allow using assembly implementations
+     */
+    HUF_flags_disableAsm = (1 << 4),
+    /*
+     * If set: Don't use the fast decoding loop, always use the fallback decoding loop.
+     * If unset: Use the fast decoding loop when possible.
+     */
+    HUF_flags_disableFast = (1 << 5)
+} HUF_flags_e;


/* ****************************************
*  HUF detailed API
* ****************************************/
+#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra

/*! HUF_compress() does the following:
*  1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h")
@@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
*  For example, it's possible to compress several blocks using the same 'CTable',
*  or to save and regenerate 'CTable' using external methods.
*/
-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);   /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */
-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
+unsigned HUF_minTableLog(unsigned symbolCardinality);
+unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue);
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace,
+                             size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */
size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize);
-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2);
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags);
size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);

@@ -196,6 +144,7 @@ typedef enum {
    HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
    HUF_repeat_valid  /*< Can use the previous table and it is assumed to be valid */
} HUF_repeat;
+
/* HUF_compress4X_repeat() :
*  Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
*  If it uses hufTable it does not modify hufTable or repeat.
@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
                       const void* src, size_t srcSize,
                       unsigned maxSymbolValue, unsigned tableLog,
                       void* workSpace, size_t wkspSize,    /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
-                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int flags);

/* HUF_buildCTable_wksp() :
*  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
*  `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE.
*/
-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1)
+#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192)
#define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned))
size_t HUF_buildCTable_wksp (HUF_CElt* tree,
                       const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
@@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize,
                          U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
                          const void* src, size_t srcSize,
                          void* workspace, size_t wkspSize,
-                          int bmi2);
+                          int flags);

/* HUF_readCTable() :
*  Loading a CTable saved with HUF_writeCTable() */
@@ -276,32 +225,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
#define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9))
#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))

-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize);
-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
-#endif
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize);
-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
-#endif
-
-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-#endif
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-#endif
-

/* ====================== */
/* single stream variants */
/* ====================== */

-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);  /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */
-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2);
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags);
/* HUF_compress1X_repeat() :
*  Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
*  If it uses hufTable it does not modify hufTable or repeat.
@@ -312,47 +241,28 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
                       const void* src, size_t srcSize,
                       unsigned maxSymbolValue, unsigned tableLog,
                       void* workSpace, size_t wkspSize,   /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
-                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);
-
-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
-#endif
-
-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< single-symbol decoder */
-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< single-symbol decoder */
-#endif
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< double-symbols decoder */
-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< double-symbols decoder */
-#endif
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int flags);

-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);   /*< automatic selection of sing or double symbol decoder, based on DTable */
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-#endif
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);   /*< double-symbols decoder */
#endif

/* BMI2 variants.
* If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
*/
-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags);
#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
#endif
-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags);
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags);
#endif
#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags);
#endif

-#endif /* HUF_STATIC_LINKING_ONLY */
+#endif   /* HUF_H_298734234 */

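The new HUF_flags_e bitset replaces the separate preferRepeat/bmi2/suspectUncompressible arguments the old prototypes carried. A sketch of composing the bits for the flags-taking entry points above; the CPU-detection helper is a placeholder, and HUF_TABLELOG_DEFAULT/HUF_SYMBOLVALUE_MAX are assumed from the unchanged parts of the header:

/* Sketch: fold the former boolean parameters into the single `int flags`
 * word. bmi2_supported is whatever CPU detection the caller already has. */
static size_t compress_block_sketch(void* dst, size_t dstCapacity,
                                    const void* src, size_t srcSize,
                                    HUF_CElt* hufTable, HUF_repeat* repeat,
                                    void* wksp, size_t wkspSize,
                                    int bmi2_supported)
{
    int flags = HUF_flags_preferRepeat | HUF_flags_suspectUncompressible;
    if (bmi2_supported)
        flags |= HUF_flags_bmi2;
    return HUF_compress4X_repeat(dst, dstCapacity, src, srcSize,
                                 HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT,
                                 wksp, wkspSize, hufTable, repeat, flags);
}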
diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h
index 1d9cc03924ca..a7231822b6e3 100644
--- a/lib/zstd/common/mem.h
+++ b/lib/zstd/common/mem.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h
index 0e3b2c0a527d..7ede8cf1ffe5 100644
--- a/lib/zstd/common/portability_macros.h
+++ b/lib/zstd/common/portability_macros.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
/*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -12,7 +13,7 @@
#define ZSTD_PORTABILITY_MACROS_H

/*
- * This header file contains macro defintions to support portability.
+ * This header file contains macro definitions to support portability.
* This header is shared between C and ASM code, so it MUST only
* contain macro definitions. It MUST not contain any C code.
*
@@ -65,7 +66,7 @@
#endif

/*
- * Only enable assembly for GNUC comptabile compilers,
+ * Only enable assembly for GNUC compatible compilers,
* because other platforms may not support GAS assembly syntax.
*
* Only enable assembly for Linux / MacOS, other platforms may
@@ -90,4 +91,23 @@
*/
#define ZSTD_ENABLE_ASM_X86_64_BMI2 0

+/*
+ * For x86 ELF targets, add .note.gnu.property section for Intel CET in
+ * assembly sources when CET is enabled.
+ *
+ * Additionally, any function that may be called indirectly must begin
+ * with ZSTD_CET_ENDBRANCH.
+ */
+#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \
+    && defined(__has_include)
+# if __has_include(<cet.h>)
+#  include <cet.h>
+#  define ZSTD_CET_ENDBRANCH _CET_ENDBR
+# endif
+#endif
+
+#ifndef ZSTD_CET_ENDBRANCH
+# define ZSTD_CET_ENDBRANCH
+#endif
+
#endif /* ZSTD_PORTABILITY_MACROS_H */
diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c
index 3d7e35b309b5..44b95b25344a 100644
--- a/lib/zstd/common/zstd_common.c
+++ b/lib/zstd/common/zstd_common.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -14,7 +15,6 @@
*  Dependencies
***************************************/
#define ZSTD_DEPS_NEED_MALLOC
-#include "zstd_deps.h"   /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
#include "error_private.h"
#include "zstd_internal.h"

@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); }
/*! ZSTD_getErrorString() :
*  provides error code string from enum */
const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
-
-
-
-/*=**************************************************************
-*  Custom allocator
-****************************************************************/
-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
-{
-    if (customMem.customAlloc)
-        return customMem.customAlloc(customMem.opaque, size);
-    return ZSTD_malloc(size);
-}
-
-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
-{
-    if (customMem.customAlloc) {
-        /* calloc implemented as malloc+memset;
-         * not as efficient as calloc, but next best guess for custom malloc */
-        void* const ptr = customMem.customAlloc(customMem.opaque, size);
-        ZSTD_memset(ptr, 0, size);
-        return ptr;
-    }
-    return ZSTD_calloc(1, size);
-}
-
-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
-{
-    if (ptr!=NULL) {
-        if (customMem.customFree)
-            customMem.customFree(customMem.opaque, ptr);
-        else
-            ZSTD_free(ptr);
-    }
-}
diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h
index f06df065dec0..670c5fa2a952 100644
--- a/lib/zstd/common/zstd_deps.h
+++ b/lib/zstd/common/zstd_deps.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
/*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h
index 93305d9b41bb..7f023e4d4774 100644
--- a/lib/zstd/common/zstd_internal.h
+++ b/lib/zstd/common/zstd_internal.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -28,7 +29,6 @@
#include <linux/zstd.h>
#define FSE_STATIC_LINKING_ONLY
#include "fse.h"
-#define HUF_STATIC_LINKING_ONLY
#include "huf.h"
#include <linux/xxhash.h> /* XXH_reset, update, digest */
#define ZSTD_TRACE 0
@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
#define ZSTD_FRAMECHECKSUMSIZE 4

#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */)   /* for a non-null block */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */)   /* for a non-null block */
+#define MIN_LITERALS_FOR_4_STREAMS 6

-#define HufLog 12
typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;

#define LONGNBSEQ 0x7F00
@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy
#define MINMATCH 3

#define Litbits  8
+#define LitHufLog 11
#define MaxLit ((1<<Litbits) - 1)
#define MaxML   52
#define MaxLL   35
@@ -103,6 +104,8 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy
#define LLFSELog    9
#define OffFSELog   8
#define MaxFSELog  MAX(MAX(MLFSELog, LLFSELog), OffFSELog)
+#define MaxMLBits 16
+#define MaxLLBits 16

#define ZSTD_MAX_HUF_HEADER_SIZE 128 /* header + <= 127 byte tree description */
/* Each table cannot take more than #symbols * FSELog bits */
@@ -225,12 +228,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
         * one COPY16() in the first call. Then, do two calls per loop since
         * at that point it is more likely to have a high trip count.
         */
-#ifdef __aarch64__
-        do {
-            COPY16(op, ip);
-        }
-        while (op < oend);
-#else
        ZSTD_copy16(op, ip);
        if (16 >= length) return;
        op += 16;
@@ -240,7 +237,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
            COPY16(op, ip);
        }
        while (op < oend);
-#endif
    }
}

@@ -289,11 +285,11 @@ typedef enum {
typedef struct {
    seqDef* sequencesStart;
    seqDef* sequences;      /* ptr to end of sequences */
-    BYTE* litStart;
-    BYTE* lit;              /* ptr to end of literals */
-    BYTE* llCode;
-    BYTE* mlCode;
-    BYTE* ofCode;
+    BYTE*  litStart;
+    BYTE*  lit;             /* ptr to end of literals */
+    BYTE*  llCode;
+    BYTE*  mlCode;
+    BYTE*  ofCode;
    size_t maxNbSeq;
    size_t maxNbLit;

@@ -301,8 +297,8 @@ typedef struct {
     * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment
     * the existing value of the litLength or matchLength by 0x10000.
     */
-    ZSTD_longLengthType_e   longLengthType;
-    U32                     longLengthPos;  /* Index of the sequence to apply long length modification to */
+    ZSTD_longLengthType_e longLengthType;
+    U32                   longLengthPos;  /* Index of the sequence to apply long length modification to */
} seqStore_t;

typedef struct {
@@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore
    seqLen.matchLength = seq->mlBase + MINMATCH;
    if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) {
        if (seqStore->longLengthType == ZSTD_llt_literalLength) {
-            seqLen.litLength += 0xFFFF;
+            seqLen.litLength += 0x10000;
        }
        if (seqStore->longLengthType == ZSTD_llt_matchLength) {
-            seqLen.matchLength += 0xFFFF;
+            seqLen.matchLength += 0x10000;
        }
    }
    return seqLen;
@@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore
*           `decompressedBound != ZSTD_CONTENTSIZE_ERROR`
*/
typedef struct {
+    size_t nbBlocks;
    size_t compressedSize;
    unsigned long long decompressedBound;
} ZSTD_frameSizeInfo;   /* decompress & legacy */

const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx);   /* compress & dictBuilder */
-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr);   /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
-
-/* custom memory allocation functions */
-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem);
-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem);
-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem);
-
-
-MEM_STATIC U32 ZSTD_highbit32(U32 val)   /* compress, dictBuilder, decodeCorpus */
-{
-    assert(val != 0);
-    {
-#   if (__GNUC__ >= 3)   /* GCC Intrinsic */
-        return __builtin_clz (val) ^ 31;
-#   else   /* Software version */
-        static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
-        U32 v = val;
-        v |= v >> 1;
-        v |= v >> 2;
-        v |= v >> 4;
-        v |= v >> 8;
-        v |= v >> 16;
-        return DeBruijnClz[(v * 0x07C4ACDDU) >> 27];
-#   endif
-    }
-}
-
-/*
- * Counts the number of trailing zeros of a `size_t`.
- * Most compilers should support CTZ as a builtin. A backup
- * implementation is provided if the builtin isn't supported, but
- * it may not be terribly efficient.
- */
-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val)
-{
-    if (MEM_64bits()) {
-#       if (__GNUC__ >= 4)
-            return __builtin_ctzll((U64)val);
-#       else
-            static const int DeBruijnBytePos[64] = {  0,  1,  2,  7,  3, 13,  8, 19,
-                                                      4, 25, 14, 28,  9, 34, 20, 56,
-                                                      5, 17, 26, 54, 15, 41, 29, 43,
-                                                     10, 31, 38, 35, 21, 45, 49, 57,
-                                                     63,  6, 12, 18, 24, 27, 33, 55,
-                                                     16, 53, 40, 42, 30, 37, 44, 48,
-                                                     62, 11, 23, 32, 52, 39, 36, 47,
-                                                     61, 22, 51, 46, 60, 50, 59, 58 };
-            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
-#       endif
-    } else { /* 32 bits */
-#       if (__GNUC__ >= 3)
-            return __builtin_ctz((U32)val);
-#       else
-            static const int DeBruijnBytePos[32] = {  0,  1, 28,  2, 29, 14, 24,  3,
-                                                     30, 22, 20, 15, 25, 17,  4,  8,
-                                                     31, 27, 13, 23, 21, 19, 16,  7,
-                                                     26, 12, 18,  6, 11,  5, 10,  9 };
-            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
-#       endif
-    }
-}
+int ZSTD_seqToCodes(const seqStore_t* seqStorePtr);   /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */


/* ZSTD_invalidateRepCodes() :
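The 0xFFFF -> 0x10000 change in ZSTD_getSequenceLength() above fixes an off-by-one: lengths are stored as 16-bit bases, so a flagged "long" length must add back exactly 2^16. A worked sketch, illustrative rather than code from the patch:

/* Sketch: a true literal length of 70000 keeps only its low 16 bits,
 * 70000 - 0x10000 == 4464, plus the long-length marker; reconstruction
 * must add 0x10000 (65536), not 0xFFFF, to recover the original value. */
#include <assert.h>

static void check_long_length(void)
{
    unsigned const trueLitLength = 70000;
    unsigned short const llBase = (unsigned short)trueLitLength; /* 4464 */
    unsigned const rebuilt = (unsigned)llBase + 0x10000;
    assert(rebuilt == trueLitLength); /* adding 0xFFFF would be short by 1 */
}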
diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h
index d9a76112ec3a..6ab8be6532ef 100644
--- a/lib/zstd/compress/clevels.h
+++ b/lib/zstd/compress/clevels.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c
index ec5b1ca6d71a..e46ca6621b48 100644
--- a/lib/zstd/compress/fse_compress.c
+++ b/lib/zstd/compress/fse_compress.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/* ******************************************************************
* FSE : Finite State Entropy encoder
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
*
* You can contact the author at :
* - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -26,6 +27,7 @@
#define ZSTD_DEPS_NEED_MALLOC
#define ZSTD_DEPS_NEED_MATH64
#include "../common/zstd_deps.h"  /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */
+#include "../common/bits.h" /* ZSTD_highbit32 */


/* **************************************************************
@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
    assert(tableLog < 16);   /* required for threshold strategy to work */

    /* For explanations on how to distribute symbol values over the table :
-     * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+     * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */

    #ifdef __clang_analyzer__
    ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize);   /* useless initialization, just to keep scan-build happy */
@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
                break;
            default :
                assert(normalizedCounter[s] > 1);
-                {   U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1);
+                {   U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1);
                    U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut;
                    symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
                    symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]);
@@ -342,21 +344,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize,
*  FSE Compression Code
****************************************************************/

-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
-{
-    size_t size;
-    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
-    size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
-    return (FSE_CTable*)ZSTD_malloc(size);
-}
-
-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); }
-
/* provides the minimum logSize to safely represent a distribution */
static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
{
-    U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1;
-    U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
+    U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1;
+    U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2;
    U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
    assert(srcSize > 1); /* Not supported, RLE should be used instead */
    return minBits;
@@ -364,7 +356,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)

unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
{
-    U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus;
+    U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus;
    U32 tableLog = maxTableLog;
    U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
    assert(srcSize > 1); /* Not supported, RLE should be used instead */
@@ -532,40 +524,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
    return tableLog;
}

-
-/* fake FSE_CTable, for raw (uncompressed) input */
-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits)
-{
-    const unsigned tableSize = 1 << nbBits;
-    const unsigned tableMask = tableSize - 1;
-    const unsigned maxSymbolValue = tableMask;
-    void* const ptr = ct;
-    U16* const tableU16 = ( (U16*) ptr) + 2;
-    void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1);   /* assumption : tableLog >= 1 */
-    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
-    unsigned s;
-
-    /* Sanity checks */
-    if (nbBits < 1) return ERROR(GENERIC);             /* min size */
-
-    /* header */
-    tableU16[-2] = (U16) nbBits;
-    tableU16[-1] = (U16) maxSymbolValue;
-
-    /* Build table */
-    for (s=0; s<tableSize; s++)
-        tableU16[s] = (U16)(tableSize + s);
-
-    /* Build Symbol Transformation Table */
-    {   const U32 deltaNbBits = (nbBits << 16) - (1 << nbBits);
-        for (s=0; s<=maxSymbolValue; s++) {
-            symbolTT[s].deltaNbBits = deltaNbBits;
-            symbolTT[s].deltaFindState = s-1;
-    }   }
-
-    return 0;
-}
-
/* fake FSE_CTable, for rle input (always same symbol) */
size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
{
@@ -664,5 +622,4 @@ size_t FSE_compress_usingCTable (void* dst, size_t dstSize,

size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }

-
#endif   /* FSE_COMMONDEFS_ONLY */
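The BIT_highbit32 -> ZSTD_highbit32 substitutions above are renames; the FSE_minTableLog() arithmetic is unchanged and easy to verify. A standalone instance with illustrative inputs:

/* Sketch: srcSize == 1000 gives highbit32(1000) == 9, so minBitsSrc == 10;
 * maxSymbolValue == 255 gives highbit32(255) == 7, so minBitsSymbols == 9;
 * the smaller of the two, 9, is the minimum safe tableLog. */
#include <assert.h>

static unsigned highbit32(unsigned v)
{
    return 31 - (unsigned)__builtin_clz(v);
}

static void check_min_table_log(void)
{
    unsigned const minBitsSrc     = highbit32(1000) + 1; /* 10 */
    unsigned const minBitsSymbols = highbit32(255) + 2;  /*  9 */
    unsigned const minBits = minBitsSrc < minBitsSymbols ? minBitsSrc
                                                         : minBitsSymbols;
    assert(minBits == 9);
}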
diff --git a/lib/zstd/compress/hist.c b/lib/zstd/compress/hist.c
index 3ddc6dfb6894..0b12587cc14b 100644
--- a/lib/zstd/compress/hist.c
+++ b/lib/zstd/compress/hist.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/* ******************************************************************
* hist : Histogram functions
* part of Finite State Entropy project
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
*
* You can contact the author at :
* - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
diff --git a/lib/zstd/compress/hist.h b/lib/zstd/compress/hist.h
index fc1830abc9c6..f7687b0fc20a 100644
--- a/lib/zstd/compress/hist.h
+++ b/lib/zstd/compress/hist.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
/* ******************************************************************
* hist : Histogram functions
* part of Finite State Entropy project
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
*
* You can contact the author at :
* - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
diff --git a/lib/zstd/compress/huf_compress.c b/lib/zstd/compress/huf_compress.c
|
|
index 74ef0db47621..83241abafe35 100644
|
|
--- a/lib/zstd/compress/huf_compress.c
|
|
+++ b/lib/zstd/compress/huf_compress.c
|
|
@@ -1,6 +1,7 @@
|
|
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
|
|
/* ******************************************************************
|
|
* Huffman encoder, part of New Generation Entropy library
|
|
- * Copyright (c) Yann Collet, Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
*
|
|
* You can contact the author at :
|
|
* - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
|
|
@@ -26,9 +27,9 @@
|
|
#include "hist.h"
|
|
#define FSE_STATIC_LINKING_ONLY /* FSE_optimalTableLog_internal */
|
|
#include "../common/fse.h" /* header compression */
|
|
-#define HUF_STATIC_LINKING_ONLY
|
|
#include "../common/huf.h"
|
|
#include "../common/error_private.h"
|
|
+#include "../common/bits.h" /* ZSTD_highbit32 */
|
|
|
|
|
|
/* **************************************************************
|
|
@@ -39,13 +40,67 @@
|
|
|
|
|
|
/* **************************************************************
|
|
-* Utils
|
|
+* Required declarations
|
|
****************************************************************/
|
|
-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
|
|
+typedef struct nodeElt_s {
|
|
+ U32 count;
|
|
+ U16 parent;
|
|
+ BYTE byte;
|
|
+ BYTE nbBits;
|
|
+} nodeElt;
|
|
+
|
|
+
|
|
+/* **************************************************************
|
|
+* Debug Traces
|
|
+****************************************************************/
|
|
+
|
|
+#if DEBUGLEVEL >= 2
|
|
+
|
|
+static size_t showU32(const U32* arr, size_t size)
|
|
+{
|
|
+ size_t u;
|
|
+ for (u=0; u<size; u++) {
|
|
+ RAWLOG(6, " %u", arr[u]); (void)arr;
|
|
+ }
|
|
+ RAWLOG(6, " \n");
|
|
+ return size;
|
|
+}
|
|
+
|
|
+static size_t HUF_getNbBits(HUF_CElt elt);
|
|
+
|
|
+static size_t showCTableBits(const HUF_CElt* ctable, size_t size)
|
|
{
|
|
- return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
|
|
+ size_t u;
|
|
+ for (u=0; u<size; u++) {
|
|
+ RAWLOG(6, " %zu", HUF_getNbBits(ctable[u])); (void)ctable;
|
|
+ }
|
|
+ RAWLOG(6, " \n");
|
|
+ return size;
|
|
+
|
|
}
|
|
|
|
+static size_t showHNodeSymbols(const nodeElt* hnode, size_t size)
|
|
+{
|
|
+ size_t u;
|
|
+ for (u=0; u<size; u++) {
|
|
+ RAWLOG(6, " %u", hnode[u].byte); (void)hnode;
|
|
+ }
|
|
+ RAWLOG(6, " \n");
|
|
+ return size;
|
|
+}
|
|
+
|
|
+static size_t showHNodeBits(const nodeElt* hnode, size_t size)
|
|
+{
|
|
+ size_t u;
|
|
+ for (u=0; u<size; u++) {
|
|
+ RAWLOG(6, " %u", hnode[u].nbBits); (void)hnode;
|
|
+ }
|
|
+ RAWLOG(6, " \n");
|
|
+ return size;
|
|
+}
|
|
+
|
|
+#endif
|
|
+
|
|
|
|
/* *******************************************************
|
|
* HUF : Huffman block compression
|
|
@@ -86,7 +141,10 @@ typedef struct {
 S16 norm[HUF_TABLELOG_MAX+1];
 } HUF_CompressWeightsWksp;

-static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightTable, size_t wtSize, void* workspace, size_t workspaceSize)
+static size_t
+HUF_compressWeights(void* dst, size_t dstSize,
+ const void* weightTable, size_t wtSize,
+ void* workspace, size_t workspaceSize)
 {
 BYTE* const ostart = (BYTE*) dst;
 BYTE* op = ostart;
@@ -137,7 +195,7 @@ static size_t HUF_getNbBitsFast(HUF_CElt elt)

 static size_t HUF_getValue(HUF_CElt elt)
 {
- return elt & ~0xFF;
+ return elt & ~(size_t)0xFF;
 }

 static size_t HUF_getValueFast(HUF_CElt elt)
@@ -175,6 +233,8 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
 U32 n;
 HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));

+ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE >= sizeof(HUF_WriteCTableWksp));
+
 /* check conditions */
 if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC);
 if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
@@ -204,16 +264,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
 return ((maxSymbolValue+1)/2) + 1;
 }

-/*! HUF_writeCTable() :
- `CTable` : Huffman tree to save, using huf representation.
- @return : size of saved CTable */
-size_t HUF_writeCTable (void* dst, size_t maxDstSize,
- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog)
-{
- HUF_WriteCTableWksp wksp;
- return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp));
-}
-

 size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights)
 {
@@ -269,68 +319,64 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void

 U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue)
 {
- const HUF_CElt* ct = CTable + 1;
+ const HUF_CElt* const ct = CTable + 1;
 assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
 return (U32)HUF_getNbBits(ct[symbolValue]);
 }


-typedef struct nodeElt_s {
- U32 count;
- U16 parent;
- BYTE byte;
- BYTE nbBits;
-} nodeElt;
-
 /*
 * HUF_setMaxHeight():
- * Enforces maxNbBits on the Huffman tree described in huffNode.
+ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode.
 *
- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts
- * the tree to so that it is a valid canonical Huffman tree.
+ * It attempts to convert all nodes with nbBits > @targetNbBits
+ * to employ @targetNbBits instead. Then it adjusts the tree
+ * so that it remains a valid canonical Huffman tree.
 *
 * @pre The sum of the ranks of each symbol == 2^largestBits,
 * where largestBits == huffNode[lastNonNull].nbBits.
 * @post The sum of the ranks of each symbol == 2^largestBits,
- * where largestBits is the return value <= maxNbBits.
+ * where largestBits is the return value (expected <= targetNbBits).
 *
- * @param huffNode The Huffman tree modified in place to enforce maxNbBits.
+ * @param huffNode The Huffman tree modified in place to enforce targetNbBits.
+ * It's presumed sorted, from most frequent to rarest symbol.
 * @param lastNonNull The symbol with the lowest count in the Huffman tree.
- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree
+ * @param targetNbBits The allowed number of bits, which the Huffman tree
 * may not respect. After this function the Huffman tree will
- * respect maxNbBits.
- * @return The maximum number of bits of the Huffman tree after adjustment,
- * necessarily no more than maxNbBits.
+ * respect targetNbBits.
+ * @return The maximum number of bits of the Huffman tree after adjustment.
 */
-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits)
 {
 const U32 largestBits = huffNode[lastNonNull].nbBits;
- /* early exit : no elt > maxNbBits, so the tree is already valid. */
- if (largestBits <= maxNbBits) return largestBits;
+ /* early exit : no elt > targetNbBits, so the tree is already valid. */
+ if (largestBits <= targetNbBits) return largestBits;
+
+ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits);

 /* there are several too large elements (at least >= 2) */
 { int totalCost = 0;
- const U32 baseCost = 1 << (largestBits - maxNbBits);
+ const U32 baseCost = 1 << (largestBits - targetNbBits);
 int n = (int)lastNonNull;

- /* Adjust any ranks > maxNbBits to maxNbBits.
+ /* Adjust any ranks > targetNbBits to targetNbBits.
 * Compute totalCost, which is how far the sum of the ranks is
 * we are over 2^largestBits after adjust the offending ranks.
 */
- while (huffNode[n].nbBits > maxNbBits) {
+ while (huffNode[n].nbBits > targetNbBits) {
 totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
- huffNode[n].nbBits = (BYTE)maxNbBits;
+ huffNode[n].nbBits = (BYTE)targetNbBits;
 n--;
 }
- /* n stops at huffNode[n].nbBits <= maxNbBits */
- assert(huffNode[n].nbBits <= maxNbBits);
- /* n end at index of smallest symbol using < maxNbBits */
- while (huffNode[n].nbBits == maxNbBits) --n;
+ /* n stops at huffNode[n].nbBits <= targetNbBits */
+ assert(huffNode[n].nbBits <= targetNbBits);
+ /* n end at index of smallest symbol using < targetNbBits */
+ while (huffNode[n].nbBits == targetNbBits) --n;

- /* renorm totalCost from 2^largestBits to 2^maxNbBits
+ /* renorm totalCost from 2^largestBits to 2^targetNbBits
 * note : totalCost is necessarily a multiple of baseCost */
- assert((totalCost & (baseCost - 1)) == 0);
- totalCost >>= (largestBits - maxNbBits);
+ assert(((U32)totalCost & (baseCost - 1)) == 0);
+ totalCost >>= (largestBits - targetNbBits);
 assert(totalCost > 0);

 /* repay normalized cost */
@@ -339,19 +385,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)

 /* Get pos of last (smallest = lowest cum. count) symbol per rank */
 ZSTD_memset(rankLast, 0xF0, sizeof(rankLast));
- { U32 currentNbBits = maxNbBits;
+ { U32 currentNbBits = targetNbBits;
 int pos;
 for (pos=n ; pos >= 0; pos--) {
 if (huffNode[pos].nbBits >= currentNbBits) continue;
- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */
- rankLast[maxNbBits-currentNbBits] = (U32)pos;
+ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */
+ rankLast[targetNbBits-currentNbBits] = (U32)pos;
 } }

 while (totalCost > 0) {
 /* Try to reduce the next power of 2 above totalCost because we
 * gain back half the rank.
 */
- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1;
+ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1;
 for ( ; nBitsToDecrease > 1; nBitsToDecrease--) {
 U32 const highPos = rankLast[nBitsToDecrease];
 U32 const lowPos = rankLast[nBitsToDecrease-1];
@@ -391,7 +437,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
 rankLast[nBitsToDecrease] = noSymbol;
 else {
 rankLast[nBitsToDecrease]--;
- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease)
+ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease)
 rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */
 }
 } /* while (totalCost > 0) */
@@ -403,11 +449,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
 * TODO.
 */
 while (totalCost < 0) { /* Sometimes, cost correction overshoot */
- /* special case : no rank 1 symbol (using maxNbBits-1);
- * let's create one from largest rank 0 (using maxNbBits).
+ /* special case : no rank 1 symbol (using targetNbBits-1);
+ * let's create one from largest rank 0 (using targetNbBits).
 */
 if (rankLast[1] == noSymbol) {
- while (huffNode[n].nbBits == maxNbBits) n--;
+ while (huffNode[n].nbBits == targetNbBits) n--;
 huffNode[n+1].nbBits--;
 assert(n >= 0);
 rankLast[1] = (U32)(n+1);
@@ -421,7 +467,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
 } /* repay normalized cost */
 } /* there are several too large elements (at least >= 2) */

- return maxNbBits;
+ return targetNbBits;
 }

 typedef struct {
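[Editorial aside, not part of the patch] The HUF_setMaxHeight() hunks above rename maxNbBits to targetNbBits but keep the algorithm: clamp over-long code lengths, then repay the resulting Kraft-sum debt. A minimal C sketch of the debt computation, with invented names and assuming code lengths sorted so the longest codes sit at the end of the array:

    #include <assert.h>

    /* Each leaf of depth d contributes 2^(largestBits - d) to a Kraft sum
     * of 2^largestBits. Clamping a leaf to targetNbBits raises its
     * contribution to baseCost; totalCost accumulates the overshoot. The
     * aggregate debt is a multiple of baseCost (a consequence of the
     * Kraft-sum precondition), so it can be normalized to units of
     * 2^(largestBits - targetNbBits). */
    static unsigned clampAndComputeDebt(unsigned char* nbBits, int lastNonNull,
                                        unsigned largestBits, unsigned targetNbBits)
    {
        int totalCost = 0;
        unsigned const baseCost = 1u << (largestBits - targetNbBits);
        int n = lastNonNull;
        while (n >= 0 && nbBits[n] > targetNbBits) {
            totalCost += (int)baseCost - (1 << (largestBits - nbBits[n]));
            nbBits[n] = (unsigned char)targetNbBits;
            n--;
        }
        assert(((unsigned)totalCost & (baseCost - 1)) == 0);
        return (unsigned)totalCost >> (largestBits - targetNbBits);
    }

The repayment loops in the real function then lengthen or shorten selected codes until the sum is exactly 2^targetNbBits again.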
@@ -429,7 +475,7 @@ typedef struct {
 U16 curr;
 } rankPos;

-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
+typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)];

 /* Number of buckets available for HUF_sort() */
 #define RANK_POSITION_TABLE_SIZE 192
@@ -448,8 +494,8 @@ typedef struct {
 * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing.
 */
 #define RANK_POSITION_MAX_COUNT_LOG 32
-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */
-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */
+#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */)
+#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */)

 /* Return the appropriate bucket index for a given count. See definition of
 * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
@@ -457,7 +503,7 @@ typedef struct {
 static U32 HUF_getIndex(U32 const count) {
 return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF)
 ? count
- : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN;
+ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN;
 }

 /* Helper swap function for HUF_quickSortPartition() */
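[Editorial aside, not part of the patch] The bucketing comment above pairs small counts with dedicated buckets and large counts with shared log2 buckets; HUF_getIndex is the whole mapping. A standalone sketch with the zstd constants inlined (the portable highbit32 below stands in for ZSTD_highbit32):

    #include <stdint.h>
    #include <stdio.h>

    static unsigned highbit32(uint32_t v)   /* index of the highest set bit */
    {
        unsigned r = 0;
        while (v >>= 1) r++;
        return r;
    }

    static uint32_t bucketIndex(uint32_t count)
    {
        uint32_t const logBucketsBegin = 158; /* RANK_POSITION_LOG_BUCKETS_BEGIN */
        uint32_t const cutoff = 166;          /* RANK_POSITION_DISTINCT_COUNT_CUTOFF */
        return (count < cutoff) ? count : highbit32(count) + logBucketsBegin;
    }

    int main(void)
    {
        /* 5 -> bucket 5, 1000 -> bucket 167, 2^20 -> bucket 178 */
        printf("%u %u %u\n", bucketIndex(5), bucketIndex(1000), bucketIndex(1u << 20));
        return 0;
    }

Small, distinct counts keep exact ordering; rare huge counts only need coarse ordering, so a handful of log2 buckets suffices.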
@@ -580,7 +626,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy

 /* Sort each bucket. */
 for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) {
- U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base;
+ int const bucketSize = rankPosition[n].curr - rankPosition[n].base;
 U32 const bucketStartIdx = rankPosition[n].base;
 if (bucketSize > 1) {
 assert(bucketStartIdx < maxSymbolValue1);
@@ -591,6 +637,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy
 assert(HUF_isSorted(huffNode, maxSymbolValue1));
 }

+
 /* HUF_buildCTable_wksp() :
 * Same as HUF_buildCTable(), but using externally allocated scratch buffer.
 * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables).
@@ -611,6 +658,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
 int lowS, lowN;
 int nodeNb = STARTNODE;
 int n, nodeRoot;
+ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1);
 /* init for parents */
 nonNullRank = (int)maxSymbolValue;
 while(huffNode[nonNullRank].count == 0) nonNullRank--;
@@ -637,6 +685,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
 for (n=0; n<=nonNullRank; n++)
 huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;

+ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1));
+
 return nonNullRank;
 }

@@ -674,28 +724,36 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i
 CTable[0] = maxNbBits;
 }

-size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
+size_t
+HUF_buildCTable_wksp(HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
+ void* workSpace, size_t wkspSize)
 {
- HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
+ HUF_buildCTable_wksp_tables* const wksp_tables =
+ (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
 nodeElt* const huffNode0 = wksp_tables->huffNodeTbl;
 nodeElt* const huffNode = huffNode0+1;
 int nonNullRank;

+ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables));
+
+ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1);
+
 /* safety checks */
 if (wkspSize < sizeof(HUF_buildCTable_wksp_tables))
- return ERROR(workSpace_tooSmall);
+ return ERROR(workSpace_tooSmall);
 if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
 if (maxSymbolValue > HUF_SYMBOLVALUE_MAX)
- return ERROR(maxSymbolValue_tooLarge);
+ return ERROR(maxSymbolValue_tooLarge);
 ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable));

 /* sort, decreasing order */
 HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition);
+ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1));

 /* build tree */
 nonNullRank = HUF_buildTree(huffNode, maxSymbolValue);

- /* enforce maxTableLog */
+ /* determine and enforce maxTableLog */
 maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
 if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */

@@ -804,7 +862,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id
 #if DEBUGLEVEL >= 1
 {
 size_t const nbBits = HUF_getNbBits(elt);
- size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1;
+ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1;
 (void)dirtyBits;
 /* Middle bits are 0. */
 assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0);
@@ -884,7 +942,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC)
 {
 size_t const nbBits = bitC->bitPos[0] & 0xFF;
 if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
- return (bitC->ptr - bitC->startPtr) + (nbBits > 0);
+ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0);
 }
 }

@@ -1045,9 +1103,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize,
 static size_t
 HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
 const void* src, size_t srcSize,
- const HUF_CElt* CTable, const int bmi2)
+ const HUF_CElt* CTable, const int flags)
 {
- if (bmi2) {
+ if (flags & HUF_flags_bmi2) {
 return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable);
 }
 return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable);
@@ -1058,28 +1116,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
 static size_t
 HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
 const void* src, size_t srcSize,
- const HUF_CElt* CTable, const int bmi2)
+ const HUF_CElt* CTable, const int flags)
 {
- (void)bmi2;
+ (void)flags;
 return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
 }

 #endif

-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
-{
- return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
-}
-
-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
 {
- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
+ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
 }

 static size_t
 HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
 const void* src, size_t srcSize,
- const HUF_CElt* CTable, int bmi2)
+ const HUF_CElt* CTable, int flags)
 {
 size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */
 const BYTE* ip = (const BYTE*) src;
@@ -1093,7 +1146,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
 op += 6; /* jumpTable */

 assert(op <= oend);
- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
 if (cSize == 0 || cSize > 65535) return 0;
 MEM_writeLE16(ostart, (U16)cSize);
 op += cSize;
@@ -1101,7 +1154,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,

 ip += segmentSize;
 assert(op <= oend);
- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
 if (cSize == 0 || cSize > 65535) return 0;
 MEM_writeLE16(ostart+2, (U16)cSize);
 op += cSize;
@@ -1109,7 +1162,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,

 ip += segmentSize;
 assert(op <= oend);
- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
 if (cSize == 0 || cSize > 65535) return 0;
 MEM_writeLE16(ostart+4, (U16)cSize);
 op += cSize;
@@ -1118,7 +1171,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
 ip += segmentSize;
 assert(op <= oend);
 assert(ip <= iend);
- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) );
+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) );
 if (cSize == 0 || cSize > 65535) return 0;
 op += cSize;
 }
@@ -1126,14 +1179,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
 return (size_t)(op-ostart);
 }

-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
 {
- return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
-}
-
-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
-{
- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
+ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
 }

 typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
@@ -1141,11 +1189,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
 static size_t HUF_compressCTable_internal(
 BYTE* const ostart, BYTE* op, BYTE* const oend,
 const void* src, size_t srcSize,
- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2)
+ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags)
 {
 size_t const cSize = (nbStreams==HUF_singleStream) ?
- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) :
- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2);
+ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) :
+ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags);
 if (HUF_isError(cSize)) { return cSize; }
 if (cSize==0) { return 0; } /* uncompressible */
 op += cSize;
@@ -1168,6 +1216,79 @@ typedef struct {
 #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096
 #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */

+unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue)
+{
+ unsigned cardinality = 0;
+ unsigned i;
+
+ for (i = 0; i < maxSymbolValue + 1; i++) {
+ if (count[i] != 0) cardinality += 1;
+ }
+
+ return cardinality;
+}
+
+unsigned HUF_minTableLog(unsigned symbolCardinality)
+{
+ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1;
+ return minBitsSymbols;
+}
+
+unsigned HUF_optimalTableLog(
+ unsigned maxTableLog,
+ size_t srcSize,
+ unsigned maxSymbolValue,
+ void* workSpace, size_t wkspSize,
+ HUF_CElt* table,
+ const unsigned* count,
+ int flags)
+{
+ assert(srcSize > 1); /* Not supported, RLE should be used instead */
+ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables));
+
+ if (!(flags & HUF_flags_optimalDepth)) {
+ /* cheap evaluation, based on FSE */
+ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+ }
+
+ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp);
+ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp);
+ size_t maxBits, hSize, newSize;
+ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue);
+ const unsigned minTableLog = HUF_minTableLog(symbolCardinality);
+ size_t optSize = ((size_t) ~0) - 1;
+ unsigned optLog = maxTableLog, optLogGuess;
+
+ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize);
+
+ /* Search until size increases */
+ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) {
+ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess);
+ maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize);
+ if (ERR_isError(maxBits)) continue;
+
+ if (maxBits < optLogGuess && optLogGuess > minTableLog) break;
+
+ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize);
+
+ if (ERR_isError(hSize)) continue;
+
+ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize;
+
+ if (newSize > optSize + 1) {
+ break;
+ }
+
+ if (newSize < optSize) {
+ optSize = newSize;
+ optLog = optLogGuess;
+ }
+ }
+ assert(optLog <= HUF_TABLELOG_MAX);
+ return optLog;
+ }
+}
+
 /* HUF_compress_internal() :
 * `workSpace_align4` must be aligned on 4-bytes boundaries,
 * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */
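[Editorial aside, not part of the patch] HUF_optimalTableLog() above probes table depths from minTableLog upward, scoring each guess as header size plus estimated payload, and stops once the total starts growing. The search pattern in isolation, with evalCost() as a hypothetical stand-in for the build-and-estimate step:

    #include <stddef.h>

    typedef size_t (*cost_fn)(unsigned logGuess, void* ctx);

    static unsigned probeBestLog(unsigned minLog, unsigned maxLog,
                                 cost_fn evalCost, void* ctx)
    {
        size_t optSize = ((size_t)~0) - 1;  /* so optSize + 1 never wraps to 0 */
        unsigned optLog = maxLog;
        unsigned guess;
        for (guess = minLog; guess <= maxLog; guess++) {
            size_t const cost = evalCost(guess, ctx);
            if (cost > optSize + 1) break;  /* size started growing: stop early */
            if (cost < optSize) { optSize = cost; optLog = guess; }
        }
        return optLog;
    }

The "+ 1" slack tolerates a one-byte plateau before giving up, mirroring the real loop; the early break is what keeps the probe cheap relative to an exhaustive scan.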
@@ -1177,14 +1298,14 @@ HUF_compress_internal (void* dst, size_t dstSize,
 unsigned maxSymbolValue, unsigned huffLog,
 HUF_nbStreams_e nbStreams,
 void* workSpace, size_t wkspSize,
- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
- const int bmi2, unsigned suspectUncompressible)
+ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags)
 {
 HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t));
 BYTE* const ostart = (BYTE*)dst;
 BYTE* const oend = ostart + dstSize;
 BYTE* op = ostart;

+ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize);
 HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE);

 /* checks & inits */
@@ -1198,16 +1319,17 @@ HUF_compress_internal (void* dst, size_t dstSize,
 if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT;

 /* Heuristic : If old table is valid, use it for small inputs */
- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) {
+ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) {
 return HUF_compressCTable_internal(ostart, op, oend,
 src, srcSize,
- nbStreams, oldHufTable, bmi2);
+ nbStreams, oldHufTable, flags);
 }

 /* If uncompressible data is suspected, do a smaller sampling first */
 DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2);
- if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
+ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
 size_t largestTotal = 0;
+ DEBUGLOG(5, "input suspected incompressible : sampling to check");
 { unsigned maxSymbolValueBegin = maxSymbolValue;
 CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
 largestTotal += largestBegin;
@@ -1224,6 +1346,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
 if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */
 if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */
 }
+ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1));

 /* Check validity of previous table */
 if ( repeat
@@ -1232,19 +1355,20 @@ HUF_compress_internal (void* dst, size_t dstSize,
 *repeat = HUF_repeat_none;
 }
 /* Heuristic : use existing table for small inputs */
- if (preferRepeat && repeat && *repeat != HUF_repeat_none) {
+ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) {
 return HUF_compressCTable_internal(ostart, op, oend,
 src, srcSize,
- nbStreams, oldHufTable, bmi2);
+ nbStreams, oldHufTable, flags);
 }

 /* Build Huffman Tree */
- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags);
 { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count,
 maxSymbolValue, huffLog,
 &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp));
 CHECK_F(maxBits);
 huffLog = (U32)maxBits;
+ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1));
 }
 /* Zero unused symbols in CTable, so we can check it for validity */
 {
@@ -1263,7 +1387,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
 if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
 return HUF_compressCTable_internal(ostart, op, oend,
 src, srcSize,
- nbStreams, oldHufTable, bmi2);
+ nbStreams, oldHufTable, flags);
 } }

 /* Use the new huffman table */
@@ -1275,46 +1399,20 @@ HUF_compress_internal (void* dst, size_t dstSize,
 }
 return HUF_compressCTable_internal(ostart, op, oend,
 src, srcSize,
- nbStreams, table->CTable, bmi2);
-}
-
-
-size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
- const void* src, size_t srcSize,
- unsigned maxSymbolValue, unsigned huffLog,
- void* workSpace, size_t wkspSize)
-{
- return HUF_compress_internal(dst, dstSize, src, srcSize,
- maxSymbolValue, huffLog, HUF_singleStream,
- workSpace, wkspSize,
- NULL, NULL, 0, 0 /*bmi2*/, 0);
+ nbStreams, table->CTable, flags);
 }

 size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
 const void* src, size_t srcSize,
 unsigned maxSymbolValue, unsigned huffLog,
 void* workSpace, size_t wkspSize,
- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat,
- int bmi2, unsigned suspectUncompressible)
+ HUF_CElt* hufTable, HUF_repeat* repeat, int flags)
 {
+ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize);
 return HUF_compress_internal(dst, dstSize, src, srcSize,
 maxSymbolValue, huffLog, HUF_singleStream,
 workSpace, wkspSize, hufTable,
- repeat, preferRepeat, bmi2, suspectUncompressible);
-}
-
-/* HUF_compress4X_repeat():
- * compress input using 4 streams.
- * provide workspace to generate compression tables */
-size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
- const void* src, size_t srcSize,
- unsigned maxSymbolValue, unsigned huffLog,
- void* workSpace, size_t wkspSize)
-{
- return HUF_compress_internal(dst, dstSize, src, srcSize,
- maxSymbolValue, huffLog, HUF_fourStreams,
- workSpace, wkspSize,
- NULL, NULL, 0, 0 /*bmi2*/, 0);
+ repeat, flags);
 }

 /* HUF_compress4X_repeat():
@@ -1325,11 +1423,11 @@ size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
 const void* src, size_t srcSize,
 unsigned maxSymbolValue, unsigned huffLog,
 void* workSpace, size_t wkspSize,
- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible)
+ HUF_CElt* hufTable, HUF_repeat* repeat, int flags)
 {
+ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize);
 return HUF_compress_internal(dst, dstSize, src, srcSize,
 maxSymbolValue, huffLog, HUF_fourStreams,
 workSpace, wkspSize,
- hufTable, repeat, preferRepeat, bmi2, suspectUncompressible);
+ hufTable, repeat, flags);
 }
-
diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c
index f620cafca633..c1c316e9e289 100644
--- a/lib/zstd/compress/zstd_compress.c
+++ b/lib/zstd/compress/zstd_compress.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
@@ -11,12 +12,12 @@
 /*-*************************************
 * Dependencies
 ***************************************/
+#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
 #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */
 #include "../common/mem.h"
 #include "hist.h" /* HIST_countFast_wksp */
 #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */
 #include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include "zstd_compress_internal.h"
 #include "zstd_compress_sequences.h"
@@ -27,6 +28,7 @@
 #include "zstd_opt.h"
 #include "zstd_ldm.h"
 #include "zstd_compress_superblock.h"
+#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */

 /* ***************************************************************
 * Tuning parameters
@@ -55,14 +57,17 @@
 * Helper functions
 ***************************************/
 /* ZSTD_compressBound()
- * Note that the result from this function is only compatible with the "normal"
- * full-block strategy.
- * When there are a lot of small blocks due to frequent flush in streaming mode
- * the overhead of headers can make the compressed data to be larger than the
- * return value of ZSTD_compressBound().
+ * Note that the result from this function is only valid for
+ * the one-pass compression functions.
+ * When employing the streaming mode,
+ * if flushes are frequently altering the size of blocks,
+ * the overhead from block headers can make the compressed data larger
+ * than the return value of ZSTD_compressBound().
 */
 size_t ZSTD_compressBound(size_t srcSize) {
- return ZSTD_COMPRESSBOUND(srcSize);
+ size_t const r = ZSTD_COMPRESSBOUND(srcSize);
+ if (r==0) return ERROR(srcSize_wrong);
+ return r;
 }

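[Editorial aside, not part of the patch] The revised comment above narrows the guarantee of ZSTD_compressBound() to one-pass compression. A sketch of the usual pattern it supports, using only public zstd API (ZSTD_compressBound, ZSTD_compress, ZSTD_isError):

    #include <stdlib.h>
    #include <zstd.h>

    /* One-pass compression: the bound sizes the destination buffer, so
     * ZSTD_compress() can never fail for lack of dst capacity. */
    static void* compress_once(const void* src, size_t srcSize, size_t* cSize)
    {
        size_t const bound = ZSTD_compressBound(srcSize);
        void* const dst = malloc(bound);
        if (dst == NULL) return NULL;
        *cSize = ZSTD_compress(dst, bound, src, srcSize, /* level */ 3);
        if (ZSTD_isError(*cSize)) { free(dst); return NULL; }
        return dst;
    }

For streaming with frequent flushes, the caller must instead size buffers per flush, since per-block header overhead can exceed the one-pass bound.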
@@ -171,12 +176,9 @@ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
 if (cctx==NULL) return 0; /* support free on NULL */
 RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
 "not compatible with static CCtx");
- {
- int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx);
+ { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx);
 ZSTD_freeCCtxContent(cctx);
- if (!cctxInWorkspace) {
- ZSTD_customFree(cctx, cctx->customMem);
- }
+ if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem);
 }
 return 0;
 }
@@ -257,9 +259,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy,
 return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder));
 }

-/* Returns 1 if compression parameters are such that we should
+/* Returns ZSTD_ps_enable if compression parameters are such that we should
 * enable long distance matching (wlog >= 27, strategy >= btopt).
- * Returns 0 otherwise.
+ * Returns ZSTD_ps_disable otherwise.
 */
 static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
 const ZSTD_compressionParameters* const cParams) {
@@ -267,6 +269,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
 return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable;
 }

+static int ZSTD_resolveExternalSequenceValidation(int mode) {
+ return mode;
+}
+
+/* Resolves maxBlockSize to the default if no value is present. */
+static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) {
+ if (maxBlockSize == 0) {
+ return ZSTD_BLOCKSIZE_MAX;
+ } else {
+ return maxBlockSize;
+ }
+}
+
+static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) {
+ if (value != ZSTD_ps_auto) return value;
+ if (cLevel < 10) {
+ return ZSTD_ps_disable;
+ } else {
+ return ZSTD_ps_enable;
+ }
+}
+
+/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged.
+ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */
+static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) {
+ return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast;
+}
+
 static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
 ZSTD_compressionParameters cParams)
 {
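[Editorial aside, not part of the patch] The new ZSTD_resolve*() helpers above share one shape: an explicit setting always wins, and ZSTD_ps_auto falls back to a context-dependent default. The pattern in isolation, with an invented enum and threshold standing in for the real ones:

    /* Illustrative stand-ins; the real code uses ZSTD_paramSwitch_e and a
     * compression-level cutoff of 10 for external repcode search. */
    typedef enum { PS_AUTO = 0, PS_ENABLE = 1, PS_DISABLE = 2 } paramSwitch_e;

    static paramSwitch_e resolveSwitch(paramSwitch_e requested, int cLevel)
    {
        if (requested != PS_AUTO) return requested;     /* user choice wins */
        return (cLevel < 10) ? PS_DISABLE : PS_ENABLE;  /* auto default */
    }

Centralizing the auto-resolution keeps every downstream consumer reading a concrete enable/disable value rather than re-deriving the default.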
@@ -284,6 +314,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
 }
 cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams);
 cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
+ cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences);
+ cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize);
+ cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes,
+ cctxParams.compressionLevel);
 assert(!ZSTD_checkCParams(cParams));
 return cctxParams;
 }
@@ -329,10 +363,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel)
 #define ZSTD_NO_CLEVEL 0

 /*
- * Initializes the cctxParams from params and compressionLevel.
+ * Initializes `cctxParams` from `params` and `compressionLevel`.
 * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL.
 */
-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel)
+static void
+ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams,
+ const ZSTD_parameters* params,
+ int compressionLevel)
 {
 assert(!ZSTD_checkCParams(params->cParams));
 ZSTD_memset(cctxParams, 0, sizeof(*cctxParams));
@@ -345,6 +382,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par
 cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams);
 cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams);
 cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams);
+ cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences);
+ cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize);
+ cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel);
 DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d",
 cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm);
 }
@@ -359,7 +399,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete

 /*
 * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone.
- * @param param Validated zstd parameters.
+ * @param params Validated zstd parameters.
 */
 static void ZSTD_CCtxParams_setZstdParams(
 ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params)
@@ -455,8 +495,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
 return bounds;

 case ZSTD_c_enableLongDistanceMatching:
- bounds.lowerBound = 0;
- bounds.upperBound = 1;
+ bounds.lowerBound = (int)ZSTD_ps_auto;
+ bounds.upperBound = (int)ZSTD_ps_disable;
 return bounds;

 case ZSTD_c_ldmHashLog:
@@ -549,6 +589,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
 bounds.upperBound = 1;
 return bounds;

+ case ZSTD_c_prefetchCDictTables:
+ bounds.lowerBound = (int)ZSTD_ps_auto;
+ bounds.upperBound = (int)ZSTD_ps_disable;
+ return bounds;
+
+ case ZSTD_c_enableSeqProducerFallback:
+ bounds.lowerBound = 0;
+ bounds.upperBound = 1;
+ return bounds;
+
+ case ZSTD_c_maxBlockSize:
+ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN;
+ bounds.upperBound = ZSTD_BLOCKSIZE_MAX;
+ return bounds;
+
+ case ZSTD_c_searchForExternalRepcodes:
+ bounds.lowerBound = (int)ZSTD_ps_auto;
+ bounds.upperBound = (int)ZSTD_ps_disable;
+ return bounds;
+
 default:
 bounds.error = ERROR(parameter_unsupported);
 return bounds;
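[Editorial aside, not part of the patch] The bounds cases added above feed the public ZSTD_cParam_getBounds() query, which lets callers validate or clamp a value before ZSTD_CCtx_setParameter() rejects it. A sketch using only public API:

    #include <zstd.h>

    /* Clamp a requested value into the advertised range, then apply it. */
    static size_t set_param_clamped(ZSTD_CCtx* cctx, ZSTD_cParameter p, int v)
    {
        ZSTD_bounds const b = ZSTD_cParam_getBounds(p);
        if (ZSTD_isError(b.error)) return b.error;  /* unknown parameter */
        if (v < b.lowerBound) v = b.lowerBound;
        if (v > b.upperBound) v = b.upperBound;
        return ZSTD_CCtx_setParameter(cctx, p, v);
    }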
@@ -613,6 +673,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
 case ZSTD_c_useBlockSplitter:
 case ZSTD_c_useRowMatchFinder:
 case ZSTD_c_deterministicRefPrefix:
+ case ZSTD_c_prefetchCDictTables:
+ case ZSTD_c_enableSeqProducerFallback:
+ case ZSTD_c_maxBlockSize:
+ case ZSTD_c_searchForExternalRepcodes:
 default:
 return 0;
 }
@@ -625,7 +689,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
 if (ZSTD_isUpdateAuthorized(param)) {
 cctx->cParamsChanged = 1;
 } else {
- RETURN_ERROR(stage_wrong, "can only set params in ctx init stage");
+ RETURN_ERROR(stage_wrong, "can only set params in cctx init stage");
 } }

 switch(param)
@@ -668,6 +732,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
 case ZSTD_c_useBlockSplitter:
 case ZSTD_c_useRowMatchFinder:
 case ZSTD_c_deterministicRefPrefix:
+ case ZSTD_c_prefetchCDictTables:
+ case ZSTD_c_enableSeqProducerFallback:
+ case ZSTD_c_maxBlockSize:
+ case ZSTD_c_searchForExternalRepcodes:
 break;

 default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
@@ -723,12 +791,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
 case ZSTD_c_minMatch :
 if (value!=0) /* 0 => use default */
 BOUNDCHECK(ZSTD_c_minMatch, value);
- CCtxParams->cParams.minMatch = value;
+ CCtxParams->cParams.minMatch = (U32)value;
 return CCtxParams->cParams.minMatch;

 case ZSTD_c_targetLength :
 BOUNDCHECK(ZSTD_c_targetLength, value);
- CCtxParams->cParams.targetLength = value;
+ CCtxParams->cParams.targetLength = (U32)value;
 return CCtxParams->cParams.targetLength;

 case ZSTD_c_strategy :
@@ -741,12 +809,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
 /* Content size written in frame header _when known_ (default:1) */
 DEBUGLOG(4, "set content size flag = %u", (value!=0));
 CCtxParams->fParams.contentSizeFlag = value != 0;
- return CCtxParams->fParams.contentSizeFlag;
+ return (size_t)CCtxParams->fParams.contentSizeFlag;

 case ZSTD_c_checksumFlag :
 /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */
 CCtxParams->fParams.checksumFlag = value != 0;
- return CCtxParams->fParams.checksumFlag;
+ return (size_t)CCtxParams->fParams.checksumFlag;

 case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */
 DEBUGLOG(4, "set dictIDFlag = %u", (value!=0));
@@ -755,18 +823,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,

 case ZSTD_c_forceMaxWindow :
 CCtxParams->forceWindow = (value != 0);
- return CCtxParams->forceWindow;
+ return (size_t)CCtxParams->forceWindow;

 case ZSTD_c_forceAttachDict : {
 const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value;
- BOUNDCHECK(ZSTD_c_forceAttachDict, pref);
+ BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref);
 CCtxParams->attachDictPref = pref;
 return CCtxParams->attachDictPref;
 }

 case ZSTD_c_literalCompressionMode : {
 const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value;
- BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm);
+ BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm);
 CCtxParams->literalCompressionMode = lcm;
 return CCtxParams->literalCompressionMode;
 }
@@ -789,47 +857,48 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,

 case ZSTD_c_enableDedicatedDictSearch :
 CCtxParams->enableDedicatedDictSearch = (value!=0);
- return CCtxParams->enableDedicatedDictSearch;
+ return (size_t)CCtxParams->enableDedicatedDictSearch;

 case ZSTD_c_enableLongDistanceMatching :
+ BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value);
 CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value;
 return CCtxParams->ldmParams.enableLdm;

 case ZSTD_c_ldmHashLog :
 if (value!=0) /* 0 ==> auto */
 BOUNDCHECK(ZSTD_c_ldmHashLog, value);
- CCtxParams->ldmParams.hashLog = value;
+ CCtxParams->ldmParams.hashLog = (U32)value;
 return CCtxParams->ldmParams.hashLog;

 case ZSTD_c_ldmMinMatch :
 if (value!=0) /* 0 ==> default */
 BOUNDCHECK(ZSTD_c_ldmMinMatch, value);
- CCtxParams->ldmParams.minMatchLength = value;
+ CCtxParams->ldmParams.minMatchLength = (U32)value;
 return CCtxParams->ldmParams.minMatchLength;

 case ZSTD_c_ldmBucketSizeLog :
 if (value!=0) /* 0 ==> default */
 BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value);
- CCtxParams->ldmParams.bucketSizeLog = value;
+ CCtxParams->ldmParams.bucketSizeLog = (U32)value;
 return CCtxParams->ldmParams.bucketSizeLog;

 case ZSTD_c_ldmHashRateLog :
 if (value!=0) /* 0 ==> default */
 BOUNDCHECK(ZSTD_c_ldmHashRateLog, value);
- CCtxParams->ldmParams.hashRateLog = value;
+ CCtxParams->ldmParams.hashRateLog = (U32)value;
 return CCtxParams->ldmParams.hashRateLog;

 case ZSTD_c_targetCBlockSize :
 if (value!=0) /* 0 ==> default */
 BOUNDCHECK(ZSTD_c_targetCBlockSize, value);
- CCtxParams->targetCBlockSize = value;
+ CCtxParams->targetCBlockSize = (U32)value;
 return CCtxParams->targetCBlockSize;

 case ZSTD_c_srcSizeHint :
 if (value!=0) /* 0 ==> default */
 BOUNDCHECK(ZSTD_c_srcSizeHint, value);
 CCtxParams->srcSizeHint = value;
- return CCtxParams->srcSizeHint;
+ return (size_t)CCtxParams->srcSizeHint;

 case ZSTD_c_stableInBuffer:
 BOUNDCHECK(ZSTD_c_stableInBuffer, value);
@@ -866,6 +935,27 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
 CCtxParams->deterministicRefPrefix = !!value;
 return CCtxParams->deterministicRefPrefix;

+ case ZSTD_c_prefetchCDictTables:
+ BOUNDCHECK(ZSTD_c_prefetchCDictTables, value);
+ CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value;
+ return CCtxParams->prefetchCDictTables;
+
+ case ZSTD_c_enableSeqProducerFallback:
+ BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value);
+ CCtxParams->enableMatchFinderFallback = value;
+ return CCtxParams->enableMatchFinderFallback;
+
+ case ZSTD_c_maxBlockSize:
+ if (value!=0) /* 0 ==> default */
+ BOUNDCHECK(ZSTD_c_maxBlockSize, value);
+ CCtxParams->maxBlockSize = value;
+ return CCtxParams->maxBlockSize;
+
+ case ZSTD_c_searchForExternalRepcodes:
+ BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value);
+ CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value;
+ return CCtxParams->searchForExternalRepcodes;
+
 default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
 }
 }
@@ -980,6 +1070,18 @@ size_t ZSTD_CCtxParams_getParameter(
 case ZSTD_c_deterministicRefPrefix:
 *value = (int)CCtxParams->deterministicRefPrefix;
 break;
+ case ZSTD_c_prefetchCDictTables:
+ *value = (int)CCtxParams->prefetchCDictTables;
+ break;
+ case ZSTD_c_enableSeqProducerFallback:
+ *value = CCtxParams->enableMatchFinderFallback;
+ break;
+ case ZSTD_c_maxBlockSize:
+ *value = (int)CCtxParams->maxBlockSize;
+ break;
+ case ZSTD_c_searchForExternalRepcodes:
+ *value = (int)CCtxParams->searchForExternalRepcodes;
+ break;
 default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
 }
 return 0;
@@ -1006,9 +1108,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams(
 return 0;
 }

+size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams)
+{
+ ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */);
+ DEBUGLOG(4, "ZSTD_CCtx_setCParams");
+ /* only update if all parameters are valid */
+ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), "");
+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), "");
+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), "");
+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), "");
+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), "");
+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), "");
+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), "");
+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), "");
+ return 0;
+}
+
+size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams)
+{
+ ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */);
+ DEBUGLOG(4, "ZSTD_CCtx_setFParams");
+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), "");
+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), "");
+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), "");
+ return 0;
+}
+
+size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params)
+{
+ DEBUGLOG(4, "ZSTD_CCtx_setParams");
+ /* First check cParams, because we want to update all or none. */
+ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), "");
+ /* Next set fParams, because this could fail if the cctx isn't in init stage. */
+ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), "");
+ /* Finally set cParams, which should succeed. */
+ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), "");
+ return 0;
+}
+
 size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize)
 {
- DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize);
+ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize);
 RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
 "Can't set pledgedSrcSize when not in init stage.");
 cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1;
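[Editorial aside, not part of the patch] ZSTD_CCtx_setParams() above validates cParams before touching anything, so the update is all-or-none. A hedged usage sketch, assuming a zstd build recent enough to expose these setters and ZSTD_getParams (both live behind ZSTD_STATIC_LINKING_ONLY upstream):

    #define ZSTD_STATIC_LINKING_ONLY /* ZSTD_getParams, ZSTD_CCtx_setParams */
    #include <zstd.h>

    /* Derive a full parameter set from a level, tweak it, apply atomically. */
    static size_t apply_level_params(ZSTD_CCtx* cctx, int level, size_t srcSize)
    {
        ZSTD_parameters params = ZSTD_getParams(level, srcSize, /* dictSize */ 0);
        params.fParams.checksumFlag = 1;           /* tweak before applying */
        return ZSTD_CCtx_setParams(cctx, params);  /* all-or-none update */
    }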
@@ -1024,9 +1164,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams(
 ZSTD_compressionParameters* cParams);

 /*
- * Initializes the local dict using the requested parameters.
- * NOTE: This does not use the pledged src size, because it may be used for more
- * than one compression.
+ * Initializes the local dictionary using requested parameters.
+ * NOTE: Initialization does not employ the pledged src size,
+ * because the dictionary may be used for multiple compressions.
 */
 static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
 {
@@ -1039,8 +1179,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
 return 0;
 }
 if (dl->cdict != NULL) {
- assert(cctx->cdict == dl->cdict);
 /* Local dictionary already initialized. */
+ assert(cctx->cdict == dl->cdict);
 return 0;
 }
 assert(dl->dictSize > 0);
@@ -1060,26 +1200,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
 }

 size_t ZSTD_CCtx_loadDictionary_advanced(
- ZSTD_CCtx* cctx, const void* dict, size_t dictSize,
- ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
+ ZSTD_CCtx* cctx,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType)
 {
- RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
- "Can't load a dictionary when ctx is not in init stage.");
 DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize);
- ZSTD_clearAllDicts(cctx); /* in case one already exists */
- if (dict == NULL || dictSize == 0) /* no dictionary mode */
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+ "Can't load a dictionary when cctx is not in init stage.");
+ ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */
+ if (dict == NULL || dictSize == 0) /* no dictionary */
 return 0;
 if (dictLoadMethod == ZSTD_dlm_byRef) {
 cctx->localDict.dict = dict;
 } else {
+ /* copy dictionary content inside CCtx to own its lifetime */
 void* dictBuffer;
 RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
- "no malloc for static CCtx");
+ "static CCtx can't allocate for an internal copy of dictionary");
 dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem);
- RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!");
+ RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation,
+ "allocation failed for dictionary content");
 ZSTD_memcpy(dictBuffer, dict, dictSize);
- cctx->localDict.dictBuffer = dictBuffer;
- cctx->localDict.dict = dictBuffer;
+ cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */
+ cctx->localDict.dict = dictBuffer; /* read-only reference */
 }
 cctx->localDict.dictSize = dictSize;
 cctx->localDict.dictContentType = dictContentType;
@@ -1149,8 +1293,9 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset)
 if ( (reset == ZSTD_reset_parameters)
 || (reset == ZSTD_reset_session_and_parameters) ) {
 RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
- "Can't reset parameters only when not in init stage.");
+ "Reset parameters is only possible during init stage.");
 ZSTD_clearAllDicts(cctx);
+ ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx));
 return ZSTD_CCtxParams_reset(&cctx->requestedParams);
 }
 return 0;
@@ -1247,7 +1392,8 @@ static ZSTD_compressionParameters
 ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
 unsigned long long srcSize,
 size_t dictSize,
- ZSTD_cParamMode_e mode)
+ ZSTD_cParamMode_e mode,
+ ZSTD_paramSwitch_e useRowMatchFinder)
 {
 const U64 minSrcSize = 513; /* (1<<9) + 1 */
 const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);
@@ -1281,8 +1427,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
 }

 /* resize windowLog if input is small enough, to use less memory */
- if ( (srcSize < maxWindowResize)
- && (dictSize < maxWindowResize) ) {
+ if ( (srcSize <= maxWindowResize)
+ && (dictSize <= maxWindowResize) ) {
 U32 const tSize = (U32)(srcSize + dictSize);
 static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN;
 U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN :
@@ -1300,6 +1446,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
 if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN)
 cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */

+ /* We can't use more than 32 bits of hash in total, so that means that we require:
+ * (hashLog + 8) <= 32 && (chainLog + 8) <= 32
+ */
+ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) {
+ U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS;
+ if (cPar.hashLog > maxShortCacheHashLog) {
+ cPar.hashLog = maxShortCacheHashLog;
+ }
+ if (cPar.chainLog > maxShortCacheHashLog) {
+ cPar.chainLog = maxShortCacheHashLog;
+ }
+ }
+
+
+ /* At this point, we aren't 100% sure if we are using the row match finder.
+ * Unless it is explicitly disabled, conservatively assume that it is enabled.
+ * In this case it will only be disabled for small sources, so shrinking the
+ * hash log a little bit shouldn't result in any ratio loss.
+ */
+ if (useRowMatchFinder == ZSTD_ps_auto)
+ useRowMatchFinder = ZSTD_ps_enable;
+
+ /* We can't hash more than 32-bits in total. So that means that we require:
+ * (hashLog - rowLog + 8) <= 32
+ */
+ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) {
+ /* Switch to 32-entry rows if searchLog is 5 (or more) */
+ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6);
+ U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS;
+ U32 const maxHashLog = maxRowHashLog + rowLog;
+ assert(cPar.hashLog >= rowLog);
+ if (cPar.hashLog > maxHashLog) {
+ cPar.hashLog = maxHashLog;
+ }
+ }
+
 return cPar;
 }

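[Editorial aside, not part of the patch] The comments in the hunk above derive hashLog caps from a 32-bit hash budget: hashLog - rowLog + tagBits <= 32. The arithmetic in isolation, assuming tagBits is 8 as ZSTD_ROW_HASH_TAG_BITS is upstream; for searchLog = 6 the cap works out to (32 - 8) + 6 = 30:

    /* Cap on hashLog so row-tagged hash entries still fit in 32 bits.
     * rowLog is searchLog clamped to [4, 6], mirroring BOUNDED(4, x, 6). */
    static unsigned maxHashLogForRow(unsigned searchLog, unsigned tagBits)
    {
        unsigned const rowLog = searchLog < 4 ? 4 : (searchLog > 6 ? 6 : searchLog);
        return (32 - tagBits) + rowLog;  /* from (hashLog - rowLog + tagBits) <= 32 */
    }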
@@ -1310,7 +1492,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar,
|
|
{
|
|
cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */
|
|
if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN;
|
|
- return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown);
|
|
+ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto);
|
|
}
|
|
|
|
static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode);
|
|
@@ -1341,7 +1523,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
|
|
ZSTD_overrideCParams(&cParams, &CCtxParams->cParams);
|
|
assert(!ZSTD_checkCParams(cParams));
|
|
/* srcSizeHint == 0 means 0 */
|
|
- return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode);
|
|
+ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder);
|
|
}
|
|
|
|
static size_t
|
|
@@ -1370,7 +1552,7 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
|
|
+ ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t))
|
|
+ ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
|
|
size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)
|
|
- ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16))
|
|
+ ? ZSTD_cwksp_aligned_alloc_size(hSize)
|
|
: 0;
|
|
size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt))
|
|
? optPotentialSpace
|
|
@@ -1386,6 +1568,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
|
|
return tableSpace + optSpace + slackSpace + lazyAdditionalSpace;
|
|
}
|
|
|
|
+/* Helper function for calculating memory requirements.
|
|
+ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */
|
|
+static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) {
|
|
+ U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4;
|
|
+ return blockSize / divider;
|
|
+}
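
A sequence consumes at least `divider` bytes of input (3 when minMatch==3, or when an external sequence producer may emit 3-byte matches; otherwise 4), so blockSize/divider bounds the number of sequences a block can hold. A quick standalone check of the bound (illustrative only):

#include <stdio.h>
#include <stddef.h>

/* same shape as ZSTD_maxNbSeq() above */
static size_t max_nb_seq(size_t blockSize, unsigned minMatch, int extSeqProducer) {
    unsigned const divider = (minMatch == 3 || extSeqProducer) ? 3 : 4;
    return blockSize / divider;
}

int main(void) {
    size_t const blockSize = 128 * 1024;            /* ZSTD_BLOCKSIZE_MAX */
    printf("%zu\n", max_nb_seq(blockSize, 4, 0));   /* 32768 */
    printf("%zu\n", max_nb_seq(blockSize, 3, 0));   /* 43690 */
    return 0;
}
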
+
static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
const ZSTD_compressionParameters* cParams,
const ldmParams_t* ldmParams,
@@ -1393,12 +1582,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
const ZSTD_paramSwitch_e useRowMatchFinder,
const size_t buffInSize,
const size_t buffOutSize,
- const U64 pledgedSrcSize)
+ const U64 pledgedSrcSize,
+ int useSequenceProducer,
+ size_t maxBlockSize)
{
size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize);
- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
- U32 const divider = (cParams->minMatch==3) ? 3 : 4;
- size_t const maxNbSeq = blockSize / divider;
+ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize);
+ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer);
size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize)
+ ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef))
+ 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
@@ -1417,6 +1607,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(

size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0;

+ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
+ size_t const externalSeqSpace = useSequenceProducer
+ ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence))
+ : 0;
+
size_t const neededSpace =
cctxSpace +
entropySpace +
@@ -1425,7 +1620,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
ldmSeqSpace +
matchStateSize +
tokenSpace +
- bufferSpace;
+ bufferSpace +
+ externalSeqSpace;

DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace);
return neededSpace;
@@ -1443,7 +1639,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
* be needed. However, we still allocate two 0-sized buffers, which can
* take space under ASAN. */
return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
- &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN);
+ &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize);
}

size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
@@ -1493,7 +1689,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
{ ZSTD_compressionParameters const cParams =
ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
+ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog);
size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered)
? ((size_t)1 << cParams.windowLog) + blockSize
: 0;
@@ -1504,7 +1700,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)

return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
&cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
- ZSTD_CONTENTSIZE_UNKNOWN);
+ ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize);
}
}

@@ -1637,6 +1833,19 @@ typedef enum {
ZSTD_resetTarget_CCtx
} ZSTD_resetTarget_e;

+/* Mixes the bits of a 64-bit value, based on XXH3_rrmxmx */
+static U64 ZSTD_bitmix(U64 val, U64 len) {
+ val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24);
+ val *= 0x9FB21C651E98DF25ULL;
+ val ^= (val >> 35) + len;
+ val *= 0x9FB21C651E98DF25ULL;
+ return val ^ (val >> 28);
+}
+
+/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */
+static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) {
+ ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4);
+}
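
The point of the salt is that two successive CCtx resets hash identical bytes to different tags, so stale tag-table contents can never be mistaken for valid entries, and the table no longer needs a memset on every reset. A minimal standalone sketch of the mixer and of advancing the salt twice (the entropy value here is an arbitrary stand-in for ms->hashSaltEntropy):

#include <stdint.h>
#include <stdio.h>

static uint64_t rotr64(uint64_t v, unsigned r) { return (v >> r) | (v << (64 - r)); }

/* same mixer as ZSTD_bitmix() above */
static uint64_t bitmix(uint64_t val, uint64_t len) {
    val ^= rotr64(val, 49) ^ rotr64(val, 24);
    val *= 0x9FB21C651E98DF25ULL;
    val ^= (val >> 35) + len;
    val *= 0x9FB21C651E98DF25ULL;
    return val ^ (val >> 28);
}

int main(void) {
    uint64_t salt = 0;
    uint64_t const entropy = 0x12345678;  /* stand-in for hashSaltEntropy */
    /* two resets produce two unrelated salts, hence unrelated tags */
    uint64_t const salt1 = bitmix(salt,  8) ^ bitmix(entropy, 4);
    uint64_t const salt2 = bitmix(salt1, 8) ^ bitmix(entropy, 4);
    printf("%016llx\n%016llx\n", (unsigned long long)salt1, (unsigned long long)salt2);
    return 0;
}
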

static size_t
ZSTD_reset_matchState(ZSTD_matchState_t* ms,
@@ -1664,6 +1873,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
}

ms->hashLog3 = hashLog3;
+ ms->lazySkipping = 0;

ZSTD_invalidateMatchState(ms);

@@ -1685,6 +1895,27 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
ZSTD_cwksp_clean_tables(ws);
}

+ if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
+ /* Row match finder needs an additional table of hashes ("tags") */
+ size_t const tagTableSize = hSize;
+ /* We want to generate a new salt in case we reset a Cctx, but we always want to use
+ * 0 when we reset a Cdict */
+ if(forWho == ZSTD_resetTarget_CCtx) {
+ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize);
+ ZSTD_advanceHashSalt(ms);
+ } else {
+ /* When we are not salting we want to always memset the memory */
+ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
+ ZSTD_memset(ms->tagTable, 0, tagTableSize);
+ ms->hashSalt = 0;
+ }
+ { /* Switch to 32-entry rows if searchLog is 5 (or more) */
+ U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
+ assert(cParams->hashLog >= rowLog);
+ ms->rowHashLog = cParams->hashLog - rowLog;
+ }
+ }
+
/* opt parser space */
if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
DEBUGLOG(4, "reserving optimal parser space");
@@ -1696,19 +1927,6 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
}

- if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
- { /* Row match finder needs an additional table of hashes ("tags") */
- size_t const tagTableSize = hSize*sizeof(U16);
- ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
- if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize);
- }
- { /* Switch to 32-entry rows if searchLog is 5 (or more) */
- U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
- assert(cParams->hashLog >= rowLog);
- ms->rowHashLog = cParams->hashLog - rowLog;
- }
- }
-
ms->cParams = *cParams;

RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
@@ -1768,6 +1986,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
assert(params->useRowMatchFinder != ZSTD_ps_auto);
assert(params->useBlockSplitter != ZSTD_ps_auto);
assert(params->ldmParams.enableLdm != ZSTD_ps_auto);
+ assert(params->maxBlockSize != 0);
if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
/* Adjust long distance matching parameters */
ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, &params->cParams);
@@ -1776,9 +1995,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
}

{ size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize));
- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
- U32 const divider = (params->cParams.minMatch==3) ? 3 : 4;
- size_t const maxNbSeq = blockSize / divider;
+ size_t const blockSize = MIN(params->maxBlockSize, windowSize);
+ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, params->useSequenceProducer);
size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered)
? ZSTD_compressBound(blockSize) + 1
: 0;
@@ -1795,7 +2013,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
size_t const neededSpace =
ZSTD_estimateCCtxSize_usingCCtxParams_internal(
&params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder,
- buffInSize, buffOutSize, pledgedSrcSize);
+ buffInSize, buffOutSize, pledgedSrcSize, params->useSequenceProducer, params->maxBlockSize);
int resizeWorkspace;

FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!");
@@ -1838,6 +2056,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,

/* init params */
zc->blockState.matchState.cParams = params->cParams;
+ zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable;
zc->pledgedSrcSizePlusOne = pledgedSrcSize+1;
zc->consumedSrcSize = 0;
zc->producedCSize = 0;
@@ -1854,13 +2073,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,

ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock);

+ FORWARD_IF_ERROR(ZSTD_reset_matchState(
+ &zc->blockState.matchState,
+ ws,
+ &params->cParams,
+ params->useRowMatchFinder,
+ crp,
+ needsIndexReset,
+ ZSTD_resetTarget_CCtx), "");
+
+ zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef));
+
+ /* ldm hash table */
+ if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
+ /* TODO: avoid memset? */
+ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
+ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
+ ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
+ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
+ zc->maxNbLdmSequences = maxNbLdmSeq;
+
+ ZSTD_window_init(&zc->ldmState.window);
+ zc->ldmState.loadedDictEnd = 0;
+ }
+
+ /* reserve space for block-level external sequences */
+ if (params->useSequenceProducer) {
+ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
+ zc->externalMatchCtx.seqBufferCapacity = maxNbExternalSeq;
+ zc->externalMatchCtx.seqBuffer =
+ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence));
+ }
+
+ /* buffers */
+
/* ZSTD_wildcopy() is used to copy into the literals buffer,
* so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
*/
zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH);
zc->seqStore.maxNbLit = blockSize;

- /* buffers */
zc->bufferedPolicy = zbuff;
zc->inBuffSize = buffInSize;
zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize);
@@ -1883,32 +2135,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
- zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef));
-
- FORWARD_IF_ERROR(ZSTD_reset_matchState(
- &zc->blockState.matchState,
- ws,
- &params->cParams,
- params->useRowMatchFinder,
- crp,
- needsIndexReset,
- ZSTD_resetTarget_CCtx), "");
-
- /* ldm hash table */
- if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
- /* TODO: avoid memset? */
- size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
- zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
- ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
- zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
- zc->maxNbLdmSequences = maxNbLdmSeq;
-
- ZSTD_window_init(&zc->ldmState.window);
- zc->ldmState.loadedDictEnd = 0;
- }

DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws));
- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace));
+ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace));

zc->initialized = 1;

@@ -1980,7 +2209,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
}

params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize,
- cdict->dictContentSize, ZSTD_cpm_attachDict);
+ cdict->dictContentSize, ZSTD_cpm_attachDict,
+ params.useRowMatchFinder);
params.cParams.windowLog = windowLog;
params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */
FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize,
@@ -2019,6 +2249,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
return 0;
}

+static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize,
+ ZSTD_compressionParameters const* cParams) {
+ if (ZSTD_CDictIndicesAreTagged(cParams)){
+ /* Remove tags from the CDict table if they are present.
+ * See docs on "short cache" in zstd_compress_internal.h for context. */
+ size_t i;
+ for (i = 0; i < tableSize; i++) {
+ U32 const taggedIndex = src[i];
+ U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS;
+ dst[i] = index;
+ }
+ } else {
+ ZSTD_memcpy(dst, src, tableSize * sizeof(U32));
+ }
+}
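
With short-cache tagging, each CDict table entry packs an index and an 8-bit tag of the hash into one U32, so a lookup can reject most mismatches without dereferencing the window. The loop above strips those tags because CCtx tables store plain indices. A minimal sketch of the packing this implies (the exact layout is documented in zstd_compress_internal.h; 8 tag bits assumed here):

#include <stdint.h>
#include <assert.h>

#define TAG_BITS 8  /* assumption: ZSTD_SHORT_CACHE_TAG_BITS */

static uint32_t pack_entry(uint32_t index, uint32_t tag) { return (index << TAG_BITS) | (tag & 0xFF); }
static uint32_t unpack_index(uint32_t tagged)            { return tagged >> TAG_BITS; }

int main(void) {
    uint32_t const tagged = pack_entry(0x123456, 0xAB);
    assert(unpack_index(tagged) == 0x123456);  /* what the copy loop recovers */
    return 0;
}
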
+
static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
const ZSTD_CDict* cdict,
ZSTD_CCtx_params params,
@@ -2054,21 +2300,23 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
: 0;
size_t const hSize = (size_t)1 << cdict_cParams->hashLog;

- ZSTD_memcpy(cctx->blockState.matchState.hashTable,
- cdict->matchState.hashTable,
- hSize * sizeof(U32));
+ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable,
+ cdict->matchState.hashTable,
+ hSize, cdict_cParams);
+
/* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */
if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) {
- ZSTD_memcpy(cctx->blockState.matchState.chainTable,
- cdict->matchState.chainTable,
- chainSize * sizeof(U32));
+ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable,
+ cdict->matchState.chainTable,
+ chainSize, cdict_cParams);
}
/* copy tag table */
if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) {
- size_t const tagTableSize = hSize*sizeof(U16);
+ size_t const tagTableSize = hSize;
ZSTD_memcpy(cctx->blockState.matchState.tagTable,
- cdict->matchState.tagTable,
- tagTableSize);
+ cdict->matchState.tagTable,
+ tagTableSize);
+ cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt;
}
}

@@ -2147,6 +2395,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter;
params.ldmParams = srcCCtx->appliedParams.ldmParams;
params.fParams = fParams;
+ params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize;
ZSTD_resetCCtx_internal(dstCCtx, &params, pledgedSrcSize,
/* loadedDictSize */ 0,
ZSTDcrp_leaveDirty, zbuff);
@@ -2294,7 +2543,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par

/* See doc/zstd_compression_format.md for detailed format description */

-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
+int ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
{
const seqDef* const sequences = seqStorePtr->sequencesStart;
BYTE* const llCodeTable = seqStorePtr->llCode;
@@ -2302,18 +2551,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
BYTE* const mlCodeTable = seqStorePtr->mlCode;
U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
U32 u;
+ int longOffsets = 0;
assert(nbSeq <= seqStorePtr->maxNbSeq);
for (u=0; u<nbSeq; u++) {
U32 const llv = sequences[u].litLength;
+ U32 const ofCode = ZSTD_highbit32(sequences[u].offBase);
U32 const mlv = sequences[u].mlBase;
llCodeTable[u] = (BYTE)ZSTD_LLcode(llv);
- ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offBase);
+ ofCodeTable[u] = (BYTE)ofCode;
mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv);
+ assert(!(MEM_64bits() && ofCode >= STREAM_ACCUMULATOR_MIN));
+ if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN)
+ longOffsets = 1;
}
if (seqStorePtr->longLengthType==ZSTD_llt_literalLength)
llCodeTable[seqStorePtr->longLengthPos] = MaxLL;
if (seqStorePtr->longLengthType==ZSTD_llt_matchLength)
mlCodeTable[seqStorePtr->longLengthPos] = MaxML;
+ return longOffsets;
}
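
ZSTD_seqToCodes() now reports whether any offset code exceeds what the bitstream accumulator can flush in one round. On 32-bit targets that threshold (STREAM_ACCUMULATOR_MIN) is 25 bits, so any offBase of 2^25 or more switches the block to long-offsets encoding; on 64-bit targets the assert holds because offset codes never approach the 57-bit limit. A hedged sketch of the check, with the 32-bit constant assumed:

#include <stdio.h>
#include <stdint.h>

static unsigned highbit32(uint32_t v) {  /* position of the highest set bit */
    unsigned n = 0;
    while (v >>= 1) n++;
    return n;
}

int main(void) {
    unsigned const accMin32 = 25;        /* assumption: STREAM_ACCUMULATOR_MIN_32 */
    uint32_t const offBase = 1u << 25;   /* yields offset code 25 */
    int const longOffsets = highbit32(offBase) >= accMin32;
    printf("ofCode=%u longOffsets=%d\n", highbit32(offBase), longOffsets);  /* 25 1 */
    return 0;
}
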

/* ZSTD_useTargetCBlockSize():
@@ -2347,6 +2602,7 @@ typedef struct {
U32 MLtype;
size_t size;
size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */
+ int longOffsets;
} ZSTD_symbolEncodingTypeStats_t;

/* ZSTD_buildSequencesStatistics():
@@ -2357,11 +2613,13 @@ typedef struct {
* entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32)
*/
static ZSTD_symbolEncodingTypeStats_t
-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
- const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy,
- BYTE* dst, const BYTE* const dstEnd,
- ZSTD_strategy strategy, unsigned* countWorkspace,
- void* entropyWorkspace, size_t entropyWkspSize) {
+ZSTD_buildSequencesStatistics(
+ const seqStore_t* seqStorePtr, size_t nbSeq,
+ const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy,
+ BYTE* dst, const BYTE* const dstEnd,
+ ZSTD_strategy strategy, unsigned* countWorkspace,
+ void* entropyWorkspace, size_t entropyWkspSize)
+{
BYTE* const ostart = dst;
const BYTE* const oend = dstEnd;
BYTE* op = ostart;
@@ -2375,7 +2633,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,

stats.lastCountSize = 0;
/* convert length/distances into codes */
- ZSTD_seqToCodes(seqStorePtr);
+ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr);
assert(op <= oend);
assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */
/* build CTable for Literal Lengths */
@@ -2480,22 +2738,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
*/
#define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20
MEM_STATIC size_t
-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
- const ZSTD_entropyCTables_t* prevEntropy,
- ZSTD_entropyCTables_t* nextEntropy,
- const ZSTD_CCtx_params* cctxParams,
- void* dst, size_t dstCapacity,
- void* entropyWorkspace, size_t entropyWkspSize,
- const int bmi2)
+ZSTD_entropyCompressSeqStore_internal(
+ const seqStore_t* seqStorePtr,
+ const ZSTD_entropyCTables_t* prevEntropy,
+ ZSTD_entropyCTables_t* nextEntropy,
+ const ZSTD_CCtx_params* cctxParams,
+ void* dst, size_t dstCapacity,
+ void* entropyWorkspace, size_t entropyWkspSize,
+ const int bmi2)
{
- const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
ZSTD_strategy const strategy = cctxParams->cParams.strategy;
unsigned* count = (unsigned*)entropyWorkspace;
FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable;
FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable;
FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable;
const seqDef* const sequences = seqStorePtr->sequencesStart;
- const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+ const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
const BYTE* const ofCodeTable = seqStorePtr->ofCode;
const BYTE* const llCodeTable = seqStorePtr->llCode;
const BYTE* const mlCodeTable = seqStorePtr->mlCode;
@@ -2503,29 +2761,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
BYTE* const oend = ostart + dstCapacity;
BYTE* op = ostart;
size_t lastCountSize;
+ int longOffsets = 0;

entropyWorkspace = count + (MaxSeq + 1);
entropyWkspSize -= (MaxSeq + 1) * sizeof(*count);

- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq);
+ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity);
ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
assert(entropyWkspSize >= HUF_WORKSPACE_SIZE);

/* Compress literals */
{ const BYTE* const literals = seqStorePtr->litStart;
- size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart;
- size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart;
+ size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+ size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart);
/* Base suspicion of uncompressibility on ratio of literals to sequences */
unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO);
size_t const litSize = (size_t)(seqStorePtr->lit - literals);
+
size_t const cSize = ZSTD_compressLiterals(
- &prevEntropy->huf, &nextEntropy->huf,
- cctxParams->cParams.strategy,
- ZSTD_literalsCompressionIsDisabled(cctxParams),
op, dstCapacity,
literals, litSize,
entropyWorkspace, entropyWkspSize,
- bmi2, suspectUncompressible);
+ &prevEntropy->huf, &nextEntropy->huf,
+ cctxParams->cParams.strategy,
+ ZSTD_literalsCompressionIsDisabled(cctxParams),
+ suspectUncompressible, bmi2);
FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed");
assert(cSize <= dstCapacity);
op += cSize;
@@ -2551,11 +2811,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse));
return (size_t)(op - ostart);
}
- {
- ZSTD_symbolEncodingTypeStats_t stats;
- BYTE* seqHead = op++;
+ { BYTE* const seqHead = op++;
/* build stats for sequences */
- stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq,
+ const ZSTD_symbolEncodingTypeStats_t stats =
+ ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq,
&prevEntropy->fse, &nextEntropy->fse,
op, oend,
strategy, count,
@@ -2564,6 +2823,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
*seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2));
lastCountSize = stats.lastCountSize;
op += stats.size;
+ longOffsets = stats.longOffsets;
}

{ size_t const bitstreamSize = ZSTD_encodeSequences(
@@ -2598,14 +2858,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
}

MEM_STATIC size_t
-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
- const ZSTD_entropyCTables_t* prevEntropy,
- ZSTD_entropyCTables_t* nextEntropy,
- const ZSTD_CCtx_params* cctxParams,
- void* dst, size_t dstCapacity,
- size_t srcSize,
- void* entropyWorkspace, size_t entropyWkspSize,
- int bmi2)
+ZSTD_entropyCompressSeqStore(
+ const seqStore_t* seqStorePtr,
+ const ZSTD_entropyCTables_t* prevEntropy,
+ ZSTD_entropyCTables_t* nextEntropy,
+ const ZSTD_CCtx_params* cctxParams,
+ void* dst, size_t dstCapacity,
+ size_t srcSize,
+ void* entropyWorkspace, size_t entropyWkspSize,
+ int bmi2)
{
size_t const cSize = ZSTD_entropyCompressSeqStore_internal(
seqStorePtr, prevEntropy, nextEntropy, cctxParams,
@@ -2615,15 +2876,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
/* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block.
* Since we ran out of space, the block must not be compressible, so fall back to a raw uncompressed block.
*/
- if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity))
+ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) {
+ DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity);
return 0; /* block not compressed */
+ }
FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed");

/* Check compressibility */
{ size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy);
if (cSize >= maxCSize) return 0; /* block not compressed */
}
- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize);
+ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize);
+ /* libzstd decoders before v1.5.4 are not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly.
+ * This restriction is indirectly already fulfilled by respecting the ZSTD_minGain() condition above.
+ */
+ assert(cSize < ZSTD_BLOCKSIZE_MAX);
return cSize;
}
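
The compressibility gate rejects the entropy-coded block unless it saves at least ZSTD_minGain() bytes over a raw block; for the fast strategies that gain is roughly (srcSize >> 6) + 2 (stated as an assumption here, since ZSTD_minGain() is defined elsewhere in this file). A worked check:

#include <stdio.h>
#include <stddef.h>

/* assumption: approximates ZSTD_minGain() for strategies below ZSTD_btultra */
static size_t min_gain(size_t srcSize) { return (srcSize >> 6) + 2; }

int main(void) {
    size_t const srcSize  = 4096;
    size_t const maxCSize = srcSize - min_gain(srcSize);  /* 4096 - 66 = 4030 */
    size_t const cSize    = 4050;
    printf("%s\n", cSize >= maxCSize ? "emit raw block" : "emit compressed block");
    return 0;
}
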

@@ -2718,6 +2985,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr)
ssPtr->longLengthType = ZSTD_llt_none;
}

+/* ZSTD_postProcessSequenceProducerResult() :
+ * Validates and post-processes sequences obtained through the external matchfinder API:
+ * - Checks whether nbExternalSeqs represents an error condition.
+ * - Appends a block delimiter to outSeqs if one is not already present.
+ * See zstd.h for context regarding block delimiters.
+ * Returns the number of sequences after post-processing, or an error code. */
+static size_t ZSTD_postProcessSequenceProducerResult(
+ ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize
+) {
+ RETURN_ERROR_IF(
+ nbExternalSeqs > outSeqsCapacity,
+ sequenceProducer_failed,
+ "External sequence producer returned error code %lu",
+ (unsigned long)nbExternalSeqs
+ );
+
+ RETURN_ERROR_IF(
+ nbExternalSeqs == 0 && srcSize > 0,
+ sequenceProducer_failed,
+ "Got zero sequences from external sequence producer for a non-empty src buffer!"
+ );
+
+ if (srcSize == 0) {
+ ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence));
+ return 1;
+ }
+
+ {
+ ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1];
+
+ /* We can return early if lastSeq is already a block delimiter. */
+ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) {
+ return nbExternalSeqs;
+ }
+
+ /* This error condition is only possible if the external matchfinder
+ * produced an invalid parse, by definition of ZSTD_sequenceBound(). */
+ RETURN_ERROR_IF(
+ nbExternalSeqs == outSeqsCapacity,
+ sequenceProducer_failed,
+ "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!"
+ );
+
+ /* lastSeq is not a block delimiter, so we need to append one. */
+ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence));
+ return nbExternalSeqs + 1;
+ }
+}
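
The delimiter convention: a ZSTD_Sequence whose offset and matchLength are both 0 terminates the block, with its litLength covering any trailing literals. The helper above appends an all-zero sentinel whenever the producer omitted one. An illustrative standalone model of that post-processing step (the struct mirrors the public ZSTD_Sequence layout):

#include <string.h>
#include <stdio.h>

typedef struct { unsigned offset, litLength, matchLength, rep; } Seq;

int main(void) {
    Seq out[4] = { {9, 2, 5, 0}, {4, 0, 7, 0} };  /* producer returned 2 seqs, no delimiter */
    size_t n = 2;
    if (!(out[n-1].offset == 0 && out[n-1].matchLength == 0)) {
        memset(&out[n], 0, sizeof(Seq));          /* append the block delimiter */
        n++;
    }
    printf("nbSeqs after post-processing: %zu\n", n);  /* 3 */
    return 0;
}
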
+
+/* ZSTD_fastSequenceLengthSum() :
+ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*.
+ * Similar to another function in zstd_compress.c (determine_blockSize),
+ * except it doesn't check for a block delimiter to end summation.
+ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P).
+ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. */
+static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) {
+ size_t matchLenSum, litLenSum, i;
+ matchLenSum = 0;
+ litLenSum = 0;
+ for (i = 0; i < seqBufSize; i++) {
+ litLenSum += seqBuf[i].litLength;
+ matchLenSum += seqBuf[i].matchLength;
+ }
+ return litLenSum + matchLenSum;
+}
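
The loop above has no data-dependent early exit, so its trip count is known up front and compilers can vectorize it; a delimiter-checking variant cannot be vectorized the same way. The trade-off in miniature (illustrative only):

#include <stdio.h>
#include <stddef.h>

typedef struct { unsigned offset, litLength, matchLength, rep; } Seq;

/* branch-free: fixed trip count, auto-vectorizes */
static size_t sum_fast(const Seq* s, size_t n) {
    size_t lit = 0, match = 0, i;
    for (i = 0; i < n; i++) { lit += s[i].litLength; match += s[i].matchLength; }
    return lit + match;
}

/* early-exit variant: stops at the delimiter, defeating vectorization */
static size_t sum_with_exit(const Seq* s, size_t n) {
    size_t total = 0, i;
    for (i = 0; i < n; i++) {
        total += s[i].litLength + s[i].matchLength;
        if (s[i].offset == 0 && s[i].matchLength == 0) break;
    }
    return total;
}

int main(void) {
    Seq const seqs[3] = { {9, 2, 5, 0}, {4, 1, 7, 0}, {0, 0, 0, 0} };
    printf("%zu %zu\n", sum_fast(seqs, 3), sum_with_exit(seqs, 3));  /* 15 15 */
    return 0;
}
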
+
typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e;

static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
@@ -2727,7 +3060,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
/* Assert that we have correctly flushed the ctx params into the ms's copy */
ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams);
- if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) {
+ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2, so to compensate we add
+ * an additional 1. We need to revisit this logic and make it more consistent. */
+ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) {
if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) {
ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize);
} else {
@@ -2763,6 +3098,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
}
if (zc->externSeqStore.pos < zc->externSeqStore.size) {
assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable);
+
+ /* External matchfinder + LDM is technically possible, just not implemented yet.
+ * We need to revisit soon and implement it. */
+ RETURN_ERROR_IF(
+ zc->appliedParams.useSequenceProducer,
+ parameter_combination_unsupported,
+ "Long-distance matching with external sequence producer enabled is not currently supported."
+ );
+
/* Updates ldmSeqStore.pos */
lastLLSize =
ZSTD_ldm_blockCompress(&zc->externSeqStore,
@@ -2774,6 +3118,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
} else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) {
rawSeqStore_t ldmSeqStore = kNullRawSeqStore;

+ /* External matchfinder + LDM is technically possible, just not implemented yet.
+ * We need to revisit soon and implement it. */
+ RETURN_ERROR_IF(
+ zc->appliedParams.useSequenceProducer,
+ parameter_combination_unsupported,
+ "Long-distance matching with external sequence producer enabled is not currently supported."
+ );
+
ldmSeqStore.seq = zc->ldmSequences;
ldmSeqStore.capacity = zc->maxNbLdmSequences;
/* Updates ldmSeqStore.size */
@@ -2788,7 +3140,68 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
zc->appliedParams.useRowMatchFinder,
src, srcSize);
assert(ldmSeqStore.pos == ldmSeqStore.size);
- } else { /* not long range mode */
+ } else if (zc->appliedParams.useSequenceProducer) {
+ assert(
+ zc->externalMatchCtx.seqBufferCapacity >= ZSTD_sequenceBound(srcSize)
+ );
+ assert(zc->externalMatchCtx.mFinder != NULL);
+
+ { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog;
+
+ size_t const nbExternalSeqs = (zc->externalMatchCtx.mFinder)(
+ zc->externalMatchCtx.mState,
+ zc->externalMatchCtx.seqBuffer,
+ zc->externalMatchCtx.seqBufferCapacity,
+ src, srcSize,
+ NULL, 0, /* dict and dictSize, currently not supported */
+ zc->appliedParams.compressionLevel,
+ windowSize
+ );
+
+ size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult(
+ zc->externalMatchCtx.seqBuffer,
+ nbExternalSeqs,
+ zc->externalMatchCtx.seqBufferCapacity,
+ srcSize
+ );
+
+ /* Return early if there is no error, since we don't need to worry about last literals */
+ if (!ZSTD_isError(nbPostProcessedSeqs)) {
+ ZSTD_sequencePosition seqPos = {0,0,0};
+ size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs);
+ RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!");
+ FORWARD_IF_ERROR(
+ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(
+ zc, &seqPos,
+ zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs,
+ src, srcSize,
+ zc->appliedParams.searchForExternalRepcodes
+ ),
+ "Failed to copy external sequences to seqStore!"
+ );
+ ms->ldmSeqStore = NULL;
+ DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs);
+ return ZSTDbss_compress;
+ }
+
+ /* Propagate the error if fallback is disabled */
+ if (!zc->appliedParams.enableMatchFinderFallback) {
+ return nbPostProcessedSeqs;
+ }
+
+ /* Fallback to software matchfinder */
+ { ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
+ zc->appliedParams.useRowMatchFinder,
+ dictMode);
+ ms->ldmSeqStore = NULL;
+ DEBUGLOG(
+ 5,
+ "External sequence producer returned error code %lu. Falling back to internal parser.",
+ (unsigned long)nbExternalSeqs
+ );
+ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
+ } }
+ } else { /* not long range mode and no external matchfinder */
ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
zc->appliedParams.useRowMatchFinder,
dictMode);
@@ -2849,7 +3262,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
/* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode
so we provide seqStoreSeqs[i].offset - 1 */
ZSTD_updateRep(updatedRepcodes.rep,
- seqStoreSeqs[i].offBase - 1,
+ seqStoreSeqs[i].offBase,
seqStoreSeqs[i].litLength == 0);
literalsRead += outSeqs[i].litLength;
}
@@ -2865,6 +3278,10 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
zc->seqCollector.seqIndex += seqStoreSeqSize;
}

+size_t ZSTD_sequenceBound(size_t srcSize) {
+ return (srcSize / ZSTD_MINMATCH_MIN) + 1;
+}
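
Each match covers at least ZSTD_MINMATCH_MIN (3) bytes of input, so srcSize bytes can yield at most srcSize/3 matches, plus one reserved slot for the block delimiter; that is exactly the capacity the external-producer seqBuffer is sized with. A quick check of the arithmetic:

#include <stdio.h>
#include <stddef.h>

static size_t sequence_bound(size_t srcSize) {  /* same shape as ZSTD_sequenceBound() */
    return (srcSize / 3) + 1;                   /* 3 == ZSTD_MINMATCH_MIN */
}

int main(void) {
    printf("%zu\n", sequence_bound(300));         /* 101 */
    printf("%zu\n", sequence_bound(128 * 1024));  /* 43691 */
    return 0;
}
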
+
size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
size_t outSeqsSize, const void* src, size_t srcSize)
{
@@ -2910,19 +3327,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) {
const size_t unrollMask = unrollSize - 1;
const size_t prefixLength = length & unrollMask;
size_t i;
- size_t u;
if (length == 1) return 1;
/* Check if prefix is RLE first before using unrolled loop */
if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) {
return 0;
}
for (i = prefixLength; i != length; i += unrollSize) {
+ size_t u;
for (u = 0; u < unrollSize; u += sizeof(size_t)) {
if (MEM_readST(ip + i + u) != valueST) {
return 0;
- }
- }
- }
+ } } }
return 1;
}

@@ -2938,7 +3353,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore)
return nbSeqs < 4 && nbLits < 10;
}

-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs)
+static void
+ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs)
{
ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock;
bs->prevCBlock = bs->nextCBlock;
@@ -2946,7 +3362,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c
}

/* Writes the block header */
-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) {
+static void
+writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock)
+{
U32 const cBlockHeader = cSize == 1 ?
lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) :
lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
@@ -2959,13 +3377,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB
* Stores literals block type (raw, rle, compressed, repeat) and
* huffman description table to hufMetadata.
* Requires ENTROPY_WORKSPACE_SIZE workspace
- * @return : size of huffman description table or error code */
-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize,
- const ZSTD_hufCTables_t* prevHuf,
- ZSTD_hufCTables_t* nextHuf,
- ZSTD_hufCTablesMetadata_t* hufMetadata,
- const int literalsCompressionIsDisabled,
- void* workspace, size_t wkspSize)
+ * @return : size of huffman description table, or an error code
+ */
+static size_t
+ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize,
+ const ZSTD_hufCTables_t* prevHuf,
+ ZSTD_hufCTables_t* nextHuf,
+ ZSTD_hufCTablesMetadata_t* hufMetadata,
+ const int literalsCompressionIsDisabled,
+ void* workspace, size_t wkspSize,
+ int hufFlags)
{
BYTE* const wkspStart = (BYTE*)workspace;
BYTE* const wkspEnd = wkspStart + wkspSize;
@@ -2973,9 +3394,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
unsigned* const countWksp = (unsigned*)workspace;
const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned);
BYTE* const nodeWksp = countWkspStart + countWkspSize;
- const size_t nodeWkspSize = wkspEnd-nodeWksp;
+ const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp);
unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX;
- unsigned huffLog = HUF_TABLELOG_DEFAULT;
+ unsigned huffLog = LitHufLog;
HUF_repeat repeat = prevHuf->repeatMode;
DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize);

@@ -2990,73 +3411,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi

/* small ? don't even attempt compression (speed opt) */
#ifndef COMPRESS_LITERALS_SIZE_MIN
-#define COMPRESS_LITERALS_SIZE_MIN 63
+# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */
#endif
{ size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
if (srcSize <= minLitSize) {
DEBUGLOG(5, "set_basic - too small");
hufMetadata->hType = set_basic;
return 0;
- }
- }
+ } }

/* Scan input and build symbol stats */
- { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize);
+ { size_t const largest =
+ HIST_count_wksp (countWksp, &maxSymbolValue,
+ (const BYTE*)src, srcSize,
+ workspace, wkspSize);
FORWARD_IF_ERROR(largest, "HIST_count_wksp failed");
if (largest == srcSize) {
+ /* only one literal symbol */
DEBUGLOG(5, "set_rle");
hufMetadata->hType = set_rle;
return 0;
}
if (largest <= (srcSize >> 7)+4) {
+ /* heuristic: likely not compressible */
DEBUGLOG(5, "set_basic - no gain");
hufMetadata->hType = set_basic;
return 0;
- }
- }
+ } }

/* Validate the previous Huffman table */
- if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) {
+ if (repeat == HUF_repeat_check
+ && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) {
repeat = HUF_repeat_none;
}

/* Build Huffman Tree */
ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable));
- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags);
+ assert(huffLog <= LitHufLog);
{ size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp,
maxSymbolValue, huffLog,
nodeWksp, nodeWkspSize);
FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp");
huffLog = (U32)maxBits;
- { /* Build and write the CTable */
- size_t const newCSize = HUF_estimateCompressedSize(
- (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue);
- size_t const hSize = HUF_writeCTable_wksp(
- hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer),
- (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog,
- nodeWksp, nodeWkspSize);
- /* Check against repeating the previous CTable */
- if (repeat != HUF_repeat_none) {
- size_t const oldCSize = HUF_estimateCompressedSize(
- (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue);
- if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) {
- DEBUGLOG(5, "set_repeat - smaller");
- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
- hufMetadata->hType = set_repeat;
- return 0;
- }
- }
- if (newCSize + hSize >= srcSize) {
- DEBUGLOG(5, "set_basic - no gains");
+ }
+ { /* Build and write the CTable */
+ size_t const newCSize = HUF_estimateCompressedSize(
+ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue);
+ size_t const hSize = HUF_writeCTable_wksp(
+ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer),
+ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog,
+ nodeWksp, nodeWkspSize);
+ /* Check against repeating the previous CTable */
+ if (repeat != HUF_repeat_none) {
+ size_t const oldCSize = HUF_estimateCompressedSize(
+ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue);
+ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) {
+ DEBUGLOG(5, "set_repeat - smaller");
ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
- hufMetadata->hType = set_basic;
+ hufMetadata->hType = set_repeat;
return 0;
- }
- DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize);
- hufMetadata->hType = set_compressed;
- nextHuf->repeatMode = HUF_repeat_check;
- return hSize;
+ } }
+ if (newCSize + hSize >= srcSize) {
+ DEBUGLOG(5, "set_basic - no gains");
+ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+ hufMetadata->hType = set_basic;
+ return 0;
}
+ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize);
+ hufMetadata->hType = set_compressed;
+ nextHuf->repeatMode = HUF_repeat_check;
+ return hSize;
}
}

@@ -3066,8 +3491,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
* and updates nextEntropy to the appropriate repeatMode.
*/
static ZSTD_symbolEncodingTypeStats_t
-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) {
- ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0};
+ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy)
+{
+ ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0};
nextEntropy->litlength_repeatMode = FSE_repeat_none;
nextEntropy->offcode_repeatMode = FSE_repeat_none;
nextEntropy->matchlength_repeatMode = FSE_repeat_none;
@@ -3078,16 +3504,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) {
* Builds entropy for the sequences.
* Stores symbol compression modes and fse table to fseMetadata.
* Requires ENTROPY_WORKSPACE_SIZE wksp.
- * @return : size of fse tables or error code */
-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr,
- const ZSTD_fseCTables_t* prevEntropy,
- ZSTD_fseCTables_t* nextEntropy,
- const ZSTD_CCtx_params* cctxParams,
- ZSTD_fseCTablesMetadata_t* fseMetadata,
- void* workspace, size_t wkspSize)
+ * @return : size of fse tables or error code */
+static size_t
+ZSTD_buildBlockEntropyStats_sequences(
+ const seqStore_t* seqStorePtr,
+ const ZSTD_fseCTables_t* prevEntropy,
+ ZSTD_fseCTables_t* nextEntropy,
+ const ZSTD_CCtx_params* cctxParams,
+ ZSTD_fseCTablesMetadata_t* fseMetadata,
+ void* workspace, size_t wkspSize)
{
ZSTD_strategy const strategy = cctxParams->cParams.strategy;
- size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
BYTE* const ostart = fseMetadata->fseTablesBuffer;
BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer);
BYTE* op = ostart;
@@ -3114,23 +3542,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr,
/* ZSTD_buildBlockEntropyStats() :
* Builds entropy for the block.
* Requires workspace size ENTROPY_WORKSPACE_SIZE
- *
- * @return : 0 on success or error code
+ * @return : 0 on success, or an error code
+ * Note : also employed in superblock
*/
-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
- const ZSTD_entropyCTables_t* prevEntropy,
- ZSTD_entropyCTables_t* nextEntropy,
- const ZSTD_CCtx_params* cctxParams,
- ZSTD_entropyCTablesMetadata_t* entropyMetadata,
- void* workspace, size_t wkspSize)
-{
- size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart;
+size_t ZSTD_buildBlockEntropyStats(
+ const seqStore_t* seqStorePtr,
+ const ZSTD_entropyCTables_t* prevEntropy,
+ ZSTD_entropyCTables_t* nextEntropy,
+ const ZSTD_CCtx_params* cctxParams,
+ ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+ void* workspace, size_t wkspSize)
+{
+ size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart);
+ int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD);
+ int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0;
+
entropyMetadata->hufMetadata.hufDesSize =
ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize,
&prevEntropy->huf, &nextEntropy->huf,
&entropyMetadata->hufMetadata,
ZSTD_literalsCompressionIsDisabled(cctxParams),
- workspace, wkspSize);
+ workspace, wkspSize, hufFlags);
+
FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed");
entropyMetadata->fseMetadata.fseTablesSize =
ZSTD_buildBlockEntropyStats_sequences(seqStorePtr,
@@ -3143,11 +3576,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
}

/* Returns the size estimate for the literals section (header + content) of a block */
-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize,
- const ZSTD_hufCTables_t* huf,
- const ZSTD_hufCTablesMetadata_t* hufMetadata,
- void* workspace, size_t wkspSize,
- int writeEntropy)
+static size_t
+ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize,
+ const ZSTD_hufCTables_t* huf,
+ const ZSTD_hufCTablesMetadata_t* hufMetadata,
+ void* workspace, size_t wkspSize,
+ int writeEntropy)
{
unsigned* const countWksp = (unsigned*)workspace;
unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX;
@@ -3169,12 +3603,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz
}

/* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */
-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
- const BYTE* codeTable, size_t nbSeq, unsigned maxCode,
- const FSE_CTable* fseCTable,
- const U8* additionalBits,
- short const* defaultNorm, U32 defaultNormLog, U32 defaultMax,
- void* workspace, size_t wkspSize)
+static size_t
+ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
+ const BYTE* codeTable, size_t nbSeq, unsigned maxCode,
+ const FSE_CTable* fseCTable,
+ const U8* additionalBits,
+ short const* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+ void* workspace, size_t wkspSize)
{
unsigned* const countWksp = (unsigned*)workspace;
const BYTE* ctp = codeTable;
@@ -3206,99 +3641,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
}

/* Returns the size estimate for the sequences section (header + content) of a block */
-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable,
- const BYTE* llCodeTable,
- const BYTE* mlCodeTable,
- size_t nbSeq,
- const ZSTD_fseCTables_t* fseTables,
- const ZSTD_fseCTablesMetadata_t* fseMetadata,
- void* workspace, size_t wkspSize,
- int writeEntropy)
+static size_t
+ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable,
+ const BYTE* llCodeTable,
+ const BYTE* mlCodeTable,
+ size_t nbSeq,
+ const ZSTD_fseCTables_t* fseTables,
+ const ZSTD_fseCTablesMetadata_t* fseMetadata,
+ void* workspace, size_t wkspSize,
+ int writeEntropy)
{
size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ);
size_t cSeqSizeEstimate = 0;
cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff,
- fseTables->offcodeCTable, NULL,
- OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
- workspace, wkspSize);
+ fseTables->offcodeCTable, NULL,
+ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+ workspace, wkspSize);
cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL,
- fseTables->litlengthCTable, LL_bits,
- LL_defaultNorm, LL_defaultNormLog, MaxLL,
- workspace, wkspSize);
+ fseTables->litlengthCTable, LL_bits,
+ LL_defaultNorm, LL_defaultNormLog, MaxLL,
+ workspace, wkspSize);
cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML,
- fseTables->matchlengthCTable, ML_bits,
- ML_defaultNorm, ML_defaultNormLog, MaxML,
- workspace, wkspSize);
+ fseTables->matchlengthCTable, ML_bits,
+ ML_defaultNorm, ML_defaultNormLog, MaxML,
+ workspace, wkspSize);
if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize;
return cSeqSizeEstimate + sequencesSectionHeaderSize;
}

/* Returns the size estimate for a given stream of literals, of, ll, ml */
-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize,
- const BYTE* ofCodeTable,
- const BYTE* llCodeTable,
- const BYTE* mlCodeTable,
- size_t nbSeq,
- const ZSTD_entropyCTables_t* entropy,
- const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
- void* workspace, size_t wkspSize,
- int writeLitEntropy, int writeSeqEntropy) {
+static size_t
+ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize,
+ const BYTE* ofCodeTable,
+ const BYTE* llCodeTable,
+ const BYTE* mlCodeTable,
+ size_t nbSeq,
+ const ZSTD_entropyCTables_t* entropy,
+ const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+ void* workspace, size_t wkspSize,
+ int writeLitEntropy, int writeSeqEntropy)
+{
size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize,
- &entropy->huf, &entropyMetadata->hufMetadata,
- workspace, wkspSize, writeLitEntropy);
+ &entropy->huf, &entropyMetadata->hufMetadata,
+ workspace, wkspSize, writeLitEntropy);
size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
- nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
- workspace, wkspSize, writeSeqEntropy);
+ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
+ workspace, wkspSize, writeSeqEntropy);
return seqSize + literalsSize + ZSTD_blockHeaderSize;
}

/* Builds entropy statistics and uses them for blocksize estimation.
*
- * Returns the estimated compressed size of the seqStore, or a zstd error.
+ * @return: estimated compressed size of the seqStore, or a zstd error.
*/
-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) {
- ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata;
+static size_t
+ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc)
+{
+ ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata;
DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()");
FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore,
&zc->blockState.prevCBlock->entropy,
&zc->blockState.nextCBlock->entropy,
&zc->appliedParams,
entropyMetadata,
- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), "");
- return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart),
+ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), "");
+ return ZSTD_estimateBlockSize(
+ seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart),
seqStore->ofCode, seqStore->llCode, seqStore->mlCode,
(size_t)(seqStore->sequences - seqStore->sequencesStart),
- &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE,
+ &zc->blockState.nextCBlock->entropy,
+ entropyMetadata,
+ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE,
(int)(entropyMetadata->hufMetadata.hType == set_compressed), 1);
}

/* Returns literals bytes represented in a seqStore */
-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) {
+static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore)
+{
size_t literalsBytes = 0;
- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart;
+ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart);
size_t i;
for (i = 0; i < nbSeqs; ++i) {
- seqDef seq = seqStore->sequencesStart[i];
+ seqDef const seq = seqStore->sequencesStart[i];
literalsBytes += seq.litLength;
if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) {
literalsBytes += 0x10000;
- }
- }
+ } }
return literalsBytes;
}

/* Returns match bytes represented in a seqStore */
-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) {
+static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore)
+{
size_t matchBytes = 0;
- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart;
+ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart);
size_t i;
for (i = 0; i < nbSeqs; ++i) {
seqDef seq = seqStore->sequencesStart[i];
matchBytes += seq.mlBase + MINMATCH;
if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) {
matchBytes += 0x10000;
- }
- }
+ } }
return matchBytes;
}
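
Stored sequences keep mlBase = matchLength - MINMATCH, and at most one flagged sequence per block had 0x10000 subtracted from its length so it fits a 16-bit field; both corrections are re-applied when summing, as above. A toy reconstruction of that accounting:

#include <stdio.h>
#include <stddef.h>

#define MINMATCH 3

int main(void) {
    /* two sequences: mlBase 5 encodes a match of 8; the second was truncated */
    unsigned const mlBase[2] = { 5, 70000 - 0x10000 - MINMATCH };
    unsigned const longLengthPos = 1;   /* index of the truncated sequence */
    size_t matchBytes = 0;
    unsigned i;
    for (i = 0; i < 2; i++) {
        matchBytes += mlBase[i] + MINMATCH;
        if (i == longLengthPos) matchBytes += 0x10000;
    }
    printf("%zu\n", matchBytes);        /* 8 + 70000 = 70008 */
    return 0;
}
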

@@ -3307,15 +3750,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) {
*/
static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
const seqStore_t* originalSeqStore,
- size_t startIdx, size_t endIdx) {
- BYTE* const litEnd = originalSeqStore->lit;
- size_t literalsBytes;
- size_t literalsBytesPreceding = 0;
-
+ size_t startIdx, size_t endIdx)
+{
*resultSeqStore = *originalSeqStore;
if (startIdx > 0) {
resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx;
- literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
+ resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
}

/* Move longLengthPos into the correct position if necessary */
@@ -3328,13 +3768,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
}
resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx;
resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx;
- literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
- resultSeqStore->litStart += literalsBytesPreceding;
if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) {
/* This accounts for possible last literals if the derived chunk reaches the end of the block */
- resultSeqStore->lit = litEnd;
+ assert(resultSeqStore->lit == originalSeqStore->lit);
} else {
- resultSeqStore->lit = resultSeqStore->litStart+literalsBytes;
+ size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
+ resultSeqStore->lit = resultSeqStore->litStart + literalsBytes;
}
resultSeqStore->llCode += startIdx;
resultSeqStore->mlCode += startIdx;
@@ -3342,20 +3781,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
}

/*
- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history.
- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq().
+ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history.
+ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq().
*/
static U32
-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0)
-{
- U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */
- assert(STORED_IS_REPCODE(offCode));
- if (adjustedOffCode == ZSTD_REP_NUM) {
- /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */
- assert(rep[0] > 0);
+ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0)
+{
+ U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */
+ assert(OFFBASE_IS_REPCODE(offBase));
+ if (adjustedRepCode == ZSTD_REP_NUM) {
+ assert(ll0);
+ /* litlength == 0 and offCode == 2 implies selection of first repcode - 1
+ * This is only valid if it results in a valid offset value, aka > 0.
+ * Note : it may happen that `rep[0]==1` in exceptional circumstances.
+ * In which case this function will return 0, which is an invalid offset.
+ * It's not an issue though, since this value will be
+ * compared and discarded within ZSTD_seqStore_resolveOffCodes().
+ */
return rep[0] - 1;
}
- return rep[adjustedOffCode];
+ return rep[adjustedRepCode];
}
|
|
|
|
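For reference, the lookup above reduces to a small table walk: repcodes 1-3 select the three most recent offsets, a zero literal length shifts the indices by one, and the resulting synthetic fourth slot means "most recent offset minus one". A minimal stand-alone rendering, not part of the patch (names illustrative):

    #include <assert.h>
    #include <stdint.h>
    typedef uint32_t U32;
    #define REP_NUM 3

    /* repCode1to3 in [1,3]; ll0 is 1 when the sequence has no literals. */
    static U32 resolve_rep(const U32 rep[REP_NUM], U32 repCode1to3, U32 ll0)
    {
        U32 const idx = repCode1to3 - 1 + ll0;   /* [0 - 3] */
        assert(repCode1to3 >= 1 && repCode1to3 <= 3);
        if (idx == REP_NUM) return rep[0] - 1;   /* may yield 0: caller must discard */
        return rep[idx];
    }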
 /*
@@ -3371,30 +3816,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c
  * 1-3 : repcode 1-3
  * 4+ : real_offset+3
  */
-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes,
-                                          seqStore_t* const seqStore, U32 const nbSeq) {
+static void
+ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes,
+                              const seqStore_t* const seqStore, U32 const nbSeq)
+{
     U32 idx = 0;
+    U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? seqStore->longLengthPos : nbSeq;
     for (; idx < nbSeq; ++idx) {
         seqDef* const seq = seqStore->sequencesStart + idx;
-        U32 const ll0 = (seq->litLength == 0);
-        U32 const offCode = OFFBASE_TO_STORED(seq->offBase);
-        assert(seq->offBase > 0);
-        if (STORED_IS_REPCODE(offCode)) {
-            U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0);
-            U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0);
+        U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx);
+        U32 const offBase = seq->offBase;
+        assert(offBase > 0);
+        if (OFFBASE_IS_REPCODE(offBase)) {
+            U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0);
+            U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0);
             /* Adjust simulated decompression repcode history if we come across a mismatch. Replace
              * the repcode with the offset it actually references, determined by the compression
              * repcode history.
              */
             if (dRawOffset != cRawOffset) {
-                seq->offBase = cRawOffset + ZSTD_REP_NUM;
+                seq->offBase = OFFSET_TO_OFFBASE(cRawOffset);
             }
         }
         /* Compression repcode history is always updated with values directly from the unmodified seqStore.
          * Decompression repcode history may use modified seq->offset value taken from compression repcode history.
          */
-        ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0);
-        ZSTD_updateRep(cRepcodes->rep, offCode, ll0);
+        ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0);
+        ZSTD_updateRep(cRepcodes->rep, offBase, ll0);
     }
 }

@@ -3404,10 +3852,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_
  * Returns the total size of that block (including header) or a ZSTD error code.
  */
 static size_t
-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore,
+ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc,
+                                  const seqStore_t* const seqStore,
                                   repcodes_t* const dRep, repcodes_t* const cRep,
                                   void* dst, size_t dstCapacity,
-                                  const void* src, size_t srcSize,
+                                  const void* src, size_t srcSize,
                                   U32 lastBlock, U32 isPartition)
 {
     const U32 rleMaxLength = 25;
@@ -3481,45 +3930,49 @@ typedef struct {

 /* Helper function to perform the recursive search for block splits.
  * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half.
- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then
- * we do not recurse.
+ * If advantageous to split, then we recurse down the two sub-blocks.
+ * If not, or if an error occurred in estimation, then we do not recurse.
  *
- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING.
+ * Note: The recursion depth is capped by a heuristic minimum number of sequences,
+ * defined by MIN_SEQUENCES_BLOCK_SPLITTING.
  * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING).
  * In practice, recursion depth usually doesn't go beyond 4.
  *
- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize
+ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS.
+ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize
  * maximum of 128 KB, this value is actually impossible to reach.
  */
 static void
 ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx,
                              ZSTD_CCtx* zc, const seqStore_t* origSeqStore)
 {
-    seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk;
-    seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore;
-    seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore;
+    seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk;
+    seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore;
+    seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore;
     size_t estimatedOriginalSize;
     size_t estimatedFirstHalfSize;
     size_t estimatedSecondHalfSize;
     size_t midIdx = (startIdx + endIdx)/2;

+    DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx);
+    assert(endIdx >= startIdx);
     if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) {
-        DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences");
+        DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx);
         return;
     }
-    DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx);
     ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx);
     ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx);
     ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx);
     estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc);
     estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc);
     estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc);
-    DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu",
+    DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu",
              estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize);
     if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) {
         return;
     }
     if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) {
+        DEBUGLOG(5, "split decided at seqNb:%zu", midIdx);
         ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore);
         splits->splitLocations[splits->idx] = (U32)midIdx;
         splits->idx++;
@@ -3527,14 +3980,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end
     }
 }

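The helper above is a top-down, cost-guided binary split: estimate the compressed size of [startIdx, endIdx) against the sum of its two halves, and recurse only when the halves come out cheaper. A generic sketch of that control flow, not part of the patch (the estimator is abstracted into a callback; names illustrative):

    #include <stddef.h>

    typedef size_t (*cost_fn)(size_t startIdx, size_t endIdx, void* ctx);

    /* Record midpoints in order whenever splitting [start,end) is estimated cheaper. */
    static void derive_splits(size_t start, size_t end, size_t minSeqs,
                              cost_fn cost, void* ctx,
                              size_t* splitsOut, size_t* nbSplits, size_t maxSplits)
    {
        size_t const mid = (start + end) / 2;
        if (end - start < minSeqs || *nbSplits >= maxSplits) return;
        if (cost(start, mid, ctx) + cost(mid, end, ctx) < cost(start, end, ctx)) {
            derive_splits(start, mid, minSeqs, cost, ctx, splitsOut, nbSplits, maxSplits);
            splitsOut[(*nbSplits)++] = mid;                     /* in-order emission */
            derive_splits(mid, end, minSeqs, cost, ctx, splitsOut, nbSplits, maxSplits);
        }
    }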
-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio.
+/* Base recursive function.
+ * Populates a table with intra-block partition indices that can improve compression ratio.
  *
- * Returns the number of splits made (which equals the size of the partition table - 1).
+ * @return: number of splits made (which equals the size of the partition table - 1).
  */
-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) {
-    seqStoreSplits splits = {partitions, 0};
+static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq)
+{
+    seqStoreSplits splits;
+    splits.splitLocations = partitions;
+    splits.idx = 0;
     if (nbSeq <= 4) {
-        DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split");
+        DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq);
         /* Refuse to try and split anything with less than 4 sequences */
         return 0;
     }
@@ -3550,18 +4007,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq)
  * Returns combined size of all blocks (which includes headers), or a ZSTD error code.
  */
 static size_t
-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity,
-                                       const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq)
+ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc,
+                                       void* dst, size_t dstCapacity,
+                                       const void* src, size_t blockSize,
+                                       U32 lastBlock, U32 nbSeq)
 {
     size_t cSize = 0;
     const BYTE* ip = (const BYTE*)src;
     BYTE* op = (BYTE*)dst;
     size_t i = 0;
     size_t srcBytesTotal = 0;
-    U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */
-    seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore;
-    seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore;
-    size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq);
+    U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */
+    seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore;
+    seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore;
+    size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq);

     /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history
      * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two
@@ -3583,30 +4042,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
     ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
     ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t));

-    DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
+    DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
                 (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit,
                 (unsigned)zc->blockState.matchState.nextToUpdate);

     if (numSplits == 0) {
-        size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore,
-                                                                   &dRep, &cRep,
-                                                                    op, dstCapacity,
-                                                                    ip, blockSize,
-                                                                    lastBlock, 0 /* isPartition */);
+        size_t cSizeSingleBlock =
+            ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore,
+                                              &dRep, &cRep,
+                                              op, dstCapacity,
+                                              ip, blockSize,
+                                              lastBlock, 0 /* isPartition */);
         FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!");
         DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits");
-        assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
+        assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX);
+        assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize);
         return cSizeSingleBlock;
     }

     ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]);
     for (i = 0; i <= numSplits; ++i) {
-        size_t srcBytes;
         size_t cSizeChunk;
         U32 const lastPartition = (i == numSplits);
         U32 lastBlockEntireSrc = 0;

-        srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore);
+        size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore);
         srcBytesTotal += srcBytes;
         if (lastPartition) {
             /* This is the final partition, need to account for possible last literals */
@@ -3621,7 +4081,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
                                                        op, dstCapacity,
                                                        ip, srcBytes,
                                                        lastBlockEntireSrc, 1 /* isPartition */);
-        DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk);
+        DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size",
+                    ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk);
         FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!");

         ip += srcBytes;
@@ -3629,10 +4090,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
         dstCapacity -= cSizeChunk;
         cSize += cSizeChunk;
         *currSeqStore = *nextSeqStore;
-        assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
+        assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize);
     }
-    /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes
-     * for the next block.
+    /* cRep and dRep may have diverged during the compression.
+     * If so, we use the dRep repcodes for the next block.
      */
     ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t));
     return cSize;
@@ -3643,8 +4104,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
                               void* dst, size_t dstCapacity,
                               const void* src, size_t srcSize, U32 lastBlock)
 {
-    const BYTE* ip = (const BYTE*)src;
-    BYTE* op = (BYTE*)dst;
     U32 nbSeq;
     size_t cSize;
     DEBUGLOG(4, "ZSTD_compressBlock_splitBlock");
@@ -3655,7 +4114,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
         if (bss == ZSTDbss_noCompress) {
             if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
                 zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
-            cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock);
+            cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock);
             FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
             DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block");
             return cSize;
@@ -3673,9 +4132,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
                             void* dst, size_t dstCapacity,
                             const void* src, size_t srcSize, U32 frame)
 {
-    /* This the upper bound for the length of an rle block.
-     * This isn't the actual upper bound. Finding the real threshold
-     * needs further investigation.
+    /* This is an estimated upper bound for the length of an rle block.
+     * This isn't the actual upper bound.
+     * Finding the real threshold needs further investigation.
      */
     const U32 rleMaxLength = 25;
     size_t cSize;
@@ -3767,10 +4226,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
          *  * cSize >= blockBound(srcSize): We have expanded the block too much so
          *    emit an uncompressed block.
          */
-        {
-            size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock);
+        {   size_t const cSize =
+                ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock);
             if (cSize != ERROR(dstSize_tooSmall)) {
-                size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);
+                size_t const maxCSize =
+                        srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);
                 FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed");
                 if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) {
                     ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
@@ -3778,7 +4238,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
                 }
             }
         }
-    }
+    } /* if (bss == ZSTDbss_compress)*/

     DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()");
     /* Superblock compression failed, attempt to emit a single no compress block.
@@ -3836,7 +4296,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms,
 *   All blocks will be terminated, all input will be consumed.
 *   Function will issue an error if there is not enough `dstCapacity` to hold the compressed content.
 *   Frame is supposed already started (header already produced)
-*   @return : compressed size, or an error code
+* @return : compressed size, or an error code
 */
 static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
                                      void* dst, size_t dstCapacity,
@@ -3860,7 +4320,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
         ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
         U32 const lastBlock = lastFrameChunk & (blockSize >= remaining);

-        RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE,
+        /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
+         * additional 1. We need to revisit and change this logic to be more consistent */
+        RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1,
                         dstSize_tooSmall,
                         "not enough space to store compressed block");
         if (remaining < blockSize) blockSize = remaining;
@@ -3899,7 +4361,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
                 MEM_writeLE24(op, cBlockHeader);
                 cSize += ZSTD_blockHeaderSize;
             }
-        }
+        }  /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/


         ip += blockSize;
@@ -4078,31 +4540,51 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
     }
 }

-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx,
-                              void* dst, size_t dstCapacity,
-                              const void* src, size_t srcSize)
+size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx,
+                                    void* dst, size_t dstCapacity,
+                                    const void* src, size_t srcSize)
 {
     DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize);
     return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */);
 }

+/* NOTE: Must just wrap ZSTD_compressContinue_public() */
+size_t ZSTD_compressContinue(ZSTD_CCtx* cctx,
+                             void* dst, size_t dstCapacity,
+                             const void* src, size_t srcSize)
+{
+    return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize);
+}

-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx)
+static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx)
 {
     ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams;
     assert(!ZSTD_checkCParams(cParams));
-    return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog);
+    return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog);
 }

-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+/* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() */
+size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx)
+{
+    return ZSTD_getBlockSize_deprecated(cctx);
+}
+
+/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */
+size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
     DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize);
-    { size_t const blockSizeMax = ZSTD_getBlockSize(cctx);
+    { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx);
       RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); }

     return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */);
 }

+/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */
+size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize);
+}
+
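The `_public`/`_deprecated` renames above follow a common library pattern: the exported symbol keeps its name and ABI but becomes a thin wrapper, so the header can mark the public entry point deprecated while internal call sites invoke the sibling directly and stay free of deprecation warnings. A hypothetical GCC/Clang-style illustration of the pattern (not zstd code; the rationale is inferred from the "Must just wrap" notes):

    /* header: public entry point, flagged for external users */
    __attribute__((deprecated("use the streaming API instead")))
    size_t my_compressBlock(void* dst, size_t cap, const void* src, size_t size);

    /* implementation: real logic lives in the un-flagged sibling */
    size_t my_compressBlock_deprecated(void* dst, size_t cap, const void* src, size_t size)
    {
        (void)dst; (void)cap; (void)src;   /* real logic elided in this sketch */
        return size;
    }

    size_t my_compressBlock(void* dst, size_t cap, const void* src, size_t size)
    {
        return my_compressBlock_deprecated(dst, cap, src, size);  /* must just wrap */
    }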
 /*! ZSTD_loadDictionaryContent() :
  *  @return : 0, or an error code
  */
@@ -4111,25 +4593,36 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
                                          ZSTD_cwksp* ws,
                                          ZSTD_CCtx_params const* params,
                                          const void* src, size_t srcSize,
-                                         ZSTD_dictTableLoadMethod_e dtlm)
+                                         ZSTD_dictTableLoadMethod_e dtlm,
+                                         ZSTD_tableFillPurpose_e tfp)
 {
     const BYTE* ip = (const BYTE*) src;
     const BYTE* const iend = ip + srcSize;
     int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL;

-    /* Assert that we the ms params match the params we're being given */
+    /* Assert that the ms params match the params we're being given */
     ZSTD_assertEqualCParams(params->cParams, ms->cParams);

-    if (srcSize > ZSTD_CHUNKSIZE_MAX) {
+    {   /* Ensure large dictionaries can't cause index overflow */
+
         /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX.
          * Dictionaries right at the edge will immediately trigger overflow
          * correction, but I don't want to insert extra constraints here.
          */
-        U32 const maxDictSize = ZSTD_CURRENT_MAX - 1;
-        /* We must have cleared our windows when our source is this large. */
-        assert(ZSTD_window_isEmpty(ms->window));
-        if (loadLdmDict)
-            assert(ZSTD_window_isEmpty(ls->window));
+        U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX;
+
+        int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(&params->cParams);
+        if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) {
+            /* Some dictionary matchfinders in zstd use "short cache",
+             * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each
+             * CDict hashtable entry as a tag rather than as part of an index.
+             * When short cache is used, we need to truncate the dictionary
+             * so that its indices don't overlap with the tag. */
+            U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX;
+            maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize);
+            assert(!loadLdmDict);
+        }
+
         /* If the dictionary is too large, only load the suffix of the dictionary. */
         if (srcSize > maxDictSize) {
             ip = iend - maxDictSize;
@@ -4138,30 +4631,46 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
         }
     }

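On the "short cache" trade-off above: once the low ZSTD_SHORT_CACHE_TAG_BITS of a hash-table entry are reserved as a tag, only 32 minus that many bits remain to address positions, which is why the dictionary must be truncated. A minimal sketch of the packing and probe, not part of the patch (5 tag bits is an assumption mirroring zstd's value at the time of writing):

    #include <stdint.h>

    #define TAG_BITS 5u                      /* assumption: mirrors ZSTD_SHORT_CACHE_TAG_BITS */
    #define TAG_MASK ((1u << TAG_BITS) - 1)

    /* Pack a position index and a small hash tag into one 32-bit table entry. */
    static uint32_t pack_entry(uint32_t index, uint32_t hash)
    {
        return (index << TAG_BITS) | (hash & TAG_MASK);
    }

    /* Probe: compare cheap tags first; only a tag hit pays for the full match check. */
    static int tag_matches(uint32_t entry, uint32_t hash, uint32_t* indexOut)
    {
        *indexOut = entry >> TAG_BITS;       /* only 2^(32-TAG_BITS) positions addressable */
        return (entry & TAG_MASK) == (hash & TAG_MASK);
    }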
-    DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder);
+    if (srcSize > ZSTD_CHUNKSIZE_MAX) {
+        /* We must have cleared our windows when our source is this large. */
+        assert(ZSTD_window_isEmpty(ms->window));
+        if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window));
+    }
     ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0);
-    ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
-    ms->forceNonContiguous = params->deterministicRefPrefix;

-    if (loadLdmDict) {
+    DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder);
+
+    if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */
         ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0);
         ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base);
+        ZSTD_ldm_fillHashTable(ls, ip, iend, &params->ldmParams);
+    }
+
+    /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */
+    if (params->cParams.strategy < ZSTD_btultra) {
+        U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28);
+        if (srcSize > maxDictSize) {
+            ip = iend - maxDictSize;
+            src = ip;
+            srcSize = maxDictSize;
+        }
     }

+    ms->nextToUpdate = (U32)(ip - ms->window.base);
+    ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
+    ms->forceNonContiguous = params->deterministicRefPrefix;
+
     if (srcSize <= HASH_READ_SIZE) return 0;

     ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend);

-    if (loadLdmDict)
-        ZSTD_ldm_fillHashTable(ls, ip, iend, &params->ldmParams);
-
     switch(params->cParams.strategy)
     {
     case ZSTD_fast:
-        ZSTD_fillHashTable(ms, iend, dtlm);
+        ZSTD_fillHashTable(ms, iend, dtlm, tfp);
         break;
     case ZSTD_dfast:
-        ZSTD_fillDoubleHashTable(ms, iend, dtlm);
+        ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp);
         break;

     case ZSTD_greedy:
@@ -4174,7 +4683,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
         } else {
             assert(params->useRowMatchFinder != ZSTD_ps_auto);
             if (params->useRowMatchFinder == ZSTD_ps_enable) {
-                size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16);
+                size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog);
                 ZSTD_memset(ms->tagTable, 0, tagTableSize);
                 ZSTD_row_update(ms, iend-HASH_READ_SIZE);
                 DEBUGLOG(4, "Using row-based hash table for lazy dict");
@@ -4327,6 +4836,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
                                       ZSTD_CCtx_params const* params,
                                       const void* dict, size_t dictSize,
                                       ZSTD_dictTableLoadMethod_e dtlm,
+                                      ZSTD_tableFillPurpose_e tfp,
                                       void* workspace)
 {
     const BYTE* dictPtr = (const BYTE*)dict;
@@ -4345,7 +4855,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
     {
         size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
         FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
-            ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), "");
+            ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), "");
     }
     return dictID;
 }
@@ -4361,6 +4871,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
                                const void* dict, size_t dictSize,
                                ZSTD_dictContentType_e dictContentType,
                                ZSTD_dictTableLoadMethod_e dtlm,
+                               ZSTD_tableFillPurpose_e tfp,
                                void* workspace)
 {
     DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize);
@@ -4373,13 +4884,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,

     /* dict restricted modes */
     if (dictContentType == ZSTD_dct_rawContent)
-        return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm);
+        return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp);

     if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) {
         if (dictContentType == ZSTD_dct_auto) {
             DEBUGLOG(4, "raw content dictionary detected");
             return ZSTD_loadDictionaryContent(
-                ms, ls, ws, params, dict, dictSize, dtlm);
+                ms, ls, ws, params, dict, dictSize, dtlm, tfp);
         }
         RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
         assert(0);   /* impossible */
@@ -4387,13 +4898,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,

     /* dict as full zstd dictionary */
     return ZSTD_loadZstdDictionary(
-        bs, ms, ws, params, dict, dictSize, dtlm, workspace);
+        bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace);
 }

 #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB)
 #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL)

 /*! ZSTD_compressBegin_internal() :
+ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both
  * @return : 0, or an error code */
 static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
                                     const void* dict, size_t dictSize,
@@ -4426,11 +4938,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
                 cctx->blockState.prevCBlock, &cctx->blockState.matchState,
                 &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent,
                 cdict->dictContentSize, cdict->dictContentType, dtlm,
-                cctx->entropyWorkspace)
+                ZSTD_tfp_forCCtx, cctx->entropyWorkspace)
              : ZSTD_compress_insertDictionary(
                 cctx->blockState.prevCBlock, &cctx->blockState.matchState,
                 &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize,
-                dictContentType, dtlm, cctx->entropyWorkspace);
+                dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace);
         FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
         assert(dictID <= UINT_MAX);
         cctx->dictID = (U32)dictID;
@@ -4471,11 +4983,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,
                                    &cctxParams, pledgedSrcSize);
 }

-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
+static size_t
+ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
 {
     ZSTD_CCtx_params cctxParams;
-    {
-        ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict);
+    {   ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict);
         ZSTD_CCtxParams_init_internal(&cctxParams, &params, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel);
     }
     DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize);
@@ -4483,9 +4995,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di
                                        &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered);
 }

+size_t
+ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
+{
+    return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel);
+}
+
 size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel)
 {
-    return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel);
+    return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel);
 }


@@ -4537,9 +5055,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize)
     (void)extraCSize;
 }

-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
-                         void* dst, size_t dstCapacity,
-                         const void* src, size_t srcSize)
+size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx,
+                               void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize)
 {
     size_t endResult;
     size_t const cSize = ZSTD_compressContinue_internal(cctx,
@@ -4563,6 +5081,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
     return cSize + endResult;
 }

+/* NOTE: Must just wrap ZSTD_compressEnd_public() */
+size_t ZSTD_compressEnd(ZSTD_CCtx* cctx,
+                        void* dst, size_t dstCapacity,
+                        const void* src, size_t srcSize)
+{
+    return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
+}
+
 size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx,
                                void* dst, size_t dstCapacity,
                                const void* src, size_t srcSize,
@@ -4591,7 +5117,7 @@ size_t ZSTD_compress_advanced_internal(
     FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx,
                          dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
                          params, srcSize, ZSTDb_not_buffered) , "");
-    return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+    return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
 }

 size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx,
@@ -4709,7 +5235,7 @@ static size_t ZSTD_initCDict_internal(
     {   size_t const dictID = ZSTD_compress_insertDictionary(
                 &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace,
                 &params, cdict->dictContent, cdict->dictContentSize,
-                dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace);
+                dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace);
         FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
         assert(dictID <= (size_t)(U32)-1);
         cdict->dictID = (U32)dictID;
@@ -4906,6 +5432,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict(
     params.cParams = cParams;
     params.useRowMatchFinder = useRowMatchFinder;
     cdict->useRowMatchFinder = useRowMatchFinder;
+    cdict->compressionLevel = ZSTD_NO_CLEVEL;

     if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
                                               dict, dictSize,
@@ -4985,12 +5512,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced(

 /* ZSTD_compressBegin_usingCDict() :
  * cdict must be != NULL */
-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
 {
     ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
     return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN);
 }

+size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+{
+    return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict);
+}
+
 /*! ZSTD_compress_usingCDict_internal():
  * Implementation of various ZSTD_compress_usingCDict* functions.
  */
@@ -5000,7 +5532,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx,
                                   const ZSTD_CDict* cdict, ZSTD_frameParameters fParams)
 {
     FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */
-    return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+    return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
 }

 /*! ZSTD_compress_usingCDict_advanced():
@@ -5197,30 +5729,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel)

 static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx)
 {
-    size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos;
-    if (hintInSize==0) hintInSize = cctx->blockSize;
-    return hintInSize;
+    if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
+        return cctx->blockSize - cctx->stableIn_notConsumed;
+    }
+    assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered);
+    {   size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos;
+        if (hintInSize==0) hintInSize = cctx->blockSize;
+        return hintInSize;
+    }
 }

 /* ZSTD_compressStream_generic():
  *  internal function for all *compressStream*() variants
- * non-static, because can be called from zstdmt_compress.c
- * @return : hint size for next input */
+ * @return : hint size for next input to complete ongoing block */
 static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                                           ZSTD_outBuffer* output,
                                           ZSTD_inBuffer* input,
                                           ZSTD_EndDirective const flushMode)
 {
-    const char* const istart = (const char*)input->src;
-    const char* const iend = input->size != 0 ? istart + input->size : istart;
-    const char* ip = input->pos != 0 ? istart + input->pos : istart;
-    char* const ostart = (char*)output->dst;
-    char* const oend = output->size != 0 ? ostart + output->size : ostart;
-    char* op = output->pos != 0 ? ostart + output->pos : ostart;
+    const char* const istart = (assert(input != NULL), (const char*)input->src);
+    const char* const iend = (istart != NULL) ? istart + input->size : istart;
+    const char* ip = (istart != NULL) ? istart + input->pos : istart;
+    char* const ostart = (assert(output != NULL), (char*)output->dst);
+    char* const oend = (ostart != NULL) ? ostart + output->size : ostart;
+    char* op = (ostart != NULL) ? ostart + output->pos : ostart;
     U32 someMoreWork = 1;

     /* check expectations */
-    DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode);
+    DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos);
+    assert(zcs != NULL);
+    if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) {
+        assert(input->pos >= zcs->stableIn_notConsumed);
+        input->pos -= zcs->stableIn_notConsumed;
+        ip -= zcs->stableIn_notConsumed;
+        zcs->stableIn_notConsumed = 0;
+    }
     if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) {
         assert(zcs->inBuff != NULL);
         assert(zcs->inBuffSize > 0);
@@ -5229,8 +5772,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
         assert(zcs->outBuff != NULL);
         assert(zcs->outBuffSize > 0);
     }
-    assert(output->pos <= output->size);
+    if (input->src == NULL) assert(input->size == 0);
     assert(input->pos <= input->size);
+    if (output->dst == NULL) assert(output->size == 0);
+    assert(output->pos <= output->size);
     assert((U32)flushMode <= (U32)ZSTD_e_end);

     while (someMoreWork) {
@@ -5245,7 +5790,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                   || zcs->appliedParams.outBufferMode == ZSTD_bm_stable)  /* OR we are allowed to return dstSizeTooSmall */
               && (zcs->inBuffPos == 0) ) {
                 /* shortcut to compression pass directly into output buffer */
-                size_t const cSize = ZSTD_compressEnd(zcs,
+                size_t const cSize = ZSTD_compressEnd_public(zcs,
                                             op, oend-op, ip, iend-ip);
                 DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize);
                 FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed");
@@ -5262,8 +5807,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                                         zcs->inBuff + zcs->inBuffPos, toLoad,
                                         ip, iend-ip);
                 zcs->inBuffPos += loaded;
-                if (loaded != 0)
-                    ip += loaded;
+                if (ip) ip += loaded;
                 if ( (flushMode == ZSTD_e_continue)
                   && (zcs->inBuffPos < zcs->inBuffTarget) ) {
                     /* not enough input to fill full block : stop here */
@@ -5274,6 +5818,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                     /* empty */
                     someMoreWork = 0; break;
                 }
+            } else {
+                assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable);
+                if ( (flushMode == ZSTD_e_continue)
+                  && ( (size_t)(iend - ip) < zcs->blockSize) ) {
+                    /* can't compress a full block : stop here */
+                    zcs->stableIn_notConsumed = (size_t)(iend - ip);
+                    ip = iend;  /* pretend to have consumed input */
+                    someMoreWork = 0; break;
+                }
+                if ( (flushMode == ZSTD_e_flush)
+                  && (ip == iend) ) {
+                    /* empty */
+                    someMoreWork = 0; break;
+                }
             }
             /* compress current block (note : this stage cannot be stopped in the middle) */
             DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode);
@@ -5281,9 +5839,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                 void* cDst;
                 size_t cSize;
                 size_t oSize = oend-op;
-                size_t const iSize = inputBuffered
-                    ? zcs->inBuffPos - zcs->inToCompress
-                    : MIN((size_t)(iend - ip), zcs->blockSize);
+                size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress
+                                                   : MIN((size_t)(iend - ip), zcs->blockSize);
                 if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable)
                     cDst = op;   /* compress into output buffer, to skip flush stage */
                 else
@@ -5291,9 +5848,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                 if (inputBuffered) {
                     unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend);
                     cSize = lastBlock ?
-                            ZSTD_compressEnd(zcs, cDst, oSize,
+                            ZSTD_compressEnd_public(zcs, cDst, oSize,
                                         zcs->inBuff + zcs->inToCompress, iSize) :
-                            ZSTD_compressContinue(zcs, cDst, oSize,
+                            ZSTD_compressContinue_public(zcs, cDst, oSize,
                                         zcs->inBuff + zcs->inToCompress, iSize);
                     FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed");
                     zcs->frameEnded = lastBlock;
@@ -5306,19 +5863,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                     if (!lastBlock)
                         assert(zcs->inBuffTarget <= zcs->inBuffSize);
                     zcs->inToCompress = zcs->inBuffPos;
-                } else {
-                    unsigned const lastBlock = (ip + iSize == iend);
-                    assert(flushMode == ZSTD_e_end /* Already validated */);
+                } else { /* !inputBuffered, hence ZSTD_bm_stable */
+                    unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend);
                     cSize = lastBlock ?
-                            ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) :
-                            ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize);
+                            ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) :
+                            ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize);
                     /* Consume the input prior to error checking to mirror buffered mode. */
-                    if (iSize > 0)
-                        ip += iSize;
+                    if (ip) ip += iSize;
                     FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed");
                     zcs->frameEnded = lastBlock;
-                    if (lastBlock)
-                        assert(ip == iend);
+                    if (lastBlock) assert(ip == iend);
                 }
                 if (cDst == op) {  /* no need to flush */
                     op += cSize;
@@ -5388,8 +5942,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf
 /* After a compression call set the expected input/output buffer.
  * This is validated at the start of the next compression call.
  */
-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input)
+static void
+ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input)
 {
+    DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)");
     if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
         cctx->expectedInBuffer = *input;
     }
@@ -5408,22 +5964,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx,
 {
     if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
         ZSTD_inBuffer const expect = cctx->expectedInBuffer;
-        if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size)
-            RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!");
-        if (endOp != ZSTD_e_end)
-            RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!");
+        if (expect.src != input->src || expect.pos != input->pos)
+            RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!");
     }
+    (void)endOp;
     if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) {
         size_t const outBufferSize = output->size - output->pos;
         if (cctx->expectedOutBufferSize != outBufferSize)
-            RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!");
+            RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!");
     }
     return 0;
 }

 static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
                                              ZSTD_EndDirective endOp,
-                                             size_t inSize) {
+                                             size_t inSize)
+{
     ZSTD_CCtx_params params = cctx->requestedParams;
     ZSTD_prefixDict const prefixDict = cctx->prefixDict;
     FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */
@@ -5437,9 +5993,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
         params.compressionLevel = cctx->cdict->compressionLevel;
     }
     DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage");
-    if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1;  /* auto-fix pledgedSrcSize */
-    {
-        size_t const dictSize = prefixDict.dict
+    if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1;  /* auto-determine pledgedSrcSize */
+
+    {   size_t const dictSize = prefixDict.dict
                 ? prefixDict.dictSize
                 : (cctx->cdict ? cctx->cdict->dictContentSize : 0);
         ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, &params, cctx->pledgedSrcSizePlusOne - 1);
@@ -5451,6 +6007,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
     params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, &params.cParams);
     params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, &params.cParams);
     params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams);
+    params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences);
+    params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize);
+    params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel);

     {   U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1;
         assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
@@ -5477,6 +6036,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
     return 0;
 }

+/* @return provides a minimum amount of data remaining to be flushed from internal buffers
+ */
 size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
                              ZSTD_outBuffer* output,
                              ZSTD_inBuffer* input,
@@ -5491,8 +6052,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,

     /* transparent initialization stage */
     if (cctx->streamStage == zcss_init) {
-        FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed");
-        ZSTD_setBufferExpectations(cctx, output, input);    /* Set initial buffer expectations now that we've initialized */
+        size_t const inputSize = input->size - input->pos;  /* no obligation to start from pos==0 */
+        size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed;
+        if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */
+          && (endOp == ZSTD_e_continue)                             /* no flush requested, more input to come */
+          && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) {              /* not even reached one block yet */
+            if (cctx->stableIn_notConsumed) {  /* not the first time */
+                /* check stable source guarantees */
+                RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer");
+                RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos");
+            }
+            /* pretend input was consumed, to give a sense forward progress */
+            input->pos = input->size;
+            /* save stable inBuffer, for later control, and flush/end */
+            cctx->expectedInBuffer = *input;
+            /* but actually input wasn't consumed, so keep track of position from where compression shall resume */
+            cctx->stableIn_notConsumed += inputSize;
+            /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */
+            return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format);  /* at least some header to produce */
+        }
+        FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed");
+        ZSTD_setBufferExpectations(cctx, output, input);   /* Set initial buffer expectations now that we've initialized */
     }
     /* end of transparent initialization stage */

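The stableIn_notConsumed logic above lets a caller who promises a stable, in-place-growing source buffer (ZSTD_c_stableInBuffer) present the same buffer repeatedly; zstd defers compression until a full block, or a flush/end, is available. A hedged usage sketch against the public streaming API (error handling elided; ZSTD_c_stableInBuffer requires the ZSTD_STATIC_LINKING_ONLY experimental header section):

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Sketch: present the same stable `src` twice; the first call may only
     * record the input (pos jumps to size without compressing anything). */
    static size_t two_step_stable(ZSTD_CCtx* cctx,
                                  const char* src, size_t half, size_t full,
                                  char* dst, size_t dstCap)
    {
        ZSTD_outBuffer out = { dst, dstCap, 0 };
        ZSTD_inBuffer in = { src, half, 0 };
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_stableInBuffer, 1);
        ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_continue); /* may defer */
        in.size = full;          /* same buffer, caller appended more data */
        ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_end);      /* compress + finish */
        return out.pos;
    }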
@@ -5510,13 +6090,20 @@ size_t ZSTD_compressStream2_simpleArgs (
                       const void* src, size_t srcSize, size_t* srcPos,
                       ZSTD_EndDirective endOp)
 {
-    ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
-    ZSTD_inBuffer  input  = { src, srcSize, *srcPos };
+    ZSTD_outBuffer output;
+    ZSTD_inBuffer  input;
+    output.dst = dst;
+    output.size = dstCapacity;
+    output.pos = *dstPos;
+    input.src = src;
+    input.size = srcSize;
+    input.pos = *srcPos;
     /* ZSTD_compressStream2() will check validity of dstPos and srcPos */
-    size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp);
-    *dstPos = output.pos;
-    *srcPos = input.pos;
-    return cErr;
+    {   size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp);
+        *dstPos = output.pos;
+        *srcPos = input.pos;
+        return cErr;
+    }
 }

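The simpleArgs variant mirrors ZSTD_compressStream2() with plain pointers and offsets, which suits language bindings; positions are read from and written back through dstPos/srcPos. A hedged driver sketch (experimental API, so ZSTD_STATIC_LINKING_ONLY is assumed):

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Sketch: push a whole frame through the simpleArgs entry point. */
    static size_t frame_simpleargs(ZSTD_CCtx* cctx,
                                   void* dst, size_t dstCap,
                                   const void* src, size_t srcSize)
    {
        size_t dstPos = 0, srcPos = 0, ret;
        do {
            ret = ZSTD_compressStream2_simpleArgs(cctx, dst, dstCap, &dstPos,
                                                  src, srcSize, &srcPos, ZSTD_e_end);
            if (ZSTD_isError(ret)) return ret;
        } while (ret != 0);   /* 0 => frame fully flushed */
        return dstPos;        /* compressed size */
    }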
 size_t ZSTD_compress2(ZSTD_CCtx* cctx,
@@ -5539,6 +6126,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx,
         /* Reset to the original values. */
         cctx->requestedParams.inBufferMode = originalInBufferMode;
         cctx->requestedParams.outBufferMode = originalOutBufferMode;
+
         FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed");
         if (result != 0) {  /* compression not completed, due to lack of output space */
             assert(oPos == dstCapacity);
@@ -5549,64 +6137,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx,
     }
 }

-typedef struct {
-    U32 idx;             /* Index in array of ZSTD_Sequence */
-    U32 posInSequence;   /* Position within sequence at idx */
-    size_t posInSrc;     /* Number of bytes given by sequences provided so far */
-} ZSTD_sequencePosition;
-
 /* ZSTD_validateSequence() :
  * @offCode : is presumed to follow format required by ZSTD_storeSeq()
  * @returns a ZSTD error code if sequence is not valid
  */
 static size_t
-ZSTD_validateSequence(U32 offCode, U32 matchLength,
-                      size_t posInSrc, U32 windowLog, size_t dictSize)
+ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch,
+                      size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer)
 {
-    U32 const windowSize = 1 << windowLog;
+    U32 const windowSize = 1u << windowLog;
     /* posInSrc represents the amount of data the decoder would decode up to this point.
     * As long as the amount of data decoded is less than or equal to window size, offsets may be
     * larger than the total length of output decoded in order to reference the dict, even larger than
     * window size. After output surpasses windowSize, we're limited to windowSize offsets again.
     */
     size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize;
-    RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!");
-    RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small");
+    size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 3 : 4;
+    RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!");
+    /* Validate maxNbSeq is large enough for the given matchLength and minMatch */
+    RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch");
     return 0;
 }

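The offset bound above says: a sequence may reference at most min(posInSrc, windowSize) bytes of already-produced output, plus dictSize while still inside the first window. A tiny self-contained check mirroring just that rule (simplified to raw offsets, without the offBase encoding):

    #include <stddef.h>

    /* Sketch of the bound only: raw offset form. */
    static int offset_is_valid(size_t rawOffset, size_t posInSrc,
                               size_t windowSize, size_t dictSize)
    {
        size_t const bound = (posInSrc > windowSize) ? windowSize
                                                     : posInSrc + dictSize;
        /* e.g. posInSrc=100, dictSize=0: offset 100 is valid, 101 is not */
        return rawOffset <= bound;
    }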
 /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */
-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0)
+static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0)
 {
-    U32 offCode = STORE_OFFSET(rawOffset);
+    U32 offBase = OFFSET_TO_OFFBASE(rawOffset);

     if (!ll0 && rawOffset == rep[0]) {
-        offCode = STORE_REPCODE_1;
+        offBase = REPCODE1_TO_OFFBASE;
     } else if (rawOffset == rep[1]) {
-        offCode = STORE_REPCODE(2 - ll0);
+        offBase = REPCODE_TO_OFFBASE(2 - ll0);
     } else if (rawOffset == rep[2]) {
-        offCode = STORE_REPCODE(3 - ll0);
+        offBase = REPCODE_TO_OFFBASE(3 - ll0);
     } else if (ll0 && rawOffset == rep[0] - 1) {
-        offCode = STORE_REPCODE_3;
+        offBase = REPCODE3_TO_OFFBASE;
     }
-    return offCode;
+    return offBase;
 }

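offBase is a single U32 namespace matching the convention quoted earlier in this patch: values 1-3 name repcodes, and 4+ encodes real_offset+3. A stand-alone rendering of the function above, not part of the patch (macro names illustrative):

    typedef unsigned U32;

    #define REP_NUM 3
    #define OFF_TO_OFFBASE(o)  ((o) + REP_NUM)   /* 4+ : real_offset + 3 */
    #define REP_TO_OFFBASE(r)  (r)               /* 1-3 : repcode 1-3   */

    static U32 finalize_offbase(U32 rawOffset, const U32 rep[REP_NUM], U32 ll0)
    {
        if (!ll0 && rawOffset == rep[0])    return REP_TO_OFFBASE(1);
        if (rawOffset == rep[1])            return REP_TO_OFFBASE(2 - ll0);
        if (rawOffset == rep[2])            return REP_TO_OFFBASE(3 - ll0);
        if (ll0 && rawOffset == rep[0] - 1) return REP_TO_OFFBASE(3);
        return OFF_TO_OFFBASE(rawOffset);    /* not a repcode: literal offset */
    }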
-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of
|
|
- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter.
|
|
- */
|
|
-static size_t
|
|
+size_t
|
|
ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
|
|
ZSTD_sequencePosition* seqPos,
|
|
const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
|
|
- const void* src, size_t blockSize)
|
|
+ const void* src, size_t blockSize,
|
|
+ ZSTD_paramSwitch_e externalRepSearch)
|
|
{
|
|
U32 idx = seqPos->idx;
|
|
+ U32 const startIdx = idx;
|
|
BYTE const* ip = (BYTE const*)(src);
|
|
const BYTE* const iend = ip + blockSize;
|
|
repcodes_t updatedRepcodes;
|
|
U32 dictSize;
|
|
|
|
+ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize);
+
    if (cctx->cdict) {
        dictSize = (U32)cctx->cdict->dictContentSize;
    } else if (cctx->prefixDict.dict) {
@@ -5615,25 +6200,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
        dictSize = 0;
    }
    ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
-    for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) {
+    for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) {
        U32 const litLength = inSeqs[idx].litLength;
-        U32 const ll0 = (litLength == 0);
        U32 const matchLength = inSeqs[idx].matchLength;
-        U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
-        ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
+        U32 offBase;

-        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);
+        if (externalRepSearch == ZSTD_ps_disable) {
+            offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset);
+        } else {
+            U32 const ll0 = (litLength == 0);
+            offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
+            ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
+        }
+
+        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
        if (cctx->appliedParams.validateSequences) {
            seqPos->posInSrc += litLength + matchLength;
-            FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc,
-                                                cctx->appliedParams.cParams.windowLog, dictSize),
+            FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
+                                                cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer),
                                                "Sequence validation failed");
        }
-        RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation,
+        RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid,
                        "Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
-        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength);
+        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
        ip += matchLength + litLength;
    }
+
+    /* If we skipped repcode search while parsing, we need to update repcodes now */
+    assert(externalRepSearch != ZSTD_ps_auto);
+    assert(idx >= startIdx);
+    if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) {
+        U32* const rep = updatedRepcodes.rep;
+        U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */
+
+        if (lastSeqIdx >= startIdx + 2) {
+            rep[2] = inSeqs[lastSeqIdx - 2].offset;
+            rep[1] = inSeqs[lastSeqIdx - 1].offset;
+            rep[0] = inSeqs[lastSeqIdx].offset;
+        } else if (lastSeqIdx == startIdx + 1) {
+            rep[2] = rep[0];
+            rep[1] = inSeqs[lastSeqIdx - 1].offset;
+            rep[0] = inSeqs[lastSeqIdx].offset;
+        } else {
+            assert(lastSeqIdx == startIdx);
+            rep[2] = rep[1];
+            rep[1] = rep[0];
+            rep[0] = inSeqs[lastSeqIdx].offset;
+        }
+    }
+
    ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t));

    if (inSeqs[idx].litLength) {
@@ -5642,26 +6257,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
        ip += inSeqs[idx].litLength;
        seqPos->posInSrc += inSeqs[idx].litLength;
    }
-    RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!");
+    RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!");
    seqPos->idx = idx+1;
    return 0;
 }
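The explicit-delimiter format consumed above terminates each block with a sequence whose offset and matchLength are both 0, and whose litLength carries the block's trailing literals. A minimal sketch with hypothetical values (not part of the patch):

    /* One match plus trailing literals, forming a 130-byte block:
     * 100 literals, then a 20-byte match at offset 50, then 10 last literals. */
    const ZSTD_Sequence seqs[] = {
        { 50, 100, 20, 0 },   /* offset, litLength, matchLength, rep (unused) */
        {  0,  10,  0, 0 },   /* offset == 0 : block delimiter */
    };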

-/* Returns the number of bytes to move the current read position back by. Only non-zero
- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something
- * went wrong.
- *
- * This function will attempt to scan through blockSize bytes represented by the sequences
- * in inSeqs, storing any (partial) sequences.
- *
- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to
- * avoid splitting a match, or to avoid splitting a match such that it would produce a match
- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block.
- */
-static size_t
+size_t
 ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
                                   const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
-                                  const void* src, size_t blockSize)
+                                  const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch)
 {
    U32 idx = seqPos->idx;
    U32 startPosInSequence = seqPos->posInSequence;
@@ -5673,6 +6277,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
    U32 bytesAdjustment = 0;
    U32 finalMatchSplit = 0;

+    /* TODO(embg) support fast parsing mode in noBlockDelim mode */
+    (void)externalRepSearch;
+
    if (cctx->cdict) {
        dictSize = cctx->cdict->dictContentSize;
    } else if (cctx->prefixDict.dict) {
@@ -5680,7 +6287,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
    } else {
        dictSize = 0;
    }
-    DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize);
+    DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize);
    DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
    ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
    while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) {
@@ -5688,7 +6295,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
        U32 litLength = currSeq.litLength;
        U32 matchLength = currSeq.matchLength;
        U32 const rawOffset = currSeq.offset;
-        U32 offCode;
+        U32 offBase;

        /* Modify the sequence depending on where endPosInSequence lies */
        if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) {
@@ -5702,7 +6309,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
            /* Move to the next sequence */
            endPosInSequence -= currSeq.litLength + currSeq.matchLength;
            startPosInSequence = 0;
-            idx++;
        } else {
            /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence
               does not reach the end of the match. So, we have to split the sequence */
@@ -5742,21 +6348,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
        }
        /* Check if this offset can be represented with a repcode */
        {   U32 const ll0 = (litLength == 0);
-            offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0);
-            ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
+            offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0);
+            ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
        }

        if (cctx->appliedParams.validateSequences) {
            seqPos->posInSrc += litLength + matchLength;
-            FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc,
-                                                   cctx->appliedParams.cParams.windowLog, dictSize),
+            FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
+                                                   cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer),
                                                   "Sequence validation failed");
        }
-        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);
-        RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation,
+        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
+        RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid,
                        "Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
-        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength);
+        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
        ip += matchLength + litLength;
+        if (!finalMatchSplit)
+            idx++; /* Next Sequence */
    }
    DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
    assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength);
@@ -5779,7 +6387,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*

 typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
                                       const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
-                                      const void* src, size_t blockSize);
+                                      const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
 static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)
 {
    ZSTD_sequenceCopier sequenceCopier = NULL;
@@ -5793,6 +6401,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)
    return sequenceCopier;
 }

+/* Discover the size of next block by searching for the delimiter.
+ * Note that a block delimiter **must** exist in this mode,
+ * otherwise it's an input error.
+ * The block size retrieved will be later compared to ensure it remains within bounds */
+static size_t
+blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos)
+{
+    int end = 0;
+    size_t blockSize = 0;
+    size_t spos = seqPos.idx;
+    DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize);
+    assert(spos <= inSeqsSize);
+    while (spos < inSeqsSize) {
+        end = (inSeqs[spos].offset == 0);
+        blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength;
+        if (end) {
+            if (inSeqs[spos].matchLength != 0)
+                RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0");
+            break;
+        }
+        spos++;
+    }
+    if (!end)
+        RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter");
+    return blockSize;
+}
+
+/* More a "target" block size */
+static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining)
+{
+    int const lastBlock = (remaining <= blockSize);
+    return lastBlock ? remaining : blockSize;
+}
+
+static size_t determine_blockSize(ZSTD_sequenceFormat_e mode,
+                           size_t blockSize, size_t remaining,
+                     const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos)
+{
+    DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining);
+    if (mode == ZSTD_sf_noBlockDelimiters)
+        return blockSize_noDelimiter(blockSize, remaining);
+    {   size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos);
+        FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters");
+        if (explicitBlockSize > blockSize)
+            RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block");
+        if (explicitBlockSize > remaining)
+            RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source");
+        return explicitBlockSize;
+    }
+}
+
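blockSize_explicitDelimiter() above just sums litLength + matchLength up to and including the delimiter entry. A worked example under the hypothetical two-sequence block sketched earlier:

    /* seqs = { {50,100,20,0}, {0,10,0,0} }, seqPos.idx = 0
     * spos 0 : offset 50 != 0 -> blockSize = 100 + 20 = 120, continue
     * spos 1 : offset == 0    -> blockSize = 120 + 10 + 0 = 130, delimiter found, stop
     * determine_blockSize() then rejects the result if it exceeds cctx->blockSize
     * or the remaining source size. */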
 /* Compress, block-by-block, all of the sequences given.
  *
  * Returns the cumulative size of all compressed blocks (including their headers),
@@ -5805,9 +6464,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
                                const void* src, size_t srcSize)
 {
    size_t cSize = 0;
-    U32 lastBlock;
-    size_t blockSize;
-    size_t compressedSeqsSize;
    size_t remaining = srcSize;
    ZSTD_sequencePosition seqPos = {0, 0, 0};

@@ -5827,22 +6483,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
    }

    while (remaining) {
+        size_t compressedSeqsSize;
        size_t cBlockSize;
        size_t additionalByteAdjustment;
-        lastBlock = remaining <= cctx->blockSize;
-        blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize;
+        size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters,
+                                        cctx->blockSize, remaining,
+                                        inSeqs, inSeqsSize, seqPos);
+        U32 const lastBlock = (blockSize == remaining);
+        FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size");
+        assert(blockSize <= remaining);
        ZSTD_resetSeqStore(&cctx->seqStore);
-        DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize);
+        DEBUGLOG(5, "Working on new block. Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize);

-        additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize);
+        additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes);
        FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy");
        blockSize -= additionalByteAdjustment;

        /* If blocks are too small, emit as a nocompress block */
-        if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) {
+        /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
+         * additional 1. We need to revisit and change this logic to be more consistent */
+        if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) {
            cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
            FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed");
-            DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize);
+            DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize);
            cSize += cBlockSize;
            ip += blockSize;
            op += cBlockSize;
@@ -5851,6 +6514,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
            continue;
        }

+        RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block");
        compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore,
                               &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy,
                               &cctx->appliedParams,
@@ -5859,11 +6523,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
                               cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
                               cctx->bmi2);
        FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed");
-        DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize);
+        DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize);

        if (!cctx->isFirstBlock &&
            ZSTD_maybeRLE(&cctx->seqStore) &&
-            ZSTD_isRLE((BYTE const*)src, srcSize)) {
+            ZSTD_isRLE(ip, blockSize)) {
            /* We don't want to emit our first block as a RLE even if it qualifies because
             * doing so will cause the decoder (cli only) to throw a "should consume all input error."
             * This is only an issue for zstd <= v1.4.3
@@ -5874,12 +6538,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
        if (compressedSeqsSize == 0) {
            /* ZSTD_noCompressBlock writes the block header as well */
            cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
-            FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed");
-            DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize);
+            FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed");
+            DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize);
        } else if (compressedSeqsSize == 1) {
            cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock);
-            FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed");
-            DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize);
+            FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed");
+            DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize);
        } else {
            U32 cBlockHeader;
            /* Error checking and repcodes update */
@@ -5891,11 +6555,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
            cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3);
            MEM_writeLE24(op, cBlockHeader);
            cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize;
-            DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize);
+            DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize);
        }

        cSize += cBlockSize;
-        DEBUGLOG(4, "cSize running total: %zu", cSize);

        if (lastBlock) {
            break;
@@ -5906,12 +6569,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
            dstCapacity -= cBlockSize;
            cctx->isFirstBlock = 0;
        }
+        DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity);
    }

+    DEBUGLOG(4, "cSize final total: %zu", cSize);
    return cSize;
 }

-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity,
+size_t ZSTD_compressSequences(ZSTD_CCtx* cctx,
+                              void* dst, size_t dstCapacity,
                              const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
                              const void* src, size_t srcSize)
 {
@@ -5921,7 +6587,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci
    size_t frameHeaderSize = 0;

    /* Transparent initialization stage, same as compressStream2() */
-    DEBUGLOG(3, "ZSTD_compressSequences()");
+    DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity);
    assert(cctx != NULL);
    FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed");
    /* Begin writing output, starting with frame header */
@@ -5949,26 +6615,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci
        cSize += 4;
    }

-    DEBUGLOG(3, "Final compressed size: %zu", cSize);
+    DEBUGLOG(4, "Final compressed size: %zu", cSize);
    return cSize;
 }

 /*====== Finalize ======*/

+static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs)
+{
+    const ZSTD_inBuffer nullInput = { NULL, 0, 0 };
+    const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable);
+    return stableInput ? zcs->expectedInBuffer : nullInput;
+}
+
 /*! ZSTD_flushStream() :
 * @return : amount of data remaining to flush */
 size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
 {
-    ZSTD_inBuffer input = { NULL, 0, 0 };
+    ZSTD_inBuffer input = inBuffer_forEndFlush(zcs);
+    input.size = input.pos; /* do not ingest more input during flush */
    return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush);
 }


 size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
 {
-    ZSTD_inBuffer input = { NULL, 0, 0 };
+    ZSTD_inBuffer input = inBuffer_forEndFlush(zcs);
    size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end);
-    FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed");
+    FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed");
    if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush;   /* minimal estimation */
    /* single thread mode : attempt to calculate remaining to flush more precisely */
    {   size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE;
@@ -6090,7 +6764,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel,
        cp.targetLength = (unsigned)(-clampedCompressionLevel);
    }
    /* refine parameters based on srcSize & dictSize */
-    return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode);
+    return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto);
    }
 }

@@ -6125,3 +6799,21 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH
    if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
    return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown);
 }
+
+void ZSTD_registerSequenceProducer(
+  ZSTD_CCtx* zc, void* mState,
+  ZSTD_sequenceProducer_F* mFinder
+) {
+  if (mFinder != NULL) {
+    ZSTD_externalMatchCtx emctx;
+    emctx.mState = mState;
+    emctx.mFinder = mFinder;
+    emctx.seqBuffer = NULL;
+    emctx.seqBufferCapacity = 0;
+    zc->externalMatchCtx = emctx;
+    zc->requestedParams.useSequenceProducer = 1;
+  } else {
+    ZSTD_memset(&zc->externalMatchCtx, 0, sizeof(zc->externalMatchCtx));
+    zc->requestedParams.useSequenceProducer = 0;
+  }
+}
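A caller-side sketch of the registration entry point added above, assuming the public ZSTD_sequenceProducer_F signature and ZSTD_SEQUENCE_PRODUCER_ERROR sentinel from upstream zstd.h (hypothetical producer, shown only for illustration):

    static size_t myProducer(void* state,
                             ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
                             const void* src, size_t srcSize,
                             const void* dict, size_t dictSize,
                             int compressionLevel, size_t windowSize)
    {
        (void)state; (void)outSeqs; (void)outSeqsCapacity; (void)src; (void)srcSize;
        (void)dict; (void)dictSize; (void)compressionLevel; (void)windowSize;
        return ZSTD_SEQUENCE_PRODUCER_ERROR;  /* decline; fall back if enabled */
    }

    /* ZSTD_CCtx* cctx = ZSTD_createCCtx();
     * ZSTD_registerSequenceProducer(cctx, NULL, myProducer);
     * ...passing mFinder == NULL later clears the registration again. */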
diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h
index 71697a11ae30..899f5e2de8e9 100644
--- a/lib/zstd/compress/zstd_compress_internal.h
+++ b/lib/zstd/compress/zstd_compress_internal.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -20,6 +21,7 @@
 ***************************************/
 #include "../common/zstd_internal.h"
 #include "zstd_cwksp.h"
+#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */


 /*-*************************************
@@ -111,12 +113,13 @@ typedef struct {
 /* ZSTD_buildBlockEntropyStats() :
 *  Builds entropy for the block.
 *  @return : 0 on success or error code */
-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
-                             const ZSTD_entropyCTables_t* prevEntropy,
-                                   ZSTD_entropyCTables_t* nextEntropy,
-                             const ZSTD_CCtx_params* cctxParams,
-                                   ZSTD_entropyCTablesMetadata_t* entropyMetadata,
-                                   void* workspace, size_t wkspSize);
+size_t ZSTD_buildBlockEntropyStats(
+                    const seqStore_t* seqStorePtr,
+                    const ZSTD_entropyCTables_t* prevEntropy,
+                          ZSTD_entropyCTables_t* nextEntropy,
+                    const ZSTD_CCtx_params* cctxParams,
+                          ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                          void* workspace, size_t wkspSize);

 /* *******************************
 * Compression internals structs *
@@ -142,6 +145,12 @@ typedef struct {
    size_t capacity;     /* The capacity starting from `seq` pointer */
 } rawSeqStore_t;

+typedef struct {
+    U32 idx;            /* Index in array of ZSTD_Sequence */
+    U32 posInSequence;  /* Position within sequence at idx */
+    size_t posInSrc;    /* Number of bytes given by sequences provided so far */
+} ZSTD_sequencePosition;
+
 UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0};

 typedef struct {
@@ -212,8 +221,10 @@ struct ZSTD_matchState_t {
    U32 hashLog3;           /* dispatch table for matches of len==3 : larger == faster, more memory */

    U32 rowHashLog;                          /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/
-    U16* tagTable;                           /* For row-based matchFinder: A row-based table containing the hashes and head index. */
+    BYTE* tagTable;                          /* For row-based matchFinder: A row-based table containing the hashes and head index. */
    U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */
+    U64 hashSalt;                            /* For row-based matchFinder: salts the hash for re-use of tag table */
+    U32 hashSaltEntropy;                     /* For row-based matchFinder: collects entropy for salt generation */

    U32* hashTable;
    U32* hashTable3;
@@ -228,6 +239,18 @@ struct ZSTD_matchState_t {
    const ZSTD_matchState_t* dictMatchState;
    ZSTD_compressionParameters cParams;
    const rawSeqStore_t* ldmSeqStore;
+
+    /* Controls prefetching in some dictMatchState matchfinders.
+     * This behavior is controlled from the cctx ms.
+     * This parameter has no effect in the cdict ms. */
+    int prefetchCDictTables;
+
+    /* When == 0, lazy match finders insert every position.
+     * When != 0, lazy match finders only insert positions they search.
+     * This allows them to skip much faster over incompressible data,
+     * at a small cost to compression ratio.
+     */
+    int lazySkipping;
 };

 typedef struct {
@@ -324,6 +347,24 @@ struct ZSTD_CCtx_params_s {

    /* Internal use, for createCCtxParams() and freeCCtxParams() only */
    ZSTD_customMem customMem;
+
+    /* Controls prefetching in some dictMatchState matchfinders */
+    ZSTD_paramSwitch_e prefetchCDictTables;
+
+    /* Controls whether zstd will fall back to an internal matchfinder
+     * if the external matchfinder returns an error code. */
+    int enableMatchFinderFallback;
+
+    /* Indicates whether an external matchfinder has been referenced.
+     * Users can't set this externally.
+     * It is set internally in ZSTD_registerSequenceProducer(). */
+    int useSequenceProducer;
+
+    /* Adjust the max block size*/
+    size_t maxBlockSize;
+
+    /* Controls repcode search in external sequence parsing */
+    ZSTD_paramSwitch_e searchForExternalRepcodes;
 };  /* typedef'd to ZSTD_CCtx_params within "zstd.h" */

 #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2))
@@ -355,6 +396,14 @@ typedef struct {
    ZSTD_entropyCTablesMetadata_t entropyMetadata;
 } ZSTD_blockSplitCtx;

+/* Context for block-level external matchfinder API */
+typedef struct {
+  void* mState;
+  ZSTD_sequenceProducer_F* mFinder;
+  ZSTD_Sequence* seqBuffer;
+  size_t seqBufferCapacity;
+} ZSTD_externalMatchCtx;
+
 struct ZSTD_CCtx_s {
    ZSTD_compressionStage_e stage;
    int cParamsChanged;                  /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */
@@ -404,6 +453,7 @@ struct ZSTD_CCtx_s {

    /* Stable in/out buffer verification */
    ZSTD_inBuffer expectedInBuffer;
+    size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */
    size_t expectedOutBufferSize;

    /* Dictionary */
@@ -417,9 +467,13 @@ struct ZSTD_CCtx_s {

    /* Workspace for block splitter */
    ZSTD_blockSplitCtx blockSplitCtx;
+
+    /* Workspace for external matchfinder */
+    ZSTD_externalMatchCtx externalMatchCtx;
 };

 typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e;
+typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e;

 typedef enum {
    ZSTD_noDict = 0,
@@ -441,7 +495,7 @@ typedef enum {
                                  * In this mode we take both the source size and the dictionary size
                                  * into account when selecting and adjusting the parameters.
                                  */
-    ZSTD_cpm_unknown = 3,        /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams.
+    ZSTD_cpm_unknown = 3         /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams.
                                  * We don't know what these parameters are for. We default to the legacy
                                  * behavior of taking both the source size and the dict size into account
                                  * when selecting and adjusting parameters.
@@ -500,9 +554,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value)
 /* ZSTD_noCompressBlock() :
 * Writes uncompressed block to dst buffer from given src.
 * Returns the size of the block */
-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
+MEM_STATIC size_t
+ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
 {
    U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3);
+    DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity);
    RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity,
                    dstSize_tooSmall, "dst buf too small for uncompressed block");
    MEM_writeLE24(dst, cBlockHeader24);
@@ -510,7 +566,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi
    return ZSTD_blockHeaderSize + srcSize;
 }

-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock)
+MEM_STATIC size_t
+ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock)
 {
    BYTE* const op = (BYTE*)dst;
    U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3);
@@ -529,7 +586,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat)
 {
    U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6;
    ZSTD_STATIC_ASSERT(ZSTD_btultra == 8);
-    assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat));
+    assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat));
    return (srcSize >> minlog) + 2;
 }

@@ -565,29 +622,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con
    while (ip < iend) *op++ = *ip++;
 }

-#define ZSTD_REP_MOVE     (ZSTD_REP_NUM-1)
-#define STORE_REPCODE_1 STORE_REPCODE(1)
-#define STORE_REPCODE_2 STORE_REPCODE(2)
-#define STORE_REPCODE_3 STORE_REPCODE(3)
-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1)
-#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE)
-#define STORED_IS_OFFSET(o)  ((o) > ZSTD_REP_MOVE)
-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE)
-#define STORED_OFFSET(o)  (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE)
-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1)  /* returns ID 1,2,3 */
-#define STORED_TO_OFFBASE(o) ((o)+1)
-#define OFFBASE_TO_STORED(o) ((o)-1)
+
+#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1)
+#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2)
+#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3)
+#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */
+#define OFFSET_TO_OFFBASE(o)  (assert((o)>0), o + ZSTD_REP_NUM)
+#define OFFBASE_IS_OFFSET(o)  ((o) > ZSTD_REP_NUM)
+#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM)
+#define OFFBASE_TO_OFFSET(o)  (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM)
+#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o))  /* returns ID 1,2,3 */

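With ZSTD_REP_NUM == 3, the offBase encoding above packs both cases into one integer: repcode IDs 1..3 pass through unchanged, and real offsets are shifted up by 3. A short worked sketch of the resulting values:

    /* OFFSET_TO_OFFBASE(1)   == 4   (smallest real offset)
     * OFFSET_TO_OFFBASE(50)  == 53
     * REPCODE_TO_OFFBASE(2)  == 2
     * OFFBASE_IS_REPCODE(2)  == 1,  OFFBASE_IS_OFFSET(2) == 0
     * OFFBASE_TO_OFFSET(53)  == 50, OFFBASE_TO_REPCODE(3) == 3 */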
/*! ZSTD_storeSeq() :
- * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t.
- * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET().
+ * Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t.
+ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE().
 * @matchLength : must be >= MINMATCH
- * Allowed to overread literals up to litLimit.
+ * Allowed to over-read literals up to litLimit.
 */
 HINT_INLINE UNUSED_ATTR void
 ZSTD_storeSeq(seqStore_t* seqStorePtr,
              size_t litLength, const BYTE* literals, const BYTE* litLimit,
-              U32 offBase_minus1,
+              U32 offBase,
              size_t matchLength)
 {
    BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH;
@@ -596,8 +651,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
    static const BYTE* g_start = NULL;
    if (g_start==NULL) g_start = (const BYTE*)literals;  /* note : index only works for compression within a single segment */
    {   U32 const pos = (U32)((const BYTE*)literals - g_start);
-        DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u",
-               pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1);
+        DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u",
+               pos, (U32)litLength, (U32)matchLength, (U32)offBase);
    }
 #endif
    assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq);
@@ -607,9 +662,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
    assert(literals + litLength <= litLimit);
    if (litEnd <= litLimit_w) {
        /* Common case we can use wildcopy.
-	 * First copy 16 bytes, because literals are likely short.
-	 */
-        assert(WILDCOPY_OVERLENGTH >= 16);
+         * First copy 16 bytes, because literals are likely short.
+         */
+        ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16);
        ZSTD_copy16(seqStorePtr->lit, literals);
        if (litLength > 16) {
            ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap);
@@ -628,7 +683,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
    seqStorePtr->sequences[0].litLength = (U16)litLength;

    /* match offset */
-    seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1);
+    seqStorePtr->sequences[0].offBase = offBase;

    /* match Length */
    assert(matchLength >= MINMATCH);
@@ -646,17 +701,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,

 /* ZSTD_updateRep() :
 * updates in-place @rep (array of repeat offsets)
- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq()
+ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq()
 */
 MEM_STATIC void
-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0)
+ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0)
 {
-    if (STORED_IS_OFFSET(offBase_minus1)) {  /* full offset */
+    if (OFFBASE_IS_OFFSET(offBase)) {  /* full offset */
        rep[2] = rep[1];
        rep[1] = rep[0];
-        rep[0] = STORED_OFFSET(offBase_minus1);
+        rep[0] = OFFBASE_TO_OFFSET(offBase);
    } else {   /* repcode */
-        U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0;
+        U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0;
        if (repCode > 0) {  /* note : if repCode==0, no change */
            U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
            rep[2] = (repCode >= 2) ? rep[1] : rep[2];
@@ -673,11 +728,11 @@ typedef struct repcodes_s {
 } repcodes_t;

 MEM_STATIC repcodes_t
-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0)
+ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0)
 {
    repcodes_t newReps;
    ZSTD_memcpy(&newReps, rep, sizeof(newReps));
-    ZSTD_updateRep(newReps.rep, offBase_minus1, ll0);
+    ZSTD_updateRep(newReps.rep, offBase, ll0);
    return newReps;
 }

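The repcode history updated above behaves like a 3-entry MRU list. A worked trace of ZSTD_updateRep(), starting from a hypothetical history {rep0=1, rep1=4, rep2=8}:

    /* store real offset 50 (OFFBASE_IS_OFFSET) -> {50, 1, 4}  (shift, new offset in front)
     * then repcode 2, ll0 == 0 (repCode = 1)   -> {1, 50, 4}  (swap rep[1] to front)
     * then repcode 1, ll0 == 0 (repCode = 0)   -> {1, 50, 4}  (no change) */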
@@ -685,59 +740,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0
 /*-*************************************
 *  Match length counter
 ***************************************/
-static unsigned ZSTD_NbCommonBytes (size_t val)
-{
-    if (MEM_isLittleEndian()) {
-        if (MEM_64bits()) {
-#       if (__GNUC__ >= 4)
-            return (__builtin_ctzll((U64)val) >> 3);
-#       else
-            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2,
-                                                     0, 3, 1, 3, 1, 4, 2, 7,
-                                                     0, 2, 3, 6, 1, 5, 3, 5,
-                                                     1, 3, 4, 4, 2, 5, 6, 7,
-                                                     7, 0, 1, 2, 3, 3, 4, 6,
-                                                     2, 6, 5, 5, 3, 4, 5, 6,
-                                                     7, 1, 2, 4, 6, 4, 4, 5,
-                                                     7, 2, 6, 5, 7, 6, 7, 7 };
-            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
-#       endif
-        } else { /* 32 bits */
-#       if (__GNUC__ >= 3)
-            return (__builtin_ctz((U32)val) >> 3);
-#       else
-            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0,
-                                                     3, 2, 2, 1, 3, 2, 0, 1,
-                                                     3, 3, 1, 2, 2, 2, 2, 0,
-                                                     3, 1, 2, 0, 1, 0, 1, 1 };
-            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
-#       endif
-        }
-    } else {  /* Big Endian CPU */
-        if (MEM_64bits()) {
-#       if (__GNUC__ >= 4)
-            return (__builtin_clzll(val) >> 3);
-#       else
-            unsigned r;
-            const unsigned n32 = sizeof(size_t)*4;   /* calculate this way due to compiler complaining in 32-bits mode */
-            if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
-            if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
-            r += (!val);
-            return r;
-#       endif
-        } else { /* 32 bits */
-#       if (__GNUC__ >= 3)
-            return (__builtin_clz((U32)val) >> 3);
-#       else
-            unsigned r;
-            if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
-            r += (!val);
-            return r;
-#       endif
-    }   }
-}
-
-
 MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit)
 {
    const BYTE* const pStart = pIn;
@@ -783,32 +785,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
 *  Hashes
 ***************************************/
 static const U32 prime3bytes = 506832829U;
-static U32    ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes)  >> (32-h) ; }
-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
+static U32    ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s)  >> (32-h) ; }
+MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */
+MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); }

 static const U32 prime4bytes = 2654435761U;
-static U32    ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; }
-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); }
+static U32    ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; }
+static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); }
+static size_t ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); }

 static const U64 prime5bytes = 889523592379ULL;
-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u  << (64-40)) * prime5bytes) >> (64-h)) ; }
-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); }
+static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u  << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); }

 static const U64 prime6bytes = 227718039650203ULL;
-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u  << (64-48)) * prime6bytes) >> (64-h)) ; }
-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
+static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u  << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); }

 static const U64 prime7bytes = 58295818150454627ULL;
-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u  << (64-56)) * prime7bytes) >> (64-h)) ; }
-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); }
+static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u  << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); }

 static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
+static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); }
+

 MEM_STATIC FORCE_INLINE_ATTR
 size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
 {
+    /* Although some of these hashes do support hBits up to 64, some do not.
+     * To be on the safe side, always avoid hBits > 32. */
+    assert(hBits <= 32);
+
    switch(mls)
    {
    default:
@@ -820,6 +833,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
    }
 }

+MEM_STATIC FORCE_INLINE_ATTR
+size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) {
+    /* Although some of these hashes do support hBits up to 64, some do not.
+     * To be on the safe side, always avoid hBits > 32. */
+    assert(hBits <= 32);
+
+    switch(mls)
+    {
+    default:
+    case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt);
+    case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt);
+    case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt);
+    case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt);
+    case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt);
+    }
+}
+
+
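The new salt parameter lets the row-based matchfinder reuse a tag table across compressions: identical input hashed with a different salt lands in different slots. A hedged sketch of the relationship between the plain and salted entry points:

    /* For any p, hBits <= 32, mls in 4..8 :
     *   ZSTD_hashPtr(p, hBits, mls) == ZSTD_hashPtrSalted(p, hBits, mls, 0)
     * and a non-zero salt (e.g. ms->hashSalt) generally relocates the slot
     * while staying within the (1 << hBits) table. */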
/* ZSTD_ipow() :
 * Return base^exponent.
 */
@@ -1167,10 +1198,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window,
                (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd);
    assert(blockEndIdx >= loadedDictEnd);

-    if (blockEndIdx > loadedDictEnd + maxDist) {
+    if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) {
        /* On reaching window size, dictionaries are invalidated.
         * For simplification, if window size is reached anywhere within next block,
         * the dictionary is invalidated for the full block.
+         *
+         * We also have to invalidate the dictionary if ZSTD_window_update() has detected
+         * non-contiguous segments, which means that loadedDictEnd != window->dictLimit.
+         * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use
+         * dictMatchState, so setting it to NULL is not a problem.
         */
        DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)");
        *loadedDictEndPtr = 0;
@@ -1302,6 +1338,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)

 #endif

+/* Short Cache */
+
+/* Normally, zstd matchfinders follow this flow:
+ *     1. Compute hash at ip
+ *     2. Load index from hashTable[hash]
+ *     3. Check if *ip == *(base + index)
+ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss.
+ *
+ * Short cache is an optimization which allows us to avoid step 3 most of the time
+ * when the data doesn't actually match. With short cache, the flow becomes:
+ *     1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip.
+ *     2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works.
+ *     3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue.
+ *
+ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to
+ * dictMatchState matchfinders.
+ */
+#define ZSTD_SHORT_CACHE_TAG_BITS 8
+#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1)
+
+/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable.
+ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */
+MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) {
+    size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
+    U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK);
+    assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0);
+    hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag;
+}
+
+/* Helper function for short cache matchfinders.
+ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */
+MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) {
+    U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK;
+    U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK;
+    return tag1 == tag2;
+}


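A worked example of the tag packing used above, with ZSTD_SHORT_CACHE_TAG_BITS == 8 (hypothetical values):

    U32 hashTable[1] = {0};
    /* hashAndTag: hash in the upper bits, 8-bit tag 0xAB in the lower bits */
    ZSTD_writeTaggedIndex(hashTable, (0u << 8) | 0xAB, 0x00123456);
    /* hashTable[0] == (0x00123456 << 8) | 0xAB == 0x123456AB
     * A later lookup only dereferences the index when
     * ZSTD_comparePackedTags(hashAndTag, hashTable[0]) sees matching low bytes. */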
/* ===============================================================
@@ -1396,4 +1468,51 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat);
 */
 void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize);

+/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of
+ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter.
+ * Note that the block delimiter must include the last literals of the block.
+ */
+size_t
+ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
+        ZSTD_sequencePosition* seqPos,
+        const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+        const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
+
+/* Returns the number of bytes to move the current read position back by.
+ * Only non-zero if we ended up splitting a sequence.
+ * Otherwise, it may return a ZSTD error if something went wrong.
+ *
+ * This function will attempt to scan through blockSize bytes
+ * represented by the sequences in @inSeqs,
+ * storing any (partial) sequences.
+ *
+ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to
+ * avoid splitting a match, or to avoid splitting a match such that it would produce a match
+ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block.
+ */
+size_t
+ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
+        const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+        const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
+
+
+/* ===============================================================
+ * Deprecated definitions that are still used internally to avoid
+ * deprecation warnings. These functions are exactly equivalent to
+ * their public variants, but avoid the deprecation warnings.
+ * =============================================================== */
+
+size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+
+size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx,
+                                    void* dst, size_t dstCapacity,
+                                    const void* src, size_t srcSize);
+
+size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx,
+                               void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
 #endif /* ZSTD_COMPRESS_H */
diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c
index 52b0a8059aba..3e9ea46a670a 100644
--- a/lib/zstd/compress/zstd_compress_literals.c
+++ b/lib/zstd/compress/zstd_compress_literals.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -13,11 +14,36 @@
 ***************************************/
 #include "zstd_compress_literals.h"

+
+/* **************************************************************
+*  Debug Traces
+****************************************************************/
+#if DEBUGLEVEL >= 2
+
+static size_t showHexa(const void* src, size_t srcSize)
+{
+    const BYTE* const ip = (const BYTE*)src;
+    size_t u;
+    for (u=0; u<srcSize; u++) {
+        RAWLOG(5, " %02X", ip[u]); (void)ip;
+    }
+    RAWLOG(5, " \n");
+    return srcSize;
+}
+
+#endif
+
+
+/* **************************************************************
+*  Literals compression - special cases
+****************************************************************/
 size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
    BYTE* const ostart = (BYTE*)dst;
    U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);

+    DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity);
+
    RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, "");

    switch(flSize)
@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src,
    }

    ZSTD_memcpy(ostart + flSize, src, srcSize);
-    DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
+    DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
    return srcSize + flSize;
 }

+static int allBytesIdentical(const void* src, size_t srcSize)
+{
+    assert(srcSize >= 1);
+    assert(src != NULL);
+    {   const BYTE b = ((const BYTE*)src)[0];
+        size_t p;
+        for (p=1; p<srcSize; p++) {
+            if (((const BYTE*)src)[p] != b) return 0;
+        }
+        return 1;
+    }
+}
+
 size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
    BYTE* const ostart = (BYTE*)dst;
    U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);

-    (void)dstCapacity;  /* dstCapacity already guaranteed to be >=4, hence large enough */
+    assert(dstCapacity >= 4); (void)dstCapacity;
+    assert(allBytesIdentical(src, srcSize));

    switch(flSize)
    {
@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void*
    }

    ostart[flSize] = *(const BYTE*)src;
-    DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1);
+    DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1);
    return flSize+1;
 }

-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
-                              ZSTD_hufCTables_t* nextHuf,
-                              ZSTD_strategy strategy, int disableLiteralCompression,
-                              void* dst, size_t dstCapacity,
-                              const void* src, size_t srcSize,
-                              void* entropyWorkspace, size_t entropyWorkspaceSize,
-                              const int bmi2,
-                              unsigned suspectUncompressible)
+/* ZSTD_minLiteralsToCompress() :
+ * returns minimal amount of literals
+ * for literal compression to even be attempted.
+ * Minimum is made tighter as compression strategy increases.
+ */
+static size_t
+ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat)
+{
+    assert((int)strategy >= 0);
+    assert((int)strategy <= 9);
+    /* btultra2 : min 8 bytes;
+     * then 2x larger for each successive compression strategy
+     * max threshold 64 bytes */
+    {   int const shift = MIN(9-(int)strategy, 3);
+        size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift;
+        DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc);
+        return mintc;
+    }
+}
+
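Worked thresholds from ZSTD_minLiteralsToCompress() above, when no reusable Huffman table is available (huf_repeat != HUF_repeat_valid):

    /* strategy 9 (btultra2) : shift = MIN(9-9, 3) = 0 -> 8 << 0 = 8 bytes
     * strategy 8 (btultra)  : shift = 1              -> 8 << 1 = 16 bytes
     * strategy 7 (btopt)    : shift = 2              -> 8 << 2 = 32 bytes
     * strategy <= 6         : shift capped at 3      -> 8 << 3 = 64 bytes
     * with HUF_repeat_valid, the minimum drops to 6 bytes in all cases */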
+size_t ZSTD_compressLiterals (
|
|
+ void* dst, size_t dstCapacity,
|
|
+ const void* src, size_t srcSize,
|
|
+ void* entropyWorkspace, size_t entropyWorkspaceSize,
|
|
+ const ZSTD_hufCTables_t* prevHuf,
|
|
+ ZSTD_hufCTables_t* nextHuf,
|
|
+ ZSTD_strategy strategy,
|
|
+ int disableLiteralCompression,
|
|
+ int suspectUncompressible,
|
|
+ int bmi2)
|
|
{
|
|
- size_t const minGain = ZSTD_minGain(srcSize, strategy);
|
|
size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
|
|
BYTE* const ostart = (BYTE*)dst;
|
|
U32 singleStream = srcSize < 256;
|
|
symbolEncodingType_e hType = set_compressed;
|
|
size_t cLitSize;
|
|
|
|
- DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)",
|
|
- disableLiteralCompression, (U32)srcSize);
|
|
+ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)",
|
|
+ disableLiteralCompression, (U32)srcSize, dstCapacity);
|
|
+
|
|
+ DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize));
|
|
|
|
/* Prepare nextEntropy assuming reusing the existing table */
|
|
ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
|
|
@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
|
|
if (disableLiteralCompression)
|
|
return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
|
|
|
|
- /* small ? don't even attempt compression (speed opt) */
|
|
-# define COMPRESS_LITERALS_SIZE_MIN 63
|
|
- { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
|
|
- if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
|
|
- }
|
|
+ /* if too small, don't even attempt compression (speed opt) */
|
|
+ if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode))
|
|
+ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
|
|
|
|
RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression");
|
|
{ HUF_repeat repeat = prevHuf->repeatMode;
|
|
- int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0;
|
|
+ int const flags = 0
|
|
+ | (bmi2 ? HUF_flags_bmi2 : 0)
|
|
+ | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0)
|
|
+ | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0)
|
|
+ | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0);
|
|
+
|
|
+ typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int);
|
|
+ huf_compress_f huf_compress;
|
|
if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;
|
|
- cLitSize = singleStream ?
|
|
- HUF_compress1X_repeat(
|
|
- ostart+lhSize, dstCapacity-lhSize, src, srcSize,
|
|
- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
|
|
- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) :
|
|
- HUF_compress4X_repeat(
|
|
- ostart+lhSize, dstCapacity-lhSize, src, srcSize,
|
|
- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
|
|
- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible);
|
|
+ huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat;
|
|
+ cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize,
|
|
+ src, srcSize,
|
|
+ HUF_SYMBOLVALUE_MAX, LitHufLog,
|
|
+ entropyWorkspace, entropyWorkspaceSize,
|
|
+ (HUF_CElt*)nextHuf->CTable,
|
|
+ &repeat, flags);
|
|
+ DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize);
|
|
if (repeat != HUF_repeat_none) {
|
|
/* reused the existing table */
|
|
- DEBUGLOG(5, "Reusing previous huffman table");
|
|
+ DEBUGLOG(5, "reusing statistics from previous huffman block");
|
|
hType = set_repeat;
|
|
}
|
|
}
|
|
|
|
- if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {
|
|
- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
|
|
- return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
|
|
- }
|
|
+ { size_t const minGain = ZSTD_minGain(srcSize, strategy);
|
|
+ if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {
|
|
+ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
|
|
+ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
|
|
+ } }
|
|
if (cLitSize==1) {
|
|
- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
|
|
- return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
|
|
- }
|
|
+ /* A return value of 1 signals that the alphabet consists of a single symbol.
|
|
+ * However, in some rare circumstances, it could be the compressed size (a single byte).
|
|
+ * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`.
|
|
+ * (it's also necessary to not generate statistics).
|
|
+ * Therefore, in such a case, actively check that all bytes are identical. */
|
|
+ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) {
|
|
+ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
|
|
+ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
|
|
+ } }
|
|
|
|
if (hType == set_compressed) {
|
|
/* using a newly constructed table */
|
|
@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
|
|
switch(lhSize)
|
|
{
|
|
case 3: /* 2 - 2 - 10 - 10 */
|
|
- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
|
|
+ if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
|
|
+ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
|
|
MEM_writeLE24(ostart, lhc);
|
|
break;
|
|
}
|
|
case 4: /* 2 - 2 - 14 - 14 */
|
|
+ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
|
|
{ U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18);
|
|
MEM_writeLE32(ostart, lhc);
|
|
break;
|
|
}
|
|
case 5: /* 2 - 2 - 18 - 18 */
|
|
+ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
|
|
{ U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22);
|
|
MEM_writeLE32(ostart, lhc);
|
|
ostart[4] = (BYTE)(cLitSize >> 10);
|
|
diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h
|
|
index 9775fb97cb70..a2a85d6b69e5 100644
|
|
--- a/lib/zstd/compress/zstd_compress_literals.h
|
|
+++ b/lib/zstd/compress/zstd_compress_literals.h
|
|
@@ -1,5 +1,6 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
|
|
/*
|
|
- * Copyright (c) Yann Collet, Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
* All rights reserved.
|
|
*
|
|
* This source code is licensed under both the BSD-style license (found in the
|
|
@@ -16,16 +17,24 @@
|
|
|
|
size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
|
|
|
|
+/* ZSTD_compressRleLiteralsBlock() :
|
|
+ * Conditions :
|
|
+ * - All bytes in @src are identical
|
|
+ * - dstCapacity >= 4 */
|
|
size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
|
|
|
|
-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
|
|
-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
|
|
- ZSTD_hufCTables_t* nextHuf,
|
|
- ZSTD_strategy strategy, int disableLiteralCompression,
|
|
- void* dst, size_t dstCapacity,
|
|
+/* ZSTD_compressLiterals():
|
|
+ * @entropyWorkspace: must be aligned on 4-bytes boundaries
|
|
+ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE
|
|
+ * @suspectUncompressible: sampling checks, to potentially skip huffman coding
|
|
+ */
|
|
+size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity,
|
|
const void* src, size_t srcSize,
|
|
void* entropyWorkspace, size_t entropyWorkspaceSize,
|
|
- const int bmi2,
|
|
- unsigned suspectUncompressible);
|
|
+ const ZSTD_hufCTables_t* prevHuf,
|
|
+ ZSTD_hufCTables_t* nextHuf,
|
|
+ ZSTD_strategy strategy, int disableLiteralCompression,
|
|
+ int suspectUncompressible,
|
|
+ int bmi2);
|
|
|
|
#endif /* ZSTD_COMPRESS_LITERALS_H */
|
|
diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c
index 21ddc1b37acf..5c028c78d889 100644
--- a/lib/zstd/compress/zstd_compress_sequences.c
+++ b/lib/zstd/compress/zstd_compress_sequences.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq)
 {
     /* Heuristic: This should cover most blocks <= 16K and
      * start to fade out after 16K to about 32K depending on
-     * comprssibility.
+     * compressibility.
      */
     return nbSeq >= 2048;
 }
@@ -166,7 +167,7 @@ ZSTD_selectEncodingType(
     if (mostFrequent == nbSeq) {
         *repeatMode = FSE_repeat_none;
         if (isDefaultAllowed && nbSeq <= 2) {
-            /* Prefer set_basic over set_rle when there are 2 or less symbols,
+            /* Prefer set_basic over set_rle when there are 2 or fewer symbols,
              * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol.
              * If basic encoding isn't possible, always choose RLE.
              */
diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h
index 7991364c2f71..7fe6f4ff5cf2 100644
--- a/lib/zstd/compress/zstd_compress_sequences.h
+++ b/lib/zstd/compress/zstd_compress_sequences.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c
index 17d836cc84e8..dbacbaf72733 100644
--- a/lib/zstd/compress/zstd_compress_superblock.c
+++ b/lib/zstd/compress/zstd_compress_superblock.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -36,13 +37,14 @@
  *  If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block
  *  and the following sub-blocks' literals sections will be Treeless_Literals_Block.
  *  @return : compressed size of literals section of a sub-block
- *            Or 0 if it unable to compress.
+ *            Or 0 if unable to compress.
  *            Or error code */
-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
-                                    const ZSTD_hufCTablesMetadata_t* hufMetadata,
-                                    const BYTE* literals, size_t litSize,
-                                    void* dst, size_t dstSize,
-                                    const int bmi2, int writeEntropy, int* entropyWritten)
+static size_t
+ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
+                              const ZSTD_hufCTablesMetadata_t* hufMetadata,
+                              const BYTE* literals, size_t litSize,
+                              void* dst, size_t dstSize,
+                              const int bmi2, int writeEntropy, int* entropyWritten)
 {
     size_t const header = writeEntropy ? 200 : 0;
     size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header));
@@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
     symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat;
     size_t cLitSize = 0;

-    (void)bmi2; /* TODO bmi2... */
-
     DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy);

     *entropyWritten = 0;
@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
         DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize);
     }

-    /* TODO bmi2 */
-    {   const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable)
-                                          : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable);
+    {   int const flags = bmi2 ? HUF_flags_bmi2 : 0;
+        const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable, flags)
+                                          : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable, flags);
         op += cSize;
         cLitSize += cSize;
         if (cSize == 0 || ERR_isError(cSize)) {
@@ -126,7 +126,11 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
     return op-ostart;
 }

-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) {
+static size_t
+ZSTD_seqDecompressedSize(seqStore_t const* seqStore,
+                         const seqDef* sequences, size_t nbSeq,
+                         size_t litSize, int lastSequence)
+{
     const seqDef* const sstart = sequences;
     const seqDef* const send = sequences + nbSeq;
     const seqDef* sp = sstart;
@@ -156,13 +160,14 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef*
  *  @return : compressed size of sequences section of a sub-block
  *            Or 0 if it is unable to compress
  *            Or error code. */
-static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
-                                              const ZSTD_fseCTablesMetadata_t* fseMetadata,
-                                              const seqDef* sequences, size_t nbSeq,
-                                              const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
-                                              const ZSTD_CCtx_params* cctxParams,
-                                              void* dst, size_t dstCapacity,
-                                              const int bmi2, int writeEntropy, int* entropyWritten)
+static size_t
+ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
+                                const ZSTD_fseCTablesMetadata_t* fseMetadata,
+                                const seqDef* sequences, size_t nbSeq,
+                                const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
+                                const ZSTD_CCtx_params* cctxParams,
+                                void* dst, size_t dstCapacity,
+                                const int bmi2, int writeEntropy, int* entropyWritten)
 {
     const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
     BYTE* const ostart = (BYTE*)dst;
@@ -539,7 +544,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
             repcodes_t rep;
             ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep));
             for (seq = sstart; seq < sp; ++seq) {
-                ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
+                ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
             }
             ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep));
         }
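A small aside on the sub-block header sizing seen in ZSTD_compressSubBlock_literal above: the header grows from 3 to 5 bytes as the literal size crosses the 10-bit and 14-bit field limits, with a 200-byte allowance reserved when the Huffman description must also be written. A minimal sketch of that choice, under the assumption that the thresholds shown in the hunk are exact:

    #include <stddef.h>

    /* Mirrors lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)) */
    static size_t literals_header_size(size_t litSize, int writeEntropy)
    {
        size_t const header = writeEntropy ? 200 : 0;  /* entropy-table budget */
        return 3 + (litSize >= (1024 - header)) + (litSize >= (16 * 1024 - header));
    }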
diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h
index 224ece79546e..826bbc9e029b 100644
--- a/lib/zstd/compress/zstd_compress_superblock.h
+++ b/lib/zstd/compress/zstd_compress_superblock.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h
index 349fc923c355..65ea53b62844 100644
--- a/lib/zstd/compress/zstd_cwksp.h
+++ b/lib/zstd/compress/zstd_cwksp.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,7 +15,9 @@
 /*-*************************************
 *  Dependencies
 ***************************************/
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customFree */
 #include "../common/zstd_internal.h"
+#include "../common/portability_macros.h"


 /*-*************************************
@@ -41,8 +44,9 @@
 ***************************************/
 typedef enum {
     ZSTD_cwksp_alloc_objects,
-    ZSTD_cwksp_alloc_buffers,
-    ZSTD_cwksp_alloc_aligned
+    ZSTD_cwksp_alloc_aligned_init_once,
+    ZSTD_cwksp_alloc_aligned,
+    ZSTD_cwksp_alloc_buffers
 } ZSTD_cwksp_alloc_phase_e;

 /*
@@ -95,8 +99,8 @@ typedef enum {
  *
  * Workspace Layout:
  *
- * [                        ... workspace ...                         ]
- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers]
+ * [                        ... workspace ...                           ]
+ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once]
  *
  * The various objects that live in the workspace are divided into the
  * following categories, and are allocated separately:
@@ -120,9 +124,18 @@ typedef enum {
  *   uint32_t arrays, all of whose values are between 0 and (nextSrc - base).
  *   Their sizes depend on the cparams. These tables are 64-byte aligned.
  *
- * - Aligned: these buffers are used for various purposes that require 4 byte
- *   alignment, but don't require any initialization before they're used. These
- *   buffers are each aligned to 64 bytes.
+ * - Init once: these buffers require to be initialized at least once before
+ *   use. They should be used when we want to skip memory initialization
+ *   while not triggering memory checkers (like Valgrind) when reading from
+ *   from this memory without writing to it first.
+ *   These buffers should be used carefully as they might contain data
+ *   from previous compressions.
+ *   Buffers are aligned to 64 bytes.
+ *
+ * - Aligned: these buffers don't require any initialization before they're
+ *   used. The user of the buffer should make sure they write into a buffer
+ *   location before reading from it.
+ *   Buffers are aligned to 64 bytes.
  *
  * - Buffers: these buffers are used for various purposes that don't require
  *   any alignment or initialization before they're used. This means they can
@@ -134,8 +147,9 @@ typedef enum {
  * correctly packed into the workspace buffer. That order is:
  *
  * 1. Objects
- * 2. Buffers
- * 3. Aligned/Tables
+ * 2. Init once / Tables
+ * 3. Aligned / Tables
+ * 4. Buffers / Tables
  *
  * Attempts to reserve objects of different types out of order will fail.
  */
@@ -147,6 +161,7 @@ typedef struct {
     void* tableEnd;
     void* tableValidEnd;
     void* allocStart;
+    void* initOnceStart;

     BYTE allocFailed;
     int workspaceOversizedDuration;
@@ -159,6 +174,7 @@ typedef struct {
 ***************************************/

 MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws);
+MEM_STATIC void*  ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws);

 MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) {
     (void)ws;
@@ -168,6 +184,8 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) {
     assert(ws->tableEnd <= ws->allocStart);
     assert(ws->tableValidEnd <= ws->allocStart);
     assert(ws->allocStart <= ws->workspaceEnd);
+    assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws));
+    assert(ws->workspace <= ws->initOnceStart);
 }

 /*
@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) {
  * for internal purposes (currently only alignment).
  */
 MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) {
-    /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes
-     * to align the beginning of tables section, as well as another n_2=[0, 63] bytes
-     * to align the beginning of the aligned section.
-     *
-     * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and
-     * aligneds being sized in multiples of 64 bytes.
+    /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES
+     * bytes to align the beginning of tables section and end of buffers;
      */
-    size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES;
+    size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2;
     return slackSpace;
 }

@@ -230,10 +244,18 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt
     size_t const alignBytesMask = alignBytes - 1;
     size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask;
     assert((alignBytes & alignBytesMask) == 0);
-    assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES);
+    assert(bytes < alignBytes);
     return bytes;
 }

+/*
+ * Returns the initial value for allocStart which is used to determine the position from
+ * which we can allocate from the end of the workspace.
+ */
+MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) {
+    return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1));
+}
+
 /*
  * Internal function. Do not use directly.
  * Reserves the given number of bytes within the aligned/buffer segment of the wksp,
@@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase
 {
     assert(phase >= ws->phase);
     if (phase > ws->phase) {
-        /* Going from allocating objects to allocating buffers */
-        if (ws->phase < ZSTD_cwksp_alloc_buffers &&
-                phase >= ZSTD_cwksp_alloc_buffers) {
+        /* Going from allocating objects to allocating initOnce / tables */
+        if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once &&
+            phase >= ZSTD_cwksp_alloc_aligned_init_once) {
             ws->tableValidEnd = ws->objectEnd;
-        }
+            ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws);

-        /* Going from allocating buffers to allocating aligneds/tables */
-        if (ws->phase < ZSTD_cwksp_alloc_aligned &&
-                phase >= ZSTD_cwksp_alloc_aligned) {
-            {   /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */
-                size_t const bytesToAlign =
-                    ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES);
-                DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign);
-                ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */
-                RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign),
-                                memory_allocation, "aligned phase - alignment initial allocation failed!");
-            }
             {   /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */
-                void* const alloc = ws->objectEnd;
+                void *const alloc = ws->objectEnd;
                 size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES);
-                void* const objectEnd = (BYTE*)alloc + bytesToAlign;
+                void *const objectEnd = (BYTE *) alloc + bytesToAlign;
                 DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign);
                 RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation,
                                 "table phase - alignment initial allocation failed!");
@@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase
                 ws->tableEnd = objectEnd;  /* table area starts being empty */
                 if (ws->tableValidEnd < ws->tableEnd) {
                     ws->tableValidEnd = ws->tableEnd;
-        }   }   }
+                }
+            }
+        }
         ws->phase = phase;
         ZSTD_cwksp_assert_internal_consistency(ws);
     }
@@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase
  */
 MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr)
 {
-    return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd);
+    return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd);
 }

 /*
@@ -343,6 +356,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes)
     return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers);
 }

+/*
+ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
+ * This memory has been initialized at least once in the past.
+ * This doesn't mean it has been initialized this time, and it might contain data from previous
+ * operations.
+ * The main usage is for algorithms that might need read access into uninitialized memory.
+ * The algorithm must maintain safety under these conditions and must make sure it doesn't
+ * leak any of the past data (directly or in side channels).
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes)
+{
+    size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES);
+    void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once);
+    assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
+    if(ptr && ptr < ws->initOnceStart) {
+        /* We assume the memory following the current allocation is either:
+         * 1. Not usable as initOnce memory (end of workspace)
+         * 2. Another initOnce buffer that has been allocated before (and so was previously memset)
+         * 3. An ASAN redzone, in which case we don't want to write on it
+         * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart.
+         * Note that we assume here that MSAN and ASAN cannot run in the same time. */
+        ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes));
+        ws->initOnceStart = ptr;
+    }
+    return ptr;
+}
+
 /*
  * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
  */
@@ -361,13 +401,17 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes)
  */
 MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes)
 {
-    const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned;
+    const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once;
     void* alloc;
     void* end;
    void* top;

-    if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) {
-        return NULL;
+    /* We can only start allocating tables after we are done reserving space for objects at the
+     * start of the workspace */
+    if(ws->phase < phase) {
+        if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) {
+            return NULL;
+        }
     }
     alloc = ws->tableEnd;
     end = (BYTE *)alloc + bytes;
@@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) {
     assert(ws->tableValidEnd >= ws->objectEnd);
     assert(ws->tableValidEnd <= ws->allocStart);
     if (ws->tableValidEnd < ws->tableEnd) {
-        ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd);
+        ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd));
     }
     ZSTD_cwksp_mark_tables_clean(ws);
 }
@@ -478,10 +522,10 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) {


     ws->tableEnd = ws->objectEnd;
-    ws->allocStart = ws->workspaceEnd;
+    ws->allocStart = ZSTD_cwksp_initialAllocStart(ws);
     ws->allocFailed = 0;
-    if (ws->phase > ZSTD_cwksp_alloc_buffers) {
-        ws->phase = ZSTD_cwksp_alloc_buffers;
+    if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) {
+        ws->phase = ZSTD_cwksp_alloc_aligned_init_once;
     }
     ZSTD_cwksp_assert_internal_consistency(ws);
 }
@@ -498,6 +542,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c
     ws->workspaceEnd = (BYTE*)start + size;
     ws->objectEnd = ws->workspace;
     ws->tableValidEnd = ws->objectEnd;
+    ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws);
     ws->phase = ZSTD_cwksp_alloc_objects;
     ws->isStatic = isStatic;
     ZSTD_cwksp_clear(ws);
@@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
  * Returns if the estimated space needed for a wksp is within an acceptable limit of the
  * actual amount of space used.
  */
-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws,
-                                                        size_t const estimatedSpace, int resizedWorkspace) {
-    if (resizedWorkspace) {
-        /* Resized/newly allocated wksp should have exact bounds */
-        return ZSTD_cwksp_used(ws) == estimatedSpace;
-    } else {
-        /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes
-         * than estimatedSpace. See the comments in zstd_cwksp.h for details.
-         */
-        return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63);
-    }
+MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) {
+    /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice
+     * the alignment bytes difference between estimation and actual usage */
+    return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) &&
+           ZSTD_cwksp_used(ws) <= estimatedSpace;
+}


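The "init once" idea introduced in the cwksp hunks above is easy to miss on first read: the workspace remembers the lowest address it has ever zeroed, and a new reservation only clears the prefix that has never been initialized. Here is a minimal standalone sketch under stated assumptions; mini_wksp and reserve_init_once are hypothetical names, alignment and bounds checking are omitted, and the zeroing mirrors the MIN() logic in ZSTD_cwksp_reserve_aligned_init_once:

    #include <stddef.h>
    #include <string.h>

    /* Cut-down downward bump arena tracking the once-zeroed region. */
    typedef struct {
        char* allocStart;     /* current top of the downward arena */
        char* initOnceStart;  /* everything at or above this was zeroed once */
    } mini_wksp;

    static void* reserve_init_once(mini_wksp* ws, size_t bytes)
    {
        char* const ptr = ws->allocStart - bytes;  /* bump downward */
        ws->allocStart = ptr;
        if (ptr < ws->initOnceStart) {
            /* Only the never-initialized prefix needs zeroing; memory at or
             * above initOnceStart keeps stale (but initialized) contents. */
            size_t const fresh = (size_t)(ws->initOnceStart - ptr);
            memset(ptr, 0, fresh < bytes ? fresh : bytes);
            ws->initOnceStart = ptr;
        }
        return ptr;
    }

This is why the phase order changed to objects, init once, aligned, buffers: the init-once region must sit at the very end of the workspace so its zeroed extent can only grow downward across compressions.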
diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c
index 76933dea2624..ab9440a99603 100644
--- a/lib/zstd/compress/zstd_double_fast.c
+++ b/lib/zstd/compress/zstd_double_fast.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -11,8 +12,43 @@
 #include "zstd_compress_internal.h"
 #include "zstd_double_fast.h"

+static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms,
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashLarge = ms->hashTable;
+    U32  const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    U32  const mls = cParams->minMatch;
+    U32* const hashSmall = ms->chainTable;
+    U32  const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+    const U32 fastHashFillStep = 3;

-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+    /* Always insert every fastHashFillStep position into the hash tables.
+     * Insert the other positions into the large hash table if their entry
+     * is empty.
+     */
+    for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
+        U32 const curr = (U32)(ip - base);
+        U32 i;
+        for (i = 0; i < fastHashFillStep; ++i) {
+            size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls);
+            size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8);
+            if (i == 0) {
+                ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i);
+            }
+            if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {
+                ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i);
+            }
+            /* Only load extra positions for ZSTD_dtlm_full */
+            if (dtlm == ZSTD_dtlm_fast)
+                break;
+    }   }
+}
+
+static void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms,
                               void const* end, ZSTD_dictTableLoadMethod_e dtlm)
 {
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -43,7 +79,19 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
             /* Only load extra positions for ZSTD_dtlm_full */
             if (dtlm == ZSTD_dtlm_fast)
                 break;
-    }   }
+    }   }
+}
+
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp)
+{
+    if (tfp == ZSTD_tfp_forCDict) {
+        ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm);
+    } else {
+        ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm);
+    }
 }


@@ -67,7 +115,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - HASH_READ_SIZE;
     U32 offset_1=rep[0], offset_2=rep[1];
-    U32 offsetSaved = 0;
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;

     size_t mLength;
     U32 offset;
@@ -100,8 +148,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
         U32 const current = (U32)(ip - base);
         U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog);
         U32 const maxRep = current - windowLow;
-        if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
-        if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
+        if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
     }

     /* Outer Loop: one iteration per match found and stored */
@@ -131,7 +179,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
         if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) {
             mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
             ip++;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
+            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
             goto _match_stored;
         }

@@ -175,9 +223,13 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
     } while (ip1 <= ilimit);

_cleanup:
+    /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+     * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
     /* save reps for next block */
-    rep[0] = offset_1 ? offset_1 : offsetSaved;
-    rep[1] = offset_2 ? offset_2 : offsetSaved;
+    rep[0] = offset_1 ? offset_1 : offsetSaved1;
+    rep[1] = offset_2 ? offset_2 : offsetSaved2;

     /* Return the last literals size */
     return (size_t)(iend - anchor);
@@ -217,7 +269,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
             hashLong[hl1] = (U32)(ip1 - base);
         }

-        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);

_match_stored:
         /* match found */
@@ -243,7 +295,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
                     U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff;  /* swap offset_2 <=> offset_1 */
                     hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
                     hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
-                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength);
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
                     ip += rLength;
                     anchor = ip;
                     continue;   /* faster when present ... (?) */
@@ -275,7 +327,6 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - HASH_READ_SIZE;
     U32 offset_1=rep[0], offset_2=rep[1];
-    U32 offsetSaved = 0;

     const ZSTD_matchState_t* const dms = ms->dictMatchState;
     const ZSTD_compressionParameters* const dictCParams = &dms->cParams;
@@ -286,8 +337,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
     const BYTE* const dictStart = dictBase + dictStartIndex;
     const BYTE* const dictEnd = dms->window.nextSrc;
     const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase);
-    const U32 dictHBitsL = dictCParams->hashLog;
-    const U32 dictHBitsS = dictCParams->chainLog;
+    const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;
     const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart));

     DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic");
@@ -295,6 +346,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
     /* if a dictionary is attached, it must be within window range */
     assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex);

+    if (ms->prefetchCDictTables) {
+        size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
+        size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32);
+        PREFETCH_AREA(dictHashLong, hashTableBytes)
+        PREFETCH_AREA(dictHashSmall, chainTableBytes)
+    }
+
     /* init */
     ip += (dictAndPrefixLength == 0);

@@ -309,8 +367,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
         U32 offset;
         size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
         size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
-        size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8);
-        size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls);
+        size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8);
+        size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls);
+        U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS];
+        U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS];
+        int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL);
+        int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS);
         U32 const curr = (U32)(ip-base);
         U32 const matchIndexL = hashLong[h2];
         U32 matchIndexS = hashSmall[h];
@@ -328,7 +390,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
             const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
             mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
             ip++;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
+            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
             goto _match_stored;
         }

@@ -340,9 +402,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
                 while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
                 goto _match_found;
             }
-        } else {
+        } else if (dictTagsMatchL) {
             /* check dictMatchState long match */
-            U32 const dictMatchIndexL = dictHashLong[dictHL];
+            U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS;
             const BYTE* dictMatchL = dictBase + dictMatchIndexL;
             assert(dictMatchL < dictEnd);

@@ -358,9 +420,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
             if (MEM_read32(match) == MEM_read32(ip)) {
                 goto _search_next_long;
             }
-        } else {
+        } else if (dictTagsMatchS) {
             /* check dictMatchState short match */
-            U32 const dictMatchIndexS = dictHashSmall[dictHS];
+            U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS;
             match = dictBase + dictMatchIndexS;
             matchIndexS = dictMatchIndexS + dictIndexDelta;

@@ -375,10 +437,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
         continue;

_search_next_long:
-
         {   size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
-            size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
+            size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
             U32 const matchIndexL3 = hashLong[hl3];
+            U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS];
+            int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3);
             const BYTE* matchL3 = base + matchIndexL3;
             hashLong[hl3] = curr + 1;

@@ -391,9 +454,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
                     while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
                     goto _match_found;
                 }
-            } else {
+            } else if (dictTagsMatchL3) {
                 /* check dict long +1 match */
-                U32 const dictMatchIndexL3 = dictHashLong[dictHLNext];
+                U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS;
                 const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3;
                 assert(dictMatchL3 < dictEnd);
                 if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) {
@@ -419,7 +482,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
         offset_2 = offset_1;
         offset_1 = offset;

-        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);

_match_stored:
         /* match found */
@@ -448,7 +511,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
                     const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend;
                     size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4;
                     U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
                     hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
                     hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
                     ip += repLength2;
@@ -461,8 +524,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
     }   /* while (ip < ilimit) */

     /* save reps for next block */
-    rep[0] = offset_1 ? offset_1 : offsetSaved;
-    rep[1] = offset_2 ? offset_2 : offsetSaved;
+    rep[0] = offset_1;
+    rep[1] = offset_2;

     /* Return the last literals size */
     return (size_t)(iend - anchor);
@@ -585,7 +648,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
             const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
             mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
             ip++;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
+            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
         } else {
             if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {
                 const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend;
@@ -596,7 +659,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
                 while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; }   /* catch up */
                 offset_2 = offset_1;
                 offset_1 = offset;
-                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);

             } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) {
                 size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
@@ -621,7 +684,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
                 }
                 offset_2 = offset_1;
                 offset_1 = offset;
-                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);

             } else {
                 ip += ((ip-anchor) >> kSearchStrength) + 1;
@@ -653,7 +716,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
                     const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
                     size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
                     U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
                     hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
                     hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
                     ip += repLength2;
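The recurring ZSTD_writeTaggedIndex / ZSTD_comparePackedTags pattern above packs a short hash tag into the low bits of each stored dictionary index, so a candidate can be rejected from the (hot, prefetched) table entry alone, without a cache-missing read of the dictionary window. A minimal sketch of the idea; write_tagged_index and tags_match are illustrative stand-ins, not the patch's functions, and TAG_BITS = 8 is assumed to match ZSTD_SHORT_CACHE_TAG_BITS:

    #include <stdint.h>
    #include <stddef.h>

    enum { TAG_BITS = 8 };  /* assumed value of ZSTD_SHORT_CACHE_TAG_BITS */

    /* The hash is computed with hashLog + TAG_BITS bits: the high bits index
     * the table, the low TAG_BITS serve as the tag stored alongside the index. */
    static void write_tagged_index(uint32_t* table, size_t hashAndTag, uint32_t index)
    {
        uint32_t const tag = (uint32_t)hashAndTag & ((1u << TAG_BITS) - 1);
        table[hashAndTag >> TAG_BITS] = (index << TAG_BITS) | tag;
    }

    /* Cheap pre-check: differing tags mean the candidate cannot match, so
     * the expensive MEM_read32() of the dictionary position is skipped. */
    static int tags_match(uint32_t packedEntry, size_t hashAndTag)
    {
        return ((packedEntry ^ (uint32_t)hashAndTag) & ((1u << TAG_BITS) - 1)) == 0;
    }

The trade-off is that TAG_BITS of index range are given up per entry, which is why the dict hash bit-widths in the hunks above grow by ZSTD_SHORT_CACHE_TAG_BITS.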
diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h
index 6822bde65a1d..0204f12e4cf7 100644
--- a/lib/zstd/compress/zstd_double_fast.h
+++ b/lib/zstd/compress/zstd_double_fast.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -16,7 +17,8 @@
 #include "zstd_compress_internal.h"     /* ZSTD_CCtx, size_t */

 void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
-                              void const* end, ZSTD_dictTableLoadMethod_e dtlm);
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm,
+                              ZSTD_tableFillPurpose_e tfp);
 size_t ZSTD_compressBlock_doubleFast(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c
|
|
index a752e6beab52..3399b39c5dbc 100644
|
|
--- a/lib/zstd/compress/zstd_fast.c
|
|
+++ b/lib/zstd/compress/zstd_fast.c
|
|
@@ -1,5 +1,6 @@
|
|
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
|
|
/*
|
|
- * Copyright (c) Yann Collet, Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
* All rights reserved.
|
|
*
|
|
* This source code is licensed under both the BSD-style license (found in the
|
|
@@ -11,8 +12,42 @@
|
|
#include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
|
|
#include "zstd_fast.h"
|
|
|
|
+static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,
|
|
+ const void* const end,
|
|
+ ZSTD_dictTableLoadMethod_e dtlm)
|
|
+{
|
|
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
|
|
+ U32* const hashTable = ms->hashTable;
|
|
+ U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
|
|
+ U32 const mls = cParams->minMatch;
|
|
+ const BYTE* const base = ms->window.base;
|
|
+ const BYTE* ip = base + ms->nextToUpdate;
|
|
+ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
|
|
+ const U32 fastHashFillStep = 3;
|
|
|
|
-void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
|
|
+ /* Currently, we always use ZSTD_dtlm_full for filling CDict tables.
|
|
+ * Feel free to remove this assert if there's a good reason! */
|
|
+ assert(dtlm == ZSTD_dtlm_full);
|
|
+
|
|
+ /* Always insert every fastHashFillStep position into the hash table.
|
|
+ * Insert the other positions if their hash entry is empty.
|
|
+ */
|
|
+ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
|
|
+ U32 const curr = (U32)(ip - base);
|
|
+ { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls);
|
|
+ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); }
|
|
+
|
|
+ if (dtlm == ZSTD_dtlm_fast) continue;
|
|
+ /* Only load extra positions for ZSTD_dtlm_full */
|
|
+ { U32 p;
|
|
+ for (p = 1; p < fastHashFillStep; ++p) {
|
|
+ size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
|
|
+ if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */
|
|
+ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
|
|
+ } } } }
|
|
+}
|
|
+
|
|
+static void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms,
|
|
const void* const end,
|
|
ZSTD_dictTableLoadMethod_e dtlm)
|
|
{
|
|
@@ -25,6 +60,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
|
|
const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
|
|
const U32 fastHashFillStep = 3;
|
|
|
|
+ /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables.
|
|
+ * Feel free to remove this assert if there's a good reason! */
|
|
+ assert(dtlm == ZSTD_dtlm_fast);
|
|
+
|
|
/* Always insert every fastHashFillStep position into the hash table.
|
|
* Insert the other positions if their hash entry is empty.
|
|
*/
|
|
@@ -42,6 +81,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
|
|
} } } }
|
|
}
|
|
|
|
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
|
|
+ const void* const end,
|
|
+ ZSTD_dictTableLoadMethod_e dtlm,
|
|
+ ZSTD_tableFillPurpose_e tfp)
|
|
+{
|
|
+ if (tfp == ZSTD_tfp_forCDict) {
|
|
+ ZSTD_fillHashTableForCDict(ms, end, dtlm);
|
|
+ } else {
|
|
+ ZSTD_fillHashTableForCCtx(ms, end, dtlm);
|
|
+ }
|
|
+}
|
|
+
|
|
|
|
/*
|
|
* If you squint hard enough (and ignore repcodes), the search operation at any
|
|
@@ -117,7 +168,7 @@ ZSTD_compressBlock_fast_noDict_generic(
|
|
|
|
U32 rep_offset1 = rep[0];
|
|
U32 rep_offset2 = rep[1];
|
|
- U32 offsetSaved = 0;
|
|
+ U32 offsetSaved1 = 0, offsetSaved2 = 0;
|
|
|
|
size_t hash0; /* hash for ip0 */
|
|
size_t hash1; /* hash for ip1 */
|
|
@@ -141,8 +192,8 @@ ZSTD_compressBlock_fast_noDict_generic(
|
|
{ U32 const curr = (U32)(ip0 - base);
|
|
U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
|
|
U32 const maxRep = curr - windowLow;
|
|
- if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0;
|
|
- if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0;
|
|
+ if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0;
|
|
+ if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0;
|
|
}
|
|
|
|
/* start each op */
|
|
@@ -180,8 +231,14 @@ ZSTD_compressBlock_fast_noDict_generic(
|
|
mLength = ip0[-1] == match0[-1];
|
|
ip0 -= mLength;
|
|
match0 -= mLength;
|
|
- offcode = STORE_REPCODE_1;
|
|
+ offcode = REPCODE1_TO_OFFBASE;
|
|
mLength += 4;
|
|
+
|
|
+ /* First write next hash table entry; we've already calculated it.
|
|
+ * This write is known to be safe because the ip1 is before the
|
|
+ * repcode (ip2). */
|
|
+ hashTable[hash1] = (U32)(ip1 - base);
|
|
+
|
|
goto _match;
|
|
}
|
|
|
|
@@ -195,6 +252,12 @@ ZSTD_compressBlock_fast_noDict_generic(
|
|
/* check match at ip[0] */
|
|
if (MEM_read32(ip0) == mval) {
|
|
/* found a match! */
|
|
+
|
|
+ /* First write next hash table entry; we've already calculated it.
|
|
+ * This write is known to be safe because the ip1 == ip0 + 1, so
|
|
+ * we know we will resume searching after ip1 */
|
|
+ hashTable[hash1] = (U32)(ip1 - base);
|
|
+
|
|
goto _offset;
|
|
}
|
|
|
|
@@ -224,6 +287,21 @@ ZSTD_compressBlock_fast_noDict_generic(
|
|
/* check match at ip[0] */
|
|
if (MEM_read32(ip0) == mval) {
|
|
/* found a match! */
|
|
+
|
|
+ /* first write next hash table entry; we've already calculated it */
|
|
+ if (step <= 4) {
|
|
+ /* We need to avoid writing an index into the hash table >= the
|
|
+ * position at which we will pick up our searching after we've
|
|
+ * taken this match.
|
|
+ *
|
|
+ * The minimum possible match has length 4, so the earliest ip0
|
|
+ * can be after we take this match will be the current ip0 + 4.
|
|
+ * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely
|
|
+ * write this position.
|
|
+ */
|
|
+ hashTable[hash1] = (U32)(ip1 - base);
|
|
+ }
|
|
+
|
|
goto _offset;
|
|
}
|
|
|
|
@@ -254,9 +332,24 @@ ZSTD_compressBlock_fast_noDict_generic(
|
|
* However, it seems to be a meaningful performance hit to try to search
|
|
* them. So let's not. */
|
|
|
|
+ /* When the repcodes are outside of the prefix, we set them to zero before the loop.
|
|
+ * When the offsets are still zero, we need to restore them after the block to have a correct
|
|
+ * repcode history. If only one offset was invalid, it is easy. The tricky case is when both
|
|
+ * offsets were invalid. We need to figure out which offset to refill with.
|
|
+ * - If both offsets are zero they are in the same order.
|
|
+ * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`.
|
|
+ * - If only one is zero, we need to decide which offset to restore.
|
|
+ * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1.
|
|
+ * - It is impossible for rep_offset2 to be non-zero.
|
|
+ *
|
|
+ * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then
|
|
+ * set rep[0] = rep_offset1 and rep[1] = offsetSaved1.
|
|
+ */
|
|
+ offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2;
|
|
+
|
|
/* save reps for next block */
|
|
- rep[0] = rep_offset1 ? rep_offset1 : offsetSaved;
|
|
- rep[1] = rep_offset2 ? rep_offset2 : offsetSaved;
|
|
+ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1;
|
|
+ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2;
|
|
|
|
/* Return the last literals size */
|
|
return (size_t)(iend - anchor);
|
|
@@ -267,7 +360,7 @@ ZSTD_compressBlock_fast_noDict_generic(
|
|
match0 = base + idx;
|
|
rep_offset2 = rep_offset1;
|
|
rep_offset1 = (U32)(ip0-match0);
|
|
- offcode = STORE_OFFSET(rep_offset1);
|
|
+ offcode = OFFSET_TO_OFFBASE(rep_offset1);
|
|
mLength = 4;
|
|
|
|
/* Count the backwards match length. */
|
|
@@ -287,11 +380,6 @@ ZSTD_compressBlock_fast_noDict_generic(
|
|
ip0 += mLength;
|
|
anchor = ip0;
|
|
|
|
- /* write next hash table entry */
|
|
- if (ip1 < ip0) {
|
|
- hashTable[hash1] = (U32)(ip1 - base);
|
|
- }
|
|
-
|
|
/* Fill table and check for immediate repcode. */
|
|
if (ip0 <= ilimit) {
|
|
/* Fill Table */
|
|
@@ -306,7 +394,7 @@ ZSTD_compressBlock_fast_noDict_generic(
|
|
{ U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */
|
|
hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
|
|
ip0 += rLength;
|
|
- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength);
|
|
+ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
|
|
anchor = ip0;
|
|
continue; /* faster when present (confirmed on gcc-8) ... (?) */
|
|
} } }
|
|
@@ -380,14 +468,14 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
|
|
U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
|
|
const BYTE* const base = ms->window.base;
|
|
const BYTE* const istart = (const BYTE*)src;
|
|
- const BYTE* ip = istart;
|
|
+ const BYTE* ip0 = istart;
|
|
+ const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */
|
|
const BYTE* anchor = istart;
|
|
const U32 prefixStartIndex = ms->window.dictLimit;
|
|
const BYTE* const prefixStart = base + prefixStartIndex;
|
|
const BYTE* const iend = istart + srcSize;
|
|
const BYTE* const ilimit = iend - HASH_READ_SIZE;
|
|
U32 offset_1=rep[0], offset_2=rep[1];
|
|
- U32 offsetSaved = 0;
|
|
|
|
const ZSTD_matchState_t* const dms = ms->dictMatchState;
|
|
const ZSTD_compressionParameters* const dictCParams = &dms->cParams ;
|
|
@@ -397,13 +485,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
|
|
const BYTE* const dictStart = dictBase + dictStartIndex;
|
|
const BYTE* const dictEnd = dms->window.nextSrc;
|
|
const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase);
|
|
- const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart);
|
|
- const U32 dictHLog = dictCParams->hashLog;
|
|
+ const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart);
|
|
+ const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
|
|
|
|
/* if a dictionary is still attached, it necessarily means that
|
|
* it is within window size. So we just check it. */
|
|
const U32 maxDistance = 1U << cParams->windowLog;
|
|
- const U32 endIndex = (U32)((size_t)(ip - base) + srcSize);
|
|
+ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
|
|
assert(endIndex - prefixStartIndex <= maxDistance);
|
|
(void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */
|
|
|
|
@@ -413,106 +501,155 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
|
|
* when translating a dict index into a local index */
|
|
assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
|
|
|
|
+ if (ms->prefetchCDictTables) {
|
|
+ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
|
|
+ PREFETCH_AREA(dictHashTable, hashTableBytes)
|
|
+ }
|
|
+
|
|
/* init */
|
|
DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
|
|
- ip += (dictAndPrefixLength == 0);
|
|
+ ip0 += (dictAndPrefixLength == 0);
|
|
/* dictMatchState repCode checks don't currently handle repCode == 0
|
|
* disabling. */
|
|
assert(offset_1 <= dictAndPrefixLength);
|
|
assert(offset_2 <= dictAndPrefixLength);
|
|
|
|
- /* Main Search Loop */
|
|
- while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
|
|
+ /* Outer search loop */
|
|
+ assert(stepSize >= 1);
|
|
+ while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */
|
|
size_t mLength;
|
|
- size_t const h = ZSTD_hashPtr(ip, hlog, mls);
|
|
- U32 const curr = (U32)(ip-base);
|
|
- U32 const matchIndex = hashTable[h];
|
|
- const BYTE* match = base + matchIndex;
|
|
- const U32 repIndex = curr + 1 - offset_1;
|
|
- const BYTE* repMatch = (repIndex < prefixStartIndex) ?
|
|
- dictBase + (repIndex - dictIndexDelta) :
|
|
- base + repIndex;
|
|
- hashTable[h] = curr; /* update hash table */
|
|
-
|
|
- if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
|
|
- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
|
|
- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
|
|
- mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
|
|
- ip++;
|
|
- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
|
|
- } else if ( (matchIndex <= prefixStartIndex) ) {
|
|
- size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls);
|
|
- U32 const dictMatchIndex = dictHashTable[dictHash];
|
|
- const BYTE* dictMatch = dictBase + dictMatchIndex;
|
|
- if (dictMatchIndex <= dictStartIndex ||
|
|
- MEM_read32(dictMatch) != MEM_read32(ip)) {
|
|
- assert(stepSize >= 1);
|
|
- ip += ((ip-anchor) >> kSearchStrength) + stepSize;
|
|
- continue;
|
|
- } else {
|
|
- /* found a dict match */
|
|
- U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta);
|
|
- mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4;
|
|
- while (((ip>anchor) & (dictMatch>dictStart))
|
|
- && (ip[-1] == dictMatch[-1])) {
|
|
- ip--; dictMatch--; mLength++;
|
|
+ size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls);
|
|
+
|
|
+ size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls);
|
|
+ U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS];
|
|
+ int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0);
|
|
+
|
|
+ U32 matchIndex = hashTable[hash0];
|
|
+ U32 curr = (U32)(ip0 - base);
|
|
+ size_t step = stepSize;
|
|
+ const size_t kStepIncr = 1 << kSearchStrength;
|
|
+ const BYTE* nextStep = ip0 + kStepIncr;
|
|
+
|
|
+ /* Inner search loop */
|
|
+ while (1) {
|
|
+ const BYTE* match = base + matchIndex;
|
|
+ const U32 repIndex = curr + 1 - offset_1;
|
|
+ const BYTE* repMatch = (repIndex < prefixStartIndex) ?
|
|
+ dictBase + (repIndex - dictIndexDelta) :
|
|
+ base + repIndex;
|
|
+ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls);
|
|
+ size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls);
|
|
+ hashTable[hash0] = curr; /* update hash table */
|
|
+
|
|
+ if (((U32) ((prefixStartIndex - 1) - repIndex) >=
|
|
+ 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
|
|
+ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) {
|
|
+ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
|
|
+ mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4;
|
|
+ ip0++;
|
|
+ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (dictTagsMatch) {
|
|
+ /* Found a possible dict match */
|
|
+ const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
|
|
+ const BYTE* dictMatch = dictBase + dictMatchIndex;
|
|
+ if (dictMatchIndex > dictStartIndex &&
|
|
+ MEM_read32(dictMatch) == MEM_read32(ip0)) {
|
|
+ /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */
|
|
+ if (matchIndex <= prefixStartIndex) {
|
|
+ U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta);
|
|
+ mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4;
|
|
+ while (((ip0 > anchor) & (dictMatch > dictStart))
|
|
+ && (ip0[-1] == dictMatch[-1])) {
|
|
+ ip0--;
|
|
+ dictMatch--;
|
|
+ mLength++;
|
|
+ } /* catch up */
|
|
+ offset_2 = offset_1;
|
|
+ offset_1 = offset;
|
|
+ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) {
|
|
+ /* found a regular match */
|
|
+ U32 const offset = (U32) (ip0 - match);
|
|
+ mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4;
+ while (((ip0 > anchor) & (match > prefixStart))
+ && (ip0[-1] == match[-1])) {
+ ip0--;
+ match--;
+ mLength++;
} /* catch up */
offset_2 = offset_1;
offset_1 = offset;
- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+ break;
}
- } else if (MEM_read32(match) != MEM_read32(ip)) {
- /* it's not a match, and we're not going to check the dictionary */
- assert(stepSize >= 1);
- ip += ((ip-anchor) >> kSearchStrength) + stepSize;
- continue;
- } else {
- /* found a regular match */
- U32 const offset = (U32)(ip-match);
- mLength = ZSTD_count(ip+4, match+4, iend) + 4;
- while (((ip>anchor) & (match>prefixStart))
- && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
- offset_2 = offset_1;
- offset_1 = offset;
- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
- }
+
+ /* Prepare for next iteration */
+ dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS];
+ dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1);
+ matchIndex = hashTable[hash1];
+
+ if (ip1 >= nextStep) {
+ step++;
+ nextStep += kStepIncr;
+ }
+ ip0 = ip1;
+ ip1 = ip1 + step;
+ if (ip1 > ilimit) goto _cleanup;
+
+ curr = (U32)(ip0 - base);
+ hash0 = hash1;
+ } /* end inner search loop */

/* match found */
- ip += mLength;
- anchor = ip;
+ assert(mLength);
+ ip0 += mLength;
+ anchor = ip0;

- if (ip <= ilimit) {
+ if (ip0 <= ilimit) {
/* Fill Table */
assert(base+curr+2 > istart); /* check base overflow */
hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */
- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
+ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);

/* check immediate repcode */
- while (ip <= ilimit) {
- U32 const current2 = (U32)(ip-base);
+ while (ip0 <= ilimit) {
+ U32 const current2 = (U32)(ip0-base);
U32 const repIndex2 = current2 - offset_2;
const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
dictBase - dictIndexDelta + repIndex2 :
base + repIndex2;
if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+ && (MEM_read32(repMatch2) == MEM_read32(ip0))) {
const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
- ip += repLength2;
- anchor = ip;
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2;
+ ip0 += repLength2;
+ anchor = ip0;
continue;
}
break;
}
}
+
+ /* Prepare for next iteration */
+ assert(ip0 == anchor);
+ ip1 = ip0 + stepSize;
}

+_cleanup:
/* save reps for next block */
- rep[0] = offset_1 ? offset_1 : offsetSaved;
- rep[1] = offset_2 ? offset_2 : offsetSaved;
+ rep[0] = offset_1;
+ rep[1] = offset_2;

/* Return the last literals size */
return (size_t)(iend - anchor);
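
The "catch up" loops above extend an already-verified match backwards, one byte at a time, while the preceding bytes still agree and neither pointer crosses its lower bound. A minimal standalone sketch of the idea (hypothetical helper, not part of the patch):

    /* Sketch: grow a match to the left; ip/match point at the verified match start. */
    static size_t extend_backwards(const unsigned char** ip, const unsigned char** match,
                                   const unsigned char* anchor, const unsigned char* lowest,
                                   size_t mLength)
    {
        while (((*ip > anchor) & (*match > lowest)) && ((*ip)[-1] == (*match)[-1])) {
            (*ip)--; (*match)--; mLength++;
        }
        return mLength;
    }
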
@@ -553,11 +690,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
U32* const hashTable = ms->hashTable;
U32 const hlog = cParams->hashLog;
/* support stepSize of 0 */
- U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
+ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
const BYTE* const base = ms->window.base;
const BYTE* const dictBase = ms->window.dictBase;
const BYTE* const istart = (const BYTE*)src;
- const BYTE* ip = istart;
const BYTE* anchor = istart;
const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
@@ -570,6 +706,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
const BYTE* const iend = istart + srcSize;
const BYTE* const ilimit = iend - 8;
U32 offset_1=rep[0], offset_2=rep[1];
+ U32 offsetSaved1 = 0, offsetSaved2 = 0;
+
+ const BYTE* ip0 = istart;
+ const BYTE* ip1;
+ const BYTE* ip2;
+ const BYTE* ip3;
+ U32 current0;
+
+
+ size_t hash0; /* hash for ip0 */
+ size_t hash1; /* hash for ip1 */
+ U32 idx; /* match idx for ip0 */
+ const BYTE* idxBase; /* base pointer for idx */
+
+ U32 offcode;
+ const BYTE* match0;
+ size_t mLength;
+ const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */
+
+ size_t step;
+ const BYTE* nextStep;
+ const size_t kStepIncr = (1 << (kSearchStrength - 1));

(void)hasStep; /* not currently specialized on whether it's accelerated */

@@ -579,75 +737,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
if (prefixStartIndex == dictStartIndex)
return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);

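The block of rolling pointers declared above (ip0..ip3, step, nextStep) sets up a software-pipelined scan: two positions are hashed ahead of the two being tested, and the probe stride widens every kStepIncr bytes that pass without a match. A toy model of the stepping policy alone (illustrative, not the real loop):

    /* Sketch: number of probes over a match-free stretch with an accelerating stride. */
    static size_t count_probes(size_t srcSize, size_t stepSize, size_t kStepIncr)
    {
        size_t pos = 0, step = stepSize, nextStep = kStepIncr, probes = 0;
        while (pos < srcSize) {
            probes++;
            if (pos >= nextStep) { step++; nextStep += kStepIncr; }
            pos += step;
        }
        return probes; /* grows roughly logarithmically on long incompressible runs */
    }
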
- /* Search Loop */
- while (ip < ilimit) { /* < instead of <=, because (ip+1) */
- const size_t h = ZSTD_hashPtr(ip, hlog, mls);
- const U32 matchIndex = hashTable[h];
- const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;
- const BYTE* match = matchBase + matchIndex;
- const U32 curr = (U32)(ip-base);
- const U32 repIndex = curr + 1 - offset_1;
- const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
- const BYTE* const repMatch = repBase + repIndex;
- hashTable[h] = curr; /* update hash table */
- DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr);
-
- if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */
- & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */
- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
- size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4;
- ip++;
- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength);
- ip += rLength;
- anchor = ip;
- } else {
- if ( (matchIndex < dictStartIndex) ||
- (MEM_read32(match) != MEM_read32(ip)) ) {
- assert(stepSize >= 1);
- ip += ((ip-anchor) >> kSearchStrength) + stepSize;
- continue;
+ { U32 const curr = (U32)(ip0 - base);
+ U32 const maxRep = curr - dictStartIndex;
+ if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+ if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0;
+ }
+
+ /* start each op */
+_start: /* Requires: ip0 */
+
+ step = stepSize;
+ nextStep = ip0 + kStepIncr;
+
+ /* calculate positions, ip0 - anchor == 0, so we skip step calc */
+ ip1 = ip0 + 1;
+ ip2 = ip0 + step;
+ ip3 = ip2 + 1;
+
+ if (ip3 >= ilimit) {
+ goto _cleanup;
+ }
+
+ hash0 = ZSTD_hashPtr(ip0, hlog, mls);
+ hash1 = ZSTD_hashPtr(ip1, hlog, mls);
+
+ idx = hashTable[hash0];
+ idxBase = idx < prefixStartIndex ? dictBase : base;
+
+ do {
+ { /* load repcode match for ip[2] */
+ U32 const current2 = (U32)(ip2 - base);
+ U32 const repIndex = current2 - offset_1;
+ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+ U32 rval;
+ if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */
+ & (offset_1 > 0) ) {
+ rval = MEM_read32(repBase + repIndex);
+ } else {
+ rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */
}
- { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend;
- const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart;
- U32 const offset = curr - matchIndex;
- size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4;
- while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
- offset_2 = offset_1; offset_1 = offset; /* update offset history */
- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
- ip += mLength;
- anchor = ip;
+
+ /* write back hash table entry */
+ current0 = (U32)(ip0 - base);
+ hashTable[hash0] = current0;
+
+ /* check repcode at ip[2] */
+ if (MEM_read32(ip2) == rval) {
+ ip0 = ip2;
+ match0 = repBase + repIndex;
+ matchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+ assert((match0 != prefixStart) & (match0 != dictStart));
+ mLength = ip0[-1] == match0[-1];
+ ip0 -= mLength;
+ match0 -= mLength;
+ offcode = REPCODE1_TO_OFFBASE;
+ mLength += 4;
+ goto _match;
} }

- if (ip <= ilimit) {
- /* Fill Table */
- hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2;
- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
- /* check immediate repcode */
- while (ip <= ilimit) {
- U32 const current2 = (U32)(ip-base);
- U32 const repIndex2 = current2 - offset_2;
- const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */
- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
- const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
- { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */
- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2);
- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
- ip += repLength2;
- anchor = ip;
- continue;
- }
- break;
- } } }
+ { /* load match for ip[0] */
+ U32 const mval = idx >= dictStartIndex ?
+ MEM_read32(idxBase + idx) :
+ MEM_read32(ip0) ^ 1; /* guaranteed not to match */
+
+ /* check match at ip[0] */
+ if (MEM_read32(ip0) == mval) {
+ /* found a match! */
+ goto _offset;
+ } }
+
+ /* lookup ip[1] */
+ idx = hashTable[hash1];
+ idxBase = idx < prefixStartIndex ? dictBase : base;
+
+ /* hash ip[2] */
+ hash0 = hash1;
+ hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+ /* advance to next positions */
+ ip0 = ip1;
+ ip1 = ip2;
+ ip2 = ip3;
+
+ /* write back hash table entry */
+ current0 = (U32)(ip0 - base);
+ hashTable[hash0] = current0;
+
+ { /* load match for ip[0] */
+ U32 const mval = idx >= dictStartIndex ?
+ MEM_read32(idxBase + idx) :
+ MEM_read32(ip0) ^ 1; /* guaranteed not to match */
+
+ /* check match at ip[0] */
+ if (MEM_read32(ip0) == mval) {
+ /* found a match! */
+ goto _offset;
+ } }
+
+ /* lookup ip[1] */
+ idx = hashTable[hash1];
+ idxBase = idx < prefixStartIndex ? dictBase : base;
+
+ /* hash ip[2] */
+ hash0 = hash1;
+ hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+ /* advance to next positions */
+ ip0 = ip1;
+ ip1 = ip2;
+ ip2 = ip0 + step;
+ ip3 = ip1 + step;
+
+ /* calculate step */
+ if (ip2 >= nextStep) {
+ step++;
+ PREFETCH_L1(ip1 + 64);
+ PREFETCH_L1(ip1 + 128);
+ nextStep += kStepIncr;
+ }
+ } while (ip3 < ilimit);
+
+_cleanup:
+ /* Note that there are probably still a couple positions we could search.
+ * However, it seems to be a meaningful performance hit to try to search
+ * them. So let's not. */
+
+ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;

/* save reps for next block */
- rep[0] = offset_1;
- rep[1] = offset_2;
+ rep[0] = offset_1 ? offset_1 : offsetSaved1;
+ rep[1] = offset_2 ? offset_2 : offsetSaved2;

/* Return the last literals size */
return (size_t)(iend - anchor);
+
+_offset: /* Requires: ip0, idx, idxBase */
+
+ /* Compute the offset code. */
+ { U32 const offset = current0 - idx;
+ const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart;
+ matchEnd = idx < prefixStartIndex ? dictEnd : iend;
+ match0 = idxBase + idx;
+ offset_2 = offset_1;
+ offset_1 = offset;
+ offcode = OFFSET_TO_OFFBASE(offset);
+ mLength = 4;
+
+ /* Count the backwards match length. */
+ while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) {
+ ip0--;
+ match0--;
+ mLength++;
+ } }
+
+_match: /* Requires: ip0, match0, offcode, matchEnd */
+
+ /* Count the forward length. */
+ assert(matchEnd != 0);
+ mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart);
+
+ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
+
+ ip0 += mLength;
+ anchor = ip0;
+
+ /* write next hash table entry */
+ if (ip1 < ip0) {
+ hashTable[hash1] = (U32)(ip1 - base);
+ }
+
+ /* Fill table and check for immediate repcode. */
+ if (ip0 <= ilimit) {
+ /* Fill Table */
+ assert(base+current0+2 > istart); /* check base overflow */
+ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */
+ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
+
+ while (ip0 <= ilimit) {
+ U32 const repIndex2 = (U32)(ip0-base) - offset_2;
+ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
+ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */
+ && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) {
+ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
+ ip0 += repLength2;
+ anchor = ip0;
+ continue;
+ }
+ break;
+ } }
+
+ goto _start;
}

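A detail worth noting in the rewritten loop: when a candidate index is out of range, the code loads `MEM_read32(ip0) ^ 1`, a value guaranteed to differ from the bytes at ip0, so the invalid candidate simply fails the equality test and no separate bounds branch is needed. A self-contained illustration of the trick (hypothetical helper names):

    #include <stdint.h>
    #include <string.h>

    static uint32_t load32(const void* p) { uint32_t v; memcpy(&v, p, sizeof(v)); return v; }

    /* Sketch: an invalid candidate is made to compare as "no match". */
    static int candidate_matches(const unsigned char* ip, const unsigned char* cand, int candValid)
    {
        uint32_t const mval = candValid ? load32(cand)
                                        : load32(ip) ^ 1; /* cannot equal load32(ip) */
        return load32(ip) == mval;
    }
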
ZSTD_GEN_FAST_FN(extDict, 4, 0)
@@ -660,6 +945,7 @@ size_t ZSTD_compressBlock_fast_extDict(
void const* src, size_t srcSize)
{
U32 const mls = ms->cParams.minMatch;
+ assert(ms->dictMatchState == NULL);
switch(mls)
{
default: /* includes case 3 */
diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h
index fddc2f532d21..e64d9e1b2d39 100644
--- a/lib/zstd/compress/zstd_fast.h
+++ b/lib/zstd/compress/zstd_fast.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -16,7 +17,8 @@
#include "zstd_compress_internal.h"

void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
- void const* end, ZSTD_dictTableLoadMethod_e dtlm);
+ void const* end, ZSTD_dictTableLoadMethod_e dtlm,
+ ZSTD_tableFillPurpose_e tfp);
size_t ZSTD_compressBlock_fast(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
void const* src, size_t srcSize);
diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c
index 0298a01a7504..f6b4978ceba7 100644
--- a/lib/zstd/compress/zstd_lazy.c
+++ b/lib/zstd/compress/zstd_lazy.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -10,6 +11,9 @@

#include "zstd_compress_internal.h"
#include "zstd_lazy.h"
+#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
+
+#define kLazySkippingStep 8


/*-*************************************
@@ -197,8 +201,8 @@ ZSTD_DUBT_findBetterDictMatch (
U32 matchIndex = dictMatchIndex + dictIndexDelta;
if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex);
- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
+ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
+ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
}
if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
break; /* drop, to guarantee consistency (miss a little bit of compression) */
@@ -218,7 +222,7 @@ ZSTD_DUBT_findBetterDictMatch (
}

if (bestLength >= MINMATCH) {
- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
+ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
}
@@ -230,7 +234,7 @@ ZSTD_DUBT_findBetterDictMatch (
static size_t
ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
const BYTE* const ip, const BYTE* const iend,
- size_t* offsetPtr,
+ size_t* offBasePtr,
U32 const mls,
const ZSTD_dictMode_e dictMode)
{
@@ -327,8 +331,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
if (matchLength > bestLength) {
if (matchLength > matchEndIdx - matchIndex)
matchEndIdx = matchIndex + (U32)matchLength;
- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
+ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
if (dictMode == ZSTD_dictMatchState) {
nbCompares = 0; /* in addition to avoiding checking any
@@ -361,16 +365,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
if (dictMode == ZSTD_dictMatchState && nbCompares) {
bestLength = ZSTD_DUBT_findBetterDictMatch(
ms, ip, iend,
- offsetPtr, bestLength, nbCompares,
+ offBasePtr, bestLength, nbCompares,
mls, dictMode);
}

assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
if (bestLength >= MINMATCH) {
- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
+ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
- curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
+ curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
}
return bestLength;
}
@@ -381,14 +385,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
FORCE_INLINE_TEMPLATE size_t
ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
const BYTE* const ip, const BYTE* const iLimit,
- size_t* offsetPtr,
+ size_t* offBasePtr,
const U32 mls /* template */,
const ZSTD_dictMode_e dictMode)
{
DEBUGLOG(7, "ZSTD_BtFindBestMatch");
if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */
ZSTD_updateDUBT(ms, ip, iLimit, mls);
- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
+ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
}

/* *********************************
@@ -561,7 +565,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
/* save best solution */
if (currentMl > ml) {
ml = currentMl;
- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
if (ip+currentMl == iLimit) {
/* best possible, avoids read overflow on next attempt */
return ml;
@@ -598,7 +602,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
/* save best solution */
if (currentMl > ml) {
ml = currentMl;
- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
}
}
@@ -617,7 +621,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
ZSTD_matchState_t* ms,
const ZSTD_compressionParameters* const cParams,
- const BYTE* ip, U32 const mls)
+ const BYTE* ip, U32 const mls, U32 const lazySkipping)
{
U32* const hashTable = ms->hashTable;
const U32 hashLog = cParams->hashLog;
@@ -632,6 +636,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
hashTable[h] = idx;
idx++;
+ /* Stop inserting every position when in the lazy skipping mode. */
+ if (lazySkipping)
+ break;
}

ms->nextToUpdate = target;
@@ -640,7 +647,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(

U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
const ZSTD_compressionParameters* const cParams = &ms->cParams;
- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
+ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
}

/* inlining is important to hardwire a hot branch (template emulation) */
@@ -684,14 +691,15 @@ size_t ZSTD_HcFindBestMatch(
}

/* HC4 match finder */
- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);

for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
size_t currentMl=0;
if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
const BYTE* const match = base + matchIndex;
assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
- if (match[ml] == ip[ml]) /* potentially better */
+ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
currentMl = ZSTD_count(ip, match, iLimit);
} else {
const BYTE* const match = dictBase + matchIndex;
@@ -703,7 +711,7 @@ size_t ZSTD_HcFindBestMatch(
/* save best solution */
if (currentMl > ml) {
ml = currentMl;
- *offsetPtr = STORE_OFFSET(curr - matchIndex);
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
}

@@ -739,7 +747,7 @@ size_t ZSTD_HcFindBestMatch(
if (currentMl > ml) {
ml = currentMl;
assert(curr > matchIndex + dmsIndexDelta);
- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
}

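The widened probe above replaces a one-byte peek with a 4-byte load ending at offset ml: a candidate can only beat the current best length if it agrees with ip on bytes ml-3 through ml, so the wider compare rejects more losers before the expensive ZSTD_count. In isolation (sketch; ml >= 3 holds since ml starts at 4-1):

    #include <stdint.h>
    #include <string.h>

    /* Sketch: cheap pre-filter before fully counting a candidate match. */
    static int may_beat_best(const unsigned char* ip, const unsigned char* match, size_t ml)
    {
        uint32_t a, b;
        memcpy(&a, ip + ml - 3, sizeof(a));    /* bytes ml-3..ml of ip */
        memcpy(&b, match + ml - 3, sizeof(b)); /* same window of the candidate */
        return a == b;
    }
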
@@ -756,8 +764,6 @@ size_t ZSTD_HcFindBestMatch(
* (SIMD) Row-based matchfinder
***********************************/
/* Constants for row-based hash */
-#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
#define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */

@@ -769,64 +775,19 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr
* Starting from the LSB, returns the idx of the next non-zero bit.
* Basically counting the nb of trailing zeroes.
*/
-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
- assert(val != 0);
-# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
- if (sizeof(size_t) == 4) {
- U32 mostSignificantWord = (U32)(val >> 32);
- U32 leastSignificantWord = (U32)val;
- if (leastSignificantWord == 0) {
- return 32 + (U32)__builtin_ctz(mostSignificantWord);
- } else {
- return (U32)__builtin_ctz(leastSignificantWord);
- }
- } else {
- return (U32)__builtin_ctzll(val);
- }
-# else
- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
- */
- val = ~val & (val - 1ULL); /* Lowest set bit mask */
- val = val - ((val >> 1) & 0x5555555555555555);
- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);
-# endif
-}
-
-/* ZSTD_rotateRight_*():
- * Rotates a bitfield to the right by "count" bits.
- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
- */
-FORCE_INLINE_TEMPLATE
-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
- assert(count < 64);
- count &= 0x3F; /* for fickle pattern recognition */
- return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
-}
-
-FORCE_INLINE_TEMPLATE
-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
- assert(count < 32);
- count &= 0x1F; /* for fickle pattern recognition */
- return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
-}
-
-FORCE_INLINE_TEMPLATE
-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
- assert(count < 16);
- count &= 0x0F; /* for fickle pattern recognition */
- return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
+MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
+ return ZSTD_countTrailingZeros64(val);
}

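With ZSTD_VecMask_next reduced to a count-trailing-zeros, the search loops later in this file enumerate candidates by repeatedly clearing the lowest set bit of the match mask. A self-contained sketch of that iteration pattern (GCC/Clang builtin standing in for ZSTD_countTrailingZeros64):

    #include <stdint.h>

    /* Sketch: visit the bit index of every set bit, lowest first. */
    static void visit_matches(uint64_t matches, void (*visit)(unsigned))
    {
        while (matches != 0) {
            visit((unsigned)__builtin_ctzll(matches));
            matches &= matches - 1; /* clear the lowest set bit */
        }
    }
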
/* ZSTD_row_nextIndex():
* Returns the next index to insert at within a tagTable row, and updates the "head"
- * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
+ * value to reflect the update. Essentially cycles backwards from [1, {entries per row})
*/
FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
- U32 const next = (*tagRow - 1) & rowMask;
- *tagRow = (BYTE)next;
- return next;
+ U32 next = (*tagRow-1) & rowMask;
+ next += (next == 0) ? rowMask : 0; /* skip first position */
+ *tagRow = (BYTE)next;
+ return next;
}

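The new wrinkle here is that byte 0 of each tag row now appears to double as the row's "head" counter, so the helper cycles backwards through [1, rowEntries) and never hands out slot 0. A toy trace of the cycling (rowMask = 15, i.e. 16 entries; editorial sketch):

    /* Sketch: starting from head == 0, successive calls yield 15, 14, ..., 1, 15, ... */
    static unsigned next_index(unsigned char* head, unsigned rowMask)
    {
        unsigned next = (*head - 1u) & rowMask;
        next += (next == 0) ? rowMask : 0; /* skip the reserved slot 0 */
        *head = (unsigned char)next;
        return next;
    }
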
/* ZSTD_isAligned():
@@ -840,7 +801,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
/* ZSTD_row_prefetch():
* Performs prefetching for the hashTable and tagTable at a given row.
*/
-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
+FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) {
PREFETCH_L1(hashTable + relRow);
if (rowLog >= 5) {
PREFETCH_L1(hashTable + relRow + 16);
@@ -864,13 +825,13 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
U32 idx, const BYTE* const iLimit)
{
U32 const* const hashTable = ms->hashTable;
- U16 const* const tagTable = ms->tagTable;
+ BYTE const* const tagTable = ms->tagTable;
U32 const hashLog = ms->rowHashLog;
U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);

for (; idx < lim; ++idx) {
- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
@@ -886,11 +847,12 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
* base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
*/
FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
- U16 const* tagTable, BYTE const* base,
+ BYTE const* tagTable, BYTE const* base,
U32 idx, U32 const hashLog,
- U32 const rowLog, U32 const mls)
+ U32 const rowLog, U32 const mls,
+ U64 const hashSalt)
{
- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
{ U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
@@ -908,22 +870,21 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
U32 const rowMask, U32 const useCache)
{
U32* const hashTable = ms->hashTable;
- U16* const tagTable = ms->tagTable;
+ BYTE* const tagTable = ms->tagTable;
U32 const hashLog = ms->rowHashLog;
const BYTE* const base = ms->window.base;

DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls)
- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt)
+ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
U32* const row = hashTable + relRow;
- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
- Explicit cast allows us to get exact desired position within each row */
+ BYTE* tagRow = tagTable + relRow;
U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);

- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
+ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
+ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
row[pos] = updateStartIdx;
}
}
@@ -971,7 +932,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);

DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
+}
+
+/* Returns the mask width of bits group of which will be set to 1. Given not all
+ * architectures have easy movemask instruction, this helps to iterate over
+ * groups of bits easier and faster.
+ */
+FORCE_INLINE_TEMPLATE U32
+ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
+{
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+ (void)rowEntries;
+#if defined(ZSTD_ARCH_ARM_NEON)
+ /* NEON path only works for little endian */
+ if (!MEM_isLittleEndian()) {
+ return 1;
+ }
+ if (rowEntries == 16) {
+ return 4;
+ }
+ if (rowEntries == 32) {
+ return 2;
+ }
+ if (rowEntries == 64) {
+ return 1;
+ }
+#endif
+ return 1;
}

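Because NEON has no cheap movemask, a single row entry may occupy groupWidth bits of the match mask; consumers therefore scale the head position by groupWidth and divide bit positions back down, mirroring the expression used in ZSTD_RowFindBestMatch below. In miniature (sketch):

    /* Sketch: map a mask bit position back to a row entry index. */
    static unsigned mask_bit_to_entry(unsigned headGrouped, unsigned bitPos,
                                      unsigned groupWidth, unsigned rowMask)
    {
        return ((headGrouped + bitPos) / groupWidth) & rowMask;
    }
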
#if defined(ZSTD_ARCH_X86_SSE2)
@@ -994,71 +983,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U
}
#endif

-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
- * the hash at the nth position in a row of the tagTable.
- * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
- * to match up with the actual layout of the entries within the hashTable */
+#if defined(ZSTD_ARCH_ARM_NEON)
+FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
+{
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+ if (rowEntries == 16) {
+ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
+ * After that groups of 4 bits represent the equalMask. We lower
+ * all bits except the highest in these groups by doing AND with
+ * 0x88 = 0b10001000.
+ */
+ const uint8x16_t chunk = vld1q_u8(src);
+ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
+ const uint8x8_t res = vshrn_n_u16(equalMask, 4);
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
+ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull;
+ } else if (rowEntries == 32) {
+ /* Same idea as with rowEntries == 16 but doing AND with
+ * 0x55 = 0b01010101.
+ */
+ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src);
+ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
+ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
+ const uint8x16_t dup = vdupq_n_u8(tag);
+ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6);
+ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6);
+ const uint8x8_t res = vsli_n_u8(t0, t1, 4);
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ;
+ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull;
+ } else { /* rowEntries == 64 */
+ const uint8x16x4_t chunk = vld4q_u8(src);
+ const uint8x16_t dup = vdupq_n_u8(tag);
+ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
+ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
+ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
+ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
+
+ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
+ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
+ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
+ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
+ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
+ return ZSTD_rotateRight_U64(matches, headGrouped);
+ }
+}
+#endif
+
+/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
+ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
+ * matches the hash at the nth position in a row of the tagTable.
+ * Each row is a circular buffer beginning at the value of "headGrouped". So we
+ * must rotate the "matches" bitfield to match up with the actual layout of the
+ * entries within the hashTable */
FORCE_INLINE_TEMPLATE ZSTD_VecMask
-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)
+ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries)
{
- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
+ const BYTE* const src = tagRow;
assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);

#if defined(ZSTD_ARCH_X86_SSE2)

- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);
+ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);

#else /* SW or NEON-LE */

# if defined(ZSTD_ARCH_ARM_NEON)
/* This NEON path only works for little endian - otherwise use SWAR below */
if (MEM_isLittleEndian()) {
- if (rowEntries == 16) {
- const uint8x16_t chunk = vld1q_u8(src);
- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);
- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));
- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));
- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));
- const U16 hi = (U16)vgetq_lane_u8(t3, 8);
- const U16 lo = (U16)vgetq_lane_u8(t3, 0);
- return ZSTD_rotateRight_U16((hi << 8) | lo, head);
- } else if (rowEntries == 32) {
- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);
- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
- const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
- const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
- const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2);
- const uint8x8x2_t t3 = vuzp_u8(t2, t0);
- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);
- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);
- return ZSTD_rotateRight_U32(matches, head);
- } else { /* rowEntries == 64 */
- const uint8x16x4_t chunk = vld4q_u8(src);
- const uint8x16_t dup = vdupq_n_u8(tag);
- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
-
- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
- return ZSTD_rotateRight_U64(matches, head);
- }
+ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
}
# endif /* ZSTD_ARCH_ARM_NEON */
/* SWAR */
- { const size_t chunkSize = sizeof(size_t);
+ { const int chunkSize = sizeof(size_t);
const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
const size_t xFF = ~((size_t)0);
const size_t x01 = xFF / 0xFF;
@@ -1091,11 +1091,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
}
matches = ~matches;
if (rowEntries == 16) {
- return ZSTD_rotateRight_U16((U16)matches, head);
+ return ZSTD_rotateRight_U16((U16)matches, headGrouped);
} else if (rowEntries == 32) {
- return ZSTD_rotateRight_U32((U32)matches, head);
+ return ZSTD_rotateRight_U32((U32)matches, headGrouped);
} else {
- return ZSTD_rotateRight_U64((U64)matches, head);
+ return ZSTD_rotateRight_U64((U64)matches, headGrouped);
}
}
#endif
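
The SWAR fallback above tests sizeof(size_t) tag bytes per iteration: XOR against a splatted tag turns equal lanes into zero bytes, and the classic zero-byte test raises the high bit of exactly those lanes. A standalone sketch of the core step (the patch's variant additionally folds and rotates the result):

    #include <stdint.h>

    /* Sketch: high bit set in every byte lane of `chunk` that equals `tag`. */
    static uint64_t swar_eq_mask(uint64_t chunk, uint8_t tag)
    {
        uint64_t const splat = (uint64_t)tag * 0x0101010101010101ULL;
        uint64_t const v = chunk ^ splat; /* equal lanes become 0x00 */
        return (v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL;
    }
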
@@ -1125,7 +1125,7 @@ size_t ZSTD_RowFindBestMatch(
const U32 rowLog)
{
U32* const hashTable = ms->hashTable;
- U16* const tagTable = ms->tagTable;
+ BYTE* const tagTable = ms->tagTable;
U32* const hashCache = ms->hashCache;
const U32 hashLog = ms->rowHashLog;
const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -1143,8 +1143,11 @@ size_t ZSTD_RowFindBestMatch(
const U32 rowEntries = (1U << rowLog);
const U32 rowMask = rowEntries - 1;
const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
+ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
+ const U64 hashSalt = ms->hashSalt;
U32 nbAttempts = 1U << cappedSearchLog;
size_t ml=4-1;
+ U32 hash;

/* DMS/DDS variables that may be referenced laster */
const ZSTD_matchState_t* const dms = ms->dictMatchState;
@@ -1168,7 +1171,7 @@ size_t ZSTD_RowFindBestMatch(
if (dictMode == ZSTD_dictMatchState) {
/* Prefetch DMS rows */
U32* const dmsHashTable = dms->hashTable;
- U16* const dmsTagTable = dms->tagTable;
+ BYTE* const dmsTagTable = dms->tagTable;
U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
@@ -1178,23 +1181,34 @@ size_t ZSTD_RowFindBestMatch(
}

/* Update the hashTable and tagTable up to (but not including) ip */
- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+ if (!ms->lazySkipping) {
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
+ } else {
+ /* Stop inserting every position when in the lazy skipping mode.
+ * The hash cache is also not kept up to date in this mode.
+ */
+ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
+ ms->nextToUpdate = curr;
+ }
+ ms->hashSaltEntropy += hash; /* collect salt entropy */
+
{ /* Get the hash for ip, compute the appropriate row */
- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
U32* const row = hashTable + relRow;
BYTE* tagRow = (BYTE*)(tagTable + relRow);
- U32 const head = *tagRow & rowMask;
+ U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
size_t numMatches = 0;
size_t currMatch = 0;
- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);

/* Cycle through the matches and prefetch */
- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
+ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
U32 const matchIndex = row[matchPos];
+ if(matchPos == 0) continue;
assert(numMatches < rowEntries);
if (matchIndex < lowLimit)
break;
@@ -1204,13 +1218,14 @@ size_t ZSTD_RowFindBestMatch(
PREFETCH_L1(dictBase + matchIndex);
}
matchBuffer[numMatches++] = matchIndex;
+ --nbAttempts;
}

/* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
in ZSTD_row_update_internal() at the next search. */
{
U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
+ tagRow[pos] = (BYTE)tag;
row[pos] = ms->nextToUpdate++;
}

@@ -1224,7 +1239,8 @@ size_t ZSTD_RowFindBestMatch(
if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
const BYTE* const match = base + matchIndex;
assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
- if (match[ml] == ip[ml]) /* potentially better */
+ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
currentMl = ZSTD_count(ip, match, iLimit);
} else {
const BYTE* const match = dictBase + matchIndex;
@@ -1236,7 +1252,7 @@ size_t ZSTD_RowFindBestMatch(
/* Save best solution */
if (currentMl > ml) {
ml = currentMl;
- *offsetPtr = STORE_OFFSET(curr - matchIndex);
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
}
}
@@ -1254,19 +1270,21 @@ size_t ZSTD_RowFindBestMatch(
const U32 dmsSize = (U32)(dmsEnd - dmsBase);
const U32 dmsIndexDelta = dictLimit - dmsSize;

- { U32 const head = *dmsTagRow & rowMask;
+ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
size_t numMatches = 0;
size_t currMatch = 0;
- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);

- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
+ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
U32 const matchIndex = dmsRow[matchPos];
+ if(matchPos == 0) continue;
if (matchIndex < dmsLowestIndex)
break;
PREFETCH_L1(dmsBase + matchIndex);
matchBuffer[numMatches++] = matchIndex;
+ --nbAttempts;
}

/* Return the longest match */
@@ -1285,7 +1303,7 @@ size_t ZSTD_RowFindBestMatch(
if (currentMl > ml) {
ml = currentMl;
assert(curr > matchIndex + dmsIndexDelta);
- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
if (ip+currentMl == iLimit) break;
}
}
@@ -1491,7 +1509,8 @@ ZSTD_compressBlock_lazy_generic(
const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);

- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
+ U32 offset_1 = rep[0], offset_2 = rep[1];
+ U32 offsetSaved1 = 0, offsetSaved2 = 0;

const int isDMS = dictMode == ZSTD_dictMatchState;
const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
@@ -1512,8 +1531,8 @@ ZSTD_compressBlock_lazy_generic(
U32 const curr = (U32)(ip - base);
U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
U32 const maxRep = curr - windowLow;
- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
+ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
}
if (isDxS) {
/* dictMatchState repCode checks don't currently handle repCode == 0
@@ -1522,10 +1541,11 @@ ZSTD_compressBlock_lazy_generic(
assert(offset_2 <= dictAndPrefixLength);
}

+ /* Reset the lazy skipping state */
+ ms->lazySkipping = 0;
+
if (searchMethod == search_rowHash) {
- ZSTD_row_fillHashCache(ms, base, rowLog,
- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
- ms->nextToUpdate, ilimit);
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
}

/* Match Loop */
@@ -1537,7 +1557,7 @@ ZSTD_compressBlock_lazy_generic(
#endif
while (ip < ilimit) {
size_t matchLength=0;
- size_t offcode=STORE_REPCODE_1;
+ size_t offBase = REPCODE1_TO_OFFBASE;
const BYTE* start=ip+1;
DEBUGLOG(7, "search baseline (depth 0)");

@@ -1562,14 +1582,23 @@ ZSTD_compressBlock_lazy_generic(
}

/* first search (depth 0) */
- { size_t offsetFound = 999999999;
- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode);
+ { size_t offbaseFound = 999999999;
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
if (ml2 > matchLength)
- matchLength = ml2, start = ip, offcode=offsetFound;
+ matchLength = ml2, start = ip, offBase = offbaseFound;
}

if (matchLength < 4) {
- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
+ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */;
+ ip += step;
+ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+ * In this mode we stop inserting every position into our tables, and only insert
+ * positions that we search, which is one in step positions.
+ * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+ * triggered once we've gone 2KB without finding any matches.
+ */
+ ms->lazySkipping = step > kLazySkippingStep;
continue;
}

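For concreteness: with kSearchStrength equal to 8 (its value in upstream zstd) the step computed above first exceeds kLazySkippingStep = 8 once ip - anchor reaches 8 << 8 = 2048 bytes, which matches the "2KB without finding any matches" figure in the comment. A compact model of the trigger:

    /* Sketch: when does lazy skipping engage? (assumes kSearchStrength == 8) */
    static int lazy_skipping_after(size_t bytesSinceAnchor)
    {
        size_t const step = (bytesSinceAnchor >> 8) + 1;
        return step > 8; /* kLazySkippingStep */
    }
    /* lazy_skipping_after(2047) == 0, lazy_skipping_after(2048) == 1 */
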
@@ -1579,12 +1608,12 @@ ZSTD_compressBlock_lazy_generic(
|
|
DEBUGLOG(7, "search depth 1");
|
|
ip ++;
|
|
if ( (dictMode == ZSTD_noDict)
|
|
- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
|
|
+ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
|
|
size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
|
|
int const gain2 = (int)(mlRep * 3);
|
|
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
|
|
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
|
|
if ((mlRep >= 4) && (gain2 > gain1))
|
|
- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
|
|
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
|
|
}
|
|
if (isDxS) {
|
|
const U32 repIndex = (U32)(ip - base) - offset_1;
|
|
@@ -1596,17 +1625,17 @@ ZSTD_compressBlock_lazy_generic(
|
|
const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
|
|
size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
|
|
int const gain2 = (int)(mlRep * 3);
|
|
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
|
|
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
|
|
if ((mlRep >= 4) && (gain2 > gain1))
|
|
- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
|
|
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
|
|
}
|
|
}
|
|
- { size_t offset2=999999999;
|
|
- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode);
|
|
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
|
|
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
|
|
+ { size_t ofbCandidate=999999999;
|
|
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
|
|
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
|
|
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
|
|
if ((ml2 >= 4) && (gain2 > gain1)) {
|
|
- matchLength = ml2, offcode = offset2, start = ip;
|
|
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
|
|
continue; /* search a better one */
|
|
} }
|
|
|
|
@@ -1615,12 +1644,12 @@ ZSTD_compressBlock_lazy_generic(
|
|
DEBUGLOG(7, "search depth 2");
|
|
ip ++;
|
|
if ( (dictMode == ZSTD_noDict)
|
|
- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
|
|
+ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
|
|
size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
|
|
int const gain2 = (int)(mlRep * 4);
|
|
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
|
|
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
|
|
if ((mlRep >= 4) && (gain2 > gain1))
|
|
- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
|
|
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
|
|
}
|
|
if (isDxS) {
|
|
const U32 repIndex = (U32)(ip - base) - offset_1;
|
|
@@ -1632,17 +1661,17 @@ ZSTD_compressBlock_lazy_generic(
|
|
const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
|
|
size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
|
|
int const gain2 = (int)(mlRep * 4);
|
|
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
|
|
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
|
|
if ((mlRep >= 4) && (gain2 > gain1))
|
|
- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
|
|
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
|
|
}
|
|
}
|
|
- { size_t offset2=999999999;
|
|
- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode);
|
|
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
|
|
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
|
|
+ { size_t ofbCandidate=999999999;
|
|
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
|
|
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
|
|
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
|
|
if ((ml2 >= 4) && (gain2 > gain1)) {
|
|
- matchLength = ml2, offcode = offset2, start = ip;
|
|
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
|
|
continue;
|
|
} } }
|
|
break; /* nothing found : store previous solution */
|
|
@@ -1653,26 +1682,33 @@ ZSTD_compressBlock_lazy_generic(
|
|
* notably if `value` is unsigned, resulting in a large positive `-value`.
|
|
*/
|
|
/* catch up */
|
|
- if (STORED_IS_OFFSET(offcode)) {
|
|
+ if (OFFBASE_IS_OFFSET(offBase)) {
|
|
if (dictMode == ZSTD_noDict) {
|
|
- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest))
|
|
- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */
|
|
+ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest))
|
|
+ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */
|
|
{ start--; matchLength++; }
|
|
}
|
|
if (isDxS) {
|
|
- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
|
|
+ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
|
|
const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
|
|
const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
|
|
while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
|
|
}
|
|
- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
|
|
+ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
|
|
}
|
|
/* store sequence */
|
|
_storeSequence:
|
|
{ size_t const litLength = (size_t)(start - anchor);
|
|
- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
|
|
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
|
|
anchor = ip = start + matchLength;
|
|
}
|
|
+ if (ms->lazySkipping) {
|
|
+ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
|
|
+ if (searchMethod == search_rowHash) {
|
|
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
|
|
+ }
|
|
+ ms->lazySkipping = 0;
|
|
+ }
|
|
|
|
/* check immediate repcode */
|
|
if (isDxS) {
|
|
@@ -1686,8 +1722,8 @@ ZSTD_compressBlock_lazy_generic(
|
|
&& (MEM_read32(repMatch) == MEM_read32(ip)) ) {
|
|
const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
|
|
matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
|
|
- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */
|
|
- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
|
|
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */
|
|
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
|
|
ip += matchLength;
|
|
anchor = ip;
|
|
continue;
|
|
@@ -1701,16 +1737,20 @@ ZSTD_compressBlock_lazy_generic(
|
|
&& (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
|
|
/* store sequence */
|
|
matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
|
|
- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */
|
|
- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
|
|
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */
|
|
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
|
|
ip += matchLength;
|
|
anchor = ip;
|
|
continue; /* faster when present ... (?) */
|
|
} } }
|
|
|
|
- /* Save reps for next block */
|
|
- rep[0] = offset_1 ? offset_1 : savedOffset;
|
|
- rep[1] = offset_2 ? offset_2 : savedOffset;
|
|
+ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
|
|
+ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
|
|
+ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
|
|
+
|
|
+ /* save reps for next block */
|
|
+ rep[0] = offset_1 ? offset_1 : offsetSaved1;
|
|
+ rep[1] = offset_2 ? offset_2 : offsetSaved2;
|
|
|
|
/* Return the last literals size */
|
|
return (size_t)(iend - anchor);
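
A minimal standalone sketch of the rep-history save above, for readers tracking the offsetSaved1/offsetSaved2 rotation; save_reps is a hypothetical helper (not part of the patch) that mirrors the epilogue's variables:

static void save_reps(unsigned rep[2],
                      unsigned offset_1, unsigned offset_2,
                      unsigned offsetSaved1, unsigned offsetSaved2)
{
    /* a saved value whose repcode slot became valid is rotated down a slot */
    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
    /* a slot that never became valid falls back to its saved starting value */
    rep[0] = offset_1 ? offset_1 : offsetSaved1;
    rep[1] = offset_2 ? offset_2 : offsetSaved2;
}
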
|
|
@@ -1886,12 +1926,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
|
|
DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
|
|
|
|
+ /* Reset the lazy skipping state */
|
|
+ ms->lazySkipping = 0;
|
|
+
|
|
/* init */
|
|
ip += (ip == prefixStart);
|
|
if (searchMethod == search_rowHash) {
|
|
- ZSTD_row_fillHashCache(ms, base, rowLog,
|
|
- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
|
|
- ms->nextToUpdate, ilimit);
|
|
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
|
|
}
|
|
|
|
/* Match Loop */
|
|
@@ -1903,7 +1944,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
#endif
|
|
while (ip < ilimit) {
|
|
size_t matchLength=0;
|
|
- size_t offcode=STORE_REPCODE_1;
|
|
+ size_t offBase = REPCODE1_TO_OFFBASE;
|
|
const BYTE* start=ip+1;
|
|
U32 curr = (U32)(ip-base);
|
|
|
|
@@ -1922,14 +1963,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
} }
|
|
|
|
/* first search (depth 0) */
|
|
- { size_t offsetFound = 999999999;
|
|
- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict);
|
|
+ { size_t ofbCandidate = 999999999;
|
|
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
|
|
if (ml2 > matchLength)
|
|
- matchLength = ml2, start = ip, offcode=offsetFound;
|
|
+ matchLength = ml2, start = ip, offBase = ofbCandidate;
|
|
}
|
|
|
|
if (matchLength < 4) {
|
|
- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
|
|
+ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength);
|
|
+ ip += step + 1; /* jump faster over incompressible sections */
|
|
+ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
|
|
+ * In this mode we stop inserting every position into our tables, and only insert
|
|
+ * positions that we search, which is one in step positions.
|
|
+ * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
|
|
+ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
|
|
+ * triggered once we've gone 2KB without finding any matches.
|
|
+ */
|
|
+ ms->lazySkipping = step > kLazySkippingStep;
|
|
continue;
|
|
}
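
To make the "2KB without a match" trigger concrete: assuming kSearchStrength == 8 and kLazySkippingStep == 8 (their upstream zstd values; both are assumptions here), step first exceeds the threshold once ip - anchor reaches 9 << 8 = 2304 bytes, i.e. a little over 2KB. A self-contained sketch of that arithmetic:

#include <assert.h>
#include <stddef.h>

static size_t skip_step(size_t bytesSinceLastMatch)
{
    return bytesSinceLastMatch >> 8;    /* kSearchStrength assumed == 8 */
}

int main(void)
{
    assert(skip_step(2047) == 7);   /* normal mode: every position indexed */
    assert(skip_step(2303) == 8);   /* at the threshold, still normal */
    assert(skip_step(2304) == 9);   /* step > 8 : lazy skipping engages */
    return 0;
}
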
|
|
|
|
@@ -1939,7 +1989,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
ip ++;
|
|
curr++;
|
|
/* check repCode */
|
|
- if (offcode) {
|
|
+ if (offBase) {
|
|
const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
|
|
const U32 repIndex = (U32)(curr - offset_1);
|
|
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
|
|
@@ -1951,18 +2001,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
|
|
size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
|
|
int const gain2 = (int)(repLength * 3);
|
|
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
|
|
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
|
|
if ((repLength >= 4) && (gain2 > gain1))
|
|
- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
|
|
+ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
|
|
} }
|
|
|
|
/* search match, depth 1 */
|
|
- { size_t offset2=999999999;
|
|
- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict);
|
|
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
|
|
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
|
|
+ { size_t ofbCandidate = 999999999;
|
|
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
|
|
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
|
|
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
|
|
if ((ml2 >= 4) && (gain2 > gain1)) {
|
|
- matchLength = ml2, offcode = offset2, start = ip;
|
|
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
|
|
continue; /* search a better one */
|
|
} }
|
|
|
|
@@ -1971,7 +2021,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
ip ++;
|
|
curr++;
|
|
/* check repCode */
|
|
- if (offcode) {
|
|
+ if (offBase) {
|
|
const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
|
|
const U32 repIndex = (U32)(curr - offset_1);
|
|
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
|
|
@@ -1983,38 +2033,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
|
|
size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
|
|
int const gain2 = (int)(repLength * 4);
|
|
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
|
|
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
|
|
if ((repLength >= 4) && (gain2 > gain1))
|
|
- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
|
|
+ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
|
|
} }
|
|
|
|
/* search match, depth 2 */
|
|
- { size_t offset2=999999999;
|
|
- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict);
|
|
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
|
|
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
|
|
+ { size_t ofbCandidate = 999999999;
|
|
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
|
|
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
|
|
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
|
|
if ((ml2 >= 4) && (gain2 > gain1)) {
|
|
- matchLength = ml2, offcode = offset2, start = ip;
|
|
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
|
|
continue;
|
|
} } }
|
|
break; /* nothing found : store previous solution */
|
|
}
|
|
|
|
/* catch up */
|
|
- if (STORED_IS_OFFSET(offcode)) {
|
|
- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
|
|
+ if (OFFBASE_IS_OFFSET(offBase)) {
|
|
+ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
|
|
const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
|
|
const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
|
|
while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
|
|
- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
|
|
+ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
|
|
}
|
|
|
|
/* store sequence */
|
|
_storeSequence:
|
|
{ size_t const litLength = (size_t)(start - anchor);
|
|
- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
|
|
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
|
|
anchor = ip = start + matchLength;
|
|
}
|
|
+ if (ms->lazySkipping) {
|
|
+ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
|
|
+ if (searchMethod == search_rowHash) {
|
|
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
|
|
+ }
|
|
+ ms->lazySkipping = 0;
|
|
+ }
|
|
|
|
/* check immediate repcode */
|
|
while (ip <= ilimit) {
|
|
@@ -2029,8 +2086,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
/* repcode detected we should take it */
|
|
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
|
|
matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
|
|
- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */
|
|
- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
|
|
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */
|
|
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
|
|
ip += matchLength;
|
|
anchor = ip;
|
|
continue; /* faster when present ... (?) */
|
|
@@ -2096,7 +2153,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row(
|
|
size_t ZSTD_compressBlock_lazy2_extDict_row(
|
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
void const* src, size_t srcSize)
|
|
-
|
|
{
|
|
return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
|
|
}
|
|
diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h
|
|
index e5bdf4df8dde..9505bed93c03 100644
|
|
--- a/lib/zstd/compress/zstd_lazy.h
|
|
+++ b/lib/zstd/compress/zstd_lazy.h
|
|
@@ -1,5 +1,6 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
|
|
/*
|
|
- * Copyright (c) Yann Collet, Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
* All rights reserved.
|
|
*
|
|
* This source code is licensed under both the BSD-style license (found in the
|
|
@@ -22,6 +23,8 @@
|
|
*/
|
|
#define ZSTD_LAZY_DDSS_BUCKET_LOG 2
|
|
|
|
+#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
|
|
+
|
|
U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip);
|
|
void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip);
|
|
|
|
@@ -113,7 +116,7 @@ size_t ZSTD_compressBlock_lazy2_extDict_row(
|
|
size_t ZSTD_compressBlock_btlazy2_extDict(
|
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
void const* src, size_t srcSize);
|
|
-
|
|
+
|
|
|
|
|
|
#endif /* ZSTD_LAZY_H */
|
|
diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c
|
|
index dd86fc83e7dd..b7da76b0db7c 100644
|
|
--- a/lib/zstd/compress/zstd_ldm.c
|
|
+++ b/lib/zstd/compress/zstd_ldm.c
|
|
@@ -1,5 +1,6 @@
|
|
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
|
|
/*
|
|
- * Copyright (c) Yann Collet, Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
* All rights reserved.
|
|
*
|
|
* This source code is licensed under both the BSD-style license (found in the
|
|
@@ -242,11 +243,11 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
|
|
switch(ms->cParams.strategy)
|
|
{
|
|
case ZSTD_fast:
|
|
- ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast);
|
|
+ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
|
|
break;
|
|
|
|
case ZSTD_dfast:
|
|
- ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast);
|
|
+ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
|
|
break;
|
|
|
|
case ZSTD_greedy:
|
|
@@ -549,7 +550,7 @@ size_t ZSTD_ldm_generateSequences(
|
|
* the window through early invalidation.
|
|
* TODO: * Test the chunk size.
|
|
* * Try invalidation after the sequence generation and test the
|
|
- * the offset against maxDist directly.
|
|
+ * offset against maxDist directly.
|
|
*
|
|
* NOTE: Because of dictionaries + sequence splitting we MUST make sure
|
|
* that any offset used is valid at the END of the sequence, since it may
|
|
@@ -711,7 +712,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
|
|
rep[0] = sequence.offset;
|
|
/* Store the sequence */
|
|
ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend,
|
|
- STORE_OFFSET(sequence.offset),
|
|
+ OFFSET_TO_OFFBASE(sequence.offset),
|
|
sequence.matchLength);
|
|
ip += sequence.matchLength;
|
|
}
|
|
diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h
|
|
index fbc6a5e88fd7..c540731abde7 100644
|
|
--- a/lib/zstd/compress/zstd_ldm.h
|
|
+++ b/lib/zstd/compress/zstd_ldm.h
|
|
@@ -1,5 +1,6 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
|
|
/*
|
|
- * Copyright (c) Yann Collet, Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
* All rights reserved.
|
|
*
|
|
* This source code is licensed under both the BSD-style license (found in the
|
|
diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h
|
|
index 647f865be290..cfccfc46f6f7 100644
|
|
--- a/lib/zstd/compress/zstd_ldm_geartab.h
|
|
+++ b/lib/zstd/compress/zstd_ldm_geartab.h
|
|
@@ -1,5 +1,6 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
|
|
/*
|
|
- * Copyright (c) Yann Collet, Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
* All rights reserved.
|
|
*
|
|
* This source code is licensed under both the BSD-style license (found in the
|
|
diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c
|
|
index fd82acfda62f..1e41cb04f482 100644
|
|
--- a/lib/zstd/compress/zstd_opt.c
|
|
+++ b/lib/zstd/compress/zstd_opt.c
|
|
@@ -1,5 +1,6 @@
|
|
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
|
|
/*
|
|
- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
* All rights reserved.
|
|
*
|
|
* This source code is licensed under both the BSD-style license (found in the
|
|
@@ -16,7 +17,7 @@
|
|
#define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
|
|
#define ZSTD_MAX_PRICE (1<<30)
|
|
|
|
-#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
|
|
+#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
|
|
|
|
|
|
/*-*************************************
|
|
@@ -26,27 +27,35 @@
|
|
#if 0 /* approximation at bit level (for tests) */
|
|
# define BITCOST_ACCURACY 0
|
|
# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
|
|
-# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat))
|
|
+# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat))
|
|
#elif 0 /* fractional bit accuracy (for tests) */
|
|
# define BITCOST_ACCURACY 8
|
|
# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
|
|
-# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat))
|
|
+# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat))
|
|
#else /* opt==approx, ultra==accurate */
|
|
# define BITCOST_ACCURACY 8
|
|
# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
|
|
-# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
|
|
+# define WEIGHT(stat,opt) ((opt) ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
|
|
#endif
|
|
|
|
+/* ZSTD_bitWeight() :
|
|
+ * provide estimated "cost" of a stat in full bits only */
|
|
MEM_STATIC U32 ZSTD_bitWeight(U32 stat)
|
|
{
|
|
return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER);
|
|
}
|
|
|
|
+/* ZSTD_fracWeight() :
|
|
+ * provide fractional-bit "cost" of a stat,
|
|
+ * using linear interpolation approximation */
|
|
MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
|
|
{
|
|
U32 const stat = rawStat + 1;
|
|
U32 const hb = ZSTD_highbit32(stat);
|
|
U32 const BWeight = hb * BITCOST_MULTIPLIER;
|
|
+ /* Fweight was meant for "Fractional weight"
|
|
+ * but it's effectively a value between 1 and 2
|
|
+ * using fixed point arithmetic */
|
|
U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb;
|
|
U32 const weight = BWeight + FWeight;
|
|
assert(hb + BITCOST_ACCURACY < 31);
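
A worked instance of the interpolation, assuming BITCOST_ACCURACY == 8 as in the branch above (so one bit = 256 units); frac_weight is a hypothetical stand-in that uses a GCC/Clang builtin in place of ZSTD_highbit32:

#include <assert.h>

static unsigned frac_weight(unsigned rawStat)
{
    unsigned const stat = rawStat + 1;
    unsigned const hb = 31u - (unsigned)__builtin_clz(stat);  /* highbit32 */
    unsigned const BWeight = hb << 8;             /* whole bits, scaled */
    unsigned const FWeight = (stat << 8) >> hb;   /* in [256,512): linear part */
    return BWeight + FWeight;
}

int main(void)
{
    /* rawStat = 5 -> stat = 6, hb = 2 : 512 + 384 = 896, i.e. 3.5 scaled bits,
     * exactly halfway between the weights of stat 4 (768) and stat 8 (1024). */
    assert(frac_weight(3) == 768);
    assert(frac_weight(5) == 896);
    assert(frac_weight(7) == 1024);
    return 0;
}
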
|
|
@@ -57,7 +66,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
|
|
/* debugging function,
|
|
* @return price in bytes as fractional value
|
|
* for debug messages only */
|
|
-MEM_STATIC double ZSTD_fCost(U32 price)
|
|
+MEM_STATIC double ZSTD_fCost(int price)
|
|
{
|
|
return (double)price / (BITCOST_MULTIPLIER*8);
|
|
}
|
|
@@ -88,20 +97,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts)
|
|
return total;
|
|
}
|
|
|
|
-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift)
|
|
+typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e;
|
|
+
|
|
+static U32
|
|
+ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1)
|
|
{
|
|
U32 s, sum=0;
|
|
- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift);
|
|
+ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)",
|
|
+ (unsigned)lastEltIndex+1, (unsigned)shift );
|
|
assert(shift < 30);
|
|
for (s=0; s<lastEltIndex+1; s++) {
|
|
- table[s] = 1 + (table[s] >> shift);
|
|
- sum += table[s];
|
|
+ unsigned const base = base1 ? 1 : (table[s]>0);
|
|
+ unsigned const newStat = base + (table[s] >> shift);
|
|
+ sum += newStat;
|
|
+ table[s] = newStat;
|
|
}
|
|
return sum;
|
|
}
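
The base directive only matters for empty slots: base_1guaranteed keeps every symbol at a nonzero frequency (so it remains encodable), while base_0possible lets never-seen symbols stay at zero. A small worked example of the loop above, with shift == 2 (demo_* names are illustrative only):

#include <assert.h>

typedef enum { demo_base_0possible = 0, demo_base_1guaranteed = 1 } demo_base_e;

static unsigned demo_downscale(unsigned* table, unsigned n, unsigned shift, demo_base_e base1)
{
    unsigned s, sum = 0;
    for (s = 0; s < n; s++) {
        unsigned const base = base1 ? 1 : (table[s] > 0);
        table[s] = base + (table[s] >> shift);
        sum += table[s];
    }
    return sum;
}

int main(void)
{
    unsigned a[4] = {40, 3, 0, 9};   /* -> {11, 1, 1, 3} */
    unsigned b[4] = {40, 3, 0, 9};   /* -> {11, 1, 0, 3} */
    assert(demo_downscale(a, 4, 2, demo_base_1guaranteed) == 16);
    assert(demo_downscale(b, 4, 2, demo_base_0possible)  == 15);
    return 0;
}
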
|
|
|
|
/* ZSTD_scaleStats() :
|
|
- * reduce all elements in table is sum too large
|
|
+ * reduce all elt frequencies in table if sum too large
|
|
* return the resulting sum of elements */
|
|
static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
|
|
{
|
|
@@ -110,7 +125,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
|
|
DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget);
|
|
assert(logTarget < 30);
|
|
if (factor <= 1) return prevsum;
|
|
- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor));
|
|
+ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed);
|
|
}
|
|
|
|
/* ZSTD_rescaleFreqs() :
|
|
@@ -129,18 +144,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
|
|
DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize);
|
|
optPtr->priceType = zop_dynamic;
|
|
|
|
- if (optPtr->litLengthSum == 0) { /* first block : init */
|
|
- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */
|
|
- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef");
|
|
+ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */
|
|
+
|
|
+ /* heuristic: use pre-defined stats for too small inputs */
|
|
+ if (srcSize <= ZSTD_PREDEF_THRESHOLD) {
|
|
+ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD);
|
|
optPtr->priceType = zop_predef;
|
|
}
|
|
|
|
assert(optPtr->symbolCosts != NULL);
|
|
if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) {
|
|
- /* huffman table presumed generated by dictionary */
|
|
+
|
|
+ /* huffman stats covering the full value set : table presumed generated by dictionary */
|
|
optPtr->priceType = zop_dynamic;
|
|
|
|
if (compressedLiterals) {
|
|
+ /* generate literals statistics from huffman table */
|
|
unsigned lit;
|
|
assert(optPtr->litFreq != NULL);
|
|
optPtr->litSum = 0;
|
|
@@ -188,13 +207,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
|
|
optPtr->offCodeSum += optPtr->offCodeFreq[of];
|
|
} }
|
|
|
|
- } else { /* not a dictionary */
|
|
+ } else { /* first block, no dictionary */
|
|
|
|
assert(optPtr->litFreq != NULL);
|
|
if (compressedLiterals) {
|
|
+ /* base initial cost of literals on direct frequency within src */
|
|
unsigned lit = MaxLit;
|
|
HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */
|
|
- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8);
|
|
+ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible);
|
|
}
|
|
|
|
{ unsigned const baseLLfreqs[MaxLL+1] = {
|
|
@@ -224,10 +244,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
|
|
optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1);
|
|
}
|
|
|
|
-
|
|
}
|
|
|
|
- } else { /* new block : re-use previous statistics, scaled down */
|
|
+ } else { /* new block : scale down accumulated statistics */
|
|
|
|
if (compressedLiterals)
|
|
optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12);
|
|
@@ -255,11 +274,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
|
|
return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */
|
|
|
|
/* dynamic statistics */
|
|
- { U32 price = litLength * optPtr->litSumBasePrice;
|
|
+ { U32 price = optPtr->litSumBasePrice * litLength;
|
|
+ U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER;
|
|
U32 u;
|
|
+ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER);
|
|
for (u=0; u < litLength; u++) {
|
|
- assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */
|
|
- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel);
|
|
+ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel);
|
|
+ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax;
|
|
+ price -= litPrice;
|
|
}
|
|
return price;
|
|
}
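
The new cap is what keeps the subtraction meaningful: litSumBasePrice approximates the scaled cost of drawing any literal, and clamping each literal's weight to litSumBasePrice - BITCOST_MULTIPLIER floors every literal's price at one bit, so a hugely frequent byte can no longer drive the price toward zero or past it. Condensed into a sketch, with BITCOST_MULTIPLIER assumed == 256:

/* precondition, mirroring the assert above: litSumBasePrice >= 256 */
static unsigned lit_price(unsigned litSumBasePrice, unsigned weight)
{
    unsigned const litPriceMax = litSumBasePrice - 256;
    if (weight > litPriceMax)
        weight = litPriceMax;         /* cap: price never drops below 1 bit */
    return litSumBasePrice - weight;  /* always >= 256, i.e. >= 1 scaled bit */
}
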
|
|
@@ -272,10 +294,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP
|
|
assert(litLength <= ZSTD_BLOCKSIZE_MAX);
|
|
if (optPtr->priceType == zop_predef)
|
|
return WEIGHT(litLength, optLevel);
|
|
- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX
|
|
- * because it isn't representable in the zstd format. So instead just
|
|
- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block
|
|
- * would be all literals.
|
|
+
|
|
+ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX
|
|
+ * because it isn't representable in the zstd format.
|
|
+ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1.
|
|
+ * In such a case, the block would be all literals.
|
|
*/
|
|
if (litLength == ZSTD_BLOCKSIZE_MAX)
|
|
return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel);
|
|
@@ -289,24 +312,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP
|
|
}
|
|
|
|
/* ZSTD_getMatchPrice() :
|
|
- * Provides the cost of the match part (offset + matchLength) of a sequence
|
|
+ * Provides the cost of the match part (offset + matchLength) of a sequence.
|
|
* Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence.
|
|
- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2
|
|
+ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq()
|
|
* @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency)
|
|
*/
|
|
FORCE_INLINE_TEMPLATE U32
|
|
-ZSTD_getMatchPrice(U32 const offcode,
|
|
+ZSTD_getMatchPrice(U32 const offBase,
|
|
U32 const matchLength,
|
|
const optState_t* const optPtr,
|
|
int const optLevel)
|
|
{
|
|
U32 price;
|
|
- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode));
|
|
+ U32 const offCode = ZSTD_highbit32(offBase);
|
|
U32 const mlBase = matchLength - MINMATCH;
|
|
assert(matchLength >= MINMATCH);
|
|
|
|
- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */
|
|
- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER);
|
|
+ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */
|
|
+ return WEIGHT(mlBase, optLevel)
|
|
+ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */
|
|
|
|
/* dynamic statistics */
|
|
price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel));
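
For reference, the offBase sumtype convention this relies on, restated under the upstream-zstd assumption that ZSTD_REP_NUM == 3: repcodes occupy 1..3 and real offsets are shifted past them, so a single U32 carries either case unambiguously, and ZSTD_highbit32(offBase) can serve directly as the offset-code bucket above (demo_* names are illustrative only):

enum { DEMO_REP_NUM = 3 };  /* ZSTD_REP_NUM */

static unsigned demo_offset_to_offbase(unsigned offset)   { return offset + DEMO_REP_NUM; } /* offset >= 1 */
static unsigned demo_repcode_to_offbase(unsigned repcode) { return repcode; }               /* 1..3 */
static int      demo_offbase_is_offset(unsigned offBase)  { return offBase > DEMO_REP_NUM; }
static unsigned demo_offbase_to_offset(unsigned offBase)  { return offBase - DEMO_REP_NUM; }
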
|
|
@@ -325,10 +349,10 @@ ZSTD_getMatchPrice(U32 const offcode,
|
|
}
|
|
|
|
/* ZSTD_updateStats() :
|
|
- * assumption : literals + litLengtn <= iend */
|
|
+ * assumption : literals + litLength <= iend */
|
|
static void ZSTD_updateStats(optState_t* const optPtr,
|
|
U32 litLength, const BYTE* literals,
|
|
- U32 offsetCode, U32 matchLength)
|
|
+ U32 offBase, U32 matchLength)
|
|
{
|
|
/* literals */
|
|
if (ZSTD_compressedLiterals(optPtr)) {
|
|
@@ -344,8 +368,8 @@ static void ZSTD_updateStats(optState_t* const optPtr,
|
|
optPtr->litLengthSum++;
|
|
}
|
|
|
|
- /* offset code : expected to follow storeSeq() numeric representation */
|
|
- { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode));
|
|
+ /* offset code : follows storeSeq() numeric representation */
|
|
+ { U32 const offCode = ZSTD_highbit32(offBase);
|
|
assert(offCode <= MaxOff);
|
|
optPtr->offCodeFreq[offCode]++;
|
|
optPtr->offCodeSum++;
|
|
@@ -552,16 +576,17 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) {
|
|
ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict);
|
|
}
|
|
|
|
-FORCE_INLINE_TEMPLATE
|
|
-U32 ZSTD_insertBtAndGetAllMatches (
|
|
- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */
|
|
- ZSTD_matchState_t* ms,
|
|
- U32* nextToUpdate3,
|
|
- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode,
|
|
- const U32 rep[ZSTD_REP_NUM],
|
|
- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */
|
|
- const U32 lengthToBeat,
|
|
- U32 const mls /* template */)
|
|
+FORCE_INLINE_TEMPLATE U32
|
|
+ZSTD_insertBtAndGetAllMatches (
|
|
+ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */
|
|
+ ZSTD_matchState_t* ms,
|
|
+ U32* nextToUpdate3,
|
|
+ const BYTE* const ip, const BYTE* const iLimit,
|
|
+ const ZSTD_dictMode_e dictMode,
|
|
+ const U32 rep[ZSTD_REP_NUM],
|
|
+ const U32 ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */
|
|
+ const U32 lengthToBeat,
|
|
+ const U32 mls /* template */)
|
|
{
|
|
const ZSTD_compressionParameters* const cParams = &ms->cParams;
|
|
U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
|
|
@@ -644,7 +669,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
|
|
DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u",
|
|
repCode, ll0, repOffset, repLen);
|
|
bestLength = repLen;
|
|
- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */
|
|
+ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */
|
|
matches[mnum].len = (U32)repLen;
|
|
mnum++;
|
|
if ( (repLen > sufficient_len)
|
|
@@ -673,7 +698,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
|
|
bestLength = mlen;
|
|
assert(curr > matchIndex3);
|
|
assert(mnum==0); /* no prior solution */
|
|
- matches[0].off = STORE_OFFSET(curr - matchIndex3);
|
|
+ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3);
|
|
matches[0].len = (U32)mlen;
|
|
mnum = 1;
|
|
if ( (mlen > sufficient_len) |
|
|
@@ -706,13 +731,13 @@ U32 ZSTD_insertBtAndGetAllMatches (
|
|
}
|
|
|
|
if (matchLength > bestLength) {
|
|
- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)",
|
|
- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex));
|
|
+ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)",
|
|
+ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex));
|
|
assert(matchEndIdx > matchIndex);
|
|
if (matchLength > matchEndIdx - matchIndex)
|
|
matchEndIdx = matchIndex + (U32)matchLength;
|
|
bestLength = matchLength;
|
|
- matches[mnum].off = STORE_OFFSET(curr - matchIndex);
|
|
+ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex);
|
|
matches[mnum].len = (U32)matchLength;
|
|
mnum++;
|
|
if ( (matchLength > ZSTD_OPT_NUM)
|
|
@@ -754,12 +779,12 @@ U32 ZSTD_insertBtAndGetAllMatches (
|
|
|
|
if (matchLength > bestLength) {
|
|
matchIndex = dictMatchIndex + dmsIndexDelta;
|
|
- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)",
|
|
- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex));
|
|
+ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)",
|
|
+ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex));
|
|
if (matchLength > matchEndIdx - matchIndex)
|
|
matchEndIdx = matchIndex + (U32)matchLength;
|
|
bestLength = matchLength;
|
|
- matches[mnum].off = STORE_OFFSET(curr - matchIndex);
|
|
+ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex);
|
|
matches[mnum].len = (U32)matchLength;
|
|
mnum++;
|
|
if ( (matchLength > ZSTD_OPT_NUM)
|
|
@@ -960,7 +985,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
|
|
const ZSTD_optLdm_t* optLdm, U32 currPosInBlock)
|
|
{
|
|
U32 const posDiff = currPosInBlock - optLdm->startPosInBlock;
|
|
- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */
|
|
+ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */
|
|
U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff;
|
|
|
|
/* Ensure that current block position is not outside of the match */
|
|
@@ -971,11 +996,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
|
|
}
|
|
|
|
if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) {
|
|
- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset);
|
|
- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u",
|
|
- candidateOffCode, candidateMatchLength, currPosInBlock);
|
|
+ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset);
|
|
+ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u",
|
|
+ candidateOffBase, candidateMatchLength, currPosInBlock);
|
|
matches[*nbMatches].len = candidateMatchLength;
|
|
- matches[*nbMatches].off = candidateOffCode;
|
|
+ matches[*nbMatches].off = candidateOffBase;
|
|
(*nbMatches)++;
|
|
}
|
|
}
|
|
@@ -1062,6 +1087,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
|
|
ZSTD_optimal_t lastSequence;
|
|
ZSTD_optLdm_t optLdm;
|
|
|
|
+ ZSTD_memset(&lastSequence, 0, sizeof(ZSTD_optimal_t));
|
|
+
|
|
optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore;
|
|
optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0;
|
|
ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip));
|
|
@@ -1098,14 +1125,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
|
|
|
|
/* large match -> immediate encoding */
|
|
{ U32 const maxML = matches[nbMatches-1].len;
|
|
- U32 const maxOffcode = matches[nbMatches-1].off;
|
|
- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series",
|
|
- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart));
|
|
+ U32 const maxOffBase = matches[nbMatches-1].off;
|
|
+ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series",
|
|
+ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart));
|
|
|
|
if (maxML > sufficient_len) {
|
|
lastSequence.litlen = litlen;
|
|
lastSequence.mlen = maxML;
|
|
- lastSequence.off = maxOffcode;
|
|
+ lastSequence.off = maxOffBase;
|
|
DEBUGLOG(6, "large match (%u>%u), immediate encoding",
|
|
maxML, sufficient_len);
|
|
cur = 0;
|
|
@@ -1122,15 +1149,15 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
|
|
opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */
|
|
}
|
|
for (matchNb = 0; matchNb < nbMatches; matchNb++) {
|
|
- U32 const offcode = matches[matchNb].off;
|
|
+ U32 const offBase = matches[matchNb].off;
|
|
U32 const end = matches[matchNb].len;
|
|
for ( ; pos <= end ; pos++ ) {
|
|
- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel);
|
|
+ U32 const matchPrice = ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel);
|
|
U32 const sequencePrice = literalsPrice + matchPrice;
|
|
DEBUGLOG(7, "rPos:%u => set initial price : %.2f",
|
|
- pos, ZSTD_fCost(sequencePrice));
|
|
+ pos, ZSTD_fCost((int)sequencePrice));
|
|
opt[pos].mlen = pos;
|
|
- opt[pos].off = offcode;
|
|
+ opt[pos].off = offBase;
|
|
opt[pos].litlen = litlen;
|
|
opt[pos].price = (int)sequencePrice;
|
|
} }
|
|
@@ -1230,7 +1257,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
|
|
U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch;
|
|
U32 mlen;
|
|
|
|
- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u",
|
|
+ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u",
|
|
matchNb, matches[matchNb].off, lastML, litlen);
|
|
|
|
for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */
|
|
@@ -1296,7 +1323,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
|
|
for (storePos=storeStart; storePos <= storeEnd; storePos++) {
|
|
U32 const llen = opt[storePos].litlen;
|
|
U32 const mlen = opt[storePos].mlen;
|
|
- U32 const offCode = opt[storePos].off;
|
|
+ U32 const offBase = opt[storePos].off;
|
|
U32 const advance = llen + mlen;
|
|
DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u",
|
|
anchor - istart, (unsigned)llen, (unsigned)mlen);
|
|
@@ -1308,8 +1335,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
|
|
}
|
|
|
|
assert(anchor + llen <= iend);
|
|
- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen);
|
|
- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen);
|
|
+ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen);
|
|
+ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen);
|
|
anchor += advance;
|
|
ip = anchor;
|
|
} }
|
|
@@ -1349,7 +1376,7 @@ size_t ZSTD_compressBlock_btopt(
|
|
/* ZSTD_initStats_ultra():
|
|
* make a first compression pass, just to seed stats with more accurate starting values.
|
|
* only works on first block, with no dictionary and no ldm.
|
|
- * this function cannot error, hence its contract must be respected.
|
|
+ * this function cannot error out, its narrow contract must be respected.
|
|
*/
|
|
static void
|
|
ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
|
|
@@ -1368,7 +1395,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
|
|
|
|
ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/
|
|
|
|
- /* invalidate first scan from history */
|
|
+ /* invalidate first scan from history, only keep entropy stats */
|
|
ZSTD_resetSeqStore(seqStore);
|
|
ms->window.base -= srcSize;
|
|
ms->window.dictLimit += (U32)srcSize;
|
|
@@ -1392,20 +1419,20 @@ size_t ZSTD_compressBlock_btultra2(
|
|
U32 const curr = (U32)((const BYTE*)src - ms->window.base);
|
|
DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize);
|
|
|
|
- /* 2-pass strategy:
|
|
+ /* 2-passes strategy:
|
|
* this strategy makes a first pass over first block to collect statistics
|
|
- * and seed next round's statistics with it.
|
|
- * After 1st pass, function forgets everything, and starts a new block.
|
|
+ * in order to seed next round's statistics with it.
|
|
+ * After 1st pass, function forgets history, and starts a new block.
|
|
* Consequently, this can only work if no data has been previously loaded in tables,
|
|
* aka, no dictionary, no prefix, no ldm preprocessing.
|
|
* The compression ratio gain is generally small (~0.5% on first block),
|
|
- * the cost is 2x cpu time on first block. */
|
|
+ ** the cost is 2x cpu time on first block. */
|
|
assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
|
|
if ( (ms->opt.litLengthSum==0) /* first block */
|
|
&& (seqStore->sequences == seqStore->sequencesStart) /* no ldm */
|
|
&& (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */
|
|
- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */
|
|
- && (srcSize > ZSTD_PREDEF_THRESHOLD)
|
|
+ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */
|
|
+ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */
|
|
) {
|
|
ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize);
|
|
}
|
|
diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h
|
|
index 22b862858ba7..faa73ff4b03d 100644
|
|
--- a/lib/zstd/compress/zstd_opt.h
|
|
+++ b/lib/zstd/compress/zstd_opt.h
|
|
@@ -1,5 +1,6 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
|
|
/*
|
|
- * Copyright (c) Yann Collet, Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
* All rights reserved.
|
|
*
|
|
* This source code is licensed under both the BSD-style license (found in the
|
|
diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c
|
|
index 60958afebc41..d172e35fbd9a 100644
|
|
--- a/lib/zstd/decompress/huf_decompress.c
|
|
+++ b/lib/zstd/decompress/huf_decompress.c
|
|
@@ -1,7 +1,8 @@
|
|
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
|
|
/* ******************************************************************
|
|
* huff0 huffman decoder,
|
|
* part of Finite State Entropy library
|
|
- * Copyright (c) Yann Collet, Facebook, Inc.
|
|
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
*
|
|
* You can contact the author at :
|
|
* - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
|
|
@@ -19,10 +20,10 @@
|
|
#include "../common/compiler.h"
|
|
#include "../common/bitstream.h" /* BIT_* */
|
|
#include "../common/fse.h" /* to compress headers */
|
|
-#define HUF_STATIC_LINKING_ONLY
|
|
#include "../common/huf.h"
|
|
#include "../common/error_private.h"
|
|
#include "../common/zstd_internal.h"
|
|
+#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */
|
|
|
|
/* **************************************************************
|
|
* Constants
|
|
@@ -43,27 +44,25 @@
|
|
#error "Cannot force the use of the X1 and X2 decoders at the same time!"
|
|
#endif
|
|
|
|
-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2
|
|
-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
|
|
+/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is
|
|
+ * supported at runtime, so we can add the BMI2 target attribute.
|
|
+ * When it is disabled, we will still get BMI2 if it is enabled statically.
|
|
+ */
|
|
+#if DYNAMIC_BMI2
|
|
+# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
|
|
#else
|
|
-# define HUF_ASM_X86_64_BMI2_ATTRS
|
|
+# define HUF_FAST_BMI2_ATTRS
|
|
#endif
|
|
|
|
#define HUF_EXTERN_C
|
|
#define HUF_ASM_DECL HUF_EXTERN_C
|
|
|
|
-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
|
|
+#if DYNAMIC_BMI2
|
|
# define HUF_NEED_BMI2_FUNCTION 1
|
|
#else
|
|
# define HUF_NEED_BMI2_FUNCTION 0
|
|
#endif
|
|
|
|
-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
|
|
-# define HUF_NEED_DEFAULT_FUNCTION 1
|
|
-#else
|
|
-# define HUF_NEED_DEFAULT_FUNCTION 0
|
|
-#endif
|
|
-
|
|
/* **************************************************************
|
|
* Error Management
|
|
****************************************************************/
|
|
@@ -80,6 +79,11 @@
|
|
/* **************************************************************
|
|
* BMI2 Variant Wrappers
|
|
****************************************************************/
|
|
+typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize,
|
|
+ const void *cSrc,
|
|
+ size_t cSrcSize,
|
|
+ const HUF_DTable *DTable);
|
|
+
|
|
#if DYNAMIC_BMI2
|
|
|
|
#define HUF_DGEN(fn) \
|
|
@@ -101,9 +105,9 @@
|
|
} \
|
|
\
|
|
static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
|
|
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
|
|
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
|
|
{ \
|
|
- if (bmi2) { \
|
|
+ if (flags & HUF_flags_bmi2) { \
|
|
return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \
|
|
} \
|
|
return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \
|
|
@@ -113,9 +117,9 @@
|
|
|
|
#define HUF_DGEN(fn) \
|
|
static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
|
|
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
|
|
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
|
|
{ \
|
|
- (void)bmi2; \
|
|
+ (void)flags; \
|
|
return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
|
|
}
|
|
|
|
@@ -134,15 +138,28 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
|
|
return dtd;
|
|
}
|
|
|
|
-#if ZSTD_ENABLE_ASM_X86_64_BMI2
|
|
-
|
|
-static size_t HUF_initDStream(BYTE const* ip) {
|
|
+static size_t HUF_initFastDStream(BYTE const* ip) {
|
|
BYTE const lastByte = ip[7];
|
|
- size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
|
|
+ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
|
|
size_t const value = MEM_readLEST(ip) | 1;
|
|
assert(bitsConsumed <= 8);
|
|
+ assert(sizeof(size_t) == 8);
|
|
return value << bitsConsumed;
|
|
}
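
The `| 1` above is the load-bearing detail: it plants a sentinel in the lowest bit before the shift, establishing the invariant (used when the fast loop reloads, and again when streams are handed back to BIT_DStream_t) that counting trailing zeros of the container yields exactly the number of bits consumed. A minimal check of that invariant on a 64-bit container:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t const payload = 0x0123456789abcdefULL;
    int consumed;
    for (consumed = 0; consumed <= 8; consumed++) {
        uint64_t const container = (payload | 1) << consumed;
        /* __builtin_ctzll stands in for ZSTD_countTrailingZeros64 */
        assert(__builtin_ctzll(container) == consumed);
    }
    return 0;
}
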
|
|
+
|
|
+
|
|
+/*
|
|
+ * The input/output arguments to the Huffman fast decoding loop:
|
|
+ *
|
|
+ * ip [in/out] - The input pointers, must be updated to reflect what is consumed.
|
|
+ * op [in/out] - The output pointers, must be updated to reflect what is written.
|
|
+ * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
|
|
+ * dt [in] - The decoding table.
|
|
+ * ilimit [in] - The input limit, stop when any input pointer is below ilimit.
|
|
+ * oend [in] - The end of the output stream. op[3] must not cross oend.
|
|
+ * iend [in] - The end of each input stream. ip[i] may cross iend[i],
|
|
+ * as long as it is above ilimit, but that indicates corruption.
|
|
+ */
|
|
typedef struct {
|
|
BYTE const* ip[4];
|
|
BYTE* op[4];
|
|
@@ -151,15 +168,17 @@ typedef struct {
|
|
BYTE const* ilimit;
|
|
BYTE* oend;
|
|
BYTE const* iend[4];
|
|
-} HUF_DecompressAsmArgs;
|
|
+} HUF_DecompressFastArgs;
|
|
+
|
|
+typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*);
|
|
|
|
/*
|
|
- * Initializes args for the asm decoding loop.
|
|
- * @returns 0 on success
|
|
- * 1 if the fallback implementation should be used.
|
|
+ * Initializes args for the fast decoding loop.
|
|
+ * @returns 1 on success
|
|
+ * 0 if the fallback implementation should be used.
|
|
* Or an error code on failure.
|
|
*/
|
|
-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
|
|
+static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
|
|
{
|
|
void const* dt = DTable + 1;
|
|
U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
|
|
@@ -168,9 +187,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
|
|
|
|
BYTE* const oend = (BYTE*)dst + dstSize;
|
|
|
|
- /* The following condition is false on x32 platform,
|
|
- * but HUF_asm is not compatible with this ABI */
|
|
- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1;
|
|
+ /* The fast decoding loop assumes 64-bit little-endian.
|
|
+ * This condition is false on x32.
|
|
+ */
|
|
+ if (!MEM_isLittleEndian() || MEM_32bits())
|
|
+ return 0;
|
|
|
|
/* strict minimum : jump table + 1 byte per stream */
|
|
if (srcSize < 10)
|
|
@@ -181,7 +202,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
|
|
* On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
|
|
*/
|
|
if (dtLog != HUF_DECODER_FAST_TABLELOG)
|
|
- return 1;
|
|
+ return 0;
|
|
|
|
/* Read the jump table. */
|
|
{
|
|
@@ -195,13 +216,13 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
|
|
args->iend[2] = args->iend[1] + length2;
|
|
args->iend[3] = args->iend[2] + length3;
|
|
|
|
- /* HUF_initDStream() requires this, and this small of an input
|
|
+ /* HUF_initFastDStream() requires this, and this small of an input
|
|
* won't benefit from the ASM loop anyways.
|
|
* length1 must be >= 16 so that ip[0] >= ilimit before the loop
|
|
* starts.
|
|
*/
|
|
if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
|
|
- return 1;
|
|
+ return 0;
|
|
if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */
|
|
}
|
|
/* ip[] contains the position that is currently loaded into bits[]. */
|
|
@@ -218,7 +239,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
|
|
|
|
/* No point to call the ASM loop for tiny outputs. */
|
|
if (args->op[3] >= oend)
|
|
- return 1;
|
|
+ return 0;
|
|
|
|
/* bits[] is the bit container.
|
|
* It is read from the MSB down to the LSB.
|
|
@@ -227,10 +248,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
|
|
* set, so that CountTrailingZeros(bits[]) can be used
|
|
* to count how many bits we've consumed.
|
|
*/
|
|
- args->bits[0] = HUF_initDStream(args->ip[0]);
|
|
- args->bits[1] = HUF_initDStream(args->ip[1]);
|
|
- args->bits[2] = HUF_initDStream(args->ip[2]);
|
|
- args->bits[3] = HUF_initDStream(args->ip[3]);
|
|
+ args->bits[0] = HUF_initFastDStream(args->ip[0]);
|
|
+ args->bits[1] = HUF_initFastDStream(args->ip[1]);
|
|
+ args->bits[2] = HUF_initFastDStream(args->ip[2]);
|
|
+ args->bits[3] = HUF_initFastDStream(args->ip[3]);
|
|
|
|
/* If ip[] >= ilimit, it is guaranteed to be safe to
|
|
* reload bits[]. It may be beyond its section, but is
|
|
@@ -241,10 +262,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
|
|
args->oend = oend;
|
|
args->dt = dt;
|
|
|
|
- return 0;
|
|
+ return 1;
|
|
}
|
|
|
|
-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd)
|
|
+static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd)
|
|
{
|
|
/* Validate that we haven't overwritten. */
|
|
if (args->op[stream] > segmentEnd)
|
|
@@ -258,15 +279,15 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs
|
|
return ERROR(corruption_detected);
|
|
|
|
/* Construct the BIT_DStream_t. */
|
|
- bit->bitContainer = MEM_readLE64(args->ip[stream]);
|
|
- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]);
|
|
+ assert(sizeof(size_t) == 8);
|
|
+ bit->bitContainer = MEM_readLEST(args->ip[stream]);
|
|
+ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
|
|
bit->start = (const char*)args->iend[0];
|
|
bit->limitPtr = bit->start + sizeof(size_t);
|
|
bit->ptr = (const char*)args->ip[stream];
|
|
|
|
return 0;
|
|
}
|
|
-#endif
|
|
|
|
|
|
#ifndef HUF_FORCE_DECOMPRESS_X2
|
|
@@ -283,10 +304,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi
|
|
static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
|
|
U64 D4;
|
|
if (MEM_isLittleEndian()) {
|
|
- D4 = (symbol << 8) + nbBits;
|
|
+ D4 = (U64)((symbol << 8) + nbBits);
|
|
} else {
|
|
- D4 = symbol + (nbBits << 8);
|
|
+ D4 = (U64)(symbol + (nbBits << 8));
|
|
}
|
|
+ assert(D4 < (1U << 16));
|
|
D4 *= 0x0001000100010001ULL;
|
|
return D4;
|
|
}
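
The multiply by 0x0001000100010001 broadcasts the assembled 16-bit entry into all four 16-bit lanes of the U64 (the new assert guards the no-carry precondition D4 < 2^16), which is what lets the X1 decode table be filled four cells per 64-bit store. A one-line check:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t const d4 = (uint64_t)0x1234 * 0x0001000100010001ULL;
    assert(d4 == 0x1234123412341234ULL);  /* one entry, four lanes */
    return 0;
}
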
|
|
@@ -329,13 +351,7 @@ typedef struct {
|
|
BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
|
|
} HUF_ReadDTableX1_Workspace;
|
|
|
|
-
|
|
-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
|
|
-{
|
|
- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
|
|
-}
|
|
-
|
|
-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2)
|
|
+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags)
|
|
{
|
|
U32 tableLog = 0;
|
|
U32 nbSymbols = 0;
|
|
@@ -350,7 +366,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
|
|
DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
|
|
/* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
|
|
|
|
- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2);
|
|
+ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags);
|
|
if (HUF_isError(iSize)) return iSize;
|
|
|
|
|
|
@@ -377,9 +393,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
|
|
* rankStart[0] is not filled because there are no entries in the table for
|
|
* weight 0.
|
|
*/
|
|
- {
|
|
- int n;
|
|
- int nextRankStart = 0;
|
|
+ { int n;
|
|
+ U32 nextRankStart = 0;
|
|
int const unroll = 4;
|
|
int const nLimit = (int)nbSymbols - unroll + 1;
|
|
for (n=0; n<(int)tableLog+1; n++) {
|
|
@@ -406,10 +421,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
|
|
* We can switch based on the length to a different inner loop which is
|
|
* optimized for that particular case.
|
|
*/
|
|
- {
|
|
- U32 w;
|
|
- int symbol=wksp->rankVal[0];
|
|
- int rankStart=0;
|
|
+ { U32 w;
|
|
+ int symbol = wksp->rankVal[0];
|
|
+ int rankStart = 0;
|
|
for (w=1; w<tableLog+1; ++w) {
|
|
int const symbolCount = wksp->rankVal[w];
|
|
int const length = (1 << w) >> 1;
|
|
@@ -519,7 +533,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
|
|
while (p < pEnd)
|
|
HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
|
|
|
|
- return pEnd-pStart;
|
|
+ return (size_t)(pEnd-pStart);
|
|
}
|
|
|
|
FORCE_INLINE_TEMPLATE size_t
|
|
@@ -545,6 +559,10 @@ HUF_decompress1X1_usingDTable_internal_body(
|
|
return dstSize;
|
|
}
|
|
|
|
+/* HUF_decompress4X1_usingDTable_internal_body():
|
|
+ * Conditions :
|
|
+ * @dstSize >= 6
|
|
+ */
|
|
FORCE_INLINE_TEMPLATE size_t
|
|
HUF_decompress4X1_usingDTable_internal_body(
|
|
void* dst, size_t dstSize,
|
|
@@ -588,6 +606,7 @@ HUF_decompress4X1_usingDTable_internal_body(
|
|
|
|
if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
|
|
if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
|
|
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
|
|
CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
|
|
CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
|
|
CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
|
|
@@ -650,38 +669,142 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
|
|
}
|
|
#endif
|
|
|
|
-#if HUF_NEED_DEFAULT_FUNCTION
|
|
static
|
|
size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
|
|
size_t cSrcSize, HUF_DTable const* DTable) {
|
|
return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
|
|
}
|
|
-#endif
|
|
|
|
#if ZSTD_ENABLE_ASM_X86_64_BMI2
|
|
|
|
-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
|
|
+HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
|
|
+
|
|
+#endif
|
|
+
|
|
+static HUF_FAST_BMI2_ATTRS
|
|
+void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
|
|
+{
|
|
+ U64 bits[4];
|
|
+ BYTE const* ip[4];
|
|
+ BYTE* op[4];
|
|
+ U16 const* const dtable = (U16 const*)args->dt;
|
|
+ BYTE* const oend = args->oend;
|
|
+ BYTE const* const ilimit = args->ilimit;
|
|
+
|
|
+ /* Copy the arguments to local variables */
|
|
+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
|
|
+ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
|
|
+ ZSTD_memcpy(&op, &args->op, sizeof(op));
|
|
+
|
|
+ assert(MEM_isLittleEndian());
|
|
+ assert(!MEM_32bits());
|
|
+
|
|
+ for (;;) {
|
|
+ BYTE* olimit;
|
|
+ int stream;
|
|
+ int symbol;
|
|
+
|
|
+ /* Assert loop preconditions */
|
|
+#ifndef NDEBUG
|
|
+ for (stream = 0; stream < 4; ++stream) {
|
|
+ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
|
|
+ assert(ip[stream] >= ilimit);
|
|
+ }
|
|
+#endif
|
|
+ /* Compute olimit */
|
|
+ {
|
|
+ /* Each iteration produces 5 output symbols per stream */
|
|
+ size_t const oiters = (size_t)(oend - op[3]) / 5;
|
|
+ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
|
|
+ * per stream.
|
|
+ */
|
|
+ size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
|
|
+ /* We can safely run iters iterations before running bounds checks */
|
|
+ size_t const iters = MIN(oiters, iiters);
|
|
+ size_t const symbols = iters * 5;
|
|
+
|
|
+ /* We can simply check that op[3] < olimit, instead of checking all
|
|
+ * of our bounds, since we can't hit the other bounds until we've run
|
|
+ * iters iterations, which only happens when op[3] == olimit.
|
|
+ */
|
|
+ olimit = op[3] + symbols;
|
|
+
|
|
+ /* Exit fast decoding loop once we get close to the end. */
|
|
+ if (op[3] + 20 > olimit)
|
|
+ break;
|
|
+
|
|
+ /* Exit the decoding loop if any input pointer has crossed the
|
|
+ * previous one. This indicates corruption, and a precondition
|
|
+ * to our loop is that ip[i] >= ip[0].
|
|
+ */
|
|
+ for (stream = 1; stream < 4; ++stream) {
|
|
+ if (ip[stream] < ip[stream - 1])
|
|
+ goto _out;
|
|
+ }
|
|
+ }
|
|
+
|
|
+#ifndef NDEBUG
|
|
+ for (stream = 1; stream < 4; ++stream) {
|
|
+ assert(ip[stream] >= ip[stream - 1]);
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ do {
|
|
+ /* Decode 5 symbols in each of the 4 streams */
|
|
+ for (symbol = 0; symbol < 5; ++symbol) {
|
|
+ for (stream = 0; stream < 4; ++stream) {
|
|
+ int const index = (int)(bits[stream] >> 53);
|
|
+ int const entry = (int)dtable[index];
|
|
+ bits[stream] <<= (entry & 63);
|
|
+ op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF);
|
|
+ }
|
|
+ }
|
|
+ /* Reload the bitstreams */
|
|
+ for (stream = 0; stream < 4; ++stream) {
|
|
+ int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
|
|
+ int const nbBits = ctz & 7;
|
|
+ int const nbBytes = ctz >> 3;
|
|
+ op[stream] += 5;
|
|
+ ip[stream] -= nbBytes;
|
|
+ bits[stream] = MEM_read64(ip[stream]) | 1;
|
|
+ bits[stream] <<= nbBits;
|
|
+ }
|
|
+ } while (op[3] < olimit);
|
|
+ }
|
|
+
|
|
+_out:
|
|
|
|
-static HUF_ASM_X86_64_BMI2_ATTRS
|
|
+ /* Save the final values of each of the state variables back to args. */
|
|
+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
|
|
+ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
|
|
+ ZSTD_memcpy(&args->op, &op, sizeof(op));
|
|
+}
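
The olimit computation is what keeps the hot loop branch-light: remaining input and output room are converted up front into a count of iterations that need no per-symbol bounds checks, and only op[3] is compared against the resulting limit. Its arithmetic, restated as a sketch under the assumption HUF_DECODER_FAST_TABLELOG == 11 (so 5 symbols cost at most 55 bits, i.e. fewer than 7 bytes, per stream per iteration; demo_safe_iters is a hypothetical helper):

#include <stddef.h>

static size_t demo_safe_iters(size_t outBytesLeft, size_t inBytesLeft)
{
    size_t const oiters = outBytesLeft / 5;    /* 5 bytes written per stream/iter */
    size_t const iiters = inBytesLeft / 7;     /* < 7 bytes read per stream/iter */
    return oiters < iiters ? oiters : iiters;  /* olimit = op[3] + 5 * iters */
}
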
+
+/*
+ * @returns @p dstSize on success (>= 6)
+ *          0 if the fallback implementation should be used
+ *          An error if an error occurred
+ */
+static HUF_FAST_BMI2_ATTRS
 size_t
-HUF_decompress4X1_usingDTable_internal_bmi2_asm(
+HUF_decompress4X1_usingDTable_internal_fast(
     void* dst, size_t dstSize,
     const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
+    const HUF_DTable* DTable,
+    HUF_DecompressFastLoopFn loopFn)
 {
     void const* dt = DTable + 1;
     const BYTE* const iend = (const BYTE*)cSrc + 6;
     BYTE* const oend = (BYTE*)dst + dstSize;
-    HUF_DecompressAsmArgs args;
-    {
-        size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
-        FORWARD_IF_ERROR(ret, "Failed to init asm args");
-        if (ret != 0)
-            return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+    HUF_DecompressFastArgs args;
+    {   size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
+        FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
+        if (ret == 0)
+            return 0;
     }

     assert(args.ip[0] >= args.ilimit);
-    HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args);
+    loopFn(&args);

     /* Our loop guarantees that ip[] >= ilimit and that we haven't
      * overwritten any op[].
@@ -694,8 +817,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
     (void)iend;

     /* finish bit streams one by one. */
-    {
-        size_t const segmentSize = (dstSize+3) / 4;
+    {   size_t const segmentSize = (dstSize+3) / 4;
         BYTE* segmentEnd = (BYTE*)dst;
         int i;
         for (i = 0; i < 4; ++i) {
@@ -712,97 +834,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
     }

     /* decoded size */
+    assert(dstSize != 0);
     return dstSize;
 }
-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
-
-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
-                                               const void *cSrc,
-                                               size_t cSrcSize,
-                                               const HUF_DTable *DTable);

 HUF_DGEN(HUF_decompress1X1_usingDTable_internal)

 static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
-                    size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
+                    size_t cSrcSize, HUF_DTable const* DTable, int flags)
 {
+    HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default;
+    HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop;
+
 #if DYNAMIC_BMI2
-    if (bmi2) {
+    if (flags & HUF_flags_bmi2) {
+        fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2;
 # if ZSTD_ENABLE_ASM_X86_64_BMI2
-        return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-# else
-        return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+        if (!(flags & HUF_flags_disableAsm)) {
+            loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
+        }
 # endif
+    } else {
+        return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
     }
-#else
-    (void)bmi2;
 #endif

 #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
-    return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-#else
-    return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
+    if (!(flags & HUF_flags_disableAsm)) {
+        loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
+    }
 #endif
-}
-
-
-size_t HUF_decompress1X1_usingDTable(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    DTableDesc dtd = HUF_getDTableDesc(DTable);
-    if (dtd.tableType != 0) return ERROR(GENERIC);
-    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-}

-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
-                                   const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
-{
-    const BYTE* ip = (const BYTE*) cSrc;
-
-    size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
-    if (HUF_isError(hSize)) return hSize;
-    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
-    ip += hSize; cSrcSize -= hSize;
-
-    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
-}
-
-
-size_t HUF_decompress4X1_usingDTable(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    DTableDesc dtd = HUF_getDTableDesc(DTable);
-    if (dtd.tableType != 0) return ERROR(GENERIC);
-    return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+    if (!(flags & HUF_flags_disableFast)) {
+        size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
+        if (ret != 0)
+            return ret;
+    }
+    return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
 }
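/*
 * Illustrative caller, not upstream code: with this patch the single bmi2
 * integer becomes a bit-set, so capabilities and opt-outs compose with OR,
 * and a fast-path return value of 0 means "use the fallback". The flag
 * names below are the ones introduced by this patch; the wrapper itself
 * is hypothetical.
 */
static size_t HUF_exampleDecompress4X1(void* dst, size_t dstSize,
                                       const void* cSrc, size_t cSrcSize,
                                       const HUF_DTable* DTable, int cpuHasBmi2)
{
    int flags = 0;
    if (cpuHasBmi2)
        flags |= HUF_flags_bmi2;        /* allow BMI2 fallback + fast loops */
    /* flags |= HUF_flags_disableAsm;      would skip only the asm loop    */
    /* flags |= HUF_flags_disableFast;     would skip the fast path wholly */
    return HUF_decompress4X1_usingDTable_internal(dst, dstSize,
                                                  cSrc, cSrcSize, DTable, flags);
}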

-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize, int bmi2)
+                                   void* workSpace, size_t wkspSize, int flags)
 {
     const BYTE* ip = (const BYTE*) cSrc;

-    size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+    size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;

-    return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+    return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
 }

-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
-                                   const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
-{
-    return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
-}
-
-
 #endif /* HUF_FORCE_DECOMPRESS_X2 */


@@ -985,7 +1069,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32

 static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
                            const sortedSymbol_t* sortedList,
-                           const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight,
+                           const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight,
                            const U32 nbBitsBaseline)
 {
     U32* const rankVal = rankValOrigin[0];
@@ -1040,14 +1124,7 @@ typedef struct {

 size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
                        const void* src, size_t srcSize,
-                       void* workSpace, size_t wkspSize)
-{
-    return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
-                       const void* src, size_t srcSize,
-                       void* workSpace, size_t wkspSize, int bmi2)
+                       void* workSpace, size_t wkspSize, int flags)
 {
     U32 tableLog, maxW, nbSymbols;
     DTableDesc dtd = HUF_getDTableDesc(DTable);
@@ -1069,7 +1146,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
     if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
     /* ZSTD_memset(weightList, 0, sizeof(weightList)); */  /* is not necessary, even though some analyzer complain ... */

-    iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2);
+    iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags);
     if (HUF_isError(iSize)) return iSize;

     /* check result */
@@ -1240,6 +1317,11 @@ HUF_decompress1X2_usingDTable_internal_body(
     /* decoded size */
     return dstSize;
 }
+
+/* HUF_decompress4X2_usingDTable_internal_body():
+ * Conditions:
+ * @dstSize >= 6
+ */
 FORCE_INLINE_TEMPLATE size_t
 HUF_decompress4X2_usingDTable_internal_body(
     void* dst, size_t dstSize,
@@ -1280,8 +1362,9 @@ HUF_decompress4X2_usingDTable_internal_body(
     DTableDesc const dtd = HUF_getDTableDesc(DTable);
     U32 const dtLog = dtd.tableLog;

-    if (length4 > cSrcSize) return ERROR(corruption_detected);  /* overflow */
-    if (opStart4 > oend)    return ERROR(corruption_detected);  /* overflow */
+    if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+    if (opStart4 > oend)    return ERROR(corruption_detected);   /* overflow */
+    if (dstSize < 6)        return ERROR(corruption_detected);   /* stream 4-split doesn't work */
     CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
     CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
     CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -1366,36 +1449,177 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
 }
 #endif

-#if HUF_NEED_DEFAULT_FUNCTION
 static
 size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
                                                       size_t cSrcSize, HUF_DTable const* DTable) {
     return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
 }
-#endif

 #if ZSTD_ENABLE_ASM_X86_64_BMI2

-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
+HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;

-static HUF_ASM_X86_64_BMI2_ATTRS size_t
-HUF_decompress4X2_usingDTable_internal_bmi2_asm(
+#endif
+
+static HUF_FAST_BMI2_ATTRS
+void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
+{
+    U64 bits[4];
+    BYTE const* ip[4];
+    BYTE* op[4];
+    BYTE* oend[4];
+    HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
+    BYTE const* const ilimit = args->ilimit;
+
+    /* Copy the arguments to local registers. */
+    ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
+    ZSTD_memcpy(&op, &args->op, sizeof(op));
+
+    oend[0] = op[1];
+    oend[1] = op[2];
+    oend[2] = op[3];
+    oend[3] = args->oend;
+
+    assert(MEM_isLittleEndian());
+    assert(!MEM_32bits());
+
+    for (;;) {
+        BYTE* olimit;
+        int stream;
+        int symbol;
+
+        /* Assert loop preconditions */
+#ifndef NDEBUG
+        for (stream = 0; stream < 4; ++stream) {
+            assert(op[stream] <= oend[stream]);
+            assert(ip[stream] >= ilimit);
+        }
+#endif
+        /* Compute olimit */
+        {
+            /* Each loop does 5 table lookups for each of the 4 streams.
+             * Each table lookup consumes up to 11 bits of input, and produces
+             * up to 2 bytes of output.
+             */
+            /* We can consume up to 7 bytes of input per iteration per stream.
+             * We also know that each input pointer is >= ip[0]. So we can run
+             * iters loops before running out of input.
+             */
+            size_t iters = (size_t)(ip[0] - ilimit) / 7;
+            /* Each iteration can produce up to 10 bytes of output per stream.
+             * Each output stream may advance at different rates. So take the
+             * minimum number of safe iterations among all the output streams.
+             */
+            for (stream = 0; stream < 4; ++stream) {
+                size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
+                iters = MIN(iters, oiters);
+            }
+
+            /* Each iteration produces at least 5 output symbols. So until
+             * op[3] crosses olimit, we know we haven't executed iters
+             * iterations yet. This saves us maintaining an iters counter,
+             * at the expense of computing the remaining # of iterations
+             * more frequently.
+             */
+            olimit = op[3] + (iters * 5);
+
+            /* Exit the fast decoding loop if we are too close to the end. */
+            if (op[3] + 10 > olimit)
+                break;
+
+            /* Exit the decoding loop if any input pointer has crossed the
+             * previous one. This indicates corruption, and a precondition
+             * to our loop is that ip[i] >= ip[0].
+             */
+            for (stream = 1; stream < 4; ++stream) {
+                if (ip[stream] < ip[stream - 1])
+                    goto _out;
+            }
+        }
+
+#ifndef NDEBUG
+        for (stream = 1; stream < 4; ++stream) {
+            assert(ip[stream] >= ip[stream - 1]);
+        }
+#endif
+
+        do {
+            /* Do 5 table lookups for each of the first 3 streams */
+            for (symbol = 0; symbol < 5; ++symbol) {
+                for (stream = 0; stream < 3; ++stream) {
+                    int const index = (int)(bits[stream] >> 53);
+                    HUF_DEltX2 const entry = dtable[index];
+                    MEM_write16(op[stream], entry.sequence);
+                    bits[stream] <<= (entry.nbBits);
+                    op[stream] += (entry.length);
+                }
+            }
+            /* Do 1 table lookup from the final stream */
+            {
+                int const index = (int)(bits[3] >> 53);
+                HUF_DEltX2 const entry = dtable[index];
+                MEM_write16(op[3], entry.sequence);
+                bits[3] <<= (entry.nbBits);
+                op[3] += (entry.length);
+            }
+            /* Do 4 table lookups from the final stream & reload bitstreams */
+            for (stream = 0; stream < 4; ++stream) {
+                /* Do a table lookup from the final stream.
+                 * This is interleaved with the reloading to reduce register
+                 * pressure. This shouldn't be necessary, but compilers can
+                 * struggle with codegen with high register pressure.
+                 */
+                {
+                    int const index = (int)(bits[3] >> 53);
+                    HUF_DEltX2 const entry = dtable[index];
+                    MEM_write16(op[3], entry.sequence);
+                    bits[3] <<= (entry.nbBits);
+                    op[3] += (entry.length);
+                }
+                /* Reload the bitstreams. The final bitstream must be reloaded
+                 * after the 5th symbol was decoded.
+                 */
+                {
+                    int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
+                    int const nbBits = ctz & 7;
+                    int const nbBytes = ctz >> 3;
+                    ip[stream] -= nbBytes;
+                    bits[stream] = MEM_read64(ip[stream]) | 1;
+                    bits[stream] <<= nbBits;
+                }
+            }
+        } while (op[3] < olimit);
+    }
+
+_out:
+
+    /* Save the final values of each of the state variables back to args. */
+    ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
+    ZSTD_memcpy(&args->op, &op, sizeof(op));
+}
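/*
 * Worked bound for the loop above (restating the in-code comments as
 * arithmetic; the helper is hypothetical, not upstream code): per stream and
 * per iteration the X2 loop performs 5 lookups of at most 11 bits each, i.e.
 * at most 55 bits, covered by a 7-byte (56-bit) input budget, and each lookup
 * writes 1 or 2 bytes, i.e. at most 10 output bytes. Since every iteration
 * advances op[3] by at least 5 single-byte symbols, op[3] + iters*5 is a
 * conservative olimit.
 */
static size_t HUF_x2SafeItersModel(size_t inBytes, size_t outBytes)
{
    size_t const inPerIter  = 7;   /* ceil(5 lookups * 11 bits / 8) */
    size_t const outPerIter = 10;  /* 5 lookups * 2 bytes, worst case */
    size_t const a = inBytes / inPerIter;
    size_t const b = outBytes / outPerIter;
    return a < b ? a : b;          /* MIN(), spelled out */
}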
+
+
+static HUF_FAST_BMI2_ATTRS size_t
+HUF_decompress4X2_usingDTable_internal_fast(
     void* dst, size_t dstSize,
     const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable) {
+    const HUF_DTable* DTable,
+    HUF_DecompressFastLoopFn loopFn) {
     void const* dt = DTable + 1;
     const BYTE* const iend = (const BYTE*)cSrc + 6;
     BYTE* const oend = (BYTE*)dst + dstSize;
-    HUF_DecompressAsmArgs args;
+    HUF_DecompressFastArgs args;
     {
-        size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
+        size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
         FORWARD_IF_ERROR(ret, "Failed to init asm args");
-        if (ret != 0)
-            return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+        if (ret == 0)
+            return 0;
     }

     assert(args.ip[0] >= args.ilimit);
-    HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args);
+    loopFn(&args);

     /* note : op4 already verified within main loop */
     assert(args.ip[0] >= iend);
@@ -1426,91 +1650,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm(
     /* decoded size */
     return dstSize;
 }
-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */

 static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
-                    size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
+                    size_t cSrcSize, HUF_DTable const* DTable, int flags)
 {
+    HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default;
+    HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop;
+
 #if DYNAMIC_BMI2
-    if (bmi2) {
+    if (flags & HUF_flags_bmi2) {
+        fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2;
 # if ZSTD_ENABLE_ASM_X86_64_BMI2
-        return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-# else
-        return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+        if (!(flags & HUF_flags_disableAsm)) {
+            loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
+        }
 # endif
+    } else {
+        return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
     }
-#else
-    (void)bmi2;
 #endif

 #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
-    return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-#else
-    return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
+    if (!(flags & HUF_flags_disableAsm)) {
+        loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
+    }
 #endif
+
+    if (!(flags & HUF_flags_disableFast)) {
+        size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
+        if (ret != 0)
+            return ret;
+    }
+    return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
 }

 HUF_DGEN(HUF_decompress1X2_usingDTable_internal)

-size_t HUF_decompress1X2_usingDTable(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    DTableDesc dtd = HUF_getDTableDesc(DTable);
-    if (dtd.tableType != 1) return ERROR(GENERIC);
-    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-}
-
 size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
+                                   void* workSpace, size_t wkspSize, int flags)
 {
     const BYTE* ip = (const BYTE*) cSrc;

     size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
-                                               workSpace, wkspSize);
+                                               workSpace, wkspSize, flags);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;

-    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
-}
-
-
-size_t HUF_decompress4X2_usingDTable(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    DTableDesc dtd = HUF_getDTableDesc(DTable);
-    if (dtd.tableType != 1) return ERROR(GENERIC);
-    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags);
 }

-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize, int bmi2)
+                                   void* workSpace, size_t wkspSize, int flags)
 {
     const BYTE* ip = (const BYTE*) cSrc;

     size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
-                                         workSpace, wkspSize);
+                                         workSpace, wkspSize, flags);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;

-    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
 }

-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
-                                   const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
-{
-    return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
-
 #endif /* HUF_FORCE_DECOMPRESS_X1 */


@@ -1518,44 +1723,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
 /* Universal decompression selectors */
 /* ***********************************/

-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
-                                    const void* cSrc, size_t cSrcSize,
-                                    const HUF_DTable* DTable)
-{
-    DTableDesc const dtd = HUF_getDTableDesc(DTable);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
-    (void)dtd;
-    assert(dtd.tableType == 0);
-    return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
-    (void)dtd;
-    assert(dtd.tableType == 1);
-    return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#else
-    return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
-                           HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#endif
-}
-
-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
-                                    const void* cSrc, size_t cSrcSize,
-                                    const HUF_DTable* DTable)
-{
-    DTableDesc const dtd = HUF_getDTableDesc(DTable);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
-    (void)dtd;
-    assert(dtd.tableType == 0);
-    return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
-    (void)dtd;
-    assert(dtd.tableType == 1);
-    return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#else
-    return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
-                           HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#endif
-}
-

 #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
 typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
@@ -1610,36 +1777,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
 #endif
 }

-
-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
-                                     size_t dstSize, const void* cSrc,
-                                     size_t cSrcSize, void* workSpace,
-                                     size_t wkspSize)
-{
-    /* validation checks */
-    if (dstSize == 0) return ERROR(dstSize_tooSmall);
-    if (cSrcSize == 0) return ERROR(corruption_detected);
-
-    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
-        (void)algoNb;
-        assert(algoNb == 0);
-        return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
-        (void)algoNb;
-        assert(algoNb == 1);
-        return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
-#else
-        return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                            cSrcSize, workSpace, wkspSize):
-                        HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
-#endif
-    }
-}
-
 size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
                                   const void* cSrc, size_t cSrcSize,
-                                  void* workSpace, size_t wkspSize)
+                                  void* workSpace, size_t wkspSize, int flags)
 {
     /* validation checks */
     if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1652,71 +1792,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
         (void)algoNb;
         assert(algoNb == 0);
         return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                                cSrcSize, workSpace, wkspSize);
+                                cSrcSize, workSpace, wkspSize, flags);
 #elif defined(HUF_FORCE_DECOMPRESS_X2)
         (void)algoNb;
         assert(algoNb == 1);
         return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                                cSrcSize, workSpace, wkspSize);
+                                cSrcSize, workSpace, wkspSize, flags);
 #else
         return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                                cSrcSize, workSpace, wkspSize):
+                                cSrcSize, workSpace, wkspSize, flags):
                         HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                                cSrcSize, workSpace, wkspSize);
+                                cSrcSize, workSpace, wkspSize, flags);
 #endif
     }
 }


-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
 {
     DTableDesc const dtd = HUF_getDTableDesc(DTable);
 #if defined(HUF_FORCE_DECOMPRESS_X1)
     (void)dtd;
     assert(dtd.tableType == 0);
-    return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #elif defined(HUF_FORCE_DECOMPRESS_X2)
     (void)dtd;
     assert(dtd.tableType == 1);
-    return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #else
-    return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
-                           HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
+                           HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #endif
 }

 #ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
 {
     const BYTE* ip = (const BYTE*) cSrc;

-    size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+    size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;

-    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
 }
 #endif

-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
 {
     DTableDesc const dtd = HUF_getDTableDesc(DTable);
 #if defined(HUF_FORCE_DECOMPRESS_X1)
     (void)dtd;
     assert(dtd.tableType == 0);
-    return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #elif defined(HUF_FORCE_DECOMPRESS_X2)
     (void)dtd;
     assert(dtd.tableType == 1);
-    return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #else
-    return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
-                           HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
+                           HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #endif
 }

-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
 {
     /* validation checks */
     if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1726,15 +1866,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds
 #if defined(HUF_FORCE_DECOMPRESS_X1)
         (void)algoNb;
         assert(algoNb == 0);
-        return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+        return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
 #elif defined(HUF_FORCE_DECOMPRESS_X2)
         (void)algoNb;
         assert(algoNb == 1);
-        return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+        return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
 #else
-        return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
-                        HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+        return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) :
+                        HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
 #endif
     }
 }
-
diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c
index dbbc7919de53..30ef65e1ab5c 100644
--- a/lib/zstd/decompress/zstd_ddict.c
+++ b/lib/zstd/decompress/zstd_ddict.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,12 +15,12 @@
 /*-*******************************************************
 *  Dependencies
 *********************************************************/
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customFree */
 #include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
 #include "../common/cpu.h"         /* bmi2 */
 #include "../common/mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
 #include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include "zstd_decompress_internal.h"
 #include "zstd_ddict.h"
@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
         ZSTD_memcpy(internalBuffer, dict, dictSize);
     }
     ddict->dictSize = dictSize;
-    ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001);  /* cover both little and big endian */
+    ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001);  /* cover both little and big endian */

     /* parse dictionary content */
     FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , "");
@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict)
 unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict)
 {
     if (ddict==NULL) return 0;
-    return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize);
+    return ddict->dictID;
 }
diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h
index 8c1a79d666f8..de459a0dacd1 100644
--- a/lib/zstd/decompress/zstd_ddict.h
+++ b/lib/zstd/decompress/zstd_ddict.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c
index 6b3177c94711..5e2a3ef03732 100644
--- a/lib/zstd/decompress/zstd_decompress.c
+++ b/lib/zstd/decompress/zstd_decompress.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -52,17 +53,18 @@
 /*-*******************************************************
 *  Dependencies
 *********************************************************/
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
 #include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
 #include "../common/mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
 #include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include <linux/xxhash.h> /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */
 #include "../common/zstd_internal.h"  /* blockProperties_t */
 #include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
 #include "zstd_ddict.h"  /* ZSTD_DDictDictContent */
 #include "zstd_decompress_block.h"   /* ZSTD_decompressBlock_internal */
+#include "../common/bits.h"  /* ZSTD_highbit32 */



@@ -72,11 +74,11 @@
 *************************************/

 #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4
-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3   /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float.
-                                                     * Currently, that means a 0.75 load factor.
-                                                     * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded
-                                                     * the load factor of the ddict hash set.
-                                                     */
+#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3  /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float.
+                                                    * Currently, that means a 0.75 load factor.
+                                                    * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded
+                                                    * the load factor of the ddict hash set.
+                                                    */

 #define DDICT_HASHSET_TABLE_BASE_SIZE 64
 #define DDICT_HASHSET_RESIZE_FACTOR 2
@@ -237,6 +239,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx)
     dctx->outBufferMode = ZSTD_bm_buffered;
     dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum;
     dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict;
+    dctx->disableHufAsm = 0;
 }

 static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
@@ -421,16 +424,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize)
 * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless
 * @return : 0, `zfhPtr` is correctly filled,
 *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
- *          or an error code, which can be tested using ZSTD_isError() */
+**          or an error code, which can be tested using ZSTD_isError() */
 size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format)
 {
     const BYTE* ip = (const BYTE*)src;
     size_t const minInputSize = ZSTD_startingInputLength(format);

-    ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr));   /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */
-    if (srcSize < minInputSize) return minInputSize;
-    RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter");
+    DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize);
+
+    if (srcSize > 0) {
+        /* note : technically could be considered an assert(), since it's an invalid entry */
+        RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0");
+    }
+    if (srcSize < minInputSize) {
+        if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) {
+            /* when receiving less than @minInputSize bytes,
+             * check that these bytes at least correspond to a supported magic number
+             * in order to error out early if they don't.
+             **/
+            size_t const toCopy = MIN(4, srcSize);
+            unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER);
+            assert(src != NULL);
+            ZSTD_memcpy(hbuf, src, toCopy);
+            if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) {
+                /* not a zstd frame : let's check if it's a skippable frame */
+                MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START);
+                ZSTD_memcpy(hbuf, src, toCopy);
+                if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) {
+                    RETURN_ERROR(prefix_unknown,
+                                "first bytes don't correspond to any supported magic number");
+        }   }   }
+        return minInputSize;
+    }
+
+    ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr));   /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */
     if ( (format != ZSTD_f_zstd1_magicless)
       && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) {
         if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
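/*
 * Concrete illustration of the short-input check above (values per the zstd
 * format; no new code): ZSTD_MAGICNUMBER is 0xFD2FB528 and is stored
 * little-endian, so a frame begins with the bytes 28 B5 2F FD. With
 * srcSize == 2 and input { 0x28, 0xB5 }, those two bytes are copied over a
 * buffer pre-filled with the expected magic, the comparison still matches,
 * and the function asks for more input (returns minInputSize) rather than
 * failing with prefix_unknown.
 */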
@@ -540,49 +567,52 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize)
     sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE);
     RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32,
                     frameParameter_unsupported, "");
-    {
-        size_t const skippableSize = skippableHeaderSize + sizeU32;
+    {   size_t const skippableSize = skippableHeaderSize + sizeU32;
         RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, "");
         return skippableSize;
     }
 }

 /*! ZSTD_readSkippableFrame() :
- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer.
+ * Retrieves content of a skippable frame, and writes it to dst buffer.
 *
 * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written,
 * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START.  This can be NULL if the caller is not interested
 * in the magicVariant.
 *
- * Returns an error if destination buffer is not large enough, or if the frame is not skippable.
+ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame.
 *
 * @return : number of bytes written or a ZSTD error.
 */
-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant,
-                                           const void* src, size_t srcSize)
+size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity,
+                               unsigned* magicVariant,  /* optional, can be NULL */
+                               const void* src, size_t srcSize)
 {
-    U32 const magicNumber = MEM_readLE32(src);
-    size_t skippableFrameSize = readSkippableFrameSize(src, srcSize);
-    size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE;
-
-    /* check input validity */
-    RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, "");
-    RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, "");
-    RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, "");
+    RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, "");

-    /* deliver payload */
-    if (skippableContentSize > 0  && dst != NULL)
-        ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize);
-    if (magicVariant != NULL)
-        *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START;
-    return skippableContentSize;
+    {   U32 const magicNumber = MEM_readLE32(src);
+        size_t skippableFrameSize = readSkippableFrameSize(src, srcSize);
+        size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE;
+
+        /* check input validity */
+        RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, "");
+        RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, "");
+        RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, "");
+
+        /* deliver payload */
+        if (skippableContentSize > 0  && dst != NULL)
+            ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize);
+        if (magicVariant != NULL)
+            *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START;
+        return skippableContentSize;
+    }
 }
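/*
 * Frame layout assumed by ZSTD_readSkippableFrame() above (per RFC 8878,
 * sketched here for orientation): a 4-byte little-endian magic in the range
 * 0x184D2A50..0x184D2A5F (the low nibble is the magicVariant), a 4-byte
 * little-endian content size, then the raw user payload:
 *
 *   +---------------+---------------+==================+
 *   | magic (4, LE) | size (4, LE)  | payload (size B) |
 *   +---------------+---------------+==================+
 *
 * ZSTD_SKIPPABLEHEADERSIZE is therefore 8, and on success the function
 * returns skippableFrameSize - 8 payload bytes.
 */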

 /* ZSTD_findDecompressedSize() :
- *  compatible with legacy mode
 *  `srcSize` must be the exact length of some number of ZSTD compressed and/or
 *      skippable frames
- *  @return : decompressed size of the frames contained */
+ *  note: compatible with legacy mode
+ *  @return : decompressed size of the frames contained */
 unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
 {
     unsigned long long totalDstSize = 0;
@@ -592,9 +622,7 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)

         if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
             size_t const skippableSize = readSkippableFrameSize(src, srcSize);
-            if (ZSTD_isError(skippableSize)) {
-                return ZSTD_CONTENTSIZE_ERROR;
-            }
+            if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR;
             assert(skippableSize <= srcSize);

             src = (const BYTE *)src + skippableSize;
@@ -602,17 +630,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
             continue;
         }

-        {   unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize);
-            if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret;
+        {   unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize);
+            if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs;

-            /* check for overflow */
-            if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR;
-            totalDstSize += ret;
+            if (totalDstSize + fcs < totalDstSize)
+                return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */
+            totalDstSize += fcs;
         }
+        /* skip to next frame */
         {   size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize);
-            if (ZSTD_isError(frameSrcSize)) {
-                return ZSTD_CONTENTSIZE_ERROR;
-            }
+            if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR;
+            assert(frameSrcSize <= srcSize);

             src = (const BYTE *)src + frameSrcSize;
             srcSize -= frameSrcSize;
@@ -730,10 +758,11 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize
             ip += 4;
         }

+        frameSizeInfo.nbBlocks = nbBlocks;
         frameSizeInfo.compressedSize = (size_t)(ip - ipstart);
         frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN)
                                         ? zfh.frameContentSize
-                                        : nbBlocks * zfh.blockSizeMax;
+                                        : (unsigned long long)nbBlocks * zfh.blockSizeMax;
         return frameSizeInfo;
     }
 }
@@ -773,6 +802,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize)
     return bound;
 }

+size_t ZSTD_decompressionMargin(void const* src, size_t srcSize)
+{
+    size_t margin = 0;
+    unsigned maxBlockSize = 0;
+
+    /* Iterate over each frame */
+    while (srcSize > 0) {
+        ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
+        size_t const compressedSize = frameSizeInfo.compressedSize;
+        unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
+        ZSTD_frameHeader zfh;
+
+        FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), "");
+        if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR)
+            return ERROR(corruption_detected);
+
+        if (zfh.frameType == ZSTD_frame) {
+            /* Add the frame header to our margin */
+            margin += zfh.headerSize;
+            /* Add the checksum to our margin */
+            margin += zfh.checksumFlag ? 4 : 0;
+            /* Add 3 bytes per block */
+            margin += 3 * frameSizeInfo.nbBlocks;
+
+            /* Compute the max block size */
+            maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax);
+        } else {
+            assert(zfh.frameType == ZSTD_skippableFrame);
+            /* Add the entire skippable frame size to our margin. */
+            margin += compressedSize;
+        }
+
+        assert(srcSize >= compressedSize);
+        src = (const BYTE*)src + compressedSize;
+        srcSize -= compressedSize;
+    }
+
+    /* Add the max block size back to the margin. */
+    margin += maxBlockSize;
+
+    return margin;
+}
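/*
 * Usage sketch for ZSTD_decompressionMargin() (the helper below and its
 * in-place layout are assumptions of this example, not upstream code): the
 * margin bounds how far the write frontier may trail the read frontier, so
 * compressed data placed at the tail of a buffer of at least
 * dstSize + margin bytes can be decompressed into the front of that same
 * buffer.
 */
static size_t ZSTD_decompressInPlaceModel(ZSTD_DCtx* dctx,
                                          void* buf, size_t bufCapacity,
                                          size_t srcSize, size_t dstSize)
{
    /* the caller placed srcSize compressed bytes at the tail of buf */
    const void* const src = (const char*)buf + bufCapacity - srcSize;
    size_t const margin = ZSTD_decompressionMargin(src, srcSize);
    if (ZSTD_isError(margin)) return margin;
    if (bufCapacity < dstSize + margin) return ERROR(dstSize_tooSmall);
    return ZSTD_decompressDCtx(dctx, buf, dstSize, src, srcSize);
}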

 /*-*************************************************************
 *   Frame decoding
@@ -930,6 +1001,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
     }
     ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0);
     /* Allow caller to get size read */
+    DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr);
     *srcPtr = ip;
     *srcSizePtr = remainingSrcSize;
     return (size_t)(op-ostart);
@@ -955,17 +1027,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
     while (srcSize >= ZSTD_startingInputLength(dctx->format)) {


-        {   U32 const magicNumber = MEM_readLE32(src);
-            DEBUGLOG(4, "reading magic number %08X (expecting %08X)",
-                        (unsigned)magicNumber, ZSTD_MAGICNUMBER);
+        if (srcSize >= 4) {
+            U32 const magicNumber = MEM_readLE32(src);
+            DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber);
             if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+                /* skippable frame detected : skip it */
                 size_t const skippableSize = readSkippableFrameSize(src, srcSize);
-                FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed");
+                FORWARD_IF_ERROR(skippableSize, "invalid skippable frame");
                 assert(skippableSize <= srcSize);

                 src = (const BYTE *)src + skippableSize;
                 srcSize -= skippableSize;
-                continue;
+                continue; /* check next frame */
         }   }

         if (ddict) {
@@ -1061,8 +1134,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr
 size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; }

 /*
- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed,
- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can
+ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we
+ * allow taking a partial block as the input. Currently only raw uncompressed blocks can
 * be streamed.
 *
 * For blocks that can be streamed, this allows us to reduce the latency until we produce
@@ -1262,7 +1335,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c

     default:
         assert(0);   /* impossible */
-        RETURN_ERROR(GENERIC, "impossible to reach");   /* some compiler require default to do something */
+        RETURN_ERROR(GENERIC, "impossible to reach");   /* some compilers require default to do something */
     }
 }

@@ -1303,11 +1376,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
         /* in minimal huffman, we always use X1 variants */
         size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable,
                                                 dictPtr, dictEnd - dictPtr,
-                                                workspace, workspaceSize);
+                                                workspace, workspaceSize, /* flags */ 0);
 #else
         size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable,
                                                 dictPtr, (size_t)(dictEnd - dictPtr),
-                                                workspace, workspaceSize);
+                                                workspace, workspaceSize, /* flags */ 0);
 #endif
         RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, "");
         dictPtr += hSize;
@@ -1403,7 +1476,7 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
     dctx->prefixStart = NULL;
     dctx->virtualStart = NULL;
     dctx->dictEnd = NULL;
-    dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001);  /* cover both little and big endian */
+    dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001);  /* cover both little and big endian */
     dctx->litEntropy = dctx->fseEntropy = 0;
     dctx->dictID = 0;
     dctx->bType = bt_reserved;
@@ -1465,7 +1538,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
 * This could for one of the following reasons :
 * - The frame does not require a dictionary (most common case).
 * - The frame was built with dictID intentionally removed.
- *   Needed dictionary is a hidden information.
+ *   Needed dictionary is a hidden piece of information.
 *   Note : this use case also happens when using a non-conformant dictionary.
 * - `srcSize` is too small, and as a result, frame header could not be decoded.
 *   Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`.
@@ -1474,7 +1547,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
 * ZSTD_getFrameHeader(), which will provide a more precise error code. */
 unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize)
 {
-    ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 };
+    ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 };
     size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize);
     if (ZSTD_isError(hError)) return 0;
     return zfp.dictID;
@@ -1581,7 +1654,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di
 size_t ZSTD_initDStream(ZSTD_DStream* zds)
 {
     DEBUGLOG(4, "ZSTD_initDStream");
-    return ZSTD_initDStream_usingDDict(zds, NULL);
+    FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), "");
+    FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), "");
+    return ZSTD_startingInputLength(zds->format);
 }

 /* ZSTD_initDStream_usingDDict() :
@@ -1589,20 +1664,12 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds)
 * this function cannot fail */
 size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
 {
+    DEBUGLOG(4, "ZSTD_initDStream_usingDDict");
     FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , "");
     FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , "");
     return ZSTD_startingInputLength(dctx->format);
 }

-/* ZSTD_resetDStream() :
- * return : expected size, aka ZSTD_startingInputLength().
- * this function cannot fail */
-size_t ZSTD_resetDStream(ZSTD_DStream* dctx)
-{
-    FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), "");
-    return ZSTD_startingInputLength(dctx->format);
-}
-

 size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
 {
@@ -1670,6 +1737,11 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam)
         bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict;
         bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts;
         return bounds;
+    case ZSTD_d_disableHuffmanAssembly:
+        bounds.lowerBound = 0;
+        bounds.upperBound = 1;
+        return bounds;
+
     default:;
     }
     bounds.error = ERROR(parameter_unsupported);
@@ -1710,6 +1782,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value
     case ZSTD_d_refMultipleDDicts:
         *value = (int)dctx->refMultipleDDicts;
         return 0;
+    case ZSTD_d_disableHuffmanAssembly:
+        *value = (int)dctx->disableHufAsm;
+        return 0;
     default:;
     }
     RETURN_ERROR(parameter_unsupported, "");
@@ -1743,6 +1818,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value
         }
         dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value;
         return 0;
+    case ZSTD_d_disableHuffmanAssembly:
+        CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value);
+        dctx->disableHufAsm = value != 0;
+        return 0;
     default:;
     }
     RETURN_ERROR(parameter_unsupported, "");
@@ -1918,7 +1997,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 if (zds->refMultipleDDicts && zds->ddictSet) {
                     ZSTD_DCtx_selectFrameDDict(zds);
                 }
-                DEBUGLOG(5, "header size : %u", (U32)hSize);
                 if (ZSTD_isError(hSize)) {
                     return hSize;   /* error */
                 }
@@ -1932,6 +2010,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                         zds->lhSize += remainingInput;
                     }
                     input->pos = input->size;
+                    /* check first few bytes */
+                    FORWARD_IF_ERROR(
+                        ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format),
+                        "First few bytes detected incorrect" );
+                    /* return hint input size */
                     return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize;   /* remaining header bytes + next block header */
                 }
                 assert(ip != NULL);
@@ -1949,8 +2032,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds));
                 if (ZSTD_isError(decompressedSize)) return decompressedSize;
                 DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()")
+                assert(istart != NULL);
                 ip = istart + cSize;
-                op += decompressedSize;
+                op = op ? op + decompressedSize : op;  /* can occur if frameContentSize = 0 (empty frame) */
                 zds->expected = 0;
                 zds->streamStage = zdss_init;
                 someMoreWork = 0;
@@ -2034,6 +2118,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
             }
             if ((size_t)(iend-ip) >= neededInSize) {  /* decode directly from src */
                 FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), "");
+                assert(ip != NULL);
                 ip += neededInSize;
                 /* Function modifies the stage so we must break */
                 break;
@@ -2048,7 +2133,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 int const isSkipFrame = ZSTD_isSkipFrame(zds);
                 size_t loadedSize;
                 /* At this point we shouldn't be decompressing a block that we can stream. */
-                assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip));
+                assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip)));
                 if (isSkipFrame) {
                     loadedSize = MIN(toLoad, (size_t)(iend-ip));
                 } else {
@@ -2057,8 +2142,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                                     "should never happen");
                     loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip));
                 }
-                ip += loadedSize;
-                zds->inPos += loadedSize;
+                if (loadedSize != 0) {
+                    /* ip may be NULL */
+                    ip += loadedSize;
+                    zds->inPos += loadedSize;
+                }
                 if (loadedSize < toLoad) { someMoreWork = 0; break; }   /* not enough input, wait for more */

                 /* decode loaded input */
@@ -2068,14 +2156,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 break;
             }
         case zdss_flush:
-            {   size_t const toFlushSize = zds->outEnd - zds->outStart;
+            {
+                size_t const toFlushSize = zds->outEnd - zds->outStart;
                 size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize);
-                op += flushedSize;
+
+                op = op ? op + flushedSize : op;
+
                 zds->outStart += flushedSize;
                 if (flushedSize == toFlushSize) {  /* flush completed */
                     zds->streamStage = zdss_read;
                     if ( (zds->outBuffSize < zds->fParams.frameContentSize)
-                      && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) {
+                        && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) {
                         DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)",
                                 (int)(zds->outBuffSize - zds->outStart),
                                 (U32)zds->fParams.blockSizeMax);
@@ -2089,7 +2180,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB

         default:
             assert(0);   /* impossible */
-            RETURN_ERROR(GENERIC, "impossible to reach");   /* some compiler require default to do something */
+            RETURN_ERROR(GENERIC, "impossible to reach");   /* some compilers require default to do something */
    }   }

    /* result */
@@ -2102,8 +2193,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
     if ((ip==istart) && (op==ostart)) {  /* no forward progress */
         zds->noForwardProgress ++;
         if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) {
-            RETURN_ERROR_IF(op==oend, dstSize_tooSmall, "");
-            RETURN_ERROR_IF(ip==iend, srcSize_wrong, "");
+            RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, "");
+            RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, "");
             assert(0);
         }
     } else {
@@ -2140,11 +2231,17 @@ size_t ZSTD_decompressStream_simpleArgs (
                             void* dst, size_t dstCapacity, size_t* dstPos,
                       const void* src, size_t srcSize, size_t* srcPos)
 {
-    ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
-    ZSTD_inBuffer  input  = { src, srcSize, *srcPos };
-    /* ZSTD_compress_generic() will check validity of dstPos and srcPos */
-    size_t const cErr = ZSTD_decompressStream(dctx, &output, &input);
-    *dstPos = output.pos;
-    *srcPos = input.pos;
-    return cErr;
+    ZSTD_outBuffer output;
+    ZSTD_inBuffer  input;
+    output.dst = dst;
+    output.size = dstCapacity;
+    output.pos = *dstPos;
+    input.src = src;
+    input.size = srcSize;
+    input.pos = *srcPos;
+    {   size_t const cErr = ZSTD_decompressStream(dctx, &output, &input);
+        *dstPos = output.pos;
+        *srcPos = input.pos;
+        return cErr;
+    }
 }
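/*
 * Usage sketch for ZSTD_decompressStream_simpleArgs() (the driver loop below
 * is hypothetical): the positional variant mirrors ZSTD_decompressStream()
 * for bindings that cannot pass structs; dstPos and srcPos are updated in
 * place, and a nonzero non-error return means more input or output room is
 * still expected.
 */
static size_t ZSTD_drainModel(ZSTD_DCtx* dctx,
                              void* dst, size_t dstCapacity,
                              const void* src, size_t srcSize)
{
    size_t dstPos = 0, srcPos = 0;
    size_t ret = 1;
    while (ret != 0 && srcPos < srcSize) {
        ret = ZSTD_decompressStream_simpleArgs(dctx, dst, dstCapacity, &dstPos,
                                               src, srcSize, &srcPos);
        if (ZSTD_isError(ret))
            return ret;  /* includes the new noForwardProgress_* codes */
    }
    return dstPos;  /* total bytes produced */
}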
|
|
diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c
index c1913b8e7c89..9f5577e5bc19 100644
--- a/lib/zstd/decompress/zstd_decompress_block.c
+++ b/lib/zstd/decompress/zstd_decompress_block.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -20,12 +21,12 @@
#include "../common/mem.h" /* low level memory routines */
#define FSE_STATIC_LINKING_ONLY
#include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
#include "../common/huf.h"
#include "../common/zstd_internal.h"
#include "zstd_decompress_internal.h" /* ZSTD_DCtx */
#include "zstd_ddict.h" /* ZSTD_DDictDictContent */
#include "zstd_decompress_block.h"
+#include "../common/bits.h" /* ZSTD_highbit32 */

/*_*******************************************************
* Macros
@@ -89,7 +90,7 @@ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const
dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
}
else {
- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */
+ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */
dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
}
@@ -134,13 +135,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
ZSTD_FALLTHROUGH;

case set_compressed:
- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
+ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3");
{ size_t lhSize, litSize, litCSize;
U32 singleStream=0;
U32 const lhlCode = (istart[0] >> 2) & 3;
U32 const lhc = MEM_readLE32(istart);
size_t hufSuccess;
size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
+ int const flags = 0
+ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0)
+ | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);
switch(lhlCode)
{
case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -165,6 +169,10 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
}
RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+ if (!singleStream)
+ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
+ "Not enough literals (%zu) for the 4-streams mode (min %u)",
+ litSize, MIN_LITERALS_FOR_4_STREAMS);
RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);
@@ -176,13 +184,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,

if (litEncType==set_repeat) {
if (singleStream) {
- hufSuccess = HUF_decompress1X_usingDTable_bmi2(
+ hufSuccess = HUF_decompress1X_usingDTable(
dctx->litBuffer, litSize, istart+lhSize, litCSize,
- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
+ dctx->HUFptr, flags);
} else {
- hufSuccess = HUF_decompress4X_usingDTable_bmi2(
+ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS);
+ hufSuccess = HUF_decompress4X_usingDTable(
dctx->litBuffer, litSize, istart+lhSize, litCSize,
- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
+ dctx->HUFptr, flags);
}
} else {
if (singleStream) {
@@ -190,18 +199,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
hufSuccess = HUF_decompress1X_DCtx_wksp(
dctx->entropy.hufTable, dctx->litBuffer, litSize,
istart+lhSize, litCSize, dctx->workspace,
- sizeof(dctx->workspace));
+ sizeof(dctx->workspace), flags);
#else
- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
+ hufSuccess = HUF_decompress1X1_DCtx_wksp(
dctx->entropy.hufTable, dctx->litBuffer, litSize,
istart+lhSize, litCSize, dctx->workspace,
- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
+ sizeof(dctx->workspace), flags);
#endif
} else {
- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
+ hufSuccess = HUF_decompress4X_hufOnly_wksp(
dctx->entropy.hufTable, dctx->litBuffer, litSize,
istart+lhSize, litCSize, dctx->workspace,
- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
+ sizeof(dctx->workspace), flags);
}
}
if (dctx->litBufferLocation == ZSTD_split)
@@ -237,6 +246,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
break;
case 3:
lhSize = 3;
+ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3");
litSize = MEM_readLE24(istart) >> 4;
break;
}
@@ -279,12 +289,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
break;
case 1:
lhSize = 2;
+ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3");
litSize = MEM_readLE16(istart) >> 4;
break;
case 3:
lhSize = 3;
+ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4");
litSize = MEM_readLE24(istart) >> 4;
- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
break;
}
RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
@@ -506,14 +517,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
for (i = 8; i < n; i += 8) {
MEM_write64(spread + pos + i, sv);
}
- pos += n;
+ assert(n>=0);
+ pos += (size_t)n;
}
}
/* Now we spread those positions across the table.
- * The benefit of doing it in two stages is that we avoid the the
+ * The benefit of doing it in two stages is that we avoid the
* variable size inner loop, which caused lots of branch misses.
* Now we can run through all the positions without any branch misses.
- * We unroll the loop twice, since that is what emperically worked best.
+ * We unroll the loop twice, since that is what empirically worked best.
*/
{
size_t position = 0;
@@ -540,7 +552,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
for (i=0; i<n; i++) {
tableDecode[position].baseValue = s;
position = (position + step) & tableMask;
- while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */
+ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */
} }
assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
}
@@ -551,7 +563,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
for (u=0; u<tableSize; u++) {
U32 const symbol = tableDecode[u].baseValue;
U32 const nextState = symbolNext[symbol]++;
- tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+ tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
assert(nbAdditionalBits[symbol] < 255);
tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
@@ -964,6 +976,11 @@ size_t ZSTD_execSequence(BYTE* op,

assert(op != NULL /* Precondition */);
assert(oend_w < oend /* No underflow */);
+
+#if defined(__aarch64__)
+ /* prefetch sequence starting from match that will be used for copy later */
+ PREFETCH_L1(match);
+#endif
/* Handle edge cases in a slow path:
* - Read beyond end of literals
* - Match end is within WILDCOPY_OVERLIMIT of oend
@@ -1154,7 +1171,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16
}

/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
+ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32
* bits before reloading. This value is the maximum number of bytes we read
* after reloading when we are decoding long offsets.
*/
@@ -1169,9 +1186,27 @@ FORCE_INLINE_TEMPLATE seq_t
ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
{
seq_t seq;
+ /*
+ * ZSTD_seqSymbol is a structure that is 64 bits wide in total, so it can be
+ * loaded in one operation and its fields extracted by simple shifts or
+ * bit-extracts on aarch64.
+ * GCC doesn't recognize this and generates unnecessary ldr/ldrb/ldrh
+ * operations that cause a performance drop. This can be avoided by using this
+ * ZSTD_memcpy hack.
+ */
+#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
+ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
+ ZSTD_seqSymbol* const llDInfo = &llDInfoS;
+ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
+ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS;
+ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
+ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
+ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
+#else
const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
+#endif
seq.matchLength = mlDInfo->baseValue;
seq.litLength = llDInfo->baseValue;
{ U32 const ofBase = ofDInfo->baseValue;
@@ -1186,28 +1221,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
U32 const llnbBits = llDInfo->nbBits;
U32 const mlnbBits = mlDInfo->nbBits;
U32 const ofnbBits = ofDInfo->nbBits;
+
+ assert(llBits <= MaxLLBits);
+ assert(mlBits <= MaxMLBits);
+ assert(ofBits <= MaxOff);
/*
* As gcc has better branch and block analyzers, sometimes it is only
- * valuable to mark likelyness for clang, it gives around 3-4% of
+ * valuable to mark likeliness for clang, it gives around 3-4% of
* performance.
*/

/* sequence */
{ size_t offset;
- #if defined(__clang__)
- if (LIKELY(ofBits > 1)) {
- #else
if (ofBits > 1) {
- #endif
ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
- assert(ofBits <= MaxOff);
+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
+ /* Always read extra bits; this keeps the logic simple,
+ * avoids branches, and avoids accidentally reading 0 bits.
+ */
+ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
BIT_reloadDStream(&seqState->DStream);
- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
+ offset += BIT_readBitsFast(&seqState->DStream, extraBits);
} else {
offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
@@ -1232,11 +1270,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
seq.offset = offset;
}

- #if defined(__clang__)
- if (UNLIKELY(mlBits > 0))
- #else
if (mlBits > 0)
- #endif
seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);

if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
@@ -1246,11 +1280,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
/* Ensure there are enough bits to read the rest of data in 64-bit mode. */
ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);

- #if defined(__clang__)
- if (UNLIKELY(llBits > 0))
- #else
if (llBits > 0)
- #endif
seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);

if (MEM_32bits())
@@ -1552,7 +1582,7 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
- DEBUGLOG(5, "ZSTD_decompressSequences_body");
+ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq);
(void)frame;

/* Regen sequences */
@@ -1945,34 +1975,79 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */


+/*
+ * @returns The total size of the history referenceable by zstd, including
+ * both the prefix and the extDict. At @p op any offset larger than this
+ * is invalid.
+ */
+static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart)
+{
+ return (size_t)(op - virtualStart);
+}
+
+typedef struct {
+ unsigned longOffsetShare;
+ unsigned maxNbAdditionalBits;
+} ZSTD_OffsetInfo;

-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
-/* ZSTD_getLongOffsetsShare() :
+/* ZSTD_getOffsetInfo() :
* condition : offTable must be valid
* @return : "share" of long offsets (arbitrarily defined as > (1<<23))
- * compared to maximum possible of (1<<OffFSELog) */
-static unsigned
-ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
+ * compared to maximum possible of (1<<OffFSELog),
+ * as well as the maximum number of additional bits required.
+ */
+static ZSTD_OffsetInfo
+ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq)
{
- const void* ptr = offTable;
- U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
- const ZSTD_seqSymbol* table = offTable + 1;
- U32 const max = 1 << tableLog;
- U32 u, total = 0;
- DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
-
- assert(max <= (1 << OffFSELog)); /* max not too large */
- for (u=0; u<max; u++) {
- if (table[u].nbAdditionalBits > 22) total += 1;
+ ZSTD_OffsetInfo info = {0, 0};
+ /* If nbSeq == 0, then the offTable is uninitialized, but we have
+ * no sequences, so both values should be 0.
+ */
+ if (nbSeq != 0) {
+ const void* ptr = offTable;
+ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
+ const ZSTD_seqSymbol* table = offTable + 1;
+ U32 const max = 1 << tableLog;
+ U32 u;
+ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
+
+ assert(max <= (1 << OffFSELog)); /* max not too large */
+ for (u=0; u<max; u++) {
+ info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits);
+ if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1;
+ }
+
+ assert(tableLog <= OffFSELog);
+ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */
}

- assert(tableLog <= OffFSELog);
- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */
+ return info;
+}

- return total;
+/*
+ * @returns The maximum offset we can decode in one read of our bitstream, without
+ * reloading more bits in the middle of the offset bits read. Any offsets larger
+ * than this must use the long offset decoder.
+ */
+static size_t ZSTD_maxShortOffset(void)
+{
+ if (MEM_64bits()) {
+ /* We can decode any offset without reloading bits.
+ * This might change if the max window size grows.
+ */
+ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
+ return (size_t)-1;
+ } else {
+ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1.
+ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits.
+ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset.
+ */
+ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1;
+ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM;
+ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN);
+ return maxOffset;
+ }
}
-#endif

size_t
ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
@@ -1980,20 +2055,21 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
{ /* blockType == blockCompressed */
const BYTE* ip = (const BYTE*)src;
- /* isLongOffset must be true if there are long offsets.
- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
- * We don't expect that to be the case in 64-bit mode.
- * In block mode, window size is not known, so we have to be conservative.
- * (note: but it could be evaluated from current-lowLimit)
- */
- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);

- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
+ /* Note : the wording of the specification
+ * allows a compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX.
+ * This generally does not happen, as it makes little sense,
+ * since an uncompressed block would feature the same size and have no decompression cost.
+ * Also, note that decoders from reference libzstd before v1.5.4
+ * would consider this edge case as an error.
+ * As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX
+ * for broader compatibility with the deployed ecosystem of zstd decoders */
+ RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");

/* Decode literals section */
{ size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
+ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize);
if (ZSTD_isError(litCSize)) return litCSize;
ip += litCSize;
srcSize -= litCSize;
@@ -2001,6 +2077,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,

/* Build Decoding Tables */
{
+ /* Compute the maximum block size, which must also work when !frame and fParams are unset.
+ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
+ */
+ size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX));
+ size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const*)dctx->virtualStart);
+ /* isLongOffset must be true if there are long offsets.
+ * Offsets are long if they are larger than ZSTD_maxShortOffset().
+ * We don't expect that to be the case in 64-bit mode.
+ *
+ * We check here to see if our history is large enough to allow long offsets.
+ * If it isn't, then we can't possibly have (valid) long offsets. If the offset
+ * is invalid, then it is okay to read it incorrectly.
+ *
+ * If isLongOffset is true, then we will later check our decoding table to see
+ * if it is even possible to generate long offsets.
+ */
+ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset()));
/* These macros control at build-time which decompressor implementation
* we use. If neither is defined, we do some inspection and dispatch at
* runtime.
@@ -2008,6 +2101,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
!defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
int usePrefetchDecoder = dctx->ddictIsCold;
+#else
+ /* Set to 1 to avoid computing offset info if we don't need to.
+ * Otherwise this value is ignored.
+ */
+ int usePrefetchDecoder = 1;
#endif
int nbSeq;
size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
@@ -2015,28 +2113,42 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
ip += seqHSize;
srcSize -= seqHSize;

- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
+ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
+ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall,
+ "invalid dst");

-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
- if ( !usePrefetchDecoder
- && (!frame || (dctx->fParams.windowSize > (1<<24)))
- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */
- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
- usePrefetchDecoder = (shareLongOffsets >= minShare);
+ /* If we could potentially have long offsets, or we might want to use the prefetch decoder,
+ * compute information about the share of long offsets, and the maximum nbAdditionalBits.
+ * NOTE: could probably use a larger nbSeq limit
+ */
+ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) {
+ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq);
+ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) {
+ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small
+ * enough, then we know it is impossible to have too long an offset in this block, so we can
+ * use the regular offset decoder.
+ */
+ isLongOffset = ZSTD_lo_isRegularOffset;
+ }
+ if (!usePrefetchDecoder) {
+ U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
+ usePrefetchDecoder = (info.longOffsetShare >= minShare);
+ }
}
-#endif

dctx->ddictIsCold = 0;

#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
!defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
- if (usePrefetchDecoder)
+ if (usePrefetchDecoder) {
+#else
+ (void)usePrefetchDecoder;
+ {
#endif
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
#endif
+ }

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
/* else */
@@ -2060,9 +2172,9 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
}


-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
- void* dst, size_t dstCapacity,
- const void* src, size_t srcSize)
+size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
{
size_t dSize;
ZSTD_checkContinuity(dctx, dst, dstCapacity);
@@ -2070,3 +2182,12 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
dctx->previousDstEnd = (char*)dst + dSize;
return dSize;
}
+
+
+/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */
+size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize);
+}
diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h
index 3d2d57a5d25a..5888e6cc788b 100644
--- a/lib/zstd/decompress/zstd_decompress_block.h
+++ b/lib/zstd/decompress/zstd_decompress_block.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
unsigned tableLog, void* wksp, size_t wkspSize,
int bmi2);

+/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */
+size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize);
+

#endif /* ZSTD_DEC_BLOCK_H */
diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h
index 98102edb6a83..32f79fb2873d 100644
--- a/lib/zstd/decompress/zstd_decompress_internal.h
+++ b/lib/zstd/decompress/zstd_decompress_internal.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = {

#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64))
#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32))
+#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12

typedef struct {
ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */
ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */
ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */
+ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */
U32 rep[ZSTD_REP_NUM];
U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32];
} ZSTD_entropyDTables_t;
@@ -164,6 +166,7 @@ struct ZSTD_DCtx_s
ZSTD_dictUses_e dictUses;
ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */
ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */
+ int disableHufAsm;

/* streaming */
ZSTD_dStreamStage streamStage;
diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h
index a06ca187aab5..8a47eb2a4514 100644
--- a/lib/zstd/decompress_sources.h
+++ b/lib/zstd/decompress_sources.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
/*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c
index 22686e367e6f..466828e35752 100644
--- a/lib/zstd/zstd_common_module.c
+++ b/lib/zstd/zstd_common_module.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp);
EXPORT_SYMBOL_GPL(ZSTD_isError);
EXPORT_SYMBOL_GPL(ZSTD_getErrorName);
EXPORT_SYMBOL_GPL(ZSTD_getErrorCode);
-EXPORT_SYMBOL_GPL(ZSTD_customMalloc);
-EXPORT_SYMBOL_GPL(ZSTD_customCalloc);
-EXPORT_SYMBOL_GPL(ZSTD_customFree);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("Zstd Common");
diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c
index 04e1b5c01d9b..8ecf43226af2 100644
--- a/lib/zstd/zstd_compress_module.c
+++ b/lib/zstd/zstd_compress_module.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c
index f4ed952ed485..7d31518e9d5a 100644
--- a/lib/zstd/zstd_decompress_module.c
+++ b/lib/zstd/zstd_decompress_module.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -77,7 +77,7 @@ EXPORT_SYMBOL(zstd_init_dstream);

size_t zstd_reset_dstream(zstd_dstream *dstream)
{
- return ZSTD_resetDStream(dstream);
+ return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only);
}
EXPORT_SYMBOL(zstd_reset_dstream);

--
2.40.1