From 60dc452fb5231595aa986bd6915e044445c7ccd3 Mon Sep 17 00:00:00 2001 From: ferrreo Date: Thu, 10 Aug 2023 18:30:38 +0100 Subject: [PATCH] 6.5RC5 --- config | 11 +- patches/0001-cachy-all.patch | 7344 ++++++++++------------------------ patches/0002-eevdf.patch | 316 +- patches/0002-eevdfbore.patch | 249 +- patches/0006-AMD-cppc.patch | 573 +++ scripts/patch.sh | 4 +- scripts/source.sh | 6 +- 7 files changed, 2915 insertions(+), 5588 deletions(-) create mode 100644 patches/0006-AMD-cppc.patch diff --git a/config b/config index 178bb6b..949e0aa 100644 --- a/config +++ b/config @@ -594,7 +594,9 @@ CONFIG_CALL_DEPTH_TRACKING=y # CONFIG_CALL_THUNKS_DEBUG is not set CONFIG_CPU_IBPB_ENTRY=y CONFIG_CPU_IBRS_ENTRY=y +CONFIG_CPU_SRSO=y CONFIG_SLS=y +# CONFIG_GDS_FORCE_MITIGATION is not set CONFIG_ARCH_HAS_ADD_PAGES=y CONFIG_ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE=y @@ -1333,11 +1335,10 @@ CONFIG_TCP_CONG_YEAH=m CONFIG_TCP_CONG_ILLINOIS=m CONFIG_TCP_CONG_DCTCP=m CONFIG_TCP_CONG_CDG=m -CONFIG_TCP_CONG_BBR=m -CONFIG_TCP_CONG_BBR2=y -CONFIG_DEFAULT_BBR2=y +CONFIG_TCP_CONG_BBR=y +CONFIG_DEFAULT_BBR=y # CONFIG_DEFAULT_RENO is not set -CONFIG_DEFAULT_TCP_CONG="bbr2" +CONFIG_DEFAULT_TCP_CONG="bbr" CONFIG_TCP_MD5SIG=y CONFIG_IPV6=y CONFIG_IPV6_ROUTER_PREF=y @@ -2613,7 +2614,7 @@ CONFIG_ZRAM_DEF_COMP_ZSTD=y # CONFIG_ZRAM_DEF_COMP_842 is not set CONFIG_ZRAM_DEF_COMP="zstd" CONFIG_ZRAM_WRITEBACK=y -# CONFIG_ZRAM_MEMORY_TRACKING is not set +CONFIG_ZRAM_MEMORY_TRACKING=y CONFIG_ZRAM_MULTI_COMP=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_LOOP_MIN_COUNT=0 diff --git a/patches/0001-cachy-all.patch b/patches/0001-cachy-all.patch index 1c37f59..480912c 100644 --- a/patches/0001-cachy-all.patch +++ b/patches/0001-cachy-all.patch @@ -1,68 +1,87 @@ -From a34c2671419dc12fbea9f81528eda4dd6158d320 Mon Sep 17 00:00:00 2001 +From 907edd508b99c761190492fb3f2211443b4e9bb3 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 10 Jul 2023 17:08:52 +0200 -Subject: [PATCH 1/7] bbr2 +Date: Mon, 31 Jul 2023 12:19:09 +0200 +Subject: [PATCH 1/5] bbr3 Signed-off-by: Peter Jung --- - include/linux/tcp.h | 3 +- - include/net/inet_connection_sock.h | 3 +- - include/net/tcp.h | 41 +- - include/uapi/linux/inet_diag.h | 33 + - net/ipv4/Kconfig | 22 + - net/ipv4/Makefile | 1 + - net/ipv4/tcp.c | 1 + - net/ipv4/tcp_bbr.c | 38 +- - net/ipv4/tcp_bbr2.c | 2674 ++++++++++++++++++++++++++++ + include/linux/tcp.h | 4 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 72 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2231 +++++++++++++++++++++------- net/ipv4/tcp_cong.c | 1 + - net/ipv4/tcp_input.c | 27 +- - net/ipv4/tcp_output.c | 26 +- + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- net/ipv4/tcp_rate.c | 30 +- net/ipv4/tcp_timer.c | 1 + - 14 files changed, 2867 insertions(+), 34 deletions(-) - create mode 100644 net/ipv4/tcp_bbr2.c + 15 files changed, 1934 insertions(+), 551 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h -index b4c08ac86983..4297c9176435 100644 +index 91a37c99ba66..ae0ee688c3f7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h -@@ -255,7 +255,8 @@ struct tcp_sock { +@@ -255,7 +255,9 @@ struct tcp_sock { u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ - unused:5; + fast_ack_mode:2, /* which fast ack mode ? 
*/ -+ unused:3; ++ tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? */ ++ unused:2; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h -index c2b15f7e5516..d85858efa571 100644 +index c2b15f7e5516..a400a84088d3 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h -@@ -135,7 +135,8 @@ struct inet_connection_sock { +@@ -135,8 +135,8 @@ struct inet_connection_sock { u32 icsk_probes_tstamp; u32 icsk_user_timeout; - u64 icsk_ca_priv[104 / sizeof(u64)]; -+/* XXX inflated by temporary internal debugging info */ -+ u64 icsk_ca_priv[216 / sizeof(u64)]; - #define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; }; + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ diff --git a/include/net/tcp.h b/include/net/tcp.h -index 226bce6d1e8c..64f1ec99c8f0 100644 +index 0ca972ebd3dd..8eb194559b70 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h -@@ -370,6 +370,7 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, +@@ -370,6 +370,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, #define TCP_ECN_QUEUE_CWR 2 #define TCP_ECN_DEMAND_CWR 4 #define TCP_ECN_SEEN 8 -+#define TCP_ECN_ECT_PERMANENT 16 ++#define TCP_ECN_LOW 16 ++#define TCP_ECN_ECT_PERMANENT 32 enum tcp_tw_status { TCP_TW_SUCCESS = 0, -@@ -819,6 +820,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) +@@ -723,6 +725,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + tcp_fast_path_on(tp); + } + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(struct sock *sk) + { +@@ -819,6 +830,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) return max_t(s64, t1 - t0, 0); } @@ -74,7 +93,7 @@ index 226bce6d1e8c..64f1ec99c8f0 100644 static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) { return tcp_ns_to_ts(skb->skb_mstamp_ns); -@@ -894,9 +900,14 @@ struct tcp_skb_cb { +@@ -894,9 +910,14 @@ struct tcp_skb_cb { /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ @@ -91,7 +110,15 @@ index 226bce6d1e8c..64f1ec99c8f0 100644 } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; -@@ -1022,7 +1033,11 @@ enum tcp_ca_ack_event_flags { +@@ -1000,6 +1021,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1022,7 +1044,11 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 @@ -104,7 +131,7 @@ index 226bce6d1e8c..64f1ec99c8f0 100644 union tcp_cc_info; -@@ -1042,8 +1057,11 @@ struct ack_sample { +@@ -1042,10 +1068,13 @@ struct ack_sample { */ struct rate_sample { u64 prior_mstamp; /* 
starting timestamp for interval */ @@ -114,17 +141,22 @@ index 226bce6d1e8c..64f1ec99c8f0 100644 + u32 tx_in_flight; /* packets in flight at starting timestamp */ + s32 lost; /* number of packets lost over interval */ s32 delivered; /* number of packets delivered over interval */ - s32 delivered_ce; /* number of packets delivered w/ CE marks*/ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ long interval_us; /* time for tp->delivered to incr "delivered" */ -@@ -1057,6 +1075,7 @@ struct rate_sample { + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1056,7 +1085,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? */ + bool is_ece; /* did this ACK have ECN marked? */ }; struct tcp_congestion_ops { -@@ -1080,8 +1099,11 @@ struct tcp_congestion_ops { +@@ -1080,8 +1111,11 @@ struct tcp_congestion_ops { /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); @@ -138,7 +170,7 @@ index 226bce6d1e8c..64f1ec99c8f0 100644 /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) -@@ -1147,6 +1169,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) +@@ -1147,6 +1181,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) } #endif @@ -153,7 +185,7 @@ index 226bce6d1e8c..64f1ec99c8f0 100644 static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); -@@ -1166,6 +1196,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) +@@ -1166,6 +1208,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) void tcp_set_ca_state(struct sock *sk, const u8 ca_state); /* From tcp_rate.c */ @@ -161,33 +193,45 @@ index 226bce6d1e8c..64f1ec99c8f0 100644 void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); +@@ -1178,6 +1221,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. 
+@@ -2177,7 +2235,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h -index 50655de04c9b..0e24f11627d5 100644 +index 50655de04c9b..82f8bd8f0d16 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h -@@ -231,9 +231,42 @@ struct tcp_bbr_info { +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ - }; - -+/* Phase as reported in netlink/ss stats. */ -+enum tcp_bbr2_phase { -+ BBR2_PHASE_INVALID = 0, -+ BBR2_PHASE_STARTUP = 1, -+ BBR2_PHASE_DRAIN = 2, -+ BBR2_PHASE_PROBE_RTT = 3, -+ BBR2_PHASE_PROBE_BW_UP = 4, -+ BBR2_PHASE_PROBE_BW_DOWN = 5, -+ BBR2_PHASE_PROBE_BW_CRUISE = 6, -+ BBR2_PHASE_PROBE_BW_REFILL = 7 -+}; -+ -+struct tcp_bbr2_info { -+ /* u64 bw: bandwidth (app throughput) estimate in Byte per sec: */ -+ __u32 bbr_bw_lsb; /* lower 32 bits of bw */ -+ __u32 bbr_bw_msb; /* upper 32 bits of bw */ -+ __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ -+ __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ -+ __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ + __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ + __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ + __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ @@ -195,80 +239,88 @@ index 50655de04c9b..0e24f11627d5 100644 + __u8 bbr_mode; /* current bbr_mode in state machine */ + __u8 bbr_phase; /* current state machine phase */ + __u8 unused1; /* alignment padding; not used yet */ -+ __u8 bbr_version; /* MUST be at this offset in struct */ -+ __u32 bbr_inflight_lo; /* lower/short-term data volume bound */ -+ __u32 bbr_inflight_hi; /* higher/long-term data volume bound */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ + __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ +}; + - union tcp_cc_info { - struct tcpvegas_info vegas; - struct tcp_dctcp_info dctcp; - struct tcp_bbr_info bbr; -+ struct tcp_bbr2_info bbr2; ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, }; - #endif /* _UAPI_INET_DIAG_H_ */ + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index 51c13cf9c5ae..de8dcba26bec 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -506,9 +506,11 @@ enum { + #define RTAX_FEATURE_SACK (1 << 1) + #define RTAX_FEATURE_TIMESTAMP (1 << 2) + #define RTAX_FEATURE_ALLFRAG (1 << 3) ++#define RTAX_FEATURE_ECN_LOW (1 << 4) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \ +- RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG) ++ RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG \ ++ | RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index 879eeb0a084b..77270053a5e3 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -170,6 +170,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_ECN 8 /* ECN was negociated at TCP session init */ + #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ ++#define TCPI_OPT_ECN_LOW 64 /* Low-latency ECN configured at init */ + + /* + * Sender's congestion state indicating normal or abnormal situations diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index 2dfb12230f08..b6bec331a82e 100644 +index 2dfb12230f08..2e14db3bee70 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig -@@ -678,6 +678,24 @@ config TCP_CONG_BBR - AQM schemes that do not provide a delay signal. It requires the fq - ("Fair Queue") pacing packet scheduler. +@@ -668,15 +668,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
-+config TCP_CONG_BBR2 -+ tristate "BBR2 TCP" -+ default n -+ help -+ -+ BBR2 TCP congestion control is a model-based congestion control -+ algorithm that aims to maximize network utilization, keep queues and -+ retransmit rates low, and to be able to coexist with Reno/CUBIC in -+ common scenarios. It builds an explicit model of the network path. It -+ tolerates a targeted degree of random packet loss and delay that are -+ unrelated to congestion. It can operate over LAN, WAN, cellular, wifi, -+ or cable modem links, and can use DCTCP-L4S-style ECN signals. It can -+ coexist with flows that use loss-based congestion control, and can -+ operate with shallow buffers, deep buffers, bufferbloat, policers, or -+ AQM schemes that do not provide a delay signal. It requires pacing, -+ using either TCP internal pacing or the fq ("Fair Queue") pacing packet -+ scheduler. -+ choice prompt "Default TCP congestion control" - default DEFAULT_CUBIC -@@ -715,6 +733,9 @@ choice - config DEFAULT_BBR - bool "BBR" if TCP_CONG_BBR=y - -+ config DEFAULT_BBR2 -+ bool "BBR2" if TCP_CONG_BBR2=y -+ - config DEFAULT_RENO - bool "Reno" - endchoice -@@ -739,6 +760,7 @@ config DEFAULT_TCP_CONG - default "dctcp" if DEFAULT_DCTCP - default "cdg" if DEFAULT_CDG - default "bbr" if DEFAULT_BBR -+ default "bbr2" if DEFAULT_BBR2 - default "cubic" - - config TCP_MD5SIG -diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile -index b18ba8ef93ad..b4e3dcb27a20 100644 ---- a/net/ipv4/Makefile -+++ b/net/ipv4/Makefile -@@ -47,6 +47,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o - obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o - obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o - obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o -+obj-$(CONFIG_TCP_CONG_BBR2) += tcp_bbr2.o - obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o - obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o - obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index e03e08745308..326b2c4bacf6 100644 +index 8ed52e1e3c99..0198ac17f3a8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3083,6 +3083,7 @@ int tcp_disconnect(struct sock *sk, int flags) @@ -279,43 +331,505 @@ index e03e08745308..326b2c4bacf6 100644 /* Clean up fastopen related fields */ +@@ -3778,6 +3779,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c -index 146792cd26fe..16038f6ee52a 100644 +index 146792cd26fe..f4f477a69917 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c -@@ -294,26 +294,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. 
Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. + * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? 
*/ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? */ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? 
*/ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. + * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... 
*/ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... */ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. 
Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). ++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? 
*/ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. + */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. 
*/ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -278,7 +455,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + } + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); +- sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); ++ sk->sk_pacing_rate = ++ bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain)); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -294,26 +472,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) sk->sk_pacing_rate = rate; } -/* override sysctl_tcp_min_tso_segs */ - __bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) - { - return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; - } - -+/* Return the number of segments BBR would like in a TSO/GSO skb, given -+ * a particular max gso size as a constraint. +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). + */ +static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, + u32 gso_max_size) +{ -+ u32 segs; ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; + u64 bytes; + + /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ + bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; + ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ + bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); -+ segs = max_t(u32, div_u64(bytes, mss_now), bbr_min_tso_segs(sk)); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return segs; +} + +/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ -+static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) -+{ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; + return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); -+} -+ + } + +/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ static u32 bbr_tso_segs_goal(struct sock *sk) { @@ -335,943 +849,176 @@ index 146792cd26fe..16038f6ee52a 100644 } /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ -@@ -1149,7 +1163,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { - .undo_cwnd = bbr_undo_cwnd, - .cwnd_event = bbr_cwnd_event, - .ssthresh = bbr_ssthresh, -- .min_tso_segs = bbr_min_tso_segs, -+ .tso_segs = bbr_tso_segs, - .get_info = bbr_get_info, - .set_state = bbr_set_state, - }; -diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c -new file mode 100644 -index 000000000000..85f8052144d1 ---- /dev/null -+++ b/net/ipv4/tcp_bbr2.c -@@ -0,0 +1,2674 @@ -+/* BBR (Bottleneck Bandwidth and RTT) congestion control, v2 -+ * -+ * BBRv2 is a model-based congestion control algorithm that aims for low -+ * queues, low loss, and (bounded) Reno/CUBIC coexistence. 
To maintain a model -+ * of the network path, it uses measurements of bandwidth and RTT, as well as -+ * (if they occur) packet loss and/or DCTCP/L4S-style ECN signals. Note that -+ * although it can use ECN or loss signals explicitly, it does not require -+ * either; it can bound its in-flight data based on its estimate of the BDP. -+ * -+ * The model has both higher and lower bounds for the operating range: -+ * lo: bw_lo, inflight_lo: conservative short-term lower bound -+ * hi: bw_hi, inflight_hi: robust long-term upper bound -+ * The bandwidth-probing time scale is (a) extended dynamically based on -+ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by -+ * an interactive wall-clock time-scale to be more scalable and responsive -+ * than Reno and CUBIC. -+ * -+ * Here is a state transition diagram for BBR: -+ * -+ * | -+ * V -+ * +---> STARTUP ----+ -+ * | | | -+ * | V | -+ * | DRAIN ----+ -+ * | | | -+ * | V | -+ * +---> PROBE_BW ----+ -+ * | ^ | | -+ * | | | | -+ * | +----+ | -+ * | | -+ * +---- PROBE_RTT <--+ -+ * -+ * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. -+ * When it estimates the pipe is full, it enters DRAIN to drain the queue. -+ * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. -+ * A long-lived BBR flow spends the vast majority of its time remaining -+ * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth -+ * in a fair manner, with a small, bounded queue. *If* a flow has been -+ * continuously sending for the entire min_rtt window, and hasn't seen an RTT -+ * sample that matches or decreases its min_rtt estimate for 10 seconds, then -+ * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe -+ * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if -+ * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; -+ * otherwise we enter STARTUP to try to fill the pipe. -+ * -+ * BBR is described in detail in: -+ * "BBR: Congestion-Based Congestion Control", -+ * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, -+ * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. -+ * -+ * There is a public e-mail list for discussing BBR development and testing: -+ * https://groups.google.com/forum/#!forum/bbr-dev -+ * -+ * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, -+ * otherwise TCP stack falls back to an internal pacing using one high -+ * resolution timer per TCP socket and may use more resources. -+ */ -+#include -+#include -+#include -+#include -+#include -+ -+#include "tcp_dctcp.h" -+ -+/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth -+ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. -+ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. -+ * Since the minimum window is >=4 packets, the lower bound isn't -+ * an issue. The upper bound isn't an issue with existing technologies. -+ */ -+#define BW_SCALE 24 -+#define BW_UNIT (1 << BW_SCALE) -+ -+#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. 
gains) */ -+#define BBR_UNIT (1 << BBR_SCALE) -+ -+#define FLAG_DEBUG_VERBOSE 0x1 /* Verbose debugging messages */ -+#define FLAG_DEBUG_LOOPBACK 0x2 /* Do NOT skip loopback addr */ -+ -+#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ -+ -+/* BBR has the following modes for deciding how fast to send: */ -+enum bbr_mode { -+ BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ -+ BBR_DRAIN, /* drain any queue created during startup */ -+ BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ -+ BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ -+}; -+ -+/* How does the incoming ACK stream relate to our bandwidth probing? */ -+enum bbr_ack_phase { -+ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ -+ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ -+ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ -+ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ -+ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ -+}; -+ -+/* BBR congestion control block */ -+struct bbr { -+ u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ -+ u32 min_rtt_stamp; /* timestamp of min_rtt_us */ -+ u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ -+ u32 probe_rtt_min_us; /* min RTT in bbr_probe_rtt_win_ms window */ -+ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ -+ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ -+ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ -+ u64 cycle_mstamp; /* time of this cycle phase start */ -+ u32 mode:3, /* current bbr_mode in state machine */ -+ prev_ca_state:3, /* CA state on previous ACK */ -+ packet_conservation:1, /* use packet conservation? */ -+ round_start:1, /* start of packet-timed tx->ack round? */ -+ ce_state:1, /* If most recent data has CE bit set */ -+ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ -+ try_fast_path:1, /* can we take fast path? */ -+ unused2:11, -+ idle_restart:1, /* restarting after idle? */ -+ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ -+ cycle_idx:3, /* current index in pacing_gain cycle array */ -+ has_seen_rtt:1; /* have we seen an RTT sample yet? */ -+ u32 pacing_gain:11, /* current gain for setting pacing rate */ -+ cwnd_gain:11, /* current gain for setting cwnd */ -+ full_bw_reached:1, /* reached full bw in Startup? */ -+ full_bw_cnt:2, /* number of rounds without large bw gains */ -+ init_cwnd:7; /* initial cwnd */ -+ u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ -+ u32 full_bw; /* recent bw, to estimate if pipe is full */ -+ -+ /* For tracking ACK aggregation: */ -+ u64 ack_epoch_mstamp; /* start of ACK sampling epoch */ -+ u16 extra_acked[2]; /* max excess data ACKed in epoch */ -+ u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ -+ extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ -+ extra_acked_win_idx:1, /* current index in extra_acked array */ -+ /* BBR v2 state: */ -+ unused1:2, -+ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ -+ loss_in_cycle:1, /* packet loss in this cycle? */ -+ ecn_in_cycle:1; /* ECN in this cycle? 
*/ -+ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ -+ u32 undo_bw_lo; /* bw_lo before latest losses */ -+ u32 undo_inflight_lo; /* inflight_lo before latest losses */ -+ u32 undo_inflight_hi; /* inflight_hi before latest losses */ -+ u32 bw_latest; /* max delivered bw in last round trip */ -+ u32 bw_lo; /* lower bound on sending bandwidth */ -+ u32 bw_hi[2]; /* upper bound of sending bandwidth range*/ -+ u32 inflight_latest; /* max delivered data in last round trip */ -+ u32 inflight_lo; /* lower bound of inflight data range */ -+ u32 inflight_hi; /* upper bound of inflight data range */ -+ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ -+ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ -+ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ -+ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ -+ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ -+ bw_probe_samples:1, /* rate samples reflect bw probing? */ -+ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ -+ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ -+ rounds_since_probe:8, /* packet-timed rounds since probed bw */ -+ loss_round_start:1, /* loss_round_delivered round trip? */ -+ loss_in_round:1, /* loss marked in this round trip? */ -+ ecn_in_round:1, /* ECN marked in this round trip? */ -+ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ -+ loss_events_in_round:4,/* losses in STARTUP round */ -+ initialized:1; /* has bbr_init() been called? */ -+ u32 alpha_last_delivered; /* tp->delivered at alpha update */ -+ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ -+ -+ /* Params configurable using setsockopt. Refer to correspoding -+ * module param for detailed description of params. 
-+ */ -+ struct bbr_params { -+ u32 high_gain:11, /* max allowed value: 2047 */ -+ drain_gain:10, /* max allowed value: 1023 */ -+ cwnd_gain:11; /* max allowed value: 2047 */ -+ u32 cwnd_min_target:4, /* max allowed value: 15 */ -+ min_rtt_win_sec:5, /* max allowed value: 31 */ -+ probe_rtt_mode_ms:9, /* max allowed value: 511 */ -+ full_bw_cnt:3, /* max allowed value: 7 */ -+ cwnd_tso_budget:1, /* allowed values: {0, 1} */ -+ unused3:6, -+ drain_to_target:1, /* boolean */ -+ precise_ece_ack:1, /* boolean */ -+ extra_acked_in_startup:1, /* allowed values: {0, 1} */ -+ fast_path:1; /* boolean */ -+ u32 full_bw_thresh:10, /* max allowed value: 1023 */ -+ startup_cwnd_gain:11, /* max allowed value: 2047 */ -+ bw_probe_pif_gain:9, /* max allowed value: 511 */ -+ usage_based_cwnd:1, /* boolean */ -+ unused2:1; -+ u16 probe_rtt_win_ms:14, /* max allowed value: 16383 */ -+ refill_add_inc:2; /* max allowed value: 3 */ -+ u16 extra_acked_gain:11, /* max allowed value: 2047 */ -+ extra_acked_win_rtts:5; /* max allowed value: 31*/ -+ u16 pacing_gain[CYCLE_LEN]; /* max allowed value: 1023 */ -+ /* Mostly BBR v2 parameters below here: */ -+ u32 ecn_alpha_gain:8, /* max allowed value: 255 */ -+ ecn_factor:8, /* max allowed value: 255 */ -+ ecn_thresh:8, /* max allowed value: 255 */ -+ beta:8; /* max allowed value: 255 */ -+ u32 ecn_max_rtt_us:19, /* max allowed value: 524287 */ -+ bw_probe_reno_gain:9, /* max allowed value: 511 */ -+ full_loss_cnt:4; /* max allowed value: 15 */ -+ u32 probe_rtt_cwnd_gain:8, /* max allowed value: 255 */ -+ inflight_headroom:8, /* max allowed value: 255 */ -+ loss_thresh:8, /* max allowed value: 255 */ -+ bw_probe_max_rounds:8; /* max allowed value: 255 */ -+ u32 bw_probe_rand_rounds:4, /* max allowed value: 15 */ -+ bw_probe_base_us:26, /* usecs: 0..2^26-1 (67 secs) */ -+ full_ecn_cnt:2; /* max allowed value: 3 */ -+ u32 bw_probe_rand_us:26, /* usecs: 0..2^26-1 (67 secs) */ -+ undo:1, /* boolean */ -+ tso_rtt_shift:4, /* max allowed value: 15 */ -+ unused5:1; -+ u32 ecn_reprobe_gain:9, /* max allowed value: 511 */ -+ unused1:14, -+ ecn_alpha_init:9; /* max allowed value: 256 */ -+ } params; -+ -+ struct { -+ u32 snd_isn; /* Initial sequence number */ -+ u32 rs_bw; /* last valid rate sample bw */ -+ u32 target_cwnd; /* target cwnd, based on BDP */ -+ u8 undo:1, /* Undo even happened but not yet logged */ -+ unused:7; -+ char event; /* single-letter event debug codes */ -+ u16 unused2; -+ } debug; -+}; -+ -+struct bbr_context { -+ u32 sample_bw; -+ u32 target_cwnd; -+ u32 log:1; -+}; -+ -+/* Window length of min_rtt filter (in sec). Max allowed value is 31 (0x1F) */ -+static u32 bbr_min_rtt_win_sec = 10; -+/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode. -+ * Max allowed value is 511 (0x1FF). -+ */ -+static u32 bbr_probe_rtt_mode_ms = 200; -+/* Window length of probe_rtt_min_us filter (in ms), and consequently the -+ * typical interval between PROBE_RTT mode entries. -+ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC -+ */ -+static u32 bbr_probe_rtt_win_ms = 5000; -+/* Skip TSO below the following bandwidth (bits/sec): */ -+static int bbr_min_tso_rate = 1200000; -+ -+/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting -+ * in bigger TSO bursts. By default we cut the RTT-based allowance in half -+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance -+ * is below 1500 bytes after 6 * ~500 usec = 3ms. 
-+ */ -+static u32 bbr_tso_rtt_shift = 9; /* halve allowance per 2^9 usecs, 512us */ -+ -+/* Select cwnd TSO budget approach: -+ * 0: padding -+ * 1: flooring -+ */ -+static uint bbr_cwnd_tso_budget = 1; -+ -+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. -+ * In order to help drive the network toward lower queues and low latency while -+ * maintaining high utilization, the average pacing rate aims to be slightly -+ * lower than the estimated bandwidth. This is an important aspect of the -+ * design. -+ */ -+static const int bbr_pacing_margin_percent = 1; -+ -+/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain -+ * that will allow a smoothly increasing pacing rate that will double each RTT -+ * and send the same number of packets per RTT that an un-paced, slow-starting -+ * Reno or CUBIC flow would. Max allowed value is 2047 (0x7FF). -+ */ -+static int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; -+/* The gain for deriving startup cwnd. Max allowed value is 2047 (0x7FF). */ -+static int bbr_startup_cwnd_gain = BBR_UNIT * 2885 / 1000 + 1; -+/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain -+ * the queue created in BBR_STARTUP in a single round. Max allowed value -+ * is 1023 (0x3FF). -+ */ -+static int bbr_drain_gain = BBR_UNIT * 1000 / 2885; -+/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs. -+ * Max allowed value is 2047 (0x7FF). -+ */ -+static int bbr_cwnd_gain = BBR_UNIT * 2; -+/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw. -+ * Max allowed value for each element is 1023 (0x3FF). -+ */ -+enum bbr_pacing_gain_phase { -+ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ -+ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ -+ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ -+ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ -+}; -+static int bbr_pacing_gain[] = { -+ BBR_UNIT * 5 / 4, /* probe for more available bw */ -+ BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ -+ BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ -+ BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ -+}; -+ -+/* Try to keep at least this many packets in flight, if things go smoothly. For -+ * smooth functioning, a sliding window protocol ACKing every other packet -+ * needs at least 4 packets in flight. Max allowed value is 15 (0xF). -+ */ -+static u32 bbr_cwnd_min_target = 4; -+ -+/* Cwnd to BDP proportion in PROBE_RTT mode scaled by BBR_UNIT. Default: 50%. -+ * Use 0 to disable. Max allowed value is 255. -+ */ -+static u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; -+ -+/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ -+/* If bw has increased significantly (1.25x), there may be more bw available. -+ * Max allowed value is 1023 (0x3FF). -+ */ -+static u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; -+/* But after 3 rounds w/o significant bw growth, estimate pipe is full. -+ * Max allowed value is 7 (0x7). -+ */ -+static u32 bbr_full_bw_cnt = 3; -+ -+static u32 bbr_flags; /* Debugging related stuff */ -+ -+/* Whether to debug using printk. -+ */ -+static bool bbr_debug_with_printk; -+ -+/* Whether to debug using ftrace event tcp:tcp_bbr_event. -+ * Ignored when bbr_debug_with_printk is set. -+ */ -+static bool bbr_debug_ftrace; -+ -+/* Experiment: each cycle, try to hold sub-unity gain until inflight <= BDP. 
*/ -+static bool bbr_drain_to_target = true; /* default: enabled */ -+ -+/* Experiment: Flags to control BBR with ECN behavior. -+ */ -+static bool bbr_precise_ece_ack = true; /* default: enabled */ -+ -+/* The max rwin scaling shift factor is 14 (RFC 1323), so the max sane rwin is -+ * (2^(16+14) B)/(1024 B/packet) = 1M packets. -+ */ -+static u32 bbr_cwnd_warn_val = 1U << 20; -+ -+static u16 bbr_debug_port_mask; -+ -+/* BBR module parameters. These are module parameters only in Google prod. -+ * Upstream these are intentionally not module parameters. -+ */ -+static int bbr_pacing_gain_size = CYCLE_LEN; -+ -+/* Gain factor for adding extra_acked to target cwnd: */ -+static int bbr_extra_acked_gain = 256; -+ -+/* Window length of extra_acked window. Max allowed val is 31. */ -+static u32 bbr_extra_acked_win_rtts = 5; -+ -+/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */ -+static u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; -+ -+/* Time period for clamping cwnd increment due to ack aggregation */ -+static u32 bbr_extra_acked_max_us = 100 * 1000; -+ -+/* Use extra acked in startup ? -+ * 0: disabled -+ * 1: use latest extra_acked value from 1-2 rtt in startup -+ */ -+static int bbr_extra_acked_in_startup = 1; /* default: enabled */ -+ -+/* Experiment: don't grow cwnd beyond twice of what we just probed. */ -+static bool bbr_usage_based_cwnd; /* default: disabled */ -+ -+/* For lab testing, researchers can enable BBRv2 ECN support with this flag, -+ * when they know that any ECN marks that the connections experience will be -+ * DCTCP/L4S-style ECN marks, rather than RFC3168 ECN marks. -+ * TODO(ncardwell): Production use of the BBRv2 ECN functionality depends on -+ * negotiation or configuration that is outside the scope of the BBRv2 -+ * alpha release. 
-+ */ -+static bool bbr_ecn_enable = false; -+ -+module_param_named(min_tso_rate, bbr_min_tso_rate, int, 0644); -+module_param_named(tso_rtt_shift, bbr_tso_rtt_shift, int, 0644); -+module_param_named(high_gain, bbr_high_gain, int, 0644); -+module_param_named(drain_gain, bbr_drain_gain, int, 0644); -+module_param_named(startup_cwnd_gain, bbr_startup_cwnd_gain, int, 0644); -+module_param_named(cwnd_gain, bbr_cwnd_gain, int, 0644); -+module_param_array_named(pacing_gain, bbr_pacing_gain, int, -+ &bbr_pacing_gain_size, 0644); -+module_param_named(cwnd_min_target, bbr_cwnd_min_target, uint, 0644); -+module_param_named(probe_rtt_cwnd_gain, -+ bbr_probe_rtt_cwnd_gain, uint, 0664); -+module_param_named(cwnd_warn_val, bbr_cwnd_warn_val, uint, 0664); -+module_param_named(debug_port_mask, bbr_debug_port_mask, ushort, 0644); -+module_param_named(flags, bbr_flags, uint, 0644); -+module_param_named(debug_ftrace, bbr_debug_ftrace, bool, 0644); -+module_param_named(debug_with_printk, bbr_debug_with_printk, bool, 0644); -+module_param_named(min_rtt_win_sec, bbr_min_rtt_win_sec, uint, 0644); -+module_param_named(probe_rtt_mode_ms, bbr_probe_rtt_mode_ms, uint, 0644); -+module_param_named(probe_rtt_win_ms, bbr_probe_rtt_win_ms, uint, 0644); -+module_param_named(full_bw_thresh, bbr_full_bw_thresh, uint, 0644); -+module_param_named(full_bw_cnt, bbr_full_bw_cnt, uint, 0644); -+module_param_named(cwnd_tso_bduget, bbr_cwnd_tso_budget, uint, 0664); -+module_param_named(extra_acked_gain, bbr_extra_acked_gain, int, 0664); -+module_param_named(extra_acked_win_rtts, -+ bbr_extra_acked_win_rtts, uint, 0664); -+module_param_named(extra_acked_max_us, -+ bbr_extra_acked_max_us, uint, 0664); -+module_param_named(ack_epoch_acked_reset_thresh, -+ bbr_ack_epoch_acked_reset_thresh, uint, 0664); -+module_param_named(drain_to_target, bbr_drain_to_target, bool, 0664); -+module_param_named(precise_ece_ack, bbr_precise_ece_ack, bool, 0664); -+module_param_named(extra_acked_in_startup, -+ bbr_extra_acked_in_startup, int, 0664); -+module_param_named(usage_based_cwnd, bbr_usage_based_cwnd, bool, 0664); -+module_param_named(ecn_enable, bbr_ecn_enable, bool, 0664); -+ -+static void bbr2_exit_probe_rtt(struct sock *sk); -+static void bbr2_reset_congestion_signals(struct sock *sk); -+ -+static void bbr_check_probe_rtt_done(struct sock *sk); -+ -+/* Do we estimate that STARTUP filled the pipe? */ -+static bool bbr_full_bw_reached(const struct sock *sk) -+{ -+ const struct bbr *bbr = inet_csk_ca(sk); -+ -+ return bbr->full_bw_reached; -+} -+ -+/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ -+static u32 bbr_max_bw(const struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ return max(bbr->bw_hi[0], bbr->bw_hi[1]); -+} -+ -+/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ -+static u32 bbr_bw(const struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ return min(bbr_max_bw(sk), bbr->bw_lo); -+} -+ -+/* Return maximum extra acked in past k-2k round trips, -+ * where k = bbr_extra_acked_win_rtts. -+ */ -+static u16 bbr_extra_acked(const struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ return max(bbr->extra_acked[0], bbr->extra_acked[1]); -+} -+ -+/* Return rate in bytes per second, optionally with a gain. -+ * The order here is chosen carefully to avoid overflow of u64. This should -+ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
-+ */ -+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, -+ int margin) -+{ -+ unsigned int mss = tcp_sk(sk)->mss_cache; -+ -+ rate *= mss; -+ rate *= gain; -+ rate >>= BBR_SCALE; -+ rate *= USEC_PER_SEC / 100 * (100 - margin); -+ rate >>= BW_SCALE; -+ rate = max(rate, 1ULL); -+ return rate; -+} -+ -+static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) -+{ -+ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); -+} -+ -+static u64 bbr_rate_kbps(struct sock *sk, u64 rate) -+{ -+ rate = bbr_bw_bytes_per_sec(sk, rate); -+ rate *= 8; -+ do_div(rate, 1000); -+ return rate; -+} -+ -+static u32 bbr_tso_segs_goal(struct sock *sk); -+static void bbr_debug(struct sock *sk, u32 acked, -+ const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ static const char ca_states[] = { -+ [TCP_CA_Open] = 'O', -+ [TCP_CA_Disorder] = 'D', -+ [TCP_CA_CWR] = 'C', -+ [TCP_CA_Recovery] = 'R', -+ [TCP_CA_Loss] = 'L', -+ }; -+ static const char mode[] = { -+ 'G', /* Growing - BBR_STARTUP */ -+ 'D', /* Drain - BBR_DRAIN */ -+ 'W', /* Window - BBR_PROBE_BW */ -+ 'M', /* Min RTT - BBR_PROBE_RTT */ -+ }; -+ static const char ack_phase[] = { /* bbr_ack_phase strings */ -+ 'I', /* BBR_ACKS_INIT - 'Init' */ -+ 'R', /* BBR_ACKS_REFILLING - 'Refilling' */ -+ 'B', /* BBR_ACKS_PROBE_STARTING - 'Before' */ -+ 'F', /* BBR_ACKS_PROBE_FEEDBACK - 'Feedback' */ -+ 'A', /* BBR_ACKS_PROBE_STOPPING - 'After' */ -+ }; -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ const u32 una = tp->snd_una - bbr->debug.snd_isn; -+ const u32 fack = tcp_highest_sack_seq(tp); -+ const u16 dport = ntohs(inet_sk(sk)->inet_dport); -+ bool is_port_match = (bbr_debug_port_mask && -+ ((dport & bbr_debug_port_mask) == 0)); -+ char debugmsg[320]; -+ -+ if (sk->sk_state == TCP_SYN_SENT) -+ return; /* no bbr_init() yet if SYN retransmit -> CA_Loss */ -+ -+ if (!tp->snd_cwnd || tp->snd_cwnd > bbr_cwnd_warn_val) { -+ char addr[INET6_ADDRSTRLEN + 10] = { 0 }; -+ -+ if (sk->sk_family == AF_INET) -+ snprintf(addr, sizeof(addr), "%pI4:%u", -+ &inet_sk(sk)->inet_daddr, dport); -+ else if (sk->sk_family == AF_INET6) -+ snprintf(addr, sizeof(addr), "%pI6:%u", -+ &sk->sk_v6_daddr, dport); -+ -+ WARN_ONCE(1, -+ "BBR %s cwnd alert: %u " -+ "snd_una: %u ca: %d pacing_gain: %u cwnd_gain: %u " -+ "bw: %u rtt: %u min_rtt: %u " -+ "acked: %u tso_segs: %u " -+ "bw: %d %ld %d pif: %u\n", -+ addr, tp->snd_cwnd, -+ una, inet_csk(sk)->icsk_ca_state, -+ bbr->pacing_gain, bbr->cwnd_gain, -+ bbr_max_bw(sk), (tp->srtt_us >> 3), bbr->min_rtt_us, -+ acked, bbr_tso_segs_goal(sk), -+ rs->delivered, rs->interval_us, rs->is_retrans, -+ tcp_packets_in_flight(tp)); -+ } -+ -+ if (likely(!bbr_debug_with_printk && !bbr_debug_ftrace)) -+ return; -+ -+ if (!sock_flag(sk, SOCK_DBG) && !is_port_match) -+ return; -+ -+ if (!ctx->log && !tp->app_limited && !(bbr_flags & FLAG_DEBUG_VERBOSE)) -+ return; -+ -+ if (ipv4_is_loopback(inet_sk(sk)->inet_daddr) && -+ !(bbr_flags & FLAG_DEBUG_LOOPBACK)) -+ return; -+ -+ snprintf(debugmsg, sizeof(debugmsg) - 1, -+ "BBR %pI4:%-5u %5u,%03u:%-7u %c " -+ "%c %2u br %2u cr %2d rtt %5ld d %2d i %5ld mrtt %d %cbw %llu " -+ "bw %llu lb %llu ib %llu qb %llu " -+ "a %u if %2u %c %c dl %u l %u al %u # %u t %u %c %c " -+ "lr %d er %d ea %d bwl %lld il %d ih %d c %d " -+ "v %d %c %u %c %s\n", -+ &inet_sk(sk)->inet_daddr, dport, -+ una / 1000, una % 1000, fack - tp->snd_una, -+ ca_states[inet_csk(sk)->icsk_ca_state], -+ bbr->debug.undo ? 
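
A user-space restatement of the rate conversion above, keeping the same multiply-then-shift ordering; BW_SCALE = 24 and BBR_SCALE = 8 are assumptions matching mainline, and the example rate is illustrative only:

#include <stdio.h>
#include <stdint.h>

#define BW_SCALE   24               /* assumed: bw is pkts/usec << 24      */
#define BBR_SCALE  8                /* assumed: gains are scaled by 1 << 8 */
#define BBR_UNIT   (1 << BBR_SCALE)
#define USEC_PER_SEC 1000000ULL

/* Multiply before shifting to keep precision, but shift early enough that
 * u64 cannot overflow for realistic rates and gains.
 */
static uint64_t rate_bytes_per_sec(uint64_t bw, unsigned int mss,
                                   int gain, int margin)
{
        uint64_t rate = bw;

        rate *= mss;                    /* packets -> bytes            */
        rate *= gain;                   /* apply gain (BBR_UNIT = 1.0) */
        rate >>= BBR_SCALE;             /* drop the gain scaling       */
        rate *= USEC_PER_SEC / 100 * (100 - margin); /* per usec -> per sec */
        rate >>= BW_SCALE;              /* drop the bw scaling         */
        return rate ? rate : 1;
}

int main(void)
{
        unsigned int mss = 1500;
        uint64_t pkts_per_sec = 83333;  /* ~1 Gbit/s of 1500 B packets */
        uint64_t bw = pkts_per_sec * (1ULL << BW_SCALE) / USEC_PER_SEC;

        printf("pacing rate: %llu bytes/sec\n",
               (unsigned long long)rate_bytes_per_sec(bw, mss, BBR_UNIT, 1));
        return 0;
}
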
'@' : mode[bbr->mode], -+ tp->snd_cwnd, -+ bbr_extra_acked(sk), /* br (legacy): extra_acked */ -+ rs->tx_in_flight, /* cr (legacy): tx_inflight */ -+ rs->rtt_us, -+ rs->delivered, -+ rs->interval_us, -+ bbr->min_rtt_us, -+ rs->is_app_limited ? '_' : 'l', -+ bbr_rate_kbps(sk, ctx->sample_bw), /* lbw: latest sample bw */ -+ bbr_rate_kbps(sk, bbr_max_bw(sk)), /* bw: max bw */ -+ 0ULL, /* lb: [obsolete] */ -+ 0ULL, /* ib: [obsolete] */ -+ div_u64((u64)sk->sk_pacing_rate * 8, 1000), -+ acked, -+ tcp_packets_in_flight(tp), -+ rs->is_ack_delayed ? 'd' : '.', -+ bbr->round_start ? '*' : '.', -+ tp->delivered, tp->lost, -+ tp->app_limited, -+ 0, /* #: [obsolete] */ -+ ctx->target_cwnd, -+ tp->reord_seen ? 'r' : '.', /* r: reordering seen? */ -+ ca_states[bbr->prev_ca_state], -+ (rs->lost + rs->delivered) > 0 ? -+ (1000 * rs->lost / -+ (rs->lost + rs->delivered)) : 0, /* lr: loss rate x1000 */ -+ (rs->delivered) > 0 ? -+ (1000 * rs->delivered_ce / -+ (rs->delivered)) : 0, /* er: ECN rate x1000 */ -+ 1000 * bbr->ecn_alpha >> BBR_SCALE, /* ea: ECN alpha x1000 */ -+ bbr->bw_lo == ~0U ? -+ -1 : (s64)bbr_rate_kbps(sk, bbr->bw_lo), /* bwl */ -+ bbr->inflight_lo, /* il */ -+ bbr->inflight_hi, /* ih */ -+ bbr->bw_probe_up_cnt, /* c */ -+ 2, /* v: version */ -+ bbr->debug.event, -+ bbr->cycle_idx, -+ ack_phase[bbr->ack_phase], -+ bbr->bw_probe_samples ? "Y" : "N"); -+ debugmsg[sizeof(debugmsg) - 1] = 0; -+ -+ /* printk takes a higher precedence. */ -+ if (bbr_debug_with_printk) -+ printk(KERN_DEBUG "%s", debugmsg); -+ -+ if (unlikely(bbr->debug.undo)) -+ bbr->debug.undo = 0; -+} -+ -+/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ -+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) -+{ -+ u64 rate = bw; -+ -+ rate = bbr_rate_bytes_per_sec(sk, rate, gain, -+ bbr_pacing_margin_percent); -+ rate = min_t(u64, rate, sk->sk_max_pacing_rate); -+ return rate; -+} -+ -+/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ -+static void bbr_init_pacing_rate_from_rtt(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u64 bw; -+ u32 rtt_us; -+ -+ if (tp->srtt_us) { /* any RTT sample yet? */ -+ rtt_us = max(tp->srtt_us >> 3, 1U); -+ bbr->has_seen_rtt = 1; -+ } else { /* no RTT sample yet */ -+ rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ -+ } -+ bw = (u64)tp->snd_cwnd * BW_UNIT; -+ do_div(bw, rtt_us); -+ sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr->params.high_gain); -+} -+ -+/* Pace using current bw estimate and a gain factor. */ -+static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); -+ -+ if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) -+ bbr_init_pacing_rate_from_rtt(sk); -+ if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) -+ sk->sk_pacing_rate = rate; -+} -+ -+static u32 bbr_min_tso_segs(struct sock *sk) -+{ -+ return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; -+} -+ -+/* Return the number of segments BBR would like in a TSO/GSO skb, given -+ * a particular max gso size as a constraint. -+ */ -+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, -+ u32 gso_max_size) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 segs, r; -+ u64 bytes; -+ -+ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). 
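
The initial pacing rate logic above amounts to "one cwnd per RTT, scaled by the startup gain". A rough stand-alone sketch in plain bytes per second (the names and the divide-by-256 gain scaling are assumptions of this sketch):

#include <stdio.h>
#include <stdint.h>

/* One cwnd of data per smoothed RTT, scaled up by the startup gain. */
static uint64_t init_pacing_rate(uint32_t cwnd_pkts, uint32_t mss,
                                 uint32_t srtt_us, uint32_t gain_x256)
{
        uint64_t bytes_per_rtt = (uint64_t)cwnd_pkts * mss;

        return bytes_per_rtt * 1000000ULL / srtt_us * gain_x256 / 256;
}

int main(void)
{
        /* 10 packets of 1448 B payload over a 20 ms RTT, gain ~2.89x */
        printf("%llu bytes/sec\n",
               (unsigned long long)init_pacing_rate(10, 1448, 20000, 739));
        return 0;
}
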
*/ -+ bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; -+ -+ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every -+ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. -+ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) -+ */ -+ if (bbr->params.tso_rtt_shift) { -+ r = bbr->min_rtt_us >> bbr->params.tso_rtt_shift; -+ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ -+ bytes += GSO_MAX_SIZE >> r; -+ } -+ -+ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); -+ segs = max_t(u32, div_u64(bytes, mss_now), bbr_min_tso_segs(sk)); -+ return segs; -+} -+ -+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ -+static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) -+{ -+ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); -+} -+ -+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ -+static u32 bbr_tso_segs_goal(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); -+} -+ -+/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ -+static void bbr_save_cwnd(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) -+ bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ -+ else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ -+ bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); -+} -+ -+static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (event == CA_EVENT_TX_START && tp->app_limited) { -+ bbr->idle_restart = 1; -+ bbr->ack_epoch_mstamp = tp->tcp_mstamp; -+ bbr->ack_epoch_acked = 0; -+ /* Avoid pointless buffer overflows: pace at est. bw if we don't -+ * need more speed (we're restarting from idle and app-limited). -+ */ -+ if (bbr->mode == BBR_PROBE_BW) -+ bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); -+ else if (bbr->mode == BBR_PROBE_RTT) -+ bbr_check_probe_rtt_done(sk); +@@ -333,7 +533,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -344,6 +546,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); + } else if ((event == CA_EVENT_ECN_IS_CE || + event == CA_EVENT_ECN_NO_CE) && -+ bbr_ecn_enable && -+ bbr->params.precise_ece_ack) { ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { + u32 state = bbr->ce_state; + dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); + bbr->ce_state = state; -+ if (tp->fast_ack_mode == 2 && event == CA_EVENT_ECN_IS_CE) -+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); -+ } -+} -+ -+/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: -+ * -+ * bdp = ceil(bw * min_rtt * gain) -+ * -+ * The key factor, gain, controls the amount of queue. 
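
Putting the two TSO budget terms above together, a hedged sketch of the resulting segment goal: 65536 stands in for GSO_MAX_SIZE, 320 is a rough stand-in for MAX_TCP_HEADER, and the shift of 10 mirrors the usual sk_pacing_shift default; none of these constants are taken from this patch.

#include <stdio.h>
#include <stdint.h>

static uint32_t tso_segs_budget(uint64_t pacing_rate_Bps, uint32_t min_rtt_us,
                                uint32_t mss, uint32_t min_segs)
{
        uint64_t bytes = pacing_rate_Bps >> 10;        /* ~1/1024 sec of data */
        uint32_t halvings = min_rtt_us >> 9;           /* per 512 us of RTT   */

        if (halvings < 32)
                bytes += 65536u >> halvings;           /* RTT-based allowance */
        if (bytes > 65536 - 1 - 320)                   /* stay under 64 KB GSO */
                bytes = 65536 - 1 - 320;
        bytes /= mss;
        return bytes > min_segs ? (uint32_t)bytes : min_segs;
}

int main(void)
{
        printf("LAN (200 us, 10 Gb/s): %u segs\n",
               tso_segs_budget(1250000000ULL, 200, 1448, 2));
        printf("WAN (50 ms, 50 Mb/s):  %u segs\n",
               tso_segs_budget(6250000ULL, 50000, 1448, 2));
        return 0;
}
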
While a small gain -+ * builds a smaller queue, it becomes more vulnerable to noise in RTT -+ * measurements (e.g., delayed ACKs or other ACK compression effects). This -+ * noise may cause BBR to under-estimate the rate. -+ */ -+static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 bdp; -+ u64 w; -+ -+ /* If we've never had a valid RTT sample, cap cwnd at the initial -+ * default. This should only happen when the connection is not using TCP -+ * timestamps and has retransmitted all of the SYN/SYNACK/data packets -+ * ACKed so far. In this case, an RTO can cut cwnd to 1, in which ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -366,10 +578,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. + * case we need to slow-start up toward something safe: initial cwnd. -+ */ -+ if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ + return bbr->init_cwnd; /* be safe: cap at initial cwnd */ -+ -+ w = (u64)bw * bbr->min_rtt_us; -+ -+ /* Apply a gain to the given value, remove the BW_SCALE shift, and -+ * round the value up to avoid a negative feedback loop. -+ */ -+ bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; -+ -+ return bdp; -+} -+ -+/* To achieve full performance in high-speed paths, we budget enough cwnd to -+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path: -+ * - one skb in sending host Qdisc, -+ * - one skb in sending host TSO/GSO engine -+ * - one skb being received by receiver host LRO/GRO/delayed-ACK engine -+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because -+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, -+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe -+ * full even with ACK-every-other-packet delayed ACKs. -+ */ -+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); + + w = (u64)bw * bbr->min_rtt_us; + +@@ -386,23 +598,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); + u32 tso_segs_goal; -+ + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. 
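
The BDP computation above in isolation, with the same rounding-up trick; BW_SCALE = 24 and the 256 gain unit are assumed here to match mainline:

#include <stdio.h>
#include <stdint.h>

#define BW_SCALE  24                  /* assumed: bw is pkts/usec << 24 */
#define BW_UNIT   (1ULL << BW_SCALE)
#define BBR_SCALE 8                   /* assumed: gain of 1.0 == 256    */

/* bdp = ceil(bw * min_rtt * gain), mirroring the fixed-point rounding above */
static uint32_t bdp_packets(uint64_t bw, uint32_t min_rtt_us, int gain)
{
        uint64_t w = bw * min_rtt_us;

        return (uint32_t)((((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT);
}

int main(void)
{
        /* ~0.0833 pkts/usec (1 Gbit/s of 1500 B packets) over a 20 ms path */
        uint64_t bw = 83333 * BW_UNIT / 1000000;

        printf("bdp x1.0 = %u pkts\n", bdp_packets(bw, 20000, 256));
        printf("bdp x2.0 = %u pkts\n", bdp_packets(bw, 20000, 512));
        return 0;
}
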
*/ +- cwnd = (cwnd + 1) & ~1U; + tso_segs_goal = 3 * bbr_tso_segs_goal(sk); -+ + + /* Allow enough full-sized skbs in flight to utilize end systems. */ -+ if (bbr->params.cwnd_tso_budget == 1) { -+ cwnd = max_t(u32, cwnd, tso_segs_goal); -+ cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); -+ } else { -+ cwnd += tso_segs_goal; -+ cwnd = (cwnd + 1) & ~1U; -+ } -+ /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) -+ cwnd += 2; -+ -+ return cwnd; -+} -+ -+/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */ -+static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) -+{ -+ u32 inflight; -+ -+ inflight = bbr_bdp(sk, bw, gain); -+ inflight = bbr_quantization_budget(sk, inflight); -+ -+ return inflight; -+} -+ -+/* With pacing at lower layers, there's often less data "in the network" than -+ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), -+ * we often have several skbs queued in the pacing layer with a pre-scheduled -+ * earliest departure time (EDT). BBR adapts its pacing rate based on the -+ * inflight level that it estimates has already been "baked in" by previous -+ * departure time decisions. We calculate a rough estimate of the number of our -+ * packets that might be in the network at the earliest departure time for the -+ * next skb scheduled: -+ * in_network_at_edt = inflight_at_edt - (EDT - now) * bw -+ * If we're increasing inflight, then we want to know if the transmit of the -+ * EDT skb will push inflight above the target, so inflight_at_edt includes -+ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight, -+ * then estimate if inflight will sink too low just before the EDT transmit. 
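
A small sketch of the in_network_at_edt estimate described above, using floating-point bandwidth for readability (the kernel code keeps everything in fixed point; the names here are illustrative):

#include <stdio.h>
#include <stdint.h>

/* in_network_at_edt = inflight_at_edt - (EDT - now) * bw */
static unsigned int packets_in_net_at_edt(unsigned int inflight_now,
                                          unsigned int tso_burst,
                                          uint64_t edt_minus_now_us,
                                          double bw_pkts_per_us,
                                          int increasing_inflight)
{
        unsigned int inflight_at_edt = inflight_now;
        uint64_t drained = (uint64_t)(bw_pkts_per_us * edt_minus_now_us);

        if (increasing_inflight)            /* include the skb leaving at EDT */
                inflight_at_edt += tso_burst;
        if (drained >= inflight_at_edt)
                return 0;
        return inflight_at_edt - (unsigned int)drained;
}

int main(void)
{
        /* 100 pkts in flight, a 45-segment burst scheduled 500 us ahead,
         * path draining 0.08 pkt/us
         */
        printf("%u pkts\n", packets_in_net_at_edt(100, 45, 500, 0.08, 1));
        return 0;
}
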
-+ */ -+static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u64 now_ns, edt_ns, interval_us; -+ u32 interval_delivered, inflight_at_edt; -+ -+ now_ns = tp->tcp_clock_cache; -+ edt_ns = max(tp->tcp_wstamp_ns, now_ns); -+ interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC); -+ interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE; -+ inflight_at_edt = inflight_now; -+ if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */ -+ inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */ -+ if (interval_delivered >= inflight_at_edt) -+ return 0; -+ return inflight_at_edt - interval_delivered; -+} -+ -+/* Find the cwnd increment based on estimate of ack aggregation */ -+static u32 bbr_ack_aggregation_cwnd(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 max_aggr_cwnd, aggr_cwnd = 0; -+ -+ if (bbr->params.extra_acked_gain && -+ (bbr_full_bw_reached(sk) || bbr->params.extra_acked_in_startup)) { -+ max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) -+ / BW_UNIT; -+ aggr_cwnd = (bbr->params.extra_acked_gain * bbr_extra_acked(sk)) -+ >> BBR_SCALE; -+ aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); -+ } -+ -+ return aggr_cwnd; -+} -+ + cwnd += 2; + + return cwnd; +@@ -457,10 +669,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -468,66 +680,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) +/* Returns the cwnd for PROBE_RTT mode. */ +static u32 bbr_probe_rtt_cwnd(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr->params.probe_rtt_cwnd_gain == 0) -+ return bbr->params.cwnd_min_target; -+ return max_t(u32, bbr->params.cwnd_min_target, -+ bbr_bdp(sk, bbr_bw(sk), bbr->params.probe_rtt_cwnd_gain)); -+} -+ -+/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss -+ * has drawn us down below target), or snap down to target if we're above it. -+ */ -+static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. 
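
The ack-aggregation allowance above, restated as a stand-alone helper: gain/256 of the measured extra_acked, capped at 100 ms worth of data at the estimated bandwidth (plain units and illustrative names, not the kernel's own types):

#include <stdio.h>
#include <stdint.h>

static uint32_t ack_aggregation_cwnd(double bw_pkts_per_us,
                                     uint32_t extra_acked_pkts,
                                     uint32_t gain_x256)
{
        uint32_t max_aggr = (uint32_t)(bw_pkts_per_us * 100000); /* 100 ms cap */
        uint32_t aggr = (uint32_t)((uint64_t)gain_x256 * extra_acked_pkts >> 8);

        return aggr < max_aggr ? aggr : max_aggr;
}

int main(void)
{
        /* wifi-like aggregation: 60 extra packets seen on a 0.02 pkt/us path */
        printf("extra cwnd = %u pkts\n", ack_aggregation_cwnd(0.02, 60, 256));
        return 0;
}
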
+- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. + */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) + u32 acked, u32 bw, int gain, u32 cwnd, + struct bbr_context *ctx) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 target_cwnd = 0, prev_cwnd = tp->snd_cwnd, max_probe; -+ -+ if (!acked) -+ goto done; /* no packet fully ACKed; just apply caps */ -+ -+ target_cwnd = bbr_bdp(sk, bw, gain); -+ -+ /* Increment the cwnd to account for excess ACKed data that seems -+ * due to aggregation (of data and/or ACKs) visible in the ACK stream. -+ */ -+ target_cwnd += bbr_ack_aggregation_cwnd(sk); -+ target_cwnd = bbr_quantization_budget(sk, target_cwnd); -+ -+ /* If we're below target cwnd, slow start cwnd toward target cwnd. */ -+ bbr->debug.target_cwnd = target_cwnd; -+ + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -536,74 +709,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); + /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ + bbr->try_fast_path = 0; + if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ @@ -1285,52 +1032,279 @@ index 000000000000..85f8052144d1 + } else { + bbr->try_fast_path = 1; + } -+ -+ /* When growing cwnd, don't grow beyond twice what we just probed. 
*/ -+ if (bbr->params.usage_based_cwnd) { -+ max_probe = max(2 * tp->max_packets_out, tp->snd_cwnd); -+ cwnd = min(cwnd, max_probe); -+ } -+ -+ cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); -+done: -+ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ -+ if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ -+ tp->snd_cwnd = min_t(u32, tp->snd_cwnd, bbr_probe_rtt_cwnd(sk)); -+ -+ ctx->target_cwnd = target_cwnd; -+ ctx->log = (tp->snd_cwnd != prev_cwnd); -+} -+ -+/* See if we have reached next round trip */ -+static void bbr_update_round_start(struct sock *sk, + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. +- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. 
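
For the gain-cycling code that the hunk above removes, a compact sketch of the phase-advance decision it implemented: cruise phases advance on the clock alone, a gain above 1.0 also needs the queue (or losses) to build, and a gain below 1.0 can end early once inflight drains back to the BDP. Names, units, and the 256 scaling are illustrative.

#include <stdio.h>
#include <stdint.h>

static int next_cycle_phase(uint32_t elapsed_us, uint32_t min_rtt_us,
                            uint32_t inflight, uint32_t bdp,
                            uint32_t pacing_gain_x256, int saw_loss)
{
        int full_length = elapsed_us > min_rtt_us;

        if (pacing_gain_x256 == 256)            /* cruise: just use the clock */
                return full_length;
        if (pacing_gain_x256 > 256)             /* probing up: need the queue */
                return full_length &&
                       (saw_loss || inflight >= bdp * pacing_gain_x256 / 256);
        return full_length || inflight <= bdp;  /* draining: stop once empty  */
}

int main(void)
{
        printf("probe up, queue built: %d\n",
               next_cycle_phase(25000, 20000, 2100, 1667, 320, 0));
        printf("drain, still too full: %d\n",
               next_cycle_phase(5000, 20000, 1900, 1667, 192, 0));
        return 0;
}
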
*/ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -613,191 +738,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. + */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? 
*/ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. */ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, + const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->round_start = 0; -+ -+ /* See if we've reached the next RTT */ + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { + if (rs->interval_us > 0 && + !before(rs->prior_delivered, bbr->next_rtt_delivered)) { -+ bbr->next_rtt_delivered = tp->delivered; -+ bbr->round_start = 1; -+ } ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; +} -+ + +- bbr_lt_bw_sampling(sk, rs); +/* Calculate the bandwidth based on how fast packets are delivered */ +static void bbr_calculate_bw_sample(struct sock *sk, + const struct rate_sample *rs, struct bbr_context *ctx) +{ -+ struct bbr *bbr = inet_csk_ca(sk); + u64 bw = 0; -+ -+ /* 
Divide delivered by the interval to find a (lower bound) bottleneck -+ * bandwidth sample. Delivered is in packets and interval_us in uS and -+ * ratio will be <<1 for most connections. So delivered is first scaled. + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. + * Round up to allow growth at low rates, even with integer division. -+ */ + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); + if (rs->interval_us > 0) { + if (WARN_ONCE(rs->delivered < 0, + "negative delivered: %d interval_us: %ld\n", @@ -1338,362 +1312,289 @@ index 000000000000..85f8052144d1 + return; + + bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); -+ } + } + + ctx->sample_bw = bw; -+ bbr->debug.rs_bw = bw; -+} -+ -+/* Estimates the windowed max degree of ack aggregation. -+ * This is used to provision extra in-flight data to keep sending during -+ * inter-ACK silences. -+ * -+ * Degree of ack aggregation is estimated as extra data acked beyond expected. -+ * -+ * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval" -+ * cwnd += max_extra_acked -+ * -+ * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). -+ * Max filter is an approximate sliding window of 5-10 (packet timed) round + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -811,7 +794,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. + * trips for non-startup phase, and 1-2 round trips for startup. 
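
The rounded-up bandwidth sample described above, as a short user-space helper (BW_SCALE = 24 assumed, matching mainline; the round-up keeps tiny flows from getting stuck at a zero estimate):

#include <stdio.h>
#include <stdint.h>

#define BW_SCALE 24                   /* assumed: bw stored as pkts/usec << 24 */
#define BW_UNIT  (1ULL << BW_SCALE)

static uint64_t bw_sample(uint32_t delivered_pkts, uint32_t interval_us)
{
        return (delivered_pkts * BW_UNIT + interval_us - 1) / interval_us;
}

int main(void)
{
        uint64_t bw = bw_sample(10, 20000);   /* 10 pkts ACKed over 20 ms */

        printf("bw = %llu (internal) = %.6f pkts/usec\n",
               (unsigned long long)bw, (double)bw / BW_UNIT);
        return 0;
}
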
-+ */ -+static void bbr_update_ack_aggregation(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ u32 epoch_us, expected_acked, extra_acked; -+ struct bbr *bbr = inet_csk_ca(sk); -+ struct tcp_sock *tp = tcp_sk(sk); -+ u32 extra_acked_win_rtts_thresh = bbr->params.extra_acked_win_rtts; -+ -+ if (!bbr->params.extra_acked_gain || rs->acked_sacked <= 0 || -+ rs->delivered < 0 || rs->interval_us <= 0) -+ return; -+ -+ if (bbr->round_start) { -+ bbr->extra_acked_win_rtts = min(0x1F, -+ bbr->extra_acked_win_rtts + 1); -+ if (bbr->params.extra_acked_in_startup && -+ !bbr_full_bw_reached(sk)) + */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -819,15 +802,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) + extra_acked_win_rtts_thresh = 1; + if (bbr->extra_acked_win_rtts >= + extra_acked_win_rtts_thresh) { -+ bbr->extra_acked_win_rtts = 0; -+ bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? -+ 0 : 1; -+ bbr->extra_acked[bbr->extra_acked_win_idx] = 0; -+ } -+ } -+ -+ /* Compute how many packets we expected to be delivered over epoch. */ -+ epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp, -+ bbr->ack_epoch_mstamp); -+ expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT; -+ -+ /* Reset the aggregation epoch if ACK rate is below expected rate or -+ * significantly large no. of ack received since epoch (potentially -+ * quite old epoch). -+ */ -+ if (bbr->ack_epoch_acked <= expected_acked || -+ (bbr->ack_epoch_acked + rs->acked_sacked >= -+ bbr_ack_epoch_acked_reset_thresh)) { -+ bbr->ack_epoch_acked = 0; -+ bbr->ack_epoch_mstamp = tp->delivered_mstamp; -+ expected_acked = 0; -+ } -+ -+ /* Compute excess data delivered, beyond what was expected. */ -+ bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, -+ bbr->ack_epoch_acked + rs->acked_sacked); -+ extra_acked = bbr->ack_epoch_acked - expected_acked; -+ extra_acked = min(extra_acked, tp->snd_cwnd); -+ if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) -+ bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; -+} -+ -+/* Estimate when the pipe is full, using the change in delivery rate: BBR -+ * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by -+ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited -+ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the -+ * higher rwin, 3: we get higher delivery rate samples. Or transient -+ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar -+ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
-+ */ -+static void bbr_check_full_bw_reached(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 bw_thresh; -+ -+ if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) -+ return; -+ -+ bw_thresh = (u64)bbr->full_bw * bbr->params.full_bw_thresh >> BBR_SCALE; -+ if (bbr_max_bw(sk) >= bw_thresh) { -+ bbr->full_bw = bbr_max_bw(sk); -+ bbr->full_bw_cnt = 0; -+ return; -+ } -+ ++bbr->full_bw_cnt; -+ bbr->full_bw_reached = bbr->full_bw_cnt >= bbr->params.full_bw_cnt; -+} -+ -+/* If pipe is probably full, drain the queue and then enter steady-state. */ -+static bool bbr_check_drain(struct sock *sk, const struct rate_sample *rs, -+ struct bbr_context *ctx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { -+ bbr->mode = BBR_DRAIN; /* drain queue we created */ -+ tcp_sk(sk)->snd_ssthresh = -+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); -+ bbr2_reset_congestion_signals(sk); -+ } /* fall through to check if in-flight is already small: */ -+ if (bbr->mode == BBR_DRAIN && -+ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= -+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) -+ return true; /* exiting DRAIN now */ -+ return false; -+} -+ -+static void bbr_check_probe_rtt_done(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (!(bbr->probe_rtt_done_stamp && -+ after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) -+ return; -+ + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -861,49 +848,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -913,9 +857,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ + bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ -+ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); -+ bbr2_exit_probe_rtt(sk); -+} -+ -+/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and -+ * periodically drain the bottleneck queue, to converge to measure the true -+ * min_rtt (unloaded propagation delay). This allows the flows to keep queues -+ * small (reducing queuing delay and packet loss) and achieve fairness among -+ * BBR flows. -+ * -+ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, -+ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. -+ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed -+ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and -+ * re-enter the previous mode. BBR uses 200ms to approximately bound the -+ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). -+ * -+ * Note that flows need only pay 2% if they are busy sending over the last 10 -+ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have -+ * natural silences or low-rate periods within 10 seconds where the rate is low -+ * enough for long enough to drain its queue in the bottleneck. We pick up -+ * these min RTT measurements opportunistically with our min_rtt filter. :-) -+ */ -+static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -941,23 +885,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; + bool probe_rtt_expired, min_rtt_expired; + u32 expire; -+ + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); + /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. 
*/ + expire = bbr->probe_rtt_min_stamp + -+ msecs_to_jiffies(bbr->params.probe_rtt_win_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); + probe_rtt_expired = after(tcp_jiffies32, expire); -+ if (rs->rtt_us >= 0 && -+ (rs->rtt_us <= bbr->probe_rtt_min_us || + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || + (probe_rtt_expired && !rs->is_ack_delayed))) { + bbr->probe_rtt_min_us = rs->rtt_us; + bbr->probe_rtt_min_stamp = tcp_jiffies32; + } + /* Track min RTT seen in the min_rtt_win_sec filter window: */ -+ expire = bbr->min_rtt_stamp + bbr->params.min_rtt_win_sec * HZ; ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; + min_rtt_expired = after(tcp_jiffies32, expire); + if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || + min_rtt_expired) { + bbr->min_rtt_us = bbr->probe_rtt_min_us; + bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; -+ } -+ -+ if (bbr->params.probe_rtt_mode_ms > 0 && probe_rtt_expired && -+ !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { -+ bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ -+ bbr_save_cwnd(sk); /* note cwnd so we can restore it */ -+ bbr->probe_rtt_done_stamp = 0; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; + bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; + bbr->next_rtt_delivered = tp->delivered; -+ } -+ -+ if (bbr->mode == BBR_PROBE_RTT) { -+ /* Ignore low rate samples during this mode. */ -+ tp->app_limited = -+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; -+ /* Maintain min packets in flight for max(200 ms, 1 round). */ -+ if (!bbr->probe_rtt_done_stamp && + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -966,9 +922,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
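
A sketch of the two-stage min-RTT tracking introduced above: a short window decides when to schedule the next PROBE_RTT, and its result is folded into the long-lived min_rtt. The 5 s and 10 s windows and the plain millisecond timestamps are assumptions of this sketch, not values lifted from the patch.

#include <stdio.h>
#include <stdint.h>

struct min_rtt_state {
        uint32_t probe_rtt_min_us, probe_rtt_min_stamp_ms;
        uint32_t min_rtt_us, min_rtt_stamp_ms;
};

static void update_min_rtt(struct min_rtt_state *s, uint32_t rtt_us,
                           uint32_t now_ms)
{
        int probe_win_expired = now_ms > s->probe_rtt_min_stamp_ms + 5000;
        int min_win_expired = now_ms > s->min_rtt_stamp_ms + 10000;

        if (rtt_us < s->probe_rtt_min_us || probe_win_expired) {
                s->probe_rtt_min_us = rtt_us;
                s->probe_rtt_min_stamp_ms = now_ms;
        }
        if (s->probe_rtt_min_us <= s->min_rtt_us || min_win_expired) {
                s->min_rtt_us = s->probe_rtt_min_us;
                s->min_rtt_stamp_ms = s->probe_rtt_min_stamp_ms;
        }
}

int main(void)
{
        struct min_rtt_state s = { 30000, 0, 30000, 0 };

        update_min_rtt(&s, 25000, 1000);   /* lower RTT: adopted immediately  */
        update_min_rtt(&s, 40000, 7000);   /* short window expired: refreshed */
        printf("probe_rtt_min=%u us, min_rtt=%u us\n",
               s.probe_rtt_min_us, s.min_rtt_us);
        return 0;
}
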
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { + tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { -+ bbr->probe_rtt_done_stamp = tcp_jiffies32 + -+ msecs_to_jiffies(bbr->params.probe_rtt_mode_ms); -+ bbr->probe_rtt_round_done = 0; -+ bbr->next_rtt_delivered = tp->delivered; -+ } else if (bbr->probe_rtt_done_stamp) { -+ if (bbr->round_start) -+ bbr->probe_rtt_round_done = 1; -+ if (bbr->probe_rtt_round_done) -+ bbr_check_probe_rtt_done(sk); -+ } -+ } -+ /* Restart after idle ends only once we process a new S/ACK for data */ -+ if (rs->delivered > 0) -+ bbr->idle_restart = 0; -+} -+ -+static void bbr_update_gains(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ switch (bbr->mode) { -+ case BBR_STARTUP: -+ bbr->pacing_gain = bbr->params.high_gain; -+ bbr->cwnd_gain = bbr->params.startup_cwnd_gain; -+ break; -+ case BBR_DRAIN: -+ bbr->pacing_gain = bbr->params.drain_gain; /* slow, to drain */ -+ bbr->cwnd_gain = bbr->params.startup_cwnd_gain; /* keep cwnd */ -+ break; -+ case BBR_PROBE_BW: -+ bbr->pacing_gain = bbr->params.pacing_gain[bbr->cycle_idx]; -+ bbr->cwnd_gain = bbr->params.cwnd_gain; -+ break; -+ case BBR_PROBE_RTT: -+ bbr->pacing_gain = BBR_UNIT; -+ bbr->cwnd_gain = BBR_UNIT; -+ break; -+ default: -+ WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode); -+ break; -+ } -+} -+ -+static void bbr_init(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ int i; -+ -+ WARN_ON_ONCE(tp->snd_cwnd >= bbr_cwnd_warn_val); -+ -+ bbr->initialized = 1; -+ bbr->params.high_gain = min(0x7FF, bbr_high_gain); -+ bbr->params.drain_gain = min(0x3FF, bbr_drain_gain); -+ bbr->params.startup_cwnd_gain = min(0x7FF, bbr_startup_cwnd_gain); -+ bbr->params.cwnd_gain = min(0x7FF, bbr_cwnd_gain); -+ bbr->params.cwnd_tso_budget = min(0x1U, bbr_cwnd_tso_budget); -+ bbr->params.cwnd_min_target = min(0xFU, bbr_cwnd_min_target); -+ bbr->params.min_rtt_win_sec = min(0x1FU, bbr_min_rtt_win_sec); -+ bbr->params.probe_rtt_mode_ms = min(0x1FFU, bbr_probe_rtt_mode_ms); -+ bbr->params.full_bw_cnt = min(0x7U, bbr_full_bw_cnt); -+ bbr->params.full_bw_thresh = min(0x3FFU, bbr_full_bw_thresh); -+ bbr->params.extra_acked_gain = min(0x7FF, bbr_extra_acked_gain); -+ bbr->params.extra_acked_win_rtts = min(0x1FU, bbr_extra_acked_win_rtts); -+ bbr->params.drain_to_target = bbr_drain_to_target ? 1 : 0; -+ bbr->params.precise_ece_ack = bbr_precise_ece_ack ? 1 : 0; -+ bbr->params.extra_acked_in_startup = bbr_extra_acked_in_startup ? 1 : 0; -+ bbr->params.probe_rtt_cwnd_gain = min(0xFFU, bbr_probe_rtt_cwnd_gain); -+ bbr->params.probe_rtt_win_ms = -+ min(0x3FFFU, -+ min_t(u32, bbr_probe_rtt_win_ms, -+ bbr->params.min_rtt_win_sec * MSEC_PER_SEC)); -+ for (i = 0; i < CYCLE_LEN; i++) -+ bbr->params.pacing_gain[i] = min(0x3FF, bbr_pacing_gain[i]); -+ bbr->params.usage_based_cwnd = bbr_usage_based_cwnd ? 
1 : 0; -+ bbr->params.tso_rtt_shift = min(0xFU, bbr_tso_rtt_shift); -+ -+ bbr->debug.snd_isn = tp->snd_una; -+ bbr->debug.target_cwnd = 0; -+ bbr->debug.undo = 0; -+ -+ bbr->init_cwnd = min(0x7FU, tp->snd_cwnd); -+ bbr->prior_cwnd = tp->prior_cwnd; -+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; -+ bbr->next_rtt_delivered = 0; -+ bbr->prev_ca_state = TCP_CA_Open; -+ bbr->packet_conservation = 0; -+ -+ bbr->probe_rtt_done_stamp = 0; -+ bbr->probe_rtt_round_done = 0; -+ bbr->probe_rtt_min_us = tcp_min_rtt(tp); -+ bbr->probe_rtt_min_stamp = tcp_jiffies32; -+ bbr->min_rtt_us = tcp_min_rtt(tp); -+ bbr->min_rtt_stamp = tcp_jiffies32; -+ -+ bbr->has_seen_rtt = 0; -+ bbr_init_pacing_rate_from_rtt(sk); -+ -+ bbr->round_start = 0; -+ bbr->idle_restart = 0; -+ bbr->full_bw_reached = 0; -+ bbr->full_bw = 0; -+ bbr->full_bw_cnt = 0; -+ bbr->cycle_mstamp = 0; -+ bbr->cycle_idx = 0; -+ bbr->mode = BBR_STARTUP; -+ bbr->debug.rs_bw = 0; -+ -+ bbr->ack_epoch_mstamp = tp->tcp_mstamp; -+ bbr->ack_epoch_acked = 0; -+ bbr->extra_acked_win_rtts = 0; -+ bbr->extra_acked_win_idx = 0; -+ bbr->extra_acked[0] = 0; -+ bbr->extra_acked[1] = 0; -+ -+ bbr->ce_state = 0; -+ bbr->prior_rcv_nxt = tp->rcv_nxt; -+ bbr->try_fast_path = 0; -+ -+ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); -+} -+ -+static u32 bbr_sndbuf_expand(struct sock *sk) -+{ + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -989,18 +945,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1012,144 +970,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); + /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ + return 3; -+} -+ -+/* __________________________________________________________________________ -+ * -+ * Functions new to BBR v2 ("bbr") congestion control are below here. 
-+ * __________________________________________________________________________ -+ */ -+ + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs) +/* Incorporate a new bw sample into the current window of our max filter. */ -+static void bbr2_take_bw_hi_sample(struct sock *sk, u32 bw) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; + +- bbr_update_model(sk, rs); +- +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); + bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); -+} -+ + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) +/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ -+static void bbr2_advance_bw_hi_filter(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ + if (!bbr->bw_hi[1]) + return; /* no samples in this window; remember old window */ + bbr->bw_hi[0] = bbr->bw_hi[1]; + bbr->bw_hi[1] = 0; +} -+ + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. */ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; +/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ -+static u32 bbr2_target_inflight(struct sock *sk) ++static u32 bbr_target_inflight(struct sock *sk) +{ + u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); -+ + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); + return min(bdp, tcp_sk(sk)->snd_cwnd); -+} -+ -+static bool bbr2_is_probing_bandwidth(struct sock *sk) -+{ + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; + struct bbr *bbr = inet_csk_ca(sk); + + return (bbr->mode == BBR_STARTUP) || @@ -1703,7 +1604,7 @@ index 000000000000..85f8052144d1 +} + +/* Has the given amount of time elapsed since we marked the phase start? 
*/ -+static bool bbr2_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct bbr *bbr = inet_csk_ca(sk); @@ -1712,62 +1613,74 @@ index 000000000000..85f8052144d1 + bbr->cycle_mstamp + interval_us) > 0; +} + -+static void bbr2_handle_queue_too_high_in_startup(struct sock *sk) ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ + + bbr->full_bw_reached = 1; -+ bbr->inflight_hi = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); +} + +/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ -+static void bbr2_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || -+ !bbr->params.full_ecn_cnt || !bbr->params.ecn_thresh) ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) + return; + -+ if (ce_ratio >= bbr->params.ecn_thresh) ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) + bbr->startup_ecn_rounds++; + else + bbr->startup_ecn_rounds = 0; + -+ if (bbr->startup_ecn_rounds >= bbr->params.full_ecn_cnt) { -+ bbr->debug.event = 'E'; /* ECN caused STARTUP exit */ -+ bbr2_handle_queue_too_high_in_startup(sk); ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); + return; + } +} + -+static void bbr2_update_ecn_alpha(struct sock *sk) ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); + struct bbr *bbr = inet_csk_ca(sk); + s32 delivered, delivered_ce; + u64 alpha, ce_ratio; + u32 gain; ++ bool want_ecn_alpha; + -+ if (bbr->params.ecn_factor == 0) -+ return; ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. */ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; + + delivered = tp->delivered - bbr->alpha_last_delivered; + delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; + + if (delivered == 0 || /* avoid divide by zero */ + WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ -+ return; -+ -+ /* See if we should use ECN sender logic for this connection. 
*/ -+ if (!bbr->ecn_eligible && bbr_ecn_enable && -+ (bbr->min_rtt_us <= bbr->params.ecn_max_rtt_us || -+ !bbr->params.ecn_max_rtt_us)) -+ bbr->ecn_eligible = 1; ++ return -1; + ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); + ce_ratio = (u64)delivered_ce << BBR_SCALE; + do_div(ce_ratio, delivered); -+ gain = bbr->params.ecn_alpha_gain; ++ ++ gain = bbr_param(sk, ecn_alpha_gain); + alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; + alpha += (gain * ce_ratio) >> BBR_SCALE; + bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); @@ -1775,37 +1688,51 @@ index 000000000000..85f8052144d1 + bbr->alpha_last_delivered = tp->delivered; + bbr->alpha_last_delivered_ce = tp->delivered_ce; + -+ bbr2_check_ecn_too_high_in_startup(sk, ce_ratio); ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); +} + +/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ -+static void bbr2_raise_inflight_hi_slope(struct sock *sk) -+{ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 growth_this_round, cnt; + + /* Calculate "slope": packets S/Acked per inflight_hi increment. */ + growth_this_round = 1 << bbr->bw_probe_up_rounds; + bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); -+ cnt = tp->snd_cwnd / growth_this_round; ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; + cnt = max(cnt, 1U); + bbr->bw_probe_up_cnt = cnt; -+ bbr->debug.event = 'G'; /* Grow inflight_hi slope */ +} + +/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ -+static void bbr2_probe_inflight_hi_upward(struct sock *sk, ++static void bbr_probe_inflight_hi_upward(struct sock *sk, + const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 delta; + -+ if (!tp->is_cwnd_limited || tp->snd_cwnd < bbr->inflight_hi) { -+ bbr->bw_probe_up_acks = 0; /* don't accmulate unused credits */ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) + return; /* not fully using inflight_hi, so don't grow it */ -+ } + + /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ + bbr->bw_probe_up_acks += rs->acked_sacked; @@ -1813,11 +1740,11 @@ index 000000000000..85f8052144d1 + delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; + bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; + bbr->inflight_hi += delta; -+ bbr->debug.event = 'I'; /* Increment inflight_hi */ ++ bbr->try_fast_path = 0; /* Need to update cwnd */ + } + + if (bbr->round_start) -+ bbr2_raise_inflight_hi_slope(sk); ++ bbr_raise_inflight_hi_slope(sk); +} + +/* Does loss/ECN rate for this sample say inflight is "too high"? 
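
As an aside (a standalone sketch, not part of the patch): bbr_update_ecn_alpha()
above maintains an EWMA of the per-round CE-mark ratio in the same BBR_SCALE
fixed point, alpha = (1 - gain) * alpha + gain * ce_ratio. The sketch below
mirrors that update in user space; the 1/16 gain and the "start at 1.0"
initialization come from the defaults declared in this patch, and the sample
counts are invented.

#include <stdint.h>
#include <stdio.h>

#define BBR_SCALE 8
#define BBR_UNIT  (1 << BBR_SCALE)

/* EWMA of the CE-marked fraction of delivered packets, as in
 * bbr_update_ecn_alpha(): alpha = (1 - gain) * alpha + gain * ce_ratio.
 */
static uint32_t update_ecn_alpha(uint32_t alpha, uint32_t delivered,
                                 uint32_t delivered_ce, uint32_t gain)
{
        uint64_t ce_ratio = ((uint64_t)delivered_ce << BBR_SCALE) / delivered;
        uint64_t next = ((uint64_t)(BBR_UNIT - gain) * alpha) >> BBR_SCALE;

        next += (gain * ce_ratio) >> BBR_SCALE;
        return next > BBR_UNIT ? BBR_UNIT : (uint32_t)next;
}

int main(void)
{
        uint32_t gain = BBR_UNIT / 16;  /* ecn_alpha_gain default: 6.25% */
        uint32_t alpha = BBR_UNIT;      /* ecn_alpha_init default: 1.0 */

        /* One round trip: 100 packets delivered, 25 of them CE-marked. */
        alpha = update_ecn_alpha(alpha, 100, 25, gain);
        printf("ecn_alpha after one round: %u/256\n", alpha);
        return 0;
}
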
@@ -1825,25 +1752,28 @@ index 000000000000..85f8052144d1 + * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which + * uses it to notice when loss/ECN rates suggest inflight is too high. + */ -+static bool bbr2_is_inflight_too_high(const struct sock *sk, -+ const struct rate_sample *rs) ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) +{ + const struct bbr *bbr = inet_csk_ca(sk); + u32 loss_thresh, ecn_thresh; -+ + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ + if (rs->lost > 0 && rs->tx_in_flight) { -+ loss_thresh = (u64)rs->tx_in_flight * bbr->params.loss_thresh >> ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> + BBR_SCALE; -+ if (rs->lost > loss_thresh) ++ if (rs->lost > loss_thresh) { + return true; ++ } + } + + if (rs->delivered_ce > 0 && rs->delivered > 0 && -+ bbr->ecn_eligible && bbr->params.ecn_thresh) { -+ ecn_thresh = (u64)rs->delivered * bbr->params.ecn_thresh >> ++ bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> + BBR_SCALE; -+ if (rs->delivered_ce >= ecn_thresh) ++ if (rs->delivered_ce > ecn_thresh) { + return true; ++ } + } + + return false; @@ -1857,12 +1787,12 @@ index 000000000000..85f8052144d1 + * Then we take that equation, convert it to fixed point, and + * round up to the nearest packet. + */ -+static u32 bbr2_inflight_hi_from_lost_skb(const struct sock *sk, ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, + const struct rate_sample *rs, + const struct sk_buff *skb) +{ -+ const struct bbr *bbr = inet_csk_ca(sk); -+ u32 loss_thresh = bbr->params.loss_thresh; ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); + u32 pcount, divisor, inflight_hi; + s32 inflight_prev, lost_prev; + u64 loss_budget, lost_prefix; @@ -1871,14 +1801,28 @@ index 000000000000..85f8052144d1 + + /* How much data was in flight before this skb? */ + inflight_prev = rs->tx_in_flight - pcount; -+ if (WARN_ONCE(inflight_prev < 0, -+ "tx_in_flight: %u pcount: %u reneg: %u", -+ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg)) ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); + return ~0U; ++ } + + /* How much inflight data was marked lost before this skb? */ + lost_prev = rs->lost - pcount; -+ if (WARN_ON_ONCE(lost_prev < 0)) ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) + return ~0U; + + /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ @@ -1903,7 +1847,7 @@ index 000000000000..85f8052144d1 + * buffer, return an operating point that tries to leave unutilized headroom in + * the path for other flows, for fairness convergence and lower RTTs and loss. 
+ */ -+static u32 bbr2_inflight_with_headroom(const struct sock *sk) ++static u32 bbr_inflight_with_headroom(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 headroom, headroom_fraction; @@ -1911,17 +1855,17 @@ index 000000000000..85f8052144d1 + if (bbr->inflight_hi == ~0U) + return ~0U; + -+ headroom_fraction = bbr->params.inflight_headroom; ++ headroom_fraction = bbr_param(sk, inflight_headroom); + headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; + headroom = max(headroom, 1U); + return max_t(s32, bbr->inflight_hi - headroom, -+ bbr->params.cwnd_min_target); ++ bbr_param(sk, cwnd_min_target)); +} + +/* Bound cwnd to a sensible level, based on our current probing state + * machine phase and model of a good inflight level (inflight_lo, inflight_hi). + */ -+static void bbr2_bound_cwnd_for_inflight_model(struct sock *sk) ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); @@ -1942,13 +1886,55 @@ index 000000000000..85f8052144d1 + if (bbr->mode == BBR_PROBE_RTT || + (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) -+ cap = bbr2_inflight_with_headroom(sk); ++ cap = bbr_inflight_with_headroom(sk); + } + /* Adapt to any loss/ECN since our last bw probe. */ + cap = min(cap, bbr->inflight_lo); + -+ cap = max_t(u32, cap, bbr->params.cwnd_min_target); -+ tp->snd_cwnd = min(cap, tp->snd_cwnd); ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; +} + +/* Estimate a short-term lower bound on the capacity available now, based @@ -1967,57 +1953,39 @@ index 000000000000..85f8052144d1 + * cause low bw for Reno/CUBIC and high loss recovery latency for + * request/response flows using any congestion control. + */ -+static void bbr2_adapt_lower_bounds(struct sock *sk) ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) +{ -+ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); -+ u32 ecn_cut, ecn_inflight_lo, beta; ++ u32 ecn_inflight_lo = ~0U; + + /* We only use lower-bound estimates when not probing bw. + * When probing we need to push inflight higher to probe bw. 
+ */ -+ if (bbr2_is_probing_bandwidth(sk)) ++ if (bbr_is_probing_bandwidth(sk)) + return; + + /* ECN response. */ -+ if (bbr->ecn_in_round && bbr->ecn_eligible && bbr->params.ecn_factor) { -+ /* Reduce inflight to (1 - alpha*ecn_factor). */ -+ ecn_cut = (BBR_UNIT - -+ ((bbr->ecn_alpha * bbr->params.ecn_factor) >> -+ BBR_SCALE)); -+ if (bbr->inflight_lo == ~0U) -+ bbr->inflight_lo = tp->snd_cwnd; -+ ecn_inflight_lo = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; -+ } else { -+ ecn_inflight_lo = ~0U; ++ if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); + } + + /* Loss response. */ + if (bbr->loss_in_round) { -+ /* Reduce bw and inflight to (1 - beta). */ -+ if (bbr->bw_lo == ~0U) -+ bbr->bw_lo = bbr_max_bw(sk); -+ if (bbr->inflight_lo == ~0U) -+ bbr->inflight_lo = tp->snd_cwnd; -+ beta = bbr->params.beta; -+ bbr->bw_lo = -+ max_t(u32, bbr->bw_latest, -+ (u64)bbr->bw_lo * -+ (BBR_UNIT - beta) >> BBR_SCALE); -+ bbr->inflight_lo = -+ max_t(u32, bbr->inflight_latest, -+ (u64)bbr->inflight_lo * -+ (BBR_UNIT - beta) >> BBR_SCALE); ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); + } + -+ /* Adjust to the lower of the levels implied by loss or ECN. */ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ + bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); +} + +/* Reset any short-term lower-bound adaptation to congestion, so that we can + * push our inflight up. + */ -+static void bbr2_reset_lower_bounds(struct sock *sk) ++static void bbr_reset_lower_bounds(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + @@ -2028,7 +1996,7 @@ index 000000000000..85f8052144d1 +/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state + * machine phase where we adapt our lower bound based on congestion signals. + */ -+static void bbr2_reset_congestion_signals(struct sock *sk) ++static void bbr_reset_congestion_signals(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + @@ -2040,42 +2008,76 @@ index 000000000000..85f8052144d1 + bbr->inflight_latest = 0; +} + -+/* Update (most of) our congestion signals: track the recent rate and volume of -+ * delivered data, presence of loss, and EWMA degree of ECN marking. -+ */ -+static void bbr2_update_congestion_signals( ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( + struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); -+ u64 bw; + + bbr->loss_round_start = 0; + if (rs->interval_us <= 0 || !rs->acked_sacked) + return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. 
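
As an aside (a standalone sketch, not part of the patch): the short-term bounds
maintained by bbr_adapt_lower_bounds() above are plain multiplicative cuts in
BBR_SCALE fixed point, floored by the latest round's measurements. The sketch
below redoes the arithmetic of bbr_loss_lower_bounds() and bbr_ecn_lower_bounds()
with the beta = 0.3 and ecn_factor = 1/3 defaults from this patch; the sample
numbers are invented.

#include <stdint.h>
#include <stdio.h>

#define BBR_SCALE 8
#define BBR_UNIT  (1 << BBR_SCALE)

/* Loss: cut the lower bound to (1 - beta), but never below the latest sample. */
static uint32_t loss_lower_bound(uint32_t lo, uint32_t latest, uint32_t beta)
{
        uint32_t cut = (uint64_t)lo * (BBR_UNIT - beta) >> BBR_SCALE;

        return latest > cut ? latest : cut;
}

/* ECN: cut the inflight lower bound to (1 - ecn_alpha * ecn_factor). */
static uint32_t ecn_lower_bound(uint32_t inflight_lo, uint32_t ecn_alpha,
                                uint32_t ecn_factor)
{
        uint32_t keep = BBR_UNIT - ((ecn_alpha * ecn_factor) >> BBR_SCALE);

        return (uint64_t)inflight_lo * keep >> BBR_SCALE;
}

int main(void)
{
        uint32_t beta = BBR_UNIT * 30 / 100;    /* 0.3 */
        uint32_t ecn_factor = BBR_UNIT / 3;     /* ~0.33 */

        printf("loss round: inflight_lo 100 -> %u packets\n",
               loss_lower_bound(100, 60, beta));
        printf("ECN round:  inflight_lo 100 (alpha = 1.0) -> %u packets\n",
               ecn_lower_bound(100, BBR_UNIT, ecn_factor));
        return 0;
}
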
*/ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. ++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ + bw = ctx->sample_bw; + + if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) -+ bbr2_take_bw_hi_sample(sk, bw); ++ bbr_take_max_bw_sample(sk, bw); + + bbr->loss_in_round |= (rs->losses > 0); + -+ /* Update rate and volume of delivered data from latest round trip: */ -+ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); -+ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); -+ -+ if (before(rs->prior_delivered, bbr->loss_round_delivered)) ++ if (!bbr->loss_round_start) + return; /* skip the per-round-trip updates */ + /* Now do per-round-trip updates. */ -+ bbr->loss_round_delivered = tp->delivered; /* mark round trip */ -+ bbr->loss_round_start = 1; -+ bbr2_adapt_lower_bounds(sk); ++ bbr_adapt_lower_bounds(sk, rs); + -+ /* Update windowed "latest" (single-round-trip) filters. */ + bbr->loss_in_round = 0; + bbr->ecn_in_round = 0; -+ bbr->bw_latest = ctx->sample_bw; -+ bbr->inflight_latest = rs->delivered; +} + +/* Bandwidth probing can cause loss. To help coexistence with loss-based @@ -2085,22 +2087,15 @@ index 000000000000..85f8052144d1 + * flow. We count packet-timed round trips directly, since measured RTT can + * vary widely, and Reno is driven by packet-timed round trips. + */ -+static bool bbr2_is_reno_coexistence_probe_time(struct sock *sk) ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); -+ u32 inflight, rounds, reno_gain, reno_rounds; ++ u32 rounds; + + /* Random loss can shave some small percentage off of our inflight + * in each round. To survive this, flows need robust periodic probes. + */ -+ rounds = bbr->params.bw_probe_max_rounds; -+ -+ reno_gain = bbr->params.bw_probe_reno_gain; -+ if (reno_gain) { -+ inflight = bbr2_target_inflight(sk); -+ reno_rounds = ((u64)inflight * reno_gain) >> BBR_SCALE; -+ rounds = min(rounds, reno_rounds); -+ } ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); + return bbr->rounds_since_probe >= rounds; +} + @@ -2121,19 +2116,19 @@ index 000000000000..85f8052144d1 + * time-scales (e.g. perhaps traffic from a web page download that we + * were competing with is now complete). 
+ */ -+static void bbr2_pick_probe_wait(struct sock *sk) ++static void bbr_pick_probe_wait(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + /* Decide the random round-trip bound for wait until probe: */ + bbr->rounds_since_probe = -+ get_random_u32_below(bbr->params.bw_probe_rand_rounds); ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); + /* Decide the random wall clock bound for wait until probe: */ -+ bbr->probe_wait_us = bbr->params.bw_probe_base_us + -+ get_random_u32_below(bbr->params.bw_probe_rand_us); ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); +} + -+static void bbr2_set_cycle_idx(struct sock *sk, int cycle_idx) ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) +{ + struct bbr *bbr = inet_csk_ca(sk); + @@ -2148,24 +2143,22 @@ index 000000000000..85f8052144d1 + * loss. If we do not fill the pipe before we cause this loss, our bw_hi and + * inflight_hi estimates will underestimate. + */ -+static void bbr2_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + -+ bbr2_reset_lower_bounds(sk); -+ if (bbr->inflight_hi != ~0U) -+ bbr->inflight_hi += bbr->params.refill_add_inc; ++ bbr_reset_lower_bounds(sk); + bbr->bw_probe_up_rounds = bw_probe_up_rounds; + bbr->bw_probe_up_acks = 0; + bbr->stopped_risky_probe = 0; + bbr->ack_phase = BBR_ACKS_REFILLING; + bbr->next_rtt_delivered = tp->delivered; -+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); +} + +/* Now probe max deliverable data rate and volume. */ -+static void bbr2_start_bw_probe_up(struct sock *sk) ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); @@ -2173,8 +2166,10 @@ index 000000000000..85f8052144d1 + bbr->ack_phase = BBR_ACKS_PROBE_STARTING; + bbr->next_rtt_delivered = tp->delivered; + bbr->cycle_mstamp = tp->tcp_mstamp; -+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_UP); -+ bbr2_raise_inflight_hi_slope(sk); ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); +} + +/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall @@ -2183,57 +2178,57 @@ index 000000000000..85f8052144d1 + * keep packet loss rates low. Also start a round-trip counter, to probe faster + * if we estimate a Reno flow at our BDP would probe faster. + */ -+static void bbr2_start_bw_probe_down(struct sock *sk) ++static void bbr_start_bw_probe_down(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + -+ bbr2_reset_congestion_signals(sk); ++ bbr_reset_congestion_signals(sk); + bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ -+ bbr2_pick_probe_wait(sk); ++ bbr_pick_probe_wait(sk); + bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ + bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; + bbr->next_rtt_delivered = tp->delivered; -+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); +} + +/* Cruise: maintain what we estimate to be a neutral, conservative + * operating point, without attempting to probe up for bandwidth or down for + * RTT, and only reducing inflight in response to loss/ECN signals. 
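
As an aside (a standalone sketch, not part of the patch): bbr_start_bw_probe_down()
above arms both a packet-timed round counter and a randomized wall-clock deadline
via bbr_pick_probe_wait(). The sketch below picks the same kind of wait in user
space, using the 2 s base, 1 s random range and 2 random rounds that the defaults
in this patch describe; rand() merely stands in for get_random_u32_below().

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define USEC_PER_SEC 1000000U

struct probe_wait {
        unsigned int rounds;    /* random offset for the round-trip counter */
        unsigned int wait_us;   /* wall-clock wait before the next bw probe */
};

/* Mirror of bbr_pick_probe_wait(): a fixed base plus a uniform random part. */
static struct probe_wait pick_probe_wait(unsigned int rand_rounds,
                                         unsigned int base_us,
                                         unsigned int rand_us)
{
        struct probe_wait w;

        w.rounds = (unsigned int)rand() % rand_rounds;
        w.wait_us = base_us + (unsigned int)rand() % rand_us;
        return w;
}

int main(void)
{
        struct probe_wait w;

        srand((unsigned int)time(NULL));
        w = pick_probe_wait(2, 2 * USEC_PER_SEC, 1 * USEC_PER_SEC);
        printf("next bw probe in %u us (round counter offset %u)\n",
               w.wait_us, w.rounds);
        return 0;
}
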
+ */ -+static void bbr2_start_bw_probe_cruise(struct sock *sk) ++static void bbr_start_bw_probe_cruise(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->inflight_lo != ~0U) + bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); + -+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); +} + +/* Loss and/or ECN rate is too high while probing. + * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. + */ -+static void bbr2_handle_inflight_too_high(struct sock *sk, ++static void bbr_handle_inflight_too_high(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); -+ const u32 beta = bbr->params.beta; ++ const u32 beta = bbr_param(sk, beta); + + bbr->prev_probe_too_high = 1; + bbr->bw_probe_samples = 0; /* only react once per probe */ -+ bbr->debug.event = 'L'; /* Loss/ECN too high */ + /* If we are app-limited then we are not robustly + * probing the max volume of inflight data we think + * might be safe (analogous to how app-limited bw + * samples are not known to be robustly probing bw). + */ -+ if (!rs->is_app_limited) ++ if (!rs->is_app_limited) { + bbr->inflight_hi = max_t(u32, rs->tx_in_flight, -+ (u64)bbr2_target_inflight(sk) * ++ (u64)bbr_target_inflight(sk) * + (BBR_UNIT - beta) >> BBR_SCALE); ++ } + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) -+ bbr2_start_bw_probe_down(sk); ++ bbr_start_bw_probe_down(sk); +} + +/* If we're seeing bw and loss samples reflecting our bw probing, adapt @@ -2241,8 +2236,9 @@ index 000000000000..85f8052144d1 + * inflight_hi downward. If we're able to push inflight higher without such + * signals, push higher: adapt inflight_hi upward. + */ -+static bool bbr2_adapt_upper_bounds(struct sock *sk, -+ const struct rate_sample *rs) ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + @@ -2259,7 +2255,7 @@ index 000000000000..85f8052144d1 + * samples from the previous cycle, by advancing the window. + */ + if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) -+ bbr2_advance_bw_hi_filter(sk); ++ bbr_advance_max_bw_filter(sk); + /* If we had an inflight_hi, then probed and pushed inflight all + * the way up to hit that inflight_hi without seeing any + * high loss/ECN in all the resulting ACKs from that probing, @@ -2268,100 +2264,91 @@ index 000000000000..85f8052144d1 + */ + if (bbr->mode == BBR_PROBE_BW && + bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { -+ bbr->debug.event = 'R'; /* reprobe */ -+ bbr2_start_bw_probe_refill(sk, 0); ++ bbr_start_bw_probe_refill(sk, 0); + return true; /* yes, decided state transition */ + } + } -+ -+ if (bbr2_is_inflight_too_high(sk, rs)) { ++ if (bbr_is_inflight_too_high(sk, rs)) { + if (bbr->bw_probe_samples) /* sample is from bw probing? */ -+ bbr2_handle_inflight_too_high(sk, rs); ++ bbr_handle_inflight_too_high(sk, rs); + } else { + /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ -+ if (bbr->inflight_hi == ~0U) /* no excess queue signals yet? */ -+ return false; + -+ /* To be resilient to random loss, we must raise inflight_hi ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi + * if we observe in any phase that a higher level is safe. 
+ */ + if (rs->tx_in_flight > bbr->inflight_hi) { + bbr->inflight_hi = rs->tx_in_flight; -+ bbr->debug.event = 'U'; /* raise up inflight_hi */ + } + + if (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx == BBR_BW_PROBE_UP) -+ bbr2_probe_inflight_hi_upward(sk, rs); ++ bbr_probe_inflight_hi_upward(sk, rs); + } + + return false; +} + +/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ -+static bool bbr2_check_time_to_probe_bw(struct sock *sk) ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 n; + + /* If we seem to be at an operating point where we are not seeing loss + * but we are seeing ECN marks, then when the ECN marks cease we reprobe -+ * quickly (in case a burst of cross-traffic has ceased and freed up bw, -+ * or in case we are sharing with multiplicatively probing traffic). ++ * quickly (in case cross-traffic has ceased and freed up bw). + */ -+ if (bbr->params.ecn_reprobe_gain && bbr->ecn_eligible && ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && + bbr->ecn_in_cycle && !bbr->loss_in_cycle && + inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { -+ bbr->debug.event = 'A'; /* *A*ll clear to probe *A*gain */ -+ /* Calculate n so that when bbr2_raise_inflight_hi_slope() ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() + * computes growth_this_round as 2^n it will be roughly the + * desired volume of data (inflight_hi*ecn_reprobe_gain). + */ + n = ilog2((((u64)bbr->inflight_hi * -+ bbr->params.ecn_reprobe_gain) >> BBR_SCALE)); -+ bbr2_start_bw_probe_refill(sk, n); ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); + return true; + } + -+ if (bbr2_has_elapsed_in_phase(sk, bbr->probe_wait_us) || -+ bbr2_is_reno_coexistence_probe_time(sk)) { -+ bbr2_start_bw_probe_refill(sk, 0); ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); + return true; + } + return false; +} + +/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ -+static bool bbr2_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) +{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ bool is_under_bdp, is_long_enough; -+ + /* Always need to pull inflight down to leave headroom in queue. */ -+ if (inflight > bbr2_inflight_with_headroom(sk)) ++ if (inflight > bbr_inflight_with_headroom(sk)) + return false; + -+ is_under_bdp = inflight <= bbr_inflight(sk, bw, BBR_UNIT); -+ if (bbr->params.drain_to_target) -+ return is_under_bdp; -+ -+ is_long_enough = bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us); -+ return is_under_bdp || is_long_enough; ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); +} + +/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ -+static void bbr2_update_cycle_phase(struct sock *sk, -+ const struct rate_sample *rs) ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) +{ ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); -+ bool is_risky = false, is_queuing = false; ++ bool is_bw_probe_done = false; + u32 inflight, bw; + + if (!bbr_full_bw_reached(sk)) + return; + + /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. 
*/ -+ if (bbr2_adapt_upper_bounds(sk, rs)) ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) + return; /* already decided state transition */ + + if (bbr->mode != BBR_PROBE_BW) @@ -2377,7 +2364,7 @@ index 000000000000..85f8052144d1 + * by slowing down. + */ + case BBR_BW_PROBE_CRUISE: -+ if (bbr2_check_time_to_probe_bw(sk)) ++ if (bbr_check_time_to_probe_bw(sk, rs)) + return; /* already decided state transition */ + break; + @@ -2392,7 +2379,7 @@ index 000000000000..85f8052144d1 + * may be putting too much data in flight. + */ + bbr->bw_probe_samples = 1; -+ bbr2_start_bw_probe_up(sk); ++ bbr_start_bw_probe_up(sk, ctx); + } + break; + @@ -2407,31 +2394,33 @@ index 000000000000..85f8052144d1 + * most recent previous bw probe phase. Thus we want to start + * draining the queue immediately because it's very likely the most + * recently sent packets will fill the queue and cause drops. -+ * (checked here) -+ * (2) We have probed for at least 1*min_rtt_us, and the -+ * estimated queue is high enough (inflight > 1.25 * estimated_bdp). -+ * (checked here) ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). + * (3) Loss filter says loss rate is "too high". -+ * (checked in bbr_is_inflight_too_high()) + * (4) ECN filter says ECN mark rate is "too high". -+ * (checked in bbr_is_inflight_too_high()) ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() + */ + case BBR_BW_PROBE_UP: + if (bbr->prev_probe_too_high && + inflight >= bbr->inflight_hi) { + bbr->stopped_risky_probe = 1; -+ is_risky = true; -+ bbr->debug.event = 'D'; /* D for danger */ -+ } else if (bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us) && -+ inflight >= -+ bbr_inflight(sk, bw, -+ bbr->params.bw_probe_pif_gain)) { -+ is_queuing = true; -+ bbr->debug.event = 'Q'; /* building Queue */ ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } + } -+ if (is_risky || is_queuing) { ++ if (is_bw_probe_done) { + bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ -+ bbr2_start_bw_probe_down(sk); /* restart w/ down */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ + } + break; + @@ -2445,10 +2434,10 @@ index 000000000000..85f8052144d1 + * the queue is drained; persisting would underutilize the pipe. + */ + case BBR_BW_PROBE_DOWN: -+ if (bbr2_check_time_to_probe_bw(sk)) ++ if (bbr_check_time_to_probe_bw(sk, rs)) + return; /* already decided state transition */ -+ if (bbr2_check_time_to_cruise(sk, inflight, bw)) -+ bbr2_start_bw_probe_cruise(sk); ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); + break; + + default: @@ -2457,22 +2446,22 @@ index 000000000000..85f8052144d1 +} + +/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ -+static void bbr2_exit_probe_rtt(struct sock *sk) ++static void bbr_exit_probe_rtt(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + -+ bbr2_reset_lower_bounds(sk); ++ bbr_reset_lower_bounds(sk); + if (bbr_full_bw_reached(sk)) { + bbr->mode = BBR_PROBE_BW; + /* Raising inflight after PROBE_RTT may cause loss, so reset + * the PROBE_BW clock and schedule the next bandwidth probe for + * a friendly and randomized future point in time. 
+ */ -+ bbr2_start_bw_probe_down(sk); ++ bbr_start_bw_probe_down(sk); + /* Since we are exiting PROBE_RTT, we know inflight is + * below our estimated BDP, so it is reasonable to cruise. + */ -+ bbr2_start_bw_probe_cruise(sk); ++ bbr_start_bw_probe_cruise(sk); + } else { + bbr->mode = BBR_STARTUP; + } @@ -2482,8 +2471,8 @@ index 000000000000..85f8052144d1 + * the end of the round in recovery to get a good estimate of how many packets + * have been lost, and how many we need to drain with a low pacing rate. + */ -+static void bbr2_check_loss_too_high_in_startup(struct sock *sk, -+ const struct rate_sample *rs) ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + @@ -2497,39 +2486,83 @@ index 000000000000..85f8052144d1 + */ + if (rs->losses && bbr->loss_events_in_round < 0xf) + bbr->loss_events_in_round++; /* update saturating counter */ -+ if (bbr->params.full_loss_cnt && bbr->loss_round_start && ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && + inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && -+ bbr->loss_events_in_round >= bbr->params.full_loss_cnt && -+ bbr2_is_inflight_too_high(sk, rs)) { -+ bbr->debug.event = 'P'; /* Packet loss caused STARTUP exit */ -+ bbr2_handle_queue_too_high_in_startup(sk); ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); + return; + } + if (bbr->loss_round_start) + bbr->loss_events_in_round = 0; +} + -+/* If we are done draining, advance into steady state operation in PROBE_BW. */ -+static void bbr2_check_drain(struct sock *sk, const struct rate_sample *rs, -+ struct bbr_context *ctx) ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. ++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + -+ if (bbr_check_drain(sk, rs, ctx)) { ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). 
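
As an aside (a standalone sketch, not part of the patch): bbr_check_full_bw_reached()
above declares the pipe full once the bw estimate has grown by less than 25%
(full_bw_thresh) for 3 consecutive round starts (full_bw_cnt). The sketch below
runs that plateau test over a made-up series of per-round bw samples and ignores
the app-limited and round_start bookkeeping for brevity.

#include <stdint.h>
#include <stdio.h>

#define BBR_SCALE 8
#define BBR_UNIT  (1 << BBR_SCALE)

static uint32_t full_bw;        /* last bw baseline, 0 until first sample */
static uint32_t full_bw_cnt;    /* rounds without >= 25% growth */

/* Return 1 once bw has plateaued: less than 25% growth for 3 rounds. */
static int check_full_bw_reached(uint32_t sample_bw)
{
        uint32_t thresh = (uint64_t)full_bw * (BBR_UNIT * 5 / 4) >> BBR_SCALE;

        if (sample_bw >= thresh) {      /* still growing by >= 25%? */
                full_bw = sample_bw;    /* reset the plateau detector */
                full_bw_cnt = 0;
                return 0;
        }
        return ++full_bw_cnt >= 3;
}

int main(void)
{
        uint32_t samples[] = { 100, 200, 380, 400, 410, 415 };
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                if (check_full_bw_reached(samples[i]))
                        printf("pipe looks full at round %u (bw %u)\n",
                               i, samples[i]);
        return 0;
}
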
++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { + bbr->mode = BBR_PROBE_BW; -+ bbr2_start_bw_probe_down(sk); ++ bbr_start_bw_probe_down(sk); + } +} + -+static void bbr2_update_model(struct sock *sk, const struct rate_sample *rs, ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, + struct bbr_context *ctx) +{ -+ bbr2_update_congestion_signals(sk, rs, ctx); ++ bbr_update_congestion_signals(sk, rs, ctx); + bbr_update_ack_aggregation(sk, rs); -+ bbr2_check_loss_too_high_in_startup(sk, rs); -+ bbr_check_full_bw_reached(sk, rs); -+ bbr2_check_drain(sk, rs, ctx); -+ bbr2_update_cycle_phase(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); + bbr_update_min_rtt(sk, rs); +} + @@ -2557,25 +2590,26 @@ index 000000000000..85f8052144d1 + * + * Returns whether we can take fast path or not. + */ -+static bool bbr2_fast_path(struct sock *sk, bool *update_model, ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 prev_min_rtt_us, prev_mode; + -+ if (bbr->params.fast_path && bbr->try_fast_path && ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && + rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && -+ !bbr->loss_in_round && !bbr->ecn_in_round) { ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { + prev_mode = bbr->mode; + prev_min_rtt_us = bbr->min_rtt_us; -+ bbr2_check_drain(sk, rs, ctx); -+ bbr2_update_cycle_phase(sk, rs); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); + bbr_update_min_rtt(sk, rs); + + if (bbr->mode == prev_mode && + bbr->min_rtt_us == prev_min_rtt_us && -+ bbr->try_fast_path) ++ bbr->try_fast_path) { + return true; ++ } + + /* Skip model update, but control still needs to be updated */ + *update_model = false; @@ -2583,217 +2617,95 @@ index 000000000000..85f8052144d1 + return false; +} + -+static void bbr2_main(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc void bbr_main(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + struct bbr_context ctx = { 0 }; + bool update_model = true; -+ u32 bw; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; + -+ bbr->debug.event = '.'; /* init to default NOP (no event yet) */ -+ -+ bbr_update_round_start(sk, rs, &ctx); ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); + if (bbr->round_start) { + bbr->rounds_since_probe = + min_t(s32, bbr->rounds_since_probe + 1, 0xFF); -+ bbr2_update_ecn_alpha(sk); ++ ce_ratio = bbr_update_ecn_alpha(sk); + } ++ bbr_plb(sk, rs, ce_ratio); + -+ bbr->ecn_in_round |= rs->is_ece; ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); + bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); + -+ if (bbr2_fast_path(sk, &update_model, rs, &ctx)) ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) + goto out; + + if (update_model) -+ bbr2_update_model(sk, rs, &ctx); ++ bbr_update_model(sk, rs, &ctx); + + bbr_update_gains(sk); + bw = bbr_bw(sk); + bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); + bbr_set_cwnd(sk, 
rs, rs->acked_sacked, bw, bbr->cwnd_gain, -+ tp->snd_cwnd, &ctx); -+ bbr2_bound_cwnd_for_inflight_model(sk); ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); + +out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); + bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; + bbr->loss_in_cycle |= rs->lost > 0; + bbr->ecn_in_cycle |= rs->delivered_ce > 0; -+ -+ bbr_debug(sk, rs->acked_sacked, rs, &ctx); +} + -+/* Module parameters that are settable by TCP_CONGESTION_PARAMS are declared -+ * down here, so that the algorithm functions that use the parameters must use -+ * the per-socket parameters; if they accidentally use the global version -+ * then there will be a compile error. -+ * TODO(ncardwell): move all per-socket parameters down to this section. -+ */ -+ -+/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. -+ * No loss response when 0. Max allwed value is 255. -+ */ -+static u32 bbr_beta = BBR_UNIT * 30 / 100; -+ -+/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE. -+ * Max allowed value is 255. -+ */ -+static u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; /* 1/16 = 6.25% */ -+ -+/* The initial value for the ecn_alpha state variable. Default and max -+ * BBR_UNIT (256), representing 1.0. This allows a flow to respond quickly -+ * to congestion if the bottleneck is congested when the flow starts up. -+ */ -+static u32 bbr_ecn_alpha_init = BBR_UNIT; /* 1.0, to respond quickly */ -+ -+/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. -+ * No ECN based bounding when 0. Max allwed value is 255. -+ */ -+static u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ -+ -+/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. -+ * Scaled by BBR_SCALE. Disabled when 0. Max allowed is 255. -+ */ -+static u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ -+ -+/* Max RTT (in usec) at which to use sender-side ECN logic. -+ * Disabled when 0 (ECN allowed at any RTT). -+ * Max allowed for the parameter is 524287 (0x7ffff) us, ~524 ms. -+ */ -+static u32 bbr_ecn_max_rtt_us = 5000; -+ -+/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN -+ * clears then use a multiplicative increase to quickly reprobe bw by -+ * starting inflight probing at the given multiple of inflight_hi. -+ * Default for this experimental knob is 0 (disabled). -+ * Planned value for experiments: BBR_UNIT * 1 / 2 = 128, representing 0.5. -+ */ -+static u32 bbr_ecn_reprobe_gain; -+ -+/* Estimate bw probing has gone too far if loss rate exceeds this level. */ -+static u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ -+ -+/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, -+ * and loss rate is higher than bbr_loss_thresh. -+ * Disabled if 0. Max allowed value is 15 (0xF). -+ */ -+static u32 bbr_full_loss_cnt = 8; -+ -+/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh -+ * meets this count. Max allowed value is 3. -+ */ -+static u32 bbr_full_ecn_cnt = 2; -+ -+/* Fraction of unutilized headroom to try to leave in path upon high loss. */ -+static u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; -+ -+/* Multiplier to get target inflight (as multiple of BDP) for PROBE_UP phase. -+ * Default is 1.25x, as in BBR v1. Max allowed is 511. -+ */ -+static u32 bbr_bw_probe_pif_gain = BBR_UNIT * 5 / 4; -+ -+/* Multiplier to get Reno-style probe epoch duration as: k * BDP round trips. 
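
As an aside (a standalone sketch, not part of the patch): in the rewritten
bbr_is_reno_coexistence_probe_time() earlier in this file this Reno-gain knob is
gone, and the wait is simply min(bw_probe_max_rounds, estimated BDP in packets)
packet-timed round trips. The sketch below evaluates that bound for the
25 Mbit/s, 30 ms example used in the comment that follows; the 63-round cap is
the default declared there.

#include <stdio.h>

/* Probe for bw after min(BDP in packets, max_rounds) round trips, as in
 * bbr_is_reno_coexistence_probe_time().
 */
static unsigned int reno_probe_rounds(unsigned int bdp_pkts,
                                      unsigned int max_rounds)
{
        return bdp_pkts < max_rounds ? bdp_pkts : max_rounds;
}

int main(void)
{
        /* 25 Mbit/s * 30 ms / 1514-byte packets is roughly 62 packets of BDP. */
        unsigned int bdp = (unsigned int)(25e6 / 8 * 0.030 / 1514);

        printf("re-probe after %u round trips (cap 63)\n",
               reno_probe_rounds(bdp, 63));
        return 0;
}
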
-+ * If zero, disables this BBR v2 Reno-style BDP-scaled coexistence mechanism. -+ * Max allowed is 511. -+ */ -+static u32 bbr_bw_probe_reno_gain = BBR_UNIT; -+ -+/* Max number of packet-timed rounds to wait before probing for bandwidth. If -+ * we want to tolerate 1% random loss per round, and not have this cut our -+ * inflight too much, we must probe for bw periodically on roughly this scale. -+ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. -+ * We aim to be fair with Reno/CUBIC up to a BDP of at least: -+ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets -+ */ -+static u32 bbr_bw_probe_max_rounds = 63; -+ -+/* Max amount of randomness to inject in round counting for Reno-coexistence. -+ * Max value is 15. -+ */ -+static u32 bbr_bw_probe_rand_rounds = 2; -+ -+/* Use BBR-native probe time scale starting at this many usec. -+ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: -+ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs -+ */ -+static u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ -+ -+/* Use BBR-native probes spread over this many usec: */ -+static u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ -+ -+/* Undo the model changes made in loss recovery if recovery was spurious? */ -+static bool bbr_undo = true; -+ -+/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ -+static bool bbr_fast_path = true; /* default: enabled */ -+ -+/* Use fast ack mode ? */ -+static int bbr_fast_ack_mode = 1; /* default: rwnd check off */ -+ -+/* How much to additively increase inflight_hi when entering REFILL? */ -+static u32 bbr_refill_add_inc; /* default: disabled */ -+ -+module_param_named(beta, bbr_beta, uint, 0644); -+module_param_named(ecn_alpha_gain, bbr_ecn_alpha_gain, uint, 0644); -+module_param_named(ecn_alpha_init, bbr_ecn_alpha_init, uint, 0644); -+module_param_named(ecn_factor, bbr_ecn_factor, uint, 0644); -+module_param_named(ecn_thresh, bbr_ecn_thresh, uint, 0644); -+module_param_named(ecn_max_rtt_us, bbr_ecn_max_rtt_us, uint, 0644); -+module_param_named(ecn_reprobe_gain, bbr_ecn_reprobe_gain, uint, 0644); -+module_param_named(loss_thresh, bbr_loss_thresh, uint, 0664); -+module_param_named(full_loss_cnt, bbr_full_loss_cnt, uint, 0664); -+module_param_named(full_ecn_cnt, bbr_full_ecn_cnt, uint, 0664); -+module_param_named(inflight_headroom, bbr_inflight_headroom, uint, 0664); -+module_param_named(bw_probe_pif_gain, bbr_bw_probe_pif_gain, uint, 0664); -+module_param_named(bw_probe_reno_gain, bbr_bw_probe_reno_gain, uint, 0664); -+module_param_named(bw_probe_max_rounds, bbr_bw_probe_max_rounds, uint, 0664); -+module_param_named(bw_probe_rand_rounds, bbr_bw_probe_rand_rounds, uint, 0664); -+module_param_named(bw_probe_base_us, bbr_bw_probe_base_us, uint, 0664); -+module_param_named(bw_probe_rand_us, bbr_bw_probe_rand_us, uint, 0664); -+module_param_named(undo, bbr_undo, bool, 0664); -+module_param_named(fast_path, bbr_fast_path, bool, 0664); -+module_param_named(fast_ack_mode, bbr_fast_ack_mode, uint, 0664); -+module_param_named(refill_add_inc, bbr_refill_add_inc, uint, 0664); -+ -+static void bbr2_init(struct sock *sk) ++__bpf_kfunc static void bbr_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + -+ bbr_init(sk); /* run shared init code for v1 and v2 */ -+ -+ /* BBR v2 parameters: */ -+ bbr->params.beta = min_t(u32, 0xFFU, bbr_beta); -+ bbr->params.ecn_alpha_gain = min_t(u32, 0xFFU, bbr_ecn_alpha_gain); -+ 
bbr->params.ecn_alpha_init = min_t(u32, BBR_UNIT, bbr_ecn_alpha_init); -+ bbr->params.ecn_factor = min_t(u32, 0xFFU, bbr_ecn_factor); -+ bbr->params.ecn_thresh = min_t(u32, 0xFFU, bbr_ecn_thresh); -+ bbr->params.ecn_max_rtt_us = min_t(u32, 0x7ffffU, bbr_ecn_max_rtt_us); -+ bbr->params.ecn_reprobe_gain = min_t(u32, 0x1FF, bbr_ecn_reprobe_gain); -+ bbr->params.loss_thresh = min_t(u32, 0xFFU, bbr_loss_thresh); -+ bbr->params.full_loss_cnt = min_t(u32, 0xFU, bbr_full_loss_cnt); -+ bbr->params.full_ecn_cnt = min_t(u32, 0x3U, bbr_full_ecn_cnt); -+ bbr->params.inflight_headroom = -+ min_t(u32, 0xFFU, bbr_inflight_headroom); -+ bbr->params.bw_probe_pif_gain = -+ min_t(u32, 0x1FFU, bbr_bw_probe_pif_gain); -+ bbr->params.bw_probe_reno_gain = -+ min_t(u32, 0x1FFU, bbr_bw_probe_reno_gain); -+ bbr->params.bw_probe_max_rounds = -+ min_t(u32, 0xFFU, bbr_bw_probe_max_rounds); -+ bbr->params.bw_probe_rand_rounds = -+ min_t(u32, 0xFU, bbr_bw_probe_rand_rounds); -+ bbr->params.bw_probe_base_us = -+ min_t(u32, (1 << 26) - 1, bbr_bw_probe_base_us); -+ bbr->params.bw_probe_rand_us = -+ min_t(u32, (1 << 26) - 1, bbr_bw_probe_rand_us); -+ bbr->params.undo = bbr_undo; -+ bbr->params.fast_path = bbr_fast_path ? 1 : 0; -+ bbr->params.refill_add_inc = min_t(u32, 0x3U, bbr_refill_add_inc); -+ -+ /* BBR v2 state: */ + bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ + /* Start sampling ECN mark rate after first full flight is ACKed: */ + bbr->loss_round_delivered = tp->delivered + 1; + bbr->loss_round_start = 0; @@ -2802,12 +2714,13 @@ index 000000000000..85f8052144d1 + bbr->undo_inflight_hi = 0; + bbr->loss_events_in_round = 0; + bbr->startup_ecn_rounds = 0; -+ bbr2_reset_congestion_signals(sk); ++ bbr_reset_congestion_signals(sk); + bbr->bw_lo = ~0U; + bbr->bw_hi[0] = 0; + bbr->bw_hi[1] = 0; + bbr->inflight_lo = ~0U; + bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); + bbr->bw_probe_up_cnt = ~0U; + bbr->bw_probe_up_acks = 0; + bbr->bw_probe_up_rounds = 0; @@ -2818,31 +2731,43 @@ index 000000000000..85f8052144d1 + bbr->bw_probe_samples = 0; + bbr->prev_probe_too_high = 0; + bbr->ecn_eligible = 0; -+ bbr->ecn_alpha = bbr->params.ecn_alpha_init; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); + bbr->alpha_last_delivered = 0; + bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; + -+ tp->fast_ack_mode = min_t(u32, 0x2U, bbr_fast_ack_mode); ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 
1 : 0; + -+ if ((tp->ecn_flags & TCP_ECN_OK) && bbr_ecn_enable) ++ if (bbr_can_use_ecn(sk)) + tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; +} + -+/* Core TCP stack informs us that the given skb was just marked lost. */ -+static void bbr2_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); -+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); -+ struct rate_sample rs; + -+ /* Capture "current" data over the full round trip of loss, -+ * to have a better chance to see the full capacity of the path. -+ */ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ + if (!bbr->loss_in_round) /* first loss in this round trip? */ + bbr->loss_round_delivered = tp->delivered; /* set round trip */ + bbr->loss_in_round = 1; + bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); + + if (!bbr->bw_probe_samples) + return; /* not an skb sent while probing for bandwidth */ @@ -2852,178 +2777,214 @@ index 000000000000..85f8052144d1 + * estimates what happened in the flight leading up to this lost skb, + * then see if the loss rate went too high, and if so at which packet. + */ -+ memset(&rs, 0, sizeof(rs)); + rs.tx_in_flight = scb->tx.in_flight; + rs.lost = tp->lost - scb->tx.lost; + rs.is_app_limited = scb->tx.is_app_limited; -+ if (bbr2_is_inflight_too_high(sk, &rs)) { -+ rs.tx_in_flight = bbr2_inflight_hi_from_lost_skb(sk, &rs, skb); -+ bbr2_handle_inflight_too_high(sk, &rs); ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); + } +} + -+/* Revert short-term model if current loss recovery event was spurious. */ -+static u32 bbr2_undo_cwnd(struct sock *sk) ++static void bbr_run_loss_probe_recovery(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; + -+ bbr->debug.undo = 1; -+ bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ -+ bbr->full_bw_cnt = 0; ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
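
As an aside (a standalone sketch, not part of the patch): both bbr_skb_marked_lost()
and the TLP recovery path above feed a reconstructed rate sample into
bbr_is_inflight_too_high(), which boils down to a 2% loss-rate test against the
data that was in flight plus a 50% CE-mark test against delivered packets (the
loss_thresh and ecn_thresh defaults in this patch). A user-space sketch of those
two threshold checks, with invented sample numbers:

#include <stdint.h>
#include <stdio.h>

#define BBR_SCALE 8
#define BBR_UNIT  (1 << BBR_SCALE)

/* "Too high" if losses exceed 2% of tx_in_flight or CE marks exceed 50%
 * of delivered packets, mirroring bbr_is_inflight_too_high().
 */
static int inflight_too_high(uint32_t lost, uint32_t tx_in_flight,
                             uint32_t delivered_ce, uint32_t delivered)
{
        uint32_t loss_thresh = BBR_UNIT * 2 / 100;      /* 2% */
        uint32_t ecn_thresh = BBR_UNIT / 2;             /* 50% */

        if (lost && tx_in_flight &&
            lost > ((uint64_t)tx_in_flight * loss_thresh >> BBR_SCALE))
                return 1;
        if (delivered_ce && delivered &&
            delivered_ce > ((uint64_t)delivered * ecn_thresh >> BBR_SCALE))
                return 1;
        return 0;
}

int main(void)
{
        printf("3 lost out of 100 in flight: %s\n",
               inflight_too_high(3, 100, 0, 0) ? "too high" : "ok");
        printf("1 lost out of 100 in flight: %s\n",
               inflight_too_high(1, 100, 0, 0) ? "too high" : "ok");
        return 0;
}
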
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ + bbr->loss_in_round = 0; + -+ if (!bbr->params.undo) -+ return tp->snd_cwnd; -+ + /* Revert to cwnd and other state saved before loss episode. */ + bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); + bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); + bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ + return bbr->prior_cwnd; +} + +/* Entering loss recovery, so save state for when we undo recovery. */ -+static u32 bbr2_ssthresh(struct sock *sk) -+{ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { + struct bbr *bbr = inet_csk_ca(sk); + -+ bbr_save_cwnd(sk); + bbr_save_cwnd(sk); + /* For undo, save state that adapts based on loss signal. */ + bbr->undo_bw_lo = bbr->bw_lo; + bbr->undo_inflight_lo = bbr->inflight_lo; + bbr->undo_inflight_hi = bbr->inflight_hi; -+ return tcp_sk(sk)->snd_ssthresh; -+} -+ -+static enum tcp_bbr2_phase bbr2_get_phase(struct bbr *bbr) + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) +{ + switch (bbr->mode) { + case BBR_STARTUP: -+ return BBR2_PHASE_STARTUP; ++ return BBR_PHASE_STARTUP; + case BBR_DRAIN: -+ return BBR2_PHASE_DRAIN; ++ return BBR_PHASE_DRAIN; + case BBR_PROBE_BW: + break; + case BBR_PROBE_RTT: -+ return BBR2_PHASE_PROBE_RTT; ++ return BBR_PHASE_PROBE_RTT; + default: -+ return BBR2_PHASE_INVALID; ++ return BBR_PHASE_INVALID; + } + switch (bbr->cycle_idx) { + case BBR_BW_PROBE_UP: -+ return BBR2_PHASE_PROBE_BW_UP; ++ return BBR_PHASE_PROBE_BW_UP; + case BBR_BW_PROBE_DOWN: -+ return BBR2_PHASE_PROBE_BW_DOWN; ++ return BBR_PHASE_PROBE_BW_DOWN; + case BBR_BW_PROBE_CRUISE: -+ return BBR2_PHASE_PROBE_BW_CRUISE; ++ return BBR_PHASE_PROBE_BW_CRUISE; + case BBR_BW_PROBE_REFILL: -+ return BBR2_PHASE_PROBE_BW_REFILL; ++ return BBR_PHASE_PROBE_BW_REFILL; + default: -+ return BBR2_PHASE_INVALID; ++ return BBR_PHASE_INVALID; + } +} + -+static size_t bbr2_get_info(struct sock *sk, u32 ext, int *attr, + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) + union tcp_cc_info *info) -+{ -+ if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || -+ ext & (1 << (INET_DIAG_VEGASINFO - 1))) { -+ struct bbr *bbr = inet_csk_ca(sk); + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; + u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); + u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); + u64 bw_lo = bbr->bw_lo == ~0U ? 
+ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; + -+ memset(&info->bbr2, 0, sizeof(info->bbr2)); -+ info->bbr2.bbr_bw_lsb = (u32)bw; -+ info->bbr2.bbr_bw_msb = (u32)(bw >> 32); -+ info->bbr2.bbr_min_rtt = bbr->min_rtt_us; -+ info->bbr2.bbr_pacing_gain = bbr->pacing_gain; -+ info->bbr2.bbr_cwnd_gain = bbr->cwnd_gain; -+ info->bbr2.bbr_bw_hi_lsb = (u32)bw_hi; -+ info->bbr2.bbr_bw_hi_msb = (u32)(bw_hi >> 32); -+ info->bbr2.bbr_bw_lo_lsb = (u32)bw_lo; -+ info->bbr2.bbr_bw_lo_msb = (u32)(bw_lo >> 32); -+ info->bbr2.bbr_mode = bbr->mode; -+ info->bbr2.bbr_phase = (__u8)bbr2_get_phase(bbr); -+ info->bbr2.bbr_version = (__u8)2; -+ info->bbr2.bbr_inflight_lo = bbr->inflight_lo; -+ info->bbr2.bbr_inflight_hi = bbr->inflight_hi; -+ info->bbr2.bbr_extra_acked = bbr_extra_acked(sk); -+ *attr = INET_DIAG_BBRINFO; -+ return sizeof(info->bbr2); -+ } -+ return 0; -+} -+ -+static void bbr2_set_state(struct sock *sk, u8 new_state) -+{ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { + struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (new_state == TCP_CA_Loss) { -+ struct rate_sample rs = { .losses = 1 }; -+ struct bbr_context ctx = { 0 }; -+ -+ bbr->prev_ca_state = TCP_CA_Loss; -+ bbr->full_bw = 0; -+ if (!bbr2_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { + /* bbr_adapt_lower_bounds() needs cwnd before + * we suffered an RTO, to update inflight_lo: + */ + bbr->inflight_lo = -+ max(tp->snd_cwnd, bbr->prior_cwnd); ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); + } -+ bbr_debug(sk, 0, &rs, &ctx); + } else if (bbr->prev_ca_state == TCP_CA_Loss && + new_state != TCP_CA_Loss) { -+ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); -+ bbr->try_fast_path = 0; /* bound cwnd using latest model */ -+ } -+} ++ bbr_exit_loss_recovery(sk); + } + } + + -+static struct tcp_congestion_ops tcp_bbr2_cong_ops __read_mostly = { + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, + .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, -+ .name = "bbr2", -+ .owner = THIS_MODULE, -+ .init = 
bbr2_init, -+ .cong_control = bbr2_main, -+ .sndbuf_expand = bbr_sndbuf_expand, -+ .skb_marked_lost = bbr2_skb_marked_lost, -+ .undo_cwnd = bbr2_undo_cwnd, -+ .cwnd_event = bbr_cwnd_event, -+ .ssthresh = bbr2_ssthresh, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, + .tso_segs = bbr_tso_segs, -+ .get_info = bbr2_get_info, -+ .set_state = bbr2_set_state, -+}; -+ -+static int __init bbr_register(void) -+{ -+ BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); -+ return tcp_register_congestion_control(&tcp_bbr2_cong_ops); -+} -+ -+static void __exit bbr_unregister(void) -+{ -+ tcp_unregister_congestion_control(&tcp_bbr2_cong_ops); -+} -+ -+module_init(bbr_register); -+module_exit(bbr_unregister); -+ -+MODULE_AUTHOR("Van Jacobson "); -+MODULE_AUTHOR("Neal Cardwell "); -+MODULE_AUTHOR("Yuchung Cheng "); -+MODULE_AUTHOR("Soheil Hassas Yeganeh "); + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1160,10 +2361,11 @@ BTF_SET8_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + #endif + #endif +@@ -1198,5 +2400,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); +MODULE_AUTHOR("Priyaranjan Jha "); +MODULE_AUTHOR("Yousuk Seung "); +MODULE_AUTHOR("Kevin Yang "); +MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); + -+MODULE_LICENSE("Dual BSD/GPL"); -+MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 1b34050a7538..66d40449b3f4 100644 --- a/net/ipv4/tcp_cong.c @@ -3037,7 +2998,7 @@ index 1b34050a7538..66d40449b3f4 100644 icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c -index 57c8af1859c1..3193ef5aac61 100644 +index 57c8af1859c1..2195ba488142 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -349,7 +349,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) @@ -3089,7 +3050,37 @@ index 57c8af1859c1..3193ef5aac61 100644 /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK * code can come after this skb later on it's better to keep -@@ -3819,6 +3835,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -3688,7 +3704,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. 
Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3705,6 +3722,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3715,6 +3733,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. ++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -3819,6 +3842,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); @@ -3097,7 +3088,16 @@ index 57c8af1859c1..3193ef5aac61 100644 /* ts_recent update must be made after we are sure that the packet * is in window. -@@ -3917,6 +3934,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -3893,7 +3917,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_rack_update_reo_wnd(sk, &rs); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -3917,6 +3941,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tcp_newly_delivered(sk, delivered, flag); lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); @@ -3105,7 +3105,16 @@ index 57c8af1859c1..3193ef5aac61 100644 tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); -@@ -5527,13 +5545,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) +@@ -3936,7 +3961,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5527,13 +5552,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && @@ -3122,11 +3131,46 @@ index 57c8af1859c1..3193ef5aac61 100644 /* We ACK each frame or... 
*/ tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index c8f2aa003387..fdf51e436899 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -440,6 +440,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 2cb39b6dad02..703d166c1778 100644 +index 51d8638d4b4c..2fb064057868 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c -@@ -377,7 +377,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, +@@ -325,10 +325,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -340,6 +339,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -377,7 +379,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, th->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } @@ -3136,7 +3180,7 @@ index 2cb39b6dad02..703d166c1778 100644 /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } -@@ -1532,7 +1533,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, +@@ -1532,7 +1535,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; @@ -3145,23 +3189,38 @@ index 2cb39b6dad02..703d166c1778 100644 long limit; int nlen; u8 flags; -@@ -1607,6 +1608,15 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, +@@ -1607,6 +1610,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (diff) tcp_adjust_pcount(sk, skb, diff); + -+ /* Set buff tx.in_flight as if buff were sent by itself. */ + inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; -+ if (WARN_ONCE(inflight_prev < 0, -+ "inconsistent: tx.in_flight: %u old_factor: %d", -+ TCP_SKB_CB(skb)->tx.in_flight, old_factor)) ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); + inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ + TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + + tcp_skb_pcount(buff); } /* Link BUFF into the send queue. 
*/ -@@ -1982,13 +1992,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, +@@ -1982,13 +2009,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; @@ -3176,11 +3235,11 @@ index 2cb39b6dad02..703d166c1778 100644 + tso_segs = ca_ops->tso_segs ? + ca_ops->tso_segs(sk, mss_now) : + tcp_tso_autosize(sk, mss_now, -+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs)); ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); return min_t(u32, tso_segs, sk->sk_gso_max_segs); } -@@ -2674,6 +2683,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2674,6 +2700,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); @@ -3188,8 +3247,16 @@ index 2cb39b6dad02..703d166c1778 100644 goto repair; /* Skip network transmission */ } +@@ -2886,6 +2913,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c -index a8f6d9d06f2e..a8b4c9504570 100644 +index a8f6d9d06f2e..8737f2134648 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -34,6 +34,24 @@ @@ -3237,8 +3304,8 @@ index a8f6d9d06f2e..a8b4c9504570 100644 rs->prior_mstamp = scb->tx.delivered_mstamp; rs->is_app_limited = scb->tx.is_app_limited; rs->is_retrans = scb->sacked & TCPCB_RETRANS; - rs->last_end_seq = scb->end_seq; + rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; /* Record send time of most recently ACKed packet: */ tp->first_tx_mstamp = tx_tstamp; @@ -3283,10 +3350,10 @@ index 470f581eedd4..2b8d7e94a369 100644 -- 2.41.0 -From 15fb201317f2aaf349c0929478acd92a068be6d1 Mon Sep 17 00:00:00 2001 +From 883b0afdb45d6c4944bf6b917196870726ce0caa Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 10 Jul 2023 17:09:03 +0200 -Subject: [PATCH 2/7] cachy +Date: Mon, 31 Jul 2023 12:19:39 +0200 +Subject: [PATCH 2/5] cachy Signed-off-by: Peter Jung --- @@ -3367,10 +3434,10 @@ index a1457995fd41..0b33c7960259 100644 Safety option to keep boot IRQs enabled. This should never be necessary. 
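Since the bbr3 series above re-registers the congestion control under the plain name "bbr" (replacing "bbr2"), per-socket selection from user space works the usual way. A minimal user-space sketch, not part of the patch, assuming the rebuilt tcp_bbr module is loaded and permitted on the system:

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = socket(AF_INET, SOCK_STREAM, 0);
            if (fd < 0)
                    return 1;
            /* Request the "bbr" congestion control for this socket only;
             * setsockopt() fails (e.g. ENOENT) if "bbr" is unavailable. */
            static const char name[] = "bbr";
            if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
                           name, sizeof(name)) < 0) {
                    close(fd);
                    return 1;
            }
            close(fd);
            return 0;
    }

The system-wide default is still chosen through the net.ipv4.tcp_congestion_control sysctl; the sketch above only overrides it for one socket.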
diff --git a/Makefile b/Makefile -index 47690c28456a..79abb476e260 100644 +index 653238528aac..32ab6e225c91 100644 --- a/Makefile +++ b/Makefile -@@ -819,6 +819,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) +@@ -831,6 +831,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE KBUILD_CFLAGS += -O2 KBUILD_RUSTFLAGS += -Copt-level=2 @@ -3380,7 +3447,7 @@ index 47690c28456a..79abb476e260 100644 else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os KBUILD_RUSTFLAGS += -Copt-level=s -@@ -1064,11 +1067,6 @@ KBUILD_CFLAGS += -fno-strict-overflow +@@ -1076,11 +1079,6 @@ KBUILD_CFLAGS += -fno-strict-overflow # Make sure -fstack-check isn't enabled (like gentoo apparently did) KBUILD_CFLAGS += -fno-stack-check @@ -9110,7 +9177,7 @@ index 000000000000..77a6677ec19e +MODULE_DESCRIPTION("Steam Deck ACPI platform driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mm.h b/include/linux/mm.h -index 2dd73e4f3d8e..e0706755c7c3 100644 +index 406ab9ea818f..17794c213055 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -191,7 +191,7 @@ static inline void __mm_zero_struct_page(struct page *page) @@ -9290,7 +9357,7 @@ index d2e12b6d2b18..95ca80492a37 100644 if (err) goto bad_unshare_out; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index a80a73909dc2..b097a9f4d817 100644 +index b3e25be58e2b..2c335df30171 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -69,9 +69,13 @@ @@ -9506,2599 +9573,10 @@ index 1080209a568b..f76aa8268215 100644 -- 2.41.0 -From 924ab3ea3113d6e31ad314896faee2c528d917ac Mon Sep 17 00:00:00 2001 +From 0a48385ee928e0a277eb626a86efe9d4aec339f3 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 10 Jul 2023 17:09:16 +0200 -Subject: [PATCH 3/7] ddcci - -Signed-off-by: Peter Jung ---- - drivers/char/Kconfig | 11 + - drivers/char/Makefile | 1 + - drivers/char/ddcci.c | 1909 +++++++++++++++++++++ - drivers/video/backlight/Kconfig | 11 + - drivers/video/backlight/Makefile | 1 + - drivers/video/backlight/ddcci-backlight.c | 413 +++++ - include/linux/ddcci.h | 164 ++ - 7 files changed, 2510 insertions(+) - create mode 100644 drivers/char/ddcci.c - create mode 100644 drivers/video/backlight/ddcci-backlight.c - create mode 100644 include/linux/ddcci.h - -diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig -index 625af75833fc..3930aeb8e17b 100644 ---- a/drivers/char/Kconfig -+++ b/drivers/char/Kconfig -@@ -422,4 +422,15 @@ config ADI - and SSM (Silicon Secured Memory). Intended consumers of this - driver include crash and makedumpfile. - -+config DDCCI -+ tristate "DDCCI display protocol support" -+ depends on I2C -+ help -+ Display Data Channel Command Interface is an -+ interface that allows the kernel to "talk" -+ to most displays made after 2005. Check your -+ display's specification to see if it has -+ support for this. This depends on I2C to -+ compile. -+ - endmenu -diff --git a/drivers/char/Makefile b/drivers/char/Makefile -index c5f532e412f1..b12476014311 100644 ---- a/drivers/char/Makefile -+++ b/drivers/char/Makefile -@@ -3,6 +3,7 @@ - # Makefile for the kernel character device drivers. 
- # - -+obj-$(CONFIG_DDCCI) += ddcci.o - obj-y += mem.o random.o - obj-$(CONFIG_TTY_PRINTK) += ttyprintk.o - obj-y += misc.o -diff --git a/drivers/char/ddcci.c b/drivers/char/ddcci.c -new file mode 100644 -index 000000000000..129aede43651 ---- /dev/null -+++ b/drivers/char/ddcci.c -@@ -0,0 +1,1909 @@ -+/* -+ * DDC/CI sub-bus driver -+ * -+ * Copyright (c) 2015 Christoph Grenz -+ */ -+ -+/* -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the Free -+ * Software Foundation; either version 2 of the License, or (at your option) -+ * any later version. -+ */ -+ -+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#define DDCCI_RECV_BUFFER_SIZE 130 -+#define DEVICE_NAME "ddcci" -+#define DDCCI_MAX_CAP_CHUNKS 200 -+ -+static unsigned int delay = 60; -+static unsigned short autoprobe_addrs[127] = {0xF0, 0xF2, 0xF4, 0xF6, 0xF8}; -+static int autoprobe_addr_count = 5; -+ -+static dev_t ddcci_cdev_first; -+static dev_t ddcci_cdev_next; -+static dev_t ddcci_cdev_end; -+static DEFINE_SEMAPHORE(core_lock, 1); -+ -+struct bus_type ddcci_bus_type; -+EXPORT_SYMBOL_GPL(ddcci_bus_type); -+static bool ddcci_bus_registered; -+ -+/* Assert neccessary string array sizes */ -+#ifndef sizeof_field -+# define sizeof_field(t,m) FIELD_SIZEOF(t,m) -+#endif -+static_assert(sizeof_field(struct ddcci_device, prot) > 8); -+static_assert(sizeof_field(struct ddcci_device, type) > 8); -+static_assert(sizeof_field(struct ddcci_device, model) > 8); -+static_assert(sizeof_field(struct ddcci_device, vendor) > 8); -+static_assert(sizeof_field(struct ddcci_device, module) > 8); -+ -+/* Internal per-i2c-client driver data */ -+struct ddcci_bus_drv_data { -+ unsigned long quirks; -+ struct i2c_client *i2c_dev; -+ struct semaphore sem; -+ unsigned char recv_buffer[DDCCI_RECV_BUFFER_SIZE]; -+}; -+ -+/* Replace non-alphanumeric characters in a string (used for modalias) */ -+static void ddcci_modalias_clean(char *string, size_t n, char replacement) -+{ -+ int i; -+ for (i = 0; i < n; ++i) { -+ char c = string[i]; -+ if (c == 0) { -+ return; -+ } else if (c < '0' || (c > '9' && c < 'A') || (c > 'Z' && c < 'a') || c > 'z') { -+ string[i] = replacement; -+ } -+ } -+} -+ -+/* Write a message to the DDC/CI bus using i2c_smbus_write_byte() */ -+static int __ddcci_write_bytewise(struct i2c_client *client, unsigned char addr, -+ bool p_flag, const unsigned char * __restrict buf, -+ unsigned char len) -+{ -+ int ret = 0; -+ unsigned char outer_addr = (unsigned char)(client->addr << 1); -+ unsigned xor = outer_addr; /* initial xor value */ -+ -+ /* Consistency checks */ -+ if (len > 127) -+ return -EINVAL; -+ -+ /* Special case: sender to 0x6E is always 0x51 */ -+ if (addr == DDCCI_DEFAULT_DEVICE_ADDR) { -+ addr = DDCCI_HOST_ADDR_ODD; -+ } else { -+ /* When sending the odd address is used */ -+ addr = addr | 1; -+ } -+ -+ /* first byte: sender address */ -+ xor ^= addr; -+ ret = i2c_smbus_write_byte(client, addr); -+ if (ret < 0) -+ return ret; -+ -+ /* second byte: protocol flag and message size */ -+ xor ^= ((p_flag << 7) | len); -+ ret = i2c_smbus_write_byte(client, (p_flag << 7)|len); -+ if (ret < 0) -+ return ret; -+ -+ /* send payload */ -+ while (len--) { -+ xor ^= (*buf); -+ ret = i2c_smbus_write_byte(client, (*buf)); -+ if (ret < 0) -+ return ret; -+ buf++; -+ } -+ -+ /* send checksum */ 
-+ ret = i2c_smbus_write_byte(client, xor); -+ return ret; -+} -+ -+/* Write a message to the DDC/CI bus using i2c_master_send() */ -+static int __ddcci_write_block(struct i2c_client *client, unsigned char addr, -+ unsigned char *sendbuf, bool p_flag, -+ const unsigned char *data, unsigned char len) -+{ -+ unsigned char outer_addr = (unsigned char)(client->addr << 1); -+ unsigned xor = outer_addr; /* initial xor value */ -+ unsigned char *ptr = sendbuf; -+ -+ /* Consistency checks */ -+ if (len > 127) -+ return -EINVAL; -+ -+ /* Special case: sender to 0x6E is always 0x51 */ -+ if (addr == DDCCI_DEFAULT_DEVICE_ADDR) { -+ addr = DDCCI_HOST_ADDR_ODD; -+ } else { -+ /* When sending the odd address is used */ -+ addr = addr | 1; -+ } -+ -+ /* first byte: sender address */ -+ xor ^= addr; -+ *(ptr++) = addr; -+ /* second byte: protocol flag and message size */ -+ xor ^= ((p_flag << 7) | len); -+ *(ptr++) = (p_flag << 7)|len; -+ /* payload */ -+ while (len--) { -+ xor ^= (*data); -+ *(ptr++) = (*data); -+ data++; -+ } -+ /* checksum */ -+ (*ptr) = xor; -+ -+ /* Send it */ -+ return i2c_master_send(client, sendbuf, ptr - sendbuf + 1); -+} -+ -+/* -+ * Write a message to the DDC/CI bus. -+ * -+ * You must hold the bus semaphore when calling this function. -+ */ -+static int ddcci_write(struct i2c_client *client, unsigned char addr, -+ bool p_flag, const unsigned char *data, -+ unsigned char len) -+{ -+ struct ddcci_bus_drv_data *drv_data; -+ unsigned char *sendbuf; -+ int ret; -+ -+ drv_data = i2c_get_clientdata(client); -+ -+ -+ pr_debug("sending to %d:%02x:%02x: %*ph\n", client->adapter->nr, -+ client->addr << 1, addr, len, data); -+ if (drv_data->quirks & DDCCI_QUIRK_WRITE_BYTEWISE) { -+ ret = __ddcci_write_bytewise(client, addr, p_flag, data, len); -+ } else { -+ sendbuf = drv_data->recv_buffer; -+ ret = __ddcci_write_block(client, addr, sendbuf, p_flag, data, len); -+ } -+ -+ return ret; -+} -+ -+/* -+ * Read a response from the DDC/CI bus with headers directly into a buffer. -+ * Always check for DDCCI_QUIRK_SKIP_FIRST_BYTE when using this function. -+ * The returned length contains the whole unmodified response. -+ * If -EMSGSIZE is returned, the buffer contains the response up to `len`. -+ * If any other negative error code is returned, the buffer content is -+ * unspecified. -+ */ -+static int __ddcci_read(struct i2c_client *client, unsigned char addr, -+ bool p_flag, unsigned long quirks, unsigned char *buf, -+ unsigned char len) -+{ -+ int i, payload_len, packet_length, ret; -+ unsigned char xor = DDCCI_HOST_ADDR_EVEN; -+ -+ /* Consistency checks */ -+ if (len < 3) -+ return -EINVAL; -+ -+ /* Read frame */ -+ ret = i2c_master_recv(client, buf, len); -+ if (ret < 0) -+ goto out_err; -+ packet_length = ret; -+ -+ /* Skip first byte if quirk active */ -+ if ((quirks & DDCCI_QUIRK_SKIP_FIRST_BYTE) && ret > 0 && len > 0) { -+ ret--; -+ len--; -+ buf++; -+ } -+ -+ /* If answer too short (= incomplete) break out */ -+ if (ret < 3) { -+ ret = -EIO; -+ goto out_err; -+ } -+ -+ /* validate first byte */ -+ if (unlikely(buf[0] != addr)) { -+ ret = (buf[0] == '\0') ? 
-EAGAIN : -EIO; -+ goto out_err; -+ } -+ -+ /* validate second byte (protocol flag) */ -+ if (unlikely((buf[1] & 0x80) != (p_flag << 7))) { -+ if (!p_flag || !(quirks & DDCCI_QUIRK_NO_PFLAG)) { -+ ret = -EIO; -+ goto out_err; -+ } -+ } -+ -+ /* get and check payload length */ -+ payload_len = buf[1] & 0x7F; -+ if (3+payload_len > packet_length) -+ return -EBADMSG; -+ if (3+payload_len > len) -+ return -EMSGSIZE; -+ -+ /* calculate checksum */ -+ for (i = 0; i < 3+payload_len; i++) -+ xor ^= buf[i]; -+ -+ /* verify checksum */ -+ if (xor != 0) { -+ dev_err(&client->dev, "invalid DDC/CI response, corrupted data - xor is 0x%02x, length 0x%02x\n", -+ xor, payload_len); -+ ret = -EBADMSG; -+ goto out_err; -+ } -+ -+ /* return result */ -+ ret = payload_len+3+((quirks & DDCCI_QUIRK_SKIP_FIRST_BYTE)?1:0); -+ -+out_err: -+ return ret; -+} -+ -+/* -+ * Read a response from the DDC/CI bus -+ * -+ * You must hold the bus semaphore when calling this function. -+ */ -+static int ddcci_read(struct i2c_client *client, unsigned char addr, -+ bool p_flag, unsigned char *buf, unsigned char len) -+{ -+ struct ddcci_bus_drv_data *drv_data; -+ unsigned char *recvbuf; -+ int ret; -+ -+ drv_data = i2c_get_clientdata(client); -+ recvbuf = drv_data->recv_buffer; -+ -+ /* Read frame */ -+ ret = __ddcci_read(client, addr, p_flag, -+ drv_data->quirks, recvbuf, DDCCI_RECV_BUFFER_SIZE); -+ if (ret < 0) -+ return ret; -+ -+ if (drv_data->quirks & DDCCI_QUIRK_SKIP_FIRST_BYTE) -+ recvbuf++; -+ -+ /* return result */ -+ if (buf) { -+ if (ret > 3) { -+ ret = ret-3; -+ /* copy to caller buffer */ -+ memcpy(buf, &recvbuf[2], (ret < len) ? ret : len); -+ -+ if (ret > len) { -+ /* if message was truncated, return -EMSGSIZE */ -+ pr_debug("received from %d:%02x:%02x: [%u/%u] %*ph ...\n", -+ client->adapter->nr, client->addr << 1, -+ addr, ret, len, len, buf); -+ ret = -EMSGSIZE; -+ } else { -+ pr_debug("received from %d:%02x:%02x: [%u/%u] %*ph\n", -+ client->adapter->nr, client->addr << 1, -+ addr, ret, len, ret, buf); -+ } -+ } -+ } -+ if (!(drv_data->quirks & DDCCI_QUIRK_WRITE_BYTEWISE)) { -+ /* second read to clear buffers, needed on some devices */ -+ __ddcci_read(client, addr, true, drv_data->quirks, recvbuf, 1); -+ } -+ return ret; -+} -+ -+/* Request the capability string for a device and put it into buf */ -+static int ddcci_get_caps(struct i2c_client *client, unsigned char addr, -+ unsigned char *buf, unsigned int len) -+{ -+ int result = 0, counter = 0, offset = 0; -+ unsigned char cmd[3] = { DDCCI_COMMAND_CAPS, 0x00, 0x00 }; -+ unsigned char *chunkbuf = kzalloc(35, GFP_KERNEL); -+ -+ if (!chunkbuf) -+ return -ENOMEM; -+ -+ do { -+ /* Send command */ -+ result = ddcci_write(client, addr, true, cmd, sizeof(cmd)); -+ if (result < 0) -+ goto err_free; -+ msleep(delay); -+ /* read result chunk */ -+ result = ddcci_read(client, addr, true, chunkbuf, -+ (len > 32) ? 35 : len+3); -+ if (result < 0) -+ goto err_free; -+ -+ if (result > 0) { -+ /* check chunk header */ -+ if (chunkbuf[0] != DDCCI_REPLY_CAPS) { -+ result = -EIO; -+ goto err_free; -+ } -+ if (chunkbuf[1] != cmd[1] || chunkbuf[2] != cmd[2]) { -+ result = -EIO; -+ goto err_free; -+ } -+ if (result < 3) { -+ result = -EIO; -+ goto err_free; -+ } -+ memcpy(buf, chunkbuf+3, min((unsigned int)result-3, len)); -+ -+ counter++; -+ /* adjust offset, etc. */ -+ offset += result-3; -+ len -= result-3; -+ buf += result-3; -+ cmd[1] = offset >> 8; -+ cmd[2] = offset & 0xFF; -+ /* Another superfluous read to make some devices happy... 
*/ -+ ddcci_read(client, addr, true, NULL, 2); -+ } -+ } while (result > 3 && counter < DDCCI_MAX_CAP_CHUNKS); -+ -+ kfree(chunkbuf); -+ return offset+result-3; -+err_free: -+ kfree(chunkbuf); -+ return result; -+} -+ -+/* -+ * Request the device identification and put it into buf. -+ * -+ * Also detects all communication quirks and sets the corresponding flags -+ * in the ddcci_bus_drv_data structure associated with client. -+ * -+ * The identification command will fail on most DDC devices, as it is optional -+ * to support, but even the "failed" response suffices to detect quirks. -+ */ -+static int ddcci_identify_device(struct i2c_client *client, unsigned char addr, -+ unsigned char *buf, unsigned char len) -+{ -+ int i, payload_len, ret = -ENODEV; -+ unsigned long quirks; -+ unsigned char cmd[1] = { DDCCI_COMMAND_ID }; -+ unsigned char *buffer; -+ unsigned char xor = DDCCI_HOST_ADDR_EVEN; -+ struct ddcci_bus_drv_data *bus_drv_data; -+ -+ bus_drv_data = i2c_get_clientdata(client); -+ quirks = bus_drv_data->quirks; -+ buffer = bus_drv_data->recv_buffer; -+ -+ /* Send Identification command */ -+ if (!(quirks & DDCCI_QUIRK_WRITE_BYTEWISE)) { -+ ret = __ddcci_write_block(client, addr, buffer, true, cmd, sizeof(cmd)); -+ dev_dbg(&client->dev, -+ "[%02x:%02x] writing identification command in block mode: %d\n", -+ client->addr << 1, addr, ret); -+ if ((ret == -ENXIO) -+ && i2c_check_functionality(client->adapter, -+ I2C_FUNC_SMBUS_WRITE_BYTE)) { -+ quirks |= DDCCI_QUIRK_WRITE_BYTEWISE; -+ dev_info(&client->dev, -+ "DDC/CI bus quirk detected: writes must be done bytewise\n"); -+ /* Some devices need writing twice after a failed blockwise write */ -+ __ddcci_write_bytewise(client, addr, true, cmd, sizeof(cmd)); -+ msleep(delay); -+ } -+ } -+ if (ret < 0 && (quirks & DDCCI_QUIRK_WRITE_BYTEWISE)) { -+ ret = __ddcci_write_bytewise(client, addr, true, cmd, sizeof(cmd)); -+ dev_dbg(&client->dev, -+ "[%02x:%02x] writing identification command in bytewise mode: %d\n", -+ client->addr << 1, addr, ret); -+ } -+ if (ret < 0) -+ return -ENODEV; -+ -+ /* Wait */ -+ msleep(delay); -+ -+ /* Receive response */ -+ ret = i2c_master_recv(client, buffer, DDCCI_RECV_BUFFER_SIZE); -+ if (ret < 0) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] receiving identification response resulted in errno %d\n", -+ client->addr << 1, addr, ret); -+ return ret; -+ } -+ -+ if (ret == 0) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] no identification response received\n", -+ client->addr << 1, addr); -+ return ret; -+ } -+ -+ /* Skip first byte if quirk already active */ -+ if (quirks & DDCCI_QUIRK_SKIP_FIRST_BYTE && ret > 1) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] doubled first byte quirk in effect\n", -+ client->addr << 1, addr); -+ ret--; -+ buffer++; -+ } -+ -+ /* If answer too short (= incomplete) break out */ -+ if (ret < 3) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response is too short (%d bytes)\n", -+ client->addr << 1, addr, ret); -+ return -EIO; -+ } -+ -+ /* validate first byte */ -+ if (buffer[0] != addr) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response: %*ph\n", -+ client->addr << 1, addr, (ret > 32 ? 
32 : ret), buffer); -+ -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response invalid (expected first byte %02x, got %02x)\n", -+ client->addr << 1, addr, addr, buffer[0]); -+ return -ENODEV; -+ } -+ -+ /* Check if first byte is doubled (QUIRK_SKIP_FIRST_BYTE) */ -+ if (!(quirks & DDCCI_QUIRK_SKIP_FIRST_BYTE)) { -+ if (buffer[0] == buffer[1]) { -+ quirks |= DDCCI_QUIRK_SKIP_FIRST_BYTE; -+ dev_info(&client->dev, -+ "DDC/CI bus quirk detected: doubled first byte on read\n"); -+ ret--; -+ buffer++; -+ if (ret < 3) -+ return -EIO; -+ } -+ } -+ -+ /* validate second byte (protocol flag) */ -+ if ((buffer[1] & 0x80) != 0x80 && !(quirks & DDCCI_QUIRK_NO_PFLAG)) { -+ dev_info(&client->dev, -+ "DDC/CI bus quirk detected: device omits protocol flag on responses\n"); -+ quirks |= DDCCI_QUIRK_NO_PFLAG; -+ } -+ -+ /* get and check payload length */ -+ payload_len = buffer[1] & 0x7F; -+ if (3+payload_len > ret) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response: %*ph ...\n", -+ client->addr << 1, addr, ret, buffer); -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response was truncated (expected %d bytes, got %d)\n", -+ client->addr << 1, addr, 3+payload_len, ret); -+ return -EBADMSG; -+ } -+ -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response: %*ph\n", -+ client->addr << 1, addr, 3+payload_len, buffer); -+ -+ /* calculate checksum */ -+ for (i = 0; i < 3+payload_len; i++) -+ xor ^= buffer[i]; -+ -+ /* verify checksum */ -+ if (xor != 0) { -+ dev_err(&client->dev, -+ "[%02x:%02x] invalid DDC/CI response, corrupted data - xor is 0x%02x, length 0x%02x\n", -+ client->addr << 1, addr, xor, payload_len); -+ return -EBADMSG; -+ } -+ -+ /* save quirks */ -+ bus_drv_data->quirks = quirks; -+ -+ /* return result */ -+ if (payload_len <= len) { -+ ret = payload_len; -+ memcpy(buf, &buffer[2], payload_len); -+ } else { -+ ret = -EMSGSIZE; -+ memcpy(buf, &buffer[2], len); -+ } -+ return ret; -+} -+ -+/* Character device */ -+ -+/* Data structure for an open file handle */ -+struct ddcci_fp_data { -+ struct ddcci_device *dev; -+ bool exclusive; -+ unsigned char buffer[129]; -+}; -+ -+/* Called when the character device is opened */ -+static int ddcci_cdev_open(struct inode *inode, struct file *filp) -+{ -+ struct ddcci_device *dev = container_of(inode->i_cdev, -+ struct ddcci_device, cdev); -+ struct ddcci_fp_data *fp_data = NULL; -+ -+ fp_data = kzalloc(sizeof(struct ddcci_fp_data), GFP_KERNEL); -+ -+ if (!fp_data) -+ return -ENOMEM; -+ -+ fp_data->exclusive = filp->f_flags & O_EXCL; -+ -+ if (fp_data->exclusive) { -+ if (down_write_trylock(&dev->cdev_sem) == 0) { -+ kfree(fp_data); -+ return -EBUSY; -+ } -+ } else { -+ if (down_read_trylock(&dev->cdev_sem) == 0) { -+ kfree(fp_data); -+ return -EBUSY; -+ } -+ } -+ -+ fp_data->dev = dev; -+ filp->private_data = fp_data; -+ -+ return 0; -+} -+ -+/* Called when the character device is closed */ -+static int ddcci_cdev_close(struct inode *inode, struct file *filp) -+{ -+ struct ddcci_fp_data *fp_data = filp->private_data; -+ struct ddcci_device *dev = fp_data->dev; -+ -+ if (fp_data->exclusive) -+ up_write(&dev->cdev_sem); -+ else -+ up_read(&dev->cdev_sem); -+ -+ filp->private_data = NULL; -+ kfree(fp_data); -+ return 0; -+} -+ -+/* Called when reading from the character device */ -+static ssize_t ddcci_cdev_read(struct file *filp, char __user *buffer, -+ size_t length, loff_t *offset) -+{ -+ struct ddcci_fp_data *fp_data = filp->private_data; -+ struct ddcci_device *dev = fp_data->dev; -+ unsigned char *buf = 
fp_data->buffer; -+ const bool nonblocking = (filp->f_flags & O_NONBLOCK) != 0; -+ int ret = 0; -+ -+ if ((filp->f_mode & FMODE_READ) == 0) -+ return -EBADF; -+ -+ /* Lock mutex */ -+ if (nonblocking) { -+ if (down_trylock(&dev->bus_drv_data->sem)) -+ return -EAGAIN; -+ } else { -+ if (down_interruptible(&dev->bus_drv_data->sem)) -+ return -ERESTARTSYS; -+ } -+ -+ /* Execute read */ -+ ret = ddcci_read(dev->bus_drv_data->i2c_dev, dev->inner_addr, true, buf, -+ length); -+ -+ if (ret > 0) { -+ /* Copy data from user space */ -+ if (copy_to_user(buffer, buf, ret)) { -+ ret = -EFAULT; -+ goto out; -+ } -+ } -+ -+out: -+ up(&dev->bus_drv_data->sem); -+ return ret; -+} -+ -+/* Called when writing to the character device */ -+static ssize_t ddcci_cdev_write(struct file *filp, const char __user *buffer, -+ size_t count, loff_t *offset) -+{ -+ struct ddcci_fp_data *fp_data = filp->private_data; -+ struct ddcci_device *dev = fp_data->dev; -+ unsigned char *buf = fp_data->buffer; -+ const bool nonblocking = (filp->f_flags & O_NONBLOCK) != 0; -+ int ret = 0; -+ -+ if ((filp->f_mode & FMODE_WRITE) == 0) -+ return -EBADF; -+ -+ if (count > 127) -+ return -EINVAL; -+ -+ /* Lock mutex */ -+ if (nonblocking) { -+ if (down_trylock(&dev->bus_drv_data->sem)) -+ return -EAGAIN; -+ } else { -+ if (down_interruptible(&dev->bus_drv_data->sem)) -+ return -ERESTARTSYS; -+ } -+ -+ if (count > 0) { -+ /* Copy data from user space */ -+ if (copy_from_user(buf, buffer, count)) { -+ ret = -EFAULT; -+ goto err_out; -+ } -+ -+ /* Execute write */ -+ ret = ddcci_write(dev->bus_drv_data->i2c_dev, dev->inner_addr, -+ true, buf, count); -+ } -+ -+ if (ret >= 0) { -+ msleep(delay); -+ up(&dev->bus_drv_data->sem); -+ return count; -+ } -+ -+err_out: -+ up(&dev->bus_drv_data->sem); -+ return ret; -+} -+ -+/* Called when seeking the character device */ -+static loff_t ddcci_cdev_seek(struct file *filp, loff_t offset, int anchor) -+{ -+ return -EINVAL; -+} -+ -+static const struct file_operations ddcci_fops = { -+ .owner = THIS_MODULE, -+ .read = ddcci_cdev_read, -+ .write = ddcci_cdev_write, -+ .open = ddcci_cdev_open, -+ .release = ddcci_cdev_close, -+ .llseek = ddcci_cdev_seek -+}; -+ -+/* Set up the character device for a DDC/CI device */ -+static int ddcci_setup_char_device(struct ddcci_device *device) -+{ -+ int ret = -EINVAL; -+ -+ /* Check if free minor exists */ -+ if (ddcci_cdev_next == ddcci_cdev_end) { -+ dev_err(&device->dev, "no free major/minor\n"); -+ ret = -ENFILE; -+ goto out; -+ } -+ -+ /* Initialize rwsem */ -+ init_rwsem(&device->cdev_sem); -+ -+ /* Initialize character device node */ -+ cdev_init(&device->cdev, &ddcci_fops); -+ device->cdev.owner = THIS_MODULE; -+ -+ /* Publish char device */ -+ device->dev.devt = ddcci_cdev_next; -+ ret = cdev_add(&device->cdev, ddcci_cdev_next, 1); -+ if (ret) { -+ device->dev.devt = 0; -+ goto out; -+ } -+ -+ ddcci_cdev_next++; -+out: -+ return ret; -+} -+ -+/* sysfs attributes */ -+ -+static ssize_t ddcci_attr_capabilities_show(struct device *dev, -+ struct device_attribute *attr, -+ char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = device->capabilities_len; -+ if (unlikely(len > PAGE_SIZE)) -+ len = PAGE_SIZE; -+ if (len == 0) { -+ ret = len; -+ } else { -+ memcpy(buf, device->capabilities, len); -+ if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } -+ } -+ } -+ -+ return ret; -+} -+ -+static ssize_t ddcci_attr_prot_show(struct device *dev, 
-+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = strnlen(device->prot, sizeof(device->prot)); -+ strncpy(buf, device->prot, PAGE_SIZE); -+ if (len == 0) { -+ ret = len; -+ } else if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } else { -+ ret = PAGE_SIZE; -+ } -+ } -+ return ret; -+} -+ -+static ssize_t ddcci_attr_type_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = strnlen(device->type, sizeof(device->type)); -+ strncpy(buf, device->type, PAGE_SIZE); -+ if (len == 0) { -+ ret = len; -+ } else if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } else { -+ ret = PAGE_SIZE; -+ } -+ } -+ return ret; -+} -+ -+static ssize_t ddcci_attr_model_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = strnlen(device->model, sizeof(device->model)); -+ strncpy(buf, device->model, PAGE_SIZE); -+ if (len == 0) { -+ ret = len; -+ } else if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } else { -+ ret = PAGE_SIZE; -+ } -+ } -+ return ret; -+} -+ -+static ssize_t ddcci_attr_vendor_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = strnlen(device->vendor, sizeof(device->vendor)); -+ strncpy(buf, device->vendor, PAGE_SIZE); -+ if (len == 0) { -+ ret = len; -+ } else if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } else { -+ ret = PAGE_SIZE; -+ } -+ } -+ return ret; -+} -+ -+static ssize_t ddcci_attr_module_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = strnlen(device->module, sizeof(device->module)); -+ strncpy(buf, device->module, PAGE_SIZE); -+ if (len == 0) { -+ ret = len; -+ } else if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } else { -+ ret = PAGE_SIZE; -+ } -+ } -+ return ret; -+} -+ -+static ssize_t ddcci_attr_serial_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ -+ if (likely(device != NULL)) -+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", device->device_number); -+ -+ return ret; -+} -+ -+static ssize_t ddcci_attr_modalias_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ char model[ARRAY_SIZE(device->model)]; -+ char vendor[ARRAY_SIZE(device->model)]; -+ char module[ARRAY_SIZE(device->model)]; -+ -+ if (likely(device != NULL)) { -+ memcpy(model, device->model, sizeof(model)); -+ memcpy(vendor, device->vendor, sizeof(vendor)); -+ memcpy(module, device->module, sizeof(module)); -+ ddcci_modalias_clean(model, sizeof(model), '_'); -+ ddcci_modalias_clean(vendor, sizeof(vendor), '_'); -+ ddcci_modalias_clean(module, sizeof(module), '_'); -+ -+ ret = scnprintf(buf, PAGE_SIZE, 
"%s%s-%s-%s-%s-%s\n", -+ DDCCI_MODULE_PREFIX, -+ device->prot, -+ device->type, -+ model, -+ vendor, -+ module -+ ); -+ } -+ return ret; -+} -+ -+static DEVICE_ATTR(capabilities, S_IRUGO, ddcci_attr_capabilities_show, NULL); -+static DEVICE_ATTR(idProt, S_IRUGO, ddcci_attr_prot_show, NULL); -+static DEVICE_ATTR(idType, S_IRUGO, ddcci_attr_type_show, NULL); -+static DEVICE_ATTR(idModel, S_IRUGO, ddcci_attr_model_show, NULL); -+static DEVICE_ATTR(idVendor, S_IRUGO, ddcci_attr_vendor_show, NULL); -+static DEVICE_ATTR(idModule, S_IRUGO, ddcci_attr_module_show, NULL); -+static DEVICE_ATTR(idSerial, S_IRUGO, ddcci_attr_serial_show, NULL); -+static DEVICE_ATTR(modalias, S_IRUGO, ddcci_attr_modalias_show, NULL); -+ -+static struct attribute *ddcci_char_device_attrs[] = { -+ &dev_attr_capabilities.attr, -+ &dev_attr_idProt.attr, -+ &dev_attr_idType.attr, -+ &dev_attr_idModel.attr, -+ &dev_attr_idVendor.attr, -+ &dev_attr_idModule.attr, -+ &dev_attr_idSerial.attr, -+ &dev_attr_modalias.attr, -+ NULL, -+}; -+ATTRIBUTE_GROUPS(ddcci_char_device); -+ -+/* DDC/CI bus */ -+ -+static int ddcci_device_uevent(const struct device *dev, struct kobj_uevent_env *env) -+{ -+ struct ddcci_device *device = to_ddcci_device(dev); -+ char model[ARRAY_SIZE(device->model)]; -+ char vendor[ARRAY_SIZE(device->vendor)]; -+ char module[ARRAY_SIZE(device->module)]; -+ -+ memcpy(model, device->model, sizeof(model)); -+ memcpy(vendor, device->vendor, sizeof(vendor)); -+ memcpy(module, device->module, sizeof(module)); -+ ddcci_modalias_clean(model, sizeof(model), '_'); -+ ddcci_modalias_clean(vendor, sizeof(vendor), '_'); -+ ddcci_modalias_clean(module, sizeof(module), '_'); -+ -+ if (add_uevent_var(env, "MODALIAS=%s%s-%s-%s-%s-%s", -+ DDCCI_MODULE_PREFIX, -+ device->prot, -+ device->type, -+ model, -+ vendor, -+ module -+ )) -+ return -ENOMEM; -+ -+ if (device->prot[0]) -+ if (add_uevent_var(env, "DDCCI_PROT=%s", device->prot)) -+ return -ENOMEM; -+ -+ if (device->type[0]) -+ if (add_uevent_var(env, "DDCCI_TYPE=%s", device->type)) -+ return -ENOMEM; -+ -+ if (device->model[0]) -+ if (add_uevent_var(env, "DDCCI_MODEL=%s", device->model)) -+ return -ENOMEM; -+ -+ if (device->vendor[0]) { -+ if (add_uevent_var(env, "DDCCI_VENDOR=%s", device->vendor)) -+ return -ENOMEM; -+ -+ if (add_uevent_var(env, "DDCCI_MODULE=%s", device->module)) -+ return -ENOMEM; -+ -+ if (add_uevent_var(env, "DDCCI_UNIQ=%d", device->device_number)) -+ return -ENOMEM; -+ } -+ -+ return 0; -+} -+ -+static void ddcci_device_release(struct device *dev) -+{ -+ struct ddcci_device *device = to_ddcci_device(dev); -+ struct ddcci_driver *driver; -+ -+ /* Notify driver */ -+ if (dev->driver) { -+ driver = to_ddcci_driver(dev->driver); -+ if (driver->remove) -+ driver->remove(device); -+ } -+ -+ /* Teardown chardev */ -+ if (dev->devt) { -+ down(&core_lock); -+ if (device->cdev.dev == ddcci_cdev_next-1) -+ ddcci_cdev_next--; -+ cdev_del(&device->cdev); -+ up(&core_lock); -+ } -+ -+ /* Free capability string */ -+ if (device->capabilities) { -+ device->capabilities_len = 0; -+ kfree(device->capabilities); -+ } -+ /* Free device */ -+ kfree(device); -+} -+ -+static char *ddcci_devnode(const struct device *dev, -+ umode_t *mode, kuid_t *uid, kgid_t *gid) -+{ -+ struct ddcci_device *device; -+ -+ device = to_ddcci_device(dev); -+ return kasprintf(GFP_KERNEL, "bus/ddcci/%d/display", -+ device->i2c_client->adapter->nr); -+} -+ -+static char *ddcci_dependent_devnode(const struct device *dev, -+ umode_t *mode, kuid_t *uid, kgid_t *gid) -+{ -+ struct ddcci_device *device; -+ 
-+ device = to_ddcci_device(dev); -+ if (device->flags & DDCCI_FLAG_EXTERNAL) { -+ if (device->outer_addr == device->inner_addr) -+ return kasprintf(GFP_KERNEL, "bus/ddcci/%d/e%02x", -+ device->i2c_client->adapter->nr, -+ device->outer_addr); -+ else -+ return kasprintf(GFP_KERNEL, "bus/ddcci/%d/e%02x%02x", -+ device->i2c_client->adapter->nr, -+ device->outer_addr, device->inner_addr); -+ } else { -+ return kasprintf(GFP_KERNEL, "bus/ddcci/%d/i%02x", -+ device->i2c_client->adapter->nr, -+ device->inner_addr); -+ } -+} -+ -+/* Device type for main DDC/CI devices*/ -+static struct device_type ddcci_device_type = { -+ .name = "ddcci-device", -+ .uevent = ddcci_device_uevent, -+ .groups = ddcci_char_device_groups, -+ .release = ddcci_device_release, -+ .devnode = ddcci_devnode -+}; -+ -+/* Device type for dependent DDC/CI devices*/ -+static struct device_type ddcci_dependent_type = { -+ .name = "ddcci-dependent-device", -+ .uevent = ddcci_device_uevent, -+ .groups = ddcci_char_device_groups, -+ .release = ddcci_device_release, -+ .devnode = ddcci_dependent_devnode -+}; -+ -+/** -+ * ddcci_verify_device - return parameter as ddcci_device, or NULL -+ * @dev: device, probably from some driver model iterator -+ */ -+struct ddcci_device *ddcci_verify_device(struct device *dev) -+{ -+ if (unlikely(!dev)) -+ return NULL; -+ return (dev->type == &ddcci_device_type -+ || dev->type == &ddcci_dependent_type) -+ ? to_ddcci_device(dev) -+ : NULL; -+} -+EXPORT_SYMBOL(ddcci_verify_device); -+ -+/** -+ * ddcci_quirks - Get quirks for DDC/CI device -+ * @dev: Target DDC/CI device -+ */ -+unsigned long ddcci_quirks(struct ddcci_device *dev) -+{ -+ if (unlikely(WARN_ON(!dev))) -+ return ~0L; -+ if (unlikely(WARN_ON(!dev->bus_drv_data))) -+ return ~0L; -+ return dev->bus_drv_data->quirks; -+} -+EXPORT_SYMBOL(ddcci_quirks); -+ -+/** -+ * ddcci_register_driver - register DDC/CI driver -+ * @owner: the owning module -+ * @driver: the driver to register -+ */ -+int ddcci_register_driver(struct module *owner, struct ddcci_driver *driver) -+{ -+ int ret; -+ -+ /* Can't register until after driver model init */ -+ if (unlikely(WARN_ON(!ddcci_bus_registered))) -+ return -EAGAIN; -+ -+ pr_debug("registering driver [%s]\n", driver->driver.name); -+ -+ /* add the driver to the list of ddcci drivers in the driver core */ -+ driver->driver.owner = owner; -+ driver->driver.bus = &ddcci_bus_type; -+ -+ /* When registration returns, the driver core -+ * will have called probe() for all matching-but-unbound devices. 
-+ */ -+ ret = driver_register(&driver->driver); -+ if (ret) -+ return ret; -+ -+ pr_debug("driver [%s] registered\n", driver->driver.name); -+ -+ return 0; -+} -+EXPORT_SYMBOL(ddcci_register_driver); -+ -+/** -+ * ddcci_del_driver - unregister DDC/CI driver -+ * @driver: the driver being unregistered -+ */ -+void ddcci_del_driver(struct ddcci_driver *driver) -+{ -+ driver_unregister(&driver->driver); -+ pr_debug("driver [%s] unregistered\n", driver->driver.name); -+} -+EXPORT_SYMBOL(ddcci_del_driver); -+ -+/** -+ * ddcci_device_write - Write a message to a DDC/CI device -+ * @dev: Target DDC/CI device -+ * @p_flag: Protocol flag, true for standard control messages -+ * @data: Data that will be written to the device -+ * @length: How many bytes to write -+ * -+ * Writes the message to the device and sleeps (see module parameter 'delay') -+ */ -+int ddcci_device_write(struct ddcci_device *dev, bool p_flag, -+ unsigned char *data, unsigned char length) -+{ -+ int ret; -+ -+ if (down_interruptible(&dev->bus_drv_data->sem)) -+ return -EAGAIN; -+ -+ ret = ddcci_write(dev->bus_drv_data->i2c_dev, dev->inner_addr, p_flag, data, length); -+ msleep(delay); -+ up(&dev->bus_drv_data->sem); -+ return ret; -+} -+EXPORT_SYMBOL(ddcci_device_write); -+ -+/** -+ * ddcci_device_read - Read a response from a DDC/CI device -+ * @dev: Target DDC/CI device -+ * @p_flag: Protocol flag, must match the corresponding write -+ * @buffer: Where to store data read from the device -+ * @length: Buffer size -+ */ -+int ddcci_device_read(struct ddcci_device *dev, bool p_flag, -+ unsigned char *buffer, unsigned char length) -+{ -+ int ret; -+ -+ if (down_interruptible(&dev->bus_drv_data->sem)) -+ return -EAGAIN; -+ -+ ret = ddcci_read(dev->bus_drv_data->i2c_dev, dev->inner_addr, p_flag, buffer, length); -+ up(&dev->bus_drv_data->sem); -+ return ret; -+} -+EXPORT_SYMBOL(ddcci_device_read); -+ -+/** -+ * ddcci_device_writeread - Write a message to a device and read the response -+ * @dev: Target DDC/CI device -+ * @p_flag: Protocol flag, true for standard control messages -+ * @buffer: Buffer used for write and read -+ * @length: How many bytes to write -+ * @maxlength: Buffer size on read -+ * -+ * Writing, sleeping and reading are done without releasing the DDC/CI bus. -+ * This provides atomicity in respect to other DDC/CI accesses on the same bus. 
-+ */ -+int ddcci_device_writeread(struct ddcci_device *dev, bool p_flag, -+ unsigned char *buffer, unsigned char length, -+ unsigned char maxlength) -+{ -+ int ret; -+ -+ if (down_interruptible(&dev->bus_drv_data->sem)) -+ return -EAGAIN; -+ -+ ret = ddcci_write(dev->bus_drv_data->i2c_dev, dev->inner_addr, p_flag, buffer, length); -+ if (ret < 0) -+ goto err; -+ msleep(delay); -+ ret = ddcci_read(dev->bus_drv_data->i2c_dev, dev->inner_addr, p_flag, buffer, maxlength); -+err: -+ up(&dev->bus_drv_data->sem); -+ return ret; -+} -+EXPORT_SYMBOL(ddcci_device_writeread); -+ -+#define IS_ANY_ID(x) (((x)[0] == '\xFF') && ((x)[7] == '\xFF')) -+ -+/* Check if any device id in the array matches the device and return the matching id */ -+static const struct ddcci_device_id *ddcci_match_id(const struct ddcci_device_id *id, -+ const struct ddcci_device *device) -+{ -+ while (id->prot[0] || id->type[0] || id->model[0] || id->vendor[0] || id->module[0]) { -+ if ((IS_ANY_ID(id->prot) || (strcmp(device->prot, id->prot) == 0)) -+ && (IS_ANY_ID(id->type) || (strcmp(device->type, id->type) == 0)) -+ && (IS_ANY_ID(id->model) || (strcmp(device->model, id->model) == 0)) -+ && (IS_ANY_ID(id->vendor) || (strcmp(device->vendor, id->vendor) == 0)) -+ && (IS_ANY_ID(id->module) || (strcmp(device->module, id->module) == 0))) { -+ return id; -+ } -+ id++; -+ } -+ return NULL; -+} -+ -+static int ddcci_device_match(struct device *dev, struct device_driver *drv) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ struct ddcci_driver *driver; -+ -+ if (!device) -+ return 0; -+ -+ driver = to_ddcci_driver(drv); -+ /* match on an id table if there is one */ -+ if (driver->id_table) -+ return ddcci_match_id(driver->id_table, device) != NULL; -+ -+ return 0; -+} -+ -+static int ddcci_device_probe(struct device *dev) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ struct ddcci_driver *driver; -+ const struct ddcci_device_id *id; -+ int ret = 0; -+ -+ if (!device) -+ return -EINVAL; -+ driver = to_ddcci_driver(dev->driver); -+ -+ id = ddcci_match_id(driver->id_table, device); -+ if (!id) -+ return -ENODEV; -+ -+ if (driver->probe) -+ ret = driver->probe(device, id); -+ -+ return ret; -+} -+ -+static int ddcci_device_remove(struct device *dev) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ struct ddcci_driver *driver; -+ int ret = 0; -+ -+ if (!device) -+ return -EINVAL; -+ driver = to_ddcci_driver(dev->driver); -+ -+ if (driver->remove) -+ ret = driver->remove(device); -+ -+ return ret; -+} -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) -+static void ddcci_device_remove_void(struct device *dev) -+{ -+ ddcci_device_remove(dev); -+} -+#endif -+ -+/** -+ * DDCCI bus type structure -+ */ -+struct bus_type ddcci_bus_type = { -+ .name = "ddcci", -+ .match = ddcci_device_match, -+ .probe = ddcci_device_probe, -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) -+ .remove = ddcci_device_remove_void -+#else -+ .remove = ddcci_device_remove -+#endif -+}; -+ -+/* Main I2C driver */ -+ -+/* Get a pointer to the closing parenthesis */ -+static char *ddcci_capstr_tok(const char *s, int depth) -+{ -+ const char *ptr = s; -+ char *end; -+ -+ if (s == NULL || s[0] == '\0') -+ return NULL; -+ -+ while ((end = strpbrk(ptr, "()"))) { -+ if (!end || depth == INT_MAX) -+ return NULL; -+ if (*end == '(') -+ depth++; -+ else if (depth > 0) -+ depth--; -+ else -+ break; -+ ptr = end+1; -+ } -+ return end; -+} -+ -+/** -+ * ddcci_find_capstr_item - Search capability string for a tag -+ * 
@capabilities: Capability string to search -+ * @tag: Tag to find -+ * @length: Buffer for the length of the found tag value (optional) -+ * -+ * Return a pointer to the start of the tag value (directly after the '(') on -+ * success and write the length of the value (excluding the ')') into `length`. -+ * -+ * If the tag is not found or another error occurs, an ERR_PTR is returned -+ * and `length` stays untouched. -+ */ -+const char *ddcci_find_capstr_item(const char * capabilities, -+ const char * __restrict tag, -+ size_t *length) -+{ -+ const char *src = capabilities, *ptr; -+ ptrdiff_t len; -+ int taglen = strnlen(tag, 1024); -+ -+ /* Check length of requested tag */ -+ if (unlikely(taglen <= 0 || taglen > 1023)) -+ return ERR_PTR(-EINVAL); -+ -+ /* Find tag */ -+ while (src && (strncmp(src+1, tag, taglen) != 0 || src[1+taglen] != '(')) -+ src = ddcci_capstr_tok(src+1, -1); -+ if (!src || src[0] == '\0') -+ return ERR_PTR(-ENOENT); -+ -+ /* Locate end of value */ -+ src += taglen+2; -+ ptr = ddcci_capstr_tok(src, 0); -+ if (unlikely(!ptr)) -+ return ERR_PTR(-EOVERFLOW); -+ -+ /* Check length of tag data */ -+ len = ptr-src; -+ if (unlikely(len < 0 || len > 65535)) -+ return ERR_PTR(-EMSGSIZE); -+ -+ /* Return pointer and length */ -+ if (likely(length != NULL)) -+ *length = (size_t)len; -+ return src; -+} -+EXPORT_SYMBOL(ddcci_find_capstr_item); -+ -+/* Search the capability string for a tag and copy the value to dest */ -+static int ddcci_cpy_capstr_item(char *dest, const char *src, -+ const char * __restrict tag, size_t maxlen) -+{ -+ const char *ptr; -+ size_t len; -+ -+ /* Find tag */ -+ ptr = ddcci_find_capstr_item(src, tag, &len); -+ if (IS_ERR(ptr)) { -+ return PTR_ERR(ptr); -+ } -+ -+ /* Copy value */ -+ memcpy(dest, ptr, min(len, maxlen)); -+ return 0; -+} -+ -+/* Fill fields in device by parsing the capability string */ -+static int ddcci_parse_capstring(struct ddcci_device *device) -+{ -+ const char *capstr = device->capabilities; -+ int ret = 0; -+ -+ if (!capstr) -+ return -EINVAL; -+ -+ /* capability string start with a paren */ -+ if (capstr[0] != '(') -+ return -EINVAL; -+ -+ /* get prot(...) */ -+ ret = ddcci_cpy_capstr_item(device->prot, capstr, "prot", sizeof(device->prot)-1); -+ if (ret) { -+ if (ret == -ENOENT) { -+ dev_warn(&device->dev, "malformed capability string: no protocol tag"); -+ memset(device->prot, 0, sizeof(device->prot)-1); -+ } else { -+ return ret; -+ } -+ } -+ -+ /* get type(...) */ -+ ret = ddcci_cpy_capstr_item(device->type, capstr, "type", sizeof(device->type)-1); -+ if (ret) { -+ if (ret == -ENOENT) { -+ dev_warn(&device->dev, "malformed capability string: no type tag"); -+ memset(device->type, 0, sizeof(device->type)-1); -+ } else { -+ return ret; -+ } -+ } -+ -+ /* and then model(...) */ -+ ret = ddcci_cpy_capstr_item(device->model, capstr, "model", sizeof(device->model)-1); -+ if (ret) { -+ if (ret == -ENOENT) { -+ dev_warn(&device->dev, "malformed capability string: no model tag"); -+ memset(device->model, 0, sizeof(device->model)-1); -+ } else { -+ return ret; -+ } -+ } -+ -+ /* if there is no protocol tag */ -+ if (!device->prot[0]) { -+ /* and no type tag: give up. 
*/ -+ if (!device->type[0]) -+ return -ENOENT; -+ -+ /* Assume protocol "monitor" if type is "LCD" or "CRT" */ -+ if (strncasecmp(device->type, "LCD", sizeof(device->type)-1) == 0 -+ || strncasecmp(device->type, "CRT", sizeof(device->type)-1) == 0) { -+ memcpy(device->prot, "monitor", 7); -+ } -+ } -+ -+ /* skip the rest for now */ -+ -+ return 0; -+} -+ -+/* Probe for a device on an inner address and create a ddcci_device for it */ -+static int ddcci_detect_device(struct i2c_client *client, unsigned char addr, -+ int dependent) -+{ -+ int ret; -+ unsigned char outer_addr = client->addr << 1; -+ unsigned char *buffer = NULL; -+ struct ddcci_bus_drv_data *drv_data = i2c_get_clientdata(client); -+ struct ddcci_device *device = NULL; -+ -+ down(&drv_data->sem); -+ -+ /* Allocate buffer big enough for any capability string */ -+ buffer = kmalloc(16384, GFP_KERNEL); -+ if (!buffer) { -+ ret = -ENOMEM; -+ goto err_end; -+ } -+ -+ /* Allocate device struct */ -+ device = kzalloc(sizeof(struct ddcci_device), GFP_KERNEL); -+ if (!device) { -+ ret = -ENOMEM; -+ goto err_end; -+ } -+ -+ /* Initialize device */ -+ device_initialize(&device->dev); -+ device->dev.parent = &client->dev; -+ device->dev.bus = &ddcci_bus_type; -+ device->outer_addr = outer_addr; -+ device->inner_addr = addr; -+ device->bus_drv_data = drv_data; -+ device->i2c_client = client; -+ -+ if (!dependent) { -+ device->dev.type = &ddcci_device_type; -+ ret = dev_set_name(&device->dev, "ddcci%d", client->adapter->nr); -+ } else if (outer_addr == dependent) { -+ /* Internal dependent device */ -+ device->dev.type = &ddcci_dependent_type; -+ device->flags = DDCCI_FLAG_DEPENDENT; -+ ret = dev_set_name(&device->dev, "ddcci%di%02x", client->adapter->nr, addr); -+ } else if (outer_addr == addr) { -+ /* External dependent device */ -+ device->dev.type = &ddcci_dependent_type; -+ device->flags = DDCCI_FLAG_DEPENDENT | DDCCI_FLAG_EXTERNAL; -+ ret = dev_set_name(&device->dev, "ddcci%de%02x", client->adapter->nr, addr); -+ } else { -+ /* Dependent device of external dependent device -+ Just in case something like this exists */ -+ device->dev.type = &ddcci_dependent_type; -+ device->flags = DDCCI_FLAG_DEPENDENT | DDCCI_FLAG_EXTERNAL; -+ ret = dev_set_name(&device->dev, "ddcci%de%02x%02x", client->adapter->nr, outer_addr, addr); -+ } -+ -+ if (ret) -+ goto err_free; -+ -+ /* Read identification and check for quirks */ -+ ret = ddcci_identify_device(client, addr, buffer, 29); -+ if (ret < 0) { -+ if (!dependent && (ret == -EBADMSG || ret == -EMSGSIZE)) { -+ dev_warn(&device->dev, "DDC/CI main device sent broken response on identification. Trying to detect solely based on capability information.\n"); -+ } else { -+ goto err_free; -+ } -+ } -+ -+ if (ret == 29 && buffer[0] == DDCCI_REPLY_ID) { -+ memcpy(device->vendor, &buffer[7], 8); -+ memcpy(device->module, &buffer[17], 8); -+ device->device_number = be32_to_cpu(*(__force __be32 *)&buffer[18]); -+ } -+ -+ /* Read capabilities */ -+ ret = ddcci_get_caps(client, addr, buffer, 16384); -+ if (ret > 0) { -+ /* Fixup unparenthesized capability strings, but only if the first -+ character is an ascii lower case letter. -+ This should still allow an early exit for completely garbled -+ data but helps detecting devices where only the parentheses are -+ missing, as the second char must be the first character of a -+ keyword. 
*/ -+ if (ret > 2 && buffer[0] >= 'a' && buffer[0] <= 'z') { -+ dev_err(&device->dev, "DDC/CI device quirk detected: unparenthesized capability string\n"); -+ device->capabilities = kzalloc(ret+3, GFP_KERNEL); -+ if (!device->capabilities) { -+ ret = -ENOMEM; -+ goto err_free; -+ } -+ device->capabilities_len = ret+2; -+ memcpy(&(device->capabilities[1]), buffer, ret); -+ device->capabilities[0] = '('; -+ device->capabilities[ret+1] = ')'; -+ } else { -+ /* Standard case: simply copy the received string */ -+ device->capabilities = kzalloc(ret+1, GFP_KERNEL); -+ if (!device->capabilities) { -+ ret = -ENOMEM; -+ goto err_free; -+ } -+ device->capabilities_len = ret; -+ memcpy(device->capabilities, buffer, ret); -+ } -+ -+ ret = ddcci_parse_capstring(device); -+ if (ret) { -+ dev_err(&device->dev, "malformed capability string: \"%s\" errno %d\n", device->capabilities, ret); -+ ret = -EINVAL; -+ goto err_free; -+ } -+ } -+ -+ /* Found a device if either identification or capabilities succeeded */ -+ if (!device->capabilities && device->vendor[0] == '\0') { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] got neither valid identification nor capability data\n", -+ client->addr << 1, addr); -+ ret = -ENODEV; -+ goto err_free; -+ } -+ -+ /* Setup chardev */ -+ down(&core_lock); -+ ret = ddcci_setup_char_device(device); -+ up(&core_lock); -+ if (ret) -+ goto err_free; -+ -+ /* Release semaphore and add device to the tree */ -+ up(&drv_data->sem); -+ pr_debug("found device at %d:%02x:%02x\n", client->adapter->nr, outer_addr, addr); -+ ret = device_add(&device->dev); -+ if (ret) -+ goto err_free; -+ -+ goto end; -+err_free: -+ put_device(&device->dev); -+err_end: -+ up(&drv_data->sem); -+end: -+ kfree(buffer); -+ return ret; -+} -+ -+/* I2C detect function: check if a main or external dependent device exists */ -+static int ddcci_detect(struct i2c_client *client, struct i2c_board_info *info) -+{ -+ int ret; -+ unsigned char outer_addr; -+ unsigned char inner_addr; -+ unsigned char buf[32]; -+ unsigned char cmd_id[1] = { DDCCI_COMMAND_ID }; -+ unsigned char cmd_caps[3] = { DDCCI_COMMAND_CAPS, 0x00, 0x00}; -+ unsigned char *cmd; -+ unsigned int cmd_len; -+ -+ /* Check for i2c_master_* functionality */ -+ if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { -+ pr_debug("i2c adapter %d unsuitable: no i2c_master functionality\n", client->adapter->nr); -+ return -ENODEV; -+ } -+ -+ /* send Capabilities Request (for main) or Identification Request command (for dependent devices) */ -+ outer_addr = client->addr << 1; -+ inner_addr = (outer_addr == DDCCI_DEFAULT_DEVICE_ADDR) ? DDCCI_HOST_ADDR_ODD : outer_addr | 1; -+ cmd = (outer_addr == DDCCI_DEFAULT_DEVICE_ADDR) ? cmd_caps : cmd_id; -+ cmd_len = (outer_addr == DDCCI_DEFAULT_DEVICE_ADDR) ? 
sizeof(cmd_caps) : sizeof(cmd_id); -+ pr_debug("detecting %d:%02x\n", client->adapter->nr, outer_addr); -+ -+ ret = __ddcci_write_block(client, inner_addr, buf, true, cmd, cmd_len); -+ -+ if (ret == -ENXIO || ret == -EIO) { -+ if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_WRITE_BYTE)) { -+ pr_debug("i2c write failed with ENXIO or EIO but bytewise writing is not supported\n"); -+ return -ENODEV; -+ } -+ pr_debug("i2c write failed with ENXIO or EIO, trying bytewise writing\n"); -+ ret = __ddcci_write_bytewise(client, inner_addr, true, cmd, cmd_len); -+ if (ret == 0) { -+ msleep(delay); -+ ret = __ddcci_write_bytewise(client, inner_addr, true, cmd, cmd_len); -+ } -+ } -+ -+ if (ret < 0) -+ return -ENODEV; -+ -+ /* wait for device */ -+ msleep(delay); -+ /* receive answer */ -+ ret = i2c_master_recv(client, buf, 32); -+ if (ret < 3) { -+ pr_debug("detection failed: no answer\n"); -+ return -ENODEV; -+ } -+ -+ /* check response starts with outer addr */ -+ if (buf[0] != outer_addr) { -+ pr_debug("detection failed: invalid %s response (%02x != %02x)\n", (cmd == cmd_id) ? "identification" : "capabilities", buf[0], outer_addr); -+ pr_debug("received message was %*ph \n", ret, buf); -+ return -ENODEV; -+ } -+ -+ pr_debug("detected %d:%02x\n", client->adapter->nr, outer_addr); -+ -+ /* set device type */ -+ strlcpy(info->type, (outer_addr == DDCCI_DEFAULT_DEVICE_ADDR) ? "ddcci" : "ddcci-dependent", I2C_NAME_SIZE); -+ -+ return 0; -+} -+ -+/* I2C probe function */ -+static int ddcci_probe(struct i2c_client *client) -+{ -+ const struct i2c_device_id *id = i2c_client_get_device_id(client); -+ int i, ret = -ENODEV, tmp; -+ unsigned char main_addr, addr; -+ struct ddcci_bus_drv_data *drv_data; -+ -+ /* Initialize driver data structure */ -+ drv_data = devm_kzalloc(&client->dev, sizeof(struct ddcci_bus_drv_data), GFP_KERNEL); -+ if (!drv_data) -+ return -ENOMEM; -+ drv_data->i2c_dev = client; -+ sema_init(&drv_data->sem, 1); -+ -+ /* Set i2c client data */ -+ i2c_set_clientdata(client, drv_data); -+ -+ if (id->driver_data == 0) { -+ /* Core device, probe at 0x6E */ -+ main_addr = DDCCI_DEFAULT_DEVICE_ADDR; -+ dev_dbg(&client->dev, "probing core device [%02x]\n", -+ client->addr << 1); -+ ret = ddcci_detect_device(client, main_addr, 0); -+ if (ret) { -+ dev_info(&client->dev, "core device [%02x] probe failed: %d\n", -+ client->addr << 1, ret); -+ if (ret == -EIO) -+ ret = -ENODEV; -+ goto err_free; -+ } -+ -+ /* Detect internal dependent devices */ -+ dev_dbg(&client->dev, "probing internal dependent devices\n"); -+ for (i = 0; i < autoprobe_addr_count; ++i) { -+ addr = (unsigned short)autoprobe_addrs[i]; -+ if ((addr & 1) == 0 && addr != main_addr) { -+ tmp = ddcci_detect_device(client, addr, main_addr); -+ if (tmp < 0 && tmp != -ENODEV) { -+ dev_info(&client->dev, "internal dependent device [%02x:%02x] probe failed: %d\n", -+ client->addr << 1, addr, ret); -+ } -+ } -+ } -+ } else if (id->driver_data == 1) { -+ /* External dependent device */ -+ main_addr = client->addr << 1; -+ dev_dbg(&client->dev, "probing external dependent device [%02x]\n", main_addr); -+ ret = ddcci_detect_device(client, main_addr, -1); -+ if (ret) { -+ dev_info(&client->dev, "external dependent device [%02x] probe failed: %d\n", -+ main_addr, ret); -+ if (ret == -EIO) -+ ret = -ENODEV; -+ goto err_free; -+ } -+ } else { -+ dev_warn(&client->dev, -+ "probe() called with invalid i2c device id\n"); -+ ret = -EINVAL; -+ } -+ -+ goto end; -+err_free: -+ devm_kfree(&client->dev, drv_data); -+end: -+ return ret; -+} -+ 
-+/* -+ * Callback for bus_find_device() used in ddcci_remove() -+ * -+ * Find next device on i2c_client not flagged with -+ * DDCCI_FLAG_REMOVED and flag it. -+ */ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,3,0) -+static int ddcci_remove_helper(struct device *dev, const void *p) -+#else -+static int ddcci_remove_helper(struct device *dev, void *p) -+#endif -+{ -+ struct ddcci_device *device; -+ -+ device = ddcci_verify_device(dev); -+ if (!device || device->flags & DDCCI_FLAG_REMOVED) -+ return 0; -+ -+ if (!p || (dev->parent == p)) { -+ device->flags |= DDCCI_FLAG_REMOVED; -+ wmb(); -+ return 1; -+ } -+ -+ return 0; -+} -+ -+/* I2C driver remove callback: unregister all subdevices */ -+static int ddcci_remove(struct i2c_client *client) -+{ -+ struct ddcci_bus_drv_data *drv_data = i2c_get_clientdata(client); -+ struct device *dev; -+ -+ down(&drv_data->sem); -+ while (1) { -+ dev = bus_find_device(&ddcci_bus_type, NULL, client, -+ ddcci_remove_helper); -+ if (!dev) -+ break; -+ device_unregister(dev); -+ put_device(dev); -+ } -+ up(&drv_data->sem); -+ return 0; -+} -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) -+static void ddcci_remove_void(struct i2c_client *client) -+{ -+ ddcci_remove(client); -+} -+#endif -+ -+/* -+ * I2C driver device identification table. -+ */ -+static const struct i2c_device_id ddcci_idtable[] = { -+ { "ddcci", 0 }, -+ { "ddcci-dependent", 1 }, -+ {} -+}; -+MODULE_DEVICE_TABLE(i2c, ddcci_idtable); -+ -+/* -+ * I2C driver description structure -+ */ -+static struct i2c_driver ddcci_driver = { -+ .driver = { -+ .name = "ddcci", -+ .owner = THIS_MODULE, -+ }, -+ -+ .id_table = ddcci_idtable, -+ .probe = ddcci_probe, -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) -+ .remove = ddcci_remove_void, -+#else -+ .remove = ddcci_remove, -+#endif -+ .class = I2C_CLASS_DDC, -+ .detect = ddcci_detect, -+ .address_list = I2C_ADDRS( -+ DDCCI_DEFAULT_DEVICE_ADDR>>1 -+ ), -+}; -+ -+/* -+ * Module initialization function. Called when the module is inserted or -+ * (if builtin) at boot time. -+ */ -+static int __init ddcci_module_init(void) -+{ -+ int ret; -+ -+ pr_debug("initializing ddcci driver\n"); -+ /* Allocate a device number region for the character devices */ -+ ret = alloc_chrdev_region(&ddcci_cdev_first, 0, 128, DEVICE_NAME); -+ if (ret < 0) { -+ pr_err("failed to register device region: error %d\n", ret); -+ goto err_chrdevreg; -+ } -+ ddcci_cdev_next = ddcci_cdev_first; -+ ddcci_cdev_end = MKDEV(MAJOR(ddcci_cdev_first), MINOR(ddcci_cdev_first)+128); -+ -+ /* Register bus */ -+ ret = bus_register(&ddcci_bus_type); -+ if (ret) { -+ pr_err("failed to register bus 'ddcci'\n"); -+ goto err_busreg; -+ } -+ ddcci_bus_registered = true; -+ -+ /* Register I2C driver */ -+ ret = i2c_add_driver(&ddcci_driver); -+ if (ret) { -+ pr_err("failed to register i2c driver\n"); -+ goto err_drvreg; -+ } -+ -+ pr_debug("ddcci driver initialized\n"); -+ -+ return 0; -+ -+err_drvreg: -+ bus_unregister(&ddcci_bus_type); -+err_busreg: -+ unregister_chrdev_region(ddcci_cdev_first, 128); -+err_chrdevreg: -+ return ret; -+} -+ -+/* -+ * Module clean-up function. Called when the module is removed. 
-+ */ -+static void __exit ddcci_module_exit(void) -+{ -+ struct device *dev; -+ -+ while (1) { -+ dev = bus_find_device(&ddcci_bus_type, NULL, NULL, ddcci_remove_helper); -+ if (!dev) -+ break; -+ device_unregister(dev); -+ put_device(dev); -+ } -+ -+ i2c_del_driver(&ddcci_driver); -+ bus_unregister(&ddcci_bus_type); -+ unregister_chrdev_region(ddcci_cdev_first, 128); -+} -+ -+/* Let the kernel know the calls for module init and exit */ -+module_init(ddcci_module_init); -+module_exit(ddcci_module_exit); -+ -+/* Module parameter description */ -+module_param(delay, uint, S_IRUGO|S_IWUSR); -+MODULE_PARM_DESC(delay, "default delay after bus writes (in ms, default 60)"); -+module_param_array(autoprobe_addrs, ushort, &autoprobe_addr_count, S_IRUGO|S_IWUSR); -+MODULE_PARM_DESC(autoprobe_addrs, "internal dependent device addresses to autoprobe"); -+ -+/* Module description */ -+MODULE_AUTHOR("Christoph Grenz"); -+MODULE_DESCRIPTION("DDC/CI bus driver"); -+MODULE_VERSION("0.4.2"); -+MODULE_LICENSE("GPL"); -diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig -index 51387b1ef012..4b8bfd7c02c6 100644 ---- a/drivers/video/backlight/Kconfig -+++ b/drivers/video/backlight/Kconfig -@@ -297,6 +297,17 @@ config BACKLIGHT_QCOM_WLED - If you have the Qualcomm PMIC, say Y to enable a driver for the - WLED block. Currently it supports PM8941 and PMI8998. - -+config BACKLIGHT_DDCCI -+ tristate "DDCCI Backlight Driver" -+ depends on DDCCI -+ help -+ If you have a DDC/CI supporing monitor, say Y to enable a driver -+ to control its backlight using DDC/CI. This could be useful if -+ your monitor does not include a backlight driver. For this to be -+ useful you need to enable DDCCI support which can be found in -+ Device Drivers -> Character devices and that further depends on -+ I2C. -+ - config BACKLIGHT_RT4831 - tristate "Richtek RT4831 Backlight Driver" - depends on MFD_RT4831 -diff --git a/drivers/video/backlight/Makefile b/drivers/video/backlight/Makefile -index f72e1c3c59e9..656dea21c0ee 100644 ---- a/drivers/video/backlight/Makefile -+++ b/drivers/video/backlight/Makefile -@@ -58,3 +58,4 @@ obj-$(CONFIG_BACKLIGHT_WM831X) += wm831x_bl.o - obj-$(CONFIG_BACKLIGHT_ARCXCNN) += arcxcnn_bl.o - obj-$(CONFIG_BACKLIGHT_RAVE_SP) += rave-sp-backlight.o - obj-$(CONFIG_BACKLIGHT_LED) += led_bl.o -+obj-$(CONFIG_BACKLIGHT_DDCCI) += ddcci-backlight.o -diff --git a/drivers/video/backlight/ddcci-backlight.c b/drivers/video/backlight/ddcci-backlight.c -new file mode 100644 -index 000000000000..7a9852207f0b ---- /dev/null -+++ b/drivers/video/backlight/ddcci-backlight.c -@@ -0,0 +1,413 @@ -+/* -+ * DDC/CI monitor backlight driver -+ * -+ * Copyright (c) 2015 Christoph Grenz -+ */ -+ -+/* -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the Free -+ * Software Foundation; either version 2 of the License, or (at your option) -+ * any later version. 
-+ */ -+ -+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -+#include -+#include -+#include -+#include -+ -+#include -+ -+ -+#define DDCCI_COMMAND_READ 0x01 /* read ctrl value */ -+#define DDCCI_REPLY_READ 0x02 /* read ctrl value reply */ -+#define DDCCI_COMMAND_WRITE 0x03 /* write ctrl value */ -+#define DDCCI_COMMAND_SAVE 0x0c /* save current settings */ -+ -+#define DDCCI_MONITOR_LUMINANCE 0x10 -+#define DDCCI_MONITOR_BACKLIGHT 0x13 -+#define DDCCI_MONITOR_BL_WHITE 0x6B -+ -+static bool convenience_symlink = true; -+ -+struct ddcci_monitor_drv_data { -+ struct ddcci_device *device; -+ struct backlight_device *bl_dev; -+ struct device *fb_dev; -+ unsigned char used_vcp; -+}; -+ -+static int ddcci_monitor_writectrl(struct ddcci_device *device, -+ unsigned char ctrl, unsigned short value) -+{ -+ unsigned char buf[4]; -+ int ret; -+ -+ buf[0] = DDCCI_COMMAND_WRITE; -+ buf[1] = ctrl; -+ buf[2] = (value >> 8); -+ buf[3] = (value & 255); -+ -+ ret = ddcci_device_write(device, true, buf, sizeof(buf)); -+ -+ return ret; -+} -+ -+static int ddcci_monitor_readctrl(struct ddcci_device *device, -+ unsigned char ctrl, unsigned short *value, -+ unsigned short *maximum) -+{ -+ int ret; -+ unsigned char buf[10]; -+ -+ buf[0] = DDCCI_COMMAND_READ; -+ buf[1] = ctrl; -+ -+ ret = ddcci_device_writeread(device, true, buf, 2, sizeof(buf)); -+ if (ret < 0) -+ return ret; -+ -+ if (ret == 0) -+ return -ENOTSUPP; -+ -+ if (ret == 8 && buf[0] == DDCCI_REPLY_READ && buf[2] == ctrl) { -+ if (value) -+ *value = buf[6] * 256 + buf[7]; -+ -+ if (maximum) -+ *maximum = buf[4] * 256 + buf[5]; -+ -+ if (buf[1] == 1) -+ return -ENOTSUPP; -+ if (buf[1] != 0) -+ return -EIO; -+ return 0; -+ } -+ -+ return -EIO; -+} -+ -+static int ddcci_backlight_check_fb(struct backlight_device *bl, -+ struct fb_info *info) -+{ -+ struct ddcci_monitor_drv_data *drv_data = bl_get_data(bl); -+ -+ return drv_data->fb_dev == NULL || drv_data->fb_dev == info->dev; -+} -+ -+static int ddcci_backlight_update_status(struct backlight_device *bl) -+{ -+ struct ddcci_monitor_drv_data *drv_data = bl_get_data(bl); -+ int brightness = bl->props.brightness; -+ int ret; -+ -+ if (bl->props.power != FB_BLANK_UNBLANK || -+ bl->props.state & BL_CORE_FBBLANK) -+ brightness = 0; -+ -+ ret = ddcci_monitor_writectrl(drv_data->device, drv_data->used_vcp, -+ brightness); -+ if (ret > 0) -+ ret = 0; -+ return ret; -+} -+ -+static int ddcci_backlight_get_brightness(struct backlight_device *bl) -+{ -+ unsigned short value = 0, maxval = 0; -+ int ret; -+ struct ddcci_monitor_drv_data *drv_data = bl_get_data(bl); -+ -+ ret = ddcci_monitor_readctrl(drv_data->device, drv_data->used_vcp, -+ &value, &maxval); -+ if (ret < 0) -+ return ret; -+ -+ bl->props.brightness = value; -+ bl->props.max_brightness = maxval; -+ ret = value; -+ -+ return ret; -+} -+ -+static const struct backlight_ops ddcci_backlight_ops = { -+ .options = 0, -+ .update_status = ddcci_backlight_update_status, -+ .get_brightness = ddcci_backlight_get_brightness, -+ .check_fb = ddcci_backlight_check_fb, -+}; -+ -+static const char *ddcci_monitor_vcp_name(unsigned char vcp) -+{ -+ switch (vcp) { -+ case DDCCI_MONITOR_BL_WHITE: -+ return "backlight"; -+ case DDCCI_MONITOR_LUMINANCE: -+ return "luminance"; -+ default: -+ return "???"; -+ } -+} -+ -+static const char *ddcci_monitor_next_vcp_item(const char *ptr) -+{ -+ int depth = 0; -+ -+ /* Sanity check */ -+ if (unlikely(ptr == NULL || ptr[0] == '\0')) -+ return NULL; -+ -+ /* Find next white space outside of parentheses */ -+ while ((ptr = strpbrk(ptr, " 
()"))) { -+ if (!ptr || depth == INT_MAX) { -+ return NULL; -+ } else if (*ptr == '(') { -+ depth++; -+ } else if (depth > 0) { -+ if (*ptr == ')') -+ depth--; -+ } else { -+ break; -+ } -+ ++ptr; -+ } -+ -+ /* Skip over whitespace */ -+ ptr = skip_spaces(ptr); -+ -+ /* Check if we're now at the end of the list */ -+ if (unlikely(*ptr == '\0' || *ptr == ')')) -+ return NULL; -+ -+ return ptr; -+} -+ -+static bool ddcci_monitor_find_vcp(unsigned char vcp, const char *s) -+{ -+ const char *ptr = s; -+ char vcp_hex[3]; -+ -+ /* Sanity check */ -+ if (unlikely(s == NULL || s[0] == '\0')) -+ return false; -+ -+ /* Create hex representation of VCP */ -+ if (unlikely(snprintf(vcp_hex, 3, "%02hhX", vcp) != 2)) { -+ pr_err("snprintf failed to convert to hex. This should not happen.\n"); -+ return false; -+ } -+ -+ /* Search for it */ -+ do { -+ if (strncasecmp(vcp_hex, ptr, 2) == 0) { -+ if (ptr[2] == ' ' || ptr[2] == '(' || ptr[2] == ')') { -+ return true; -+ } -+ } -+ } while ((ptr = ddcci_monitor_next_vcp_item(ptr))); -+ -+ return false; -+} -+ -+static int ddcci_backlight_create_symlink(struct ddcci_device *ddcci_dev) -+{ -+ int i, result; -+ struct device *dev = &ddcci_dev->dev; -+ struct kernfs_node *dirent; -+ for (i = 0; i < 3; ++i) { -+ dev = dev->parent; -+ if (!dev) { -+ dev_dbg(&ddcci_dev->dev, "failed to create convenience symlink: ancestor device not found\n"); -+ return -ENOENT; -+ } -+ } -+ dirent = sysfs_get_dirent(dev->kobj.sd, "ddcci_backlight"); -+ if (dirent) { -+ sysfs_put(dirent); -+ dev_dbg(&ddcci_dev->dev, "failed to create convenience symlink: %s/ddcci_backlight already exists\n", dev_name(dev)); -+ return -EEXIST; -+ } -+ -+ result = sysfs_create_link(&dev->kobj, &ddcci_dev->dev.kobj, "ddcci_backlight"); -+ if (result == 0) { -+ dev_dbg(&ddcci_dev->dev, "created symlink %s/ddcci_backlight\n", dev_name(dev)); -+ } else { -+ dev_info(&ddcci_dev->dev, "failed to create convenience symlink: %d\n", result); -+ } -+ return result; -+} -+ -+static int ddcci_backlight_remove_symlink(struct ddcci_device *ddcci_dev) -+{ -+ int i; -+ struct device *dev = &ddcci_dev->dev; -+ struct kernfs_node *dirent; -+ for (i = 0; i < 3; ++i) { -+ dev = dev->parent; -+ if (!dev) -+ return -ENOENT; -+ } -+ dirent = sysfs_get_dirent(dev->kobj.sd, "ddcci_backlight"); -+ if (!dirent) { -+ return -ENOENT; -+ } -+ -+ if ((dirent->flags & KERNFS_LINK) == 0) { -+ sysfs_put(dirent); -+ dev_dbg(&ddcci_dev->dev, "won't remove %s/ddcci_backlight: not a symlink\n", dev_name(dev)); -+ return -EINVAL; -+ } -+ -+ if (dirent->symlink.target_kn != ddcci_dev->dev.kobj.sd) { -+ sysfs_put(dirent); -+ dev_dbg(&ddcci_dev->dev, "won't remove %s/ddcci_backlight: we are not the link target\n", dev_name(dev)); -+ return -EINVAL; -+ } -+ -+ sysfs_put(dirent); -+ -+ sysfs_remove_link(&dev->kobj, "ddcci_backlight"); -+ dev_dbg(&ddcci_dev->dev, "removed symlink %s/ddcci_backlight\n", dev_name(dev)); -+ return 0; -+} -+ -+static int ddcci_monitor_probe(struct ddcci_device *dev, -+ const struct ddcci_device_id *id) -+{ -+ struct ddcci_monitor_drv_data *drv_data; -+ struct backlight_properties props; -+ struct backlight_device *bl = NULL; -+ int ret = 0; -+ bool support_luminance, support_bl_white; -+ unsigned short brightness = 0, max_brightness = 0; -+ const char *vcps; -+ -+ dev_dbg(&dev->dev, "probing monitor backlight device\n"); -+ -+ /* Get VCP list */ -+ vcps = ddcci_find_capstr_item(dev->capabilities, "vcp", NULL); -+ if (IS_ERR(vcps)) { -+ dev_info(&dev->dev, -+ "monitor doesn't provide a list of supported controls.\n"); 
-+ support_bl_white = support_luminance = true; -+ } else { -+ /* Check VCP list for supported VCPs */ -+ support_bl_white = ddcci_monitor_find_vcp(DDCCI_MONITOR_BL_WHITE, vcps); -+ support_luminance = ddcci_monitor_find_vcp(DDCCI_MONITOR_LUMINANCE, vcps); -+ /* Fallback to trying if no support is found */ -+ if (!support_bl_white && !support_luminance) { -+ dev_info(&dev->dev, -+ "monitor doesn't announce support for backlight or luminance controls.\n"); -+ support_bl_white = support_luminance = true; -+ } -+ } -+ -+ /* Initialize driver data structure */ -+ drv_data = devm_kzalloc(&dev->dev, sizeof(struct ddcci_monitor_drv_data), -+ GFP_KERNEL); -+ if (!drv_data) -+ return -ENOMEM; -+ drv_data->device = dev; -+ -+ if (support_bl_white) { -+ /* Try getting backlight level */ -+ dev_dbg(&dev->dev, -+ "trying to access \"backlight level white\" control\n"); -+ ret = ddcci_monitor_readctrl(drv_data->device, DDCCI_MONITOR_BL_WHITE, -+ &brightness, &max_brightness); -+ if (ret < 0) { -+ if (ret == -ENOTSUPP) -+ dev_info(&dev->dev, -+ "monitor does not support reading backlight level\n"); -+ else -+ goto err_free; -+ } else { -+ drv_data->used_vcp = DDCCI_MONITOR_BL_WHITE; -+ } -+ } -+ -+ if (support_luminance && !drv_data->used_vcp) { -+ /* Try getting luminance */ -+ dev_dbg(&dev->dev, -+ "trying to access \"luminance\" control\n"); -+ ret = ddcci_monitor_readctrl(drv_data->device, DDCCI_MONITOR_LUMINANCE, -+ &brightness, &max_brightness); -+ if (ret < 0) { -+ if (ret == -ENOTSUPP) -+ dev_info(&dev->dev, -+ "monitor does not support reading luminance\n"); -+ else -+ goto err_free; -+ } else { -+ drv_data->used_vcp = DDCCI_MONITOR_LUMINANCE; -+ } -+ drv_data->used_vcp = DDCCI_MONITOR_LUMINANCE; -+ } -+ -+ if (!drv_data->used_vcp) -+ goto err_free; -+ -+ /* Create brightness device */ -+ memset(&props, 0, sizeof(props)); -+ props.type = BACKLIGHT_RAW; -+ props.max_brightness = max_brightness; -+ props.brightness = brightness; -+ bl = devm_backlight_device_register(&dev->dev, dev_name(&dev->dev), -+ &dev->dev, drv_data, -+ &ddcci_backlight_ops, &props); -+ drv_data->bl_dev = bl; -+ if (IS_ERR(bl)) { -+ dev_err(&dev->dev, "failed to register backlight\n"); -+ return PTR_ERR(bl); -+ } -+ dev_info(&dev->dev, "registered %s as backlight device %s\n", -+ ddcci_monitor_vcp_name(drv_data->used_vcp), -+ dev_name(&dev->dev)); -+ -+ if (convenience_symlink) { -+ ddcci_backlight_create_symlink(dev); -+ } -+ -+ goto end; -+err_free: -+ devm_kfree(&dev->dev, drv_data); -+end: -+ return ret; -+} -+ -+static int ddcci_monitor_remove(struct ddcci_device *dev) -+{ -+ dev_dbg(&dev->dev, "removing device\n"); -+ ddcci_backlight_remove_symlink(dev); -+ return 0; -+} -+ -+static struct ddcci_device_id ddcci_monitor_idtable[] = { -+ { "monitor", DDCCI_ANY_ID, DDCCI_ANY_ID, DDCCI_ANY_ID, DDCCI_ANY_ID, 0 }, -+ {} -+}; -+ -+static struct ddcci_driver ddcci_backlight_driver = { -+ .driver = { -+ .name = "ddcci-backlight", -+ .owner = THIS_MODULE, -+ }, -+ -+ .id_table = ddcci_monitor_idtable, -+ .probe = ddcci_monitor_probe, -+ .remove = ddcci_monitor_remove, -+}; -+ -+module_ddcci_driver(ddcci_backlight_driver); -+ -+/* Module parameter description */ -+module_param(convenience_symlink, bool, S_IRUGO|S_IWUSR); -+MODULE_PARM_DESC(convenience_symlink, "add convenience symlink \"ddcci_backlight\" to ancestor device in sysfs (default true)"); -+ -+MODULE_AUTHOR("Christoph Grenz"); -+MODULE_DESCRIPTION("DDC/CI generic monitor backlight driver"); -+MODULE_VERSION("0.4.2"); -+MODULE_LICENSE("GPL"); -+ 
-+MODULE_ALIAS("ddcci:monitor-*-*-*-*"); -diff --git a/include/linux/ddcci.h b/include/linux/ddcci.h -new file mode 100644 -index 000000000000..a219f031e584 ---- /dev/null -+++ b/include/linux/ddcci.h -@@ -0,0 +1,164 @@ -+/* -+ * DDC/CI bus driver -+ * -+ * Copyright (c) 2015 Christoph Grenz -+ */ -+ -+/* -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the Free -+ * Software Foundation; either version 2 of the License, or (at your option) -+ * any later version. -+ */ -+ -+#ifndef _DDCCI_H -+#define _DDCCI_H -+ -+#include -+#include -+#include -+ -+#define DDCCI_MODULE_PREFIX "ddcci:" -+ -+/* Special addresses */ -+ -+/* default device address (even) */ -+#define DDCCI_DEFAULT_DEVICE_ADDR 0x6E -+/* receiving host address for communication with default device address */ -+#define DDCCI_HOST_ADDR_EVEN 0x50 -+/* sending host address for communication with default device address */ -+#define DDCCI_HOST_ADDR_ODD 0x51 -+ -+/* Command codes */ -+ -+/* Identification Request */ -+#define DDCCI_COMMAND_ID 0xf1 -+/* Identification Reply */ -+#define DDCCI_REPLY_ID 0xe1 -+/* Capabilities Request */ -+#define DDCCI_COMMAND_CAPS 0xf3 -+/* Capabilities Reply */ -+#define DDCCI_REPLY_CAPS 0xe3 -+ -+/* Quirks */ -+ -+/* Device always responds with unset protocol flag */ -+#define DDCCI_QUIRK_NO_PFLAG BIT(1) -+/* Device needs writing one byte at a time */ -+#define DDCCI_QUIRK_WRITE_BYTEWISE BIT(2) -+/* Device repeats first byte on read */ -+#define DDCCI_QUIRK_SKIP_FIRST_BYTE BIT(3) -+ -+/* Flags */ -+ -+#define DDCCI_FLAG_REMOVED BIT(1) -+#define DDCCI_FLAG_DEPENDENT BIT(2) -+#define DDCCI_FLAG_EXTERNAL BIT(3) -+ -+extern struct bus_type ddcci_bus_type; -+ -+struct ddcci_bus_drv_data; -+ -+/* struct ddcci_device_id - identifies DDC/CI devices for probing */ -+struct ddcci_device_id { -+ char prot[9]; -+ char type[9]; -+ char model[9]; -+ char vendor[9]; -+ char module[9]; -+ kernel_ulong_t driver_data; /* Data private to the driver */ -+}; -+#define DDCCI_ANY_ID "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF" -+ -+/** -+ * struct ddcci_device - represent an DDC/CI device -+ * @outer_addr: Outer device address (I2C address << 1). -+ * @inner_addr: Inner device address. -+ * @flags: Device flags. -+ * @capabilities: Device capability string. -+ * @capabilities_len: Length of capability string. -+ * @i2c_client: Parent I2C device. -+ * @bus_drv_data: Driver internal data structure. -+ * @dev: Driver model device node for the slave. -+ * @cdev: Character device structure -+ * @cdev_sem: RW semaphore for exclusive access on character device. 
-+ * @prot: Device class ("protocol", from capability string) -+ * @type: Device subclass ("type", from capability string) -+ * @model: Device model (from capability string) -+ * @vendor: Device vendor (from identification command response) -+ * @module: Device module (from identification command response) -+ * @device_number: Device serial (from identification command response) -+ */ -+struct ddcci_device { -+ unsigned short outer_addr; -+ unsigned short inner_addr; -+ int flags; -+ char *capabilities; -+ size_t capabilities_len; -+ struct i2c_client *i2c_client; -+ struct ddcci_bus_drv_data *bus_drv_data; -+ struct device dev; -+ struct cdev cdev; -+ struct rw_semaphore cdev_sem; -+ char prot[9]; -+ char type[9]; -+ char model[9]; -+ char vendor[9]; -+ char module[9]; -+ int device_number; -+}; -+#define to_ddcci_device(d) container_of(d, struct ddcci_device, dev) -+ -+/** -+ * struct ddcci_driver - represent an DDC/CI device driver -+ * @probe: Callback for device binding -+ * @remove: Callback for device unbinding -+ * @driver: Device driver model driver -+ * @id_table: List of DDC/CI devices supported by this driver -+ * -+ * The driver.owner field should be set to the module owner of this driver. -+ * The driver.name field should be set to the name of this driver. -+ */ -+struct ddcci_driver { -+ int (*probe)(struct ddcci_device *, const struct ddcci_device_id *); -+ int (*remove)(struct ddcci_device *); -+ struct device_driver driver; -+ struct ddcci_device_id *id_table; -+}; -+#define to_ddcci_driver(d) container_of(d, struct ddcci_driver, driver) -+ -+int ddcci_register_driver(struct module *owner, struct ddcci_driver *driver); -+#define ddcci_add_driver(driver) \ -+ ddcci_register_driver(THIS_MODULE, driver) -+void ddcci_del_driver(struct ddcci_driver *driver); -+ -+struct ddcci_device *ddcci_verify_device(struct device *dev); -+ -+#define module_ddcci_driver(__ddcci_driver) \ -+ module_driver(__ddcci_driver, ddcci_add_driver, \ -+ ddcci_del_driver) -+ -+int ddcci_device_write(struct ddcci_device *, bool p_flag, unsigned char *data, -+ unsigned char length); -+int ddcci_device_read(struct ddcci_device *, bool p_flag, unsigned char *buffer, -+ unsigned char length); -+int ddcci_device_writeread(struct ddcci_device *, bool p_flag, -+ unsigned char *buffer, unsigned char length, -+ unsigned char maxlength); -+ -+static inline void *ddcci_get_drvdata(const struct ddcci_device *dev) -+{ -+ return dev_get_drvdata(&dev->dev); -+} -+ -+static inline void ddcci_set_drvdata(struct ddcci_device *dev, void *data) -+{ -+ dev_set_drvdata(&dev->dev, data); -+} -+ -+unsigned long ddcci_quirks(struct ddcci_device *dev); -+ -+const char *ddcci_find_capstr_item(const char *capabilities, const char *tag, -+ size_t *length); -+ -+#endif --- -2.41.0 - -From 641b0ffc98fe2842735ec30d31c68b555d559a47 Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Mon, 10 Jul 2023 18:29:38 +0200 -Subject: [PATCH 4/7] fixes +Date: Mon, 31 Jul 2023 12:20:07 +0200 +Subject: [PATCH 3/5] fixes Signed-off-by: Peter Jung --- @@ -12110,7 +9588,7 @@ Signed-off-by: Peter Jung 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c -index 5ec4ad0a5c86..15c5649bde4d 100644 +index 764d176e9735..deb10b89fa51 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -945,7 +945,7 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev) @@ -12196,10 +9674,10 @@ index ce5faa620517..1f0f2b8df300 100644 -- 2.41.0 -From 96777542ac5d53c962cdfb032cf34cfe4ee57dc8 
Mon Sep 17 00:00:00 2001 +From d5b404e3a7461e47b37cbfc3fbe009ba156e2c67 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 10 Jul 2023 17:10:25 +0200 -Subject: [PATCH 5/7] ksm +Subject: [PATCH 4/5] ksm Signed-off-by: Peter Jung --- @@ -12546,10 +10024,10 @@ index 899a314bc487..c2dd786a30e1 100644 static inline void collect_procs_ksm(struct page *page, struct list_head *to_kill, int force_early) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index de10fc797c8e..1cc93fc7d9b5 100644 +index 5e74ce4a28cd..51d04c1847c1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h -@@ -784,7 +784,7 @@ struct mm_struct { +@@ -812,7 +812,7 @@ struct mm_struct { #ifdef CONFIG_KSM /* * Represent how many pages of this process are involved in KSM @@ -12558,7 +10036,7 @@ index de10fc797c8e..1cc93fc7d9b5 100644 */ unsigned long ksm_merging_pages; /* -@@ -792,7 +792,12 @@ struct mm_struct { +@@ -820,7 +820,12 @@ struct mm_struct { * including merged and not merged. */ unsigned long ksm_rmap_items; @@ -12610,7 +10088,7 @@ index fd6c1cb585db..11d0fc82c437 100644 /* * 32 bit systems traditionally used different diff --git a/kernel/sys.c b/kernel/sys.c -index 05f838929e72..9df683365a37 100644 +index 2410e3999ebe..b0841a2dd2b7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2727,6 +2727,153 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, @@ -12881,7 +10359,7 @@ index ba266359da55..97a9627116fa 100644 #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, diff --git a/mm/memory.c b/mm/memory.c -index 01f39e8144ef..0dc2f193c4d6 100644 +index 603b2f419948..d8c7824558b4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1433,8 +1433,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, @@ -13046,758 +10524,10 @@ index 26853badae70..0de9d33cd565 100644 -- 2.41.0 -From 31b1d9be3d434ee82ffccc53fabc5a4326db96c7 Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Mon, 10 Jul 2023 17:10:36 +0200 -Subject: [PATCH 6/7] sched - -Signed-off-by: Peter Jung ---- - arch/x86/kernel/smpboot.c | 11 +-- - include/linux/cgroup-defs.h | 2 + - include/linux/sched.h | 2 + - include/linux/sched/task.h | 38 +++++++- - kernel/cgroup/cgroup.c | 34 +++++++ - kernel/fork.c | 8 ++ - kernel/sched/core.c | 57 ++++++++++++ - kernel/sched/debug.c | 1 + - kernel/sched/fair.c | 177 +++++++++++++++++++++++++++++++++--- - kernel/sched/psi.c | 2 +- - kernel/sched/sched.h | 3 + - kernel/sched/topology.c | 14 ++- - kernel/softirq.c | 2 +- - 13 files changed, 325 insertions(+), 26 deletions(-) - -diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c -index e1aa2cd7734b..4c314475cc13 100644 ---- a/arch/x86/kernel/smpboot.c -+++ b/arch/x86/kernel/smpboot.c -@@ -632,14 +632,9 @@ static void __init build_sched_topology(void) - }; - #endif - #ifdef CONFIG_SCHED_CLUSTER -- /* -- * For now, skip the cluster domain on Hybrid. 
-- */ -- if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { -- x86_topology[i++] = (struct sched_domain_topology_level){ -- cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) -- }; -- } -+ x86_topology[i++] = (struct sched_domain_topology_level){ -+ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) -+ }; - #endif - #ifdef CONFIG_SCHED_MC - x86_topology[i++] = (struct sched_domain_topology_level){ -diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h -index 8a0d5466c7be..ae20dbb885d6 100644 ---- a/include/linux/cgroup-defs.h -+++ b/include/linux/cgroup-defs.h -@@ -661,6 +661,8 @@ struct cgroup_subsys { - void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu); - int (*css_extra_stat_show)(struct seq_file *seq, - struct cgroup_subsys_state *css); -+ int (*css_local_stat_show)(struct seq_file *seq, -+ struct cgroup_subsys_state *css); - - int (*can_attach)(struct cgroup_taskset *tset); - void (*cancel_attach)(struct cgroup_taskset *tset); -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 609bde814cb0..efc9f4bdc4ca 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -2433,9 +2433,11 @@ extern void sched_core_free(struct task_struct *tsk); - extern void sched_core_fork(struct task_struct *p); - extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, - unsigned long uaddr); -+extern int sched_core_idle_cpu(int cpu); - #else - static inline void sched_core_free(struct task_struct *tsk) { } - static inline void sched_core_fork(struct task_struct *p) { } -+static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } - #endif - - extern void sched_set_stop_task(int cpu, struct task_struct *stop); -diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h -index dd35ce28bb90..a23af225c898 100644 ---- a/include/linux/sched/task.h -+++ b/include/linux/sched/task.h -@@ -118,11 +118,47 @@ static inline struct task_struct *get_task_struct(struct task_struct *t) - } - - extern void __put_task_struct(struct task_struct *t); -+extern void __put_task_struct_rcu_cb(struct rcu_head *rhp); - - static inline void put_task_struct(struct task_struct *t) - { -- if (refcount_dec_and_test(&t->usage)) -+ if (!refcount_dec_and_test(&t->usage)) -+ return; -+ -+ /* -+ * In !RT, it is always safe to call __put_task_struct(). -+ * Under RT, we can only call it in preemptible context. -+ */ -+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { -+ static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP); -+ -+ lock_map_acquire_try(&put_task_map); - __put_task_struct(t); -+ lock_map_release(&put_task_map); -+ return; -+ } -+ -+ /* -+ * under PREEMPT_RT, we can't call put_task_struct -+ * in atomic context because it will indirectly -+ * acquire sleeping locks. -+ * -+ * call_rcu() will schedule delayed_put_task_struct_rcu() -+ * to be called in process context. -+ * -+ * __put_task_struct() is called when -+ * refcount_dec_and_test(&t->usage) succeeds. -+ * -+ * This means that it can't "conflict" with -+ * put_task_struct_rcu_user() which abuses ->rcu the same -+ * way; rcu_users has a reference so task->usage can't be -+ * zero after rcu_users 1 -> 0 transition. -+ * -+ * delayed_free_task() also uses ->rcu, but it is only called -+ * when it fails to fork a process. Therefore, there is no -+ * way it can conflict with put_task_struct(). 
-+ */ -+ call_rcu(&t->rcu, __put_task_struct_rcu_cb); - } - - DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T)) -diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c -index bfe3cd8ccf36..4e3ee13217ce 100644 ---- a/kernel/cgroup/cgroup.c -+++ b/kernel/cgroup/cgroup.c -@@ -3685,6 +3685,36 @@ static int cpu_stat_show(struct seq_file *seq, void *v) - return ret; - } - -+static int __maybe_unused cgroup_local_stat_show(struct seq_file *seq, -+ struct cgroup *cgrp, int ssid) -+{ -+ struct cgroup_subsys *ss = cgroup_subsys[ssid]; -+ struct cgroup_subsys_state *css; -+ int ret; -+ -+ if (!ss->css_local_stat_show) -+ return 0; -+ -+ css = cgroup_tryget_css(cgrp, ss); -+ if (!css) -+ return 0; -+ -+ ret = ss->css_local_stat_show(seq, css); -+ css_put(css); -+ return ret; -+} -+ -+static int cpu_local_stat_show(struct seq_file *seq, void *v) -+{ -+ struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; -+ int ret = 0; -+ -+#ifdef CONFIG_CGROUP_SCHED -+ ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id); -+#endif -+ return ret; -+} -+ - #ifdef CONFIG_PSI - static int cgroup_io_pressure_show(struct seq_file *seq, void *v) - { -@@ -5235,6 +5265,10 @@ static struct cftype cgroup_base_files[] = { - .name = "cpu.stat", - .seq_show = cpu_stat_show, - }, -+ { -+ .name = "cpu.stat.local", -+ .seq_show = cpu_local_stat_show, -+ }, - { } /* terminate */ - }; - -diff --git a/kernel/fork.c b/kernel/fork.c -index 95ca80492a37..36fb0b711541 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -989,6 +989,14 @@ void __put_task_struct(struct task_struct *tsk) - } - EXPORT_SYMBOL_GPL(__put_task_struct); - -+void __put_task_struct_rcu_cb(struct rcu_head *rhp) -+{ -+ struct task_struct *task = container_of(rhp, struct task_struct, rcu); -+ -+ __put_task_struct(task); -+} -+EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb); -+ - void __init __weak arch_task_cache_init(void) { } - - /* -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index c52c2eba7c73..83e36547af17 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -7383,6 +7383,19 @@ struct task_struct *idle_task(int cpu) - return cpu_rq(cpu)->idle; - } - -+#ifdef CONFIG_SCHED_CORE -+int sched_core_idle_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (sched_core_enabled(rq) && rq->curr == rq->idle) -+ return 1; -+ -+ return idle_cpu(cpu); -+} -+ -+#endif -+ - #ifdef CONFIG_SMP - /* - * This function computes an effective utilization for the given CPU, to be -@@ -11139,6 +11152,27 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v) - - return 0; - } -+ -+static u64 throttled_time_self(struct task_group *tg) -+{ -+ int i; -+ u64 total = 0; -+ -+ for_each_possible_cpu(i) { -+ total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time); -+ } -+ -+ return total; -+} -+ -+static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) -+{ -+ struct task_group *tg = css_tg(seq_css(sf)); -+ -+ seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg)); -+ -+ return 0; -+} - #endif /* CONFIG_CFS_BANDWIDTH */ - #endif /* CONFIG_FAIR_GROUP_SCHED */ - -@@ -11215,6 +11249,10 @@ static struct cftype cpu_legacy_files[] = { - .name = "stat", - .seq_show = cpu_cfs_stat_show, - }, -+ { -+ .name = "stat.local", -+ .seq_show = cpu_cfs_local_stat_show, -+ }, - #endif - #ifdef CONFIG_RT_GROUP_SCHED - { -@@ -11271,6 +11309,24 @@ static int cpu_extra_stat_show(struct seq_file *sf, - return 0; - } - -+static int cpu_local_stat_show(struct seq_file *sf, -+ struct cgroup_subsys_state *css) -+{ -+#ifdef 
CONFIG_CFS_BANDWIDTH -+ { -+ struct task_group *tg = css_tg(css); -+ u64 throttled_self_usec; -+ -+ throttled_self_usec = throttled_time_self(tg); -+ do_div(throttled_self_usec, NSEC_PER_USEC); -+ -+ seq_printf(sf, "throttled_usec %llu\n", -+ throttled_self_usec); -+ } -+#endif -+ return 0; -+} -+ - #ifdef CONFIG_FAIR_GROUP_SCHED - static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -@@ -11449,6 +11505,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { - .css_released = cpu_cgroup_css_released, - .css_free = cpu_cgroup_css_free, - .css_extra_stat_show = cpu_extra_stat_show, -+ .css_local_stat_show = cpu_local_stat_show, - #ifdef CONFIG_RT_GROUP_SCHED - .can_attach = cpu_cgroup_can_attach, - #endif -diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 066ff1c8ae4e..aeeba46a096b 100644 ---- a/kernel/sched/debug.c -+++ b/kernel/sched/debug.c -@@ -427,6 +427,7 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent) - #undef SDM - - debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); -+ debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops); - } - - void update_sched_domain_debugfs(void) -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index b097a9f4d817..4039ff46fcb3 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -4805,6 +4805,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) - } - - static void check_enqueue_throttle(struct cfs_rq *cfs_rq); -+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); - - static inline bool cfs_bandwidth_used(void); - -@@ -4891,8 +4892,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - - if (cfs_rq->nr_running == 1) { - check_enqueue_throttle(cfs_rq); -- if (!throttled_hierarchy(cfs_rq)) -+ if (!throttled_hierarchy(cfs_rq)) { - list_add_leaf_cfs_rq(cfs_rq); -+ } else { -+#ifdef CONFIG_CFS_BANDWIDTH -+ struct rq *rq = rq_of(cfs_rq); -+ -+ if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) -+ cfs_rq->throttled_clock = rq_clock(rq); -+ if (!cfs_rq->throttled_clock_self) -+ cfs_rq->throttled_clock_self = rq_clock(rq); -+#endif -+ } - } - } - -@@ -5395,6 +5406,17 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) - /* Add cfs_rq with load or one or more already running entities to the list */ - if (!cfs_rq_is_decayed(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); -+ -+ if (cfs_rq->throttled_clock_self) { -+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; -+ -+ cfs_rq->throttled_clock_self = 0; -+ -+ if (SCHED_WARN_ON((s64)delta < 0)) -+ delta = 0; -+ -+ cfs_rq->throttled_clock_self_time += delta; -+ } - } - - return 0; -@@ -5409,6 +5431,10 @@ static int tg_throttle_down(struct task_group *tg, void *data) - if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); - list_del_leaf_cfs_rq(cfs_rq); -+ -+ SCHED_WARN_ON(cfs_rq->throttled_clock_self); -+ if (cfs_rq->nr_running) -+ cfs_rq->throttled_clock_self = rq_clock(rq); - } - cfs_rq->throttle_count++; - -@@ -5498,7 +5524,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) - * throttled-list. rq->lock protects completion. 
- */ - cfs_rq->throttled = 1; -- cfs_rq->throttled_clock = rq_clock(rq); -+ SCHED_WARN_ON(cfs_rq->throttled_clock); -+ if (cfs_rq->nr_running) -+ cfs_rq->throttled_clock = rq_clock(rq); - return true; - } - -@@ -5516,7 +5544,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) - update_rq_clock(rq); - - raw_spin_lock(&cfs_b->lock); -- cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; -+ if (cfs_rq->throttled_clock) { -+ cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; -+ cfs_rq->throttled_clock = 0; -+ } - list_del_rcu(&cfs_rq->throttled_list); - raw_spin_unlock(&cfs_b->lock); - -@@ -7307,9 +7338,6 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) - - util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); - -- if (boost) -- util_est = max(util_est, runnable); -- - /* - * During wake-up @p isn't enqueued yet and doesn't contribute - * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. -@@ -8433,6 +8461,11 @@ enum group_type { - * more powerful CPU. - */ - group_misfit_task, -+ /* -+ * Balance SMT group that's fully busy. Can benefit from migration -+ * a task on SMT with busy sibling to another CPU on idle core. -+ */ -+ group_smt_balance, - /* - * SD_ASYM_PACKING only: One local CPU with higher capacity is available, - * and the task should be migrated to it instead of running on the -@@ -9141,6 +9174,7 @@ struct sg_lb_stats { - unsigned int group_weight; - enum group_type group_type; - unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ -+ unsigned int group_smt_balance; /* Task on busy SMT be moved */ - unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ - #ifdef CONFIG_NUMA_BALANCING - unsigned int nr_numa_running; -@@ -9414,6 +9448,9 @@ group_type group_classify(unsigned int imbalance_pct, - if (sgs->group_asym_packing) - return group_asym_packing; - -+ if (sgs->group_smt_balance) -+ return group_smt_balance; -+ - if (sgs->group_misfit_task_load) - return group_misfit_task; - -@@ -9483,6 +9520,71 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs - return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); - } - -+/* One group has more than one SMT CPU while the other group does not */ -+static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1, -+ struct sched_group *sg2) -+{ -+ if (!sg1 || !sg2) -+ return false; -+ -+ return (sg1->flags & SD_SHARE_CPUCAPACITY) != -+ (sg2->flags & SD_SHARE_CPUCAPACITY); -+} -+ -+static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, -+ struct sched_group *group) -+{ -+ if (env->idle == CPU_NOT_IDLE) -+ return false; -+ -+ /* -+ * For SMT source group, it is better to move a task -+ * to a CPU that doesn't have multiple tasks sharing its CPU capacity. -+ * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY -+ * will not be on. 
-+ */ -+ if (group->flags & SD_SHARE_CPUCAPACITY && -+ sgs->sum_h_nr_running > 1) -+ return true; -+ -+ return false; -+} -+ -+static inline long sibling_imbalance(struct lb_env *env, -+ struct sd_lb_stats *sds, -+ struct sg_lb_stats *busiest, -+ struct sg_lb_stats *local) -+{ -+ int ncores_busiest, ncores_local; -+ long imbalance; -+ -+ if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running) -+ return 0; -+ -+ ncores_busiest = sds->busiest->cores; -+ ncores_local = sds->local->cores; -+ -+ if (ncores_busiest == ncores_local) { -+ imbalance = busiest->sum_nr_running; -+ lsub_positive(&imbalance, local->sum_nr_running); -+ return imbalance; -+ } -+ -+ /* Balance such that nr_running/ncores ratio are same on both groups */ -+ imbalance = ncores_local * busiest->sum_nr_running; -+ lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running); -+ /* Normalize imbalance and do rounding on normalization */ -+ imbalance = 2 * imbalance + ncores_local + ncores_busiest; -+ imbalance /= ncores_local + ncores_busiest; -+ -+ /* Take advantage of resource in an empty sched group */ -+ if (imbalance == 0 && local->sum_nr_running == 0 && -+ busiest->sum_nr_running > 1) -+ imbalance = 2; -+ -+ return imbalance; -+} -+ - static inline bool - sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) - { -@@ -9575,6 +9677,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, - sgs->group_asym_packing = 1; - } - -+ /* Check for loaded SMT group to be balanced to dst CPU */ -+ if (!local_group && smt_balance(env, sgs, group)) -+ sgs->group_smt_balance = 1; -+ - sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); - - /* Computing avg_load makes sense only when group is overloaded */ -@@ -9659,6 +9765,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, - return false; - break; - -+ case group_smt_balance: - case group_fully_busy: - /* - * Select the fully busy group with highest avg_load. In -@@ -9687,6 +9794,18 @@ static bool update_sd_pick_busiest(struct lb_env *env, - break; - - case group_has_spare: -+ /* -+ * Do not pick sg with SMT CPUs over sg with pure CPUs, -+ * as we do not want to pull task off SMT core with one task -+ * and make the core idle. -+ */ -+ if (smt_vs_nonsmt_groups(sds->busiest, sg)) { -+ if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1) -+ return false; -+ else -+ return true; -+ } -+ - /* - * Select not overloaded group with lowest number of idle cpus - * and highest number of running tasks. 
We could also compare -@@ -9883,6 +10002,7 @@ static bool update_pick_idlest(struct sched_group *idlest, - - case group_imbalanced: - case group_asym_packing: -+ case group_smt_balance: - /* Those types are not used in the slow wakeup path */ - return false; - -@@ -10014,6 +10134,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) - - case group_imbalanced: - case group_asym_packing: -+ case group_smt_balance: - /* Those type are not used in the slow wakeup path */ - return NULL; - -@@ -10268,6 +10389,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s - return; - } - -+ if (busiest->group_type == group_smt_balance) { -+ /* Reduce number of tasks sharing CPU capacity */ -+ env->migration_type = migrate_task; -+ env->imbalance = 1; -+ return; -+ } -+ - if (busiest->group_type == group_imbalanced) { - /* - * In the group_imb case we cannot rely on group-wide averages -@@ -10315,14 +10443,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s - } - - if (busiest->group_weight == 1 || sds->prefer_sibling) { -- unsigned int nr_diff = busiest->sum_nr_running; - /* - * When prefer sibling, evenly spread running tasks on - * groups. - */ - env->migration_type = migrate_task; -- lsub_positive(&nr_diff, local->sum_nr_running); -- env->imbalance = nr_diff; -+ env->imbalance = sibling_imbalance(env, sds, busiest, local); - } else { - - /* -@@ -10519,20 +10645,27 @@ static struct sched_group *find_busiest_group(struct lb_env *env) - * group's child domain. - */ - if (sds.prefer_sibling && local->group_type == group_has_spare && -- busiest->sum_nr_running > local->sum_nr_running + 1) -+ sibling_imbalance(env, &sds, busiest, local) > 1) - goto force_balance; - - if (busiest->group_type != group_overloaded) { -- if (env->idle == CPU_NOT_IDLE) -+ if (env->idle == CPU_NOT_IDLE) { - /* - * If the busiest group is not overloaded (and as a - * result the local one too) but this CPU is already - * busy, let another idle CPU try to pull task. - */ - goto out_balanced; -+ } -+ -+ if (busiest->group_type == group_smt_balance && -+ smt_vs_nonsmt_groups(sds.local, sds.busiest)) { -+ /* Let non SMT CPU pull from SMT CPU sharing with sibling */ -+ goto force_balance; -+ } - - if (busiest->group_weight > 1 && -- local->idle_cpus <= (busiest->idle_cpus + 1)) -+ local->idle_cpus <= (busiest->idle_cpus + 1)) { - /* - * If the busiest group is not overloaded - * and there is no imbalance between this and busiest -@@ -10543,12 +10676,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) - * there is more than 1 CPU per group. - */ - goto out_balanced; -+ } - -- if (busiest->sum_h_nr_running == 1) -+ if (busiest->sum_h_nr_running == 1) { - /* - * busiest doesn't have any tasks waiting to run - */ - goto out_balanced; -+ } - } - - force_balance: -@@ -10782,7 +10917,7 @@ static int active_load_balance_cpu_stop(void *data); - static int should_we_balance(struct lb_env *env) - { - struct sched_group *sg = env->sd->groups; -- int cpu; -+ int cpu, idle_smt = -1; - - /* - * Ensure the balancing environment is consistent; can happen -@@ -10809,10 +10944,24 @@ static int should_we_balance(struct lb_env *env) - if (!idle_cpu(cpu)) - continue; - -+ /* -+ * Don't balance to idle SMT in busy core right away when -+ * balancing cores, but remember the first idle SMT CPU for -+ * later consideration. Find CPU on an idle core first. 
-+ */ -+ if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) { -+ if (idle_smt == -1) -+ idle_smt = cpu; -+ continue; -+ } -+ - /* Are we the first idle CPU? */ - return cpu == env->dst_cpu; - } - -+ if (idle_smt == env->dst_cpu) -+ return true; -+ - /* Are we the first CPU of this group ? */ - return group_balance_cpu(sg) == env->dst_cpu; - } -diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c -index 81fca77397f6..2ccb0b2ebd78 100644 ---- a/kernel/sched/psi.c -+++ b/kernel/sched/psi.c -@@ -140,7 +140,7 @@ - static int psi_bug __read_mostly; - - DEFINE_STATIC_KEY_FALSE(psi_disabled); --DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); -+static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); - - #ifdef CONFIG_PSI_DEFAULT_DISABLED - static bool psi_enable; -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index e93e006a942b..9baeb1a2dfdd 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -636,6 +636,8 @@ struct cfs_rq { - u64 throttled_clock; - u64 throttled_clock_pelt; - u64 throttled_clock_pelt_time; -+ u64 throttled_clock_self; -+ u64 throttled_clock_self_time; - int throttled; - int throttle_count; - struct list_head throttled_list; -@@ -1882,6 +1884,7 @@ struct sched_group { - atomic_t ref; - - unsigned int group_weight; -+ unsigned int cores; - struct sched_group_capacity *sgc; - int asym_prefer_cpu; /* CPU of highest priority in group */ - int flags; -diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index d3a3b2646ec4..4bbe1631d950 100644 ---- a/kernel/sched/topology.c -+++ b/kernel/sched/topology.c -@@ -1275,14 +1275,26 @@ build_sched_groups(struct sched_domain *sd, int cpu) - static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) - { - struct sched_group *sg = sd->groups; -+ struct cpumask *mask = sched_domains_tmpmask2; - - WARN_ON(!sg); - - do { -- int cpu, max_cpu = -1; -+ int cpu, cores = 0, max_cpu = -1; - - sg->group_weight = cpumask_weight(sched_group_span(sg)); - -+ cpumask_copy(mask, sched_group_span(sg)); -+ for_each_cpu(cpu, mask) { -+ cores++; -+#ifdef CONFIG_SCHED_SMT -+ cpumask_andnot(mask, mask, cpu_smt_mask(cpu)); -+#else -+ __cpumask_clear_cpu(cpu, mask); -+#endif -+ } -+ sg->cores = cores; -+ - if (!(sd->flags & SD_ASYM_PACKING)) - goto next; - -diff --git a/kernel/softirq.c b/kernel/softirq.c -index 807b34ccd797..210cf5f8d92c 100644 ---- a/kernel/softirq.c -+++ b/kernel/softirq.c -@@ -612,7 +612,7 @@ static inline void tick_irq_exit(void) - int cpu = smp_processor_id(); - - /* Make sure that timer wheel updates are propagated */ -- if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { -+ if ((sched_core_idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { - if (!in_hardirq()) - tick_nohz_irq_exit(); - } --- -2.41.0 - -From 06e0e78e6ce4cea4215ba00474d011f49a3ff8f5 Mon Sep 17 00:00:00 2001 +From 4e22e9e9fa30a6a257a12a24844d77c4e8362b71 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 10 Jul 2023 17:11:55 +0200 -Subject: [PATCH 7/7] zstd +Subject: [PATCH 5/5] zstd Signed-off-by: Peter Jung --- diff --git a/patches/0002-eevdf.patch b/patches/0002-eevdf.patch index 590e6a9..363a80d 100644 --- a/patches/0002-eevdf.patch +++ b/patches/0002-eevdf.patch @@ -1,7 +1,7 @@ -From 0af97bb369de3bfe15d724e9bb0e3c971c6f9f20 Mon Sep 17 00:00:00 2001 +From 218c51e49185b75b4e36c8f11b5c77686f955a0a Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 10 Jul 2023 17:12:45 +0200 -Subject: [PATCH] EEVDF-cachy +Date: Sun, 30 Jul 2023 09:38:51 +0200 +Subject: [PATCH] EEVDF Signed-off-by: Peter 
Jung --- @@ -13,11 +13,11 @@ Signed-off-by: Peter Jung init/init_task.c | 3 +- kernel/sched/core.c | 65 +- kernel/sched/debug.c | 49 +- - kernel/sched/fair.c | 1157 +++++++++++------------ - kernel/sched/features.h | 24 +- - kernel/sched/sched.h | 22 +- + kernel/sched/fair.c | 1138 +++++++++++------------ + kernel/sched/features.h | 23 +- + kernel/sched/sched.h | 21 +- tools/include/uapi/linux/sched.h | 4 +- - 12 files changed, 733 insertions(+), 658 deletions(-) + 12 files changed, 702 insertions(+), 668 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 4ef890191196..3a8d3e1e5591 100644 @@ -78,7 +78,7 @@ index 7ee7ed5de722..6dbc5a1bf6a8 100644 * Template for declaring augmented rbtree callbacks (generic case) * diff --git a/include/linux/sched.h b/include/linux/sched.h -index efc9f4bdc4ca..e99a9aa6a972 100644 +index 609bde814cb0..c940c4dc8304 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -549,13 +549,18 @@ struct sched_entity { @@ -196,7 +196,7 @@ index ff6c4b9bfe6b..511cbcf3510d 100644 .rt = { .run_list = LIST_HEAD_INIT(init_task.rt.run_list), diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 83e36547af17..8a541fe2d462 100644 +index c52c2eba7c73..aff81e12460e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_struct *p, bool update_load) @@ -232,7 +232,7 @@ index 83e36547af17..8a541fe2d462 100644 /* * We don't need the reset flag anymore after the fork. It has -@@ -7529,7 +7539,7 @@ static struct task_struct *find_process_by_pid(pid_t pid) +@@ -7516,7 +7526,7 @@ static struct task_struct *find_process_by_pid(pid_t pid) #define SETPARAM_POLICY -1 static void __setscheduler_params(struct task_struct *p, @@ -241,7 +241,7 @@ index 83e36547af17..8a541fe2d462 100644 { int policy = attr->sched_policy; -@@ -7553,6 +7563,13 @@ static void __setscheduler_params(struct task_struct *p, +@@ -7540,6 +7550,13 @@ static void __setscheduler_params(struct task_struct *p, set_load_weight(p, true); } @@ -255,7 +255,7 @@ index 83e36547af17..8a541fe2d462 100644 /* * Check the target process has a UID that matches the current process's: */ -@@ -7687,6 +7704,13 @@ static int __sched_setscheduler(struct task_struct *p, +@@ -7674,6 +7691,13 @@ static int __sched_setscheduler(struct task_struct *p, return retval; } @@ -269,7 +269,7 @@ index 83e36547af17..8a541fe2d462 100644 /* Update task specific "requested" clamps */ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { retval = uclamp_validate(p, attr); -@@ -7734,6 +7758,9 @@ static int __sched_setscheduler(struct task_struct *p, +@@ -7721,6 +7745,9 @@ static int __sched_setscheduler(struct task_struct *p, goto change; if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) goto change; @@ -279,7 +279,7 @@ index 83e36547af17..8a541fe2d462 100644 p->sched_reset_on_fork = reset_on_fork; retval = 0; -@@ -7822,6 +7849,7 @@ static int __sched_setscheduler(struct task_struct *p, +@@ -7809,6 +7836,7 @@ static int __sched_setscheduler(struct task_struct *p, __setscheduler_params(p, attr); __setscheduler_prio(p, newprio); } @@ -287,7 +287,7 @@ index 83e36547af17..8a541fe2d462 100644 __setscheduler_uclamp(p, attr); if (queued) { -@@ -8033,6 +8061,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a +@@ -8020,6 +8048,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a size < SCHED_ATTR_SIZE_VER1) return -EINVAL; @@ -297,7 +297,7 @@ index 
83e36547af17..8a541fe2d462 100644 /* * XXX: Do we want to be lenient like existing syscalls; or do we want * to be strict and return an error on out-of-bounds values? -@@ -8270,6 +8301,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, +@@ -8257,6 +8288,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, get_params(p, &kattr); kattr.sched_flags &= SCHED_FLAG_ALL; @@ -306,7 +306,7 @@ index 83e36547af17..8a541fe2d462 100644 #ifdef CONFIG_UCLAMP_TASK /* * This could race with another potential updater, but this is fine -@@ -11214,6 +11247,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, +@@ -11180,6 +11213,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, { return sched_group_set_idle(css_tg(css), idle); } @@ -332,7 +332,7 @@ index 83e36547af17..8a541fe2d462 100644 #endif static struct cftype cpu_legacy_files[] = { -@@ -11228,6 +11280,11 @@ static struct cftype cpu_legacy_files[] = { +@@ -11194,6 +11246,11 @@ static struct cftype cpu_legacy_files[] = { .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, }, @@ -344,7 +344,7 @@ index 83e36547af17..8a541fe2d462 100644 #endif #ifdef CONFIG_CFS_BANDWIDTH { -@@ -11467,6 +11524,12 @@ static struct cftype cpu_files[] = { +@@ -11411,6 +11468,12 @@ static struct cftype cpu_files[] = { .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, }, @@ -358,7 +358,7 @@ index 83e36547af17..8a541fe2d462 100644 #ifdef CONFIG_CFS_BANDWIDTH { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index aeeba46a096b..5c743bcb340d 100644 +index 066ff1c8ae4e..e7e83181fbb6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -347,10 +347,7 @@ static __init int sched_init_debug(void) @@ -373,7 +373,7 @@ index aeeba46a096b..5c743bcb340d 100644 debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); -@@ -582,9 +579,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +@@ -581,9 +578,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " %c", task_state_to_char(p)); @@ -388,7 +388,7 @@ index aeeba46a096b..5c743bcb340d 100644 (long long)(p->nvcsw + p->nivcsw), p->prio); -@@ -627,10 +628,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) +@@ -626,10 +627,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { @@ -401,7 +401,7 @@ index aeeba46a096b..5c743bcb340d 100644 unsigned long flags; #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -644,26 +644,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) +@@ -643,26 +643,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_rq_lock_irqsave(rq, flags); @@ -441,7 +441,7 @@ index aeeba46a096b..5c743bcb340d 100644 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); -@@ -864,10 +863,7 @@ static void sched_debug_header(struct seq_file *m) +@@ -863,10 +862,7 @@ static void sched_debug_header(struct seq_file *m) SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) @@ -453,7 +453,7 @@ index aeeba46a096b..5c743bcb340d 100644 P(sysctl_sched_child_runs_first); 
P(sysctl_sched_features); #undef PN -@@ -1090,6 +1086,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, +@@ -1089,6 +1085,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, #endif P(policy); P(prio); @@ -462,7 +462,7 @@ index aeeba46a096b..5c743bcb340d 100644 P(dl.runtime); P(dl.deadline); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 4039ff46fcb3..0fbb8fb24a50 100644 +index 2c335df30171..461409c0eac7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -47,6 +47,7 @@ @@ -594,7 +594,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 const struct sched_class fair_sched_class; -@@ -619,13 +569,200 @@ static inline bool entity_before(const struct sched_entity *a, +@@ -619,13 +569,198 @@ static inline bool entity_before(const struct sched_entity *a, return (s64)(a->vruntime - b->vruntime) < 0; } @@ -671,7 +671,6 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 + s64 key = entity_key(cfs_rq, se); + + cfs_rq->avg_vruntime += key * weight; -+ cfs_rq->avg_slice += se->slice * weight; + cfs_rq->avg_load += weight; +} + @@ -682,7 +681,6 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 + s64 key = entity_key(cfs_rq, se); + + cfs_rq->avg_vruntime -= key * weight; -+ cfs_rq->avg_slice -= se->slice * weight; + cfs_rq->avg_load -= weight; +} + @@ -796,7 +794,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 u64 vruntime = cfs_rq->min_vruntime; -@@ -636,9 +773,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) +@@ -636,9 +771,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) curr = NULL; } @@ -807,7 +805,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 if (!curr) vruntime = se->vruntime; else -@@ -647,7 +782,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) +@@ -647,7 +780,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) /* ensure we never gain time by being placed backwards. */ u64_u32_store(cfs_rq->min_vruntime, @@ -816,7 +814,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) -@@ -655,17 +790,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +@@ -655,17 +788,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) return entity_before(__node_2_se(a), __node_2_se(b)); } @@ -870,7 +868,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) -@@ -678,14 +847,81 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) +@@ -678,14 +845,81 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) return __node_2_se(left); } @@ -927,8 +925,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 + if (best->deadline == best->min_deadline) + break; + } - -- return __node_2_se(next); ++ + /* + * If the earlest deadline in this subtree is in the fully + * eligible left half of our space, go there. 
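The avg_vruntime hunks above maintain the weighted mean virtual runtime V incrementally, with each key taken relative to cfs_rq->min_vruntime so the products stay small. Below is a compilable toy sketch of that bookkeeping, handy for checking the arithmetic by hand; the struct, helper names and numbers are illustrative only and not part of the patch, and the kernel code additionally folds in the currently running entity and uses div_s64():

#include <stdio.h>
#include <stdint.h>

/* Toy mirror of the cfs_rq fields touched by avg_vruntime_add()/avg_vruntime(). */
struct toy_rq {
	int64_t  avg_vruntime;	/* sum of w_i * (v_i - min_vruntime) */
	uint64_t avg_load;	/* sum of w_i */
	uint64_t min_vruntime;
};

static void toy_enqueue(struct toy_rq *rq, uint64_t weight, uint64_t vruntime)
{
	int64_t key = (int64_t)(vruntime - rq->min_vruntime);

	rq->avg_vruntime += key * (int64_t)weight;	/* as in avg_vruntime_add() */
	rq->avg_load += weight;
}

/* V = min_vruntime + (sum of w_i * key_i) / (sum of w_i) */
static uint64_t toy_avg_vruntime(const struct toy_rq *rq)
{
	return rq->min_vruntime + (uint64_t)(rq->avg_vruntime / (int64_t)rq->avg_load);
}

int main(void)
{
	struct toy_rq rq = { .min_vruntime = 1000 };

	toy_enqueue(&rq, 1024, 1000);	/* nice-0 task, right on time */
	toy_enqueue(&rq, 1024, 1300);	/* nice-0 task, far ahead of its service */
	toy_enqueue(&rq, 335, 1040);	/* nice-5 task */

	uint64_t v = toy_avg_vruntime(&rq);

	/* An entity is eligible when its vruntime has not passed V (lag >= 0). */
	printf("V = %llu, so the task at vruntime 1300 is %seligible\n",
	       (unsigned long long)v, 1300 <= v ? "" : "not ");
	return 0;
}

With these made-up values V works out to 1134, so the task that ran far ahead is ineligible and pick_eevdf() would not select it.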
@@ -941,7 +938,8 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 + + node = node->rb_right; + } -+ + +- return __node_2_se(next); + if (!best || (curr && deadline_gt(deadline, best, curr))) + best = curr; + @@ -957,7 +955,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } #ifdef CONFIG_SCHED_DEBUG -@@ -707,104 +943,53 @@ int sched_update_scaling(void) +@@ -707,104 +941,53 @@ int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -1090,7 +1088,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } #include "pelt.h" -@@ -939,6 +1124,7 @@ static void update_curr(struct cfs_rq *cfs_rq) +@@ -939,6 +1122,7 @@ static void update_curr(struct cfs_rq *cfs_rq) schedstat_add(cfs_rq->exec_clock, delta_exec); curr->vruntime += calc_delta_fair(delta_exec, curr); @@ -1098,7 +1096,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { -@@ -3393,16 +3579,36 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +@@ -3393,16 +3577,36 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { @@ -1135,7 +1133,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 #ifdef CONFIG_SMP do { u32 divider = get_pelt_divider(&se->avg); -@@ -3412,9 +3618,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, +@@ -3412,9 +3616,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, #endif enqueue_load_avg(cfs_rq, se); @@ -1149,7 +1147,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } void reweight_task(struct task_struct *p, int prio) -@@ -4710,98 +4918,140 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} +@@ -4710,158 +4916,123 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ @@ -1167,94 +1165,42 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 -} - -static inline bool entity_is_long_sleeper(struct sched_entity *se) -+static inline bool -+entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 vslice, int flags) ++static void ++place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - struct cfs_rq *cfs_rq; - u64 sleep_time; -+ u64 now, vdelta; -+ s64 delta; - -- if (se->exec_start == 0) -+ if (!(flags & ENQUEUE_WAKEUP)) - return false; - -- cfs_rq = cfs_rq_of(se); - -- sleep_time = rq_clock_task(rq_of(cfs_rq)); -+ if (flags & ENQUEUE_MIGRATED) -+ return true; - -- /* Happen while migrating because of clock task divergence */ -- if (sleep_time <= se->exec_start) -+ now = rq_clock_task(rq_of(cfs_rq)); -+ delta = now - se->exec_start; -+ if (delta < 0) - return false; - -- sleep_time -= se->exec_start; -- if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD))) -- return true; -+ vdelta = __calc_delta(delta, NICE_0_LOAD, &cfs_rq->load); -+ if (vdelta < vslice) -+ return false; - -- return false; -+ return true; - } - - static void --place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) -+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - { -- u64 vruntime = cfs_rq->min_vruntime; +- if (se->exec_start == 0) +- return false; +- +- cfs_rq = cfs_rq_of(se); + u64 vslice = calc_delta_fair(se->slice, se); + u64 vruntime = avg_vruntime(cfs_rq); + s64 lag = 0; - /* -- * The 'current' period is already promised to the current tasks, -- * however the extra weight of the new task will slow them down a -- * little, place the new task so that 
it fits in the slot that -- * stays open at the end. +- sleep_time = rq_clock_task(rq_of(cfs_rq)); ++ /* + * Due to how V is constructed as the weighted average of entities, + * adding tasks with positive lag, or removing tasks with negative lag + * will move 'time' backwards, this can screw around with the lag of + * other tasks. + * + * EEVDF: placement strategy #1 / #2 - */ -- if (initial && sched_feat(START_DEBIT)) -- vruntime += sched_vslice(cfs_rq, se); ++ */ + if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { + struct sched_entity *curr = cfs_rq->curr; + unsigned long load; -- /* sleeps up to a single latency don't count. */ -- if (!initial) { -- unsigned long thresh; +- /* Happen while migrating because of clock task divergence */ +- if (sleep_time <= se->exec_start) +- return false; + lag = se->vlag; -- if (se_is_idle(se)) -- thresh = sysctl_sched_min_granularity; -- else -- thresh = sysctl_sched_latency; +- sleep_time -= se->exec_start; +- if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD))) +- return true; + /* -+ * For latency sensitive tasks; those that have a shorter than -+ * average slice and do not fully consume the slice, transition -+ * to EEVDF placement strategy #2. -+ */ -+ if (sched_feat(PLACE_FUDGE) && -+ (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) && -+ entity_has_slept(cfs_rq, se, vslice, flags)) { -+ lag += vslice; -+ if (lag > 0) -+ lag = 0; -+ } - - /* -- * Halve their sleep time's effect, to allow -- * for a gentler effect of sleepers: + * If we want to place a task and preserve lag, we have to + * consider the effect of the new entity on the weighted + * average and compensate for this, otherwise lag can quickly @@ -1305,7 +1251,52 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 + * = W*vl_i + * + * vl_i = (W + w_i)*vl'_i / W - */ ++ */ ++ load = cfs_rq->avg_load; ++ if (curr && curr->on_rq) ++ load += scale_load_down(curr->load.weight); + +- return false; +-} ++ lag *= load + scale_load_down(se->load.weight); ++ if (WARN_ON_ONCE(!load)) ++ load = 1; ++ lag = div_s64(lag, load); ++ } + +-static void +-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) +-{ +- u64 vruntime = cfs_rq->min_vruntime; ++ se->vruntime = vruntime - lag; + + /* +- * The 'current' period is already promised to the current tasks, +- * however the extra weight of the new task will slow them down a +- * little, place the new task so that it fits in the slot that +- * stays open at the end. ++ * When joining the competition; the exisiting tasks will be, ++ * on average, halfway through their slice, as such start tasks ++ * off with half a slice to ease into the competition. + */ +- if (initial && sched_feat(START_DEBIT)) +- vruntime += sched_vslice(cfs_rq, se); +- +- /* sleeps up to a single latency don't count. 
*/ +- if (!initial) { +- unsigned long thresh; +- +- if (se_is_idle(se)) +- thresh = sysctl_sched_min_granularity; +- else +- thresh = sysctl_sched_latency; ++ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) ++ vslice /= 2; + +- /* +- * Halve their sleep time's effect, to allow +- * for a gentler effect of sleepers: +- */ - if (sched_feat(GENTLE_FAIR_SLEEPERS)) - thresh >>= 1; - @@ -1335,26 +1326,6 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 - se->vruntime = vruntime; - else - se->vruntime = max_vruntime(se->vruntime, vruntime); -+ load = cfs_rq->avg_load; -+ if (curr && curr->on_rq) -+ load += scale_load_down(curr->load.weight); -+ -+ lag *= load + scale_load_down(se->load.weight); -+ if (WARN_ON_ONCE(!load)) -+ load = 1; -+ lag = div_s64(lag, load); -+ } -+ -+ se->vruntime = vruntime - lag; -+ -+ /* -+ * When joining the competition; the exisiting tasks will be, -+ * on average, halfway through their slice, as such start tasks -+ * off with half a slice to ease into the competition. -+ */ -+ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) -+ vslice /= 2; -+ + /* + * EEVDF: vd_i = ve_i + r_i/w_i + */ @@ -1362,7 +1333,6 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); -@@ -4809,60 +5059,20 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static inline bool cfs_bandwidth_used(void); @@ -1425,7 +1395,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 /* * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. -@@ -4874,18 +5084,28 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -4873,18 +5044,28 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); se_update_runnable(se); @@ -1457,7 +1427,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; -@@ -4907,17 +5127,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -4896,17 +5077,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } } @@ -1475,7 +1445,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 static void __clear_buddies_next(struct sched_entity *se) { for_each_sched_entity(se) { -@@ -4929,27 +5138,10 @@ static void __clear_buddies_next(struct sched_entity *se) +@@ -4918,27 +5088,10 @@ static void __clear_buddies_next(struct sched_entity *se) } } @@ -1503,7 +1473,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); -@@ -4983,20 +5175,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -4972,20 +5125,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) clear_buddies(cfs_rq, se); @@ -1525,7 +1495,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); -@@ -5015,52 +5199,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -5004,52 +5149,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_idle_cfs_rq_clock_pelt(cfs_rq); } @@ -1578,7 +1548,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { -@@ -5099,9 +5237,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -5088,9 +5187,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 
se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -1588,7 +1558,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 /* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups -@@ -5112,50 +5247,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); +@@ -5101,50 +5197,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { @@ -1644,7 +1614,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); -@@ -5172,8 +5271,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) +@@ -5161,8 +5221,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); @@ -1653,7 +1623,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 if (prev->on_rq) { update_stats_wait_start_fair(cfs_rq, prev); /* Put 'current' back into the tree. */ -@@ -5214,9 +5311,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) +@@ -5203,9 +5261,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) return; #endif @@ -1663,7 +1633,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } -@@ -6259,13 +6353,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} +@@ -6228,13 +6283,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; @@ -1678,7 +1648,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 s64 delta = slice - ran; if (delta < 0) { -@@ -6289,8 +6382,7 @@ static void hrtick_update(struct rq *rq) +@@ -6258,8 +6312,7 @@ static void hrtick_update(struct rq *rq) if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) return; @@ -1688,7 +1658,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } #else /* !CONFIG_SCHED_HRTICK */ static inline void -@@ -6331,17 +6423,6 @@ static int sched_idle_rq(struct rq *rq) +@@ -6300,17 +6353,6 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } @@ -1706,7 +1676,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 #ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { -@@ -7844,18 +7925,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) +@@ -7816,18 +7858,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { struct sched_entity *se = &p->se; @@ -1725,7 +1695,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 if (!task_on_rq_migrating(p)) { remove_entity_load_avg(se); -@@ -7893,66 +7962,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +@@ -7865,66 +7895,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } #endif /* CONFIG_SMP */ @@ -1792,7 +1762,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { -@@ -7964,12 +7973,6 @@ static void set_next_buddy(struct sched_entity *se) +@@ -7936,12 +7906,6 @@ static void set_next_buddy(struct sched_entity *se) } } @@ -1805,7 +1775,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 /* * Preempt the current task with a newly woken task if needed: */ -@@ -7978,7 +7981,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7950,7 +7914,6 @@ static void check_preempt_wakeup(struct 
rq *rq, struct task_struct *p, int wake_ struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); @@ -1813,7 +1783,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 int next_buddy_marked = 0; int cse_is_idle, pse_is_idle; -@@ -7994,7 +7996,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7966,7 +7929,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) return; @@ -1822,7 +1792,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 set_next_buddy(pse); next_buddy_marked = 1; } -@@ -8039,35 +8041,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -8011,35 +7974,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (cse_is_idle != pse_is_idle) return; @@ -1865,7 +1835,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } #ifdef CONFIG_SMP -@@ -8268,8 +8254,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +@@ -8240,8 +8187,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) /* * sched_yield() is very simple @@ -1874,7 +1844,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 */ static void yield_task_fair(struct rq *rq) { -@@ -8285,21 +8269,19 @@ static void yield_task_fair(struct rq *rq) +@@ -8257,21 +8202,19 @@ static void yield_task_fair(struct rq *rq) clear_buddies(cfs_rq, se); @@ -1908,7 +1878,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) -@@ -8547,8 +8529,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) +@@ -8514,8 +8457,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) * Buddy candidates are cache hot: */ if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && @@ -1918,7 +1888,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 return 1; if (sysctl_sched_migration_cost == -1) -@@ -12174,8 +12155,8 @@ static void rq_offline_fair(struct rq *rq) +@@ -12025,8 +11967,8 @@ static void rq_offline_fair(struct rq *rq) static inline bool __entity_slice_used(struct sched_entity *se, int min_nr_tasks) { @@ -1928,7 +1898,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 return (rtime * min_nr_tasks > slice); } -@@ -12331,8 +12312,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) +@@ -12182,8 +12124,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) */ static void task_fork_fair(struct task_struct *p) { @@ -1938,7 +1908,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 struct rq *rq = this_rq(); struct rq_flags rf; -@@ -12341,22 +12322,9 @@ static void task_fork_fair(struct task_struct *p) +@@ -12192,22 +12134,9 @@ static void task_fork_fair(struct task_struct *p) cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; @@ -1963,7 +1933,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 rq_unlock(rq, &rf); } -@@ -12385,34 +12353,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +@@ -12236,34 +12165,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) check_preempt_curr(rq, p, 0); } @@ -1998,7 +1968,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 #ifdef CONFIG_FAIR_GROUP_SCHED /* * Propagate the changes of the sched_entity across the tg tree to make it -@@ -12483,16 +12423,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) +@@ -12334,16 +12235,6 @@ static void attach_entity_cfs_rq(struct sched_entity 
*se) static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; @@ -2015,7 +1985,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 detach_entity_cfs_rq(se); } -@@ -12500,12 +12430,8 @@ static void detach_task_cfs_rq(struct task_struct *p) +@@ -12351,12 +12242,8 @@ static void detach_task_cfs_rq(struct task_struct *p) static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; @@ -2028,7 +1998,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } static void switched_from_fair(struct rq *rq, struct task_struct *p) -@@ -12616,6 +12542,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +@@ -12467,6 +12354,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) goto err; tg->shares = NICE_0_LOAD; @@ -2036,7 +2006,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 init_cfs_bandwidth(tg_cfs_bandwidth(tg)); -@@ -12714,6 +12641,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, +@@ -12565,6 +12453,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, } se->my_q = cfs_rq; @@ -2046,7 +2016,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 /* guarantee group entities always have weight */ update_load_set(&se->load, NICE_0_LOAD); se->parent = parent; -@@ -12844,6 +12774,29 @@ int sched_group_set_idle(struct task_group *tg, long idle) +@@ -12695,6 +12586,29 @@ int sched_group_set_idle(struct task_group *tg, long idle) return 0; } @@ -2076,7 +2046,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 #else /* CONFIG_FAIR_GROUP_SCHED */ void free_fair_sched_group(struct task_group *tg) { } -@@ -12870,7 +12823,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task +@@ -12721,7 +12635,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * idle runqueue: */ if (rq->cfs.load.weight) @@ -2086,10 +2056,10 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 return rr_interval; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index ee7f23c76bd3..7d65b40299d9 100644 +index ee7f23c76bd3..54334ca5c5c6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h -@@ -1,16 +1,12 @@ +@@ -1,16 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* - * Only give sleepers 50% of their service deficit. This allows @@ -2106,12 +2076,11 @@ index ee7f23c76bd3..7d65b40299d9 100644 */ -SCHED_FEAT(START_DEBIT, true) +SCHED_FEAT(PLACE_LAG, true) -+SCHED_FEAT(PLACE_FUDGE, true) +SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) /* * Prefer to schedule the task we woke last (assuming it failed -@@ -19,13 +15,6 @@ SCHED_FEAT(START_DEBIT, true) +@@ -19,13 +14,6 @@ SCHED_FEAT(START_DEBIT, true) */ SCHED_FEAT(NEXT_BUDDY, false) @@ -2125,7 +2094,7 @@ index ee7f23c76bd3..7d65b40299d9 100644 /* * Consider buddies to be cache hot, decreases the likeliness of a * cache buddy being migrated away, increases cache locality. 
-@@ -98,6 +87,3 @@ SCHED_FEAT(UTIL_EST, true) +@@ -98,6 +86,3 @@ SCHED_FEAT(UTIL_EST, true) SCHED_FEAT(UTIL_EST_FASTUP, true) SCHED_FEAT(LATENCY_WARN, false) @@ -2133,7 +2102,7 @@ index ee7f23c76bd3..7d65b40299d9 100644 -SCHED_FEAT(ALT_PERIOD, true) -SCHED_FEAT(BASE_SLICE, true) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 9baeb1a2dfdd..4236c4c893aa 100644 +index e93e006a942b..67cd7e1fd501 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -372,6 +372,8 @@ struct task_group { @@ -2154,18 +2123,17 @@ index 9baeb1a2dfdd..4236c4c893aa 100644 #ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); -@@ -548,6 +552,10 @@ struct cfs_rq { +@@ -548,6 +552,9 @@ struct cfs_rq { unsigned int idle_nr_running; /* SCHED_IDLE */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ + s64 avg_vruntime; -+ u64 avg_slice; + u64 avg_load; + u64 exec_clock; u64 min_vruntime; #ifdef CONFIG_SCHED_CORE -@@ -567,8 +575,6 @@ struct cfs_rq { +@@ -567,8 +574,6 @@ struct cfs_rq { */ struct sched_entity *curr; struct sched_entity *next; @@ -2174,7 +2142,7 @@ index 9baeb1a2dfdd..4236c4c893aa 100644 #ifdef CONFIG_SCHED_DEBUG unsigned int nr_spread_over; -@@ -2198,6 +2204,7 @@ extern const u32 sched_prio_to_wmult[40]; +@@ -2195,6 +2200,7 @@ extern const u32 sched_prio_to_wmult[40]; #else #define ENQUEUE_MIGRATED 0x00 #endif @@ -2182,7 +2150,7 @@ index 9baeb1a2dfdd..4236c4c893aa 100644 #define RETRY_TASK ((void *)-1UL) -@@ -2502,11 +2509,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); +@@ -2499,11 +2505,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; @@ -2196,7 +2164,7 @@ index 9baeb1a2dfdd..4236c4c893aa 100644 extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; -@@ -2519,6 +2524,8 @@ extern unsigned int sysctl_numa_balancing_scan_size; +@@ -2516,6 +2520,8 @@ extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_hot_threshold; #endif @@ -2205,7 +2173,7 @@ index 9baeb1a2dfdd..4236c4c893aa 100644 #ifdef CONFIG_SCHED_HRTICK /* -@@ -3483,4 +3490,7 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } +@@ -3480,4 +3486,7 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } static inline void init_sched_mm_cid(struct task_struct *t) { } #endif diff --git a/patches/0002-eevdfbore.patch b/patches/0002-eevdfbore.patch index 6d53439..0465cdf 100644 --- a/patches/0002-eevdfbore.patch +++ b/patches/0002-eevdfbore.patch @@ -1,49 +1,76 @@ -From e6e251fb3f3927c18ac4f2a22a43c6c198133d19 Mon Sep 17 00:00:00 2001 -From: Piotr Gorski -Date: Sun, 23 Jul 2023 09:46:42 +0200 +From 377657f92d256b364813e3f8b2a58edfc9833815 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sun, 30 Jul 2023 09:43:51 +0200 Subject: [PATCH] bore-eevdf -Signed-off-by: Piotr Gorski +Signed-off-by: Peter Jung --- - include/linux/sched.h | 10 ++ + include/linux/sched.h | 30 ++++++ init/Kconfig | 20 ++++ - kernel/sched/core.c | 117 +++++++++++++++++++++++ + kernel/sched/core.c | 118 +++++++++++++++++++++ kernel/sched/debug.c | 4 + - kernel/sched/fair.c | 203 ++++++++++++++++++++++++++++++++++++++-- + kernel/sched/fair.c | 228 ++++++++++++++++++++++++++++++++++++++-- kernel/sched/features.h | 4 + kernel/sched/sched.h | 1 + - 7 files changed, 351 insertions(+), 8 
deletions(-) + 7 files changed, 397 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h -index e99a9aa6a..14a1ce058 100644 +index c940c4dc8304..8663c0813f81 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h -@@ -559,6 +559,12 @@ struct sched_entity { +@@ -545,6 +545,26 @@ struct sched_statistics { + #endif /* CONFIG_SCHEDSTATS */ + } ____cacheline_aligned; + ++#ifdef CONFIG_SCHED_BORE ++union union16 { ++ u16 u16; ++ s16 s16; ++ u8 u8[2]; ++ s8 s8[2]; ++}; ++typedef union union16 x16; ++ ++union union32 { ++ u32 u32; ++ s32 s32; ++ u16 u16[2]; ++ s16 s16[2]; ++ u8 u8[4]; ++ s8 s8[4]; ++}; ++typedef union union32 x32; ++#endif // CONFIG_SCHED_BORE ++ + struct sched_entity { + /* For load-balancing: */ + struct load_weight load; +@@ -559,6 +579,12 @@ struct sched_entity { u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; +#ifdef CONFIG_SCHED_BORE -+ u64 prev_burst_time; + u64 burst_time; -+ u64 max_burst_time; -+ u8 penalty_score; ++ u16 prev_burst_penalty; ++ u16 curr_burst_penalty; ++ u16 burst_penalty; +#endif // CONFIG_SCHED_BORE s64 vlag; u64 slice; -@@ -990,6 +996,10 @@ struct task_struct { +@@ -990,6 +1016,10 @@ struct task_struct { struct list_head children; struct list_head sibling; struct task_struct *group_leader; +#ifdef CONFIG_SCHED_BORE -+ u64 child_burst_cache; ++ u16 child_burst_cache; + u64 child_burst_last_cached; +#endif // CONFIG_SCHED_BORE /* * 'ptraced' is the list of tasks this task is using ptrace() on. diff --git a/init/Kconfig b/init/Kconfig -index 71755cc8e..c697be79e 100644 +index 71755cc8ed3e..c697be79e594 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1277,6 +1277,26 @@ config CHECKPOINT_RESTORE @@ -74,30 +101,31 @@ index 71755cc8e..c697be79e 100644 bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 8a541fe2d..13969a3a3 100644 +index aff81e12460e..839605620f63 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -4491,6 +4491,112 @@ int wake_up_state(struct task_struct *p, unsigned int state) +@@ -4491,6 +4491,113 @@ int wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } +#ifdef CONFIG_SCHED_BORE -+#define CHILD_BURST_CUTOFF_BITS 9 +extern unsigned int sched_burst_cache_lifetime; +extern unsigned int sched_burst_fork_atavistic; + +void __init sched_init_bore(void) { + init_task.child_burst_cache = 0; + init_task.child_burst_last_cached = 0; -+ init_task.se.prev_burst_time = 0; + init_task.se.burst_time = 0; -+ init_task.se.max_burst_time = 0; ++ init_task.se.prev_burst_penalty = 0; ++ init_task.se.curr_burst_penalty = 0; ++ init_task.se.burst_penalty = 0; +} + +void inline sched_fork_bore(struct task_struct *p) { + p->child_burst_cache = 0; + p->child_burst_last_cached = 0; + p->se.burst_time = 0; ++ p->se.curr_burst_penalty = 0; +} + +static u32 count_child_tasks(struct task_struct *p) { @@ -112,31 +140,31 @@ index 8a541fe2d..13969a3a3 100644 +} + +static void __update_child_burst_cache( -+ struct task_struct *p, u32 cnt, u64 sum, u64 now) { -+ u64 avg = 0; -+ if (cnt) avg = div_u64(sum, cnt) << CHILD_BURST_CUTOFF_BITS; -+ p->child_burst_cache = max(avg, p->se.max_burst_time); ++ struct task_struct *p, u32 cnt, u32 sum, u64 now) { ++ u16 avg = 0; ++ if (cnt) avg = DIV_ROUND_CLOSEST(sum, cnt); ++ p->child_burst_cache = max(avg, p->se.burst_penalty); + p->child_burst_last_cached = now; +} + +static void update_child_burst_cache(struct task_struct *p, u64 now) { + struct 
task_struct *child; + u32 cnt = 0; -+ u64 sum = 0; ++ u32 sum = 0; + + list_for_each_entry(child, &p->children, sibling) { + cnt++; -+ sum += child->se.max_burst_time >> CHILD_BURST_CUTOFF_BITS; ++ sum += child->se.burst_penalty; + } + + __update_child_burst_cache(p, cnt, sum, now); +} + +static void update_child_burst_cache_atavistic( -+ struct task_struct *p, u64 now, u32 depth, u32 *acnt, u64 *asum) { ++ struct task_struct *p, u64 now, u32 depth, u32 *acnt, u32 *asum) { + struct task_struct *child, *dec; + u32 cnt = 0, dcnt = 0; -+ u64 sum = 0; ++ u32 sum = 0; + + list_for_each_entry(child, &p->children, sibling) { + dec = child; @@ -145,13 +173,13 @@ index 8a541fe2d..13969a3a3 100644 + + if (!dcnt || !depth) { + cnt++; -+ sum += dec->se.max_burst_time >> CHILD_BURST_CUTOFF_BITS; ++ sum += dec->se.burst_penalty; + } else { + if (child_burst_cache_expired(dec, now)) + update_child_burst_cache_atavistic(dec, now, depth - 1, &cnt, &sum); + else { + cnt += dcnt; -+ sum += (dec->child_burst_cache >> CHILD_BURST_CUTOFF_BITS) * dcnt; ++ sum += (dec->child_burst_cache) * dcnt; + } + } + } @@ -161,12 +189,12 @@ index 8a541fe2d..13969a3a3 100644 + *asum += sum; +} + -+static void update_task_initial_burst_time(struct task_struct *p) { ++static void fork_burst_penalty(struct task_struct *p) { + struct sched_entity *se = &p->se; + struct task_struct *anc = p->real_parent; + u64 now = ktime_get_ns(); + u32 cnt = 0; -+ u64 sum = 0; ++ u32 sum = 0; + + read_lock(&tasklist_lock); + @@ -182,15 +210,15 @@ index 8a541fe2d..13969a3a3 100644 + + read_unlock(&tasklist_lock); + -+ se->max_burst_time = se->prev_burst_time = -+ max(se->prev_burst_time, anc->child_burst_cache); ++ se->burst_penalty = se->prev_burst_penalty = ++ max(se->prev_burst_penalty, anc->child_burst_cache); +} +#endif // CONFIG_SCHED_BORE + /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
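Taken together, the hunks above give a forked task its starting burst penalty from its ancestry: __update_child_burst_cache() caches max(average of the children's penalties, the parent's own penalty), and fork_burst_penalty() starts the child from max(its previous penalty, that cache). A small self-contained sketch of that arithmetic with made-up 8.8 fixed-point values (none of the numbers below come from the patch):

#include <stdio.h>
#include <stdint.h>

/* Same rounding the kernel macro performs for unsigned operands. */
#define DIV_ROUND_CLOSEST(x, d)	(((x) + (d) / 2) / (d))

int main(void)
{
	/* Hypothetical burst penalties (8.8 fixed point) of three existing children. */
	uint16_t sibling[] = { 0x0a00, 0x1200, 0x0600 };
	uint16_t parent_penalty = 0x0800;
	uint32_t sum = 0, cnt = 0;

	for (unsigned int i = 0; i < sizeof(sibling) / sizeof(sibling[0]); i++) {
		sum += sibling[i];
		cnt++;
	}

	/* __update_child_burst_cache(): cache = max(avg of children, own penalty). */
	uint16_t avg = DIV_ROUND_CLOSEST(sum, cnt);
	uint16_t cache = avg > parent_penalty ? avg : parent_penalty;

	/* fork_burst_penalty(): the new child starts from max(prev penalty, cache). */
	uint16_t child_prev = 0;
	uint16_t child_start = cache > child_prev ? cache : child_prev;

	/* The upper byte is the part later used to scale delta_exec. */
	printf("child starts with burst_penalty 0x%04x (score %u)\n",
	       (unsigned int)child_start, (unsigned int)(child_start >> 8));
	return 0;
}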
-@@ -4507,6 +4613,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4507,6 +4614,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; @@ -200,30 +228,30 @@ index 8a541fe2d..13969a3a3 100644 p->se.vlag = 0; INIT_LIST_HEAD(&p->se.group_node); -@@ -4828,6 +4937,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) +@@ -4828,6 +4938,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) void sched_post_fork(struct task_struct *p) { +#ifdef CONFIG_SCHED_BORE -+ update_task_initial_burst_time(p); ++ fork_burst_penalty(p); +#endif // CONFIG_SCHED_BORE uclamp_post_fork(p); } -@@ -9967,6 +10079,11 @@ void __init sched_init(void) +@@ -9954,6 +10067,11 @@ void __init sched_init(void) BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif +#ifdef CONFIG_SCHED_BORE + sched_init_bore(); -+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 2.5.3 by Masahito Suzuki"); ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 3.0 Beta2 by Masahito Suzuki"); +#endif // CONFIG_SCHED_BORE + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 5c743bcb3..755ef4c8d 100644 +index e7e83181fbb6..ff41a524c1ee 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -348,6 +348,7 @@ static __init int sched_init_debug(void) @@ -234,18 +262,18 @@ index 5c743bcb3..755ef4c8d 100644 debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); -@@ -595,6 +596,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +@@ -594,6 +595,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); +#ifdef CONFIG_SCHED_BORE -+ SEQ_printf(m, " %2d", p->se.penalty_score); ++ SEQ_printf(m, " %2d", ((x16*)&p->se.burst_penalty)->u8[1]); +#endif #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index d6042543c..e52c14232 100644 +index 461409c0eac7..90ce27fb0a3f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -19,6 +19,9 @@ @@ -281,7 +309,7 @@ index d6042543c..e52c14232 100644 /* * After fork, child runs first. If set to 0 (default) then -@@ -84,8 +87,76 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; +@@ -84,8 +87,93 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; */ unsigned int sysctl_sched_child_runs_first __read_mostly; @@ -292,61 +320,78 @@ index d6042543c..e52c14232 100644 + * and reduces their over-scheduling. Synchronous workloads will still + * have immediate wakeup/sleep latencies. 
+ * -+ * (default: 3.2 msec * 1, units: nanoseconds) ++ * (default: 1.6 msec * 1, units: nanoseconds) + */ -+unsigned int sysctl_sched_wakeup_granularity = 3200000UL; -+static unsigned int normalized_sysctl_sched_wakeup_granularity = 3200000UL; ++unsigned int sysctl_sched_wakeup_granularity = 1600000UL; ++static unsigned int normalized_sysctl_sched_wakeup_granularity = 1600000UL; + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#ifdef CONFIG_SCHED_BORE -+unsigned int __read_mostly sched_bore = 1; -+unsigned int __read_mostly sched_burst_cache_lifetime = 60000000; -+unsigned int __read_mostly sched_burst_penalty_offset = 12; -+unsigned int __read_mostly sched_burst_penalty_scale = 1292; -+unsigned int __read_mostly sched_burst_smoothness = 2; -+unsigned int __read_mostly sched_burst_fork_atavistic = 2; ++unsigned int __read_mostly sched_bore = 1; ++unsigned int __read_mostly sched_burst_cache_lifetime = 60000000; ++unsigned int __read_mostly sched_burst_penalty_offset = 18; ++unsigned int __read_mostly sched_burst_penalty_scale = 1292; ++unsigned int __read_mostly sched_burst_smoothness_up = 1; ++unsigned int __read_mostly sched_burst_smoothness_down = 0; ++unsigned int __read_mostly sched_burst_fork_atavistic = 2; +static int three = 3; +static int sixty_four = 64; +static int maxval_12_bits = 4095; + -+#define FIXED_SHIFT 10 -+#define FIXED_ONE (1 << FIXED_SHIFT) -+typedef u32 fixed; ++#define MAX_BURST_PENALTY ((u32)(40UL << 8) - 1) + -+static void update_burst_score(struct sched_entity *se) { -+ u64 burst_time = se->max_burst_time; ++static inline u32 log2plus1_u64_u32f8(u64 v) { ++ x32 result; ++ int msb = fls64(v); ++ result.u8[0] = v << (64 - msb) >> 55; ++ result.u8[1] = msb; ++ return result.u32; ++} + -+ int msb = fls64(burst_time); -+ fixed integer_part = msb << FIXED_SHIFT; -+ fixed fractional_part = burst_time << (64 - msb) << 1 >> (64 - FIXED_SHIFT); -+ fixed greed = integer_part | fractional_part; ++static inline u32 u8h_u32(u8 v) { ++ x32 result; ++ result.u8[1] = v; ++ return result.u32; ++} + -+ fixed tolerance = sched_burst_penalty_offset << FIXED_SHIFT; -+ fixed penalty = max(0, (s32)greed - (s32)tolerance); -+ fixed scaled_penalty = penalty * sched_burst_penalty_scale >> 10; ++static inline u32 calc_burst_penalty(struct sched_entity *se) { ++ u32 greed, tolerance, penalty, scaled_penalty; ++ ++ greed = log2plus1_u64_u32f8(se->burst_time); ++ tolerance = u8h_u32(sched_burst_penalty_offset); ++ penalty = max(0, (s32)greed - (s32)tolerance); ++ scaled_penalty = penalty * sched_burst_penalty_scale >> 10; + -+ u8 score = min(39U, scaled_penalty >> FIXED_SHIFT); -+ se->penalty_score = score; ++ return min(MAX_BURST_PENALTY, scaled_penalty); ++} ++ ++static void update_burst_penalty(struct sched_entity *se) { ++ se->curr_burst_penalty = calc_burst_penalty(se); ++ se->burst_penalty = max(se->prev_burst_penalty, se->curr_burst_penalty); +} + +static inline u64 penalty_scale(u64 delta, struct sched_entity *se) { -+ return mul_u64_u32_shr(delta, sched_prio_to_wmult[se->penalty_score], 22); ++ u8 score = ((x16*)&se->burst_penalty)->u8[1]; ++ return mul_u64_u32_shr(delta, sched_prio_to_wmult[score], 22); +} + -+static inline u64 __binary_smooth(u64 new, u64 old, unsigned int smoothness) { -+ return (new <= old)? new: (new + old * ((1 << smoothness) - 1)) >> smoothness; ++static inline u32 binary_smooth(u32 new, u32 old) { ++ return (new >= old)? 
++ old + ((new - old) >> sched_burst_smoothness_up): ++ old - ((old - new) >> sched_burst_smoothness_down); +} + -+void restart_burst(struct sched_entity *se) { -+ se->max_burst_time = se->prev_burst_time = __binary_smooth( -+ se->burst_time, se->prev_burst_time, sched_burst_smoothness); ++static void restart_burst(struct sched_entity *se) { ++ se->burst_penalty = se->prev_burst_penalty = ++ binary_smooth(se->curr_burst_penalty, se->prev_burst_penalty); ++ se->curr_burst_penalty = 0; + se->burst_time = 0; +} + +#define calc_delta_fair(delta, se) __calc_delta_fair(delta, se, true) +#define calc_delta_fair_unscaled(delta, se) __calc_delta_fair(delta, se, false) -+static inline u64 ++static inline u64 +__calc_delta_fair(u64 delta, struct sched_entity *se, bool bscale); + +static s64 wakeup_preempt_backstep_delta(u64 rtime, struct sched_entity *se) { @@ -358,7 +403,7 @@ index d6042543c..e52c14232 100644 int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { -@@ -145,6 +216,60 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; +@@ -145,6 +233,69 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { @@ -407,8 +452,17 @@ index d6042543c..e52c14232 100644 + .extra2 = &maxval_12_bits, + }, + { -+ .procname = "sched_burst_smoothness", -+ .data = &sched_burst_smoothness, ++ .procname = "sched_burst_smoothness_down", ++ .data = &sched_burst_smoothness_down, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &three, ++ }, ++ { ++ .procname = "sched_burst_smoothness_up", ++ .data = &sched_burst_smoothness_up, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, @@ -419,7 +473,7 @@ index d6042543c..e52c14232 100644 { .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, -@@ -238,6 +363,7 @@ static void update_sysctl(void) +@@ -238,6 +389,7 @@ static void update_sysctl(void) #define SET_SYSCTL(name) \ (sysctl_##name = (factor) * normalized_sysctl_##name) SET_SYSCTL(sched_base_slice); @@ -427,12 +481,12 @@ index d6042543c..e52c14232 100644 #undef SET_SYSCTL } -@@ -308,11 +434,19 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight +@@ -308,11 +460,19 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight /* * delta /= w */ +#ifdef CONFIG_SCHED_BORE -+static inline u64 ++static inline u64 +__calc_delta_fair(u64 delta, struct sched_entity *se, bool bscale) +#else // CONFIG_SCHED_BORE static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) @@ -447,7 +501,7 @@ index d6042543c..e52c14232 100644 return delta; } -@@ -708,7 +842,11 @@ void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -706,7 +866,11 @@ void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) SCHED_WARN_ON(!se->on_rq); lag = avg_vruntime(cfs_rq) - se->vruntime; @@ -459,7 +513,7 @@ index d6042543c..e52c14232 100644 se->vlag = clamp(lag, -limit, limit); } -@@ -946,6 +1084,7 @@ int sched_update_scaling(void) +@@ -944,6 +1108,7 @@ int sched_update_scaling(void) #define WRT_SYSCTL(name) \ (normalized_sysctl_##name = sysctl_##name / (factor)) WRT_SYSCTL(sched_base_slice); @@ -467,19 +521,18 @@ index d6042543c..e52c14232 100644 #undef WRT_SYSCTL return 0; -@@ -1123,6 +1262,11 @@ static void update_curr(struct cfs_rq *cfs_rq) +@@ -1121,6 +1286,10 @@ static 
void update_curr(struct cfs_rq *cfs_rq) curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq->exec_clock, delta_exec); +#ifdef CONFIG_SCHED_BORE + curr->burst_time += delta_exec; -+ curr->max_burst_time = max(curr->max_burst_time, curr->burst_time); -+ update_burst_score(curr); ++ update_burst_penalty(curr); +#endif // CONFIG_SCHED_BORE curr->vruntime += calc_delta_fair(delta_exec, curr); update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); -@@ -5237,6 +5381,9 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -5187,6 +5356,9 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -489,7 +542,7 @@ index d6042543c..e52c14232 100644 /* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups -@@ -5247,14 +5394,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -5197,14 +5369,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { @@ -499,7 +552,7 @@ index d6042543c..e52c14232 100644 */ if (sched_feat(NEXT_BUDDY) && - cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) -+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next) && ++ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next) && + wakeup_preempt_entity(cfs_rq->next, candidate) < 1) return cfs_rq->next; @@ -508,7 +561,7 @@ index d6042543c..e52c14232 100644 } static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); -@@ -6522,6 +6671,38 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6452,6 +6626,38 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) hrtick_update(rq); } @@ -547,7 +600,7 @@ index d6042543c..e52c14232 100644 static void set_next_buddy(struct sched_entity *se); /* -@@ -6540,6 +6721,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6470,6 +6676,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_dequeue(&rq->cfs, p); for_each_sched_entity(se) { @@ -557,7 +610,7 @@ index d6042543c..e52c14232 100644 cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); -@@ -8047,7 +8231,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7980,7 +8189,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ /* * XXX pick_eevdf(cfs_rq) != se ? */ @@ -566,7 +619,7 @@ index d6042543c..e52c14232 100644 goto preempt; return; -@@ -8260,6 +8444,9 @@ static void yield_task_fair(struct rq *rq) +@@ -8193,6 +8402,9 @@ static void yield_task_fair(struct rq *rq) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se; @@ -577,10 +630,10 @@ index d6042543c..e52c14232 100644 /* * Are we the only task in the tree? diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index 7d65b4029..bd274f7c7 100644 +index 54334ca5c5c6..416ec4bcdb0f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h -@@ -13,7 +13,11 @@ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) +@@ -12,7 +12,11 @@ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. 
*/ @@ -593,10 +646,10 @@ index 7d65b4029..bd274f7c7 100644 /* * Consider buddies to be cache hot, decreases the likeliness of a diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 4236c4c89..714cc6ad9 100644 +index 67cd7e1fd501..04d065015d6c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h -@@ -2510,6 +2510,7 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; +@@ -2506,6 +2506,7 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_base_slice; @@ -605,4 +658,4 @@ index 4236c4c89..714cc6ad9 100644 #ifdef CONFIG_SCHED_DEBUG extern int sysctl_resched_latency_warn_ms; -- -2.41.0.159.g0bfa463d37 +2.41.0 diff --git a/patches/0006-AMD-cppc.patch b/patches/0006-AMD-cppc.patch new file mode 100644 index 0000000..eee57cc --- /dev/null +++ b/patches/0006-AMD-cppc.patch @@ -0,0 +1,573 @@ +From ab6268d199fa749e274a48b00c443538ae492b16 Mon Sep 17 00:00:00 2001 +From: Piotr Gorski +Date: Wed, 9 Aug 2023 14:07:31 +0200 +Subject: [PATCH] amd-6.5: merge changes from dev tree + +Signed-off-by: Piotr Gorski +--- + .../admin-guide/kernel-parameters.txt | 5 + + Documentation/admin-guide/pm/amd-pstate.rst | 55 +++++ + drivers/acpi/cppc_acpi.c | 13 ++ + drivers/acpi/processor_driver.c | 6 + + drivers/cpufreq/amd-pstate.c | 191 ++++++++++++++++-- + drivers/cpufreq/cpufreq.c | 13 ++ + include/acpi/cppc_acpi.h | 5 + + include/linux/amd-pstate.h | 1 + + include/linux/cpufreq.h | 4 + + 9 files changed, 272 insertions(+), 21 deletions(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index a1457995f..1f53c395a 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -363,6 +363,11 @@ + selects a performance level in this range and appropriate + to the current workload. + ++ amd_prefcore= ++ [X86] ++ enable ++ Enable AMD Pstate Preferred Core. ++ + amijoy.map= [HW,JOY] Amiga joystick support + Map of devices attached to JOY0DAT and JOY1DAT + Format: , +diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst +index 1cf40f692..4a30cf235 100644 +--- a/Documentation/admin-guide/pm/amd-pstate.rst ++++ b/Documentation/admin-guide/pm/amd-pstate.rst +@@ -353,6 +353,49 @@ is activated. In this mode, driver requests minimum and maximum performance + level and the platform autonomously selects a performance level in this range + and appropriate to the current workload. + ++AMD Pstate Preferred Core ++================================= ++ ++The core frequency is subjected to the process variation in semiconductors. ++Not all cores are able to reach the maximum frequency respecting the ++infrastructure limits. Consequently, AMD has redefined the concept of ++maximum frequency of a part. This means that a fraction of cores can reach ++maximum frequency. To find the best process scheduling policy for a given ++scenario, OS needs to know the core ordering informed by the platform through ++highest performance capability register of the CPPC interface. ++ ++``AMD Pstate Preferred Core`` use ITMT arch provides functions and data structures ++for enabling the scheduler to favor scheduling on cores can be get a higher frequency ++with lower voltage under preferred core. And it has the ability to dynamically ++change the preferred core based on the workload and platform conditions and ++accounting for thermals and aging. 
++ ++The priority metric will be initialized by the AMD Pstate driver. The AMD Pstate ++driver will also determine whether or not ``AMD Pstate Preferred Core`` is ++supported by the platform. ++ ++AMD Pstate driver will provide an initial core ordering when the system boots. ++The platform uses the CPPC interfaces to communicate the core ranking to the ++operating system and scheduler to make sure that OS is choosing the cores ++with highest performance firstly for scheduling the process. When AMD Pstate ++driver receives a message with the highest performance change, it will ++update the core ranking and set the cpu's priority. ++ ++AMD Preferred Core Switch ++================================= ++Kernel Parameters ++----------------- ++ ++``AMD Pstate Preferred Core`` has two states: enable and disable. ++Enable/disable states can be chosen by different kernel parameters. ++Default disable ``AMD Pstate Preferred Core``. ++ ++``amd_prefcore=enable`` ++ ++If ``amd_prefcore=enable`` is passed to kernel command line option ++then enable ``AMD Pstate Preferred Core`` if the processor and power ++firmware can support preferred core feature. ++ + User Space Interface in ``sysfs`` - General + =========================================== + +@@ -385,6 +428,18 @@ control its functionality at the system level. They are located in the + to the operation mode represented by that string - or to be + unregistered in the "disable" case. + ++``prefcore_state`` ++ Preferred Core state of the driver: "enabled" or "disabled". ++ ++ "enabled" ++ Enable the AMD Preferred Core. ++ ++ "disabled" ++ Disable the AMD Preferred Core ++ ++ ++ This attribute is read-only to check the state of Preferred Core. ++ + ``cpupower`` tool support for ``amd-pstate`` + =============================================== + +diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c +index 7ff269a78..ad388a0e8 100644 +--- a/drivers/acpi/cppc_acpi.c ++++ b/drivers/acpi/cppc_acpi.c +@@ -1154,6 +1154,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) + return cppc_get_perf(cpunum, NOMINAL_PERF, nominal_perf); + } + ++/** ++ * cppc_get_highest_perf - Get the highest performance register value. ++ * @cpunum: CPU from which to get highest performance. ++ * @highest_perf: Return address. ++ * ++ * Return: 0 for success, -EIO otherwise. ++ */ ++int cppc_get_highest_perf(int cpunum, u64 *highest_perf) ++{ ++ return cppc_get_perf(cpunum, HIGHEST_PERF, highest_perf); ++} ++EXPORT_SYMBOL_GPL(cppc_get_highest_perf); ++ + /** + * cppc_get_epp_perf - Get the epp register value. + * @cpunum: CPU from which to get epp preference value. 
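For reference, a minimal out-of-tree module sketch showing how the
cppc_get_highest_perf() helper exported above could be consumed. This is
illustrative only and not part of the patch; it assumes a kernel tree carrying
this series, and the module name and log messages are made up.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/printk.h>
#include <acpi/cppc_acpi.h>

static int __init highest_perf_demo_init(void)
{
	u64 highest_perf;
	int ret;

	/* Query the CPPC highest-performance capability of CPU 0. */
	ret = cppc_get_highest_perf(0, &highest_perf);
	if (ret)
		pr_info("cppc_get_highest_perf: error %d\n", ret);
	else
		pr_info("CPU0 highest perf: %llu\n",
			(unsigned long long)highest_perf);
	return 0;
}

static void __exit highest_perf_demo_exit(void)
{
}

module_init(highest_perf_demo_init);
module_exit(highest_perf_demo_exit);
MODULE_LICENSE("GPL");

Built as an external module and loaded with insmod, this should log the raw
highest-performance register value, or -ENOTSUPP on kernels built without
CONFIG_ACPI_CPPC_LIB, where only the static inline stub is available.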
+diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c +index 4bd16b3f0..29b2fb68a 100644 +--- a/drivers/acpi/processor_driver.c ++++ b/drivers/acpi/processor_driver.c +@@ -27,6 +27,7 @@ + #define ACPI_PROCESSOR_NOTIFY_PERFORMANCE 0x80 + #define ACPI_PROCESSOR_NOTIFY_POWER 0x81 + #define ACPI_PROCESSOR_NOTIFY_THROTTLING 0x82 ++#define ACPI_PROCESSOR_NOTIFY_HIGEST_PERF_CHANGED 0x85 + + MODULE_AUTHOR("Paul Diefenbaugh"); + MODULE_DESCRIPTION("ACPI Processor Driver"); +@@ -83,6 +84,11 @@ static void acpi_processor_notify(acpi_handle handle, u32 event, void *data) + acpi_bus_generate_netlink_event(device->pnp.device_class, + dev_name(&device->dev), event, 0); + break; ++ case ACPI_PROCESSOR_NOTIFY_HIGEST_PERF_CHANGED: ++ cpufreq_update_highest_perf(pr->id); ++ acpi_bus_generate_netlink_event(device->pnp.device_class, ++ dev_name(&device->dev), event, 0); ++ break; + default: + acpi_handle_debug(handle, "Unsupported event [0x%x]\n", event); + break; +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index 81fba0dcb..ba10aa971 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -49,6 +50,8 @@ + + #define AMD_PSTATE_TRANSITION_LATENCY 20000 + #define AMD_PSTATE_TRANSITION_DELAY 1000 ++#define AMD_PSTATE_PREFCORE_THRESHOLD 166 ++#define AMD_PSTATE_MAX_CPPC_PERF 255 + + /* + * TODO: We need more time to fine tune processors with shared memory solution +@@ -65,6 +68,14 @@ static struct cpufreq_driver amd_pstate_epp_driver; + static int cppc_state = AMD_PSTATE_UNDEFINED; + static bool cppc_enabled; + ++/* ++ * CPPC Preferred Core feature is supported by power firmware ++ */ ++static bool prefcore_enabled = false; ++ ++/* Disable AMD Pstate Preferred Core loading */ ++static bool no_prefcore __read_mostly = true; ++ + /* + * AMD Energy Preference Performance (EPP) + * The EPP is used in the CCLK DPM controller to drive +@@ -290,27 +301,26 @@ static inline int amd_pstate_enable(bool enable) + static int pstate_init_perf(struct amd_cpudata *cpudata) + { + u64 cap1; +- u32 highest_perf; + + int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, + &cap1); + if (ret) + return ret; + +- /* +- * TODO: Introduce AMD specific power feature. +- * +- * CPPC entry doesn't indicate the highest performance in some ASICs. ++ /* For platforms that do not support the preferred core feature, the ++ * highest_pef may be configured with 166 or 255, to avoid max frequency ++ * calculated wrongly. we take the AMD_CPPC_HIGHEST_PERF(cap1) value as ++ * the default max perf. 
+ */ +- highest_perf = amd_get_highest_perf(); +- if (highest_perf > AMD_CPPC_HIGHEST_PERF(cap1)) +- highest_perf = AMD_CPPC_HIGHEST_PERF(cap1); +- +- WRITE_ONCE(cpudata->highest_perf, highest_perf); ++ if (!prefcore_enabled) ++ WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); ++ else ++ WRITE_ONCE(cpudata->highest_perf, AMD_PSTATE_PREFCORE_THRESHOLD); + + WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); + WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); + WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); ++ WRITE_ONCE(cpudata->prefcore_highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); + + return 0; + } +@@ -318,22 +328,21 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) + static int cppc_init_perf(struct amd_cpudata *cpudata) + { + struct cppc_perf_caps cppc_perf; +- u32 highest_perf; + + int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); + if (ret) + return ret; + +- highest_perf = amd_get_highest_perf(); +- if (highest_perf > cppc_perf.highest_perf) +- highest_perf = cppc_perf.highest_perf; +- +- WRITE_ONCE(cpudata->highest_perf, highest_perf); ++ if (!prefcore_enabled) ++ WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf); ++ else ++ WRITE_ONCE(cpudata->highest_perf, AMD_PSTATE_PREFCORE_THRESHOLD); + + WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf); + WRITE_ONCE(cpudata->lowest_nonlinear_perf, + cppc_perf.lowest_nonlinear_perf); + WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf); ++ WRITE_ONCE(cpudata->prefcore_highest_perf, cppc_perf.highest_perf); + + if (cppc_state == AMD_PSTATE_ACTIVE) + return 0; +@@ -676,6 +685,118 @@ static void amd_perf_ctl_reset(unsigned int cpu) + wrmsrl_on_cpu(cpu, MSR_AMD_PERF_CTL, 0); + } + ++/* ++ * Set AMD Pstate Preferred Core enable can't be done directly from cpufreq callbacks ++ * due to locking, so queue the work for later. ++ */ ++static void amd_pstste_sched_prefcore_workfn(struct work_struct *work) ++{ ++ sched_set_itmt_support(); ++} ++static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn); ++ ++/** ++ * Get the highest performance register value. ++ * @cpu: CPU from which to get highest performance. ++ * @highest_perf: Return address. ++ * ++ * Return: 0 for success, -EIO otherwise. ++ */ ++static int amd_pstate_get_highest_perf(int cpu, u64 *highest_perf) ++{ ++ int ret; ++ ++ if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ u64 cap1; ++ ++ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1); ++ if (ret) ++ return ret; ++ WRITE_ONCE(*highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); ++ } else { ++ ret = cppc_get_highest_perf(cpu, highest_perf); ++ } ++ ++ return (ret); ++} ++ ++static void amd_pstate_init_prefcore(void) ++{ ++ int cpu, ret; ++ u64 highest_perf; ++ ++ if (no_prefcore) ++ return; ++ ++ for_each_possible_cpu(cpu) { ++ ret = amd_pstate_get_highest_perf(cpu, &highest_perf); ++ if (ret) ++ break; ++ ++ sched_set_itmt_core_prio(highest_perf, cpu); ++ } ++ ++ /* ++ * This code can be run during CPU online under the ++ * CPU hotplug locks, so sched_set_amd_prefcore_support() ++ * cannot be called from here. Queue up a work item ++ * to invoke it. 
++ */ ++ schedule_work(&sched_prefcore_work); ++} ++ ++static void amd_pstate_update_highest_perf(unsigned int cpu) ++{ ++ struct cpufreq_policy *policy; ++ struct amd_cpudata *cpudata; ++ u32 prev_high = 0, cur_high = 0; ++ u64 highest_perf; ++ int ret; ++ ++ if (!prefcore_enabled) ++ return; ++ ++ ret = amd_pstate_get_highest_perf(cpu, &highest_perf); ++ if (ret) ++ return; ++ ++ policy = cpufreq_cpu_get(cpu); ++ cpudata = policy->driver_data; ++ cur_high = highest_perf; ++ prev_high = READ_ONCE(cpudata->prefcore_highest_perf); ++ ++ if (prev_high != cur_high) { ++ WRITE_ONCE(cpudata->prefcore_highest_perf, cur_high); ++ sched_set_itmt_core_prio(cur_high, cpu); ++ } ++ ++ cpufreq_cpu_put(policy); ++} ++ ++/* ++ * Check if AMD Pstate Preferred core feature is supported and enabled ++ * 1) no_prefcore is used to enable or disable AMD Pstate Preferred Core ++ * loading when user would like to enable or disable it. Without that, ++ * AMD Pstate Preferred Core will be disabled by default if the processor ++ * and power firmware can support preferred core feature. ++ * 2) prefcore_enabled is used to indicate whether CPPC preferred core is enabled. ++ */ ++static void check_prefcore_supported(int cpu) ++{ ++ u64 highest_perf; ++ int ret; ++ ++ if (no_prefcore) ++ return; ++ ++ ret = amd_pstate_get_highest_perf(cpu, &highest_perf); ++ if (ret) ++ return; ++ ++ if(highest_perf < AMD_PSTATE_MAX_CPPC_PERF) ++ prefcore_enabled = true; ++} ++ + static int amd_pstate_cpu_init(struct cpufreq_policy *policy) + { + int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; +@@ -697,6 +818,9 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) + + cpudata->cpu = policy->cpu; + ++ /* check if CPPC preferred core feature is enabled*/ ++ check_prefcore_supported(policy->cpu); ++ + ret = amd_pstate_init_perf(cpudata); + if (ret) + goto free_cpudata1; +@@ -1012,8 +1136,8 @@ static int amd_pstate_update_status(const char *buf, size_t size) + return 0; + } + +-static ssize_t show_status(struct kobject *kobj, +- struct kobj_attribute *attr, char *buf) ++static ssize_t status_show(struct device *dev, ++ struct device_attribute *attr, char *buf) + { + ssize_t ret; + +@@ -1024,7 +1148,7 @@ static ssize_t show_status(struct kobject *kobj, + return ret; + } + +-static ssize_t store_status(struct kobject *a, struct kobj_attribute *b, ++static ssize_t status_store(struct device *a, struct device_attribute *b, + const char *buf, size_t count) + { + char *p = memchr(buf, '\n', count); +@@ -1037,13 +1161,20 @@ static ssize_t store_status(struct kobject *a, struct kobj_attribute *b, + return ret < 0 ? ret : count; + } + ++static ssize_t prefcore_state_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return sysfs_emit(buf, "%s\n", prefcore_enabled ? 
"enabled" : "disabled"); ++} ++ + cpufreq_freq_attr_ro(amd_pstate_max_freq); + cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); + + cpufreq_freq_attr_ro(amd_pstate_highest_perf); + cpufreq_freq_attr_rw(energy_performance_preference); + cpufreq_freq_attr_ro(energy_performance_available_preferences); +-define_one_global_rw(status); ++static DEVICE_ATTR_RW(status); ++static DEVICE_ATTR_RO(prefcore_state); + + static struct freq_attr *amd_pstate_attr[] = { + &amd_pstate_max_freq, +@@ -1062,7 +1193,8 @@ static struct freq_attr *amd_pstate_epp_attr[] = { + }; + + static struct attribute *pstate_global_attributes[] = { +- &status.attr, ++ &dev_attr_status.attr, ++ &dev_attr_prefcore_state.attr, + NULL + }; + +@@ -1114,6 +1246,9 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + cpudata->cpu = policy->cpu; + cpudata->epp_policy = 0; + ++ /* check if CPPC preferred core feature is supported*/ ++ check_prefcore_supported(policy->cpu); ++ + ret = amd_pstate_init_perf(cpudata); + if (ret) + goto free_cpudata1; +@@ -1392,6 +1527,7 @@ static struct cpufreq_driver amd_pstate_driver = { + .suspend = amd_pstate_cpu_suspend, + .resume = amd_pstate_cpu_resume, + .set_boost = amd_pstate_set_boost, ++ .update_highest_perf = amd_pstate_update_highest_perf, + .name = "amd-pstate", + .attr = amd_pstate_attr, + }; +@@ -1406,6 +1542,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = { + .online = amd_pstate_epp_cpu_online, + .suspend = amd_pstate_epp_suspend, + .resume = amd_pstate_epp_resume, ++ .update_highest_perf = amd_pstate_update_highest_perf, + .name = "amd-pstate-epp", + .attr = amd_pstate_epp_attr, + }; +@@ -1506,6 +1643,8 @@ static int __init amd_pstate_init(void) + } + } + ++ amd_pstate_init_prefcore(); ++ + return ret; + + global_attr_free: +@@ -1527,7 +1666,17 @@ static int __init amd_pstate_param(char *str) + + return amd_pstate_set_driver(mode_idx); + } ++ ++static int __init amd_prefcore_param(char *str) ++{ ++ if (!strcmp(str, "enable")) ++ no_prefcore = false; ++ ++ return 0; ++} ++ + early_param("amd_pstate", amd_pstate_param); ++early_param("amd_prefcore", amd_prefcore_param); + + MODULE_AUTHOR("Huang Rui "); + MODULE_DESCRIPTION("AMD Processor P-state Frequency Driver"); +diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c +index 50bbc969f..842357abf 100644 +--- a/drivers/cpufreq/cpufreq.c ++++ b/drivers/cpufreq/cpufreq.c +@@ -2675,6 +2675,19 @@ void cpufreq_update_limits(unsigned int cpu) + } + EXPORT_SYMBOL_GPL(cpufreq_update_limits); + ++/** ++ * cpufreq_update_highest_perf - Update highest performance for a given CPU. ++ * @cpu: CPU to update the highest performance for. 
++ * ++ * Invoke the driver's ->update_highest_perf callback if present ++ */ ++void cpufreq_update_highest_perf(unsigned int cpu) ++{ ++ if (cpufreq_driver->update_highest_perf) ++ cpufreq_driver->update_highest_perf(cpu); ++} ++EXPORT_SYMBOL_GPL(cpufreq_update_highest_perf); ++ + /********************************************************************* + * BOOST * + *********************************************************************/ +diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h +index 6126c977e..c0b69ffe7 100644 +--- a/include/acpi/cppc_acpi.h ++++ b/include/acpi/cppc_acpi.h +@@ -139,6 +139,7 @@ struct cppc_cpudata { + #ifdef CONFIG_ACPI_CPPC_LIB + extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf); + extern int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf); ++extern int cppc_get_highest_perf(int cpunum, u64 *highest_perf); + extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); + extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); + extern int cppc_set_enable(int cpu, bool enable); +@@ -165,6 +166,10 @@ static inline int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) + { + return -ENOTSUPP; + } ++static inline int cppc_get_highest_perf(int cpunum, u64 *highest_perf) ++{ ++ return -ENOTSUPP; ++} + static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs) + { + return -ENOTSUPP; +diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h +index 446394f84..fa86bc953 100644 +--- a/include/linux/amd-pstate.h ++++ b/include/linux/amd-pstate.h +@@ -70,6 +70,7 @@ struct amd_cpudata { + u32 nominal_perf; + u32 lowest_nonlinear_perf; + u32 lowest_perf; ++ u32 prefcore_highest_perf; + + u32 max_freq; + u32 min_freq; +diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h +index 172ff51c1..766c83a4f 100644 +--- a/include/linux/cpufreq.h ++++ b/include/linux/cpufreq.h +@@ -231,6 +231,7 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); + void refresh_frequency_limits(struct cpufreq_policy *policy); + void cpufreq_update_policy(unsigned int cpu); + void cpufreq_update_limits(unsigned int cpu); ++void cpufreq_update_highest_perf(unsigned int cpu); + bool have_governor_per_policy(void); + bool cpufreq_supports_freq_invariance(void); + struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); +@@ -376,6 +377,9 @@ struct cpufreq_driver { + /* Called to update policy limits on firmware notifications. */ + void (*update_limits)(unsigned int cpu); + ++ /* Called to update highest performance on firmware notifications. 
*/ ++ void (*update_highest_perf)(unsigned int cpu); ++ + /* optional */ + int (*bios_limit)(int cpu, unsigned int *limit); + +-- +2.42.0.rc0.25.ga82fb66fed diff --git a/scripts/patch.sh b/scripts/patch.sh index 0cf2285..4705d26 100755 --- a/scripts/patch.sh +++ b/scripts/patch.sh @@ -15,4 +15,6 @@ patch -Np1 < "../patches/0002-eevdfbore.patch" # Allow setting custom pollrates for usb devices patch -Np1 < "../patches/0004-Allow-to-set-custom-USB-pollrate-for-specific-device.patch" # Allow pre polaris cards to use the amdgpu kernel module -patch -Np1 < "../patches/0005-amdgpu-si-cik-default.patch" \ No newline at end of file +patch -Np1 < "../patches/0005-amdgpu-si-cik-default.patch" +# AMD Patch for CPPC +patch -Np1 < "../patches/0006-AMD-cppc.patch" \ No newline at end of file diff --git a/scripts/source.sh b/scripts/source.sh index b0658da..b1bc0b1 100755 --- a/scripts/source.sh +++ b/scripts/source.sh @@ -2,7 +2,7 @@ echo "Pika Kernel - Getting source" -wget -nv https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/snapshot/linux-6.5-rc3.tar.gz -tar -xf ./linux-6.5-rc3.tar.gz +wget -nv https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/snapshot/linux-6.5-rc5.tar.gz +tar -xf ./linux-6.5-rc5.tar.gz -cd linux-6.5-rc3 +cd linux-6.5-rc5
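As a quick userspace check after booting the resulting kernel with
amd_prefcore=enable, the snippet below reads the prefcore_state attribute
added by patches/0006-AMD-cppc.patch. It is a hedged example, not part of the
patch set: the sysfs path assumes the amd-pstate global attribute group is
exposed under /sys/devices/system/cpu/amd_pstate/ alongside the existing
status attribute.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	char state[32];
	/* Path assumed from the amd-pstate global attribute group. */
	FILE *f = fopen("/sys/devices/system/cpu/amd_pstate/prefcore_state", "r");

	if (!f) {
		perror("prefcore_state");
		return EXIT_FAILURE;
	}
	if (fgets(state, sizeof(state), f))
		printf("AMD preferred core: %s", state);
	fclose(f);
	return EXIT_SUCCESS;
}

Without amd_prefcore=enable on the kernel command line the attribute should
report "disabled", since the driver leaves the preferred core feature off by
default.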