From 60dc452fb5231595aa986bd6915e044445c7ccd3 Mon Sep 17 00:00:00 2001 From: ferrreo Date: Thu, 10 Aug 2023 18:30:38 +0100 Subject: [PATCH] 6.5RC5 --- config | 11 +- patches/0001-cachy-all.patch | 7344 ++++++++++------------------------ patches/0002-eevdf.patch | 316 +- patches/0002-eevdfbore.patch | 249 +- patches/0006-AMD-cppc.patch | 573 +++ scripts/patch.sh | 4 +- scripts/source.sh | 6 +- 7 files changed, 2915 insertions(+), 5588 deletions(-) create mode 100644 patches/0006-AMD-cppc.patch diff --git a/config b/config index 178bb6b..949e0aa 100644 --- a/config +++ b/config @@ -594,7 +594,9 @@ CONFIG_CALL_DEPTH_TRACKING=y # CONFIG_CALL_THUNKS_DEBUG is not set CONFIG_CPU_IBPB_ENTRY=y CONFIG_CPU_IBRS_ENTRY=y +CONFIG_CPU_SRSO=y CONFIG_SLS=y +# CONFIG_GDS_FORCE_MITIGATION is not set CONFIG_ARCH_HAS_ADD_PAGES=y CONFIG_ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE=y @@ -1333,11 +1335,10 @@ CONFIG_TCP_CONG_YEAH=m CONFIG_TCP_CONG_ILLINOIS=m CONFIG_TCP_CONG_DCTCP=m CONFIG_TCP_CONG_CDG=m -CONFIG_TCP_CONG_BBR=m -CONFIG_TCP_CONG_BBR2=y -CONFIG_DEFAULT_BBR2=y +CONFIG_TCP_CONG_BBR=y +CONFIG_DEFAULT_BBR=y # CONFIG_DEFAULT_RENO is not set -CONFIG_DEFAULT_TCP_CONG="bbr2" +CONFIG_DEFAULT_TCP_CONG="bbr" CONFIG_TCP_MD5SIG=y CONFIG_IPV6=y CONFIG_IPV6_ROUTER_PREF=y @@ -2613,7 +2614,7 @@ CONFIG_ZRAM_DEF_COMP_ZSTD=y # CONFIG_ZRAM_DEF_COMP_842 is not set CONFIG_ZRAM_DEF_COMP="zstd" CONFIG_ZRAM_WRITEBACK=y -# CONFIG_ZRAM_MEMORY_TRACKING is not set +CONFIG_ZRAM_MEMORY_TRACKING=y CONFIG_ZRAM_MULTI_COMP=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_LOOP_MIN_COUNT=0 diff --git a/patches/0001-cachy-all.patch b/patches/0001-cachy-all.patch index 1c37f59..480912c 100644 --- a/patches/0001-cachy-all.patch +++ b/patches/0001-cachy-all.patch @@ -1,68 +1,87 @@ -From a34c2671419dc12fbea9f81528eda4dd6158d320 Mon Sep 17 00:00:00 2001 +From 907edd508b99c761190492fb3f2211443b4e9bb3 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 10 Jul 2023 17:08:52 +0200 -Subject: [PATCH 1/7] bbr2 +Date: Mon, 31 Jul 2023 12:19:09 +0200 +Subject: [PATCH 1/5] bbr3 Signed-off-by: Peter Jung --- - include/linux/tcp.h | 3 +- - include/net/inet_connection_sock.h | 3 +- - include/net/tcp.h | 41 +- - include/uapi/linux/inet_diag.h | 33 + - net/ipv4/Kconfig | 22 + - net/ipv4/Makefile | 1 + - net/ipv4/tcp.c | 1 + - net/ipv4/tcp_bbr.c | 38 +- - net/ipv4/tcp_bbr2.c | 2674 ++++++++++++++++++++++++++++ + include/linux/tcp.h | 4 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 72 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2231 +++++++++++++++++++++------- net/ipv4/tcp_cong.c | 1 + - net/ipv4/tcp_input.c | 27 +- - net/ipv4/tcp_output.c | 26 +- + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- net/ipv4/tcp_rate.c | 30 +- net/ipv4/tcp_timer.c | 1 + - 14 files changed, 2867 insertions(+), 34 deletions(-) - create mode 100644 net/ipv4/tcp_bbr2.c + 15 files changed, 1934 insertions(+), 551 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h -index b4c08ac86983..4297c9176435 100644 +index 91a37c99ba66..ae0ee688c3f7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h -@@ -255,7 +255,8 @@ struct tcp_sock { +@@ -255,7 +255,9 @@ struct tcp_sock { u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ - unused:5; + fast_ack_mode:2, /* which fast ack mode ? 
*/ -+ unused:3; ++ tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? */ ++ unused:2; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h -index c2b15f7e5516..d85858efa571 100644 +index c2b15f7e5516..a400a84088d3 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h -@@ -135,7 +135,8 @@ struct inet_connection_sock { +@@ -135,8 +135,8 @@ struct inet_connection_sock { u32 icsk_probes_tstamp; u32 icsk_user_timeout; - u64 icsk_ca_priv[104 / sizeof(u64)]; -+/* XXX inflated by temporary internal debugging info */ -+ u64 icsk_ca_priv[216 / sizeof(u64)]; - #define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; }; + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ diff --git a/include/net/tcp.h b/include/net/tcp.h -index 226bce6d1e8c..64f1ec99c8f0 100644 +index 0ca972ebd3dd..8eb194559b70 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h -@@ -370,6 +370,7 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, +@@ -370,6 +370,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, #define TCP_ECN_QUEUE_CWR 2 #define TCP_ECN_DEMAND_CWR 4 #define TCP_ECN_SEEN 8 -+#define TCP_ECN_ECT_PERMANENT 16 ++#define TCP_ECN_LOW 16 ++#define TCP_ECN_ECT_PERMANENT 32 enum tcp_tw_status { TCP_TW_SUCCESS = 0, -@@ -819,6 +820,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) +@@ -723,6 +725,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + tcp_fast_path_on(tp); + } + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(struct sock *sk) + { +@@ -819,6 +830,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) return max_t(s64, t1 - t0, 0); } @@ -74,7 +93,7 @@ index 226bce6d1e8c..64f1ec99c8f0 100644 static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) { return tcp_ns_to_ts(skb->skb_mstamp_ns); -@@ -894,9 +900,14 @@ struct tcp_skb_cb { +@@ -894,9 +910,14 @@ struct tcp_skb_cb { /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ @@ -91,7 +110,15 @@ index 226bce6d1e8c..64f1ec99c8f0 100644 } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; -@@ -1022,7 +1033,11 @@ enum tcp_ca_ack_event_flags { +@@ -1000,6 +1021,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1022,7 +1044,11 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 @@ -104,7 +131,7 @@ index 226bce6d1e8c..64f1ec99c8f0 100644 union tcp_cc_info; -@@ -1042,8 +1057,11 @@ struct ack_sample { +@@ -1042,10 +1068,13 @@ struct ack_sample { */ struct rate_sample { u64 prior_mstamp; /* 
starting timestamp for interval */ @@ -114,17 +141,22 @@ index 226bce6d1e8c..64f1ec99c8f0 100644 + u32 tx_in_flight; /* packets in flight at starting timestamp */ + s32 lost; /* number of packets lost over interval */ s32 delivered; /* number of packets delivered over interval */ - s32 delivered_ce; /* number of packets delivered w/ CE marks*/ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ long interval_us; /* time for tp->delivered to incr "delivered" */ -@@ -1057,6 +1075,7 @@ struct rate_sample { + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1056,7 +1085,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? */ + bool is_ece; /* did this ACK have ECN marked? */ }; struct tcp_congestion_ops { -@@ -1080,8 +1099,11 @@ struct tcp_congestion_ops { +@@ -1080,8 +1111,11 @@ struct tcp_congestion_ops { /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); @@ -138,7 +170,7 @@ index 226bce6d1e8c..64f1ec99c8f0 100644 /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) -@@ -1147,6 +1169,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) +@@ -1147,6 +1181,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) } #endif @@ -153,7 +185,7 @@ index 226bce6d1e8c..64f1ec99c8f0 100644 static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); -@@ -1166,6 +1196,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) +@@ -1166,6 +1208,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) void tcp_set_ca_state(struct sock *sk, const u8 ca_state); /* From tcp_rate.c */ @@ -161,33 +193,45 @@ index 226bce6d1e8c..64f1ec99c8f0 100644 void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); +@@ -1178,6 +1221,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. 
+@@ -2177,7 +2235,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h -index 50655de04c9b..0e24f11627d5 100644 +index 50655de04c9b..82f8bd8f0d16 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h -@@ -231,9 +231,42 @@ struct tcp_bbr_info { +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ - }; - -+/* Phase as reported in netlink/ss stats. */ -+enum tcp_bbr2_phase { -+ BBR2_PHASE_INVALID = 0, -+ BBR2_PHASE_STARTUP = 1, -+ BBR2_PHASE_DRAIN = 2, -+ BBR2_PHASE_PROBE_RTT = 3, -+ BBR2_PHASE_PROBE_BW_UP = 4, -+ BBR2_PHASE_PROBE_BW_DOWN = 5, -+ BBR2_PHASE_PROBE_BW_CRUISE = 6, -+ BBR2_PHASE_PROBE_BW_REFILL = 7 -+}; -+ -+struct tcp_bbr2_info { -+ /* u64 bw: bandwidth (app throughput) estimate in Byte per sec: */ -+ __u32 bbr_bw_lsb; /* lower 32 bits of bw */ -+ __u32 bbr_bw_msb; /* upper 32 bits of bw */ -+ __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ -+ __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ -+ __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ + __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ + __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ + __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ @@ -195,80 +239,88 @@ index 50655de04c9b..0e24f11627d5 100644 + __u8 bbr_mode; /* current bbr_mode in state machine */ + __u8 bbr_phase; /* current state machine phase */ + __u8 unused1; /* alignment padding; not used yet */ -+ __u8 bbr_version; /* MUST be at this offset in struct */ -+ __u32 bbr_inflight_lo; /* lower/short-term data volume bound */ -+ __u32 bbr_inflight_hi; /* higher/long-term data volume bound */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ + __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ +}; + - union tcp_cc_info { - struct tcpvegas_info vegas; - struct tcp_dctcp_info dctcp; - struct tcp_bbr_info bbr; -+ struct tcp_bbr2_info bbr2; ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, }; - #endif /* _UAPI_INET_DIAG_H_ */ + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index 51c13cf9c5ae..de8dcba26bec 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -506,9 +506,11 @@ enum { + #define RTAX_FEATURE_SACK (1 << 1) + #define RTAX_FEATURE_TIMESTAMP (1 << 2) + #define RTAX_FEATURE_ALLFRAG (1 << 3) ++#define RTAX_FEATURE_ECN_LOW (1 << 4) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \ +- RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG) ++ RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG \ ++ | RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index 879eeb0a084b..77270053a5e3 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -170,6 +170,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_ECN 8 /* ECN was negociated at TCP session init */ + #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ ++#define TCPI_OPT_ECN_LOW 64 /* Low-latency ECN configured at init */ + + /* + * Sender's congestion state indicating normal or abnormal situations diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index 2dfb12230f08..b6bec331a82e 100644 +index 2dfb12230f08..2e14db3bee70 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig -@@ -678,6 +678,24 @@ config TCP_CONG_BBR - AQM schemes that do not provide a delay signal. It requires the fq - ("Fair Queue") pacing packet scheduler. +@@ -668,15 +668,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
-+config TCP_CONG_BBR2 -+ tristate "BBR2 TCP" -+ default n -+ help -+ -+ BBR2 TCP congestion control is a model-based congestion control -+ algorithm that aims to maximize network utilization, keep queues and -+ retransmit rates low, and to be able to coexist with Reno/CUBIC in -+ common scenarios. It builds an explicit model of the network path. It -+ tolerates a targeted degree of random packet loss and delay that are -+ unrelated to congestion. It can operate over LAN, WAN, cellular, wifi, -+ or cable modem links, and can use DCTCP-L4S-style ECN signals. It can -+ coexist with flows that use loss-based congestion control, and can -+ operate with shallow buffers, deep buffers, bufferbloat, policers, or -+ AQM schemes that do not provide a delay signal. It requires pacing, -+ using either TCP internal pacing or the fq ("Fair Queue") pacing packet -+ scheduler. -+ choice prompt "Default TCP congestion control" - default DEFAULT_CUBIC -@@ -715,6 +733,9 @@ choice - config DEFAULT_BBR - bool "BBR" if TCP_CONG_BBR=y - -+ config DEFAULT_BBR2 -+ bool "BBR2" if TCP_CONG_BBR2=y -+ - config DEFAULT_RENO - bool "Reno" - endchoice -@@ -739,6 +760,7 @@ config DEFAULT_TCP_CONG - default "dctcp" if DEFAULT_DCTCP - default "cdg" if DEFAULT_CDG - default "bbr" if DEFAULT_BBR -+ default "bbr2" if DEFAULT_BBR2 - default "cubic" - - config TCP_MD5SIG -diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile -index b18ba8ef93ad..b4e3dcb27a20 100644 ---- a/net/ipv4/Makefile -+++ b/net/ipv4/Makefile -@@ -47,6 +47,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o - obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o - obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o - obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o -+obj-$(CONFIG_TCP_CONG_BBR2) += tcp_bbr2.o - obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o - obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o - obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index e03e08745308..326b2c4bacf6 100644 +index 8ed52e1e3c99..0198ac17f3a8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3083,6 +3083,7 @@ int tcp_disconnect(struct sock *sk, int flags) @@ -279,43 +331,505 @@ index e03e08745308..326b2c4bacf6 100644 /* Clean up fastopen related fields */ +@@ -3778,6 +3779,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c -index 146792cd26fe..16038f6ee52a 100644 +index 146792cd26fe..f4f477a69917 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c -@@ -294,26 +294,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. 
Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. + * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? 
*/ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? */ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? 
*/ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. + * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... 
*/ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... */ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. 
Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). ++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? 
*/ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. + */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. 
*/ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -278,7 +455,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + } + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); +- sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); ++ sk->sk_pacing_rate = ++ bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain)); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -294,26 +472,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) sk->sk_pacing_rate = rate; } -/* override sysctl_tcp_min_tso_segs */ - __bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) - { - return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; - } - -+/* Return the number of segments BBR would like in a TSO/GSO skb, given -+ * a particular max gso size as a constraint. +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). + */ +static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, + u32 gso_max_size) +{ -+ u32 segs; ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; + u64 bytes; + + /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ + bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; + ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ + bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); -+ segs = max_t(u32, div_u64(bytes, mss_now), bbr_min_tso_segs(sk)); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return segs; +} + +/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ -+static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) -+{ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; + return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); -+} -+ + } + +/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ static u32 bbr_tso_segs_goal(struct sock *sk) { @@ -335,943 +849,176 @@ index 146792cd26fe..16038f6ee52a 100644 } /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ -@@ -1149,7 +1163,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { - .undo_cwnd = bbr_undo_cwnd, - .cwnd_event = bbr_cwnd_event, - .ssthresh = bbr_ssthresh, -- .min_tso_segs = bbr_min_tso_segs, -+ .tso_segs = bbr_tso_segs, - .get_info = bbr_get_info, - .set_state = bbr_set_state, - }; -diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c -new file mode 100644 -index 000000000000..85f8052144d1 ---- /dev/null -+++ b/net/ipv4/tcp_bbr2.c -@@ -0,0 +1,2674 @@ -+/* BBR (Bottleneck Bandwidth and RTT) congestion control, v2 -+ * -+ * BBRv2 is a model-based congestion control algorithm that aims for low -+ * queues, low loss, and (bounded) Reno/CUBIC coexistence. 
To maintain a model -+ * of the network path, it uses measurements of bandwidth and RTT, as well as -+ * (if they occur) packet loss and/or DCTCP/L4S-style ECN signals. Note that -+ * although it can use ECN or loss signals explicitly, it does not require -+ * either; it can bound its in-flight data based on its estimate of the BDP. -+ * -+ * The model has both higher and lower bounds for the operating range: -+ * lo: bw_lo, inflight_lo: conservative short-term lower bound -+ * hi: bw_hi, inflight_hi: robust long-term upper bound -+ * The bandwidth-probing time scale is (a) extended dynamically based on -+ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by -+ * an interactive wall-clock time-scale to be more scalable and responsive -+ * than Reno and CUBIC. -+ * -+ * Here is a state transition diagram for BBR: -+ * -+ * | -+ * V -+ * +---> STARTUP ----+ -+ * | | | -+ * | V | -+ * | DRAIN ----+ -+ * | | | -+ * | V | -+ * +---> PROBE_BW ----+ -+ * | ^ | | -+ * | | | | -+ * | +----+ | -+ * | | -+ * +---- PROBE_RTT <--+ -+ * -+ * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. -+ * When it estimates the pipe is full, it enters DRAIN to drain the queue. -+ * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. -+ * A long-lived BBR flow spends the vast majority of its time remaining -+ * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth -+ * in a fair manner, with a small, bounded queue. *If* a flow has been -+ * continuously sending for the entire min_rtt window, and hasn't seen an RTT -+ * sample that matches or decreases its min_rtt estimate for 10 seconds, then -+ * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe -+ * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if -+ * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; -+ * otherwise we enter STARTUP to try to fill the pipe. -+ * -+ * BBR is described in detail in: -+ * "BBR: Congestion-Based Congestion Control", -+ * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, -+ * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. -+ * -+ * There is a public e-mail list for discussing BBR development and testing: -+ * https://groups.google.com/forum/#!forum/bbr-dev -+ * -+ * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, -+ * otherwise TCP stack falls back to an internal pacing using one high -+ * resolution timer per TCP socket and may use more resources. -+ */ -+#include -+#include -+#include -+#include -+#include -+ -+#include "tcp_dctcp.h" -+ -+/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth -+ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. -+ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. -+ * Since the minimum window is >=4 packets, the lower bound isn't -+ * an issue. The upper bound isn't an issue with existing technologies. -+ */ -+#define BW_SCALE 24 -+#define BW_UNIT (1 << BW_SCALE) -+ -+#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. 
gains) */ -+#define BBR_UNIT (1 << BBR_SCALE) -+ -+#define FLAG_DEBUG_VERBOSE 0x1 /* Verbose debugging messages */ -+#define FLAG_DEBUG_LOOPBACK 0x2 /* Do NOT skip loopback addr */ -+ -+#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ -+ -+/* BBR has the following modes for deciding how fast to send: */ -+enum bbr_mode { -+ BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ -+ BBR_DRAIN, /* drain any queue created during startup */ -+ BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ -+ BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ -+}; -+ -+/* How does the incoming ACK stream relate to our bandwidth probing? */ -+enum bbr_ack_phase { -+ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ -+ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ -+ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ -+ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ -+ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ -+}; -+ -+/* BBR congestion control block */ -+struct bbr { -+ u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ -+ u32 min_rtt_stamp; /* timestamp of min_rtt_us */ -+ u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ -+ u32 probe_rtt_min_us; /* min RTT in bbr_probe_rtt_win_ms window */ -+ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ -+ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ -+ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ -+ u64 cycle_mstamp; /* time of this cycle phase start */ -+ u32 mode:3, /* current bbr_mode in state machine */ -+ prev_ca_state:3, /* CA state on previous ACK */ -+ packet_conservation:1, /* use packet conservation? */ -+ round_start:1, /* start of packet-timed tx->ack round? */ -+ ce_state:1, /* If most recent data has CE bit set */ -+ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ -+ try_fast_path:1, /* can we take fast path? */ -+ unused2:11, -+ idle_restart:1, /* restarting after idle? */ -+ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ -+ cycle_idx:3, /* current index in pacing_gain cycle array */ -+ has_seen_rtt:1; /* have we seen an RTT sample yet? */ -+ u32 pacing_gain:11, /* current gain for setting pacing rate */ -+ cwnd_gain:11, /* current gain for setting cwnd */ -+ full_bw_reached:1, /* reached full bw in Startup? */ -+ full_bw_cnt:2, /* number of rounds without large bw gains */ -+ init_cwnd:7; /* initial cwnd */ -+ u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ -+ u32 full_bw; /* recent bw, to estimate if pipe is full */ -+ -+ /* For tracking ACK aggregation: */ -+ u64 ack_epoch_mstamp; /* start of ACK sampling epoch */ -+ u16 extra_acked[2]; /* max excess data ACKed in epoch */ -+ u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ -+ extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ -+ extra_acked_win_idx:1, /* current index in extra_acked array */ -+ /* BBR v2 state: */ -+ unused1:2, -+ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ -+ loss_in_cycle:1, /* packet loss in this cycle? */ -+ ecn_in_cycle:1; /* ECN in this cycle? 
*/ -+ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ -+ u32 undo_bw_lo; /* bw_lo before latest losses */ -+ u32 undo_inflight_lo; /* inflight_lo before latest losses */ -+ u32 undo_inflight_hi; /* inflight_hi before latest losses */ -+ u32 bw_latest; /* max delivered bw in last round trip */ -+ u32 bw_lo; /* lower bound on sending bandwidth */ -+ u32 bw_hi[2]; /* upper bound of sending bandwidth range*/ -+ u32 inflight_latest; /* max delivered data in last round trip */ -+ u32 inflight_lo; /* lower bound of inflight data range */ -+ u32 inflight_hi; /* upper bound of inflight data range */ -+ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ -+ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ -+ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ -+ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ -+ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ -+ bw_probe_samples:1, /* rate samples reflect bw probing? */ -+ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ -+ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ -+ rounds_since_probe:8, /* packet-timed rounds since probed bw */ -+ loss_round_start:1, /* loss_round_delivered round trip? */ -+ loss_in_round:1, /* loss marked in this round trip? */ -+ ecn_in_round:1, /* ECN marked in this round trip? */ -+ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ -+ loss_events_in_round:4,/* losses in STARTUP round */ -+ initialized:1; /* has bbr_init() been called? */ -+ u32 alpha_last_delivered; /* tp->delivered at alpha update */ -+ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ -+ -+ /* Params configurable using setsockopt. Refer to correspoding -+ * module param for detailed description of params. 
-+ */ -+ struct bbr_params { -+ u32 high_gain:11, /* max allowed value: 2047 */ -+ drain_gain:10, /* max allowed value: 1023 */ -+ cwnd_gain:11; /* max allowed value: 2047 */ -+ u32 cwnd_min_target:4, /* max allowed value: 15 */ -+ min_rtt_win_sec:5, /* max allowed value: 31 */ -+ probe_rtt_mode_ms:9, /* max allowed value: 511 */ -+ full_bw_cnt:3, /* max allowed value: 7 */ -+ cwnd_tso_budget:1, /* allowed values: {0, 1} */ -+ unused3:6, -+ drain_to_target:1, /* boolean */ -+ precise_ece_ack:1, /* boolean */ -+ extra_acked_in_startup:1, /* allowed values: {0, 1} */ -+ fast_path:1; /* boolean */ -+ u32 full_bw_thresh:10, /* max allowed value: 1023 */ -+ startup_cwnd_gain:11, /* max allowed value: 2047 */ -+ bw_probe_pif_gain:9, /* max allowed value: 511 */ -+ usage_based_cwnd:1, /* boolean */ -+ unused2:1; -+ u16 probe_rtt_win_ms:14, /* max allowed value: 16383 */ -+ refill_add_inc:2; /* max allowed value: 3 */ -+ u16 extra_acked_gain:11, /* max allowed value: 2047 */ -+ extra_acked_win_rtts:5; /* max allowed value: 31*/ -+ u16 pacing_gain[CYCLE_LEN]; /* max allowed value: 1023 */ -+ /* Mostly BBR v2 parameters below here: */ -+ u32 ecn_alpha_gain:8, /* max allowed value: 255 */ -+ ecn_factor:8, /* max allowed value: 255 */ -+ ecn_thresh:8, /* max allowed value: 255 */ -+ beta:8; /* max allowed value: 255 */ -+ u32 ecn_max_rtt_us:19, /* max allowed value: 524287 */ -+ bw_probe_reno_gain:9, /* max allowed value: 511 */ -+ full_loss_cnt:4; /* max allowed value: 15 */ -+ u32 probe_rtt_cwnd_gain:8, /* max allowed value: 255 */ -+ inflight_headroom:8, /* max allowed value: 255 */ -+ loss_thresh:8, /* max allowed value: 255 */ -+ bw_probe_max_rounds:8; /* max allowed value: 255 */ -+ u32 bw_probe_rand_rounds:4, /* max allowed value: 15 */ -+ bw_probe_base_us:26, /* usecs: 0..2^26-1 (67 secs) */ -+ full_ecn_cnt:2; /* max allowed value: 3 */ -+ u32 bw_probe_rand_us:26, /* usecs: 0..2^26-1 (67 secs) */ -+ undo:1, /* boolean */ -+ tso_rtt_shift:4, /* max allowed value: 15 */ -+ unused5:1; -+ u32 ecn_reprobe_gain:9, /* max allowed value: 511 */ -+ unused1:14, -+ ecn_alpha_init:9; /* max allowed value: 256 */ -+ } params; -+ -+ struct { -+ u32 snd_isn; /* Initial sequence number */ -+ u32 rs_bw; /* last valid rate sample bw */ -+ u32 target_cwnd; /* target cwnd, based on BDP */ -+ u8 undo:1, /* Undo even happened but not yet logged */ -+ unused:7; -+ char event; /* single-letter event debug codes */ -+ u16 unused2; -+ } debug; -+}; -+ -+struct bbr_context { -+ u32 sample_bw; -+ u32 target_cwnd; -+ u32 log:1; -+}; -+ -+/* Window length of min_rtt filter (in sec). Max allowed value is 31 (0x1F) */ -+static u32 bbr_min_rtt_win_sec = 10; -+/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode. -+ * Max allowed value is 511 (0x1FF). -+ */ -+static u32 bbr_probe_rtt_mode_ms = 200; -+/* Window length of probe_rtt_min_us filter (in ms), and consequently the -+ * typical interval between PROBE_RTT mode entries. -+ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC -+ */ -+static u32 bbr_probe_rtt_win_ms = 5000; -+/* Skip TSO below the following bandwidth (bits/sec): */ -+static int bbr_min_tso_rate = 1200000; -+ -+/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting -+ * in bigger TSO bursts. By default we cut the RTT-based allowance in half -+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance -+ * is below 1500 bytes after 6 * ~500 usec = 3ms. 
-+ */ -+static u32 bbr_tso_rtt_shift = 9; /* halve allowance per 2^9 usecs, 512us */ -+ -+/* Select cwnd TSO budget approach: -+ * 0: padding -+ * 1: flooring -+ */ -+static uint bbr_cwnd_tso_budget = 1; -+ -+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. -+ * In order to help drive the network toward lower queues and low latency while -+ * maintaining high utilization, the average pacing rate aims to be slightly -+ * lower than the estimated bandwidth. This is an important aspect of the -+ * design. -+ */ -+static const int bbr_pacing_margin_percent = 1; -+ -+/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain -+ * that will allow a smoothly increasing pacing rate that will double each RTT -+ * and send the same number of packets per RTT that an un-paced, slow-starting -+ * Reno or CUBIC flow would. Max allowed value is 2047 (0x7FF). -+ */ -+static int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; -+/* The gain for deriving startup cwnd. Max allowed value is 2047 (0x7FF). */ -+static int bbr_startup_cwnd_gain = BBR_UNIT * 2885 / 1000 + 1; -+/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain -+ * the queue created in BBR_STARTUP in a single round. Max allowed value -+ * is 1023 (0x3FF). -+ */ -+static int bbr_drain_gain = BBR_UNIT * 1000 / 2885; -+/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs. -+ * Max allowed value is 2047 (0x7FF). -+ */ -+static int bbr_cwnd_gain = BBR_UNIT * 2; -+/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw. -+ * Max allowed value for each element is 1023 (0x3FF). -+ */ -+enum bbr_pacing_gain_phase { -+ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ -+ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ -+ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ -+ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ -+}; -+static int bbr_pacing_gain[] = { -+ BBR_UNIT * 5 / 4, /* probe for more available bw */ -+ BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ -+ BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ -+ BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ -+}; -+ -+/* Try to keep at least this many packets in flight, if things go smoothly. For -+ * smooth functioning, a sliding window protocol ACKing every other packet -+ * needs at least 4 packets in flight. Max allowed value is 15 (0xF). -+ */ -+static u32 bbr_cwnd_min_target = 4; -+ -+/* Cwnd to BDP proportion in PROBE_RTT mode scaled by BBR_UNIT. Default: 50%. -+ * Use 0 to disable. Max allowed value is 255. -+ */ -+static u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; -+ -+/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ -+/* If bw has increased significantly (1.25x), there may be more bw available. -+ * Max allowed value is 1023 (0x3FF). -+ */ -+static u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; -+/* But after 3 rounds w/o significant bw growth, estimate pipe is full. -+ * Max allowed value is 7 (0x7). -+ */ -+static u32 bbr_full_bw_cnt = 3; -+ -+static u32 bbr_flags; /* Debugging related stuff */ -+ -+/* Whether to debug using printk. -+ */ -+static bool bbr_debug_with_printk; -+ -+/* Whether to debug using ftrace event tcp:tcp_bbr_event. -+ * Ignored when bbr_debug_with_printk is set. -+ */ -+static bool bbr_debug_ftrace; -+ -+/* Experiment: each cycle, try to hold sub-unity gain until inflight <= BDP. 
*/ -+static bool bbr_drain_to_target = true; /* default: enabled */ -+ -+/* Experiment: Flags to control BBR with ECN behavior. -+ */ -+static bool bbr_precise_ece_ack = true; /* default: enabled */ -+ -+/* The max rwin scaling shift factor is 14 (RFC 1323), so the max sane rwin is -+ * (2^(16+14) B)/(1024 B/packet) = 1M packets. -+ */ -+static u32 bbr_cwnd_warn_val = 1U << 20; -+ -+static u16 bbr_debug_port_mask; -+ -+/* BBR module parameters. These are module parameters only in Google prod. -+ * Upstream these are intentionally not module parameters. -+ */ -+static int bbr_pacing_gain_size = CYCLE_LEN; -+ -+/* Gain factor for adding extra_acked to target cwnd: */ -+static int bbr_extra_acked_gain = 256; -+ -+/* Window length of extra_acked window. Max allowed val is 31. */ -+static u32 bbr_extra_acked_win_rtts = 5; -+ -+/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */ -+static u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; -+ -+/* Time period for clamping cwnd increment due to ack aggregation */ -+static u32 bbr_extra_acked_max_us = 100 * 1000; -+ -+/* Use extra acked in startup ? -+ * 0: disabled -+ * 1: use latest extra_acked value from 1-2 rtt in startup -+ */ -+static int bbr_extra_acked_in_startup = 1; /* default: enabled */ -+ -+/* Experiment: don't grow cwnd beyond twice of what we just probed. */ -+static bool bbr_usage_based_cwnd; /* default: disabled */ -+ -+/* For lab testing, researchers can enable BBRv2 ECN support with this flag, -+ * when they know that any ECN marks that the connections experience will be -+ * DCTCP/L4S-style ECN marks, rather than RFC3168 ECN marks. -+ * TODO(ncardwell): Production use of the BBRv2 ECN functionality depends on -+ * negotiation or configuration that is outside the scope of the BBRv2 -+ * alpha release. 
-+ */ -+static bool bbr_ecn_enable = false; -+ -+module_param_named(min_tso_rate, bbr_min_tso_rate, int, 0644); -+module_param_named(tso_rtt_shift, bbr_tso_rtt_shift, int, 0644); -+module_param_named(high_gain, bbr_high_gain, int, 0644); -+module_param_named(drain_gain, bbr_drain_gain, int, 0644); -+module_param_named(startup_cwnd_gain, bbr_startup_cwnd_gain, int, 0644); -+module_param_named(cwnd_gain, bbr_cwnd_gain, int, 0644); -+module_param_array_named(pacing_gain, bbr_pacing_gain, int, -+ &bbr_pacing_gain_size, 0644); -+module_param_named(cwnd_min_target, bbr_cwnd_min_target, uint, 0644); -+module_param_named(probe_rtt_cwnd_gain, -+ bbr_probe_rtt_cwnd_gain, uint, 0664); -+module_param_named(cwnd_warn_val, bbr_cwnd_warn_val, uint, 0664); -+module_param_named(debug_port_mask, bbr_debug_port_mask, ushort, 0644); -+module_param_named(flags, bbr_flags, uint, 0644); -+module_param_named(debug_ftrace, bbr_debug_ftrace, bool, 0644); -+module_param_named(debug_with_printk, bbr_debug_with_printk, bool, 0644); -+module_param_named(min_rtt_win_sec, bbr_min_rtt_win_sec, uint, 0644); -+module_param_named(probe_rtt_mode_ms, bbr_probe_rtt_mode_ms, uint, 0644); -+module_param_named(probe_rtt_win_ms, bbr_probe_rtt_win_ms, uint, 0644); -+module_param_named(full_bw_thresh, bbr_full_bw_thresh, uint, 0644); -+module_param_named(full_bw_cnt, bbr_full_bw_cnt, uint, 0644); -+module_param_named(cwnd_tso_bduget, bbr_cwnd_tso_budget, uint, 0664); -+module_param_named(extra_acked_gain, bbr_extra_acked_gain, int, 0664); -+module_param_named(extra_acked_win_rtts, -+ bbr_extra_acked_win_rtts, uint, 0664); -+module_param_named(extra_acked_max_us, -+ bbr_extra_acked_max_us, uint, 0664); -+module_param_named(ack_epoch_acked_reset_thresh, -+ bbr_ack_epoch_acked_reset_thresh, uint, 0664); -+module_param_named(drain_to_target, bbr_drain_to_target, bool, 0664); -+module_param_named(precise_ece_ack, bbr_precise_ece_ack, bool, 0664); -+module_param_named(extra_acked_in_startup, -+ bbr_extra_acked_in_startup, int, 0664); -+module_param_named(usage_based_cwnd, bbr_usage_based_cwnd, bool, 0664); -+module_param_named(ecn_enable, bbr_ecn_enable, bool, 0664); -+ -+static void bbr2_exit_probe_rtt(struct sock *sk); -+static void bbr2_reset_congestion_signals(struct sock *sk); -+ -+static void bbr_check_probe_rtt_done(struct sock *sk); -+ -+/* Do we estimate that STARTUP filled the pipe? */ -+static bool bbr_full_bw_reached(const struct sock *sk) -+{ -+ const struct bbr *bbr = inet_csk_ca(sk); -+ -+ return bbr->full_bw_reached; -+} -+ -+/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ -+static u32 bbr_max_bw(const struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ return max(bbr->bw_hi[0], bbr->bw_hi[1]); -+} -+ -+/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ -+static u32 bbr_bw(const struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ return min(bbr_max_bw(sk), bbr->bw_lo); -+} -+ -+/* Return maximum extra acked in past k-2k round trips, -+ * where k = bbr_extra_acked_win_rtts. -+ */ -+static u16 bbr_extra_acked(const struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ return max(bbr->extra_acked[0], bbr->extra_acked[1]); -+} -+ -+/* Return rate in bytes per second, optionally with a gain. -+ * The order here is chosen carefully to avoid overflow of u64. This should -+ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
-+ */ -+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, -+ int margin) -+{ -+ unsigned int mss = tcp_sk(sk)->mss_cache; -+ -+ rate *= mss; -+ rate *= gain; -+ rate >>= BBR_SCALE; -+ rate *= USEC_PER_SEC / 100 * (100 - margin); -+ rate >>= BW_SCALE; -+ rate = max(rate, 1ULL); -+ return rate; -+} -+ -+static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) -+{ -+ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); -+} -+ -+static u64 bbr_rate_kbps(struct sock *sk, u64 rate) -+{ -+ rate = bbr_bw_bytes_per_sec(sk, rate); -+ rate *= 8; -+ do_div(rate, 1000); -+ return rate; -+} -+ -+static u32 bbr_tso_segs_goal(struct sock *sk); -+static void bbr_debug(struct sock *sk, u32 acked, -+ const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ static const char ca_states[] = { -+ [TCP_CA_Open] = 'O', -+ [TCP_CA_Disorder] = 'D', -+ [TCP_CA_CWR] = 'C', -+ [TCP_CA_Recovery] = 'R', -+ [TCP_CA_Loss] = 'L', -+ }; -+ static const char mode[] = { -+ 'G', /* Growing - BBR_STARTUP */ -+ 'D', /* Drain - BBR_DRAIN */ -+ 'W', /* Window - BBR_PROBE_BW */ -+ 'M', /* Min RTT - BBR_PROBE_RTT */ -+ }; -+ static const char ack_phase[] = { /* bbr_ack_phase strings */ -+ 'I', /* BBR_ACKS_INIT - 'Init' */ -+ 'R', /* BBR_ACKS_REFILLING - 'Refilling' */ -+ 'B', /* BBR_ACKS_PROBE_STARTING - 'Before' */ -+ 'F', /* BBR_ACKS_PROBE_FEEDBACK - 'Feedback' */ -+ 'A', /* BBR_ACKS_PROBE_STOPPING - 'After' */ -+ }; -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ const u32 una = tp->snd_una - bbr->debug.snd_isn; -+ const u32 fack = tcp_highest_sack_seq(tp); -+ const u16 dport = ntohs(inet_sk(sk)->inet_dport); -+ bool is_port_match = (bbr_debug_port_mask && -+ ((dport & bbr_debug_port_mask) == 0)); -+ char debugmsg[320]; -+ -+ if (sk->sk_state == TCP_SYN_SENT) -+ return; /* no bbr_init() yet if SYN retransmit -> CA_Loss */ -+ -+ if (!tp->snd_cwnd || tp->snd_cwnd > bbr_cwnd_warn_val) { -+ char addr[INET6_ADDRSTRLEN + 10] = { 0 }; -+ -+ if (sk->sk_family == AF_INET) -+ snprintf(addr, sizeof(addr), "%pI4:%u", -+ &inet_sk(sk)->inet_daddr, dport); -+ else if (sk->sk_family == AF_INET6) -+ snprintf(addr, sizeof(addr), "%pI6:%u", -+ &sk->sk_v6_daddr, dport); -+ -+ WARN_ONCE(1, -+ "BBR %s cwnd alert: %u " -+ "snd_una: %u ca: %d pacing_gain: %u cwnd_gain: %u " -+ "bw: %u rtt: %u min_rtt: %u " -+ "acked: %u tso_segs: %u " -+ "bw: %d %ld %d pif: %u\n", -+ addr, tp->snd_cwnd, -+ una, inet_csk(sk)->icsk_ca_state, -+ bbr->pacing_gain, bbr->cwnd_gain, -+ bbr_max_bw(sk), (tp->srtt_us >> 3), bbr->min_rtt_us, -+ acked, bbr_tso_segs_goal(sk), -+ rs->delivered, rs->interval_us, rs->is_retrans, -+ tcp_packets_in_flight(tp)); -+ } -+ -+ if (likely(!bbr_debug_with_printk && !bbr_debug_ftrace)) -+ return; -+ -+ if (!sock_flag(sk, SOCK_DBG) && !is_port_match) -+ return; -+ -+ if (!ctx->log && !tp->app_limited && !(bbr_flags & FLAG_DEBUG_VERBOSE)) -+ return; -+ -+ if (ipv4_is_loopback(inet_sk(sk)->inet_daddr) && -+ !(bbr_flags & FLAG_DEBUG_LOOPBACK)) -+ return; -+ -+ snprintf(debugmsg, sizeof(debugmsg) - 1, -+ "BBR %pI4:%-5u %5u,%03u:%-7u %c " -+ "%c %2u br %2u cr %2d rtt %5ld d %2d i %5ld mrtt %d %cbw %llu " -+ "bw %llu lb %llu ib %llu qb %llu " -+ "a %u if %2u %c %c dl %u l %u al %u # %u t %u %c %c " -+ "lr %d er %d ea %d bwl %lld il %d ih %d c %d " -+ "v %d %c %u %c %s\n", -+ &inet_sk(sk)->inet_daddr, dport, -+ una / 1000, una % 1000, fack - tp->snd_una, -+ ca_states[inet_csk(sk)->icsk_ca_state], -+ bbr->debug.undo ? 
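
A user-space restatement of the rate conversion above, keeping the same multiply-then-shift ordering; BW_SCALE = 24 and BBR_SCALE = 8 are assumptions matching mainline, and the example rate is illustrative only:

#include <stdio.h>
#include <stdint.h>

#define BW_SCALE   24               /* assumed: bw is pkts/usec << 24      */
#define BBR_SCALE  8                /* assumed: gains are scaled by 1 << 8 */
#define BBR_UNIT   (1 << BBR_SCALE)
#define USEC_PER_SEC 1000000ULL

/* Multiply before shifting to keep precision, but shift early enough that
 * u64 cannot overflow for realistic rates and gains.
 */
static uint64_t rate_bytes_per_sec(uint64_t bw, unsigned int mss,
                                   int gain, int margin)
{
        uint64_t rate = bw;

        rate *= mss;                    /* packets -> bytes            */
        rate *= gain;                   /* apply gain (BBR_UNIT = 1.0) */
        rate >>= BBR_SCALE;             /* drop the gain scaling       */
        rate *= USEC_PER_SEC / 100 * (100 - margin); /* per usec -> per sec */
        rate >>= BW_SCALE;              /* drop the bw scaling         */
        return rate ? rate : 1;
}

int main(void)
{
        unsigned int mss = 1500;
        uint64_t pkts_per_sec = 83333;  /* ~1 Gbit/s of 1500 B packets */
        uint64_t bw = pkts_per_sec * (1ULL << BW_SCALE) / USEC_PER_SEC;

        printf("pacing rate: %llu bytes/sec\n",
               (unsigned long long)rate_bytes_per_sec(bw, mss, BBR_UNIT, 1));
        return 0;
}
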
'@' : mode[bbr->mode], -+ tp->snd_cwnd, -+ bbr_extra_acked(sk), /* br (legacy): extra_acked */ -+ rs->tx_in_flight, /* cr (legacy): tx_inflight */ -+ rs->rtt_us, -+ rs->delivered, -+ rs->interval_us, -+ bbr->min_rtt_us, -+ rs->is_app_limited ? '_' : 'l', -+ bbr_rate_kbps(sk, ctx->sample_bw), /* lbw: latest sample bw */ -+ bbr_rate_kbps(sk, bbr_max_bw(sk)), /* bw: max bw */ -+ 0ULL, /* lb: [obsolete] */ -+ 0ULL, /* ib: [obsolete] */ -+ div_u64((u64)sk->sk_pacing_rate * 8, 1000), -+ acked, -+ tcp_packets_in_flight(tp), -+ rs->is_ack_delayed ? 'd' : '.', -+ bbr->round_start ? '*' : '.', -+ tp->delivered, tp->lost, -+ tp->app_limited, -+ 0, /* #: [obsolete] */ -+ ctx->target_cwnd, -+ tp->reord_seen ? 'r' : '.', /* r: reordering seen? */ -+ ca_states[bbr->prev_ca_state], -+ (rs->lost + rs->delivered) > 0 ? -+ (1000 * rs->lost / -+ (rs->lost + rs->delivered)) : 0, /* lr: loss rate x1000 */ -+ (rs->delivered) > 0 ? -+ (1000 * rs->delivered_ce / -+ (rs->delivered)) : 0, /* er: ECN rate x1000 */ -+ 1000 * bbr->ecn_alpha >> BBR_SCALE, /* ea: ECN alpha x1000 */ -+ bbr->bw_lo == ~0U ? -+ -1 : (s64)bbr_rate_kbps(sk, bbr->bw_lo), /* bwl */ -+ bbr->inflight_lo, /* il */ -+ bbr->inflight_hi, /* ih */ -+ bbr->bw_probe_up_cnt, /* c */ -+ 2, /* v: version */ -+ bbr->debug.event, -+ bbr->cycle_idx, -+ ack_phase[bbr->ack_phase], -+ bbr->bw_probe_samples ? "Y" : "N"); -+ debugmsg[sizeof(debugmsg) - 1] = 0; -+ -+ /* printk takes a higher precedence. */ -+ if (bbr_debug_with_printk) -+ printk(KERN_DEBUG "%s", debugmsg); -+ -+ if (unlikely(bbr->debug.undo)) -+ bbr->debug.undo = 0; -+} -+ -+/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ -+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) -+{ -+ u64 rate = bw; -+ -+ rate = bbr_rate_bytes_per_sec(sk, rate, gain, -+ bbr_pacing_margin_percent); -+ rate = min_t(u64, rate, sk->sk_max_pacing_rate); -+ return rate; -+} -+ -+/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ -+static void bbr_init_pacing_rate_from_rtt(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u64 bw; -+ u32 rtt_us; -+ -+ if (tp->srtt_us) { /* any RTT sample yet? */ -+ rtt_us = max(tp->srtt_us >> 3, 1U); -+ bbr->has_seen_rtt = 1; -+ } else { /* no RTT sample yet */ -+ rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ -+ } -+ bw = (u64)tp->snd_cwnd * BW_UNIT; -+ do_div(bw, rtt_us); -+ sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr->params.high_gain); -+} -+ -+/* Pace using current bw estimate and a gain factor. */ -+static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); -+ -+ if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) -+ bbr_init_pacing_rate_from_rtt(sk); -+ if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) -+ sk->sk_pacing_rate = rate; -+} -+ -+static u32 bbr_min_tso_segs(struct sock *sk) -+{ -+ return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; -+} -+ -+/* Return the number of segments BBR would like in a TSO/GSO skb, given -+ * a particular max gso size as a constraint. -+ */ -+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, -+ u32 gso_max_size) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 segs, r; -+ u64 bytes; -+ -+ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). 
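
The initial pacing rate logic above amounts to "one cwnd per RTT, scaled by the startup gain". A rough stand-alone sketch in plain bytes per second (the names and the divide-by-256 gain scaling are assumptions of this sketch):

#include <stdio.h>
#include <stdint.h>

/* One cwnd of data per smoothed RTT, scaled up by the startup gain. */
static uint64_t init_pacing_rate(uint32_t cwnd_pkts, uint32_t mss,
                                 uint32_t srtt_us, uint32_t gain_x256)
{
        uint64_t bytes_per_rtt = (uint64_t)cwnd_pkts * mss;

        return bytes_per_rtt * 1000000ULL / srtt_us * gain_x256 / 256;
}

int main(void)
{
        /* 10 packets of 1448 B payload over a 20 ms RTT, gain ~2.89x */
        printf("%llu bytes/sec\n",
               (unsigned long long)init_pacing_rate(10, 1448, 20000, 739));
        return 0;
}
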
*/ -+ bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; -+ -+ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every -+ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. -+ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) -+ */ -+ if (bbr->params.tso_rtt_shift) { -+ r = bbr->min_rtt_us >> bbr->params.tso_rtt_shift; -+ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ -+ bytes += GSO_MAX_SIZE >> r; -+ } -+ -+ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); -+ segs = max_t(u32, div_u64(bytes, mss_now), bbr_min_tso_segs(sk)); -+ return segs; -+} -+ -+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ -+static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) -+{ -+ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); -+} -+ -+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ -+static u32 bbr_tso_segs_goal(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); -+} -+ -+/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ -+static void bbr_save_cwnd(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) -+ bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ -+ else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ -+ bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); -+} -+ -+static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (event == CA_EVENT_TX_START && tp->app_limited) { -+ bbr->idle_restart = 1; -+ bbr->ack_epoch_mstamp = tp->tcp_mstamp; -+ bbr->ack_epoch_acked = 0; -+ /* Avoid pointless buffer overflows: pace at est. bw if we don't -+ * need more speed (we're restarting from idle and app-limited). -+ */ -+ if (bbr->mode == BBR_PROBE_BW) -+ bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); -+ else if (bbr->mode == BBR_PROBE_RTT) -+ bbr_check_probe_rtt_done(sk); +@@ -333,7 +533,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -344,6 +546,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); + } else if ((event == CA_EVENT_ECN_IS_CE || + event == CA_EVENT_ECN_NO_CE) && -+ bbr_ecn_enable && -+ bbr->params.precise_ece_ack) { ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { + u32 state = bbr->ce_state; + dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); + bbr->ce_state = state; -+ if (tp->fast_ack_mode == 2 && event == CA_EVENT_ECN_IS_CE) -+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); -+ } -+} -+ -+/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: -+ * -+ * bdp = ceil(bw * min_rtt * gain) -+ * -+ * The key factor, gain, controls the amount of queue. 
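
Putting the two TSO budget terms above together, a hedged sketch of the resulting segment goal: 65536 stands in for GSO_MAX_SIZE, 320 is a rough stand-in for MAX_TCP_HEADER, and the shift of 10 mirrors the usual sk_pacing_shift default; none of these constants are taken from this patch.

#include <stdio.h>
#include <stdint.h>

static uint32_t tso_segs_budget(uint64_t pacing_rate_Bps, uint32_t min_rtt_us,
                                uint32_t mss, uint32_t min_segs)
{
        uint64_t bytes = pacing_rate_Bps >> 10;        /* ~1/1024 sec of data */
        uint32_t halvings = min_rtt_us >> 9;           /* per 512 us of RTT   */

        if (halvings < 32)
                bytes += 65536u >> halvings;           /* RTT-based allowance */
        if (bytes > 65536 - 1 - 320)                   /* stay under 64 KB GSO */
                bytes = 65536 - 1 - 320;
        bytes /= mss;
        return bytes > min_segs ? (uint32_t)bytes : min_segs;
}

int main(void)
{
        printf("LAN (200 us, 10 Gb/s): %u segs\n",
               tso_segs_budget(1250000000ULL, 200, 1448, 2));
        printf("WAN (50 ms, 50 Mb/s):  %u segs\n",
               tso_segs_budget(6250000ULL, 50000, 1448, 2));
        return 0;
}
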
While a small gain -+ * builds a smaller queue, it becomes more vulnerable to noise in RTT -+ * measurements (e.g., delayed ACKs or other ACK compression effects). This -+ * noise may cause BBR to under-estimate the rate. -+ */ -+static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 bdp; -+ u64 w; -+ -+ /* If we've never had a valid RTT sample, cap cwnd at the initial -+ * default. This should only happen when the connection is not using TCP -+ * timestamps and has retransmitted all of the SYN/SYNACK/data packets -+ * ACKed so far. In this case, an RTO can cut cwnd to 1, in which ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -366,10 +578,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. + * case we need to slow-start up toward something safe: initial cwnd. -+ */ -+ if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ + return bbr->init_cwnd; /* be safe: cap at initial cwnd */ -+ -+ w = (u64)bw * bbr->min_rtt_us; -+ -+ /* Apply a gain to the given value, remove the BW_SCALE shift, and -+ * round the value up to avoid a negative feedback loop. -+ */ -+ bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; -+ -+ return bdp; -+} -+ -+/* To achieve full performance in high-speed paths, we budget enough cwnd to -+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path: -+ * - one skb in sending host Qdisc, -+ * - one skb in sending host TSO/GSO engine -+ * - one skb being received by receiver host LRO/GRO/delayed-ACK engine -+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because -+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, -+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe -+ * full even with ACK-every-other-packet delayed ACKs. -+ */ -+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); + + w = (u64)bw * bbr->min_rtt_us; + +@@ -386,23 +598,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); + u32 tso_segs_goal; -+ + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. 
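
The BDP computation above in isolation, with the same rounding-up trick; BW_SCALE = 24 and the 256 gain unit are assumed here to match mainline:

#include <stdio.h>
#include <stdint.h>

#define BW_SCALE  24                  /* assumed: bw is pkts/usec << 24 */
#define BW_UNIT   (1ULL << BW_SCALE)
#define BBR_SCALE 8                   /* assumed: gain of 1.0 == 256    */

/* bdp = ceil(bw * min_rtt * gain), mirroring the fixed-point rounding above */
static uint32_t bdp_packets(uint64_t bw, uint32_t min_rtt_us, int gain)
{
        uint64_t w = bw * min_rtt_us;

        return (uint32_t)((((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT);
}

int main(void)
{
        /* ~0.0833 pkts/usec (1 Gbit/s of 1500 B packets) over a 20 ms path */
        uint64_t bw = 83333 * BW_UNIT / 1000000;

        printf("bdp x1.0 = %u pkts\n", bdp_packets(bw, 20000, 256));
        printf("bdp x2.0 = %u pkts\n", bdp_packets(bw, 20000, 512));
        return 0;
}
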
*/ +- cwnd = (cwnd + 1) & ~1U; + tso_segs_goal = 3 * bbr_tso_segs_goal(sk); -+ + + /* Allow enough full-sized skbs in flight to utilize end systems. */ -+ if (bbr->params.cwnd_tso_budget == 1) { -+ cwnd = max_t(u32, cwnd, tso_segs_goal); -+ cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); -+ } else { -+ cwnd += tso_segs_goal; -+ cwnd = (cwnd + 1) & ~1U; -+ } -+ /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) -+ cwnd += 2; -+ -+ return cwnd; -+} -+ -+/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */ -+static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) -+{ -+ u32 inflight; -+ -+ inflight = bbr_bdp(sk, bw, gain); -+ inflight = bbr_quantization_budget(sk, inflight); -+ -+ return inflight; -+} -+ -+/* With pacing at lower layers, there's often less data "in the network" than -+ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), -+ * we often have several skbs queued in the pacing layer with a pre-scheduled -+ * earliest departure time (EDT). BBR adapts its pacing rate based on the -+ * inflight level that it estimates has already been "baked in" by previous -+ * departure time decisions. We calculate a rough estimate of the number of our -+ * packets that might be in the network at the earliest departure time for the -+ * next skb scheduled: -+ * in_network_at_edt = inflight_at_edt - (EDT - now) * bw -+ * If we're increasing inflight, then we want to know if the transmit of the -+ * EDT skb will push inflight above the target, so inflight_at_edt includes -+ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight, -+ * then estimate if inflight will sink too low just before the EDT transmit. 
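
A small sketch of the in_network_at_edt estimate described above, using floating-point bandwidth for readability (the kernel code keeps everything in fixed point; the names here are illustrative):

#include <stdio.h>
#include <stdint.h>

/* in_network_at_edt = inflight_at_edt - (EDT - now) * bw */
static unsigned int packets_in_net_at_edt(unsigned int inflight_now,
                                          unsigned int tso_burst,
                                          uint64_t edt_minus_now_us,
                                          double bw_pkts_per_us,
                                          int increasing_inflight)
{
        unsigned int inflight_at_edt = inflight_now;
        uint64_t drained = (uint64_t)(bw_pkts_per_us * edt_minus_now_us);

        if (increasing_inflight)            /* include the skb leaving at EDT */
                inflight_at_edt += tso_burst;
        if (drained >= inflight_at_edt)
                return 0;
        return inflight_at_edt - (unsigned int)drained;
}

int main(void)
{
        /* 100 pkts in flight, a 45-segment burst scheduled 500 us ahead,
         * path draining 0.08 pkt/us
         */
        printf("%u pkts\n", packets_in_net_at_edt(100, 45, 500, 0.08, 1));
        return 0;
}
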
-+ */ -+static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u64 now_ns, edt_ns, interval_us; -+ u32 interval_delivered, inflight_at_edt; -+ -+ now_ns = tp->tcp_clock_cache; -+ edt_ns = max(tp->tcp_wstamp_ns, now_ns); -+ interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC); -+ interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE; -+ inflight_at_edt = inflight_now; -+ if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */ -+ inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */ -+ if (interval_delivered >= inflight_at_edt) -+ return 0; -+ return inflight_at_edt - interval_delivered; -+} -+ -+/* Find the cwnd increment based on estimate of ack aggregation */ -+static u32 bbr_ack_aggregation_cwnd(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 max_aggr_cwnd, aggr_cwnd = 0; -+ -+ if (bbr->params.extra_acked_gain && -+ (bbr_full_bw_reached(sk) || bbr->params.extra_acked_in_startup)) { -+ max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) -+ / BW_UNIT; -+ aggr_cwnd = (bbr->params.extra_acked_gain * bbr_extra_acked(sk)) -+ >> BBR_SCALE; -+ aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); -+ } -+ -+ return aggr_cwnd; -+} -+ + cwnd += 2; + + return cwnd; +@@ -457,10 +669,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -468,66 +680,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) +/* Returns the cwnd for PROBE_RTT mode. */ +static u32 bbr_probe_rtt_cwnd(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr->params.probe_rtt_cwnd_gain == 0) -+ return bbr->params.cwnd_min_target; -+ return max_t(u32, bbr->params.cwnd_min_target, -+ bbr_bdp(sk, bbr_bw(sk), bbr->params.probe_rtt_cwnd_gain)); -+} -+ -+/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss -+ * has drawn us down below target), or snap down to target if we're above it. -+ */ -+static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. 
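
The ack-aggregation allowance above, restated as a stand-alone helper: gain/256 of the measured extra_acked, capped at 100 ms worth of data at the estimated bandwidth (plain units and illustrative names, not the kernel's own types):

#include <stdio.h>
#include <stdint.h>

static uint32_t ack_aggregation_cwnd(double bw_pkts_per_us,
                                     uint32_t extra_acked_pkts,
                                     uint32_t gain_x256)
{
        uint32_t max_aggr = (uint32_t)(bw_pkts_per_us * 100000); /* 100 ms cap */
        uint32_t aggr = (uint32_t)((uint64_t)gain_x256 * extra_acked_pkts >> 8);

        return aggr < max_aggr ? aggr : max_aggr;
}

int main(void)
{
        /* wifi-like aggregation: 60 extra packets seen on a 0.02 pkt/us path */
        printf("extra cwnd = %u pkts\n", ack_aggregation_cwnd(0.02, 60, 256));
        return 0;
}
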
+- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. + */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) + u32 acked, u32 bw, int gain, u32 cwnd, + struct bbr_context *ctx) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 target_cwnd = 0, prev_cwnd = tp->snd_cwnd, max_probe; -+ -+ if (!acked) -+ goto done; /* no packet fully ACKed; just apply caps */ -+ -+ target_cwnd = bbr_bdp(sk, bw, gain); -+ -+ /* Increment the cwnd to account for excess ACKed data that seems -+ * due to aggregation (of data and/or ACKs) visible in the ACK stream. -+ */ -+ target_cwnd += bbr_ack_aggregation_cwnd(sk); -+ target_cwnd = bbr_quantization_budget(sk, target_cwnd); -+ -+ /* If we're below target cwnd, slow start cwnd toward target cwnd. */ -+ bbr->debug.target_cwnd = target_cwnd; -+ + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -536,74 +709,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); + /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ + bbr->try_fast_path = 0; + if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ @@ -1285,52 +1032,279 @@ index 000000000000..85f8052144d1 + } else { + bbr->try_fast_path = 1; + } -+ -+ /* When growing cwnd, don't grow beyond twice what we just probed. 
*/ -+ if (bbr->params.usage_based_cwnd) { -+ max_probe = max(2 * tp->max_packets_out, tp->snd_cwnd); -+ cwnd = min(cwnd, max_probe); -+ } -+ -+ cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); -+done: -+ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ -+ if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ -+ tp->snd_cwnd = min_t(u32, tp->snd_cwnd, bbr_probe_rtt_cwnd(sk)); -+ -+ ctx->target_cwnd = target_cwnd; -+ ctx->log = (tp->snd_cwnd != prev_cwnd); -+} -+ -+/* See if we have reached next round trip */ -+static void bbr_update_round_start(struct sock *sk, + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. +- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. 
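
For the gain-cycling code that the hunk above removes, a compact sketch of the phase-advance decision it implemented: cruise phases advance on the clock alone, a gain above 1.0 also needs the queue (or losses) to build, and a gain below 1.0 can end early once inflight drains back to the BDP. Names, units, and the 256 scaling are illustrative.

#include <stdio.h>
#include <stdint.h>

static int next_cycle_phase(uint32_t elapsed_us, uint32_t min_rtt_us,
                            uint32_t inflight, uint32_t bdp,
                            uint32_t pacing_gain_x256, int saw_loss)
{
        int full_length = elapsed_us > min_rtt_us;

        if (pacing_gain_x256 == 256)            /* cruise: just use the clock */
                return full_length;
        if (pacing_gain_x256 > 256)             /* probing up: need the queue */
                return full_length &&
                       (saw_loss || inflight >= bdp * pacing_gain_x256 / 256);
        return full_length || inflight <= bdp;  /* draining: stop once empty  */
}

int main(void)
{
        printf("probe up, queue built: %d\n",
               next_cycle_phase(25000, 20000, 2100, 1667, 320, 0));
        printf("drain, still too full: %d\n",
               next_cycle_phase(5000, 20000, 1900, 1667, 192, 0));
        return 0;
}
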
*/ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -613,191 +738,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. + */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? 
*/ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. */ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, + const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->round_start = 0; -+ -+ /* See if we've reached the next RTT */ + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { + if (rs->interval_us > 0 && + !before(rs->prior_delivered, bbr->next_rtt_delivered)) { -+ bbr->next_rtt_delivered = tp->delivered; -+ bbr->round_start = 1; -+ } ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; +} -+ + +- bbr_lt_bw_sampling(sk, rs); +/* Calculate the bandwidth based on how fast packets are delivered */ +static void bbr_calculate_bw_sample(struct sock *sk, + const struct rate_sample *rs, struct bbr_context *ctx) +{ -+ struct bbr *bbr = inet_csk_ca(sk); + u64 bw = 0; -+ -+ /* 
Divide delivered by the interval to find a (lower bound) bottleneck -+ * bandwidth sample. Delivered is in packets and interval_us in uS and -+ * ratio will be <<1 for most connections. So delivered is first scaled. + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. + * Round up to allow growth at low rates, even with integer division. -+ */ + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); + if (rs->interval_us > 0) { + if (WARN_ONCE(rs->delivered < 0, + "negative delivered: %d interval_us: %ld\n", @@ -1338,362 +1312,289 @@ index 000000000000..85f8052144d1 + return; + + bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); -+ } + } + + ctx->sample_bw = bw; -+ bbr->debug.rs_bw = bw; -+} -+ -+/* Estimates the windowed max degree of ack aggregation. -+ * This is used to provision extra in-flight data to keep sending during -+ * inter-ACK silences. -+ * -+ * Degree of ack aggregation is estimated as extra data acked beyond expected. -+ * -+ * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval" -+ * cwnd += max_extra_acked -+ * -+ * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). -+ * Max filter is an approximate sliding window of 5-10 (packet timed) round + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -811,7 +794,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. + * trips for non-startup phase, and 1-2 round trips for startup. 
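
The rounded-up bandwidth sample described above, as a short user-space helper (BW_SCALE = 24 assumed, matching mainline; the round-up keeps tiny flows from getting stuck at a zero estimate):

#include <stdio.h>
#include <stdint.h>

#define BW_SCALE 24                   /* assumed: bw stored as pkts/usec << 24 */
#define BW_UNIT  (1ULL << BW_SCALE)

static uint64_t bw_sample(uint32_t delivered_pkts, uint32_t interval_us)
{
        return (delivered_pkts * BW_UNIT + interval_us - 1) / interval_us;
}

int main(void)
{
        uint64_t bw = bw_sample(10, 20000);   /* 10 pkts ACKed over 20 ms */

        printf("bw = %llu (internal) = %.6f pkts/usec\n",
               (unsigned long long)bw, (double)bw / BW_UNIT);
        return 0;
}
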
-+ */ -+static void bbr_update_ack_aggregation(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ u32 epoch_us, expected_acked, extra_acked; -+ struct bbr *bbr = inet_csk_ca(sk); -+ struct tcp_sock *tp = tcp_sk(sk); -+ u32 extra_acked_win_rtts_thresh = bbr->params.extra_acked_win_rtts; -+ -+ if (!bbr->params.extra_acked_gain || rs->acked_sacked <= 0 || -+ rs->delivered < 0 || rs->interval_us <= 0) -+ return; -+ -+ if (bbr->round_start) { -+ bbr->extra_acked_win_rtts = min(0x1F, -+ bbr->extra_acked_win_rtts + 1); -+ if (bbr->params.extra_acked_in_startup && -+ !bbr_full_bw_reached(sk)) + */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -819,15 +802,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) + extra_acked_win_rtts_thresh = 1; + if (bbr->extra_acked_win_rtts >= + extra_acked_win_rtts_thresh) { -+ bbr->extra_acked_win_rtts = 0; -+ bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? -+ 0 : 1; -+ bbr->extra_acked[bbr->extra_acked_win_idx] = 0; -+ } -+ } -+ -+ /* Compute how many packets we expected to be delivered over epoch. */ -+ epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp, -+ bbr->ack_epoch_mstamp); -+ expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT; -+ -+ /* Reset the aggregation epoch if ACK rate is below expected rate or -+ * significantly large no. of ack received since epoch (potentially -+ * quite old epoch). -+ */ -+ if (bbr->ack_epoch_acked <= expected_acked || -+ (bbr->ack_epoch_acked + rs->acked_sacked >= -+ bbr_ack_epoch_acked_reset_thresh)) { -+ bbr->ack_epoch_acked = 0; -+ bbr->ack_epoch_mstamp = tp->delivered_mstamp; -+ expected_acked = 0; -+ } -+ -+ /* Compute excess data delivered, beyond what was expected. */ -+ bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, -+ bbr->ack_epoch_acked + rs->acked_sacked); -+ extra_acked = bbr->ack_epoch_acked - expected_acked; -+ extra_acked = min(extra_acked, tp->snd_cwnd); -+ if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) -+ bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; -+} -+ -+/* Estimate when the pipe is full, using the change in delivery rate: BBR -+ * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by -+ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited -+ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the -+ * higher rwin, 3: we get higher delivery rate samples. Or transient -+ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar -+ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
-+ */ -+static void bbr_check_full_bw_reached(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 bw_thresh; -+ -+ if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) -+ return; -+ -+ bw_thresh = (u64)bbr->full_bw * bbr->params.full_bw_thresh >> BBR_SCALE; -+ if (bbr_max_bw(sk) >= bw_thresh) { -+ bbr->full_bw = bbr_max_bw(sk); -+ bbr->full_bw_cnt = 0; -+ return; -+ } -+ ++bbr->full_bw_cnt; -+ bbr->full_bw_reached = bbr->full_bw_cnt >= bbr->params.full_bw_cnt; -+} -+ -+/* If pipe is probably full, drain the queue and then enter steady-state. */ -+static bool bbr_check_drain(struct sock *sk, const struct rate_sample *rs, -+ struct bbr_context *ctx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { -+ bbr->mode = BBR_DRAIN; /* drain queue we created */ -+ tcp_sk(sk)->snd_ssthresh = -+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); -+ bbr2_reset_congestion_signals(sk); -+ } /* fall through to check if in-flight is already small: */ -+ if (bbr->mode == BBR_DRAIN && -+ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= -+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) -+ return true; /* exiting DRAIN now */ -+ return false; -+} -+ -+static void bbr_check_probe_rtt_done(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (!(bbr->probe_rtt_done_stamp && -+ after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) -+ return; -+ + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -861,49 +848,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -913,9 +857,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ + bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ -+ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); -+ bbr2_exit_probe_rtt(sk); -+} -+ -+/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and -+ * periodically drain the bottleneck queue, to converge to measure the true -+ * min_rtt (unloaded propagation delay). This allows the flows to keep queues -+ * small (reducing queuing delay and packet loss) and achieve fairness among -+ * BBR flows. -+ * -+ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, -+ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. -+ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed -+ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and -+ * re-enter the previous mode. BBR uses 200ms to approximately bound the -+ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). -+ * -+ * Note that flows need only pay 2% if they are busy sending over the last 10 -+ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have -+ * natural silences or low-rate periods within 10 seconds where the rate is low -+ * enough for long enough to drain its queue in the bottleneck. We pick up -+ * these min RTT measurements opportunistically with our min_rtt filter. :-) -+ */ -+static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -941,23 +885,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; + bool probe_rtt_expired, min_rtt_expired; + u32 expire; -+ + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); + /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. 
*/ + expire = bbr->probe_rtt_min_stamp + -+ msecs_to_jiffies(bbr->params.probe_rtt_win_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); + probe_rtt_expired = after(tcp_jiffies32, expire); -+ if (rs->rtt_us >= 0 && -+ (rs->rtt_us <= bbr->probe_rtt_min_us || + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || + (probe_rtt_expired && !rs->is_ack_delayed))) { + bbr->probe_rtt_min_us = rs->rtt_us; + bbr->probe_rtt_min_stamp = tcp_jiffies32; + } + /* Track min RTT seen in the min_rtt_win_sec filter window: */ -+ expire = bbr->min_rtt_stamp + bbr->params.min_rtt_win_sec * HZ; ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; + min_rtt_expired = after(tcp_jiffies32, expire); + if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || + min_rtt_expired) { + bbr->min_rtt_us = bbr->probe_rtt_min_us; + bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; -+ } -+ -+ if (bbr->params.probe_rtt_mode_ms > 0 && probe_rtt_expired && -+ !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { -+ bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ -+ bbr_save_cwnd(sk); /* note cwnd so we can restore it */ -+ bbr->probe_rtt_done_stamp = 0; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; + bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; + bbr->next_rtt_delivered = tp->delivered; -+ } -+ -+ if (bbr->mode == BBR_PROBE_RTT) { -+ /* Ignore low rate samples during this mode. */ -+ tp->app_limited = -+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; -+ /* Maintain min packets in flight for max(200 ms, 1 round). */ -+ if (!bbr->probe_rtt_done_stamp && + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -966,9 +922,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
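
A sketch of the two-stage min-RTT tracking introduced above: a short window decides when to schedule the next PROBE_RTT, and its result is folded into the long-lived min_rtt. The 5 s and 10 s windows and the plain millisecond timestamps are assumptions of this sketch, not values lifted from the patch.

#include <stdio.h>
#include <stdint.h>

struct min_rtt_state {
        uint32_t probe_rtt_min_us, probe_rtt_min_stamp_ms;
        uint32_t min_rtt_us, min_rtt_stamp_ms;
};

static void update_min_rtt(struct min_rtt_state *s, uint32_t rtt_us,
                           uint32_t now_ms)
{
        int probe_win_expired = now_ms > s->probe_rtt_min_stamp_ms + 5000;
        int min_win_expired = now_ms > s->min_rtt_stamp_ms + 10000;

        if (rtt_us < s->probe_rtt_min_us || probe_win_expired) {
                s->probe_rtt_min_us = rtt_us;
                s->probe_rtt_min_stamp_ms = now_ms;
        }
        if (s->probe_rtt_min_us <= s->min_rtt_us || min_win_expired) {
                s->min_rtt_us = s->probe_rtt_min_us;
                s->min_rtt_stamp_ms = s->probe_rtt_min_stamp_ms;
        }
}

int main(void)
{
        struct min_rtt_state s = { 30000, 0, 30000, 0 };

        update_min_rtt(&s, 25000, 1000);   /* lower RTT: adopted immediately  */
        update_min_rtt(&s, 40000, 7000);   /* short window expired: refreshed */
        printf("probe_rtt_min=%u us, min_rtt=%u us\n",
               s.probe_rtt_min_us, s.min_rtt_us);
        return 0;
}
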
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { + tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { -+ bbr->probe_rtt_done_stamp = tcp_jiffies32 + -+ msecs_to_jiffies(bbr->params.probe_rtt_mode_ms); -+ bbr->probe_rtt_round_done = 0; -+ bbr->next_rtt_delivered = tp->delivered; -+ } else if (bbr->probe_rtt_done_stamp) { -+ if (bbr->round_start) -+ bbr->probe_rtt_round_done = 1; -+ if (bbr->probe_rtt_round_done) -+ bbr_check_probe_rtt_done(sk); -+ } -+ } -+ /* Restart after idle ends only once we process a new S/ACK for data */ -+ if (rs->delivered > 0) -+ bbr->idle_restart = 0; -+} -+ -+static void bbr_update_gains(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ switch (bbr->mode) { -+ case BBR_STARTUP: -+ bbr->pacing_gain = bbr->params.high_gain; -+ bbr->cwnd_gain = bbr->params.startup_cwnd_gain; -+ break; -+ case BBR_DRAIN: -+ bbr->pacing_gain = bbr->params.drain_gain; /* slow, to drain */ -+ bbr->cwnd_gain = bbr->params.startup_cwnd_gain; /* keep cwnd */ -+ break; -+ case BBR_PROBE_BW: -+ bbr->pacing_gain = bbr->params.pacing_gain[bbr->cycle_idx]; -+ bbr->cwnd_gain = bbr->params.cwnd_gain; -+ break; -+ case BBR_PROBE_RTT: -+ bbr->pacing_gain = BBR_UNIT; -+ bbr->cwnd_gain = BBR_UNIT; -+ break; -+ default: -+ WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode); -+ break; -+ } -+} -+ -+static void bbr_init(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ int i; -+ -+ WARN_ON_ONCE(tp->snd_cwnd >= bbr_cwnd_warn_val); -+ -+ bbr->initialized = 1; -+ bbr->params.high_gain = min(0x7FF, bbr_high_gain); -+ bbr->params.drain_gain = min(0x3FF, bbr_drain_gain); -+ bbr->params.startup_cwnd_gain = min(0x7FF, bbr_startup_cwnd_gain); -+ bbr->params.cwnd_gain = min(0x7FF, bbr_cwnd_gain); -+ bbr->params.cwnd_tso_budget = min(0x1U, bbr_cwnd_tso_budget); -+ bbr->params.cwnd_min_target = min(0xFU, bbr_cwnd_min_target); -+ bbr->params.min_rtt_win_sec = min(0x1FU, bbr_min_rtt_win_sec); -+ bbr->params.probe_rtt_mode_ms = min(0x1FFU, bbr_probe_rtt_mode_ms); -+ bbr->params.full_bw_cnt = min(0x7U, bbr_full_bw_cnt); -+ bbr->params.full_bw_thresh = min(0x3FFU, bbr_full_bw_thresh); -+ bbr->params.extra_acked_gain = min(0x7FF, bbr_extra_acked_gain); -+ bbr->params.extra_acked_win_rtts = min(0x1FU, bbr_extra_acked_win_rtts); -+ bbr->params.drain_to_target = bbr_drain_to_target ? 1 : 0; -+ bbr->params.precise_ece_ack = bbr_precise_ece_ack ? 1 : 0; -+ bbr->params.extra_acked_in_startup = bbr_extra_acked_in_startup ? 1 : 0; -+ bbr->params.probe_rtt_cwnd_gain = min(0xFFU, bbr_probe_rtt_cwnd_gain); -+ bbr->params.probe_rtt_win_ms = -+ min(0x3FFFU, -+ min_t(u32, bbr_probe_rtt_win_ms, -+ bbr->params.min_rtt_win_sec * MSEC_PER_SEC)); -+ for (i = 0; i < CYCLE_LEN; i++) -+ bbr->params.pacing_gain[i] = min(0x3FF, bbr_pacing_gain[i]); -+ bbr->params.usage_based_cwnd = bbr_usage_based_cwnd ? 
1 : 0; -+ bbr->params.tso_rtt_shift = min(0xFU, bbr_tso_rtt_shift); -+ -+ bbr->debug.snd_isn = tp->snd_una; -+ bbr->debug.target_cwnd = 0; -+ bbr->debug.undo = 0; -+ -+ bbr->init_cwnd = min(0x7FU, tp->snd_cwnd); -+ bbr->prior_cwnd = tp->prior_cwnd; -+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; -+ bbr->next_rtt_delivered = 0; -+ bbr->prev_ca_state = TCP_CA_Open; -+ bbr->packet_conservation = 0; -+ -+ bbr->probe_rtt_done_stamp = 0; -+ bbr->probe_rtt_round_done = 0; -+ bbr->probe_rtt_min_us = tcp_min_rtt(tp); -+ bbr->probe_rtt_min_stamp = tcp_jiffies32; -+ bbr->min_rtt_us = tcp_min_rtt(tp); -+ bbr->min_rtt_stamp = tcp_jiffies32; -+ -+ bbr->has_seen_rtt = 0; -+ bbr_init_pacing_rate_from_rtt(sk); -+ -+ bbr->round_start = 0; -+ bbr->idle_restart = 0; -+ bbr->full_bw_reached = 0; -+ bbr->full_bw = 0; -+ bbr->full_bw_cnt = 0; -+ bbr->cycle_mstamp = 0; -+ bbr->cycle_idx = 0; -+ bbr->mode = BBR_STARTUP; -+ bbr->debug.rs_bw = 0; -+ -+ bbr->ack_epoch_mstamp = tp->tcp_mstamp; -+ bbr->ack_epoch_acked = 0; -+ bbr->extra_acked_win_rtts = 0; -+ bbr->extra_acked_win_idx = 0; -+ bbr->extra_acked[0] = 0; -+ bbr->extra_acked[1] = 0; -+ -+ bbr->ce_state = 0; -+ bbr->prior_rcv_nxt = tp->rcv_nxt; -+ bbr->try_fast_path = 0; -+ -+ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); -+} -+ -+static u32 bbr_sndbuf_expand(struct sock *sk) -+{ + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -989,18 +945,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1012,144 +970,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); + /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ + return 3; -+} -+ -+/* __________________________________________________________________________ -+ * -+ * Functions new to BBR v2 ("bbr") congestion control are below here. 
-+ * __________________________________________________________________________ -+ */ -+ + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs) +/* Incorporate a new bw sample into the current window of our max filter. */ -+static void bbr2_take_bw_hi_sample(struct sock *sk, u32 bw) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; + +- bbr_update_model(sk, rs); +- +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); + bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); -+} -+ + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) +/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ -+static void bbr2_advance_bw_hi_filter(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ + if (!bbr->bw_hi[1]) + return; /* no samples in this window; remember old window */ + bbr->bw_hi[0] = bbr->bw_hi[1]; + bbr->bw_hi[1] = 0; +} -+ + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. */ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; +/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ -+static u32 bbr2_target_inflight(struct sock *sk) ++static u32 bbr_target_inflight(struct sock *sk) +{ + u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); -+ + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); + return min(bdp, tcp_sk(sk)->snd_cwnd); -+} -+ -+static bool bbr2_is_probing_bandwidth(struct sock *sk) -+{ + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; + struct bbr *bbr = inet_csk_ca(sk); + + return (bbr->mode == BBR_STARTUP) || @@ -1703,7 +1604,7 @@ index 000000000000..85f8052144d1 +} + +/* Has the given amount of time elapsed since we marked the phase start? 
*/ -+static bool bbr2_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct bbr *bbr = inet_csk_ca(sk); @@ -1712,62 +1613,74 @@ index 000000000000..85f8052144d1 + bbr->cycle_mstamp + interval_us) > 0; +} + -+static void bbr2_handle_queue_too_high_in_startup(struct sock *sk) ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ + + bbr->full_bw_reached = 1; -+ bbr->inflight_hi = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); +} + +/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ -+static void bbr2_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || -+ !bbr->params.full_ecn_cnt || !bbr->params.ecn_thresh) ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) + return; + -+ if (ce_ratio >= bbr->params.ecn_thresh) ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) + bbr->startup_ecn_rounds++; + else + bbr->startup_ecn_rounds = 0; + -+ if (bbr->startup_ecn_rounds >= bbr->params.full_ecn_cnt) { -+ bbr->debug.event = 'E'; /* ECN caused STARTUP exit */ -+ bbr2_handle_queue_too_high_in_startup(sk); ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); + return; + } +} + -+static void bbr2_update_ecn_alpha(struct sock *sk) ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); + struct bbr *bbr = inet_csk_ca(sk); + s32 delivered, delivered_ce; + u64 alpha, ce_ratio; + u32 gain; ++ bool want_ecn_alpha; + -+ if (bbr->params.ecn_factor == 0) -+ return; ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. */ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; + + delivered = tp->delivered - bbr->alpha_last_delivered; + delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; + + if (delivered == 0 || /* avoid divide by zero */ + WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ -+ return; -+ -+ /* See if we should use ECN sender logic for this connection. 
*/ -+ if (!bbr->ecn_eligible && bbr_ecn_enable && -+ (bbr->min_rtt_us <= bbr->params.ecn_max_rtt_us || -+ !bbr->params.ecn_max_rtt_us)) -+ bbr->ecn_eligible = 1; ++ return -1; + ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); + ce_ratio = (u64)delivered_ce << BBR_SCALE; + do_div(ce_ratio, delivered); -+ gain = bbr->params.ecn_alpha_gain; ++ ++ gain = bbr_param(sk, ecn_alpha_gain); + alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; + alpha += (gain * ce_ratio) >> BBR_SCALE; + bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); @@ -1775,37 +1688,51 @@ index 000000000000..85f8052144d1 + bbr->alpha_last_delivered = tp->delivered; + bbr->alpha_last_delivered_ce = tp->delivered_ce; + -+ bbr2_check_ecn_too_high_in_startup(sk, ce_ratio); ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); +} + +/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ -+static void bbr2_raise_inflight_hi_slope(struct sock *sk) -+{ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 growth_this_round, cnt; + + /* Calculate "slope": packets S/Acked per inflight_hi increment. */ + growth_this_round = 1 << bbr->bw_probe_up_rounds; + bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); -+ cnt = tp->snd_cwnd / growth_this_round; ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; + cnt = max(cnt, 1U); + bbr->bw_probe_up_cnt = cnt; -+ bbr->debug.event = 'G'; /* Grow inflight_hi slope */ +} + +/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ -+static void bbr2_probe_inflight_hi_upward(struct sock *sk, ++static void bbr_probe_inflight_hi_upward(struct sock *sk, + const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 delta; + -+ if (!tp->is_cwnd_limited || tp->snd_cwnd < bbr->inflight_hi) { -+ bbr->bw_probe_up_acks = 0; /* don't accmulate unused credits */ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) + return; /* not fully using inflight_hi, so don't grow it */ -+ } + + /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ + bbr->bw_probe_up_acks += rs->acked_sacked; @@ -1813,11 +1740,11 @@ index 000000000000..85f8052144d1 + delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; + bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; + bbr->inflight_hi += delta; -+ bbr->debug.event = 'I'; /* Increment inflight_hi */ ++ bbr->try_fast_path = 0; /* Need to update cwnd */ + } + + if (bbr->round_start) -+ bbr2_raise_inflight_hi_slope(sk); ++ bbr_raise_inflight_hi_slope(sk); +} + +/* Does loss/ECN rate for this sample say inflight is "too high"? 
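
As an aside (a standalone sketch, not part of the patch): bbr_update_ecn_alpha()
above maintains an EWMA of the per-round CE-mark ratio in the same BBR_SCALE
fixed point, alpha = (1 - gain) * alpha + gain * ce_ratio. The sketch below
mirrors that update in user space; the 1/16 gain and the "start at 1.0"
initialization come from the defaults declared in this patch, and the sample
counts are invented.

#include <stdint.h>
#include <stdio.h>

#define BBR_SCALE 8
#define BBR_UNIT  (1 << BBR_SCALE)

/* EWMA of the CE-marked fraction of delivered packets, as in
 * bbr_update_ecn_alpha(): alpha = (1 - gain) * alpha + gain * ce_ratio.
 */
static uint32_t update_ecn_alpha(uint32_t alpha, uint32_t delivered,
                                 uint32_t delivered_ce, uint32_t gain)
{
        uint64_t ce_ratio = ((uint64_t)delivered_ce << BBR_SCALE) / delivered;
        uint64_t next = ((uint64_t)(BBR_UNIT - gain) * alpha) >> BBR_SCALE;

        next += (gain * ce_ratio) >> BBR_SCALE;
        return next > BBR_UNIT ? BBR_UNIT : (uint32_t)next;
}

int main(void)
{
        uint32_t gain = BBR_UNIT / 16;  /* ecn_alpha_gain default: 6.25% */
        uint32_t alpha = BBR_UNIT;      /* ecn_alpha_init default: 1.0 */

        /* One round trip: 100 packets delivered, 25 of them CE-marked. */
        alpha = update_ecn_alpha(alpha, 100, 25, gain);
        printf("ecn_alpha after one round: %u/256\n", alpha);
        return 0;
}
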
@@ -1825,25 +1752,28 @@ index 000000000000..85f8052144d1 + * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which + * uses it to notice when loss/ECN rates suggest inflight is too high. + */ -+static bool bbr2_is_inflight_too_high(const struct sock *sk, -+ const struct rate_sample *rs) ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) +{ + const struct bbr *bbr = inet_csk_ca(sk); + u32 loss_thresh, ecn_thresh; -+ + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ + if (rs->lost > 0 && rs->tx_in_flight) { -+ loss_thresh = (u64)rs->tx_in_flight * bbr->params.loss_thresh >> ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> + BBR_SCALE; -+ if (rs->lost > loss_thresh) ++ if (rs->lost > loss_thresh) { + return true; ++ } + } + + if (rs->delivered_ce > 0 && rs->delivered > 0 && -+ bbr->ecn_eligible && bbr->params.ecn_thresh) { -+ ecn_thresh = (u64)rs->delivered * bbr->params.ecn_thresh >> ++ bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> + BBR_SCALE; -+ if (rs->delivered_ce >= ecn_thresh) ++ if (rs->delivered_ce > ecn_thresh) { + return true; ++ } + } + + return false; @@ -1857,12 +1787,12 @@ index 000000000000..85f8052144d1 + * Then we take that equation, convert it to fixed point, and + * round up to the nearest packet. + */ -+static u32 bbr2_inflight_hi_from_lost_skb(const struct sock *sk, ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, + const struct rate_sample *rs, + const struct sk_buff *skb) +{ -+ const struct bbr *bbr = inet_csk_ca(sk); -+ u32 loss_thresh = bbr->params.loss_thresh; ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); + u32 pcount, divisor, inflight_hi; + s32 inflight_prev, lost_prev; + u64 loss_budget, lost_prefix; @@ -1871,14 +1801,28 @@ index 000000000000..85f8052144d1 + + /* How much data was in flight before this skb? */ + inflight_prev = rs->tx_in_flight - pcount; -+ if (WARN_ONCE(inflight_prev < 0, -+ "tx_in_flight: %u pcount: %u reneg: %u", -+ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg)) ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); + return ~0U; ++ } + + /* How much inflight data was marked lost before this skb? */ + lost_prev = rs->lost - pcount; -+ if (WARN_ON_ONCE(lost_prev < 0)) ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) + return ~0U; + + /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ @@ -1903,7 +1847,7 @@ index 000000000000..85f8052144d1 + * buffer, return an operating point that tries to leave unutilized headroom in + * the path for other flows, for fairness convergence and lower RTTs and loss. 
+ */ -+static u32 bbr2_inflight_with_headroom(const struct sock *sk) ++static u32 bbr_inflight_with_headroom(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 headroom, headroom_fraction; @@ -1911,17 +1855,17 @@ index 000000000000..85f8052144d1 + if (bbr->inflight_hi == ~0U) + return ~0U; + -+ headroom_fraction = bbr->params.inflight_headroom; ++ headroom_fraction = bbr_param(sk, inflight_headroom); + headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; + headroom = max(headroom, 1U); + return max_t(s32, bbr->inflight_hi - headroom, -+ bbr->params.cwnd_min_target); ++ bbr_param(sk, cwnd_min_target)); +} + +/* Bound cwnd to a sensible level, based on our current probing state + * machine phase and model of a good inflight level (inflight_lo, inflight_hi). + */ -+static void bbr2_bound_cwnd_for_inflight_model(struct sock *sk) ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); @@ -1942,13 +1886,55 @@ index 000000000000..85f8052144d1 + if (bbr->mode == BBR_PROBE_RTT || + (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) -+ cap = bbr2_inflight_with_headroom(sk); ++ cap = bbr_inflight_with_headroom(sk); + } + /* Adapt to any loss/ECN since our last bw probe. */ + cap = min(cap, bbr->inflight_lo); + -+ cap = max_t(u32, cap, bbr->params.cwnd_min_target); -+ tp->snd_cwnd = min(cap, tp->snd_cwnd); ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; +} + +/* Estimate a short-term lower bound on the capacity available now, based @@ -1967,57 +1953,39 @@ index 000000000000..85f8052144d1 + * cause low bw for Reno/CUBIC and high loss recovery latency for + * request/response flows using any congestion control. + */ -+static void bbr2_adapt_lower_bounds(struct sock *sk) ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) +{ -+ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); -+ u32 ecn_cut, ecn_inflight_lo, beta; ++ u32 ecn_inflight_lo = ~0U; + + /* We only use lower-bound estimates when not probing bw. + * When probing we need to push inflight higher to probe bw. 
+ */ -+ if (bbr2_is_probing_bandwidth(sk)) ++ if (bbr_is_probing_bandwidth(sk)) + return; + + /* ECN response. */ -+ if (bbr->ecn_in_round && bbr->ecn_eligible && bbr->params.ecn_factor) { -+ /* Reduce inflight to (1 - alpha*ecn_factor). */ -+ ecn_cut = (BBR_UNIT - -+ ((bbr->ecn_alpha * bbr->params.ecn_factor) >> -+ BBR_SCALE)); -+ if (bbr->inflight_lo == ~0U) -+ bbr->inflight_lo = tp->snd_cwnd; -+ ecn_inflight_lo = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; -+ } else { -+ ecn_inflight_lo = ~0U; ++ if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); + } + + /* Loss response. */ + if (bbr->loss_in_round) { -+ /* Reduce bw and inflight to (1 - beta). */ -+ if (bbr->bw_lo == ~0U) -+ bbr->bw_lo = bbr_max_bw(sk); -+ if (bbr->inflight_lo == ~0U) -+ bbr->inflight_lo = tp->snd_cwnd; -+ beta = bbr->params.beta; -+ bbr->bw_lo = -+ max_t(u32, bbr->bw_latest, -+ (u64)bbr->bw_lo * -+ (BBR_UNIT - beta) >> BBR_SCALE); -+ bbr->inflight_lo = -+ max_t(u32, bbr->inflight_latest, -+ (u64)bbr->inflight_lo * -+ (BBR_UNIT - beta) >> BBR_SCALE); ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); + } + -+ /* Adjust to the lower of the levels implied by loss or ECN. */ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ + bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); +} + +/* Reset any short-term lower-bound adaptation to congestion, so that we can + * push our inflight up. + */ -+static void bbr2_reset_lower_bounds(struct sock *sk) ++static void bbr_reset_lower_bounds(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + @@ -2028,7 +1996,7 @@ index 000000000000..85f8052144d1 +/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state + * machine phase where we adapt our lower bound based on congestion signals. + */ -+static void bbr2_reset_congestion_signals(struct sock *sk) ++static void bbr_reset_congestion_signals(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + @@ -2040,42 +2008,76 @@ index 000000000000..85f8052144d1 + bbr->inflight_latest = 0; +} + -+/* Update (most of) our congestion signals: track the recent rate and volume of -+ * delivered data, presence of loss, and EWMA degree of ECN marking. -+ */ -+static void bbr2_update_congestion_signals( ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( + struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); -+ u64 bw; + + bbr->loss_round_start = 0; + if (rs->interval_us <= 0 || !rs->acked_sacked) + return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. 
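
As an aside (a standalone sketch, not part of the patch): the short-term bounds
maintained by bbr_adapt_lower_bounds() above are plain multiplicative cuts in
BBR_SCALE fixed point, floored by the latest round's measurements. The sketch
below redoes the arithmetic of bbr_loss_lower_bounds() and bbr_ecn_lower_bounds()
with the beta = 0.3 and ecn_factor = 1/3 defaults from this patch; the sample
numbers are invented.

#include <stdint.h>
#include <stdio.h>

#define BBR_SCALE 8
#define BBR_UNIT  (1 << BBR_SCALE)

/* Loss: cut the lower bound to (1 - beta), but never below the latest sample. */
static uint32_t loss_lower_bound(uint32_t lo, uint32_t latest, uint32_t beta)
{
        uint32_t cut = (uint64_t)lo * (BBR_UNIT - beta) >> BBR_SCALE;

        return latest > cut ? latest : cut;
}

/* ECN: cut the inflight lower bound to (1 - ecn_alpha * ecn_factor). */
static uint32_t ecn_lower_bound(uint32_t inflight_lo, uint32_t ecn_alpha,
                                uint32_t ecn_factor)
{
        uint32_t keep = BBR_UNIT - ((ecn_alpha * ecn_factor) >> BBR_SCALE);

        return (uint64_t)inflight_lo * keep >> BBR_SCALE;
}

int main(void)
{
        uint32_t beta = BBR_UNIT * 30 / 100;    /* 0.3 */
        uint32_t ecn_factor = BBR_UNIT / 3;     /* ~0.33 */

        printf("loss round: inflight_lo 100 -> %u packets\n",
               loss_lower_bound(100, 60, beta));
        printf("ECN round:  inflight_lo 100 (alpha = 1.0) -> %u packets\n",
               ecn_lower_bound(100, BBR_UNIT, ecn_factor));
        return 0;
}
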
*/ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. ++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ + bw = ctx->sample_bw; + + if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) -+ bbr2_take_bw_hi_sample(sk, bw); ++ bbr_take_max_bw_sample(sk, bw); + + bbr->loss_in_round |= (rs->losses > 0); + -+ /* Update rate and volume of delivered data from latest round trip: */ -+ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); -+ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); -+ -+ if (before(rs->prior_delivered, bbr->loss_round_delivered)) ++ if (!bbr->loss_round_start) + return; /* skip the per-round-trip updates */ + /* Now do per-round-trip updates. */ -+ bbr->loss_round_delivered = tp->delivered; /* mark round trip */ -+ bbr->loss_round_start = 1; -+ bbr2_adapt_lower_bounds(sk); ++ bbr_adapt_lower_bounds(sk, rs); + -+ /* Update windowed "latest" (single-round-trip) filters. */ + bbr->loss_in_round = 0; + bbr->ecn_in_round = 0; -+ bbr->bw_latest = ctx->sample_bw; -+ bbr->inflight_latest = rs->delivered; +} + +/* Bandwidth probing can cause loss. To help coexistence with loss-based @@ -2085,22 +2087,15 @@ index 000000000000..85f8052144d1 + * flow. We count packet-timed round trips directly, since measured RTT can + * vary widely, and Reno is driven by packet-timed round trips. + */ -+static bool bbr2_is_reno_coexistence_probe_time(struct sock *sk) ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); -+ u32 inflight, rounds, reno_gain, reno_rounds; ++ u32 rounds; + + /* Random loss can shave some small percentage off of our inflight + * in each round. To survive this, flows need robust periodic probes. + */ -+ rounds = bbr->params.bw_probe_max_rounds; -+ -+ reno_gain = bbr->params.bw_probe_reno_gain; -+ if (reno_gain) { -+ inflight = bbr2_target_inflight(sk); -+ reno_rounds = ((u64)inflight * reno_gain) >> BBR_SCALE; -+ rounds = min(rounds, reno_rounds); -+ } ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); + return bbr->rounds_since_probe >= rounds; +} + @@ -2121,19 +2116,19 @@ index 000000000000..85f8052144d1 + * time-scales (e.g. perhaps traffic from a web page download that we + * were competing with is now complete). 
+ */ -+static void bbr2_pick_probe_wait(struct sock *sk) ++static void bbr_pick_probe_wait(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + /* Decide the random round-trip bound for wait until probe: */ + bbr->rounds_since_probe = -+ get_random_u32_below(bbr->params.bw_probe_rand_rounds); ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); + /* Decide the random wall clock bound for wait until probe: */ -+ bbr->probe_wait_us = bbr->params.bw_probe_base_us + -+ get_random_u32_below(bbr->params.bw_probe_rand_us); ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); +} + -+static void bbr2_set_cycle_idx(struct sock *sk, int cycle_idx) ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) +{ + struct bbr *bbr = inet_csk_ca(sk); + @@ -2148,24 +2143,22 @@ index 000000000000..85f8052144d1 + * loss. If we do not fill the pipe before we cause this loss, our bw_hi and + * inflight_hi estimates will underestimate. + */ -+static void bbr2_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + -+ bbr2_reset_lower_bounds(sk); -+ if (bbr->inflight_hi != ~0U) -+ bbr->inflight_hi += bbr->params.refill_add_inc; ++ bbr_reset_lower_bounds(sk); + bbr->bw_probe_up_rounds = bw_probe_up_rounds; + bbr->bw_probe_up_acks = 0; + bbr->stopped_risky_probe = 0; + bbr->ack_phase = BBR_ACKS_REFILLING; + bbr->next_rtt_delivered = tp->delivered; -+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); +} + +/* Now probe max deliverable data rate and volume. */ -+static void bbr2_start_bw_probe_up(struct sock *sk) ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); @@ -2173,8 +2166,10 @@ index 000000000000..85f8052144d1 + bbr->ack_phase = BBR_ACKS_PROBE_STARTING; + bbr->next_rtt_delivered = tp->delivered; + bbr->cycle_mstamp = tp->tcp_mstamp; -+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_UP); -+ bbr2_raise_inflight_hi_slope(sk); ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); +} + +/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall @@ -2183,57 +2178,57 @@ index 000000000000..85f8052144d1 + * keep packet loss rates low. Also start a round-trip counter, to probe faster + * if we estimate a Reno flow at our BDP would probe faster. + */ -+static void bbr2_start_bw_probe_down(struct sock *sk) ++static void bbr_start_bw_probe_down(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + -+ bbr2_reset_congestion_signals(sk); ++ bbr_reset_congestion_signals(sk); + bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ -+ bbr2_pick_probe_wait(sk); ++ bbr_pick_probe_wait(sk); + bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ + bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; + bbr->next_rtt_delivered = tp->delivered; -+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); +} + +/* Cruise: maintain what we estimate to be a neutral, conservative + * operating point, without attempting to probe up for bandwidth or down for + * RTT, and only reducing inflight in response to loss/ECN signals. 
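
As an aside (a standalone sketch, not part of the patch): bbr_start_bw_probe_down()
above arms both a packet-timed round counter and a randomized wall-clock deadline
via bbr_pick_probe_wait(). The sketch below picks the same kind of wait in user
space, using the 2 s base, 1 s random range and 2 random rounds that the defaults
in this patch describe; rand() merely stands in for get_random_u32_below().

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define USEC_PER_SEC 1000000U

struct probe_wait {
        unsigned int rounds;    /* random offset for the round-trip counter */
        unsigned int wait_us;   /* wall-clock wait before the next bw probe */
};

/* Mirror of bbr_pick_probe_wait(): a fixed base plus a uniform random part. */
static struct probe_wait pick_probe_wait(unsigned int rand_rounds,
                                         unsigned int base_us,
                                         unsigned int rand_us)
{
        struct probe_wait w;

        w.rounds = (unsigned int)rand() % rand_rounds;
        w.wait_us = base_us + (unsigned int)rand() % rand_us;
        return w;
}

int main(void)
{
        struct probe_wait w;

        srand((unsigned int)time(NULL));
        w = pick_probe_wait(2, 2 * USEC_PER_SEC, 1 * USEC_PER_SEC);
        printf("next bw probe in %u us (round counter offset %u)\n",
               w.wait_us, w.rounds);
        return 0;
}
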
+ */ -+static void bbr2_start_bw_probe_cruise(struct sock *sk) ++static void bbr_start_bw_probe_cruise(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->inflight_lo != ~0U) + bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); + -+ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); +} + +/* Loss and/or ECN rate is too high while probing. + * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. + */ -+static void bbr2_handle_inflight_too_high(struct sock *sk, ++static void bbr_handle_inflight_too_high(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); -+ const u32 beta = bbr->params.beta; ++ const u32 beta = bbr_param(sk, beta); + + bbr->prev_probe_too_high = 1; + bbr->bw_probe_samples = 0; /* only react once per probe */ -+ bbr->debug.event = 'L'; /* Loss/ECN too high */ + /* If we are app-limited then we are not robustly + * probing the max volume of inflight data we think + * might be safe (analogous to how app-limited bw + * samples are not known to be robustly probing bw). + */ -+ if (!rs->is_app_limited) ++ if (!rs->is_app_limited) { + bbr->inflight_hi = max_t(u32, rs->tx_in_flight, -+ (u64)bbr2_target_inflight(sk) * ++ (u64)bbr_target_inflight(sk) * + (BBR_UNIT - beta) >> BBR_SCALE); ++ } + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) -+ bbr2_start_bw_probe_down(sk); ++ bbr_start_bw_probe_down(sk); +} + +/* If we're seeing bw and loss samples reflecting our bw probing, adapt @@ -2241,8 +2236,9 @@ index 000000000000..85f8052144d1 + * inflight_hi downward. If we're able to push inflight higher without such + * signals, push higher: adapt inflight_hi upward. + */ -+static bool bbr2_adapt_upper_bounds(struct sock *sk, -+ const struct rate_sample *rs) ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + @@ -2259,7 +2255,7 @@ index 000000000000..85f8052144d1 + * samples from the previous cycle, by advancing the window. + */ + if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) -+ bbr2_advance_bw_hi_filter(sk); ++ bbr_advance_max_bw_filter(sk); + /* If we had an inflight_hi, then probed and pushed inflight all + * the way up to hit that inflight_hi without seeing any + * high loss/ECN in all the resulting ACKs from that probing, @@ -2268,100 +2264,91 @@ index 000000000000..85f8052144d1 + */ + if (bbr->mode == BBR_PROBE_BW && + bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { -+ bbr->debug.event = 'R'; /* reprobe */ -+ bbr2_start_bw_probe_refill(sk, 0); ++ bbr_start_bw_probe_refill(sk, 0); + return true; /* yes, decided state transition */ + } + } -+ -+ if (bbr2_is_inflight_too_high(sk, rs)) { ++ if (bbr_is_inflight_too_high(sk, rs)) { + if (bbr->bw_probe_samples) /* sample is from bw probing? */ -+ bbr2_handle_inflight_too_high(sk, rs); ++ bbr_handle_inflight_too_high(sk, rs); + } else { + /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ -+ if (bbr->inflight_hi == ~0U) /* no excess queue signals yet? */ -+ return false; + -+ /* To be resilient to random loss, we must raise inflight_hi ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi + * if we observe in any phase that a higher level is safe. 
+ */ + if (rs->tx_in_flight > bbr->inflight_hi) { + bbr->inflight_hi = rs->tx_in_flight; -+ bbr->debug.event = 'U'; /* raise up inflight_hi */ + } + + if (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx == BBR_BW_PROBE_UP) -+ bbr2_probe_inflight_hi_upward(sk, rs); ++ bbr_probe_inflight_hi_upward(sk, rs); + } + + return false; +} + +/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ -+static bool bbr2_check_time_to_probe_bw(struct sock *sk) ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 n; + + /* If we seem to be at an operating point where we are not seeing loss + * but we are seeing ECN marks, then when the ECN marks cease we reprobe -+ * quickly (in case a burst of cross-traffic has ceased and freed up bw, -+ * or in case we are sharing with multiplicatively probing traffic). ++ * quickly (in case cross-traffic has ceased and freed up bw). + */ -+ if (bbr->params.ecn_reprobe_gain && bbr->ecn_eligible && ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && + bbr->ecn_in_cycle && !bbr->loss_in_cycle && + inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { -+ bbr->debug.event = 'A'; /* *A*ll clear to probe *A*gain */ -+ /* Calculate n so that when bbr2_raise_inflight_hi_slope() ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() + * computes growth_this_round as 2^n it will be roughly the + * desired volume of data (inflight_hi*ecn_reprobe_gain). + */ + n = ilog2((((u64)bbr->inflight_hi * -+ bbr->params.ecn_reprobe_gain) >> BBR_SCALE)); -+ bbr2_start_bw_probe_refill(sk, n); ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); + return true; + } + -+ if (bbr2_has_elapsed_in_phase(sk, bbr->probe_wait_us) || -+ bbr2_is_reno_coexistence_probe_time(sk)) { -+ bbr2_start_bw_probe_refill(sk, 0); ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); + return true; + } + return false; +} + +/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ -+static bool bbr2_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) +{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ bool is_under_bdp, is_long_enough; -+ + /* Always need to pull inflight down to leave headroom in queue. */ -+ if (inflight > bbr2_inflight_with_headroom(sk)) ++ if (inflight > bbr_inflight_with_headroom(sk)) + return false; + -+ is_under_bdp = inflight <= bbr_inflight(sk, bw, BBR_UNIT); -+ if (bbr->params.drain_to_target) -+ return is_under_bdp; -+ -+ is_long_enough = bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us); -+ return is_under_bdp || is_long_enough; ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); +} + +/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ -+static void bbr2_update_cycle_phase(struct sock *sk, -+ const struct rate_sample *rs) ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) +{ ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); -+ bool is_risky = false, is_queuing = false; ++ bool is_bw_probe_done = false; + u32 inflight, bw; + + if (!bbr_full_bw_reached(sk)) + return; + + /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. 
*/ -+ if (bbr2_adapt_upper_bounds(sk, rs)) ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) + return; /* already decided state transition */ + + if (bbr->mode != BBR_PROBE_BW) @@ -2377,7 +2364,7 @@ index 000000000000..85f8052144d1 + * by slowing down. + */ + case BBR_BW_PROBE_CRUISE: -+ if (bbr2_check_time_to_probe_bw(sk)) ++ if (bbr_check_time_to_probe_bw(sk, rs)) + return; /* already decided state transition */ + break; + @@ -2392,7 +2379,7 @@ index 000000000000..85f8052144d1 + * may be putting too much data in flight. + */ + bbr->bw_probe_samples = 1; -+ bbr2_start_bw_probe_up(sk); ++ bbr_start_bw_probe_up(sk, ctx); + } + break; + @@ -2407,31 +2394,33 @@ index 000000000000..85f8052144d1 + * most recent previous bw probe phase. Thus we want to start + * draining the queue immediately because it's very likely the most + * recently sent packets will fill the queue and cause drops. -+ * (checked here) -+ * (2) We have probed for at least 1*min_rtt_us, and the -+ * estimated queue is high enough (inflight > 1.25 * estimated_bdp). -+ * (checked here) ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). + * (3) Loss filter says loss rate is "too high". -+ * (checked in bbr_is_inflight_too_high()) + * (4) ECN filter says ECN mark rate is "too high". -+ * (checked in bbr_is_inflight_too_high()) ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() + */ + case BBR_BW_PROBE_UP: + if (bbr->prev_probe_too_high && + inflight >= bbr->inflight_hi) { + bbr->stopped_risky_probe = 1; -+ is_risky = true; -+ bbr->debug.event = 'D'; /* D for danger */ -+ } else if (bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us) && -+ inflight >= -+ bbr_inflight(sk, bw, -+ bbr->params.bw_probe_pif_gain)) { -+ is_queuing = true; -+ bbr->debug.event = 'Q'; /* building Queue */ ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } + } -+ if (is_risky || is_queuing) { ++ if (is_bw_probe_done) { + bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ -+ bbr2_start_bw_probe_down(sk); /* restart w/ down */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ + } + break; + @@ -2445,10 +2434,10 @@ index 000000000000..85f8052144d1 + * the queue is drained; persisting would underutilize the pipe. + */ + case BBR_BW_PROBE_DOWN: -+ if (bbr2_check_time_to_probe_bw(sk)) ++ if (bbr_check_time_to_probe_bw(sk, rs)) + return; /* already decided state transition */ -+ if (bbr2_check_time_to_cruise(sk, inflight, bw)) -+ bbr2_start_bw_probe_cruise(sk); ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); + break; + + default: @@ -2457,22 +2446,22 @@ index 000000000000..85f8052144d1 +} + +/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ -+static void bbr2_exit_probe_rtt(struct sock *sk) ++static void bbr_exit_probe_rtt(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + -+ bbr2_reset_lower_bounds(sk); ++ bbr_reset_lower_bounds(sk); + if (bbr_full_bw_reached(sk)) { + bbr->mode = BBR_PROBE_BW; + /* Raising inflight after PROBE_RTT may cause loss, so reset + * the PROBE_BW clock and schedule the next bandwidth probe for + * a friendly and randomized future point in time. 
+ */ -+ bbr2_start_bw_probe_down(sk); ++ bbr_start_bw_probe_down(sk); + /* Since we are exiting PROBE_RTT, we know inflight is + * below our estimated BDP, so it is reasonable to cruise. + */ -+ bbr2_start_bw_probe_cruise(sk); ++ bbr_start_bw_probe_cruise(sk); + } else { + bbr->mode = BBR_STARTUP; + } @@ -2482,8 +2471,8 @@ index 000000000000..85f8052144d1 + * the end of the round in recovery to get a good estimate of how many packets + * have been lost, and how many we need to drain with a low pacing rate. + */ -+static void bbr2_check_loss_too_high_in_startup(struct sock *sk, -+ const struct rate_sample *rs) ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + @@ -2497,39 +2486,83 @@ index 000000000000..85f8052144d1 + */ + if (rs->losses && bbr->loss_events_in_round < 0xf) + bbr->loss_events_in_round++; /* update saturating counter */ -+ if (bbr->params.full_loss_cnt && bbr->loss_round_start && ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && + inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && -+ bbr->loss_events_in_round >= bbr->params.full_loss_cnt && -+ bbr2_is_inflight_too_high(sk, rs)) { -+ bbr->debug.event = 'P'; /* Packet loss caused STARTUP exit */ -+ bbr2_handle_queue_too_high_in_startup(sk); ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); + return; + } + if (bbr->loss_round_start) + bbr->loss_events_in_round = 0; +} + -+/* If we are done draining, advance into steady state operation in PROBE_BW. */ -+static void bbr2_check_drain(struct sock *sk, const struct rate_sample *rs, -+ struct bbr_context *ctx) ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. ++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + -+ if (bbr_check_drain(sk, rs, ctx)) { ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). 
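
As an aside (a standalone sketch, not part of the patch): bbr_check_full_bw_reached()
above declares the pipe full once the bw estimate has grown by less than 25%
(full_bw_thresh) for 3 consecutive round starts (full_bw_cnt). The sketch below
runs that plateau test over a made-up series of per-round bw samples and ignores
the app-limited and round_start bookkeeping for brevity.

#include <stdint.h>
#include <stdio.h>

#define BBR_SCALE 8
#define BBR_UNIT  (1 << BBR_SCALE)

static uint32_t full_bw;        /* last bw baseline, 0 until first sample */
static uint32_t full_bw_cnt;    /* rounds without >= 25% growth */

/* Return 1 once bw has plateaued: less than 25% growth for 3 rounds. */
static int check_full_bw_reached(uint32_t sample_bw)
{
        uint32_t thresh = (uint64_t)full_bw * (BBR_UNIT * 5 / 4) >> BBR_SCALE;

        if (sample_bw >= thresh) {      /* still growing by >= 25%? */
                full_bw = sample_bw;    /* reset the plateau detector */
                full_bw_cnt = 0;
                return 0;
        }
        return ++full_bw_cnt >= 3;
}

int main(void)
{
        uint32_t samples[] = { 100, 200, 380, 400, 410, 415 };
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                if (check_full_bw_reached(samples[i]))
                        printf("pipe looks full at round %u (bw %u)\n",
                               i, samples[i]);
        return 0;
}
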
++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { + bbr->mode = BBR_PROBE_BW; -+ bbr2_start_bw_probe_down(sk); ++ bbr_start_bw_probe_down(sk); + } +} + -+static void bbr2_update_model(struct sock *sk, const struct rate_sample *rs, ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, + struct bbr_context *ctx) +{ -+ bbr2_update_congestion_signals(sk, rs, ctx); ++ bbr_update_congestion_signals(sk, rs, ctx); + bbr_update_ack_aggregation(sk, rs); -+ bbr2_check_loss_too_high_in_startup(sk, rs); -+ bbr_check_full_bw_reached(sk, rs); -+ bbr2_check_drain(sk, rs, ctx); -+ bbr2_update_cycle_phase(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); + bbr_update_min_rtt(sk, rs); +} + @@ -2557,25 +2590,26 @@ index 000000000000..85f8052144d1 + * + * Returns whether we can take fast path or not. + */ -+static bool bbr2_fast_path(struct sock *sk, bool *update_model, ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 prev_min_rtt_us, prev_mode; + -+ if (bbr->params.fast_path && bbr->try_fast_path && ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && + rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && -+ !bbr->loss_in_round && !bbr->ecn_in_round) { ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { + prev_mode = bbr->mode; + prev_min_rtt_us = bbr->min_rtt_us; -+ bbr2_check_drain(sk, rs, ctx); -+ bbr2_update_cycle_phase(sk, rs); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); + bbr_update_min_rtt(sk, rs); + + if (bbr->mode == prev_mode && + bbr->min_rtt_us == prev_min_rtt_us && -+ bbr->try_fast_path) ++ bbr->try_fast_path) { + return true; ++ } + + /* Skip model update, but control still needs to be updated */ + *update_model = false; @@ -2583,217 +2617,95 @@ index 000000000000..85f8052144d1 + return false; +} + -+static void bbr2_main(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc void bbr_main(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + struct bbr_context ctx = { 0 }; + bool update_model = true; -+ u32 bw; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; + -+ bbr->debug.event = '.'; /* init to default NOP (no event yet) */ -+ -+ bbr_update_round_start(sk, rs, &ctx); ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); + if (bbr->round_start) { + bbr->rounds_since_probe = + min_t(s32, bbr->rounds_since_probe + 1, 0xFF); -+ bbr2_update_ecn_alpha(sk); ++ ce_ratio = bbr_update_ecn_alpha(sk); + } ++ bbr_plb(sk, rs, ce_ratio); + -+ bbr->ecn_in_round |= rs->is_ece; ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); + bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); + -+ if (bbr2_fast_path(sk, &update_model, rs, &ctx)) ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) + goto out; + + if (update_model) -+ bbr2_update_model(sk, rs, &ctx); ++ bbr_update_model(sk, rs, &ctx); + + bbr_update_gains(sk); + bw = bbr_bw(sk); + bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); + bbr_set_cwnd(sk, 
rs, rs->acked_sacked, bw, bbr->cwnd_gain, -+ tp->snd_cwnd, &ctx); -+ bbr2_bound_cwnd_for_inflight_model(sk); ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); + +out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); + bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; + bbr->loss_in_cycle |= rs->lost > 0; + bbr->ecn_in_cycle |= rs->delivered_ce > 0; -+ -+ bbr_debug(sk, rs->acked_sacked, rs, &ctx); +} + -+/* Module parameters that are settable by TCP_CONGESTION_PARAMS are declared -+ * down here, so that the algorithm functions that use the parameters must use -+ * the per-socket parameters; if they accidentally use the global version -+ * then there will be a compile error. -+ * TODO(ncardwell): move all per-socket parameters down to this section. -+ */ -+ -+/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. -+ * No loss response when 0. Max allwed value is 255. -+ */ -+static u32 bbr_beta = BBR_UNIT * 30 / 100; -+ -+/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE. -+ * Max allowed value is 255. -+ */ -+static u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; /* 1/16 = 6.25% */ -+ -+/* The initial value for the ecn_alpha state variable. Default and max -+ * BBR_UNIT (256), representing 1.0. This allows a flow to respond quickly -+ * to congestion if the bottleneck is congested when the flow starts up. -+ */ -+static u32 bbr_ecn_alpha_init = BBR_UNIT; /* 1.0, to respond quickly */ -+ -+/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. -+ * No ECN based bounding when 0. Max allwed value is 255. -+ */ -+static u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ -+ -+/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. -+ * Scaled by BBR_SCALE. Disabled when 0. Max allowed is 255. -+ */ -+static u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ -+ -+/* Max RTT (in usec) at which to use sender-side ECN logic. -+ * Disabled when 0 (ECN allowed at any RTT). -+ * Max allowed for the parameter is 524287 (0x7ffff) us, ~524 ms. -+ */ -+static u32 bbr_ecn_max_rtt_us = 5000; -+ -+/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN -+ * clears then use a multiplicative increase to quickly reprobe bw by -+ * starting inflight probing at the given multiple of inflight_hi. -+ * Default for this experimental knob is 0 (disabled). -+ * Planned value for experiments: BBR_UNIT * 1 / 2 = 128, representing 0.5. -+ */ -+static u32 bbr_ecn_reprobe_gain; -+ -+/* Estimate bw probing has gone too far if loss rate exceeds this level. */ -+static u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ -+ -+/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, -+ * and loss rate is higher than bbr_loss_thresh. -+ * Disabled if 0. Max allowed value is 15 (0xF). -+ */ -+static u32 bbr_full_loss_cnt = 8; -+ -+/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh -+ * meets this count. Max allowed value is 3. -+ */ -+static u32 bbr_full_ecn_cnt = 2; -+ -+/* Fraction of unutilized headroom to try to leave in path upon high loss. */ -+static u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; -+ -+/* Multiplier to get target inflight (as multiple of BDP) for PROBE_UP phase. -+ * Default is 1.25x, as in BBR v1. Max allowed is 511. -+ */ -+static u32 bbr_bw_probe_pif_gain = BBR_UNIT * 5 / 4; -+ -+/* Multiplier to get Reno-style probe epoch duration as: k * BDP round trips. 
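
As an aside (a standalone sketch, not part of the patch): in the rewritten
bbr_is_reno_coexistence_probe_time() earlier in this file this Reno-gain knob is
gone, and the wait is simply min(bw_probe_max_rounds, estimated BDP in packets)
packet-timed round trips. The sketch below evaluates that bound for the
25 Mbit/s, 30 ms example used in the comment that follows; the 63-round cap is
the default declared there.

#include <stdio.h>

/* Probe for bw after min(BDP in packets, max_rounds) round trips, as in
 * bbr_is_reno_coexistence_probe_time().
 */
static unsigned int reno_probe_rounds(unsigned int bdp_pkts,
                                      unsigned int max_rounds)
{
        return bdp_pkts < max_rounds ? bdp_pkts : max_rounds;
}

int main(void)
{
        /* 25 Mbit/s * 30 ms / 1514-byte packets is roughly 62 packets of BDP. */
        unsigned int bdp = (unsigned int)(25e6 / 8 * 0.030 / 1514);

        printf("re-probe after %u round trips (cap 63)\n",
               reno_probe_rounds(bdp, 63));
        return 0;
}
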
-+ * If zero, disables this BBR v2 Reno-style BDP-scaled coexistence mechanism. -+ * Max allowed is 511. -+ */ -+static u32 bbr_bw_probe_reno_gain = BBR_UNIT; -+ -+/* Max number of packet-timed rounds to wait before probing for bandwidth. If -+ * we want to tolerate 1% random loss per round, and not have this cut our -+ * inflight too much, we must probe for bw periodically on roughly this scale. -+ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. -+ * We aim to be fair with Reno/CUBIC up to a BDP of at least: -+ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets -+ */ -+static u32 bbr_bw_probe_max_rounds = 63; -+ -+/* Max amount of randomness to inject in round counting for Reno-coexistence. -+ * Max value is 15. -+ */ -+static u32 bbr_bw_probe_rand_rounds = 2; -+ -+/* Use BBR-native probe time scale starting at this many usec. -+ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: -+ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs -+ */ -+static u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ -+ -+/* Use BBR-native probes spread over this many usec: */ -+static u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ -+ -+/* Undo the model changes made in loss recovery if recovery was spurious? */ -+static bool bbr_undo = true; -+ -+/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ -+static bool bbr_fast_path = true; /* default: enabled */ -+ -+/* Use fast ack mode ? */ -+static int bbr_fast_ack_mode = 1; /* default: rwnd check off */ -+ -+/* How much to additively increase inflight_hi when entering REFILL? */ -+static u32 bbr_refill_add_inc; /* default: disabled */ -+ -+module_param_named(beta, bbr_beta, uint, 0644); -+module_param_named(ecn_alpha_gain, bbr_ecn_alpha_gain, uint, 0644); -+module_param_named(ecn_alpha_init, bbr_ecn_alpha_init, uint, 0644); -+module_param_named(ecn_factor, bbr_ecn_factor, uint, 0644); -+module_param_named(ecn_thresh, bbr_ecn_thresh, uint, 0644); -+module_param_named(ecn_max_rtt_us, bbr_ecn_max_rtt_us, uint, 0644); -+module_param_named(ecn_reprobe_gain, bbr_ecn_reprobe_gain, uint, 0644); -+module_param_named(loss_thresh, bbr_loss_thresh, uint, 0664); -+module_param_named(full_loss_cnt, bbr_full_loss_cnt, uint, 0664); -+module_param_named(full_ecn_cnt, bbr_full_ecn_cnt, uint, 0664); -+module_param_named(inflight_headroom, bbr_inflight_headroom, uint, 0664); -+module_param_named(bw_probe_pif_gain, bbr_bw_probe_pif_gain, uint, 0664); -+module_param_named(bw_probe_reno_gain, bbr_bw_probe_reno_gain, uint, 0664); -+module_param_named(bw_probe_max_rounds, bbr_bw_probe_max_rounds, uint, 0664); -+module_param_named(bw_probe_rand_rounds, bbr_bw_probe_rand_rounds, uint, 0664); -+module_param_named(bw_probe_base_us, bbr_bw_probe_base_us, uint, 0664); -+module_param_named(bw_probe_rand_us, bbr_bw_probe_rand_us, uint, 0664); -+module_param_named(undo, bbr_undo, bool, 0664); -+module_param_named(fast_path, bbr_fast_path, bool, 0664); -+module_param_named(fast_ack_mode, bbr_fast_ack_mode, uint, 0664); -+module_param_named(refill_add_inc, bbr_refill_add_inc, uint, 0664); -+ -+static void bbr2_init(struct sock *sk) ++__bpf_kfunc static void bbr_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + -+ bbr_init(sk); /* run shared init code for v1 and v2 */ -+ -+ /* BBR v2 parameters: */ -+ bbr->params.beta = min_t(u32, 0xFFU, bbr_beta); -+ bbr->params.ecn_alpha_gain = min_t(u32, 0xFFU, bbr_ecn_alpha_gain); -+ 
bbr->params.ecn_alpha_init = min_t(u32, BBR_UNIT, bbr_ecn_alpha_init); -+ bbr->params.ecn_factor = min_t(u32, 0xFFU, bbr_ecn_factor); -+ bbr->params.ecn_thresh = min_t(u32, 0xFFU, bbr_ecn_thresh); -+ bbr->params.ecn_max_rtt_us = min_t(u32, 0x7ffffU, bbr_ecn_max_rtt_us); -+ bbr->params.ecn_reprobe_gain = min_t(u32, 0x1FF, bbr_ecn_reprobe_gain); -+ bbr->params.loss_thresh = min_t(u32, 0xFFU, bbr_loss_thresh); -+ bbr->params.full_loss_cnt = min_t(u32, 0xFU, bbr_full_loss_cnt); -+ bbr->params.full_ecn_cnt = min_t(u32, 0x3U, bbr_full_ecn_cnt); -+ bbr->params.inflight_headroom = -+ min_t(u32, 0xFFU, bbr_inflight_headroom); -+ bbr->params.bw_probe_pif_gain = -+ min_t(u32, 0x1FFU, bbr_bw_probe_pif_gain); -+ bbr->params.bw_probe_reno_gain = -+ min_t(u32, 0x1FFU, bbr_bw_probe_reno_gain); -+ bbr->params.bw_probe_max_rounds = -+ min_t(u32, 0xFFU, bbr_bw_probe_max_rounds); -+ bbr->params.bw_probe_rand_rounds = -+ min_t(u32, 0xFU, bbr_bw_probe_rand_rounds); -+ bbr->params.bw_probe_base_us = -+ min_t(u32, (1 << 26) - 1, bbr_bw_probe_base_us); -+ bbr->params.bw_probe_rand_us = -+ min_t(u32, (1 << 26) - 1, bbr_bw_probe_rand_us); -+ bbr->params.undo = bbr_undo; -+ bbr->params.fast_path = bbr_fast_path ? 1 : 0; -+ bbr->params.refill_add_inc = min_t(u32, 0x3U, bbr_refill_add_inc); -+ -+ /* BBR v2 state: */ + bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ + /* Start sampling ECN mark rate after first full flight is ACKed: */ + bbr->loss_round_delivered = tp->delivered + 1; + bbr->loss_round_start = 0; @@ -2802,12 +2714,13 @@ index 000000000000..85f8052144d1 + bbr->undo_inflight_hi = 0; + bbr->loss_events_in_round = 0; + bbr->startup_ecn_rounds = 0; -+ bbr2_reset_congestion_signals(sk); ++ bbr_reset_congestion_signals(sk); + bbr->bw_lo = ~0U; + bbr->bw_hi[0] = 0; + bbr->bw_hi[1] = 0; + bbr->inflight_lo = ~0U; + bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); + bbr->bw_probe_up_cnt = ~0U; + bbr->bw_probe_up_acks = 0; + bbr->bw_probe_up_rounds = 0; @@ -2818,31 +2731,43 @@ index 000000000000..85f8052144d1 + bbr->bw_probe_samples = 0; + bbr->prev_probe_too_high = 0; + bbr->ecn_eligible = 0; -+ bbr->ecn_alpha = bbr->params.ecn_alpha_init; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); + bbr->alpha_last_delivered = 0; + bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; + -+ tp->fast_ack_mode = min_t(u32, 0x2U, bbr_fast_ack_mode); ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 
1 : 0; + -+ if ((tp->ecn_flags & TCP_ECN_OK) && bbr_ecn_enable) ++ if (bbr_can_use_ecn(sk)) + tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; +} + -+/* Core TCP stack informs us that the given skb was just marked lost. */ -+static void bbr2_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); -+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); -+ struct rate_sample rs; + -+ /* Capture "current" data over the full round trip of loss, -+ * to have a better chance to see the full capacity of the path. -+ */ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ + if (!bbr->loss_in_round) /* first loss in this round trip? */ + bbr->loss_round_delivered = tp->delivered; /* set round trip */ + bbr->loss_in_round = 1; + bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); + + if (!bbr->bw_probe_samples) + return; /* not an skb sent while probing for bandwidth */ @@ -2852,178 +2777,214 @@ index 000000000000..85f8052144d1 + * estimates what happened in the flight leading up to this lost skb, + * then see if the loss rate went too high, and if so at which packet. + */ -+ memset(&rs, 0, sizeof(rs)); + rs.tx_in_flight = scb->tx.in_flight; + rs.lost = tp->lost - scb->tx.lost; + rs.is_app_limited = scb->tx.is_app_limited; -+ if (bbr2_is_inflight_too_high(sk, &rs)) { -+ rs.tx_in_flight = bbr2_inflight_hi_from_lost_skb(sk, &rs, skb); -+ bbr2_handle_inflight_too_high(sk, &rs); ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); + } +} + -+/* Revert short-term model if current loss recovery event was spurious. */ -+static u32 bbr2_undo_cwnd(struct sock *sk) ++static void bbr_run_loss_probe_recovery(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; + -+ bbr->debug.undo = 1; -+ bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ -+ bbr->full_bw_cnt = 0; ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
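
As an aside (a standalone sketch, not part of the patch): both bbr_skb_marked_lost()
and the TLP recovery path above feed a reconstructed rate sample into
bbr_is_inflight_too_high(), which boils down to a 2% loss-rate test against the
data that was in flight plus a 50% CE-mark test against delivered packets (the
loss_thresh and ecn_thresh defaults in this patch). A user-space sketch of those
two threshold checks, with invented sample numbers:

#include <stdint.h>
#include <stdio.h>

#define BBR_SCALE 8
#define BBR_UNIT  (1 << BBR_SCALE)

/* "Too high" if losses exceed 2% of tx_in_flight or CE marks exceed 50%
 * of delivered packets, mirroring bbr_is_inflight_too_high().
 */
static int inflight_too_high(uint32_t lost, uint32_t tx_in_flight,
                             uint32_t delivered_ce, uint32_t delivered)
{
        uint32_t loss_thresh = BBR_UNIT * 2 / 100;      /* 2% */
        uint32_t ecn_thresh = BBR_UNIT / 2;             /* 50% */

        if (lost && tx_in_flight &&
            lost > ((uint64_t)tx_in_flight * loss_thresh >> BBR_SCALE))
                return 1;
        if (delivered_ce && delivered &&
            delivered_ce > ((uint64_t)delivered * ecn_thresh >> BBR_SCALE))
                return 1;
        return 0;
}

int main(void)
{
        printf("3 lost out of 100 in flight: %s\n",
               inflight_too_high(3, 100, 0, 0) ? "too high" : "ok");
        printf("1 lost out of 100 in flight: %s\n",
               inflight_too_high(1, 100, 0, 0) ? "too high" : "ok");
        return 0;
}
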
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ + bbr->loss_in_round = 0; + -+ if (!bbr->params.undo) -+ return tp->snd_cwnd; -+ + /* Revert to cwnd and other state saved before loss episode. */ + bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); + bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); + bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ + return bbr->prior_cwnd; +} + +/* Entering loss recovery, so save state for when we undo recovery. */ -+static u32 bbr2_ssthresh(struct sock *sk) -+{ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { + struct bbr *bbr = inet_csk_ca(sk); + -+ bbr_save_cwnd(sk); + bbr_save_cwnd(sk); + /* For undo, save state that adapts based on loss signal. */ + bbr->undo_bw_lo = bbr->bw_lo; + bbr->undo_inflight_lo = bbr->inflight_lo; + bbr->undo_inflight_hi = bbr->inflight_hi; -+ return tcp_sk(sk)->snd_ssthresh; -+} -+ -+static enum tcp_bbr2_phase bbr2_get_phase(struct bbr *bbr) + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) +{ + switch (bbr->mode) { + case BBR_STARTUP: -+ return BBR2_PHASE_STARTUP; ++ return BBR_PHASE_STARTUP; + case BBR_DRAIN: -+ return BBR2_PHASE_DRAIN; ++ return BBR_PHASE_DRAIN; + case BBR_PROBE_BW: + break; + case BBR_PROBE_RTT: -+ return BBR2_PHASE_PROBE_RTT; ++ return BBR_PHASE_PROBE_RTT; + default: -+ return BBR2_PHASE_INVALID; ++ return BBR_PHASE_INVALID; + } + switch (bbr->cycle_idx) { + case BBR_BW_PROBE_UP: -+ return BBR2_PHASE_PROBE_BW_UP; ++ return BBR_PHASE_PROBE_BW_UP; + case BBR_BW_PROBE_DOWN: -+ return BBR2_PHASE_PROBE_BW_DOWN; ++ return BBR_PHASE_PROBE_BW_DOWN; + case BBR_BW_PROBE_CRUISE: -+ return BBR2_PHASE_PROBE_BW_CRUISE; ++ return BBR_PHASE_PROBE_BW_CRUISE; + case BBR_BW_PROBE_REFILL: -+ return BBR2_PHASE_PROBE_BW_REFILL; ++ return BBR_PHASE_PROBE_BW_REFILL; + default: -+ return BBR2_PHASE_INVALID; ++ return BBR_PHASE_INVALID; + } +} + -+static size_t bbr2_get_info(struct sock *sk, u32 ext, int *attr, + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) + union tcp_cc_info *info) -+{ -+ if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || -+ ext & (1 << (INET_DIAG_VEGASINFO - 1))) { -+ struct bbr *bbr = inet_csk_ca(sk); + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; + u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); + u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); + u64 bw_lo = bbr->bw_lo == ~0U ? 
+ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; + -+ memset(&info->bbr2, 0, sizeof(info->bbr2)); -+ info->bbr2.bbr_bw_lsb = (u32)bw; -+ info->bbr2.bbr_bw_msb = (u32)(bw >> 32); -+ info->bbr2.bbr_min_rtt = bbr->min_rtt_us; -+ info->bbr2.bbr_pacing_gain = bbr->pacing_gain; -+ info->bbr2.bbr_cwnd_gain = bbr->cwnd_gain; -+ info->bbr2.bbr_bw_hi_lsb = (u32)bw_hi; -+ info->bbr2.bbr_bw_hi_msb = (u32)(bw_hi >> 32); -+ info->bbr2.bbr_bw_lo_lsb = (u32)bw_lo; -+ info->bbr2.bbr_bw_lo_msb = (u32)(bw_lo >> 32); -+ info->bbr2.bbr_mode = bbr->mode; -+ info->bbr2.bbr_phase = (__u8)bbr2_get_phase(bbr); -+ info->bbr2.bbr_version = (__u8)2; -+ info->bbr2.bbr_inflight_lo = bbr->inflight_lo; -+ info->bbr2.bbr_inflight_hi = bbr->inflight_hi; -+ info->bbr2.bbr_extra_acked = bbr_extra_acked(sk); -+ *attr = INET_DIAG_BBRINFO; -+ return sizeof(info->bbr2); -+ } -+ return 0; -+} -+ -+static void bbr2_set_state(struct sock *sk, u8 new_state) -+{ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { + struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (new_state == TCP_CA_Loss) { -+ struct rate_sample rs = { .losses = 1 }; -+ struct bbr_context ctx = { 0 }; -+ -+ bbr->prev_ca_state = TCP_CA_Loss; -+ bbr->full_bw = 0; -+ if (!bbr2_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { + /* bbr_adapt_lower_bounds() needs cwnd before + * we suffered an RTO, to update inflight_lo: + */ + bbr->inflight_lo = -+ max(tp->snd_cwnd, bbr->prior_cwnd); ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); + } -+ bbr_debug(sk, 0, &rs, &ctx); + } else if (bbr->prev_ca_state == TCP_CA_Loss && + new_state != TCP_CA_Loss) { -+ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); -+ bbr->try_fast_path = 0; /* bound cwnd using latest model */ -+ } -+} ++ bbr_exit_loss_recovery(sk); + } + } + + -+static struct tcp_congestion_ops tcp_bbr2_cong_ops __read_mostly = { + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, + .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, -+ .name = "bbr2", -+ .owner = THIS_MODULE, -+ .init = 
bbr2_init, -+ .cong_control = bbr2_main, -+ .sndbuf_expand = bbr_sndbuf_expand, -+ .skb_marked_lost = bbr2_skb_marked_lost, -+ .undo_cwnd = bbr2_undo_cwnd, -+ .cwnd_event = bbr_cwnd_event, -+ .ssthresh = bbr2_ssthresh, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, + .tso_segs = bbr_tso_segs, -+ .get_info = bbr2_get_info, -+ .set_state = bbr2_set_state, -+}; -+ -+static int __init bbr_register(void) -+{ -+ BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); -+ return tcp_register_congestion_control(&tcp_bbr2_cong_ops); -+} -+ -+static void __exit bbr_unregister(void) -+{ -+ tcp_unregister_congestion_control(&tcp_bbr2_cong_ops); -+} -+ -+module_init(bbr_register); -+module_exit(bbr_unregister); -+ -+MODULE_AUTHOR("Van Jacobson "); -+MODULE_AUTHOR("Neal Cardwell "); -+MODULE_AUTHOR("Yuchung Cheng "); -+MODULE_AUTHOR("Soheil Hassas Yeganeh "); + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1160,10 +2361,11 @@ BTF_SET8_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + #endif + #endif +@@ -1198,5 +2400,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); +MODULE_AUTHOR("Priyaranjan Jha "); +MODULE_AUTHOR("Yousuk Seung "); +MODULE_AUTHOR("Kevin Yang "); +MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); + -+MODULE_LICENSE("Dual BSD/GPL"); -+MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 1b34050a7538..66d40449b3f4 100644 --- a/net/ipv4/tcp_cong.c @@ -3037,7 +2998,7 @@ index 1b34050a7538..66d40449b3f4 100644 icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c -index 57c8af1859c1..3193ef5aac61 100644 +index 57c8af1859c1..2195ba488142 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -349,7 +349,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) @@ -3089,7 +3050,37 @@ index 57c8af1859c1..3193ef5aac61 100644 /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK * code can come after this skb later on it's better to keep -@@ -3819,6 +3835,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -3688,7 +3704,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. 
Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3705,6 +3722,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3715,6 +3733,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. ++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -3819,6 +3842,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); @@ -3097,7 +3088,16 @@ index 57c8af1859c1..3193ef5aac61 100644 /* ts_recent update must be made after we are sure that the packet * is in window. -@@ -3917,6 +3934,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -3893,7 +3917,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_rack_update_reo_wnd(sk, &rs); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -3917,6 +3941,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tcp_newly_delivered(sk, delivered, flag); lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); @@ -3105,7 +3105,16 @@ index 57c8af1859c1..3193ef5aac61 100644 tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); -@@ -5527,13 +5545,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) +@@ -3936,7 +3961,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5527,13 +5552,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && @@ -3122,11 +3131,46 @@ index 57c8af1859c1..3193ef5aac61 100644 /* We ACK each frame or... 
*/ tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index c8f2aa003387..fdf51e436899 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -440,6 +440,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 2cb39b6dad02..703d166c1778 100644 +index 51d8638d4b4c..2fb064057868 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c -@@ -377,7 +377,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, +@@ -325,10 +325,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -340,6 +339,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -377,7 +379,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, th->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } @@ -3136,7 +3180,7 @@ index 2cb39b6dad02..703d166c1778 100644 /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } -@@ -1532,7 +1533,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, +@@ -1532,7 +1535,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; @@ -3145,23 +3189,38 @@ index 2cb39b6dad02..703d166c1778 100644 long limit; int nlen; u8 flags; -@@ -1607,6 +1608,15 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, +@@ -1607,6 +1610,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (diff) tcp_adjust_pcount(sk, skb, diff); + -+ /* Set buff tx.in_flight as if buff were sent by itself. */ + inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; -+ if (WARN_ONCE(inflight_prev < 0, -+ "inconsistent: tx.in_flight: %u old_factor: %d", -+ TCP_SKB_CB(skb)->tx.in_flight, old_factor)) ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); + inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ + TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + + tcp_skb_pcount(buff); } /* Link BUFF into the send queue. 
*/ -@@ -1982,13 +1992,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, +@@ -1982,13 +2009,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; @@ -3176,11 +3235,11 @@ index 2cb39b6dad02..703d166c1778 100644 + tso_segs = ca_ops->tso_segs ? + ca_ops->tso_segs(sk, mss_now) : + tcp_tso_autosize(sk, mss_now, -+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs)); ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); return min_t(u32, tso_segs, sk->sk_gso_max_segs); } -@@ -2674,6 +2683,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2674,6 +2700,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); @@ -3188,8 +3247,16 @@ index 2cb39b6dad02..703d166c1778 100644 goto repair; /* Skip network transmission */ } +@@ -2886,6 +2913,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c -index a8f6d9d06f2e..a8b4c9504570 100644 +index a8f6d9d06f2e..8737f2134648 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -34,6 +34,24 @@ @@ -3237,8 +3304,8 @@ index a8f6d9d06f2e..a8b4c9504570 100644 rs->prior_mstamp = scb->tx.delivered_mstamp; rs->is_app_limited = scb->tx.is_app_limited; rs->is_retrans = scb->sacked & TCPCB_RETRANS; - rs->last_end_seq = scb->end_seq; + rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; /* Record send time of most recently ACKed packet: */ tp->first_tx_mstamp = tx_tstamp; @@ -3283,10 +3350,10 @@ index 470f581eedd4..2b8d7e94a369 100644 -- 2.41.0 -From 15fb201317f2aaf349c0929478acd92a068be6d1 Mon Sep 17 00:00:00 2001 +From 883b0afdb45d6c4944bf6b917196870726ce0caa Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 10 Jul 2023 17:09:03 +0200 -Subject: [PATCH 2/7] cachy +Date: Mon, 31 Jul 2023 12:19:39 +0200 +Subject: [PATCH 2/5] cachy Signed-off-by: Peter Jung --- @@ -3367,10 +3434,10 @@ index a1457995fd41..0b33c7960259 100644 Safety option to keep boot IRQs enabled. This should never be necessary. 
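Since the bbr3 series above re-registers the congestion control under the plain name "bbr" (replacing "bbr2"), per-socket selection from user space works the usual way. A minimal user-space sketch, not part of the patch, assuming the rebuilt tcp_bbr module is loaded and permitted on the system:

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = socket(AF_INET, SOCK_STREAM, 0);
            if (fd < 0)
                    return 1;
            /* Request the "bbr" congestion control for this socket only;
             * setsockopt() fails (e.g. ENOENT) if "bbr" is unavailable. */
            static const char name[] = "bbr";
            if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
                           name, sizeof(name)) < 0) {
                    close(fd);
                    return 1;
            }
            close(fd);
            return 0;
    }

The system-wide default is still chosen through the net.ipv4.tcp_congestion_control sysctl; the sketch above only overrides it for one socket.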
diff --git a/Makefile b/Makefile -index 47690c28456a..79abb476e260 100644 +index 653238528aac..32ab6e225c91 100644 --- a/Makefile +++ b/Makefile -@@ -819,6 +819,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) +@@ -831,6 +831,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE KBUILD_CFLAGS += -O2 KBUILD_RUSTFLAGS += -Copt-level=2 @@ -3380,7 +3447,7 @@ index 47690c28456a..79abb476e260 100644 else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os KBUILD_RUSTFLAGS += -Copt-level=s -@@ -1064,11 +1067,6 @@ KBUILD_CFLAGS += -fno-strict-overflow +@@ -1076,11 +1079,6 @@ KBUILD_CFLAGS += -fno-strict-overflow # Make sure -fstack-check isn't enabled (like gentoo apparently did) KBUILD_CFLAGS += -fno-stack-check @@ -9110,7 +9177,7 @@ index 000000000000..77a6677ec19e +MODULE_DESCRIPTION("Steam Deck ACPI platform driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mm.h b/include/linux/mm.h -index 2dd73e4f3d8e..e0706755c7c3 100644 +index 406ab9ea818f..17794c213055 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -191,7 +191,7 @@ static inline void __mm_zero_struct_page(struct page *page) @@ -9290,7 +9357,7 @@ index d2e12b6d2b18..95ca80492a37 100644 if (err) goto bad_unshare_out; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index a80a73909dc2..b097a9f4d817 100644 +index b3e25be58e2b..2c335df30171 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -69,9 +69,13 @@ @@ -9506,2599 +9573,10 @@ index 1080209a568b..f76aa8268215 100644 -- 2.41.0 -From 924ab3ea3113d6e31ad314896faee2c528d917ac Mon Sep 17 00:00:00 2001 +From 0a48385ee928e0a277eb626a86efe9d4aec339f3 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 10 Jul 2023 17:09:16 +0200 -Subject: [PATCH 3/7] ddcci - -Signed-off-by: Peter Jung ---- - drivers/char/Kconfig | 11 + - drivers/char/Makefile | 1 + - drivers/char/ddcci.c | 1909 +++++++++++++++++++++ - drivers/video/backlight/Kconfig | 11 + - drivers/video/backlight/Makefile | 1 + - drivers/video/backlight/ddcci-backlight.c | 413 +++++ - include/linux/ddcci.h | 164 ++ - 7 files changed, 2510 insertions(+) - create mode 100644 drivers/char/ddcci.c - create mode 100644 drivers/video/backlight/ddcci-backlight.c - create mode 100644 include/linux/ddcci.h - -diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig -index 625af75833fc..3930aeb8e17b 100644 ---- a/drivers/char/Kconfig -+++ b/drivers/char/Kconfig -@@ -422,4 +422,15 @@ config ADI - and SSM (Silicon Secured Memory). Intended consumers of this - driver include crash and makedumpfile. - -+config DDCCI -+ tristate "DDCCI display protocol support" -+ depends on I2C -+ help -+ Display Data Channel Command Interface is an -+ interface that allows the kernel to "talk" -+ to most displays made after 2005. Check your -+ display's specification to see if it has -+ support for this. This depends on I2C to -+ compile. -+ - endmenu -diff --git a/drivers/char/Makefile b/drivers/char/Makefile -index c5f532e412f1..b12476014311 100644 ---- a/drivers/char/Makefile -+++ b/drivers/char/Makefile -@@ -3,6 +3,7 @@ - # Makefile for the kernel character device drivers. 
- # - -+obj-$(CONFIG_DDCCI) += ddcci.o - obj-y += mem.o random.o - obj-$(CONFIG_TTY_PRINTK) += ttyprintk.o - obj-y += misc.o -diff --git a/drivers/char/ddcci.c b/drivers/char/ddcci.c -new file mode 100644 -index 000000000000..129aede43651 ---- /dev/null -+++ b/drivers/char/ddcci.c -@@ -0,0 +1,1909 @@ -+/* -+ * DDC/CI sub-bus driver -+ * -+ * Copyright (c) 2015 Christoph Grenz -+ */ -+ -+/* -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the Free -+ * Software Foundation; either version 2 of the License, or (at your option) -+ * any later version. -+ */ -+ -+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#define DDCCI_RECV_BUFFER_SIZE 130 -+#define DEVICE_NAME "ddcci" -+#define DDCCI_MAX_CAP_CHUNKS 200 -+ -+static unsigned int delay = 60; -+static unsigned short autoprobe_addrs[127] = {0xF0, 0xF2, 0xF4, 0xF6, 0xF8}; -+static int autoprobe_addr_count = 5; -+ -+static dev_t ddcci_cdev_first; -+static dev_t ddcci_cdev_next; -+static dev_t ddcci_cdev_end; -+static DEFINE_SEMAPHORE(core_lock, 1); -+ -+struct bus_type ddcci_bus_type; -+EXPORT_SYMBOL_GPL(ddcci_bus_type); -+static bool ddcci_bus_registered; -+ -+/* Assert neccessary string array sizes */ -+#ifndef sizeof_field -+# define sizeof_field(t,m) FIELD_SIZEOF(t,m) -+#endif -+static_assert(sizeof_field(struct ddcci_device, prot) > 8); -+static_assert(sizeof_field(struct ddcci_device, type) > 8); -+static_assert(sizeof_field(struct ddcci_device, model) > 8); -+static_assert(sizeof_field(struct ddcci_device, vendor) > 8); -+static_assert(sizeof_field(struct ddcci_device, module) > 8); -+ -+/* Internal per-i2c-client driver data */ -+struct ddcci_bus_drv_data { -+ unsigned long quirks; -+ struct i2c_client *i2c_dev; -+ struct semaphore sem; -+ unsigned char recv_buffer[DDCCI_RECV_BUFFER_SIZE]; -+}; -+ -+/* Replace non-alphanumeric characters in a string (used for modalias) */ -+static void ddcci_modalias_clean(char *string, size_t n, char replacement) -+{ -+ int i; -+ for (i = 0; i < n; ++i) { -+ char c = string[i]; -+ if (c == 0) { -+ return; -+ } else if (c < '0' || (c > '9' && c < 'A') || (c > 'Z' && c < 'a') || c > 'z') { -+ string[i] = replacement; -+ } -+ } -+} -+ -+/* Write a message to the DDC/CI bus using i2c_smbus_write_byte() */ -+static int __ddcci_write_bytewise(struct i2c_client *client, unsigned char addr, -+ bool p_flag, const unsigned char * __restrict buf, -+ unsigned char len) -+{ -+ int ret = 0; -+ unsigned char outer_addr = (unsigned char)(client->addr << 1); -+ unsigned xor = outer_addr; /* initial xor value */ -+ -+ /* Consistency checks */ -+ if (len > 127) -+ return -EINVAL; -+ -+ /* Special case: sender to 0x6E is always 0x51 */ -+ if (addr == DDCCI_DEFAULT_DEVICE_ADDR) { -+ addr = DDCCI_HOST_ADDR_ODD; -+ } else { -+ /* When sending the odd address is used */ -+ addr = addr | 1; -+ } -+ -+ /* first byte: sender address */ -+ xor ^= addr; -+ ret = i2c_smbus_write_byte(client, addr); -+ if (ret < 0) -+ return ret; -+ -+ /* second byte: protocol flag and message size */ -+ xor ^= ((p_flag << 7) | len); -+ ret = i2c_smbus_write_byte(client, (p_flag << 7)|len); -+ if (ret < 0) -+ return ret; -+ -+ /* send payload */ -+ while (len--) { -+ xor ^= (*buf); -+ ret = i2c_smbus_write_byte(client, (*buf)); -+ if (ret < 0) -+ return ret; -+ buf++; -+ } -+ -+ /* send checksum */ 
-+ ret = i2c_smbus_write_byte(client, xor); -+ return ret; -+} -+ -+/* Write a message to the DDC/CI bus using i2c_master_send() */ -+static int __ddcci_write_block(struct i2c_client *client, unsigned char addr, -+ unsigned char *sendbuf, bool p_flag, -+ const unsigned char *data, unsigned char len) -+{ -+ unsigned char outer_addr = (unsigned char)(client->addr << 1); -+ unsigned xor = outer_addr; /* initial xor value */ -+ unsigned char *ptr = sendbuf; -+ -+ /* Consistency checks */ -+ if (len > 127) -+ return -EINVAL; -+ -+ /* Special case: sender to 0x6E is always 0x51 */ -+ if (addr == DDCCI_DEFAULT_DEVICE_ADDR) { -+ addr = DDCCI_HOST_ADDR_ODD; -+ } else { -+ /* When sending the odd address is used */ -+ addr = addr | 1; -+ } -+ -+ /* first byte: sender address */ -+ xor ^= addr; -+ *(ptr++) = addr; -+ /* second byte: protocol flag and message size */ -+ xor ^= ((p_flag << 7) | len); -+ *(ptr++) = (p_flag << 7)|len; -+ /* payload */ -+ while (len--) { -+ xor ^= (*data); -+ *(ptr++) = (*data); -+ data++; -+ } -+ /* checksum */ -+ (*ptr) = xor; -+ -+ /* Send it */ -+ return i2c_master_send(client, sendbuf, ptr - sendbuf + 1); -+} -+ -+/* -+ * Write a message to the DDC/CI bus. -+ * -+ * You must hold the bus semaphore when calling this function. -+ */ -+static int ddcci_write(struct i2c_client *client, unsigned char addr, -+ bool p_flag, const unsigned char *data, -+ unsigned char len) -+{ -+ struct ddcci_bus_drv_data *drv_data; -+ unsigned char *sendbuf; -+ int ret; -+ -+ drv_data = i2c_get_clientdata(client); -+ -+ -+ pr_debug("sending to %d:%02x:%02x: %*ph\n", client->adapter->nr, -+ client->addr << 1, addr, len, data); -+ if (drv_data->quirks & DDCCI_QUIRK_WRITE_BYTEWISE) { -+ ret = __ddcci_write_bytewise(client, addr, p_flag, data, len); -+ } else { -+ sendbuf = drv_data->recv_buffer; -+ ret = __ddcci_write_block(client, addr, sendbuf, p_flag, data, len); -+ } -+ -+ return ret; -+} -+ -+/* -+ * Read a response from the DDC/CI bus with headers directly into a buffer. -+ * Always check for DDCCI_QUIRK_SKIP_FIRST_BYTE when using this function. -+ * The returned length contains the whole unmodified response. -+ * If -EMSGSIZE is returned, the buffer contains the response up to `len`. -+ * If any other negative error code is returned, the buffer content is -+ * unspecified. -+ */ -+static int __ddcci_read(struct i2c_client *client, unsigned char addr, -+ bool p_flag, unsigned long quirks, unsigned char *buf, -+ unsigned char len) -+{ -+ int i, payload_len, packet_length, ret; -+ unsigned char xor = DDCCI_HOST_ADDR_EVEN; -+ -+ /* Consistency checks */ -+ if (len < 3) -+ return -EINVAL; -+ -+ /* Read frame */ -+ ret = i2c_master_recv(client, buf, len); -+ if (ret < 0) -+ goto out_err; -+ packet_length = ret; -+ -+ /* Skip first byte if quirk active */ -+ if ((quirks & DDCCI_QUIRK_SKIP_FIRST_BYTE) && ret > 0 && len > 0) { -+ ret--; -+ len--; -+ buf++; -+ } -+ -+ /* If answer too short (= incomplete) break out */ -+ if (ret < 3) { -+ ret = -EIO; -+ goto out_err; -+ } -+ -+ /* validate first byte */ -+ if (unlikely(buf[0] != addr)) { -+ ret = (buf[0] == '\0') ? 
-EAGAIN : -EIO; -+ goto out_err; -+ } -+ -+ /* validate second byte (protocol flag) */ -+ if (unlikely((buf[1] & 0x80) != (p_flag << 7))) { -+ if (!p_flag || !(quirks & DDCCI_QUIRK_NO_PFLAG)) { -+ ret = -EIO; -+ goto out_err; -+ } -+ } -+ -+ /* get and check payload length */ -+ payload_len = buf[1] & 0x7F; -+ if (3+payload_len > packet_length) -+ return -EBADMSG; -+ if (3+payload_len > len) -+ return -EMSGSIZE; -+ -+ /* calculate checksum */ -+ for (i = 0; i < 3+payload_len; i++) -+ xor ^= buf[i]; -+ -+ /* verify checksum */ -+ if (xor != 0) { -+ dev_err(&client->dev, "invalid DDC/CI response, corrupted data - xor is 0x%02x, length 0x%02x\n", -+ xor, payload_len); -+ ret = -EBADMSG; -+ goto out_err; -+ } -+ -+ /* return result */ -+ ret = payload_len+3+((quirks & DDCCI_QUIRK_SKIP_FIRST_BYTE)?1:0); -+ -+out_err: -+ return ret; -+} -+ -+/* -+ * Read a response from the DDC/CI bus -+ * -+ * You must hold the bus semaphore when calling this function. -+ */ -+static int ddcci_read(struct i2c_client *client, unsigned char addr, -+ bool p_flag, unsigned char *buf, unsigned char len) -+{ -+ struct ddcci_bus_drv_data *drv_data; -+ unsigned char *recvbuf; -+ int ret; -+ -+ drv_data = i2c_get_clientdata(client); -+ recvbuf = drv_data->recv_buffer; -+ -+ /* Read frame */ -+ ret = __ddcci_read(client, addr, p_flag, -+ drv_data->quirks, recvbuf, DDCCI_RECV_BUFFER_SIZE); -+ if (ret < 0) -+ return ret; -+ -+ if (drv_data->quirks & DDCCI_QUIRK_SKIP_FIRST_BYTE) -+ recvbuf++; -+ -+ /* return result */ -+ if (buf) { -+ if (ret > 3) { -+ ret = ret-3; -+ /* copy to caller buffer */ -+ memcpy(buf, &recvbuf[2], (ret < len) ? ret : len); -+ -+ if (ret > len) { -+ /* if message was truncated, return -EMSGSIZE */ -+ pr_debug("received from %d:%02x:%02x: [%u/%u] %*ph ...\n", -+ client->adapter->nr, client->addr << 1, -+ addr, ret, len, len, buf); -+ ret = -EMSGSIZE; -+ } else { -+ pr_debug("received from %d:%02x:%02x: [%u/%u] %*ph\n", -+ client->adapter->nr, client->addr << 1, -+ addr, ret, len, ret, buf); -+ } -+ } -+ } -+ if (!(drv_data->quirks & DDCCI_QUIRK_WRITE_BYTEWISE)) { -+ /* second read to clear buffers, needed on some devices */ -+ __ddcci_read(client, addr, true, drv_data->quirks, recvbuf, 1); -+ } -+ return ret; -+} -+ -+/* Request the capability string for a device and put it into buf */ -+static int ddcci_get_caps(struct i2c_client *client, unsigned char addr, -+ unsigned char *buf, unsigned int len) -+{ -+ int result = 0, counter = 0, offset = 0; -+ unsigned char cmd[3] = { DDCCI_COMMAND_CAPS, 0x00, 0x00 }; -+ unsigned char *chunkbuf = kzalloc(35, GFP_KERNEL); -+ -+ if (!chunkbuf) -+ return -ENOMEM; -+ -+ do { -+ /* Send command */ -+ result = ddcci_write(client, addr, true, cmd, sizeof(cmd)); -+ if (result < 0) -+ goto err_free; -+ msleep(delay); -+ /* read result chunk */ -+ result = ddcci_read(client, addr, true, chunkbuf, -+ (len > 32) ? 35 : len+3); -+ if (result < 0) -+ goto err_free; -+ -+ if (result > 0) { -+ /* check chunk header */ -+ if (chunkbuf[0] != DDCCI_REPLY_CAPS) { -+ result = -EIO; -+ goto err_free; -+ } -+ if (chunkbuf[1] != cmd[1] || chunkbuf[2] != cmd[2]) { -+ result = -EIO; -+ goto err_free; -+ } -+ if (result < 3) { -+ result = -EIO; -+ goto err_free; -+ } -+ memcpy(buf, chunkbuf+3, min((unsigned int)result-3, len)); -+ -+ counter++; -+ /* adjust offset, etc. */ -+ offset += result-3; -+ len -= result-3; -+ buf += result-3; -+ cmd[1] = offset >> 8; -+ cmd[2] = offset & 0xFF; -+ /* Another superfluous read to make some devices happy... 
*/ -+ ddcci_read(client, addr, true, NULL, 2); -+ } -+ } while (result > 3 && counter < DDCCI_MAX_CAP_CHUNKS); -+ -+ kfree(chunkbuf); -+ return offset+result-3; -+err_free: -+ kfree(chunkbuf); -+ return result; -+} -+ -+/* -+ * Request the device identification and put it into buf. -+ * -+ * Also detects all communication quirks and sets the corresponding flags -+ * in the ddcci_bus_drv_data structure associated with client. -+ * -+ * The identification command will fail on most DDC devices, as it is optional -+ * to support, but even the "failed" response suffices to detect quirks. -+ */ -+static int ddcci_identify_device(struct i2c_client *client, unsigned char addr, -+ unsigned char *buf, unsigned char len) -+{ -+ int i, payload_len, ret = -ENODEV; -+ unsigned long quirks; -+ unsigned char cmd[1] = { DDCCI_COMMAND_ID }; -+ unsigned char *buffer; -+ unsigned char xor = DDCCI_HOST_ADDR_EVEN; -+ struct ddcci_bus_drv_data *bus_drv_data; -+ -+ bus_drv_data = i2c_get_clientdata(client); -+ quirks = bus_drv_data->quirks; -+ buffer = bus_drv_data->recv_buffer; -+ -+ /* Send Identification command */ -+ if (!(quirks & DDCCI_QUIRK_WRITE_BYTEWISE)) { -+ ret = __ddcci_write_block(client, addr, buffer, true, cmd, sizeof(cmd)); -+ dev_dbg(&client->dev, -+ "[%02x:%02x] writing identification command in block mode: %d\n", -+ client->addr << 1, addr, ret); -+ if ((ret == -ENXIO) -+ && i2c_check_functionality(client->adapter, -+ I2C_FUNC_SMBUS_WRITE_BYTE)) { -+ quirks |= DDCCI_QUIRK_WRITE_BYTEWISE; -+ dev_info(&client->dev, -+ "DDC/CI bus quirk detected: writes must be done bytewise\n"); -+ /* Some devices need writing twice after a failed blockwise write */ -+ __ddcci_write_bytewise(client, addr, true, cmd, sizeof(cmd)); -+ msleep(delay); -+ } -+ } -+ if (ret < 0 && (quirks & DDCCI_QUIRK_WRITE_BYTEWISE)) { -+ ret = __ddcci_write_bytewise(client, addr, true, cmd, sizeof(cmd)); -+ dev_dbg(&client->dev, -+ "[%02x:%02x] writing identification command in bytewise mode: %d\n", -+ client->addr << 1, addr, ret); -+ } -+ if (ret < 0) -+ return -ENODEV; -+ -+ /* Wait */ -+ msleep(delay); -+ -+ /* Receive response */ -+ ret = i2c_master_recv(client, buffer, DDCCI_RECV_BUFFER_SIZE); -+ if (ret < 0) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] receiving identification response resulted in errno %d\n", -+ client->addr << 1, addr, ret); -+ return ret; -+ } -+ -+ if (ret == 0) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] no identification response received\n", -+ client->addr << 1, addr); -+ return ret; -+ } -+ -+ /* Skip first byte if quirk already active */ -+ if (quirks & DDCCI_QUIRK_SKIP_FIRST_BYTE && ret > 1) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] doubled first byte quirk in effect\n", -+ client->addr << 1, addr); -+ ret--; -+ buffer++; -+ } -+ -+ /* If answer too short (= incomplete) break out */ -+ if (ret < 3) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response is too short (%d bytes)\n", -+ client->addr << 1, addr, ret); -+ return -EIO; -+ } -+ -+ /* validate first byte */ -+ if (buffer[0] != addr) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response: %*ph\n", -+ client->addr << 1, addr, (ret > 32 ? 
32 : ret), buffer); -+ -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response invalid (expected first byte %02x, got %02x)\n", -+ client->addr << 1, addr, addr, buffer[0]); -+ return -ENODEV; -+ } -+ -+ /* Check if first byte is doubled (QUIRK_SKIP_FIRST_BYTE) */ -+ if (!(quirks & DDCCI_QUIRK_SKIP_FIRST_BYTE)) { -+ if (buffer[0] == buffer[1]) { -+ quirks |= DDCCI_QUIRK_SKIP_FIRST_BYTE; -+ dev_info(&client->dev, -+ "DDC/CI bus quirk detected: doubled first byte on read\n"); -+ ret--; -+ buffer++; -+ if (ret < 3) -+ return -EIO; -+ } -+ } -+ -+ /* validate second byte (protocol flag) */ -+ if ((buffer[1] & 0x80) != 0x80 && !(quirks & DDCCI_QUIRK_NO_PFLAG)) { -+ dev_info(&client->dev, -+ "DDC/CI bus quirk detected: device omits protocol flag on responses\n"); -+ quirks |= DDCCI_QUIRK_NO_PFLAG; -+ } -+ -+ /* get and check payload length */ -+ payload_len = buffer[1] & 0x7F; -+ if (3+payload_len > ret) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response: %*ph ...\n", -+ client->addr << 1, addr, ret, buffer); -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response was truncated (expected %d bytes, got %d)\n", -+ client->addr << 1, addr, 3+payload_len, ret); -+ return -EBADMSG; -+ } -+ -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response: %*ph\n", -+ client->addr << 1, addr, 3+payload_len, buffer); -+ -+ /* calculate checksum */ -+ for (i = 0; i < 3+payload_len; i++) -+ xor ^= buffer[i]; -+ -+ /* verify checksum */ -+ if (xor != 0) { -+ dev_err(&client->dev, -+ "[%02x:%02x] invalid DDC/CI response, corrupted data - xor is 0x%02x, length 0x%02x\n", -+ client->addr << 1, addr, xor, payload_len); -+ return -EBADMSG; -+ } -+ -+ /* save quirks */ -+ bus_drv_data->quirks = quirks; -+ -+ /* return result */ -+ if (payload_len <= len) { -+ ret = payload_len; -+ memcpy(buf, &buffer[2], payload_len); -+ } else { -+ ret = -EMSGSIZE; -+ memcpy(buf, &buffer[2], len); -+ } -+ return ret; -+} -+ -+/* Character device */ -+ -+/* Data structure for an open file handle */ -+struct ddcci_fp_data { -+ struct ddcci_device *dev; -+ bool exclusive; -+ unsigned char buffer[129]; -+}; -+ -+/* Called when the character device is opened */ -+static int ddcci_cdev_open(struct inode *inode, struct file *filp) -+{ -+ struct ddcci_device *dev = container_of(inode->i_cdev, -+ struct ddcci_device, cdev); -+ struct ddcci_fp_data *fp_data = NULL; -+ -+ fp_data = kzalloc(sizeof(struct ddcci_fp_data), GFP_KERNEL); -+ -+ if (!fp_data) -+ return -ENOMEM; -+ -+ fp_data->exclusive = filp->f_flags & O_EXCL; -+ -+ if (fp_data->exclusive) { -+ if (down_write_trylock(&dev->cdev_sem) == 0) { -+ kfree(fp_data); -+ return -EBUSY; -+ } -+ } else { -+ if (down_read_trylock(&dev->cdev_sem) == 0) { -+ kfree(fp_data); -+ return -EBUSY; -+ } -+ } -+ -+ fp_data->dev = dev; -+ filp->private_data = fp_data; -+ -+ return 0; -+} -+ -+/* Called when the character device is closed */ -+static int ddcci_cdev_close(struct inode *inode, struct file *filp) -+{ -+ struct ddcci_fp_data *fp_data = filp->private_data; -+ struct ddcci_device *dev = fp_data->dev; -+ -+ if (fp_data->exclusive) -+ up_write(&dev->cdev_sem); -+ else -+ up_read(&dev->cdev_sem); -+ -+ filp->private_data = NULL; -+ kfree(fp_data); -+ return 0; -+} -+ -+/* Called when reading from the character device */ -+static ssize_t ddcci_cdev_read(struct file *filp, char __user *buffer, -+ size_t length, loff_t *offset) -+{ -+ struct ddcci_fp_data *fp_data = filp->private_data; -+ struct ddcci_device *dev = fp_data->dev; -+ unsigned char *buf = 
fp_data->buffer; -+ const bool nonblocking = (filp->f_flags & O_NONBLOCK) != 0; -+ int ret = 0; -+ -+ if ((filp->f_mode & FMODE_READ) == 0) -+ return -EBADF; -+ -+ /* Lock mutex */ -+ if (nonblocking) { -+ if (down_trylock(&dev->bus_drv_data->sem)) -+ return -EAGAIN; -+ } else { -+ if (down_interruptible(&dev->bus_drv_data->sem)) -+ return -ERESTARTSYS; -+ } -+ -+ /* Execute read */ -+ ret = ddcci_read(dev->bus_drv_data->i2c_dev, dev->inner_addr, true, buf, -+ length); -+ -+ if (ret > 0) { -+ /* Copy data from user space */ -+ if (copy_to_user(buffer, buf, ret)) { -+ ret = -EFAULT; -+ goto out; -+ } -+ } -+ -+out: -+ up(&dev->bus_drv_data->sem); -+ return ret; -+} -+ -+/* Called when writing to the character device */ -+static ssize_t ddcci_cdev_write(struct file *filp, const char __user *buffer, -+ size_t count, loff_t *offset) -+{ -+ struct ddcci_fp_data *fp_data = filp->private_data; -+ struct ddcci_device *dev = fp_data->dev; -+ unsigned char *buf = fp_data->buffer; -+ const bool nonblocking = (filp->f_flags & O_NONBLOCK) != 0; -+ int ret = 0; -+ -+ if ((filp->f_mode & FMODE_WRITE) == 0) -+ return -EBADF; -+ -+ if (count > 127) -+ return -EINVAL; -+ -+ /* Lock mutex */ -+ if (nonblocking) { -+ if (down_trylock(&dev->bus_drv_data->sem)) -+ return -EAGAIN; -+ } else { -+ if (down_interruptible(&dev->bus_drv_data->sem)) -+ return -ERESTARTSYS; -+ } -+ -+ if (count > 0) { -+ /* Copy data from user space */ -+ if (copy_from_user(buf, buffer, count)) { -+ ret = -EFAULT; -+ goto err_out; -+ } -+ -+ /* Execute write */ -+ ret = ddcci_write(dev->bus_drv_data->i2c_dev, dev->inner_addr, -+ true, buf, count); -+ } -+ -+ if (ret >= 0) { -+ msleep(delay); -+ up(&dev->bus_drv_data->sem); -+ return count; -+ } -+ -+err_out: -+ up(&dev->bus_drv_data->sem); -+ return ret; -+} -+ -+/* Called when seeking the character device */ -+static loff_t ddcci_cdev_seek(struct file *filp, loff_t offset, int anchor) -+{ -+ return -EINVAL; -+} -+ -+static const struct file_operations ddcci_fops = { -+ .owner = THIS_MODULE, -+ .read = ddcci_cdev_read, -+ .write = ddcci_cdev_write, -+ .open = ddcci_cdev_open, -+ .release = ddcci_cdev_close, -+ .llseek = ddcci_cdev_seek -+}; -+ -+/* Set up the character device for a DDC/CI device */ -+static int ddcci_setup_char_device(struct ddcci_device *device) -+{ -+ int ret = -EINVAL; -+ -+ /* Check if free minor exists */ -+ if (ddcci_cdev_next == ddcci_cdev_end) { -+ dev_err(&device->dev, "no free major/minor\n"); -+ ret = -ENFILE; -+ goto out; -+ } -+ -+ /* Initialize rwsem */ -+ init_rwsem(&device->cdev_sem); -+ -+ /* Initialize character device node */ -+ cdev_init(&device->cdev, &ddcci_fops); -+ device->cdev.owner = THIS_MODULE; -+ -+ /* Publish char device */ -+ device->dev.devt = ddcci_cdev_next; -+ ret = cdev_add(&device->cdev, ddcci_cdev_next, 1); -+ if (ret) { -+ device->dev.devt = 0; -+ goto out; -+ } -+ -+ ddcci_cdev_next++; -+out: -+ return ret; -+} -+ -+/* sysfs attributes */ -+ -+static ssize_t ddcci_attr_capabilities_show(struct device *dev, -+ struct device_attribute *attr, -+ char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = device->capabilities_len; -+ if (unlikely(len > PAGE_SIZE)) -+ len = PAGE_SIZE; -+ if (len == 0) { -+ ret = len; -+ } else { -+ memcpy(buf, device->capabilities, len); -+ if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } -+ } -+ } -+ -+ return ret; -+} -+ -+static ssize_t ddcci_attr_prot_show(struct device *dev, 
-+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = strnlen(device->prot, sizeof(device->prot)); -+ strncpy(buf, device->prot, PAGE_SIZE); -+ if (len == 0) { -+ ret = len; -+ } else if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } else { -+ ret = PAGE_SIZE; -+ } -+ } -+ return ret; -+} -+ -+static ssize_t ddcci_attr_type_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = strnlen(device->type, sizeof(device->type)); -+ strncpy(buf, device->type, PAGE_SIZE); -+ if (len == 0) { -+ ret = len; -+ } else if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } else { -+ ret = PAGE_SIZE; -+ } -+ } -+ return ret; -+} -+ -+static ssize_t ddcci_attr_model_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = strnlen(device->model, sizeof(device->model)); -+ strncpy(buf, device->model, PAGE_SIZE); -+ if (len == 0) { -+ ret = len; -+ } else if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } else { -+ ret = PAGE_SIZE; -+ } -+ } -+ return ret; -+} -+ -+static ssize_t ddcci_attr_vendor_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = strnlen(device->vendor, sizeof(device->vendor)); -+ strncpy(buf, device->vendor, PAGE_SIZE); -+ if (len == 0) { -+ ret = len; -+ } else if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } else { -+ ret = PAGE_SIZE; -+ } -+ } -+ return ret; -+} -+ -+static ssize_t ddcci_attr_module_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = strnlen(device->module, sizeof(device->module)); -+ strncpy(buf, device->module, PAGE_SIZE); -+ if (len == 0) { -+ ret = len; -+ } else if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } else { -+ ret = PAGE_SIZE; -+ } -+ } -+ return ret; -+} -+ -+static ssize_t ddcci_attr_serial_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ -+ if (likely(device != NULL)) -+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", device->device_number); -+ -+ return ret; -+} -+ -+static ssize_t ddcci_attr_modalias_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ char model[ARRAY_SIZE(device->model)]; -+ char vendor[ARRAY_SIZE(device->model)]; -+ char module[ARRAY_SIZE(device->model)]; -+ -+ if (likely(device != NULL)) { -+ memcpy(model, device->model, sizeof(model)); -+ memcpy(vendor, device->vendor, sizeof(vendor)); -+ memcpy(module, device->module, sizeof(module)); -+ ddcci_modalias_clean(model, sizeof(model), '_'); -+ ddcci_modalias_clean(vendor, sizeof(vendor), '_'); -+ ddcci_modalias_clean(module, sizeof(module), '_'); -+ -+ ret = scnprintf(buf, PAGE_SIZE, 
"%s%s-%s-%s-%s-%s\n", -+ DDCCI_MODULE_PREFIX, -+ device->prot, -+ device->type, -+ model, -+ vendor, -+ module -+ ); -+ } -+ return ret; -+} -+ -+static DEVICE_ATTR(capabilities, S_IRUGO, ddcci_attr_capabilities_show, NULL); -+static DEVICE_ATTR(idProt, S_IRUGO, ddcci_attr_prot_show, NULL); -+static DEVICE_ATTR(idType, S_IRUGO, ddcci_attr_type_show, NULL); -+static DEVICE_ATTR(idModel, S_IRUGO, ddcci_attr_model_show, NULL); -+static DEVICE_ATTR(idVendor, S_IRUGO, ddcci_attr_vendor_show, NULL); -+static DEVICE_ATTR(idModule, S_IRUGO, ddcci_attr_module_show, NULL); -+static DEVICE_ATTR(idSerial, S_IRUGO, ddcci_attr_serial_show, NULL); -+static DEVICE_ATTR(modalias, S_IRUGO, ddcci_attr_modalias_show, NULL); -+ -+static struct attribute *ddcci_char_device_attrs[] = { -+ &dev_attr_capabilities.attr, -+ &dev_attr_idProt.attr, -+ &dev_attr_idType.attr, -+ &dev_attr_idModel.attr, -+ &dev_attr_idVendor.attr, -+ &dev_attr_idModule.attr, -+ &dev_attr_idSerial.attr, -+ &dev_attr_modalias.attr, -+ NULL, -+}; -+ATTRIBUTE_GROUPS(ddcci_char_device); -+ -+/* DDC/CI bus */ -+ -+static int ddcci_device_uevent(const struct device *dev, struct kobj_uevent_env *env) -+{ -+ struct ddcci_device *device = to_ddcci_device(dev); -+ char model[ARRAY_SIZE(device->model)]; -+ char vendor[ARRAY_SIZE(device->vendor)]; -+ char module[ARRAY_SIZE(device->module)]; -+ -+ memcpy(model, device->model, sizeof(model)); -+ memcpy(vendor, device->vendor, sizeof(vendor)); -+ memcpy(module, device->module, sizeof(module)); -+ ddcci_modalias_clean(model, sizeof(model), '_'); -+ ddcci_modalias_clean(vendor, sizeof(vendor), '_'); -+ ddcci_modalias_clean(module, sizeof(module), '_'); -+ -+ if (add_uevent_var(env, "MODALIAS=%s%s-%s-%s-%s-%s", -+ DDCCI_MODULE_PREFIX, -+ device->prot, -+ device->type, -+ model, -+ vendor, -+ module -+ )) -+ return -ENOMEM; -+ -+ if (device->prot[0]) -+ if (add_uevent_var(env, "DDCCI_PROT=%s", device->prot)) -+ return -ENOMEM; -+ -+ if (device->type[0]) -+ if (add_uevent_var(env, "DDCCI_TYPE=%s", device->type)) -+ return -ENOMEM; -+ -+ if (device->model[0]) -+ if (add_uevent_var(env, "DDCCI_MODEL=%s", device->model)) -+ return -ENOMEM; -+ -+ if (device->vendor[0]) { -+ if (add_uevent_var(env, "DDCCI_VENDOR=%s", device->vendor)) -+ return -ENOMEM; -+ -+ if (add_uevent_var(env, "DDCCI_MODULE=%s", device->module)) -+ return -ENOMEM; -+ -+ if (add_uevent_var(env, "DDCCI_UNIQ=%d", device->device_number)) -+ return -ENOMEM; -+ } -+ -+ return 0; -+} -+ -+static void ddcci_device_release(struct device *dev) -+{ -+ struct ddcci_device *device = to_ddcci_device(dev); -+ struct ddcci_driver *driver; -+ -+ /* Notify driver */ -+ if (dev->driver) { -+ driver = to_ddcci_driver(dev->driver); -+ if (driver->remove) -+ driver->remove(device); -+ } -+ -+ /* Teardown chardev */ -+ if (dev->devt) { -+ down(&core_lock); -+ if (device->cdev.dev == ddcci_cdev_next-1) -+ ddcci_cdev_next--; -+ cdev_del(&device->cdev); -+ up(&core_lock); -+ } -+ -+ /* Free capability string */ -+ if (device->capabilities) { -+ device->capabilities_len = 0; -+ kfree(device->capabilities); -+ } -+ /* Free device */ -+ kfree(device); -+} -+ -+static char *ddcci_devnode(const struct device *dev, -+ umode_t *mode, kuid_t *uid, kgid_t *gid) -+{ -+ struct ddcci_device *device; -+ -+ device = to_ddcci_device(dev); -+ return kasprintf(GFP_KERNEL, "bus/ddcci/%d/display", -+ device->i2c_client->adapter->nr); -+} -+ -+static char *ddcci_dependent_devnode(const struct device *dev, -+ umode_t *mode, kuid_t *uid, kgid_t *gid) -+{ -+ struct ddcci_device *device; -+ 
-+ device = to_ddcci_device(dev); -+ if (device->flags & DDCCI_FLAG_EXTERNAL) { -+ if (device->outer_addr == device->inner_addr) -+ return kasprintf(GFP_KERNEL, "bus/ddcci/%d/e%02x", -+ device->i2c_client->adapter->nr, -+ device->outer_addr); -+ else -+ return kasprintf(GFP_KERNEL, "bus/ddcci/%d/e%02x%02x", -+ device->i2c_client->adapter->nr, -+ device->outer_addr, device->inner_addr); -+ } else { -+ return kasprintf(GFP_KERNEL, "bus/ddcci/%d/i%02x", -+ device->i2c_client->adapter->nr, -+ device->inner_addr); -+ } -+} -+ -+/* Device type for main DDC/CI devices*/ -+static struct device_type ddcci_device_type = { -+ .name = "ddcci-device", -+ .uevent = ddcci_device_uevent, -+ .groups = ddcci_char_device_groups, -+ .release = ddcci_device_release, -+ .devnode = ddcci_devnode -+}; -+ -+/* Device type for dependent DDC/CI devices*/ -+static struct device_type ddcci_dependent_type = { -+ .name = "ddcci-dependent-device", -+ .uevent = ddcci_device_uevent, -+ .groups = ddcci_char_device_groups, -+ .release = ddcci_device_release, -+ .devnode = ddcci_dependent_devnode -+}; -+ -+/** -+ * ddcci_verify_device - return parameter as ddcci_device, or NULL -+ * @dev: device, probably from some driver model iterator -+ */ -+struct ddcci_device *ddcci_verify_device(struct device *dev) -+{ -+ if (unlikely(!dev)) -+ return NULL; -+ return (dev->type == &ddcci_device_type -+ || dev->type == &ddcci_dependent_type) -+ ? to_ddcci_device(dev) -+ : NULL; -+} -+EXPORT_SYMBOL(ddcci_verify_device); -+ -+/** -+ * ddcci_quirks - Get quirks for DDC/CI device -+ * @dev: Target DDC/CI device -+ */ -+unsigned long ddcci_quirks(struct ddcci_device *dev) -+{ -+ if (unlikely(WARN_ON(!dev))) -+ return ~0L; -+ if (unlikely(WARN_ON(!dev->bus_drv_data))) -+ return ~0L; -+ return dev->bus_drv_data->quirks; -+} -+EXPORT_SYMBOL(ddcci_quirks); -+ -+/** -+ * ddcci_register_driver - register DDC/CI driver -+ * @owner: the owning module -+ * @driver: the driver to register -+ */ -+int ddcci_register_driver(struct module *owner, struct ddcci_driver *driver) -+{ -+ int ret; -+ -+ /* Can't register until after driver model init */ -+ if (unlikely(WARN_ON(!ddcci_bus_registered))) -+ return -EAGAIN; -+ -+ pr_debug("registering driver [%s]\n", driver->driver.name); -+ -+ /* add the driver to the list of ddcci drivers in the driver core */ -+ driver->driver.owner = owner; -+ driver->driver.bus = &ddcci_bus_type; -+ -+ /* When registration returns, the driver core -+ * will have called probe() for all matching-but-unbound devices. 
-+ */ -+ ret = driver_register(&driver->driver); -+ if (ret) -+ return ret; -+ -+ pr_debug("driver [%s] registered\n", driver->driver.name); -+ -+ return 0; -+} -+EXPORT_SYMBOL(ddcci_register_driver); -+ -+/** -+ * ddcci_del_driver - unregister DDC/CI driver -+ * @driver: the driver being unregistered -+ */ -+void ddcci_del_driver(struct ddcci_driver *driver) -+{ -+ driver_unregister(&driver->driver); -+ pr_debug("driver [%s] unregistered\n", driver->driver.name); -+} -+EXPORT_SYMBOL(ddcci_del_driver); -+ -+/** -+ * ddcci_device_write - Write a message to a DDC/CI device -+ * @dev: Target DDC/CI device -+ * @p_flag: Protocol flag, true for standard control messages -+ * @data: Data that will be written to the device -+ * @length: How many bytes to write -+ * -+ * Writes the message to the device and sleeps (see module parameter 'delay') -+ */ -+int ddcci_device_write(struct ddcci_device *dev, bool p_flag, -+ unsigned char *data, unsigned char length) -+{ -+ int ret; -+ -+ if (down_interruptible(&dev->bus_drv_data->sem)) -+ return -EAGAIN; -+ -+ ret = ddcci_write(dev->bus_drv_data->i2c_dev, dev->inner_addr, p_flag, data, length); -+ msleep(delay); -+ up(&dev->bus_drv_data->sem); -+ return ret; -+} -+EXPORT_SYMBOL(ddcci_device_write); -+ -+/** -+ * ddcci_device_read - Read a response from a DDC/CI device -+ * @dev: Target DDC/CI device -+ * @p_flag: Protocol flag, must match the corresponding write -+ * @buffer: Where to store data read from the device -+ * @length: Buffer size -+ */ -+int ddcci_device_read(struct ddcci_device *dev, bool p_flag, -+ unsigned char *buffer, unsigned char length) -+{ -+ int ret; -+ -+ if (down_interruptible(&dev->bus_drv_data->sem)) -+ return -EAGAIN; -+ -+ ret = ddcci_read(dev->bus_drv_data->i2c_dev, dev->inner_addr, p_flag, buffer, length); -+ up(&dev->bus_drv_data->sem); -+ return ret; -+} -+EXPORT_SYMBOL(ddcci_device_read); -+ -+/** -+ * ddcci_device_writeread - Write a message to a device and read the response -+ * @dev: Target DDC/CI device -+ * @p_flag: Protocol flag, true for standard control messages -+ * @buffer: Buffer used for write and read -+ * @length: How many bytes to write -+ * @maxlength: Buffer size on read -+ * -+ * Writing, sleeping and reading are done without releasing the DDC/CI bus. -+ * This provides atomicity in respect to other DDC/CI accesses on the same bus. 
-+ */ -+int ddcci_device_writeread(struct ddcci_device *dev, bool p_flag, -+ unsigned char *buffer, unsigned char length, -+ unsigned char maxlength) -+{ -+ int ret; -+ -+ if (down_interruptible(&dev->bus_drv_data->sem)) -+ return -EAGAIN; -+ -+ ret = ddcci_write(dev->bus_drv_data->i2c_dev, dev->inner_addr, p_flag, buffer, length); -+ if (ret < 0) -+ goto err; -+ msleep(delay); -+ ret = ddcci_read(dev->bus_drv_data->i2c_dev, dev->inner_addr, p_flag, buffer, maxlength); -+err: -+ up(&dev->bus_drv_data->sem); -+ return ret; -+} -+EXPORT_SYMBOL(ddcci_device_writeread); -+ -+#define IS_ANY_ID(x) (((x)[0] == '\xFF') && ((x)[7] == '\xFF')) -+ -+/* Check if any device id in the array matches the device and return the matching id */ -+static const struct ddcci_device_id *ddcci_match_id(const struct ddcci_device_id *id, -+ const struct ddcci_device *device) -+{ -+ while (id->prot[0] || id->type[0] || id->model[0] || id->vendor[0] || id->module[0]) { -+ if ((IS_ANY_ID(id->prot) || (strcmp(device->prot, id->prot) == 0)) -+ && (IS_ANY_ID(id->type) || (strcmp(device->type, id->type) == 0)) -+ && (IS_ANY_ID(id->model) || (strcmp(device->model, id->model) == 0)) -+ && (IS_ANY_ID(id->vendor) || (strcmp(device->vendor, id->vendor) == 0)) -+ && (IS_ANY_ID(id->module) || (strcmp(device->module, id->module) == 0))) { -+ return id; -+ } -+ id++; -+ } -+ return NULL; -+} -+ -+static int ddcci_device_match(struct device *dev, struct device_driver *drv) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ struct ddcci_driver *driver; -+ -+ if (!device) -+ return 0; -+ -+ driver = to_ddcci_driver(drv); -+ /* match on an id table if there is one */ -+ if (driver->id_table) -+ return ddcci_match_id(driver->id_table, device) != NULL; -+ -+ return 0; -+} -+ -+static int ddcci_device_probe(struct device *dev) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ struct ddcci_driver *driver; -+ const struct ddcci_device_id *id; -+ int ret = 0; -+ -+ if (!device) -+ return -EINVAL; -+ driver = to_ddcci_driver(dev->driver); -+ -+ id = ddcci_match_id(driver->id_table, device); -+ if (!id) -+ return -ENODEV; -+ -+ if (driver->probe) -+ ret = driver->probe(device, id); -+ -+ return ret; -+} -+ -+static int ddcci_device_remove(struct device *dev) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ struct ddcci_driver *driver; -+ int ret = 0; -+ -+ if (!device) -+ return -EINVAL; -+ driver = to_ddcci_driver(dev->driver); -+ -+ if (driver->remove) -+ ret = driver->remove(device); -+ -+ return ret; -+} -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) -+static void ddcci_device_remove_void(struct device *dev) -+{ -+ ddcci_device_remove(dev); -+} -+#endif -+ -+/** -+ * DDCCI bus type structure -+ */ -+struct bus_type ddcci_bus_type = { -+ .name = "ddcci", -+ .match = ddcci_device_match, -+ .probe = ddcci_device_probe, -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) -+ .remove = ddcci_device_remove_void -+#else -+ .remove = ddcci_device_remove -+#endif -+}; -+ -+/* Main I2C driver */ -+ -+/* Get a pointer to the closing parenthesis */ -+static char *ddcci_capstr_tok(const char *s, int depth) -+{ -+ const char *ptr = s; -+ char *end; -+ -+ if (s == NULL || s[0] == '\0') -+ return NULL; -+ -+ while ((end = strpbrk(ptr, "()"))) { -+ if (!end || depth == INT_MAX) -+ return NULL; -+ if (*end == '(') -+ depth++; -+ else if (depth > 0) -+ depth--; -+ else -+ break; -+ ptr = end+1; -+ } -+ return end; -+} -+ -+/** -+ * ddcci_find_capstr_item - Search capability string for a tag -+ * 
@capabilities: Capability string to search -+ * @tag: Tag to find -+ * @length: Buffer for the length of the found tag value (optional) -+ * -+ * Return a pointer to the start of the tag value (directly after the '(') on -+ * success and write the length of the value (excluding the ')') into `length`. -+ * -+ * If the tag is not found or another error occurs, an ERR_PTR is returned -+ * and `length` stays untouched. -+ */ -+const char *ddcci_find_capstr_item(const char * capabilities, -+ const char * __restrict tag, -+ size_t *length) -+{ -+ const char *src = capabilities, *ptr; -+ ptrdiff_t len; -+ int taglen = strnlen(tag, 1024); -+ -+ /* Check length of requested tag */ -+ if (unlikely(taglen <= 0 || taglen > 1023)) -+ return ERR_PTR(-EINVAL); -+ -+ /* Find tag */ -+ while (src && (strncmp(src+1, tag, taglen) != 0 || src[1+taglen] != '(')) -+ src = ddcci_capstr_tok(src+1, -1); -+ if (!src || src[0] == '\0') -+ return ERR_PTR(-ENOENT); -+ -+ /* Locate end of value */ -+ src += taglen+2; -+ ptr = ddcci_capstr_tok(src, 0); -+ if (unlikely(!ptr)) -+ return ERR_PTR(-EOVERFLOW); -+ -+ /* Check length of tag data */ -+ len = ptr-src; -+ if (unlikely(len < 0 || len > 65535)) -+ return ERR_PTR(-EMSGSIZE); -+ -+ /* Return pointer and length */ -+ if (likely(length != NULL)) -+ *length = (size_t)len; -+ return src; -+} -+EXPORT_SYMBOL(ddcci_find_capstr_item); -+ -+/* Search the capability string for a tag and copy the value to dest */ -+static int ddcci_cpy_capstr_item(char *dest, const char *src, -+ const char * __restrict tag, size_t maxlen) -+{ -+ const char *ptr; -+ size_t len; -+ -+ /* Find tag */ -+ ptr = ddcci_find_capstr_item(src, tag, &len); -+ if (IS_ERR(ptr)) { -+ return PTR_ERR(ptr); -+ } -+ -+ /* Copy value */ -+ memcpy(dest, ptr, min(len, maxlen)); -+ return 0; -+} -+ -+/* Fill fields in device by parsing the capability string */ -+static int ddcci_parse_capstring(struct ddcci_device *device) -+{ -+ const char *capstr = device->capabilities; -+ int ret = 0; -+ -+ if (!capstr) -+ return -EINVAL; -+ -+ /* capability string start with a paren */ -+ if (capstr[0] != '(') -+ return -EINVAL; -+ -+ /* get prot(...) */ -+ ret = ddcci_cpy_capstr_item(device->prot, capstr, "prot", sizeof(device->prot)-1); -+ if (ret) { -+ if (ret == -ENOENT) { -+ dev_warn(&device->dev, "malformed capability string: no protocol tag"); -+ memset(device->prot, 0, sizeof(device->prot)-1); -+ } else { -+ return ret; -+ } -+ } -+ -+ /* get type(...) */ -+ ret = ddcci_cpy_capstr_item(device->type, capstr, "type", sizeof(device->type)-1); -+ if (ret) { -+ if (ret == -ENOENT) { -+ dev_warn(&device->dev, "malformed capability string: no type tag"); -+ memset(device->type, 0, sizeof(device->type)-1); -+ } else { -+ return ret; -+ } -+ } -+ -+ /* and then model(...) */ -+ ret = ddcci_cpy_capstr_item(device->model, capstr, "model", sizeof(device->model)-1); -+ if (ret) { -+ if (ret == -ENOENT) { -+ dev_warn(&device->dev, "malformed capability string: no model tag"); -+ memset(device->model, 0, sizeof(device->model)-1); -+ } else { -+ return ret; -+ } -+ } -+ -+ /* if there is no protocol tag */ -+ if (!device->prot[0]) { -+ /* and no type tag: give up. 
*/ -+ if (!device->type[0]) -+ return -ENOENT; -+ -+ /* Assume protocol "monitor" if type is "LCD" or "CRT" */ -+ if (strncasecmp(device->type, "LCD", sizeof(device->type)-1) == 0 -+ || strncasecmp(device->type, "CRT", sizeof(device->type)-1) == 0) { -+ memcpy(device->prot, "monitor", 7); -+ } -+ } -+ -+ /* skip the rest for now */ -+ -+ return 0; -+} -+ -+/* Probe for a device on an inner address and create a ddcci_device for it */ -+static int ddcci_detect_device(struct i2c_client *client, unsigned char addr, -+ int dependent) -+{ -+ int ret; -+ unsigned char outer_addr = client->addr << 1; -+ unsigned char *buffer = NULL; -+ struct ddcci_bus_drv_data *drv_data = i2c_get_clientdata(client); -+ struct ddcci_device *device = NULL; -+ -+ down(&drv_data->sem); -+ -+ /* Allocate buffer big enough for any capability string */ -+ buffer = kmalloc(16384, GFP_KERNEL); -+ if (!buffer) { -+ ret = -ENOMEM; -+ goto err_end; -+ } -+ -+ /* Allocate device struct */ -+ device = kzalloc(sizeof(struct ddcci_device), GFP_KERNEL); -+ if (!device) { -+ ret = -ENOMEM; -+ goto err_end; -+ } -+ -+ /* Initialize device */ -+ device_initialize(&device->dev); -+ device->dev.parent = &client->dev; -+ device->dev.bus = &ddcci_bus_type; -+ device->outer_addr = outer_addr; -+ device->inner_addr = addr; -+ device->bus_drv_data = drv_data; -+ device->i2c_client = client; -+ -+ if (!dependent) { -+ device->dev.type = &ddcci_device_type; -+ ret = dev_set_name(&device->dev, "ddcci%d", client->adapter->nr); -+ } else if (outer_addr == dependent) { -+ /* Internal dependent device */ -+ device->dev.type = &ddcci_dependent_type; -+ device->flags = DDCCI_FLAG_DEPENDENT; -+ ret = dev_set_name(&device->dev, "ddcci%di%02x", client->adapter->nr, addr); -+ } else if (outer_addr == addr) { -+ /* External dependent device */ -+ device->dev.type = &ddcci_dependent_type; -+ device->flags = DDCCI_FLAG_DEPENDENT | DDCCI_FLAG_EXTERNAL; -+ ret = dev_set_name(&device->dev, "ddcci%de%02x", client->adapter->nr, addr); -+ } else { -+ /* Dependent device of external dependent device -+ Just in case something like this exists */ -+ device->dev.type = &ddcci_dependent_type; -+ device->flags = DDCCI_FLAG_DEPENDENT | DDCCI_FLAG_EXTERNAL; -+ ret = dev_set_name(&device->dev, "ddcci%de%02x%02x", client->adapter->nr, outer_addr, addr); -+ } -+ -+ if (ret) -+ goto err_free; -+ -+ /* Read identification and check for quirks */ -+ ret = ddcci_identify_device(client, addr, buffer, 29); -+ if (ret < 0) { -+ if (!dependent && (ret == -EBADMSG || ret == -EMSGSIZE)) { -+ dev_warn(&device->dev, "DDC/CI main device sent broken response on identification. Trying to detect solely based on capability information.\n"); -+ } else { -+ goto err_free; -+ } -+ } -+ -+ if (ret == 29 && buffer[0] == DDCCI_REPLY_ID) { -+ memcpy(device->vendor, &buffer[7], 8); -+ memcpy(device->module, &buffer[17], 8); -+ device->device_number = be32_to_cpu(*(__force __be32 *)&buffer[18]); -+ } -+ -+ /* Read capabilities */ -+ ret = ddcci_get_caps(client, addr, buffer, 16384); -+ if (ret > 0) { -+ /* Fixup unparenthesized capability strings, but only if the first -+ character is an ascii lower case letter. -+ This should still allow an early exit for completely garbled -+ data but helps detecting devices where only the parentheses are -+ missing, as the second char must be the first character of a -+ keyword. 
*/ -+ if (ret > 2 && buffer[0] >= 'a' && buffer[0] <= 'z') { -+ dev_err(&device->dev, "DDC/CI device quirk detected: unparenthesized capability string\n"); -+ device->capabilities = kzalloc(ret+3, GFP_KERNEL); -+ if (!device->capabilities) { -+ ret = -ENOMEM; -+ goto err_free; -+ } -+ device->capabilities_len = ret+2; -+ memcpy(&(device->capabilities[1]), buffer, ret); -+ device->capabilities[0] = '('; -+ device->capabilities[ret+1] = ')'; -+ } else { -+ /* Standard case: simply copy the received string */ -+ device->capabilities = kzalloc(ret+1, GFP_KERNEL); -+ if (!device->capabilities) { -+ ret = -ENOMEM; -+ goto err_free; -+ } -+ device->capabilities_len = ret; -+ memcpy(device->capabilities, buffer, ret); -+ } -+ -+ ret = ddcci_parse_capstring(device); -+ if (ret) { -+ dev_err(&device->dev, "malformed capability string: \"%s\" errno %d\n", device->capabilities, ret); -+ ret = -EINVAL; -+ goto err_free; -+ } -+ } -+ -+ /* Found a device if either identification or capabilities succeeded */ -+ if (!device->capabilities && device->vendor[0] == '\0') { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] got neither valid identification nor capability data\n", -+ client->addr << 1, addr); -+ ret = -ENODEV; -+ goto err_free; -+ } -+ -+ /* Setup chardev */ -+ down(&core_lock); -+ ret = ddcci_setup_char_device(device); -+ up(&core_lock); -+ if (ret) -+ goto err_free; -+ -+ /* Release semaphore and add device to the tree */ -+ up(&drv_data->sem); -+ pr_debug("found device at %d:%02x:%02x\n", client->adapter->nr, outer_addr, addr); -+ ret = device_add(&device->dev); -+ if (ret) -+ goto err_free; -+ -+ goto end; -+err_free: -+ put_device(&device->dev); -+err_end: -+ up(&drv_data->sem); -+end: -+ kfree(buffer); -+ return ret; -+} -+ -+/* I2C detect function: check if a main or external dependent device exists */ -+static int ddcci_detect(struct i2c_client *client, struct i2c_board_info *info) -+{ -+ int ret; -+ unsigned char outer_addr; -+ unsigned char inner_addr; -+ unsigned char buf[32]; -+ unsigned char cmd_id[1] = { DDCCI_COMMAND_ID }; -+ unsigned char cmd_caps[3] = { DDCCI_COMMAND_CAPS, 0x00, 0x00}; -+ unsigned char *cmd; -+ unsigned int cmd_len; -+ -+ /* Check for i2c_master_* functionality */ -+ if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { -+ pr_debug("i2c adapter %d unsuitable: no i2c_master functionality\n", client->adapter->nr); -+ return -ENODEV; -+ } -+ -+ /* send Capabilities Request (for main) or Identification Request command (for dependent devices) */ -+ outer_addr = client->addr << 1; -+ inner_addr = (outer_addr == DDCCI_DEFAULT_DEVICE_ADDR) ? DDCCI_HOST_ADDR_ODD : outer_addr | 1; -+ cmd = (outer_addr == DDCCI_DEFAULT_DEVICE_ADDR) ? cmd_caps : cmd_id; -+ cmd_len = (outer_addr == DDCCI_DEFAULT_DEVICE_ADDR) ? 
sizeof(cmd_caps) : sizeof(cmd_id); -+ pr_debug("detecting %d:%02x\n", client->adapter->nr, outer_addr); -+ -+ ret = __ddcci_write_block(client, inner_addr, buf, true, cmd, cmd_len); -+ -+ if (ret == -ENXIO || ret == -EIO) { -+ if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_WRITE_BYTE)) { -+ pr_debug("i2c write failed with ENXIO or EIO but bytewise writing is not supported\n"); -+ return -ENODEV; -+ } -+ pr_debug("i2c write failed with ENXIO or EIO, trying bytewise writing\n"); -+ ret = __ddcci_write_bytewise(client, inner_addr, true, cmd, cmd_len); -+ if (ret == 0) { -+ msleep(delay); -+ ret = __ddcci_write_bytewise(client, inner_addr, true, cmd, cmd_len); -+ } -+ } -+ -+ if (ret < 0) -+ return -ENODEV; -+ -+ /* wait for device */ -+ msleep(delay); -+ /* receive answer */ -+ ret = i2c_master_recv(client, buf, 32); -+ if (ret < 3) { -+ pr_debug("detection failed: no answer\n"); -+ return -ENODEV; -+ } -+ -+ /* check response starts with outer addr */ -+ if (buf[0] != outer_addr) { -+ pr_debug("detection failed: invalid %s response (%02x != %02x)\n", (cmd == cmd_id) ? "identification" : "capabilities", buf[0], outer_addr); -+ pr_debug("received message was %*ph \n", ret, buf); -+ return -ENODEV; -+ } -+ -+ pr_debug("detected %d:%02x\n", client->adapter->nr, outer_addr); -+ -+ /* set device type */ -+ strlcpy(info->type, (outer_addr == DDCCI_DEFAULT_DEVICE_ADDR) ? "ddcci" : "ddcci-dependent", I2C_NAME_SIZE); -+ -+ return 0; -+} -+ -+/* I2C probe function */ -+static int ddcci_probe(struct i2c_client *client) -+{ -+ const struct i2c_device_id *id = i2c_client_get_device_id(client); -+ int i, ret = -ENODEV, tmp; -+ unsigned char main_addr, addr; -+ struct ddcci_bus_drv_data *drv_data; -+ -+ /* Initialize driver data structure */ -+ drv_data = devm_kzalloc(&client->dev, sizeof(struct ddcci_bus_drv_data), GFP_KERNEL); -+ if (!drv_data) -+ return -ENOMEM; -+ drv_data->i2c_dev = client; -+ sema_init(&drv_data->sem, 1); -+ -+ /* Set i2c client data */ -+ i2c_set_clientdata(client, drv_data); -+ -+ if (id->driver_data == 0) { -+ /* Core device, probe at 0x6E */ -+ main_addr = DDCCI_DEFAULT_DEVICE_ADDR; -+ dev_dbg(&client->dev, "probing core device [%02x]\n", -+ client->addr << 1); -+ ret = ddcci_detect_device(client, main_addr, 0); -+ if (ret) { -+ dev_info(&client->dev, "core device [%02x] probe failed: %d\n", -+ client->addr << 1, ret); -+ if (ret == -EIO) -+ ret = -ENODEV; -+ goto err_free; -+ } -+ -+ /* Detect internal dependent devices */ -+ dev_dbg(&client->dev, "probing internal dependent devices\n"); -+ for (i = 0; i < autoprobe_addr_count; ++i) { -+ addr = (unsigned short)autoprobe_addrs[i]; -+ if ((addr & 1) == 0 && addr != main_addr) { -+ tmp = ddcci_detect_device(client, addr, main_addr); -+ if (tmp < 0 && tmp != -ENODEV) { -+ dev_info(&client->dev, "internal dependent device [%02x:%02x] probe failed: %d\n", -+ client->addr << 1, addr, ret); -+ } -+ } -+ } -+ } else if (id->driver_data == 1) { -+ /* External dependent device */ -+ main_addr = client->addr << 1; -+ dev_dbg(&client->dev, "probing external dependent device [%02x]\n", main_addr); -+ ret = ddcci_detect_device(client, main_addr, -1); -+ if (ret) { -+ dev_info(&client->dev, "external dependent device [%02x] probe failed: %d\n", -+ main_addr, ret); -+ if (ret == -EIO) -+ ret = -ENODEV; -+ goto err_free; -+ } -+ } else { -+ dev_warn(&client->dev, -+ "probe() called with invalid i2c device id\n"); -+ ret = -EINVAL; -+ } -+ -+ goto end; -+err_free: -+ devm_kfree(&client->dev, drv_data); -+end: -+ return ret; -+} -+ 
-+/* -+ * Callback for bus_find_device() used in ddcci_remove() -+ * -+ * Find next device on i2c_client not flagged with -+ * DDCCI_FLAG_REMOVED and flag it. -+ */ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,3,0) -+static int ddcci_remove_helper(struct device *dev, const void *p) -+#else -+static int ddcci_remove_helper(struct device *dev, void *p) -+#endif -+{ -+ struct ddcci_device *device; -+ -+ device = ddcci_verify_device(dev); -+ if (!device || device->flags & DDCCI_FLAG_REMOVED) -+ return 0; -+ -+ if (!p || (dev->parent == p)) { -+ device->flags |= DDCCI_FLAG_REMOVED; -+ wmb(); -+ return 1; -+ } -+ -+ return 0; -+} -+ -+/* I2C driver remove callback: unregister all subdevices */ -+static int ddcci_remove(struct i2c_client *client) -+{ -+ struct ddcci_bus_drv_data *drv_data = i2c_get_clientdata(client); -+ struct device *dev; -+ -+ down(&drv_data->sem); -+ while (1) { -+ dev = bus_find_device(&ddcci_bus_type, NULL, client, -+ ddcci_remove_helper); -+ if (!dev) -+ break; -+ device_unregister(dev); -+ put_device(dev); -+ } -+ up(&drv_data->sem); -+ return 0; -+} -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) -+static void ddcci_remove_void(struct i2c_client *client) -+{ -+ ddcci_remove(client); -+} -+#endif -+ -+/* -+ * I2C driver device identification table. -+ */ -+static const struct i2c_device_id ddcci_idtable[] = { -+ { "ddcci", 0 }, -+ { "ddcci-dependent", 1 }, -+ {} -+}; -+MODULE_DEVICE_TABLE(i2c, ddcci_idtable); -+ -+/* -+ * I2C driver description structure -+ */ -+static struct i2c_driver ddcci_driver = { -+ .driver = { -+ .name = "ddcci", -+ .owner = THIS_MODULE, -+ }, -+ -+ .id_table = ddcci_idtable, -+ .probe = ddcci_probe, -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) -+ .remove = ddcci_remove_void, -+#else -+ .remove = ddcci_remove, -+#endif -+ .class = I2C_CLASS_DDC, -+ .detect = ddcci_detect, -+ .address_list = I2C_ADDRS( -+ DDCCI_DEFAULT_DEVICE_ADDR>>1 -+ ), -+}; -+ -+/* -+ * Module initialization function. Called when the module is inserted or -+ * (if builtin) at boot time. -+ */ -+static int __init ddcci_module_init(void) -+{ -+ int ret; -+ -+ pr_debug("initializing ddcci driver\n"); -+ /* Allocate a device number region for the character devices */ -+ ret = alloc_chrdev_region(&ddcci_cdev_first, 0, 128, DEVICE_NAME); -+ if (ret < 0) { -+ pr_err("failed to register device region: error %d\n", ret); -+ goto err_chrdevreg; -+ } -+ ddcci_cdev_next = ddcci_cdev_first; -+ ddcci_cdev_end = MKDEV(MAJOR(ddcci_cdev_first), MINOR(ddcci_cdev_first)+128); -+ -+ /* Register bus */ -+ ret = bus_register(&ddcci_bus_type); -+ if (ret) { -+ pr_err("failed to register bus 'ddcci'\n"); -+ goto err_busreg; -+ } -+ ddcci_bus_registered = true; -+ -+ /* Register I2C driver */ -+ ret = i2c_add_driver(&ddcci_driver); -+ if (ret) { -+ pr_err("failed to register i2c driver\n"); -+ goto err_drvreg; -+ } -+ -+ pr_debug("ddcci driver initialized\n"); -+ -+ return 0; -+ -+err_drvreg: -+ bus_unregister(&ddcci_bus_type); -+err_busreg: -+ unregister_chrdev_region(ddcci_cdev_first, 128); -+err_chrdevreg: -+ return ret; -+} -+ -+/* -+ * Module clean-up function. Called when the module is removed. 
-+ */ -+static void __exit ddcci_module_exit(void) -+{ -+ struct device *dev; -+ -+ while (1) { -+ dev = bus_find_device(&ddcci_bus_type, NULL, NULL, ddcci_remove_helper); -+ if (!dev) -+ break; -+ device_unregister(dev); -+ put_device(dev); -+ } -+ -+ i2c_del_driver(&ddcci_driver); -+ bus_unregister(&ddcci_bus_type); -+ unregister_chrdev_region(ddcci_cdev_first, 128); -+} -+ -+/* Let the kernel know the calls for module init and exit */ -+module_init(ddcci_module_init); -+module_exit(ddcci_module_exit); -+ -+/* Module parameter description */ -+module_param(delay, uint, S_IRUGO|S_IWUSR); -+MODULE_PARM_DESC(delay, "default delay after bus writes (in ms, default 60)"); -+module_param_array(autoprobe_addrs, ushort, &autoprobe_addr_count, S_IRUGO|S_IWUSR); -+MODULE_PARM_DESC(autoprobe_addrs, "internal dependent device addresses to autoprobe"); -+ -+/* Module description */ -+MODULE_AUTHOR("Christoph Grenz"); -+MODULE_DESCRIPTION("DDC/CI bus driver"); -+MODULE_VERSION("0.4.2"); -+MODULE_LICENSE("GPL"); -diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig -index 51387b1ef012..4b8bfd7c02c6 100644 ---- a/drivers/video/backlight/Kconfig -+++ b/drivers/video/backlight/Kconfig -@@ -297,6 +297,17 @@ config BACKLIGHT_QCOM_WLED - If you have the Qualcomm PMIC, say Y to enable a driver for the - WLED block. Currently it supports PM8941 and PMI8998. - -+config BACKLIGHT_DDCCI -+ tristate "DDCCI Backlight Driver" -+ depends on DDCCI -+ help -+ If you have a DDC/CI supporing monitor, say Y to enable a driver -+ to control its backlight using DDC/CI. This could be useful if -+ your monitor does not include a backlight driver. For this to be -+ useful you need to enable DDCCI support which can be found in -+ Device Drivers -> Character devices and that further depends on -+ I2C. -+ - config BACKLIGHT_RT4831 - tristate "Richtek RT4831 Backlight Driver" - depends on MFD_RT4831 -diff --git a/drivers/video/backlight/Makefile b/drivers/video/backlight/Makefile -index f72e1c3c59e9..656dea21c0ee 100644 ---- a/drivers/video/backlight/Makefile -+++ b/drivers/video/backlight/Makefile -@@ -58,3 +58,4 @@ obj-$(CONFIG_BACKLIGHT_WM831X) += wm831x_bl.o - obj-$(CONFIG_BACKLIGHT_ARCXCNN) += arcxcnn_bl.o - obj-$(CONFIG_BACKLIGHT_RAVE_SP) += rave-sp-backlight.o - obj-$(CONFIG_BACKLIGHT_LED) += led_bl.o -+obj-$(CONFIG_BACKLIGHT_DDCCI) += ddcci-backlight.o -diff --git a/drivers/video/backlight/ddcci-backlight.c b/drivers/video/backlight/ddcci-backlight.c -new file mode 100644 -index 000000000000..7a9852207f0b ---- /dev/null -+++ b/drivers/video/backlight/ddcci-backlight.c -@@ -0,0 +1,413 @@ -+/* -+ * DDC/CI monitor backlight driver -+ * -+ * Copyright (c) 2015 Christoph Grenz -+ */ -+ -+/* -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the Free -+ * Software Foundation; either version 2 of the License, or (at your option) -+ * any later version. 
-+ */ -+ -+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -+#include -+#include -+#include -+#include -+ -+#include -+ -+ -+#define DDCCI_COMMAND_READ 0x01 /* read ctrl value */ -+#define DDCCI_REPLY_READ 0x02 /* read ctrl value reply */ -+#define DDCCI_COMMAND_WRITE 0x03 /* write ctrl value */ -+#define DDCCI_COMMAND_SAVE 0x0c /* save current settings */ -+ -+#define DDCCI_MONITOR_LUMINANCE 0x10 -+#define DDCCI_MONITOR_BACKLIGHT 0x13 -+#define DDCCI_MONITOR_BL_WHITE 0x6B -+ -+static bool convenience_symlink = true; -+ -+struct ddcci_monitor_drv_data { -+ struct ddcci_device *device; -+ struct backlight_device *bl_dev; -+ struct device *fb_dev; -+ unsigned char used_vcp; -+}; -+ -+static int ddcci_monitor_writectrl(struct ddcci_device *device, -+ unsigned char ctrl, unsigned short value) -+{ -+ unsigned char buf[4]; -+ int ret; -+ -+ buf[0] = DDCCI_COMMAND_WRITE; -+ buf[1] = ctrl; -+ buf[2] = (value >> 8); -+ buf[3] = (value & 255); -+ -+ ret = ddcci_device_write(device, true, buf, sizeof(buf)); -+ -+ return ret; -+} -+ -+static int ddcci_monitor_readctrl(struct ddcci_device *device, -+ unsigned char ctrl, unsigned short *value, -+ unsigned short *maximum) -+{ -+ int ret; -+ unsigned char buf[10]; -+ -+ buf[0] = DDCCI_COMMAND_READ; -+ buf[1] = ctrl; -+ -+ ret = ddcci_device_writeread(device, true, buf, 2, sizeof(buf)); -+ if (ret < 0) -+ return ret; -+ -+ if (ret == 0) -+ return -ENOTSUPP; -+ -+ if (ret == 8 && buf[0] == DDCCI_REPLY_READ && buf[2] == ctrl) { -+ if (value) -+ *value = buf[6] * 256 + buf[7]; -+ -+ if (maximum) -+ *maximum = buf[4] * 256 + buf[5]; -+ -+ if (buf[1] == 1) -+ return -ENOTSUPP; -+ if (buf[1] != 0) -+ return -EIO; -+ return 0; -+ } -+ -+ return -EIO; -+} -+ -+static int ddcci_backlight_check_fb(struct backlight_device *bl, -+ struct fb_info *info) -+{ -+ struct ddcci_monitor_drv_data *drv_data = bl_get_data(bl); -+ -+ return drv_data->fb_dev == NULL || drv_data->fb_dev == info->dev; -+} -+ -+static int ddcci_backlight_update_status(struct backlight_device *bl) -+{ -+ struct ddcci_monitor_drv_data *drv_data = bl_get_data(bl); -+ int brightness = bl->props.brightness; -+ int ret; -+ -+ if (bl->props.power != FB_BLANK_UNBLANK || -+ bl->props.state & BL_CORE_FBBLANK) -+ brightness = 0; -+ -+ ret = ddcci_monitor_writectrl(drv_data->device, drv_data->used_vcp, -+ brightness); -+ if (ret > 0) -+ ret = 0; -+ return ret; -+} -+ -+static int ddcci_backlight_get_brightness(struct backlight_device *bl) -+{ -+ unsigned short value = 0, maxval = 0; -+ int ret; -+ struct ddcci_monitor_drv_data *drv_data = bl_get_data(bl); -+ -+ ret = ddcci_monitor_readctrl(drv_data->device, drv_data->used_vcp, -+ &value, &maxval); -+ if (ret < 0) -+ return ret; -+ -+ bl->props.brightness = value; -+ bl->props.max_brightness = maxval; -+ ret = value; -+ -+ return ret; -+} -+ -+static const struct backlight_ops ddcci_backlight_ops = { -+ .options = 0, -+ .update_status = ddcci_backlight_update_status, -+ .get_brightness = ddcci_backlight_get_brightness, -+ .check_fb = ddcci_backlight_check_fb, -+}; -+ -+static const char *ddcci_monitor_vcp_name(unsigned char vcp) -+{ -+ switch (vcp) { -+ case DDCCI_MONITOR_BL_WHITE: -+ return "backlight"; -+ case DDCCI_MONITOR_LUMINANCE: -+ return "luminance"; -+ default: -+ return "???"; -+ } -+} -+ -+static const char *ddcci_monitor_next_vcp_item(const char *ptr) -+{ -+ int depth = 0; -+ -+ /* Sanity check */ -+ if (unlikely(ptr == NULL || ptr[0] == '\0')) -+ return NULL; -+ -+ /* Find next white space outside of parentheses */ -+ while ((ptr = strpbrk(ptr, " 
()"))) { -+ if (!ptr || depth == INT_MAX) { -+ return NULL; -+ } else if (*ptr == '(') { -+ depth++; -+ } else if (depth > 0) { -+ if (*ptr == ')') -+ depth--; -+ } else { -+ break; -+ } -+ ++ptr; -+ } -+ -+ /* Skip over whitespace */ -+ ptr = skip_spaces(ptr); -+ -+ /* Check if we're now at the end of the list */ -+ if (unlikely(*ptr == '\0' || *ptr == ')')) -+ return NULL; -+ -+ return ptr; -+} -+ -+static bool ddcci_monitor_find_vcp(unsigned char vcp, const char *s) -+{ -+ const char *ptr = s; -+ char vcp_hex[3]; -+ -+ /* Sanity check */ -+ if (unlikely(s == NULL || s[0] == '\0')) -+ return false; -+ -+ /* Create hex representation of VCP */ -+ if (unlikely(snprintf(vcp_hex, 3, "%02hhX", vcp) != 2)) { -+ pr_err("snprintf failed to convert to hex. This should not happen.\n"); -+ return false; -+ } -+ -+ /* Search for it */ -+ do { -+ if (strncasecmp(vcp_hex, ptr, 2) == 0) { -+ if (ptr[2] == ' ' || ptr[2] == '(' || ptr[2] == ')') { -+ return true; -+ } -+ } -+ } while ((ptr = ddcci_monitor_next_vcp_item(ptr))); -+ -+ return false; -+} -+ -+static int ddcci_backlight_create_symlink(struct ddcci_device *ddcci_dev) -+{ -+ int i, result; -+ struct device *dev = &ddcci_dev->dev; -+ struct kernfs_node *dirent; -+ for (i = 0; i < 3; ++i) { -+ dev = dev->parent; -+ if (!dev) { -+ dev_dbg(&ddcci_dev->dev, "failed to create convenience symlink: ancestor device not found\n"); -+ return -ENOENT; -+ } -+ } -+ dirent = sysfs_get_dirent(dev->kobj.sd, "ddcci_backlight"); -+ if (dirent) { -+ sysfs_put(dirent); -+ dev_dbg(&ddcci_dev->dev, "failed to create convenience symlink: %s/ddcci_backlight already exists\n", dev_name(dev)); -+ return -EEXIST; -+ } -+ -+ result = sysfs_create_link(&dev->kobj, &ddcci_dev->dev.kobj, "ddcci_backlight"); -+ if (result == 0) { -+ dev_dbg(&ddcci_dev->dev, "created symlink %s/ddcci_backlight\n", dev_name(dev)); -+ } else { -+ dev_info(&ddcci_dev->dev, "failed to create convenience symlink: %d\n", result); -+ } -+ return result; -+} -+ -+static int ddcci_backlight_remove_symlink(struct ddcci_device *ddcci_dev) -+{ -+ int i; -+ struct device *dev = &ddcci_dev->dev; -+ struct kernfs_node *dirent; -+ for (i = 0; i < 3; ++i) { -+ dev = dev->parent; -+ if (!dev) -+ return -ENOENT; -+ } -+ dirent = sysfs_get_dirent(dev->kobj.sd, "ddcci_backlight"); -+ if (!dirent) { -+ return -ENOENT; -+ } -+ -+ if ((dirent->flags & KERNFS_LINK) == 0) { -+ sysfs_put(dirent); -+ dev_dbg(&ddcci_dev->dev, "won't remove %s/ddcci_backlight: not a symlink\n", dev_name(dev)); -+ return -EINVAL; -+ } -+ -+ if (dirent->symlink.target_kn != ddcci_dev->dev.kobj.sd) { -+ sysfs_put(dirent); -+ dev_dbg(&ddcci_dev->dev, "won't remove %s/ddcci_backlight: we are not the link target\n", dev_name(dev)); -+ return -EINVAL; -+ } -+ -+ sysfs_put(dirent); -+ -+ sysfs_remove_link(&dev->kobj, "ddcci_backlight"); -+ dev_dbg(&ddcci_dev->dev, "removed symlink %s/ddcci_backlight\n", dev_name(dev)); -+ return 0; -+} -+ -+static int ddcci_monitor_probe(struct ddcci_device *dev, -+ const struct ddcci_device_id *id) -+{ -+ struct ddcci_monitor_drv_data *drv_data; -+ struct backlight_properties props; -+ struct backlight_device *bl = NULL; -+ int ret = 0; -+ bool support_luminance, support_bl_white; -+ unsigned short brightness = 0, max_brightness = 0; -+ const char *vcps; -+ -+ dev_dbg(&dev->dev, "probing monitor backlight device\n"); -+ -+ /* Get VCP list */ -+ vcps = ddcci_find_capstr_item(dev->capabilities, "vcp", NULL); -+ if (IS_ERR(vcps)) { -+ dev_info(&dev->dev, -+ "monitor doesn't provide a list of supported controls.\n"); 
-+ support_bl_white = support_luminance = true; -+ } else { -+ /* Check VCP list for supported VCPs */ -+ support_bl_white = ddcci_monitor_find_vcp(DDCCI_MONITOR_BL_WHITE, vcps); -+ support_luminance = ddcci_monitor_find_vcp(DDCCI_MONITOR_LUMINANCE, vcps); -+ /* Fallback to trying if no support is found */ -+ if (!support_bl_white && !support_luminance) { -+ dev_info(&dev->dev, -+ "monitor doesn't announce support for backlight or luminance controls.\n"); -+ support_bl_white = support_luminance = true; -+ } -+ } -+ -+ /* Initialize driver data structure */ -+ drv_data = devm_kzalloc(&dev->dev, sizeof(struct ddcci_monitor_drv_data), -+ GFP_KERNEL); -+ if (!drv_data) -+ return -ENOMEM; -+ drv_data->device = dev; -+ -+ if (support_bl_white) { -+ /* Try getting backlight level */ -+ dev_dbg(&dev->dev, -+ "trying to access \"backlight level white\" control\n"); -+ ret = ddcci_monitor_readctrl(drv_data->device, DDCCI_MONITOR_BL_WHITE, -+ &brightness, &max_brightness); -+ if (ret < 0) { -+ if (ret == -ENOTSUPP) -+ dev_info(&dev->dev, -+ "monitor does not support reading backlight level\n"); -+ else -+ goto err_free; -+ } else { -+ drv_data->used_vcp = DDCCI_MONITOR_BL_WHITE; -+ } -+ } -+ -+ if (support_luminance && !drv_data->used_vcp) { -+ /* Try getting luminance */ -+ dev_dbg(&dev->dev, -+ "trying to access \"luminance\" control\n"); -+ ret = ddcci_monitor_readctrl(drv_data->device, DDCCI_MONITOR_LUMINANCE, -+ &brightness, &max_brightness); -+ if (ret < 0) { -+ if (ret == -ENOTSUPP) -+ dev_info(&dev->dev, -+ "monitor does not support reading luminance\n"); -+ else -+ goto err_free; -+ } else { -+ drv_data->used_vcp = DDCCI_MONITOR_LUMINANCE; -+ } -+ drv_data->used_vcp = DDCCI_MONITOR_LUMINANCE; -+ } -+ -+ if (!drv_data->used_vcp) -+ goto err_free; -+ -+ /* Create brightness device */ -+ memset(&props, 0, sizeof(props)); -+ props.type = BACKLIGHT_RAW; -+ props.max_brightness = max_brightness; -+ props.brightness = brightness; -+ bl = devm_backlight_device_register(&dev->dev, dev_name(&dev->dev), -+ &dev->dev, drv_data, -+ &ddcci_backlight_ops, &props); -+ drv_data->bl_dev = bl; -+ if (IS_ERR(bl)) { -+ dev_err(&dev->dev, "failed to register backlight\n"); -+ return PTR_ERR(bl); -+ } -+ dev_info(&dev->dev, "registered %s as backlight device %s\n", -+ ddcci_monitor_vcp_name(drv_data->used_vcp), -+ dev_name(&dev->dev)); -+ -+ if (convenience_symlink) { -+ ddcci_backlight_create_symlink(dev); -+ } -+ -+ goto end; -+err_free: -+ devm_kfree(&dev->dev, drv_data); -+end: -+ return ret; -+} -+ -+static int ddcci_monitor_remove(struct ddcci_device *dev) -+{ -+ dev_dbg(&dev->dev, "removing device\n"); -+ ddcci_backlight_remove_symlink(dev); -+ return 0; -+} -+ -+static struct ddcci_device_id ddcci_monitor_idtable[] = { -+ { "monitor", DDCCI_ANY_ID, DDCCI_ANY_ID, DDCCI_ANY_ID, DDCCI_ANY_ID, 0 }, -+ {} -+}; -+ -+static struct ddcci_driver ddcci_backlight_driver = { -+ .driver = { -+ .name = "ddcci-backlight", -+ .owner = THIS_MODULE, -+ }, -+ -+ .id_table = ddcci_monitor_idtable, -+ .probe = ddcci_monitor_probe, -+ .remove = ddcci_monitor_remove, -+}; -+ -+module_ddcci_driver(ddcci_backlight_driver); -+ -+/* Module parameter description */ -+module_param(convenience_symlink, bool, S_IRUGO|S_IWUSR); -+MODULE_PARM_DESC(convenience_symlink, "add convenience symlink \"ddcci_backlight\" to ancestor device in sysfs (default true)"); -+ -+MODULE_AUTHOR("Christoph Grenz"); -+MODULE_DESCRIPTION("DDC/CI generic monitor backlight driver"); -+MODULE_VERSION("0.4.2"); -+MODULE_LICENSE("GPL"); -+ 
-+MODULE_ALIAS("ddcci:monitor-*-*-*-*"); -diff --git a/include/linux/ddcci.h b/include/linux/ddcci.h -new file mode 100644 -index 000000000000..a219f031e584 ---- /dev/null -+++ b/include/linux/ddcci.h -@@ -0,0 +1,164 @@ -+/* -+ * DDC/CI bus driver -+ * -+ * Copyright (c) 2015 Christoph Grenz -+ */ -+ -+/* -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the Free -+ * Software Foundation; either version 2 of the License, or (at your option) -+ * any later version. -+ */ -+ -+#ifndef _DDCCI_H -+#define _DDCCI_H -+ -+#include -+#include -+#include -+ -+#define DDCCI_MODULE_PREFIX "ddcci:" -+ -+/* Special addresses */ -+ -+/* default device address (even) */ -+#define DDCCI_DEFAULT_DEVICE_ADDR 0x6E -+/* receiving host address for communication with default device address */ -+#define DDCCI_HOST_ADDR_EVEN 0x50 -+/* sending host address for communication with default device address */ -+#define DDCCI_HOST_ADDR_ODD 0x51 -+ -+/* Command codes */ -+ -+/* Identification Request */ -+#define DDCCI_COMMAND_ID 0xf1 -+/* Identification Reply */ -+#define DDCCI_REPLY_ID 0xe1 -+/* Capabilities Request */ -+#define DDCCI_COMMAND_CAPS 0xf3 -+/* Capabilities Reply */ -+#define DDCCI_REPLY_CAPS 0xe3 -+ -+/* Quirks */ -+ -+/* Device always responds with unset protocol flag */ -+#define DDCCI_QUIRK_NO_PFLAG BIT(1) -+/* Device needs writing one byte at a time */ -+#define DDCCI_QUIRK_WRITE_BYTEWISE BIT(2) -+/* Device repeats first byte on read */ -+#define DDCCI_QUIRK_SKIP_FIRST_BYTE BIT(3) -+ -+/* Flags */ -+ -+#define DDCCI_FLAG_REMOVED BIT(1) -+#define DDCCI_FLAG_DEPENDENT BIT(2) -+#define DDCCI_FLAG_EXTERNAL BIT(3) -+ -+extern struct bus_type ddcci_bus_type; -+ -+struct ddcci_bus_drv_data; -+ -+/* struct ddcci_device_id - identifies DDC/CI devices for probing */ -+struct ddcci_device_id { -+ char prot[9]; -+ char type[9]; -+ char model[9]; -+ char vendor[9]; -+ char module[9]; -+ kernel_ulong_t driver_data; /* Data private to the driver */ -+}; -+#define DDCCI_ANY_ID "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF" -+ -+/** -+ * struct ddcci_device - represent an DDC/CI device -+ * @outer_addr: Outer device address (I2C address << 1). -+ * @inner_addr: Inner device address. -+ * @flags: Device flags. -+ * @capabilities: Device capability string. -+ * @capabilities_len: Length of capability string. -+ * @i2c_client: Parent I2C device. -+ * @bus_drv_data: Driver internal data structure. -+ * @dev: Driver model device node for the slave. -+ * @cdev: Character device structure -+ * @cdev_sem: RW semaphore for exclusive access on character device. 
-+ * @prot: Device class ("protocol", from capability string) -+ * @type: Device subclass ("type", from capability string) -+ * @model: Device model (from capability string) -+ * @vendor: Device vendor (from identification command response) -+ * @module: Device module (from identification command response) -+ * @device_number: Device serial (from identification command response) -+ */ -+struct ddcci_device { -+ unsigned short outer_addr; -+ unsigned short inner_addr; -+ int flags; -+ char *capabilities; -+ size_t capabilities_len; -+ struct i2c_client *i2c_client; -+ struct ddcci_bus_drv_data *bus_drv_data; -+ struct device dev; -+ struct cdev cdev; -+ struct rw_semaphore cdev_sem; -+ char prot[9]; -+ char type[9]; -+ char model[9]; -+ char vendor[9]; -+ char module[9]; -+ int device_number; -+}; -+#define to_ddcci_device(d) container_of(d, struct ddcci_device, dev) -+ -+/** -+ * struct ddcci_driver - represent an DDC/CI device driver -+ * @probe: Callback for device binding -+ * @remove: Callback for device unbinding -+ * @driver: Device driver model driver -+ * @id_table: List of DDC/CI devices supported by this driver -+ * -+ * The driver.owner field should be set to the module owner of this driver. -+ * The driver.name field should be set to the name of this driver. -+ */ -+struct ddcci_driver { -+ int (*probe)(struct ddcci_device *, const struct ddcci_device_id *); -+ int (*remove)(struct ddcci_device *); -+ struct device_driver driver; -+ struct ddcci_device_id *id_table; -+}; -+#define to_ddcci_driver(d) container_of(d, struct ddcci_driver, driver) -+ -+int ddcci_register_driver(struct module *owner, struct ddcci_driver *driver); -+#define ddcci_add_driver(driver) \ -+ ddcci_register_driver(THIS_MODULE, driver) -+void ddcci_del_driver(struct ddcci_driver *driver); -+ -+struct ddcci_device *ddcci_verify_device(struct device *dev); -+ -+#define module_ddcci_driver(__ddcci_driver) \ -+ module_driver(__ddcci_driver, ddcci_add_driver, \ -+ ddcci_del_driver) -+ -+int ddcci_device_write(struct ddcci_device *, bool p_flag, unsigned char *data, -+ unsigned char length); -+int ddcci_device_read(struct ddcci_device *, bool p_flag, unsigned char *buffer, -+ unsigned char length); -+int ddcci_device_writeread(struct ddcci_device *, bool p_flag, -+ unsigned char *buffer, unsigned char length, -+ unsigned char maxlength); -+ -+static inline void *ddcci_get_drvdata(const struct ddcci_device *dev) -+{ -+ return dev_get_drvdata(&dev->dev); -+} -+ -+static inline void ddcci_set_drvdata(struct ddcci_device *dev, void *data) -+{ -+ dev_set_drvdata(&dev->dev, data); -+} -+ -+unsigned long ddcci_quirks(struct ddcci_device *dev); -+ -+const char *ddcci_find_capstr_item(const char *capabilities, const char *tag, -+ size_t *length); -+ -+#endif --- -2.41.0 - -From 641b0ffc98fe2842735ec30d31c68b555d559a47 Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Mon, 10 Jul 2023 18:29:38 +0200 -Subject: [PATCH 4/7] fixes +Date: Mon, 31 Jul 2023 12:20:07 +0200 +Subject: [PATCH 3/5] fixes Signed-off-by: Peter Jung --- @@ -12110,7 +9588,7 @@ Signed-off-by: Peter Jung 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c -index 5ec4ad0a5c86..15c5649bde4d 100644 +index 764d176e9735..deb10b89fa51 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -945,7 +945,7 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev) @@ -12196,10 +9674,10 @@ index ce5faa620517..1f0f2b8df300 100644 -- 2.41.0 -From 96777542ac5d53c962cdfb032cf34cfe4ee57dc8 
Mon Sep 17 00:00:00 2001 +From d5b404e3a7461e47b37cbfc3fbe009ba156e2c67 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 10 Jul 2023 17:10:25 +0200 -Subject: [PATCH 5/7] ksm +Subject: [PATCH 4/5] ksm Signed-off-by: Peter Jung --- @@ -12546,10 +10024,10 @@ index 899a314bc487..c2dd786a30e1 100644 static inline void collect_procs_ksm(struct page *page, struct list_head *to_kill, int force_early) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index de10fc797c8e..1cc93fc7d9b5 100644 +index 5e74ce4a28cd..51d04c1847c1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h -@@ -784,7 +784,7 @@ struct mm_struct { +@@ -812,7 +812,7 @@ struct mm_struct { #ifdef CONFIG_KSM /* * Represent how many pages of this process are involved in KSM @@ -12558,7 +10036,7 @@ index de10fc797c8e..1cc93fc7d9b5 100644 */ unsigned long ksm_merging_pages; /* -@@ -792,7 +792,12 @@ struct mm_struct { +@@ -820,7 +820,12 @@ struct mm_struct { * including merged and not merged. */ unsigned long ksm_rmap_items; @@ -12610,7 +10088,7 @@ index fd6c1cb585db..11d0fc82c437 100644 /* * 32 bit systems traditionally used different diff --git a/kernel/sys.c b/kernel/sys.c -index 05f838929e72..9df683365a37 100644 +index 2410e3999ebe..b0841a2dd2b7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2727,6 +2727,153 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, @@ -12881,7 +10359,7 @@ index ba266359da55..97a9627116fa 100644 #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, diff --git a/mm/memory.c b/mm/memory.c -index 01f39e8144ef..0dc2f193c4d6 100644 +index 603b2f419948..d8c7824558b4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1433,8 +1433,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, @@ -13046,758 +10524,10 @@ index 26853badae70..0de9d33cd565 100644 -- 2.41.0 -From 31b1d9be3d434ee82ffccc53fabc5a4326db96c7 Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Mon, 10 Jul 2023 17:10:36 +0200 -Subject: [PATCH 6/7] sched - -Signed-off-by: Peter Jung ---- - arch/x86/kernel/smpboot.c | 11 +-- - include/linux/cgroup-defs.h | 2 + - include/linux/sched.h | 2 + - include/linux/sched/task.h | 38 +++++++- - kernel/cgroup/cgroup.c | 34 +++++++ - kernel/fork.c | 8 ++ - kernel/sched/core.c | 57 ++++++++++++ - kernel/sched/debug.c | 1 + - kernel/sched/fair.c | 177 +++++++++++++++++++++++++++++++++--- - kernel/sched/psi.c | 2 +- - kernel/sched/sched.h | 3 + - kernel/sched/topology.c | 14 ++- - kernel/softirq.c | 2 +- - 13 files changed, 325 insertions(+), 26 deletions(-) - -diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c -index e1aa2cd7734b..4c314475cc13 100644 ---- a/arch/x86/kernel/smpboot.c -+++ b/arch/x86/kernel/smpboot.c -@@ -632,14 +632,9 @@ static void __init build_sched_topology(void) - }; - #endif - #ifdef CONFIG_SCHED_CLUSTER -- /* -- * For now, skip the cluster domain on Hybrid. 
-- */ -- if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { -- x86_topology[i++] = (struct sched_domain_topology_level){ -- cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) -- }; -- } -+ x86_topology[i++] = (struct sched_domain_topology_level){ -+ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) -+ }; - #endif - #ifdef CONFIG_SCHED_MC - x86_topology[i++] = (struct sched_domain_topology_level){ -diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h -index 8a0d5466c7be..ae20dbb885d6 100644 ---- a/include/linux/cgroup-defs.h -+++ b/include/linux/cgroup-defs.h -@@ -661,6 +661,8 @@ struct cgroup_subsys { - void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu); - int (*css_extra_stat_show)(struct seq_file *seq, - struct cgroup_subsys_state *css); -+ int (*css_local_stat_show)(struct seq_file *seq, -+ struct cgroup_subsys_state *css); - - int (*can_attach)(struct cgroup_taskset *tset); - void (*cancel_attach)(struct cgroup_taskset *tset); -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 609bde814cb0..efc9f4bdc4ca 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -2433,9 +2433,11 @@ extern void sched_core_free(struct task_struct *tsk); - extern void sched_core_fork(struct task_struct *p); - extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, - unsigned long uaddr); -+extern int sched_core_idle_cpu(int cpu); - #else - static inline void sched_core_free(struct task_struct *tsk) { } - static inline void sched_core_fork(struct task_struct *p) { } -+static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } - #endif - - extern void sched_set_stop_task(int cpu, struct task_struct *stop); -diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h -index dd35ce28bb90..a23af225c898 100644 ---- a/include/linux/sched/task.h -+++ b/include/linux/sched/task.h -@@ -118,11 +118,47 @@ static inline struct task_struct *get_task_struct(struct task_struct *t) - } - - extern void __put_task_struct(struct task_struct *t); -+extern void __put_task_struct_rcu_cb(struct rcu_head *rhp); - - static inline void put_task_struct(struct task_struct *t) - { -- if (refcount_dec_and_test(&t->usage)) -+ if (!refcount_dec_and_test(&t->usage)) -+ return; -+ -+ /* -+ * In !RT, it is always safe to call __put_task_struct(). -+ * Under RT, we can only call it in preemptible context. -+ */ -+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { -+ static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP); -+ -+ lock_map_acquire_try(&put_task_map); - __put_task_struct(t); -+ lock_map_release(&put_task_map); -+ return; -+ } -+ -+ /* -+ * under PREEMPT_RT, we can't call put_task_struct -+ * in atomic context because it will indirectly -+ * acquire sleeping locks. -+ * -+ * call_rcu() will schedule delayed_put_task_struct_rcu() -+ * to be called in process context. -+ * -+ * __put_task_struct() is called when -+ * refcount_dec_and_test(&t->usage) succeeds. -+ * -+ * This means that it can't "conflict" with -+ * put_task_struct_rcu_user() which abuses ->rcu the same -+ * way; rcu_users has a reference so task->usage can't be -+ * zero after rcu_users 1 -> 0 transition. -+ * -+ * delayed_free_task() also uses ->rcu, but it is only called -+ * when it fails to fork a process. Therefore, there is no -+ * way it can conflict with put_task_struct(). 
-+ */ -+ call_rcu(&t->rcu, __put_task_struct_rcu_cb); - } - - DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T)) -diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c -index bfe3cd8ccf36..4e3ee13217ce 100644 ---- a/kernel/cgroup/cgroup.c -+++ b/kernel/cgroup/cgroup.c -@@ -3685,6 +3685,36 @@ static int cpu_stat_show(struct seq_file *seq, void *v) - return ret; - } - -+static int __maybe_unused cgroup_local_stat_show(struct seq_file *seq, -+ struct cgroup *cgrp, int ssid) -+{ -+ struct cgroup_subsys *ss = cgroup_subsys[ssid]; -+ struct cgroup_subsys_state *css; -+ int ret; -+ -+ if (!ss->css_local_stat_show) -+ return 0; -+ -+ css = cgroup_tryget_css(cgrp, ss); -+ if (!css) -+ return 0; -+ -+ ret = ss->css_local_stat_show(seq, css); -+ css_put(css); -+ return ret; -+} -+ -+static int cpu_local_stat_show(struct seq_file *seq, void *v) -+{ -+ struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; -+ int ret = 0; -+ -+#ifdef CONFIG_CGROUP_SCHED -+ ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id); -+#endif -+ return ret; -+} -+ - #ifdef CONFIG_PSI - static int cgroup_io_pressure_show(struct seq_file *seq, void *v) - { -@@ -5235,6 +5265,10 @@ static struct cftype cgroup_base_files[] = { - .name = "cpu.stat", - .seq_show = cpu_stat_show, - }, -+ { -+ .name = "cpu.stat.local", -+ .seq_show = cpu_local_stat_show, -+ }, - { } /* terminate */ - }; - -diff --git a/kernel/fork.c b/kernel/fork.c -index 95ca80492a37..36fb0b711541 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -989,6 +989,14 @@ void __put_task_struct(struct task_struct *tsk) - } - EXPORT_SYMBOL_GPL(__put_task_struct); - -+void __put_task_struct_rcu_cb(struct rcu_head *rhp) -+{ -+ struct task_struct *task = container_of(rhp, struct task_struct, rcu); -+ -+ __put_task_struct(task); -+} -+EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb); -+ - void __init __weak arch_task_cache_init(void) { } - - /* -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index c52c2eba7c73..83e36547af17 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -7383,6 +7383,19 @@ struct task_struct *idle_task(int cpu) - return cpu_rq(cpu)->idle; - } - -+#ifdef CONFIG_SCHED_CORE -+int sched_core_idle_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (sched_core_enabled(rq) && rq->curr == rq->idle) -+ return 1; -+ -+ return idle_cpu(cpu); -+} -+ -+#endif -+ - #ifdef CONFIG_SMP - /* - * This function computes an effective utilization for the given CPU, to be -@@ -11139,6 +11152,27 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v) - - return 0; - } -+ -+static u64 throttled_time_self(struct task_group *tg) -+{ -+ int i; -+ u64 total = 0; -+ -+ for_each_possible_cpu(i) { -+ total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time); -+ } -+ -+ return total; -+} -+ -+static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) -+{ -+ struct task_group *tg = css_tg(seq_css(sf)); -+ -+ seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg)); -+ -+ return 0; -+} - #endif /* CONFIG_CFS_BANDWIDTH */ - #endif /* CONFIG_FAIR_GROUP_SCHED */ - -@@ -11215,6 +11249,10 @@ static struct cftype cpu_legacy_files[] = { - .name = "stat", - .seq_show = cpu_cfs_stat_show, - }, -+ { -+ .name = "stat.local", -+ .seq_show = cpu_cfs_local_stat_show, -+ }, - #endif - #ifdef CONFIG_RT_GROUP_SCHED - { -@@ -11271,6 +11309,24 @@ static int cpu_extra_stat_show(struct seq_file *sf, - return 0; - } - -+static int cpu_local_stat_show(struct seq_file *sf, -+ struct cgroup_subsys_state *css) -+{ -+#ifdef 
CONFIG_CFS_BANDWIDTH -+ { -+ struct task_group *tg = css_tg(css); -+ u64 throttled_self_usec; -+ -+ throttled_self_usec = throttled_time_self(tg); -+ do_div(throttled_self_usec, NSEC_PER_USEC); -+ -+ seq_printf(sf, "throttled_usec %llu\n", -+ throttled_self_usec); -+ } -+#endif -+ return 0; -+} -+ - #ifdef CONFIG_FAIR_GROUP_SCHED - static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -@@ -11449,6 +11505,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { - .css_released = cpu_cgroup_css_released, - .css_free = cpu_cgroup_css_free, - .css_extra_stat_show = cpu_extra_stat_show, -+ .css_local_stat_show = cpu_local_stat_show, - #ifdef CONFIG_RT_GROUP_SCHED - .can_attach = cpu_cgroup_can_attach, - #endif -diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 066ff1c8ae4e..aeeba46a096b 100644 ---- a/kernel/sched/debug.c -+++ b/kernel/sched/debug.c -@@ -427,6 +427,7 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent) - #undef SDM - - debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); -+ debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops); - } - - void update_sched_domain_debugfs(void) -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index b097a9f4d817..4039ff46fcb3 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -4805,6 +4805,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) - } - - static void check_enqueue_throttle(struct cfs_rq *cfs_rq); -+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); - - static inline bool cfs_bandwidth_used(void); - -@@ -4891,8 +4892,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - - if (cfs_rq->nr_running == 1) { - check_enqueue_throttle(cfs_rq); -- if (!throttled_hierarchy(cfs_rq)) -+ if (!throttled_hierarchy(cfs_rq)) { - list_add_leaf_cfs_rq(cfs_rq); -+ } else { -+#ifdef CONFIG_CFS_BANDWIDTH -+ struct rq *rq = rq_of(cfs_rq); -+ -+ if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) -+ cfs_rq->throttled_clock = rq_clock(rq); -+ if (!cfs_rq->throttled_clock_self) -+ cfs_rq->throttled_clock_self = rq_clock(rq); -+#endif -+ } - } - } - -@@ -5395,6 +5406,17 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) - /* Add cfs_rq with load or one or more already running entities to the list */ - if (!cfs_rq_is_decayed(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); -+ -+ if (cfs_rq->throttled_clock_self) { -+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; -+ -+ cfs_rq->throttled_clock_self = 0; -+ -+ if (SCHED_WARN_ON((s64)delta < 0)) -+ delta = 0; -+ -+ cfs_rq->throttled_clock_self_time += delta; -+ } - } - - return 0; -@@ -5409,6 +5431,10 @@ static int tg_throttle_down(struct task_group *tg, void *data) - if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); - list_del_leaf_cfs_rq(cfs_rq); -+ -+ SCHED_WARN_ON(cfs_rq->throttled_clock_self); -+ if (cfs_rq->nr_running) -+ cfs_rq->throttled_clock_self = rq_clock(rq); - } - cfs_rq->throttle_count++; - -@@ -5498,7 +5524,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) - * throttled-list. rq->lock protects completion. 
- */ - cfs_rq->throttled = 1; -- cfs_rq->throttled_clock = rq_clock(rq); -+ SCHED_WARN_ON(cfs_rq->throttled_clock); -+ if (cfs_rq->nr_running) -+ cfs_rq->throttled_clock = rq_clock(rq); - return true; - } - -@@ -5516,7 +5544,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) - update_rq_clock(rq); - - raw_spin_lock(&cfs_b->lock); -- cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; -+ if (cfs_rq->throttled_clock) { -+ cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; -+ cfs_rq->throttled_clock = 0; -+ } - list_del_rcu(&cfs_rq->throttled_list); - raw_spin_unlock(&cfs_b->lock); - -@@ -7307,9 +7338,6 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) - - util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); - -- if (boost) -- util_est = max(util_est, runnable); -- - /* - * During wake-up @p isn't enqueued yet and doesn't contribute - * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. -@@ -8433,6 +8461,11 @@ enum group_type { - * more powerful CPU. - */ - group_misfit_task, -+ /* -+ * Balance SMT group that's fully busy. Can benefit from migration -+ * a task on SMT with busy sibling to another CPU on idle core. -+ */ -+ group_smt_balance, - /* - * SD_ASYM_PACKING only: One local CPU with higher capacity is available, - * and the task should be migrated to it instead of running on the -@@ -9141,6 +9174,7 @@ struct sg_lb_stats { - unsigned int group_weight; - enum group_type group_type; - unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ -+ unsigned int group_smt_balance; /* Task on busy SMT be moved */ - unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ - #ifdef CONFIG_NUMA_BALANCING - unsigned int nr_numa_running; -@@ -9414,6 +9448,9 @@ group_type group_classify(unsigned int imbalance_pct, - if (sgs->group_asym_packing) - return group_asym_packing; - -+ if (sgs->group_smt_balance) -+ return group_smt_balance; -+ - if (sgs->group_misfit_task_load) - return group_misfit_task; - -@@ -9483,6 +9520,71 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs - return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); - } - -+/* One group has more than one SMT CPU while the other group does not */ -+static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1, -+ struct sched_group *sg2) -+{ -+ if (!sg1 || !sg2) -+ return false; -+ -+ return (sg1->flags & SD_SHARE_CPUCAPACITY) != -+ (sg2->flags & SD_SHARE_CPUCAPACITY); -+} -+ -+static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, -+ struct sched_group *group) -+{ -+ if (env->idle == CPU_NOT_IDLE) -+ return false; -+ -+ /* -+ * For SMT source group, it is better to move a task -+ * to a CPU that doesn't have multiple tasks sharing its CPU capacity. -+ * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY -+ * will not be on. 
-+ */ -+ if (group->flags & SD_SHARE_CPUCAPACITY && -+ sgs->sum_h_nr_running > 1) -+ return true; -+ -+ return false; -+} -+ -+static inline long sibling_imbalance(struct lb_env *env, -+ struct sd_lb_stats *sds, -+ struct sg_lb_stats *busiest, -+ struct sg_lb_stats *local) -+{ -+ int ncores_busiest, ncores_local; -+ long imbalance; -+ -+ if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running) -+ return 0; -+ -+ ncores_busiest = sds->busiest->cores; -+ ncores_local = sds->local->cores; -+ -+ if (ncores_busiest == ncores_local) { -+ imbalance = busiest->sum_nr_running; -+ lsub_positive(&imbalance, local->sum_nr_running); -+ return imbalance; -+ } -+ -+ /* Balance such that nr_running/ncores ratio are same on both groups */ -+ imbalance = ncores_local * busiest->sum_nr_running; -+ lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running); -+ /* Normalize imbalance and do rounding on normalization */ -+ imbalance = 2 * imbalance + ncores_local + ncores_busiest; -+ imbalance /= ncores_local + ncores_busiest; -+ -+ /* Take advantage of resource in an empty sched group */ -+ if (imbalance == 0 && local->sum_nr_running == 0 && -+ busiest->sum_nr_running > 1) -+ imbalance = 2; -+ -+ return imbalance; -+} -+ - static inline bool - sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) - { -@@ -9575,6 +9677,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, - sgs->group_asym_packing = 1; - } - -+ /* Check for loaded SMT group to be balanced to dst CPU */ -+ if (!local_group && smt_balance(env, sgs, group)) -+ sgs->group_smt_balance = 1; -+ - sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); - - /* Computing avg_load makes sense only when group is overloaded */ -@@ -9659,6 +9765,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, - return false; - break; - -+ case group_smt_balance: - case group_fully_busy: - /* - * Select the fully busy group with highest avg_load. In -@@ -9687,6 +9794,18 @@ static bool update_sd_pick_busiest(struct lb_env *env, - break; - - case group_has_spare: -+ /* -+ * Do not pick sg with SMT CPUs over sg with pure CPUs, -+ * as we do not want to pull task off SMT core with one task -+ * and make the core idle. -+ */ -+ if (smt_vs_nonsmt_groups(sds->busiest, sg)) { -+ if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1) -+ return false; -+ else -+ return true; -+ } -+ - /* - * Select not overloaded group with lowest number of idle cpus - * and highest number of running tasks. 
We could also compare -@@ -9883,6 +10002,7 @@ static bool update_pick_idlest(struct sched_group *idlest, - - case group_imbalanced: - case group_asym_packing: -+ case group_smt_balance: - /* Those types are not used in the slow wakeup path */ - return false; - -@@ -10014,6 +10134,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) - - case group_imbalanced: - case group_asym_packing: -+ case group_smt_balance: - /* Those type are not used in the slow wakeup path */ - return NULL; - -@@ -10268,6 +10389,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s - return; - } - -+ if (busiest->group_type == group_smt_balance) { -+ /* Reduce number of tasks sharing CPU capacity */ -+ env->migration_type = migrate_task; -+ env->imbalance = 1; -+ return; -+ } -+ - if (busiest->group_type == group_imbalanced) { - /* - * In the group_imb case we cannot rely on group-wide averages -@@ -10315,14 +10443,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s - } - - if (busiest->group_weight == 1 || sds->prefer_sibling) { -- unsigned int nr_diff = busiest->sum_nr_running; - /* - * When prefer sibling, evenly spread running tasks on - * groups. - */ - env->migration_type = migrate_task; -- lsub_positive(&nr_diff, local->sum_nr_running); -- env->imbalance = nr_diff; -+ env->imbalance = sibling_imbalance(env, sds, busiest, local); - } else { - - /* -@@ -10519,20 +10645,27 @@ static struct sched_group *find_busiest_group(struct lb_env *env) - * group's child domain. - */ - if (sds.prefer_sibling && local->group_type == group_has_spare && -- busiest->sum_nr_running > local->sum_nr_running + 1) -+ sibling_imbalance(env, &sds, busiest, local) > 1) - goto force_balance; - - if (busiest->group_type != group_overloaded) { -- if (env->idle == CPU_NOT_IDLE) -+ if (env->idle == CPU_NOT_IDLE) { - /* - * If the busiest group is not overloaded (and as a - * result the local one too) but this CPU is already - * busy, let another idle CPU try to pull task. - */ - goto out_balanced; -+ } -+ -+ if (busiest->group_type == group_smt_balance && -+ smt_vs_nonsmt_groups(sds.local, sds.busiest)) { -+ /* Let non SMT CPU pull from SMT CPU sharing with sibling */ -+ goto force_balance; -+ } - - if (busiest->group_weight > 1 && -- local->idle_cpus <= (busiest->idle_cpus + 1)) -+ local->idle_cpus <= (busiest->idle_cpus + 1)) { - /* - * If the busiest group is not overloaded - * and there is no imbalance between this and busiest -@@ -10543,12 +10676,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) - * there is more than 1 CPU per group. - */ - goto out_balanced; -+ } - -- if (busiest->sum_h_nr_running == 1) -+ if (busiest->sum_h_nr_running == 1) { - /* - * busiest doesn't have any tasks waiting to run - */ - goto out_balanced; -+ } - } - - force_balance: -@@ -10782,7 +10917,7 @@ static int active_load_balance_cpu_stop(void *data); - static int should_we_balance(struct lb_env *env) - { - struct sched_group *sg = env->sd->groups; -- int cpu; -+ int cpu, idle_smt = -1; - - /* - * Ensure the balancing environment is consistent; can happen -@@ -10809,10 +10944,24 @@ static int should_we_balance(struct lb_env *env) - if (!idle_cpu(cpu)) - continue; - -+ /* -+ * Don't balance to idle SMT in busy core right away when -+ * balancing cores, but remember the first idle SMT CPU for -+ * later consideration. Find CPU on an idle core first. 
-+ */ -+ if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) { -+ if (idle_smt == -1) -+ idle_smt = cpu; -+ continue; -+ } -+ - /* Are we the first idle CPU? */ - return cpu == env->dst_cpu; - } - -+ if (idle_smt == env->dst_cpu) -+ return true; -+ - /* Are we the first CPU of this group ? */ - return group_balance_cpu(sg) == env->dst_cpu; - } -diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c -index 81fca77397f6..2ccb0b2ebd78 100644 ---- a/kernel/sched/psi.c -+++ b/kernel/sched/psi.c -@@ -140,7 +140,7 @@ - static int psi_bug __read_mostly; - - DEFINE_STATIC_KEY_FALSE(psi_disabled); --DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); -+static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); - - #ifdef CONFIG_PSI_DEFAULT_DISABLED - static bool psi_enable; -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index e93e006a942b..9baeb1a2dfdd 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -636,6 +636,8 @@ struct cfs_rq { - u64 throttled_clock; - u64 throttled_clock_pelt; - u64 throttled_clock_pelt_time; -+ u64 throttled_clock_self; -+ u64 throttled_clock_self_time; - int throttled; - int throttle_count; - struct list_head throttled_list; -@@ -1882,6 +1884,7 @@ struct sched_group { - atomic_t ref; - - unsigned int group_weight; -+ unsigned int cores; - struct sched_group_capacity *sgc; - int asym_prefer_cpu; /* CPU of highest priority in group */ - int flags; -diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index d3a3b2646ec4..4bbe1631d950 100644 ---- a/kernel/sched/topology.c -+++ b/kernel/sched/topology.c -@@ -1275,14 +1275,26 @@ build_sched_groups(struct sched_domain *sd, int cpu) - static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) - { - struct sched_group *sg = sd->groups; -+ struct cpumask *mask = sched_domains_tmpmask2; - - WARN_ON(!sg); - - do { -- int cpu, max_cpu = -1; -+ int cpu, cores = 0, max_cpu = -1; - - sg->group_weight = cpumask_weight(sched_group_span(sg)); - -+ cpumask_copy(mask, sched_group_span(sg)); -+ for_each_cpu(cpu, mask) { -+ cores++; -+#ifdef CONFIG_SCHED_SMT -+ cpumask_andnot(mask, mask, cpu_smt_mask(cpu)); -+#else -+ __cpumask_clear_cpu(cpu, mask); -+#endif -+ } -+ sg->cores = cores; -+ - if (!(sd->flags & SD_ASYM_PACKING)) - goto next; - -diff --git a/kernel/softirq.c b/kernel/softirq.c -index 807b34ccd797..210cf5f8d92c 100644 ---- a/kernel/softirq.c -+++ b/kernel/softirq.c -@@ -612,7 +612,7 @@ static inline void tick_irq_exit(void) - int cpu = smp_processor_id(); - - /* Make sure that timer wheel updates are propagated */ -- if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { -+ if ((sched_core_idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { - if (!in_hardirq()) - tick_nohz_irq_exit(); - } --- -2.41.0 - -From 06e0e78e6ce4cea4215ba00474d011f49a3ff8f5 Mon Sep 17 00:00:00 2001 +From 4e22e9e9fa30a6a257a12a24844d77c4e8362b71 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 10 Jul 2023 17:11:55 +0200 -Subject: [PATCH 7/7] zstd +Subject: [PATCH 5/5] zstd Signed-off-by: Peter Jung --- diff --git a/patches/0002-eevdf.patch b/patches/0002-eevdf.patch index 590e6a9..363a80d 100644 --- a/patches/0002-eevdf.patch +++ b/patches/0002-eevdf.patch @@ -1,7 +1,7 @@ -From 0af97bb369de3bfe15d724e9bb0e3c971c6f9f20 Mon Sep 17 00:00:00 2001 +From 218c51e49185b75b4e36c8f11b5c77686f955a0a Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 10 Jul 2023 17:12:45 +0200 -Subject: [PATCH] EEVDF-cachy +Date: Sun, 30 Jul 2023 09:38:51 +0200 +Subject: [PATCH] EEVDF Signed-off-by: Peter 
Jung --- @@ -13,11 +13,11 @@ Signed-off-by: Peter Jung init/init_task.c | 3 +- kernel/sched/core.c | 65 +- kernel/sched/debug.c | 49 +- - kernel/sched/fair.c | 1157 +++++++++++------------ - kernel/sched/features.h | 24 +- - kernel/sched/sched.h | 22 +- + kernel/sched/fair.c | 1138 +++++++++++------------ + kernel/sched/features.h | 23 +- + kernel/sched/sched.h | 21 +- tools/include/uapi/linux/sched.h | 4 +- - 12 files changed, 733 insertions(+), 658 deletions(-) + 12 files changed, 702 insertions(+), 668 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 4ef890191196..3a8d3e1e5591 100644 @@ -78,7 +78,7 @@ index 7ee7ed5de722..6dbc5a1bf6a8 100644 * Template for declaring augmented rbtree callbacks (generic case) * diff --git a/include/linux/sched.h b/include/linux/sched.h -index efc9f4bdc4ca..e99a9aa6a972 100644 +index 609bde814cb0..c940c4dc8304 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -549,13 +549,18 @@ struct sched_entity { @@ -196,7 +196,7 @@ index ff6c4b9bfe6b..511cbcf3510d 100644 .rt = { .run_list = LIST_HEAD_INIT(init_task.rt.run_list), diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 83e36547af17..8a541fe2d462 100644 +index c52c2eba7c73..aff81e12460e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_struct *p, bool update_load) @@ -232,7 +232,7 @@ index 83e36547af17..8a541fe2d462 100644 /* * We don't need the reset flag anymore after the fork. It has -@@ -7529,7 +7539,7 @@ static struct task_struct *find_process_by_pid(pid_t pid) +@@ -7516,7 +7526,7 @@ static struct task_struct *find_process_by_pid(pid_t pid) #define SETPARAM_POLICY -1 static void __setscheduler_params(struct task_struct *p, @@ -241,7 +241,7 @@ index 83e36547af17..8a541fe2d462 100644 { int policy = attr->sched_policy; -@@ -7553,6 +7563,13 @@ static void __setscheduler_params(struct task_struct *p, +@@ -7540,6 +7550,13 @@ static void __setscheduler_params(struct task_struct *p, set_load_weight(p, true); } @@ -255,7 +255,7 @@ index 83e36547af17..8a541fe2d462 100644 /* * Check the target process has a UID that matches the current process's: */ -@@ -7687,6 +7704,13 @@ static int __sched_setscheduler(struct task_struct *p, +@@ -7674,6 +7691,13 @@ static int __sched_setscheduler(struct task_struct *p, return retval; } @@ -269,7 +269,7 @@ index 83e36547af17..8a541fe2d462 100644 /* Update task specific "requested" clamps */ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { retval = uclamp_validate(p, attr); -@@ -7734,6 +7758,9 @@ static int __sched_setscheduler(struct task_struct *p, +@@ -7721,6 +7745,9 @@ static int __sched_setscheduler(struct task_struct *p, goto change; if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) goto change; @@ -279,7 +279,7 @@ index 83e36547af17..8a541fe2d462 100644 p->sched_reset_on_fork = reset_on_fork; retval = 0; -@@ -7822,6 +7849,7 @@ static int __sched_setscheduler(struct task_struct *p, +@@ -7809,6 +7836,7 @@ static int __sched_setscheduler(struct task_struct *p, __setscheduler_params(p, attr); __setscheduler_prio(p, newprio); } @@ -287,7 +287,7 @@ index 83e36547af17..8a541fe2d462 100644 __setscheduler_uclamp(p, attr); if (queued) { -@@ -8033,6 +8061,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a +@@ -8020,6 +8048,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a size < SCHED_ATTR_SIZE_VER1) return -EINVAL; @@ -297,7 +297,7 @@ index 
83e36547af17..8a541fe2d462 100644 /* * XXX: Do we want to be lenient like existing syscalls; or do we want * to be strict and return an error on out-of-bounds values? -@@ -8270,6 +8301,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, +@@ -8257,6 +8288,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, get_params(p, &kattr); kattr.sched_flags &= SCHED_FLAG_ALL; @@ -306,7 +306,7 @@ index 83e36547af17..8a541fe2d462 100644 #ifdef CONFIG_UCLAMP_TASK /* * This could race with another potential updater, but this is fine -@@ -11214,6 +11247,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, +@@ -11180,6 +11213,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, { return sched_group_set_idle(css_tg(css), idle); } @@ -332,7 +332,7 @@ index 83e36547af17..8a541fe2d462 100644 #endif static struct cftype cpu_legacy_files[] = { -@@ -11228,6 +11280,11 @@ static struct cftype cpu_legacy_files[] = { +@@ -11194,6 +11246,11 @@ static struct cftype cpu_legacy_files[] = { .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, }, @@ -344,7 +344,7 @@ index 83e36547af17..8a541fe2d462 100644 #endif #ifdef CONFIG_CFS_BANDWIDTH { -@@ -11467,6 +11524,12 @@ static struct cftype cpu_files[] = { +@@ -11411,6 +11468,12 @@ static struct cftype cpu_files[] = { .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, }, @@ -358,7 +358,7 @@ index 83e36547af17..8a541fe2d462 100644 #ifdef CONFIG_CFS_BANDWIDTH { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index aeeba46a096b..5c743bcb340d 100644 +index 066ff1c8ae4e..e7e83181fbb6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -347,10 +347,7 @@ static __init int sched_init_debug(void) @@ -373,7 +373,7 @@ index aeeba46a096b..5c743bcb340d 100644 debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); -@@ -582,9 +579,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +@@ -581,9 +578,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " %c", task_state_to_char(p)); @@ -388,7 +388,7 @@ index aeeba46a096b..5c743bcb340d 100644 (long long)(p->nvcsw + p->nivcsw), p->prio); -@@ -627,10 +628,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) +@@ -626,10 +627,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { @@ -401,7 +401,7 @@ index aeeba46a096b..5c743bcb340d 100644 unsigned long flags; #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -644,26 +644,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) +@@ -643,26 +643,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_rq_lock_irqsave(rq, flags); @@ -441,7 +441,7 @@ index aeeba46a096b..5c743bcb340d 100644 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); -@@ -864,10 +863,7 @@ static void sched_debug_header(struct seq_file *m) +@@ -863,10 +862,7 @@ static void sched_debug_header(struct seq_file *m) SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) @@ -453,7 +453,7 @@ index aeeba46a096b..5c743bcb340d 100644 P(sysctl_sched_child_runs_first); 
P(sysctl_sched_features); #undef PN -@@ -1090,6 +1086,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, +@@ -1089,6 +1085,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, #endif P(policy); P(prio); @@ -462,7 +462,7 @@ index aeeba46a096b..5c743bcb340d 100644 P(dl.runtime); P(dl.deadline); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 4039ff46fcb3..0fbb8fb24a50 100644 +index 2c335df30171..461409c0eac7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -47,6 +47,7 @@ @@ -594,7 +594,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 const struct sched_class fair_sched_class; -@@ -619,13 +569,200 @@ static inline bool entity_before(const struct sched_entity *a, +@@ -619,13 +569,198 @@ static inline bool entity_before(const struct sched_entity *a, return (s64)(a->vruntime - b->vruntime) < 0; } @@ -671,7 +671,6 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 + s64 key = entity_key(cfs_rq, se); + + cfs_rq->avg_vruntime += key * weight; -+ cfs_rq->avg_slice += se->slice * weight; + cfs_rq->avg_load += weight; +} + @@ -682,7 +681,6 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 + s64 key = entity_key(cfs_rq, se); + + cfs_rq->avg_vruntime -= key * weight; -+ cfs_rq->avg_slice -= se->slice * weight; + cfs_rq->avg_load -= weight; +} + @@ -796,7 +794,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 u64 vruntime = cfs_rq->min_vruntime; -@@ -636,9 +773,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) +@@ -636,9 +771,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) curr = NULL; } @@ -807,7 +805,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 if (!curr) vruntime = se->vruntime; else -@@ -647,7 +782,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) +@@ -647,7 +780,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) /* ensure we never gain time by being placed backwards. */ u64_u32_store(cfs_rq->min_vruntime, @@ -816,7 +814,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) -@@ -655,17 +790,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +@@ -655,17 +788,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) return entity_before(__node_2_se(a), __node_2_se(b)); } @@ -870,7 +868,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) -@@ -678,14 +847,81 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) +@@ -678,14 +845,81 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) return __node_2_se(left); } @@ -927,8 +925,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 + if (best->deadline == best->min_deadline) + break; + } - -- return __node_2_se(next); ++ + /* + * If the earlest deadline in this subtree is in the fully + * eligible left half of our space, go there. 
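The avg_vruntime hunks above maintain the weighted mean virtual runtime V incrementally, with each key taken relative to cfs_rq->min_vruntime so the products stay small. Below is a compilable toy sketch of that bookkeeping, handy for checking the arithmetic by hand; the struct, helper names and numbers are illustrative only and not part of the patch, and the kernel code additionally folds in the currently running entity and uses div_s64():

#include <stdio.h>
#include <stdint.h>

/* Toy mirror of the cfs_rq fields touched by avg_vruntime_add()/avg_vruntime(). */
struct toy_rq {
	int64_t  avg_vruntime;	/* sum of w_i * (v_i - min_vruntime) */
	uint64_t avg_load;	/* sum of w_i */
	uint64_t min_vruntime;
};

static void toy_enqueue(struct toy_rq *rq, uint64_t weight, uint64_t vruntime)
{
	int64_t key = (int64_t)(vruntime - rq->min_vruntime);

	rq->avg_vruntime += key * (int64_t)weight;	/* as in avg_vruntime_add() */
	rq->avg_load += weight;
}

/* V = min_vruntime + (sum of w_i * key_i) / (sum of w_i) */
static uint64_t toy_avg_vruntime(const struct toy_rq *rq)
{
	return rq->min_vruntime + (uint64_t)(rq->avg_vruntime / (int64_t)rq->avg_load);
}

int main(void)
{
	struct toy_rq rq = { .min_vruntime = 1000 };

	toy_enqueue(&rq, 1024, 1000);	/* nice-0 task, right on time */
	toy_enqueue(&rq, 1024, 1300);	/* nice-0 task, far ahead of its service */
	toy_enqueue(&rq, 335, 1040);	/* nice-5 task */

	uint64_t v = toy_avg_vruntime(&rq);

	/* An entity is eligible when its vruntime has not passed V (lag >= 0). */
	printf("V = %llu, so the task at vruntime 1300 is %seligible\n",
	       (unsigned long long)v, 1300 <= v ? "" : "not ");
	return 0;
}

With these made-up values V works out to 1134, so the task that ran far ahead is ineligible and pick_eevdf() would not select it.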
@@ -941,7 +938,8 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 + + node = node->rb_right; + } -+ + +- return __node_2_se(next); + if (!best || (curr && deadline_gt(deadline, best, curr))) + best = curr; + @@ -957,7 +955,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } #ifdef CONFIG_SCHED_DEBUG -@@ -707,104 +943,53 @@ int sched_update_scaling(void) +@@ -707,104 +941,53 @@ int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -1090,7 +1088,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } #include "pelt.h" -@@ -939,6 +1124,7 @@ static void update_curr(struct cfs_rq *cfs_rq) +@@ -939,6 +1122,7 @@ static void update_curr(struct cfs_rq *cfs_rq) schedstat_add(cfs_rq->exec_clock, delta_exec); curr->vruntime += calc_delta_fair(delta_exec, curr); @@ -1098,7 +1096,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { -@@ -3393,16 +3579,36 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +@@ -3393,16 +3577,36 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { @@ -1135,7 +1133,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 #ifdef CONFIG_SMP do { u32 divider = get_pelt_divider(&se->avg); -@@ -3412,9 +3618,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, +@@ -3412,9 +3616,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, #endif enqueue_load_avg(cfs_rq, se); @@ -1149,7 +1147,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } void reweight_task(struct task_struct *p, int prio) -@@ -4710,98 +4918,140 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} +@@ -4710,158 +4916,123 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ @@ -1167,94 +1165,42 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 -} - -static inline bool entity_is_long_sleeper(struct sched_entity *se) -+static inline bool -+entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 vslice, int flags) ++static void ++place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - struct cfs_rq *cfs_rq; - u64 sleep_time; -+ u64 now, vdelta; -+ s64 delta; - -- if (se->exec_start == 0) -+ if (!(flags & ENQUEUE_WAKEUP)) - return false; - -- cfs_rq = cfs_rq_of(se); - -- sleep_time = rq_clock_task(rq_of(cfs_rq)); -+ if (flags & ENQUEUE_MIGRATED) -+ return true; - -- /* Happen while migrating because of clock task divergence */ -- if (sleep_time <= se->exec_start) -+ now = rq_clock_task(rq_of(cfs_rq)); -+ delta = now - se->exec_start; -+ if (delta < 0) - return false; - -- sleep_time -= se->exec_start; -- if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD))) -- return true; -+ vdelta = __calc_delta(delta, NICE_0_LOAD, &cfs_rq->load); -+ if (vdelta < vslice) -+ return false; - -- return false; -+ return true; - } - - static void --place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) -+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) - { -- u64 vruntime = cfs_rq->min_vruntime; +- if (se->exec_start == 0) +- return false; +- +- cfs_rq = cfs_rq_of(se); + u64 vslice = calc_delta_fair(se->slice, se); + u64 vruntime = avg_vruntime(cfs_rq); + s64 lag = 0; - /* -- * The 'current' period is already promised to the current tasks, -- * however the extra weight of the new task will slow them down a -- * little, place the new task so that 
it fits in the slot that -- * stays open at the end. +- sleep_time = rq_clock_task(rq_of(cfs_rq)); ++ /* + * Due to how V is constructed as the weighted average of entities, + * adding tasks with positive lag, or removing tasks with negative lag + * will move 'time' backwards, this can screw around with the lag of + * other tasks. + * + * EEVDF: placement strategy #1 / #2 - */ -- if (initial && sched_feat(START_DEBIT)) -- vruntime += sched_vslice(cfs_rq, se); ++ */ + if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { + struct sched_entity *curr = cfs_rq->curr; + unsigned long load; -- /* sleeps up to a single latency don't count. */ -- if (!initial) { -- unsigned long thresh; +- /* Happen while migrating because of clock task divergence */ +- if (sleep_time <= se->exec_start) +- return false; + lag = se->vlag; -- if (se_is_idle(se)) -- thresh = sysctl_sched_min_granularity; -- else -- thresh = sysctl_sched_latency; +- sleep_time -= se->exec_start; +- if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD))) +- return true; + /* -+ * For latency sensitive tasks; those that have a shorter than -+ * average slice and do not fully consume the slice, transition -+ * to EEVDF placement strategy #2. -+ */ -+ if (sched_feat(PLACE_FUDGE) && -+ (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) && -+ entity_has_slept(cfs_rq, se, vslice, flags)) { -+ lag += vslice; -+ if (lag > 0) -+ lag = 0; -+ } - - /* -- * Halve their sleep time's effect, to allow -- * for a gentler effect of sleepers: + * If we want to place a task and preserve lag, we have to + * consider the effect of the new entity on the weighted + * average and compensate for this, otherwise lag can quickly @@ -1305,7 +1251,52 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 + * = W*vl_i + * + * vl_i = (W + w_i)*vl'_i / W - */ ++ */ ++ load = cfs_rq->avg_load; ++ if (curr && curr->on_rq) ++ load += scale_load_down(curr->load.weight); + +- return false; +-} ++ lag *= load + scale_load_down(se->load.weight); ++ if (WARN_ON_ONCE(!load)) ++ load = 1; ++ lag = div_s64(lag, load); ++ } + +-static void +-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) +-{ +- u64 vruntime = cfs_rq->min_vruntime; ++ se->vruntime = vruntime - lag; + + /* +- * The 'current' period is already promised to the current tasks, +- * however the extra weight of the new task will slow them down a +- * little, place the new task so that it fits in the slot that +- * stays open at the end. ++ * When joining the competition; the exisiting tasks will be, ++ * on average, halfway through their slice, as such start tasks ++ * off with half a slice to ease into the competition. + */ +- if (initial && sched_feat(START_DEBIT)) +- vruntime += sched_vslice(cfs_rq, se); +- +- /* sleeps up to a single latency don't count. 
*/ +- if (!initial) { +- unsigned long thresh; +- +- if (se_is_idle(se)) +- thresh = sysctl_sched_min_granularity; +- else +- thresh = sysctl_sched_latency; ++ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) ++ vslice /= 2; + +- /* +- * Halve their sleep time's effect, to allow +- * for a gentler effect of sleepers: +- */ - if (sched_feat(GENTLE_FAIR_SLEEPERS)) - thresh >>= 1; - @@ -1335,26 +1326,6 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 - se->vruntime = vruntime; - else - se->vruntime = max_vruntime(se->vruntime, vruntime); -+ load = cfs_rq->avg_load; -+ if (curr && curr->on_rq) -+ load += scale_load_down(curr->load.weight); -+ -+ lag *= load + scale_load_down(se->load.weight); -+ if (WARN_ON_ONCE(!load)) -+ load = 1; -+ lag = div_s64(lag, load); -+ } -+ -+ se->vruntime = vruntime - lag; -+ -+ /* -+ * When joining the competition; the exisiting tasks will be, -+ * on average, halfway through their slice, as such start tasks -+ * off with half a slice to ease into the competition. -+ */ -+ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) -+ vslice /= 2; -+ + /* + * EEVDF: vd_i = ve_i + r_i/w_i + */ @@ -1362,7 +1333,6 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); -@@ -4809,60 +5059,20 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static inline bool cfs_bandwidth_used(void); @@ -1425,7 +1395,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 /* * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. -@@ -4874,18 +5084,28 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -4873,18 +5044,28 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); se_update_runnable(se); @@ -1457,7 +1427,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; -@@ -4907,17 +5127,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -4896,17 +5077,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } } @@ -1475,7 +1445,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 static void __clear_buddies_next(struct sched_entity *se) { for_each_sched_entity(se) { -@@ -4929,27 +5138,10 @@ static void __clear_buddies_next(struct sched_entity *se) +@@ -4918,27 +5088,10 @@ static void __clear_buddies_next(struct sched_entity *se) } } @@ -1503,7 +1473,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); -@@ -4983,20 +5175,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -4972,20 +5125,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) clear_buddies(cfs_rq, se); @@ -1525,7 +1495,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); -@@ -5015,52 +5199,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -5004,52 +5149,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_idle_cfs_rq_clock_pelt(cfs_rq); } @@ -1578,7 +1548,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { -@@ -5099,9 +5237,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -5088,9 +5187,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 
se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -1588,7 +1558,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 /* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups -@@ -5112,50 +5247,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); +@@ -5101,50 +5197,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { @@ -1644,7 +1614,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); -@@ -5172,8 +5271,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) +@@ -5161,8 +5221,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); @@ -1653,7 +1623,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 if (prev->on_rq) { update_stats_wait_start_fair(cfs_rq, prev); /* Put 'current' back into the tree. */ -@@ -5214,9 +5311,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) +@@ -5203,9 +5261,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) return; #endif @@ -1663,7 +1633,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } -@@ -6259,13 +6353,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} +@@ -6228,13 +6283,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; @@ -1678,7 +1648,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 s64 delta = slice - ran; if (delta < 0) { -@@ -6289,8 +6382,7 @@ static void hrtick_update(struct rq *rq) +@@ -6258,8 +6312,7 @@ static void hrtick_update(struct rq *rq) if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) return; @@ -1688,7 +1658,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } #else /* !CONFIG_SCHED_HRTICK */ static inline void -@@ -6331,17 +6423,6 @@ static int sched_idle_rq(struct rq *rq) +@@ -6300,17 +6353,6 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } @@ -1706,7 +1676,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 #ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { -@@ -7844,18 +7925,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) +@@ -7816,18 +7858,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { struct sched_entity *se = &p->se; @@ -1725,7 +1695,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 if (!task_on_rq_migrating(p)) { remove_entity_load_avg(se); -@@ -7893,66 +7962,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +@@ -7865,66 +7895,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } #endif /* CONFIG_SMP */ @@ -1792,7 +1762,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { -@@ -7964,12 +7973,6 @@ static void set_next_buddy(struct sched_entity *se) +@@ -7936,12 +7906,6 @@ static void set_next_buddy(struct sched_entity *se) } } @@ -1805,7 +1775,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 /* * Preempt the current task with a newly woken task if needed: */ -@@ -7978,7 +7981,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7950,7 +7914,6 @@ static void check_preempt_wakeup(struct 
rq *rq, struct task_struct *p, int wake_ struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); @@ -1813,7 +1783,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 int next_buddy_marked = 0; int cse_is_idle, pse_is_idle; -@@ -7994,7 +7996,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7966,7 +7929,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) return; @@ -1822,7 +1792,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 set_next_buddy(pse); next_buddy_marked = 1; } -@@ -8039,35 +8041,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -8011,35 +7974,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (cse_is_idle != pse_is_idle) return; @@ -1865,7 +1835,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } #ifdef CONFIG_SMP -@@ -8268,8 +8254,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +@@ -8240,8 +8187,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) /* * sched_yield() is very simple @@ -1874,7 +1844,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 */ static void yield_task_fair(struct rq *rq) { -@@ -8285,21 +8269,19 @@ static void yield_task_fair(struct rq *rq) +@@ -8257,21 +8202,19 @@ static void yield_task_fair(struct rq *rq) clear_buddies(cfs_rq, se); @@ -1908,7 +1878,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) -@@ -8547,8 +8529,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) +@@ -8514,8 +8457,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) * Buddy candidates are cache hot: */ if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && @@ -1918,7 +1888,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 return 1; if (sysctl_sched_migration_cost == -1) -@@ -12174,8 +12155,8 @@ static void rq_offline_fair(struct rq *rq) +@@ -12025,8 +11967,8 @@ static void rq_offline_fair(struct rq *rq) static inline bool __entity_slice_used(struct sched_entity *se, int min_nr_tasks) { @@ -1928,7 +1898,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 return (rtime * min_nr_tasks > slice); } -@@ -12331,8 +12312,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) +@@ -12182,8 +12124,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) */ static void task_fork_fair(struct task_struct *p) { @@ -1938,7 +1908,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 struct rq *rq = this_rq(); struct rq_flags rf; -@@ -12341,22 +12322,9 @@ static void task_fork_fair(struct task_struct *p) +@@ -12192,22 +12134,9 @@ static void task_fork_fair(struct task_struct *p) cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; @@ -1963,7 +1933,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 rq_unlock(rq, &rf); } -@@ -12385,34 +12353,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +@@ -12236,34 +12165,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) check_preempt_curr(rq, p, 0); } @@ -1998,7 +1968,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 #ifdef CONFIG_FAIR_GROUP_SCHED /* * Propagate the changes of the sched_entity across the tg tree to make it -@@ -12483,16 +12423,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) +@@ -12334,16 +12235,6 @@ static void attach_entity_cfs_rq(struct sched_entity 
*se) static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; @@ -2015,7 +1985,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 detach_entity_cfs_rq(se); } -@@ -12500,12 +12430,8 @@ static void detach_task_cfs_rq(struct task_struct *p) +@@ -12351,12 +12242,8 @@ static void detach_task_cfs_rq(struct task_struct *p) static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; @@ -2028,7 +1998,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 } static void switched_from_fair(struct rq *rq, struct task_struct *p) -@@ -12616,6 +12542,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +@@ -12467,6 +12354,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) goto err; tg->shares = NICE_0_LOAD; @@ -2036,7 +2006,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 init_cfs_bandwidth(tg_cfs_bandwidth(tg)); -@@ -12714,6 +12641,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, +@@ -12565,6 +12453,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, } se->my_q = cfs_rq; @@ -2046,7 +2016,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 /* guarantee group entities always have weight */ update_load_set(&se->load, NICE_0_LOAD); se->parent = parent; -@@ -12844,6 +12774,29 @@ int sched_group_set_idle(struct task_group *tg, long idle) +@@ -12695,6 +12586,29 @@ int sched_group_set_idle(struct task_group *tg, long idle) return 0; } @@ -2076,7 +2046,7 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 #else /* CONFIG_FAIR_GROUP_SCHED */ void free_fair_sched_group(struct task_group *tg) { } -@@ -12870,7 +12823,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task +@@ -12721,7 +12635,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * idle runqueue: */ if (rq->cfs.load.weight) @@ -2086,10 +2056,10 @@ index 4039ff46fcb3..0fbb8fb24a50 100644 return rr_interval; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index ee7f23c76bd3..7d65b40299d9 100644 +index ee7f23c76bd3..54334ca5c5c6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h -@@ -1,16 +1,12 @@ +@@ -1,16 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* - * Only give sleepers 50% of their service deficit. This allows @@ -2106,12 +2076,11 @@ index ee7f23c76bd3..7d65b40299d9 100644 */ -SCHED_FEAT(START_DEBIT, true) +SCHED_FEAT(PLACE_LAG, true) -+SCHED_FEAT(PLACE_FUDGE, true) +SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) /* * Prefer to schedule the task we woke last (assuming it failed -@@ -19,13 +15,6 @@ SCHED_FEAT(START_DEBIT, true) +@@ -19,13 +14,6 @@ SCHED_FEAT(START_DEBIT, true) */ SCHED_FEAT(NEXT_BUDDY, false) @@ -2125,7 +2094,7 @@ index ee7f23c76bd3..7d65b40299d9 100644 /* * Consider buddies to be cache hot, decreases the likeliness of a * cache buddy being migrated away, increases cache locality. 
-@@ -98,6 +87,3 @@ SCHED_FEAT(UTIL_EST, true) +@@ -98,6 +86,3 @@ SCHED_FEAT(UTIL_EST, true) SCHED_FEAT(UTIL_EST_FASTUP, true) SCHED_FEAT(LATENCY_WARN, false) @@ -2133,7 +2102,7 @@ index ee7f23c76bd3..7d65b40299d9 100644 -SCHED_FEAT(ALT_PERIOD, true) -SCHED_FEAT(BASE_SLICE, true) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 9baeb1a2dfdd..4236c4c893aa 100644 +index e93e006a942b..67cd7e1fd501 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -372,6 +372,8 @@ struct task_group { @@ -2154,18 +2123,17 @@ index 9baeb1a2dfdd..4236c4c893aa 100644 #ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); -@@ -548,6 +552,10 @@ struct cfs_rq { +@@ -548,6 +552,9 @@ struct cfs_rq { unsigned int idle_nr_running; /* SCHED_IDLE */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ + s64 avg_vruntime; -+ u64 avg_slice; + u64 avg_load; + u64 exec_clock; u64 min_vruntime; #ifdef CONFIG_SCHED_CORE -@@ -567,8 +575,6 @@ struct cfs_rq { +@@ -567,8 +574,6 @@ struct cfs_rq { */ struct sched_entity *curr; struct sched_entity *next; @@ -2174,7 +2142,7 @@ index 9baeb1a2dfdd..4236c4c893aa 100644 #ifdef CONFIG_SCHED_DEBUG unsigned int nr_spread_over; -@@ -2198,6 +2204,7 @@ extern const u32 sched_prio_to_wmult[40]; +@@ -2195,6 +2200,7 @@ extern const u32 sched_prio_to_wmult[40]; #else #define ENQUEUE_MIGRATED 0x00 #endif @@ -2182,7 +2150,7 @@ index 9baeb1a2dfdd..4236c4c893aa 100644 #define RETRY_TASK ((void *)-1UL) -@@ -2502,11 +2509,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); +@@ -2499,11 +2505,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; @@ -2196,7 +2164,7 @@ index 9baeb1a2dfdd..4236c4c893aa 100644 extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; -@@ -2519,6 +2524,8 @@ extern unsigned int sysctl_numa_balancing_scan_size; +@@ -2516,6 +2520,8 @@ extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_hot_threshold; #endif @@ -2205,7 +2173,7 @@ index 9baeb1a2dfdd..4236c4c893aa 100644 #ifdef CONFIG_SCHED_HRTICK /* -@@ -3483,4 +3490,7 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } +@@ -3480,4 +3486,7 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } static inline void init_sched_mm_cid(struct task_struct *t) { } #endif diff --git a/patches/0002-eevdfbore.patch b/patches/0002-eevdfbore.patch index 6d53439..0465cdf 100644 --- a/patches/0002-eevdfbore.patch +++ b/patches/0002-eevdfbore.patch @@ -1,49 +1,76 @@ -From e6e251fb3f3927c18ac4f2a22a43c6c198133d19 Mon Sep 17 00:00:00 2001 -From: Piotr Gorski -Date: Sun, 23 Jul 2023 09:46:42 +0200 +From 377657f92d256b364813e3f8b2a58edfc9833815 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sun, 30 Jul 2023 09:43:51 +0200 Subject: [PATCH] bore-eevdf -Signed-off-by: Piotr Gorski +Signed-off-by: Peter Jung --- - include/linux/sched.h | 10 ++ + include/linux/sched.h | 30 ++++++ init/Kconfig | 20 ++++ - kernel/sched/core.c | 117 +++++++++++++++++++++++ + kernel/sched/core.c | 118 +++++++++++++++++++++ kernel/sched/debug.c | 4 + - kernel/sched/fair.c | 203 ++++++++++++++++++++++++++++++++++++++-- + kernel/sched/fair.c | 228 ++++++++++++++++++++++++++++++++++++++-- kernel/sched/features.h | 4 + kernel/sched/sched.h | 1 + - 7 files changed, 351 insertions(+), 8 
deletions(-) + 7 files changed, 397 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h -index e99a9aa6a..14a1ce058 100644 +index c940c4dc8304..8663c0813f81 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h -@@ -559,6 +559,12 @@ struct sched_entity { +@@ -545,6 +545,26 @@ struct sched_statistics { + #endif /* CONFIG_SCHEDSTATS */ + } ____cacheline_aligned; + ++#ifdef CONFIG_SCHED_BORE ++union union16 { ++ u16 u16; ++ s16 s16; ++ u8 u8[2]; ++ s8 s8[2]; ++}; ++typedef union union16 x16; ++ ++union union32 { ++ u32 u32; ++ s32 s32; ++ u16 u16[2]; ++ s16 s16[2]; ++ u8 u8[4]; ++ s8 s8[4]; ++}; ++typedef union union32 x32; ++#endif // CONFIG_SCHED_BORE ++ + struct sched_entity { + /* For load-balancing: */ + struct load_weight load; +@@ -559,6 +579,12 @@ struct sched_entity { u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; +#ifdef CONFIG_SCHED_BORE -+ u64 prev_burst_time; + u64 burst_time; -+ u64 max_burst_time; -+ u8 penalty_score; ++ u16 prev_burst_penalty; ++ u16 curr_burst_penalty; ++ u16 burst_penalty; +#endif // CONFIG_SCHED_BORE s64 vlag; u64 slice; -@@ -990,6 +996,10 @@ struct task_struct { +@@ -990,6 +1016,10 @@ struct task_struct { struct list_head children; struct list_head sibling; struct task_struct *group_leader; +#ifdef CONFIG_SCHED_BORE -+ u64 child_burst_cache; ++ u16 child_burst_cache; + u64 child_burst_last_cached; +#endif // CONFIG_SCHED_BORE /* * 'ptraced' is the list of tasks this task is using ptrace() on. diff --git a/init/Kconfig b/init/Kconfig -index 71755cc8e..c697be79e 100644 +index 71755cc8ed3e..c697be79e594 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1277,6 +1277,26 @@ config CHECKPOINT_RESTORE @@ -74,30 +101,31 @@ index 71755cc8e..c697be79e 100644 bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 8a541fe2d..13969a3a3 100644 +index aff81e12460e..839605620f63 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -4491,6 +4491,112 @@ int wake_up_state(struct task_struct *p, unsigned int state) +@@ -4491,6 +4491,113 @@ int wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } +#ifdef CONFIG_SCHED_BORE -+#define CHILD_BURST_CUTOFF_BITS 9 +extern unsigned int sched_burst_cache_lifetime; +extern unsigned int sched_burst_fork_atavistic; + +void __init sched_init_bore(void) { + init_task.child_burst_cache = 0; + init_task.child_burst_last_cached = 0; -+ init_task.se.prev_burst_time = 0; + init_task.se.burst_time = 0; -+ init_task.se.max_burst_time = 0; ++ init_task.se.prev_burst_penalty = 0; ++ init_task.se.curr_burst_penalty = 0; ++ init_task.se.burst_penalty = 0; +} + +void inline sched_fork_bore(struct task_struct *p) { + p->child_burst_cache = 0; + p->child_burst_last_cached = 0; + p->se.burst_time = 0; ++ p->se.curr_burst_penalty = 0; +} + +static u32 count_child_tasks(struct task_struct *p) { @@ -112,31 +140,31 @@ index 8a541fe2d..13969a3a3 100644 +} + +static void __update_child_burst_cache( -+ struct task_struct *p, u32 cnt, u64 sum, u64 now) { -+ u64 avg = 0; -+ if (cnt) avg = div_u64(sum, cnt) << CHILD_BURST_CUTOFF_BITS; -+ p->child_burst_cache = max(avg, p->se.max_burst_time); ++ struct task_struct *p, u32 cnt, u32 sum, u64 now) { ++ u16 avg = 0; ++ if (cnt) avg = DIV_ROUND_CLOSEST(sum, cnt); ++ p->child_burst_cache = max(avg, p->se.burst_penalty); + p->child_burst_last_cached = now; +} + +static void update_child_burst_cache(struct task_struct *p, u64 now) { + struct 
task_struct *child; + u32 cnt = 0; -+ u64 sum = 0; ++ u32 sum = 0; + + list_for_each_entry(child, &p->children, sibling) { + cnt++; -+ sum += child->se.max_burst_time >> CHILD_BURST_CUTOFF_BITS; ++ sum += child->se.burst_penalty; + } + + __update_child_burst_cache(p, cnt, sum, now); +} + +static void update_child_burst_cache_atavistic( -+ struct task_struct *p, u64 now, u32 depth, u32 *acnt, u64 *asum) { ++ struct task_struct *p, u64 now, u32 depth, u32 *acnt, u32 *asum) { + struct task_struct *child, *dec; + u32 cnt = 0, dcnt = 0; -+ u64 sum = 0; ++ u32 sum = 0; + + list_for_each_entry(child, &p->children, sibling) { + dec = child; @@ -145,13 +173,13 @@ index 8a541fe2d..13969a3a3 100644 + + if (!dcnt || !depth) { + cnt++; -+ sum += dec->se.max_burst_time >> CHILD_BURST_CUTOFF_BITS; ++ sum += dec->se.burst_penalty; + } else { + if (child_burst_cache_expired(dec, now)) + update_child_burst_cache_atavistic(dec, now, depth - 1, &cnt, &sum); + else { + cnt += dcnt; -+ sum += (dec->child_burst_cache >> CHILD_BURST_CUTOFF_BITS) * dcnt; ++ sum += (dec->child_burst_cache) * dcnt; + } + } + } @@ -161,12 +189,12 @@ index 8a541fe2d..13969a3a3 100644 + *asum += sum; +} + -+static void update_task_initial_burst_time(struct task_struct *p) { ++static void fork_burst_penalty(struct task_struct *p) { + struct sched_entity *se = &p->se; + struct task_struct *anc = p->real_parent; + u64 now = ktime_get_ns(); + u32 cnt = 0; -+ u64 sum = 0; ++ u32 sum = 0; + + read_lock(&tasklist_lock); + @@ -182,15 +210,15 @@ index 8a541fe2d..13969a3a3 100644 + + read_unlock(&tasklist_lock); + -+ se->max_burst_time = se->prev_burst_time = -+ max(se->prev_burst_time, anc->child_burst_cache); ++ se->burst_penalty = se->prev_burst_penalty = ++ max(se->prev_burst_penalty, anc->child_burst_cache); +} +#endif // CONFIG_SCHED_BORE + /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
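Taken together, the hunks above give a forked task its starting burst penalty from its ancestry: __update_child_burst_cache() caches max(average of the children's penalties, the parent's own penalty), and fork_burst_penalty() starts the child from max(its previous penalty, that cache). A small self-contained sketch of that arithmetic with made-up 8.8 fixed-point values (none of the numbers below come from the patch):

#include <stdio.h>
#include <stdint.h>

/* Same rounding the kernel macro performs for unsigned operands. */
#define DIV_ROUND_CLOSEST(x, d)	(((x) + (d) / 2) / (d))

int main(void)
{
	/* Hypothetical burst penalties (8.8 fixed point) of three existing children. */
	uint16_t sibling[] = { 0x0a00, 0x1200, 0x0600 };
	uint16_t parent_penalty = 0x0800;
	uint32_t sum = 0, cnt = 0;

	for (unsigned int i = 0; i < sizeof(sibling) / sizeof(sibling[0]); i++) {
		sum += sibling[i];
		cnt++;
	}

	/* __update_child_burst_cache(): cache = max(avg of children, own penalty). */
	uint16_t avg = DIV_ROUND_CLOSEST(sum, cnt);
	uint16_t cache = avg > parent_penalty ? avg : parent_penalty;

	/* fork_burst_penalty(): the new child starts from max(prev penalty, cache). */
	uint16_t child_prev = 0;
	uint16_t child_start = cache > child_prev ? cache : child_prev;

	/* The upper byte is the part later used to scale delta_exec. */
	printf("child starts with burst_penalty 0x%04x (score %u)\n",
	       (unsigned int)child_start, (unsigned int)(child_start >> 8));
	return 0;
}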
-@@ -4507,6 +4613,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4507,6 +4614,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; @@ -200,30 +228,30 @@ index 8a541fe2d..13969a3a3 100644 p->se.vlag = 0; INIT_LIST_HEAD(&p->se.group_node); -@@ -4828,6 +4937,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) +@@ -4828,6 +4938,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) void sched_post_fork(struct task_struct *p) { +#ifdef CONFIG_SCHED_BORE -+ update_task_initial_burst_time(p); ++ fork_burst_penalty(p); +#endif // CONFIG_SCHED_BORE uclamp_post_fork(p); } -@@ -9967,6 +10079,11 @@ void __init sched_init(void) +@@ -9954,6 +10067,11 @@ void __init sched_init(void) BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif +#ifdef CONFIG_SCHED_BORE + sched_init_bore(); -+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 2.5.3 by Masahito Suzuki"); ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 3.0 Beta2 by Masahito Suzuki"); +#endif // CONFIG_SCHED_BORE + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 5c743bcb3..755ef4c8d 100644 +index e7e83181fbb6..ff41a524c1ee 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -348,6 +348,7 @@ static __init int sched_init_debug(void) @@ -234,18 +262,18 @@ index 5c743bcb3..755ef4c8d 100644 debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); -@@ -595,6 +596,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +@@ -594,6 +595,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); +#ifdef CONFIG_SCHED_BORE -+ SEQ_printf(m, " %2d", p->se.penalty_score); ++ SEQ_printf(m, " %2d", ((x16*)&p->se.burst_penalty)->u8[1]); +#endif #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index d6042543c..e52c14232 100644 +index 461409c0eac7..90ce27fb0a3f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -19,6 +19,9 @@ @@ -281,7 +309,7 @@ index d6042543c..e52c14232 100644 /* * After fork, child runs first. If set to 0 (default) then -@@ -84,8 +87,76 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; +@@ -84,8 +87,93 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; */ unsigned int sysctl_sched_child_runs_first __read_mostly; @@ -292,61 +320,78 @@ index d6042543c..e52c14232 100644 + * and reduces their over-scheduling. Synchronous workloads will still + * have immediate wakeup/sleep latencies. 
+ * -+ * (default: 3.2 msec * 1, units: nanoseconds) ++ * (default: 1.6 msec * 1, units: nanoseconds) + */ -+unsigned int sysctl_sched_wakeup_granularity = 3200000UL; -+static unsigned int normalized_sysctl_sched_wakeup_granularity = 3200000UL; ++unsigned int sysctl_sched_wakeup_granularity = 1600000UL; ++static unsigned int normalized_sysctl_sched_wakeup_granularity = 1600000UL; + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#ifdef CONFIG_SCHED_BORE -+unsigned int __read_mostly sched_bore = 1; -+unsigned int __read_mostly sched_burst_cache_lifetime = 60000000; -+unsigned int __read_mostly sched_burst_penalty_offset = 12; -+unsigned int __read_mostly sched_burst_penalty_scale = 1292; -+unsigned int __read_mostly sched_burst_smoothness = 2; -+unsigned int __read_mostly sched_burst_fork_atavistic = 2; ++unsigned int __read_mostly sched_bore = 1; ++unsigned int __read_mostly sched_burst_cache_lifetime = 60000000; ++unsigned int __read_mostly sched_burst_penalty_offset = 18; ++unsigned int __read_mostly sched_burst_penalty_scale = 1292; ++unsigned int __read_mostly sched_burst_smoothness_up = 1; ++unsigned int __read_mostly sched_burst_smoothness_down = 0; ++unsigned int __read_mostly sched_burst_fork_atavistic = 2; +static int three = 3; +static int sixty_four = 64; +static int maxval_12_bits = 4095; + -+#define FIXED_SHIFT 10 -+#define FIXED_ONE (1 << FIXED_SHIFT) -+typedef u32 fixed; ++#define MAX_BURST_PENALTY ((u32)(40UL << 8) - 1) + -+static void update_burst_score(struct sched_entity *se) { -+ u64 burst_time = se->max_burst_time; ++static inline u32 log2plus1_u64_u32f8(u64 v) { ++ x32 result; ++ int msb = fls64(v); ++ result.u8[0] = v << (64 - msb) >> 55; ++ result.u8[1] = msb; ++ return result.u32; ++} + -+ int msb = fls64(burst_time); -+ fixed integer_part = msb << FIXED_SHIFT; -+ fixed fractional_part = burst_time << (64 - msb) << 1 >> (64 - FIXED_SHIFT); -+ fixed greed = integer_part | fractional_part; ++static inline u32 u8h_u32(u8 v) { ++ x32 result; ++ result.u8[1] = v; ++ return result.u32; ++} + -+ fixed tolerance = sched_burst_penalty_offset << FIXED_SHIFT; -+ fixed penalty = max(0, (s32)greed - (s32)tolerance); -+ fixed scaled_penalty = penalty * sched_burst_penalty_scale >> 10; ++static inline u32 calc_burst_penalty(struct sched_entity *se) { ++ u32 greed, tolerance, penalty, scaled_penalty; ++ ++ greed = log2plus1_u64_u32f8(se->burst_time); ++ tolerance = u8h_u32(sched_burst_penalty_offset); ++ penalty = max(0, (s32)greed - (s32)tolerance); ++ scaled_penalty = penalty * sched_burst_penalty_scale >> 10; + -+ u8 score = min(39U, scaled_penalty >> FIXED_SHIFT); -+ se->penalty_score = score; ++ return min(MAX_BURST_PENALTY, scaled_penalty); ++} ++ ++static void update_burst_penalty(struct sched_entity *se) { ++ se->curr_burst_penalty = calc_burst_penalty(se); ++ se->burst_penalty = max(se->prev_burst_penalty, se->curr_burst_penalty); +} + +static inline u64 penalty_scale(u64 delta, struct sched_entity *se) { -+ return mul_u64_u32_shr(delta, sched_prio_to_wmult[se->penalty_score], 22); ++ u8 score = ((x16*)&se->burst_penalty)->u8[1]; ++ return mul_u64_u32_shr(delta, sched_prio_to_wmult[score], 22); +} + -+static inline u64 __binary_smooth(u64 new, u64 old, unsigned int smoothness) { -+ return (new <= old)? new: (new + old * ((1 << smoothness) - 1)) >> smoothness; ++static inline u32 binary_smooth(u32 new, u32 old) { ++ return (new >= old)? 
++ old + ((new - old) >> sched_burst_smoothness_up): ++ old - ((old - new) >> sched_burst_smoothness_down); +} + -+void restart_burst(struct sched_entity *se) { -+ se->max_burst_time = se->prev_burst_time = __binary_smooth( -+ se->burst_time, se->prev_burst_time, sched_burst_smoothness); ++static void restart_burst(struct sched_entity *se) { ++ se->burst_penalty = se->prev_burst_penalty = ++ binary_smooth(se->curr_burst_penalty, se->prev_burst_penalty); ++ se->curr_burst_penalty = 0; + se->burst_time = 0; +} + +#define calc_delta_fair(delta, se) __calc_delta_fair(delta, se, true) +#define calc_delta_fair_unscaled(delta, se) __calc_delta_fair(delta, se, false) -+static inline u64 ++static inline u64 +__calc_delta_fair(u64 delta, struct sched_entity *se, bool bscale); + +static s64 wakeup_preempt_backstep_delta(u64 rtime, struct sched_entity *se) { @@ -358,7 +403,7 @@ index d6042543c..e52c14232 100644 int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { -@@ -145,6 +216,60 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; +@@ -145,6 +233,69 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { @@ -407,8 +452,17 @@ index d6042543c..e52c14232 100644 + .extra2 = &maxval_12_bits, + }, + { -+ .procname = "sched_burst_smoothness", -+ .data = &sched_burst_smoothness, ++ .procname = "sched_burst_smoothness_down", ++ .data = &sched_burst_smoothness_down, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &three, ++ }, ++ { ++ .procname = "sched_burst_smoothness_up", ++ .data = &sched_burst_smoothness_up, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, @@ -419,7 +473,7 @@ index d6042543c..e52c14232 100644 { .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, -@@ -238,6 +363,7 @@ static void update_sysctl(void) +@@ -238,6 +389,7 @@ static void update_sysctl(void) #define SET_SYSCTL(name) \ (sysctl_##name = (factor) * normalized_sysctl_##name) SET_SYSCTL(sched_base_slice); @@ -427,12 +481,12 @@ index d6042543c..e52c14232 100644 #undef SET_SYSCTL } -@@ -308,11 +434,19 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight +@@ -308,11 +460,19 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight /* * delta /= w */ +#ifdef CONFIG_SCHED_BORE -+static inline u64 ++static inline u64 +__calc_delta_fair(u64 delta, struct sched_entity *se, bool bscale) +#else // CONFIG_SCHED_BORE static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) @@ -447,7 +501,7 @@ index d6042543c..e52c14232 100644 return delta; } -@@ -708,7 +842,11 @@ void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -706,7 +866,11 @@ void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) SCHED_WARN_ON(!se->on_rq); lag = avg_vruntime(cfs_rq) - se->vruntime; @@ -459,7 +513,7 @@ index d6042543c..e52c14232 100644 se->vlag = clamp(lag, -limit, limit); } -@@ -946,6 +1084,7 @@ int sched_update_scaling(void) +@@ -944,6 +1108,7 @@ int sched_update_scaling(void) #define WRT_SYSCTL(name) \ (normalized_sysctl_##name = sysctl_##name / (factor)) WRT_SYSCTL(sched_base_slice); @@ -467,19 +521,18 @@ index d6042543c..e52c14232 100644 #undef WRT_SYSCTL return 0; -@@ -1123,6 +1262,11 @@ static void update_curr(struct cfs_rq *cfs_rq) +@@ -1121,6 +1286,10 @@ static 
void update_curr(struct cfs_rq *cfs_rq) curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq->exec_clock, delta_exec); +#ifdef CONFIG_SCHED_BORE + curr->burst_time += delta_exec; -+ curr->max_burst_time = max(curr->max_burst_time, curr->burst_time); -+ update_burst_score(curr); ++ update_burst_penalty(curr); +#endif // CONFIG_SCHED_BORE curr->vruntime += calc_delta_fair(delta_exec, curr); update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); -@@ -5237,6 +5381,9 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -5187,6 +5356,9 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -489,7 +542,7 @@ index d6042543c..e52c14232 100644 /* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups -@@ -5247,14 +5394,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -5197,14 +5369,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { @@ -499,7 +552,7 @@ index d6042543c..e52c14232 100644 */ if (sched_feat(NEXT_BUDDY) && - cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) -+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next) && ++ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next) && + wakeup_preempt_entity(cfs_rq->next, candidate) < 1) return cfs_rq->next; @@ -508,7 +561,7 @@ index d6042543c..e52c14232 100644 } static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); -@@ -6522,6 +6671,38 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6452,6 +6626,38 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) hrtick_update(rq); } @@ -547,7 +600,7 @@ index d6042543c..e52c14232 100644 static void set_next_buddy(struct sched_entity *se); /* -@@ -6540,6 +6721,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6470,6 +6676,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_dequeue(&rq->cfs, p); for_each_sched_entity(se) { @@ -557,7 +610,7 @@ index d6042543c..e52c14232 100644 cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); -@@ -8047,7 +8231,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7980,7 +8189,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ /* * XXX pick_eevdf(cfs_rq) != se ? */ @@ -566,7 +619,7 @@ index d6042543c..e52c14232 100644 goto preempt; return; -@@ -8260,6 +8444,9 @@ static void yield_task_fair(struct rq *rq) +@@ -8193,6 +8402,9 @@ static void yield_task_fair(struct rq *rq) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se; @@ -577,10 +630,10 @@ index d6042543c..e52c14232 100644 /* * Are we the only task in the tree? diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index 7d65b4029..bd274f7c7 100644 +index 54334ca5c5c6..416ec4bcdb0f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h -@@ -13,7 +13,11 @@ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) +@@ -12,7 +12,11 @@ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. 
*/ @@ -593,10 +646,10 @@ index 7d65b4029..bd274f7c7 100644 /* * Consider buddies to be cache hot, decreases the likeliness of a diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 4236c4c89..714cc6ad9 100644 +index 67cd7e1fd501..04d065015d6c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h -@@ -2510,6 +2510,7 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; +@@ -2506,6 +2506,7 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_base_slice; @@ -605,4 +658,4 @@ index 4236c4c89..714cc6ad9 100644 #ifdef CONFIG_SCHED_DEBUG extern int sysctl_resched_latency_warn_ms; -- -2.41.0.159.g0bfa463d37 +2.41.0 diff --git a/patches/0006-AMD-cppc.patch b/patches/0006-AMD-cppc.patch new file mode 100644 index 0000000..eee57cc --- /dev/null +++ b/patches/0006-AMD-cppc.patch @@ -0,0 +1,573 @@ +From ab6268d199fa749e274a48b00c443538ae492b16 Mon Sep 17 00:00:00 2001 +From: Piotr Gorski +Date: Wed, 9 Aug 2023 14:07:31 +0200 +Subject: [PATCH] amd-6.5: merge changes from dev tree + +Signed-off-by: Piotr Gorski +--- + .../admin-guide/kernel-parameters.txt | 5 + + Documentation/admin-guide/pm/amd-pstate.rst | 55 +++++ + drivers/acpi/cppc_acpi.c | 13 ++ + drivers/acpi/processor_driver.c | 6 + + drivers/cpufreq/amd-pstate.c | 191 ++++++++++++++++-- + drivers/cpufreq/cpufreq.c | 13 ++ + include/acpi/cppc_acpi.h | 5 + + include/linux/amd-pstate.h | 1 + + include/linux/cpufreq.h | 4 + + 9 files changed, 272 insertions(+), 21 deletions(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index a1457995f..1f53c395a 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -363,6 +363,11 @@ + selects a performance level in this range and appropriate + to the current workload. + ++ amd_prefcore= ++ [X86] ++ enable ++ Enable AMD Pstate Preferred Core. ++ + amijoy.map= [HW,JOY] Amiga joystick support + Map of devices attached to JOY0DAT and JOY1DAT + Format: , +diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst +index 1cf40f692..4a30cf235 100644 +--- a/Documentation/admin-guide/pm/amd-pstate.rst ++++ b/Documentation/admin-guide/pm/amd-pstate.rst +@@ -353,6 +353,49 @@ is activated. In this mode, driver requests minimum and maximum performance + level and the platform autonomously selects a performance level in this range + and appropriate to the current workload. + ++AMD Pstate Preferred Core ++================================= ++ ++The core frequency is subjected to the process variation in semiconductors. ++Not all cores are able to reach the maximum frequency respecting the ++infrastructure limits. Consequently, AMD has redefined the concept of ++maximum frequency of a part. This means that a fraction of cores can reach ++maximum frequency. To find the best process scheduling policy for a given ++scenario, OS needs to know the core ordering informed by the platform through ++highest performance capability register of the CPPC interface. ++ ++``AMD Pstate Preferred Core`` use ITMT arch provides functions and data structures ++for enabling the scheduler to favor scheduling on cores can be get a higher frequency ++with lower voltage under preferred core. And it has the ability to dynamically ++change the preferred core based on the workload and platform conditions and ++accounting for thermals and aging. 
++ ++The priority metric will be initialized by the AMD Pstate driver. The AMD Pstate ++driver will also determine whether or not ``AMD Pstate Preferred Core`` is ++supported by the platform. ++ ++AMD Pstate driver will provide an initial core ordering when the system boots. ++The platform uses the CPPC interfaces to communicate the core ranking to the ++operating system and scheduler to make sure that OS is choosing the cores ++with highest performance firstly for scheduling the process. When AMD Pstate ++driver receives a message with the highest performance change, it will ++update the core ranking and set the cpu's priority. ++ ++AMD Preferred Core Switch ++================================= ++Kernel Parameters ++----------------- ++ ++``AMD Pstate Preferred Core`` has two states: enable and disable. ++Enable/disable states can be chosen by different kernel parameters. ++Default disable ``AMD Pstate Preferred Core``. ++ ++``amd_prefcore=enable`` ++ ++If ``amd_prefcore=enable`` is passed to kernel command line option ++then enable ``AMD Pstate Preferred Core`` if the processor and power ++firmware can support preferred core feature. ++ + User Space Interface in ``sysfs`` - General + =========================================== + +@@ -385,6 +428,18 @@ control its functionality at the system level. They are located in the + to the operation mode represented by that string - or to be + unregistered in the "disable" case. + ++``prefcore_state`` ++ Preferred Core state of the driver: "enabled" or "disabled". ++ ++ "enabled" ++ Enable the AMD Preferred Core. ++ ++ "disabled" ++ Disable the AMD Preferred Core ++ ++ ++ This attribute is read-only to check the state of Preferred Core. ++ + ``cpupower`` tool support for ``amd-pstate`` + =============================================== + +diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c +index 7ff269a78..ad388a0e8 100644 +--- a/drivers/acpi/cppc_acpi.c ++++ b/drivers/acpi/cppc_acpi.c +@@ -1154,6 +1154,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) + return cppc_get_perf(cpunum, NOMINAL_PERF, nominal_perf); + } + ++/** ++ * cppc_get_highest_perf - Get the highest performance register value. ++ * @cpunum: CPU from which to get highest performance. ++ * @highest_perf: Return address. ++ * ++ * Return: 0 for success, -EIO otherwise. ++ */ ++int cppc_get_highest_perf(int cpunum, u64 *highest_perf) ++{ ++ return cppc_get_perf(cpunum, HIGHEST_PERF, highest_perf); ++} ++EXPORT_SYMBOL_GPL(cppc_get_highest_perf); ++ + /** + * cppc_get_epp_perf - Get the epp register value. + * @cpunum: CPU from which to get epp preference value. 
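For reference, a minimal out-of-tree module sketch showing how the
cppc_get_highest_perf() helper exported above could be consumed. This is
illustrative only and not part of the patch; it assumes a kernel tree carrying
this series, and the module name and log messages are made up.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/printk.h>
#include <acpi/cppc_acpi.h>

static int __init highest_perf_demo_init(void)
{
	u64 highest_perf;
	int ret;

	/* Query the CPPC highest-performance capability of CPU 0. */
	ret = cppc_get_highest_perf(0, &highest_perf);
	if (ret)
		pr_info("cppc_get_highest_perf: error %d\n", ret);
	else
		pr_info("CPU0 highest perf: %llu\n",
			(unsigned long long)highest_perf);
	return 0;
}

static void __exit highest_perf_demo_exit(void)
{
}

module_init(highest_perf_demo_init);
module_exit(highest_perf_demo_exit);
MODULE_LICENSE("GPL");

Built as an external module and loaded with insmod, this should log the raw
highest-performance register value, or -ENOTSUPP on kernels built without
CONFIG_ACPI_CPPC_LIB, where only the static inline stub is available.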
+diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c +index 4bd16b3f0..29b2fb68a 100644 +--- a/drivers/acpi/processor_driver.c ++++ b/drivers/acpi/processor_driver.c +@@ -27,6 +27,7 @@ + #define ACPI_PROCESSOR_NOTIFY_PERFORMANCE 0x80 + #define ACPI_PROCESSOR_NOTIFY_POWER 0x81 + #define ACPI_PROCESSOR_NOTIFY_THROTTLING 0x82 ++#define ACPI_PROCESSOR_NOTIFY_HIGEST_PERF_CHANGED 0x85 + + MODULE_AUTHOR("Paul Diefenbaugh"); + MODULE_DESCRIPTION("ACPI Processor Driver"); +@@ -83,6 +84,11 @@ static void acpi_processor_notify(acpi_handle handle, u32 event, void *data) + acpi_bus_generate_netlink_event(device->pnp.device_class, + dev_name(&device->dev), event, 0); + break; ++ case ACPI_PROCESSOR_NOTIFY_HIGEST_PERF_CHANGED: ++ cpufreq_update_highest_perf(pr->id); ++ acpi_bus_generate_netlink_event(device->pnp.device_class, ++ dev_name(&device->dev), event, 0); ++ break; + default: + acpi_handle_debug(handle, "Unsupported event [0x%x]\n", event); + break; +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index 81fba0dcb..ba10aa971 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -49,6 +50,8 @@ + + #define AMD_PSTATE_TRANSITION_LATENCY 20000 + #define AMD_PSTATE_TRANSITION_DELAY 1000 ++#define AMD_PSTATE_PREFCORE_THRESHOLD 166 ++#define AMD_PSTATE_MAX_CPPC_PERF 255 + + /* + * TODO: We need more time to fine tune processors with shared memory solution +@@ -65,6 +68,14 @@ static struct cpufreq_driver amd_pstate_epp_driver; + static int cppc_state = AMD_PSTATE_UNDEFINED; + static bool cppc_enabled; + ++/* ++ * CPPC Preferred Core feature is supported by power firmware ++ */ ++static bool prefcore_enabled = false; ++ ++/* Disable AMD Pstate Preferred Core loading */ ++static bool no_prefcore __read_mostly = true; ++ + /* + * AMD Energy Preference Performance (EPP) + * The EPP is used in the CCLK DPM controller to drive +@@ -290,27 +301,26 @@ static inline int amd_pstate_enable(bool enable) + static int pstate_init_perf(struct amd_cpudata *cpudata) + { + u64 cap1; +- u32 highest_perf; + + int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, + &cap1); + if (ret) + return ret; + +- /* +- * TODO: Introduce AMD specific power feature. +- * +- * CPPC entry doesn't indicate the highest performance in some ASICs. ++ /* For platforms that do not support the preferred core feature, the ++ * highest_pef may be configured with 166 or 255, to avoid max frequency ++ * calculated wrongly. we take the AMD_CPPC_HIGHEST_PERF(cap1) value as ++ * the default max perf. 
+ */ +- highest_perf = amd_get_highest_perf(); +- if (highest_perf > AMD_CPPC_HIGHEST_PERF(cap1)) +- highest_perf = AMD_CPPC_HIGHEST_PERF(cap1); +- +- WRITE_ONCE(cpudata->highest_perf, highest_perf); ++ if (!prefcore_enabled) ++ WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); ++ else ++ WRITE_ONCE(cpudata->highest_perf, AMD_PSTATE_PREFCORE_THRESHOLD); + + WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); + WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); + WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); ++ WRITE_ONCE(cpudata->prefcore_highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); + + return 0; + } +@@ -318,22 +328,21 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) + static int cppc_init_perf(struct amd_cpudata *cpudata) + { + struct cppc_perf_caps cppc_perf; +- u32 highest_perf; + + int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); + if (ret) + return ret; + +- highest_perf = amd_get_highest_perf(); +- if (highest_perf > cppc_perf.highest_perf) +- highest_perf = cppc_perf.highest_perf; +- +- WRITE_ONCE(cpudata->highest_perf, highest_perf); ++ if (!prefcore_enabled) ++ WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf); ++ else ++ WRITE_ONCE(cpudata->highest_perf, AMD_PSTATE_PREFCORE_THRESHOLD); + + WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf); + WRITE_ONCE(cpudata->lowest_nonlinear_perf, + cppc_perf.lowest_nonlinear_perf); + WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf); ++ WRITE_ONCE(cpudata->prefcore_highest_perf, cppc_perf.highest_perf); + + if (cppc_state == AMD_PSTATE_ACTIVE) + return 0; +@@ -676,6 +685,118 @@ static void amd_perf_ctl_reset(unsigned int cpu) + wrmsrl_on_cpu(cpu, MSR_AMD_PERF_CTL, 0); + } + ++/* ++ * Set AMD Pstate Preferred Core enable can't be done directly from cpufreq callbacks ++ * due to locking, so queue the work for later. ++ */ ++static void amd_pstste_sched_prefcore_workfn(struct work_struct *work) ++{ ++ sched_set_itmt_support(); ++} ++static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn); ++ ++/** ++ * Get the highest performance register value. ++ * @cpu: CPU from which to get highest performance. ++ * @highest_perf: Return address. ++ * ++ * Return: 0 for success, -EIO otherwise. ++ */ ++static int amd_pstate_get_highest_perf(int cpu, u64 *highest_perf) ++{ ++ int ret; ++ ++ if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ u64 cap1; ++ ++ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1); ++ if (ret) ++ return ret; ++ WRITE_ONCE(*highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); ++ } else { ++ ret = cppc_get_highest_perf(cpu, highest_perf); ++ } ++ ++ return (ret); ++} ++ ++static void amd_pstate_init_prefcore(void) ++{ ++ int cpu, ret; ++ u64 highest_perf; ++ ++ if (no_prefcore) ++ return; ++ ++ for_each_possible_cpu(cpu) { ++ ret = amd_pstate_get_highest_perf(cpu, &highest_perf); ++ if (ret) ++ break; ++ ++ sched_set_itmt_core_prio(highest_perf, cpu); ++ } ++ ++ /* ++ * This code can be run during CPU online under the ++ * CPU hotplug locks, so sched_set_amd_prefcore_support() ++ * cannot be called from here. Queue up a work item ++ * to invoke it. 
++ */ ++ schedule_work(&sched_prefcore_work); ++} ++ ++static void amd_pstate_update_highest_perf(unsigned int cpu) ++{ ++ struct cpufreq_policy *policy; ++ struct amd_cpudata *cpudata; ++ u32 prev_high = 0, cur_high = 0; ++ u64 highest_perf; ++ int ret; ++ ++ if (!prefcore_enabled) ++ return; ++ ++ ret = amd_pstate_get_highest_perf(cpu, &highest_perf); ++ if (ret) ++ return; ++ ++ policy = cpufreq_cpu_get(cpu); ++ cpudata = policy->driver_data; ++ cur_high = highest_perf; ++ prev_high = READ_ONCE(cpudata->prefcore_highest_perf); ++ ++ if (prev_high != cur_high) { ++ WRITE_ONCE(cpudata->prefcore_highest_perf, cur_high); ++ sched_set_itmt_core_prio(cur_high, cpu); ++ } ++ ++ cpufreq_cpu_put(policy); ++} ++ ++/* ++ * Check if AMD Pstate Preferred core feature is supported and enabled ++ * 1) no_prefcore is used to enable or disable AMD Pstate Preferred Core ++ * loading when user would like to enable or disable it. Without that, ++ * AMD Pstate Preferred Core will be disabled by default if the processor ++ * and power firmware can support preferred core feature. ++ * 2) prefcore_enabled is used to indicate whether CPPC preferred core is enabled. ++ */ ++static void check_prefcore_supported(int cpu) ++{ ++ u64 highest_perf; ++ int ret; ++ ++ if (no_prefcore) ++ return; ++ ++ ret = amd_pstate_get_highest_perf(cpu, &highest_perf); ++ if (ret) ++ return; ++ ++ if(highest_perf < AMD_PSTATE_MAX_CPPC_PERF) ++ prefcore_enabled = true; ++} ++ + static int amd_pstate_cpu_init(struct cpufreq_policy *policy) + { + int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; +@@ -697,6 +818,9 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) + + cpudata->cpu = policy->cpu; + ++ /* check if CPPC preferred core feature is enabled*/ ++ check_prefcore_supported(policy->cpu); ++ + ret = amd_pstate_init_perf(cpudata); + if (ret) + goto free_cpudata1; +@@ -1012,8 +1136,8 @@ static int amd_pstate_update_status(const char *buf, size_t size) + return 0; + } + +-static ssize_t show_status(struct kobject *kobj, +- struct kobj_attribute *attr, char *buf) ++static ssize_t status_show(struct device *dev, ++ struct device_attribute *attr, char *buf) + { + ssize_t ret; + +@@ -1024,7 +1148,7 @@ static ssize_t show_status(struct kobject *kobj, + return ret; + } + +-static ssize_t store_status(struct kobject *a, struct kobj_attribute *b, ++static ssize_t status_store(struct device *a, struct device_attribute *b, + const char *buf, size_t count) + { + char *p = memchr(buf, '\n', count); +@@ -1037,13 +1161,20 @@ static ssize_t store_status(struct kobject *a, struct kobj_attribute *b, + return ret < 0 ? ret : count; + } + ++static ssize_t prefcore_state_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return sysfs_emit(buf, "%s\n", prefcore_enabled ? 
"enabled" : "disabled"); ++} ++ + cpufreq_freq_attr_ro(amd_pstate_max_freq); + cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); + + cpufreq_freq_attr_ro(amd_pstate_highest_perf); + cpufreq_freq_attr_rw(energy_performance_preference); + cpufreq_freq_attr_ro(energy_performance_available_preferences); +-define_one_global_rw(status); ++static DEVICE_ATTR_RW(status); ++static DEVICE_ATTR_RO(prefcore_state); + + static struct freq_attr *amd_pstate_attr[] = { + &amd_pstate_max_freq, +@@ -1062,7 +1193,8 @@ static struct freq_attr *amd_pstate_epp_attr[] = { + }; + + static struct attribute *pstate_global_attributes[] = { +- &status.attr, ++ &dev_attr_status.attr, ++ &dev_attr_prefcore_state.attr, + NULL + }; + +@@ -1114,6 +1246,9 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + cpudata->cpu = policy->cpu; + cpudata->epp_policy = 0; + ++ /* check if CPPC preferred core feature is supported*/ ++ check_prefcore_supported(policy->cpu); ++ + ret = amd_pstate_init_perf(cpudata); + if (ret) + goto free_cpudata1; +@@ -1392,6 +1527,7 @@ static struct cpufreq_driver amd_pstate_driver = { + .suspend = amd_pstate_cpu_suspend, + .resume = amd_pstate_cpu_resume, + .set_boost = amd_pstate_set_boost, ++ .update_highest_perf = amd_pstate_update_highest_perf, + .name = "amd-pstate", + .attr = amd_pstate_attr, + }; +@@ -1406,6 +1542,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = { + .online = amd_pstate_epp_cpu_online, + .suspend = amd_pstate_epp_suspend, + .resume = amd_pstate_epp_resume, ++ .update_highest_perf = amd_pstate_update_highest_perf, + .name = "amd-pstate-epp", + .attr = amd_pstate_epp_attr, + }; +@@ -1506,6 +1643,8 @@ static int __init amd_pstate_init(void) + } + } + ++ amd_pstate_init_prefcore(); ++ + return ret; + + global_attr_free: +@@ -1527,7 +1666,17 @@ static int __init amd_pstate_param(char *str) + + return amd_pstate_set_driver(mode_idx); + } ++ ++static int __init amd_prefcore_param(char *str) ++{ ++ if (!strcmp(str, "enable")) ++ no_prefcore = false; ++ ++ return 0; ++} ++ + early_param("amd_pstate", amd_pstate_param); ++early_param("amd_prefcore", amd_prefcore_param); + + MODULE_AUTHOR("Huang Rui "); + MODULE_DESCRIPTION("AMD Processor P-state Frequency Driver"); +diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c +index 50bbc969f..842357abf 100644 +--- a/drivers/cpufreq/cpufreq.c ++++ b/drivers/cpufreq/cpufreq.c +@@ -2675,6 +2675,19 @@ void cpufreq_update_limits(unsigned int cpu) + } + EXPORT_SYMBOL_GPL(cpufreq_update_limits); + ++/** ++ * cpufreq_update_highest_perf - Update highest performance for a given CPU. ++ * @cpu: CPU to update the highest performance for. 
++ * ++ * Invoke the driver's ->update_highest_perf callback if present ++ */ ++void cpufreq_update_highest_perf(unsigned int cpu) ++{ ++ if (cpufreq_driver->update_highest_perf) ++ cpufreq_driver->update_highest_perf(cpu); ++} ++EXPORT_SYMBOL_GPL(cpufreq_update_highest_perf); ++ + /********************************************************************* + * BOOST * + *********************************************************************/ +diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h +index 6126c977e..c0b69ffe7 100644 +--- a/include/acpi/cppc_acpi.h ++++ b/include/acpi/cppc_acpi.h +@@ -139,6 +139,7 @@ struct cppc_cpudata { + #ifdef CONFIG_ACPI_CPPC_LIB + extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf); + extern int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf); ++extern int cppc_get_highest_perf(int cpunum, u64 *highest_perf); + extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); + extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); + extern int cppc_set_enable(int cpu, bool enable); +@@ -165,6 +166,10 @@ static inline int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) + { + return -ENOTSUPP; + } ++static inline int cppc_get_highest_perf(int cpunum, u64 *highest_perf) ++{ ++ return -ENOTSUPP; ++} + static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs) + { + return -ENOTSUPP; +diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h +index 446394f84..fa86bc953 100644 +--- a/include/linux/amd-pstate.h ++++ b/include/linux/amd-pstate.h +@@ -70,6 +70,7 @@ struct amd_cpudata { + u32 nominal_perf; + u32 lowest_nonlinear_perf; + u32 lowest_perf; ++ u32 prefcore_highest_perf; + + u32 max_freq; + u32 min_freq; +diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h +index 172ff51c1..766c83a4f 100644 +--- a/include/linux/cpufreq.h ++++ b/include/linux/cpufreq.h +@@ -231,6 +231,7 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); + void refresh_frequency_limits(struct cpufreq_policy *policy); + void cpufreq_update_policy(unsigned int cpu); + void cpufreq_update_limits(unsigned int cpu); ++void cpufreq_update_highest_perf(unsigned int cpu); + bool have_governor_per_policy(void); + bool cpufreq_supports_freq_invariance(void); + struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); +@@ -376,6 +377,9 @@ struct cpufreq_driver { + /* Called to update policy limits on firmware notifications. */ + void (*update_limits)(unsigned int cpu); + ++ /* Called to update highest performance on firmware notifications. 
*/ ++ void (*update_highest_perf)(unsigned int cpu); ++ + /* optional */ + int (*bios_limit)(int cpu, unsigned int *limit); + +-- +2.42.0.rc0.25.ga82fb66fed diff --git a/scripts/patch.sh b/scripts/patch.sh index 0cf2285..4705d26 100755 --- a/scripts/patch.sh +++ b/scripts/patch.sh @@ -15,4 +15,6 @@ patch -Np1 < "../patches/0002-eevdfbore.patch" # Allow setting custom pollrates for usb devices patch -Np1 < "../patches/0004-Allow-to-set-custom-USB-pollrate-for-specific-device.patch" # Allow pre polaris cards to use the amdgpu kernel module -patch -Np1 < "../patches/0005-amdgpu-si-cik-default.patch" \ No newline at end of file +patch -Np1 < "../patches/0005-amdgpu-si-cik-default.patch" +# AMD Patch for CPPC +patch -Np1 < "../patches/0006-AMD-cppc.patch" \ No newline at end of file diff --git a/scripts/source.sh b/scripts/source.sh index b0658da..b1bc0b1 100755 --- a/scripts/source.sh +++ b/scripts/source.sh @@ -2,7 +2,7 @@ echo "Pika Kernel - Getting source" -wget -nv https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/snapshot/linux-6.5-rc3.tar.gz -tar -xf ./linux-6.5-rc3.tar.gz +wget -nv https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/snapshot/linux-6.5-rc5.tar.gz +tar -xf ./linux-6.5-rc5.tar.gz -cd linux-6.5-rc3 +cd linux-6.5-rc5
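As a quick userspace check after booting the resulting kernel with
amd_prefcore=enable, the snippet below reads the prefcore_state attribute
added by patches/0006-AMD-cppc.patch. It is a hedged example, not part of the
patch set: the sysfs path assumes the amd-pstate global attribute group is
exposed under /sys/devices/system/cpu/amd_pstate/ alongside the existing
status attribute.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	char state[32];
	/* Path assumed from the amd-pstate global attribute group. */
	FILE *f = fopen("/sys/devices/system/cpu/amd_pstate/prefcore_state", "r");

	if (!f) {
		perror("prefcore_state");
		return EXIT_FAILURE;
	}
	if (fgets(state, sizeof(state), f))
		printf("AMD preferred core: %s", state);
	fclose(f);
	return EXIT_SUCCESS;
}

Without amd_prefcore=enable on the kernel command line the attribute should
report "disabled", since the driver leaves the preferred core feature off by
default.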